1use crate::{IsccError, IsccResult};
7use std::cmp;
8
9pub fn alg_simhash(hash_digests: &[impl AsRef<[u8]>]) -> IsccResult<Vec<u8>> {
15 if hash_digests.len() >= 2 {
16 let expected_len = hash_digests[0].as_ref().len();
17 for (i, digest) in hash_digests.iter().enumerate().skip(1) {
18 if digest.as_ref().len() != expected_len {
19 return Err(IsccError::InvalidInput(format!(
20 "All hash digests must have equal length (expected {}, got {} at index {})",
21 expected_len,
22 digest.as_ref().len(),
23 i
24 )));
25 }
26 }
27 }
28 Ok(alg_simhash_inner(hash_digests))
29}
30
31pub(crate) fn alg_simhash_inner(hash_digests: &[impl AsRef<[u8]>]) -> Vec<u8> {
36 if hash_digests.is_empty() {
37 return vec![0u8; 32];
38 }
39
40 let n_bytes = hash_digests[0].as_ref().len();
41 let n_bits = n_bytes * 8;
42 let mut vector = vec![0u32; n_bits];
43
44 for digest in hash_digests {
45 let bytes = digest.as_ref();
46 for (i, count) in vector.iter_mut().enumerate() {
47 let byte_idx = i / 8;
48 let bit_idx = 7 - (i % 8); if (bytes[byte_idx] >> bit_idx) & 1 == 1 {
50 *count += 1;
51 }
52 }
53 }
54
55 let n = hash_digests.len() as u32;
57 let mut result = vec![0u8; n_bytes];
58 for (i, &count) in vector.iter().enumerate() {
59 if count * 2 >= n {
60 let byte_idx = i / 8;
61 let bit_idx = 7 - (i % 8);
62 result[byte_idx] |= 1 << bit_idx;
63 }
64 }
65
66 result
67}
68
69pub fn sliding_window(seq: &str, width: usize) -> IsccResult<Vec<String>> {
77 if width < 2 {
78 return Err(IsccError::InvalidInput(
79 "Sliding window width must be 2 or bigger.".into(),
80 ));
81 }
82 let chars: Vec<char> = seq.chars().collect();
83 let len = chars.len();
84 let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
85 Ok((0..range)
86 .map(|i| {
87 let end = cmp::min(i + width, len);
88 chars[i..end].iter().collect()
89 })
90 .collect())
91}
92
93#[cfg(feature = "text-processing")]
99pub(crate) fn sliding_window_strs(seq: &str, width: usize) -> Vec<&str> {
100 debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
101 let char_indices: Vec<usize> = seq.char_indices().map(|(i, _)| i).collect();
102 let len = char_indices.len();
103 if len == 0 {
104 return vec![seq]; }
106 let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
107 (0..range)
108 .map(|i| {
109 let start = char_indices[i];
110 let end = if i + width >= len {
111 seq.len()
112 } else {
113 char_indices[i + width]
114 };
115 &seq[start..end]
116 })
117 .collect()
118}
119
120#[cfg(feature = "meta-code")]
126pub(crate) fn sliding_window_bytes(data: &[u8], width: usize) -> Vec<&[u8]> {
127 debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
128 let len = data.len();
129 let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
130 (0..range)
131 .map(|i| {
132 let end = cmp::min(i + width, len);
133 &data[i..end]
134 })
135 .collect()
136}
137
138#[cfg(test)]
139mod tests {
140 use super::*;
141
142 #[test]
143 fn test_sliding_window_basic() {
144 assert_eq!(
145 sliding_window("Hello", 4).unwrap(),
146 vec!["Hell".to_string(), "ello".to_string()]
147 );
148 }
149
150 #[test]
151 fn test_sliding_window_shorter_than_width() {
152 assert_eq!(sliding_window("ab", 3).unwrap(), vec!["ab".to_string()]);
153 }
154
155 #[test]
156 fn test_sliding_window_exact_width() {
157 assert_eq!(sliding_window("abc", 3).unwrap(), vec!["abc".to_string()]);
158 }
159
160 #[test]
161 fn test_sliding_window_empty() {
162 assert_eq!(sliding_window("", 3).unwrap(), vec!["".to_string()]);
163 }
164
165 #[test]
166 fn test_sliding_window_unicode() {
167 assert_eq!(
168 sliding_window("äöü", 2).unwrap(),
169 vec!["äö".to_string(), "öü".to_string()]
170 );
171 }
172
173 #[test]
174 fn test_alg_simhash_single_digest() {
175 let digest = blake3::hash(b"test");
176 let result = alg_simhash(&[digest.as_bytes().to_vec()]).unwrap();
177 assert_eq!(result, digest.as_bytes().to_vec());
178 }
179
180 #[test]
181 fn test_alg_simhash_empty() {
182 let empty: Vec<Vec<u8>> = vec![];
183 let result = alg_simhash(&empty).unwrap();
184 assert_eq!(result, vec![0u8; 32]);
185 }
186
187 #[test]
188 fn test_alg_simhash_identical_digests() {
189 let digest = vec![0xFFu8; 32];
190 let result = alg_simhash(&[digest.clone(), digest.clone(), digest]).unwrap();
191 assert_eq!(result, vec![0xFFu8; 32]);
192 }
193
194 #[test]
195 fn test_alg_simhash_opposite_digests() {
196 let ones = vec![0xFFu8; 32];
200 let zeros = vec![0x00u8; 32];
201 let result = alg_simhash(&[ones, zeros]).unwrap();
202 assert_eq!(result, vec![0xFFu8; 32]);
203 }
204
205 #[test]
206 fn test_alg_simhash_mismatched_lengths_returns_error() {
207 let result = alg_simhash(&[vec![1u8, 2], vec![1u8, 2, 3]]);
209 assert!(result.is_err());
210 let msg = result.unwrap_err().to_string();
211 assert!(
212 msg.contains("equal length"),
213 "error message should mention equal length, got: {msg}"
214 );
215 }
216
217 #[test]
218 fn test_sliding_window_width_too_small() {
219 let result = sliding_window("test", 1);
220 assert!(result.is_err());
221 let msg = result.unwrap_err().to_string();
222 assert!(
223 msg.contains("width must be 2"),
224 "error message should mention width constraint, got: {msg}"
225 );
226 }
227
228 #[cfg(feature = "text-processing")]
231 #[test]
232 fn test_sliding_window_strs_basic() {
233 assert_eq!(sliding_window_strs("Hello", 4), vec!["Hell", "ello"]);
234 }
235
236 #[cfg(feature = "text-processing")]
237 #[test]
238 fn test_sliding_window_strs_shorter_than_width() {
239 assert_eq!(sliding_window_strs("ab", 3), vec!["ab"]);
240 }
241
242 #[cfg(feature = "text-processing")]
243 #[test]
244 fn test_sliding_window_strs_exact_width() {
245 assert_eq!(sliding_window_strs("abc", 3), vec!["abc"]);
246 }
247
248 #[cfg(feature = "text-processing")]
249 #[test]
250 fn test_sliding_window_strs_empty() {
251 assert_eq!(sliding_window_strs("", 3), vec![""]);
252 }
253
254 #[cfg(feature = "text-processing")]
255 #[test]
256 fn test_sliding_window_strs_unicode() {
257 assert_eq!(sliding_window_strs("äöü", 2), vec!["äö", "öü"]);
258 }
259
260 #[cfg(feature = "text-processing")]
261 #[cfg(debug_assertions)]
262 #[test]
263 #[should_panic(expected = "width must be 2")]
264 fn test_sliding_window_strs_width_too_small() {
265 sliding_window_strs("test", 1);
266 }
267
268 #[cfg(feature = "text-processing")]
269 #[test]
270 fn test_sliding_window_strs_matches_sliding_window() {
271 let cases = vec![
273 ("Hello World", 4),
274 ("ab", 3),
275 ("abc", 3),
276 ("", 3),
277 ("äöü", 2),
278 ("Hello", 13),
279 ("a longer test string for n-grams", 5),
280 ];
281 for (input, width) in cases {
282 let strings = sliding_window(input, width).unwrap();
283 let strs = sliding_window_strs(input, width);
284 assert_eq!(
285 strings.len(),
286 strs.len(),
287 "length mismatch for input={input:?} width={width}"
288 );
289 for (s, sr) in strings.iter().zip(strs.iter()) {
290 assert_eq!(
291 s.as_str(),
292 *sr,
293 "content mismatch for input={input:?} width={width}"
294 );
295 }
296 }
297 }
298
299 #[cfg(feature = "meta-code")]
302 #[test]
303 fn test_sliding_window_bytes_basic() {
304 assert_eq!(
305 sliding_window_bytes(b"Hello", 4),
306 vec![&b"Hell"[..], &b"ello"[..]]
307 );
308 }
309
310 #[cfg(feature = "meta-code")]
311 #[test]
312 fn test_sliding_window_bytes_shorter_than_width() {
313 assert_eq!(sliding_window_bytes(b"ab", 3), vec![&b"ab"[..]]);
314 }
315
316 #[cfg(feature = "meta-code")]
317 #[test]
318 fn test_sliding_window_bytes_exact_width() {
319 assert_eq!(sliding_window_bytes(b"abc", 3), vec![&b"abc"[..]]);
320 }
321
322 #[cfg(feature = "meta-code")]
323 #[test]
324 fn test_sliding_window_bytes_empty() {
325 assert_eq!(sliding_window_bytes(b"", 3), vec![&b""[..]]);
326 }
327
328 #[cfg(feature = "meta-code")]
329 #[test]
330 fn test_sliding_window_bytes_width_4() {
331 assert_eq!(
332 sliding_window_bytes(b"abcdef", 4),
333 vec![&b"abcd"[..], &b"bcde"[..], &b"cdef"[..]]
334 );
335 }
336
337 #[cfg(feature = "meta-code")]
338 #[cfg(debug_assertions)]
339 #[test]
340 #[should_panic(expected = "width must be 2")]
341 fn test_sliding_window_bytes_width_too_small() {
342 sliding_window_bytes(b"test", 1);
343 }
344}