use crate::{IsccError, IsccResult};
use std::cmp;
pub fn alg_simhash(hash_digests: &[impl AsRef<[u8]>]) -> IsccResult<Vec<u8>> {
if hash_digests.len() >= 2 {
let expected_len = hash_digests[0].as_ref().len();
for (i, digest) in hash_digests.iter().enumerate().skip(1) {
if digest.as_ref().len() != expected_len {
return Err(IsccError::InvalidInput(format!(
"All hash digests must have equal length (expected {}, got {} at index {})",
expected_len,
digest.as_ref().len(),
i
)));
}
}
}
Ok(alg_simhash_inner(hash_digests))
}
pub(crate) fn alg_simhash_inner(hash_digests: &[impl AsRef<[u8]>]) -> Vec<u8> {
if hash_digests.is_empty() {
return vec![0u8; 32];
}
let n_bytes = hash_digests[0].as_ref().len();
let n_bits = n_bytes * 8;
let mut vector = vec![0u32; n_bits];
for digest in hash_digests {
let bytes = digest.as_ref();
for (i, count) in vector.iter_mut().enumerate() {
let byte_idx = i / 8;
let bit_idx = 7 - (i % 8); if (bytes[byte_idx] >> bit_idx) & 1 == 1 {
*count += 1;
}
}
}
let n = hash_digests.len() as u32;
let mut result = vec![0u8; n_bytes];
for (i, &count) in vector.iter().enumerate() {
if count * 2 >= n {
let byte_idx = i / 8;
let bit_idx = 7 - (i % 8);
result[byte_idx] |= 1 << bit_idx;
}
}
result
}
pub fn sliding_window(seq: &str, width: usize) -> IsccResult<Vec<String>> {
if width < 2 {
return Err(IsccError::InvalidInput(
"Sliding window width must be 2 or bigger.".into(),
));
}
let chars: Vec<char> = seq.chars().collect();
let len = chars.len();
let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
Ok((0..range)
.map(|i| {
let end = cmp::min(i + width, len);
chars[i..end].iter().collect()
})
.collect())
}
#[cfg(feature = "text-processing")]
pub(crate) fn sliding_window_strs(seq: &str, width: usize) -> Vec<&str> {
debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
let char_indices: Vec<usize> = seq.char_indices().map(|(i, _)| i).collect();
let len = char_indices.len();
if len == 0 {
return vec![seq]; }
let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
(0..range)
.map(|i| {
let start = char_indices[i];
let end = if i + width >= len {
seq.len()
} else {
char_indices[i + width]
};
&seq[start..end]
})
.collect()
}
#[cfg(feature = "meta-code")]
pub(crate) fn sliding_window_bytes(data: &[u8], width: usize) -> Vec<&[u8]> {
debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
let len = data.len();
let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
(0..range)
.map(|i| {
let end = cmp::min(i + width, len);
&data[i..end]
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sliding_window_basic() {
assert_eq!(
sliding_window("Hello", 4).unwrap(),
vec!["Hell".to_string(), "ello".to_string()]
);
}
#[test]
fn test_sliding_window_shorter_than_width() {
assert_eq!(sliding_window("ab", 3).unwrap(), vec!["ab".to_string()]);
}
#[test]
fn test_sliding_window_exact_width() {
assert_eq!(sliding_window("abc", 3).unwrap(), vec!["abc".to_string()]);
}
#[test]
fn test_sliding_window_empty() {
assert_eq!(sliding_window("", 3).unwrap(), vec!["".to_string()]);
}
#[test]
fn test_sliding_window_unicode() {
assert_eq!(
sliding_window("äöü", 2).unwrap(),
vec!["äö".to_string(), "öü".to_string()]
);
}
#[test]
fn test_alg_simhash_single_digest() {
let digest = blake3::hash(b"test");
let result = alg_simhash(&[digest.as_bytes().to_vec()]).unwrap();
assert_eq!(result, digest.as_bytes().to_vec());
}
#[test]
fn test_alg_simhash_empty() {
let empty: Vec<Vec<u8>> = vec![];
let result = alg_simhash(&empty).unwrap();
assert_eq!(result, vec![0u8; 32]);
}
#[test]
fn test_alg_simhash_identical_digests() {
let digest = vec![0xFFu8; 32];
let result = alg_simhash(&[digest.clone(), digest.clone(), digest]).unwrap();
assert_eq!(result, vec![0xFFu8; 32]);
}
#[test]
fn test_alg_simhash_opposite_digests() {
let ones = vec![0xFFu8; 32];
let zeros = vec![0x00u8; 32];
let result = alg_simhash(&[ones, zeros]).unwrap();
assert_eq!(result, vec![0xFFu8; 32]);
}
#[test]
fn test_alg_simhash_mismatched_lengths_returns_error() {
let result = alg_simhash(&[vec![1u8, 2], vec![1u8, 2, 3]]);
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("equal length"),
"error message should mention equal length, got: {msg}"
);
}
#[test]
fn test_sliding_window_width_too_small() {
let result = sliding_window("test", 1);
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("width must be 2"),
"error message should mention width constraint, got: {msg}"
);
}
#[cfg(feature = "text-processing")]
#[test]
fn test_sliding_window_strs_basic() {
assert_eq!(sliding_window_strs("Hello", 4), vec!["Hell", "ello"]);
}
#[cfg(feature = "text-processing")]
#[test]
fn test_sliding_window_strs_shorter_than_width() {
assert_eq!(sliding_window_strs("ab", 3), vec!["ab"]);
}
#[cfg(feature = "text-processing")]
#[test]
fn test_sliding_window_strs_exact_width() {
assert_eq!(sliding_window_strs("abc", 3), vec!["abc"]);
}
#[cfg(feature = "text-processing")]
#[test]
fn test_sliding_window_strs_empty() {
assert_eq!(sliding_window_strs("", 3), vec![""]);
}
#[cfg(feature = "text-processing")]
#[test]
fn test_sliding_window_strs_unicode() {
assert_eq!(sliding_window_strs("äöü", 2), vec!["äö", "öü"]);
}
#[cfg(feature = "text-processing")]
#[cfg(debug_assertions)]
#[test]
#[should_panic(expected = "width must be 2")]
fn test_sliding_window_strs_width_too_small() {
sliding_window_strs("test", 1);
}
#[cfg(feature = "text-processing")]
#[test]
fn test_sliding_window_strs_matches_sliding_window() {
let cases = vec![
("Hello World", 4),
("ab", 3),
("abc", 3),
("", 3),
("äöü", 2),
("Hello", 13),
("a longer test string for n-grams", 5),
];
for (input, width) in cases {
let strings = sliding_window(input, width).unwrap();
let strs = sliding_window_strs(input, width);
assert_eq!(
strings.len(),
strs.len(),
"length mismatch for input={input:?} width={width}"
);
for (s, sr) in strings.iter().zip(strs.iter()) {
assert_eq!(
s.as_str(),
*sr,
"content mismatch for input={input:?} width={width}"
);
}
}
}
#[cfg(feature = "meta-code")]
#[test]
fn test_sliding_window_bytes_basic() {
assert_eq!(
sliding_window_bytes(b"Hello", 4),
vec![&b"Hell"[..], &b"ello"[..]]
);
}
#[cfg(feature = "meta-code")]
#[test]
fn test_sliding_window_bytes_shorter_than_width() {
assert_eq!(sliding_window_bytes(b"ab", 3), vec![&b"ab"[..]]);
}
#[cfg(feature = "meta-code")]
#[test]
fn test_sliding_window_bytes_exact_width() {
assert_eq!(sliding_window_bytes(b"abc", 3), vec![&b"abc"[..]]);
}
#[cfg(feature = "meta-code")]
#[test]
fn test_sliding_window_bytes_empty() {
assert_eq!(sliding_window_bytes(b"", 3), vec![&b""[..]]);
}
#[cfg(feature = "meta-code")]
#[test]
fn test_sliding_window_bytes_width_4() {
assert_eq!(
sliding_window_bytes(b"abcdef", 4),
vec![&b"abcd"[..], &b"bcde"[..], &b"cdef"[..]]
);
}
#[cfg(feature = "meta-code")]
#[cfg(debug_assertions)]
#[test]
#[should_panic(expected = "width must be 2")]
fn test_sliding_window_bytes_width_too_small() {
sliding_window_bytes(b"test", 1);
}
}