use crate::ops::string_similarity::validation::{validate_input, SimilarityError};
/// Initial state of the 64-bit FNV-1a hash (the standard FNV 64-bit offset basis);
/// [`fnv1a64`] starts folding from this value.
pub const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
/// Multiplier applied (with wrapping arithmetic) after each byte is mixed in by
/// [`fnv1a64`]; this is the standard FNV 64-bit prime.
pub const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
/// Length, in bytes, of the sliding window that [`simhash64`] hashes as one feature.
pub const DEFAULT_GRAM: usize = 4;
/// Computes a 64-bit SimHash fingerprint of `input`.
///
/// Features are the overlapping [`DEFAULT_GRAM`]-byte windows of `input`; an
/// input shorter than one window is treated as a single feature. Every feature
/// is hashed with [`fnv1a64`] and its bits are tallied through [`add_hash`].
/// Output bit `i` is set when the tally for bit `i` is non-negative, so an
/// exact tie resolves to `1`.
///
/// An empty input short-circuits to `Ok(0)`.
///
/// # Errors
///
/// Propagates any error returned by `validate_input` for `input`.
pub fn simhash64(input: &[u8]) -> Result<u64, SimilarityError> {
    validate_input("input", input)?;
    if input.is_empty() {
        return Ok(0);
    }

    // One signed counter per output bit.
    let mut tally = [0i32; 64];
    if input.len() >= DEFAULT_GRAM {
        input
            .windows(DEFAULT_GRAM)
            .for_each(|shingle| add_hash(&mut tally, fnv1a64(shingle)));
    } else {
        // Too short for a full window: hash the whole input once.
        add_hash(&mut tally, fnv1a64(input));
    }

    // Collapse the counters into a bit mask; `>= 0` keeps ties as set bits.
    let fingerprint = tally
        .iter()
        .enumerate()
        .filter(|&(_, &vote)| vote >= 0)
        .fold(0u64, |acc, (bit, _)| acc | (1u64 << bit));
    Ok(fingerprint)
}
/// Folds one 64-bit `hash` into the per-bit `weights` tally.
///
/// For each bit position `i` in `0..64`, `weights[i]` is incremented when bit
/// `i` of `hash` is set and decremented when it is clear.
pub fn add_hash(weights: &mut [i32; 64], hash: u64) {
    for (bit, weight) in weights.iter_mut().enumerate() {
        let is_set = (hash & (1u64 << bit)) != 0;
        *weight += if is_set { 1 } else { -1 };
    }
}
/// Hashes `input` with 64-bit FNV-1a.
///
/// Starting from [`FNV_OFFSET`], each byte is XORed into the state and the
/// state is then multiplied by [`FNV_PRIME`] using wrapping arithmetic.
/// An empty slice yields [`FNV_OFFSET`] unchanged.
pub fn fnv1a64(input: &[u8]) -> u64 {
    let mut state = FNV_OFFSET;
    for &octet in input {
        // FNV-1a order: mix the byte in first, then multiply.
        state ^= u64::from(octet);
        state = state.wrapping_mul(FNV_PRIME);
    }
    state
}
/// WGSL source text assembled at compile time: the contents of
/// `../wgsl/common_params.wgsl`, a newline separator, then the contents of
/// `wgsl/simhash64.wgsl` (paths relative to this source file).
// NOTE(review): presumably the GPU counterpart of `simhash64` — confirm against the shader.
pub const WGSL: &str = concat!(
include_str!("../wgsl/common_params.wgsl"),
"\n",
include_str!("wgsl/simhash64.wgsl"),
);