use crate::pipeline::StreamingPreprocessor;
#[inline]
fn hash_index(i: usize, seed: u64) -> u64 {
let mut h = seed ^ (i as u64);
h = h.wrapping_mul(0x517cc1b727220a95);
h ^= h >> 32;
h
}
const BUCKET_SEED: u64 = 0x9e3779b97f4a7c15; const SIGN_SEED: u64 = 0x6c62272e07bb0142;
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde-json", derive(serde::Serialize, serde::Deserialize))]
pub struct FeatureHasher {
n_buckets: usize,
}
impl FeatureHasher {
pub fn new(n_buckets: usize) -> Self {
assert!(n_buckets > 0, "FeatureHasher: n_buckets must be > 0");
Self { n_buckets }
}
pub fn n_buckets(&self) -> usize {
self.n_buckets
}
fn hash_features(&self, features: &[f64]) -> Vec<f64> {
let mut output = vec![0.0; self.n_buckets];
for (i, &val) in features.iter().enumerate() {
let bucket = (hash_index(i, BUCKET_SEED) % self.n_buckets as u64) as usize;
let sign = if hash_index(i, SIGN_SEED) & 1 == 0 {
1.0
} else {
-1.0
};
output[bucket] += sign * val;
}
output
}
}
impl StreamingPreprocessor for FeatureHasher {
fn update_and_transform(&mut self, features: &[f64]) -> Vec<f64> {
self.hash_features(features)
}
fn transform(&self, features: &[f64]) -> Vec<f64> {
self.hash_features(features)
}
fn output_dim(&self) -> Option<usize> {
Some(self.n_buckets)
}
fn reset(&mut self) {
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn output_dimension_matches_n_buckets() {
for &n in &[1, 4, 16, 128, 1024] {
let hasher = FeatureHasher::new(n);
assert_eq!(
hasher.output_dim(),
Some(n),
"output_dim should be Some({n}) for n_buckets={n}"
);
let input = vec![1.0; 50];
let out = hasher.transform(&input);
assert_eq!(
out.len(),
n,
"output length should be {n} for n_buckets={n}, got {}",
out.len()
);
}
}
#[test]
fn deterministic_hashing() {
let hasher = FeatureHasher::new(32);
let input = vec![1.0, -2.5, 3.3, 0.0, 7.7];
let out_a = hasher.transform(&input);
let out_b = hasher.transform(&input);
assert_eq!(
out_a, out_b,
"same input must always produce identical output"
);
}
#[test]
fn different_inputs_different_outputs() {
let hasher = FeatureHasher::new(16);
let out_a = hasher.transform(&[1.0, 0.0, 0.0]);
let out_b = hasher.transform(&[0.0, 0.0, 1.0]);
assert_ne!(
out_a, out_b,
"[1,0,0] and [0,0,1] should produce different hashed outputs"
);
}
#[test]
fn sign_preserves_inner_product_approx() {
let n_buckets = 8192;
let hasher = FeatureHasher::new(n_buckets);
let dim = 50;
let a: Vec<f64> = (0..dim).map(|i| i as f64 + 1.0).collect();
let b: Vec<f64> = (0..dim).map(|i| (i as f64 + 1.0) * 0.5).collect();
let true_ip: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let ha = hasher.transform(&a);
let hb = hasher.transform(&b);
let hashed_ip: f64 = ha.iter().zip(hb.iter()).map(|(x, y)| x * y).sum();
let rel_error = ((hashed_ip - true_ip) / true_ip).abs();
assert!(
rel_error < 0.15,
"hashed inner product ({hashed_ip:.6}) should approximate true inner product \
({true_ip:.6}), relative error = {rel_error:.6}"
);
}
#[test]
fn reset_is_noop() {
let mut hasher = FeatureHasher::new(16);
let input = vec![1.0, 2.0, 3.0, 4.0];
let before = hasher.transform(&input);
hasher.reset();
let after = hasher.transform(&input);
assert_eq!(
before, after,
"reset should not change transform output (hasher is stateless)"
);
}
#[test]
fn single_feature_maps_to_one_bucket() {
let hasher = FeatureHasher::new(32);
let out = hasher.transform(&[5.0]);
let nonzero_count = out.iter().filter(|&&v| v != 0.0).count();
assert_eq!(
nonzero_count, 1,
"a single input feature should map to exactly one bucket, \
but {nonzero_count} buckets were nonzero"
);
let nonzero_val = out.iter().find(|&&v| v != 0.0).unwrap();
assert!(
(*nonzero_val - 5.0).abs() < 1e-12 || (*nonzero_val + 5.0).abs() < 1e-12,
"single feature value 5.0 should map to +5.0 or -5.0, got {nonzero_val}"
);
}
#[test]
fn update_and_transform_equals_transform() {
let mut hasher = FeatureHasher::new(64);
let input = vec![0.5, -1.2, 3.7, 2.1, -0.8, 4.4];
let out_transform = hasher.transform(&input);
let out_update = hasher.update_and_transform(&input);
assert_eq!(
out_transform, out_update,
"update_and_transform and transform must return identical output (stateless hasher)"
);
}
#[test]
#[should_panic(expected = "n_buckets must be > 0")]
fn zero_buckets_panics() {
FeatureHasher::new(0);
}
#[test]
fn empty_input_returns_zero_vector() {
let hasher = FeatureHasher::new(8);
let out = hasher.transform(&[]);
assert_eq!(out.len(), 8, "output length should match n_buckets");
assert!(
out.iter().all(|&v| v == 0.0),
"hashing an empty feature vector should produce all zeros"
);
}
}