irithyll/preprocessing/
feature_hasher.rs1use crate::pipeline::StreamingPreprocessor;
26
27#[inline]
36fn hash_index(i: usize, seed: u64) -> u64 {
37 let mut h = seed ^ (i as u64);
38 h = h.wrapping_mul(0x517cc1b727220a95);
39 h ^= h >> 32;
40 h
41}
42
43const BUCKET_SEED: u64 = 0x9e3779b97f4a7c15; const SIGN_SEED: u64 = 0x6c62272e07bb0142; #[derive(Clone, Debug)]
69#[cfg_attr(feature = "serde-json", derive(serde::Serialize, serde::Deserialize))]
70pub struct FeatureHasher {
71 n_buckets: usize,
73}
74
75impl FeatureHasher {
76 pub fn new(n_buckets: usize) -> Self {
88 assert!(n_buckets > 0, "FeatureHasher: n_buckets must be > 0");
89 Self { n_buckets }
90 }
91
92 pub fn n_buckets(&self) -> usize {
94 self.n_buckets
95 }
96
97 fn hash_features(&self, features: &[f64]) -> Vec<f64> {
99 let mut output = vec![0.0; self.n_buckets];
100 for (i, &val) in features.iter().enumerate() {
101 let bucket = (hash_index(i, BUCKET_SEED) % self.n_buckets as u64) as usize;
102 let sign = if hash_index(i, SIGN_SEED) & 1 == 0 {
103 1.0
104 } else {
105 -1.0
106 };
107 output[bucket] += sign * val;
108 }
109 output
110 }
111}
112
113impl StreamingPreprocessor for FeatureHasher {
118 fn update_and_transform(&mut self, features: &[f64]) -> Vec<f64> {
119 self.hash_features(features)
121 }
122
123 fn transform(&self, features: &[f64]) -> Vec<f64> {
124 self.hash_features(features)
125 }
126
127 fn output_dim(&self) -> Option<usize> {
128 Some(self.n_buckets)
129 }
130
131 fn reset(&mut self) {
132 }
134}
135
136#[cfg(test)]
141mod tests {
142 use super::*;
143
144 #[test]
145 fn output_dimension_matches_n_buckets() {
146 for &n in &[1, 4, 16, 128, 1024] {
147 let hasher = FeatureHasher::new(n);
148 assert_eq!(
149 hasher.output_dim(),
150 Some(n),
151 "output_dim should be Some({n}) for n_buckets={n}"
152 );
153
154 let input = vec![1.0; 50];
155 let out = hasher.transform(&input);
156 assert_eq!(
157 out.len(),
158 n,
159 "output length should be {n} for n_buckets={n}, got {}",
160 out.len()
161 );
162 }
163 }
164
165 #[test]
166 fn deterministic_hashing() {
167 let hasher = FeatureHasher::new(32);
168 let input = vec![1.0, -2.5, 3.3, 0.0, 7.7];
169
170 let out_a = hasher.transform(&input);
171 let out_b = hasher.transform(&input);
172
173 assert_eq!(
174 out_a, out_b,
175 "same input must always produce identical output"
176 );
177 }
178
179 #[test]
180 fn different_inputs_different_outputs() {
181 let hasher = FeatureHasher::new(16);
182
183 let out_a = hasher.transform(&[1.0, 0.0, 0.0]);
184 let out_b = hasher.transform(&[0.0, 0.0, 1.0]);
185
186 assert_ne!(
187 out_a, out_b,
188 "[1,0,0] and [0,0,1] should produce different hashed outputs"
189 );
190 }
191
192 #[test]
193 fn sign_preserves_inner_product_approx() {
194 let n_buckets = 8192;
202 let hasher = FeatureHasher::new(n_buckets);
203
204 let dim = 50;
206 let a: Vec<f64> = (0..dim).map(|i| i as f64 + 1.0).collect();
207 let b: Vec<f64> = (0..dim).map(|i| (i as f64 + 1.0) * 0.5).collect();
208
209 let true_ip: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
210 let ha = hasher.transform(&a);
213 let hb = hasher.transform(&b);
214 let hashed_ip: f64 = ha.iter().zip(hb.iter()).map(|(x, y)| x * y).sum();
215
216 let rel_error = ((hashed_ip - true_ip) / true_ip).abs();
217
218 assert!(
219 rel_error < 0.15,
220 "hashed inner product ({hashed_ip:.6}) should approximate true inner product \
221 ({true_ip:.6}), relative error = {rel_error:.6}"
222 );
223 }
224
225 #[test]
226 fn reset_is_noop() {
227 let mut hasher = FeatureHasher::new(16);
228 let input = vec![1.0, 2.0, 3.0, 4.0];
229
230 let before = hasher.transform(&input);
231 hasher.reset();
232 let after = hasher.transform(&input);
233
234 assert_eq!(
235 before, after,
236 "reset should not change transform output (hasher is stateless)"
237 );
238 }
239
240 #[test]
241 fn single_feature_maps_to_one_bucket() {
242 let hasher = FeatureHasher::new(32);
243 let out = hasher.transform(&[5.0]);
244
245 let nonzero_count = out.iter().filter(|&&v| v != 0.0).count();
246 assert_eq!(
247 nonzero_count, 1,
248 "a single input feature should map to exactly one bucket, \
249 but {nonzero_count} buckets were nonzero"
250 );
251
252 let nonzero_val = out.iter().find(|&&v| v != 0.0).unwrap();
254 assert!(
255 (*nonzero_val - 5.0).abs() < 1e-12 || (*nonzero_val + 5.0).abs() < 1e-12,
256 "single feature value 5.0 should map to +5.0 or -5.0, got {nonzero_val}"
257 );
258 }
259
260 #[test]
261 fn update_and_transform_equals_transform() {
262 let mut hasher = FeatureHasher::new(64);
263 let input = vec![0.5, -1.2, 3.7, 2.1, -0.8, 4.4];
264
265 let out_transform = hasher.transform(&input);
266 let out_update = hasher.update_and_transform(&input);
267
268 assert_eq!(
269 out_transform, out_update,
270 "update_and_transform and transform must return identical output (stateless hasher)"
271 );
272 }
273
274 #[test]
275 #[should_panic(expected = "n_buckets must be > 0")]
276 fn zero_buckets_panics() {
277 FeatureHasher::new(0);
278 }
279
280 #[test]
281 fn empty_input_returns_zero_vector() {
282 let hasher = FeatureHasher::new(8);
283 let out = hasher.transform(&[]);
284 assert_eq!(out.len(), 8, "output length should match n_buckets");
285 assert!(
286 out.iter().all(|&v| v == 0.0),
287 "hashing an empty feature vector should produce all zeros"
288 );
289 }
290}