Skip to main content

irithyll/preprocessing/
feature_hasher.rs

1//! Feature hashing (hashing trick) for dimensionality reduction.
2//!
3//! [`FeatureHasher`] maps a d-dimensional input to a fixed `n_buckets`-dimensional
4//! output using the hashing trick (Weinberger et al., 2009). Each input feature
5//! index is hashed to a bucket, and a separate sign hash determines +/-1 to
6//! preserve inner products in expectation.
7//!
8//! The transform is completely stateless -- there is nothing to learn, so
9//! `update_and_transform` and `transform` produce identical results, and
10//! `reset` is a no-op.
11//!
12//! # Example
13//!
14//! ```
15//! use irithyll::preprocessing::FeatureHasher;
16//! use irithyll::pipeline::StreamingPreprocessor;
17//!
18//! let hasher = FeatureHasher::new(8);
19//! assert_eq!(hasher.n_buckets(), 8);
20//!
21//! let out = hasher.transform(&[1.0, 2.0, 3.0, 4.0, 5.0]);
22//! assert_eq!(out.len(), 8);
23//! ```
24
25use crate::pipeline::StreamingPreprocessor;
26
27// ---------------------------------------------------------------------------
28// Hash helpers
29// ---------------------------------------------------------------------------
30
31/// Multiply-xorshift hash for mapping feature indices to buckets/signs.
32///
33/// Two different seeds yield two independent hash functions, one for the
34/// bucket assignment and one for the sign selection.
35#[inline]
36fn hash_index(i: usize, seed: u64) -> u64 {
37    let mut h = seed ^ (i as u64);
38    h = h.wrapping_mul(0x517cc1b727220a95);
39    h ^= h >> 32;
40    h
41}
42
43/// Seed used for the bucket hash.
44const BUCKET_SEED: u64 = 0x9e3779b97f4a7c15; // golden-ratio derived
45/// Seed used for the sign hash.
46const SIGN_SEED: u64 = 0x6c62272e07bb0142; // different constant
47
48// ---------------------------------------------------------------------------
49// FeatureHasher
50// ---------------------------------------------------------------------------
51
52/// Feature hashing (hashing trick) preprocessor.
53///
54/// Maps an arbitrary-length input feature vector to a fixed-size output of
55/// `n_buckets` dimensions. For each input feature at index `i`:
56///
57/// 1. A **bucket** is chosen: `bucket = hash(i, BUCKET_SEED) % n_buckets`.
58/// 2. A **sign** is chosen: `sign = if hash(i, SIGN_SEED) & 1 == 0 { +1 } else { -1 }`.
59/// 3. The output accumulates: `output[bucket] += sign * features[i]`.
60///
61/// The sign-flipping ensures that the inner product of two hashed vectors is
62/// an unbiased estimator of the original inner product (Weinberger et al., 2009).
63///
64/// Because the transform is purely deterministic and stateless, both
65/// [`update_and_transform`](StreamingPreprocessor::update_and_transform) and
66/// [`transform`](StreamingPreprocessor::transform) behave identically, and
67/// [`reset`](StreamingPreprocessor::reset) is a no-op.
68#[derive(Clone, Debug)]
69#[cfg_attr(feature = "serde-json", derive(serde::Serialize, serde::Deserialize))]
70pub struct FeatureHasher {
71    /// Number of output buckets (output dimensionality).
72    n_buckets: usize,
73}
74
75impl FeatureHasher {
76    /// Create a new feature hasher with the given number of output buckets.
77    ///
78    /// # Panics
79    ///
80    /// Panics if `n_buckets` is zero.
81    ///
82    /// ```
83    /// use irithyll::preprocessing::FeatureHasher;
84    /// let hasher = FeatureHasher::new(16);
85    /// assert_eq!(hasher.n_buckets(), 16);
86    /// ```
87    pub fn new(n_buckets: usize) -> Self {
88        assert!(n_buckets > 0, "FeatureHasher: n_buckets must be > 0");
89        Self { n_buckets }
90    }
91
92    /// Number of output buckets (output dimensionality).
93    pub fn n_buckets(&self) -> usize {
94        self.n_buckets
95    }
96
97    /// Core hashing logic shared by both trait methods.
98    fn hash_features(&self, features: &[f64]) -> Vec<f64> {
99        let mut output = vec![0.0; self.n_buckets];
100        for (i, &val) in features.iter().enumerate() {
101            let bucket = (hash_index(i, BUCKET_SEED) % self.n_buckets as u64) as usize;
102            let sign = if hash_index(i, SIGN_SEED) & 1 == 0 {
103                1.0
104            } else {
105                -1.0
106            };
107            output[bucket] += sign * val;
108        }
109        output
110    }
111}
112
113// ---------------------------------------------------------------------------
114// StreamingPreprocessor impl
115// ---------------------------------------------------------------------------
116
117impl StreamingPreprocessor for FeatureHasher {
118    fn update_and_transform(&mut self, features: &[f64]) -> Vec<f64> {
119        // Stateless -- identical to transform.
120        self.hash_features(features)
121    }
122
123    fn transform(&self, features: &[f64]) -> Vec<f64> {
124        self.hash_features(features)
125    }
126
127    fn output_dim(&self) -> Option<usize> {
128        Some(self.n_buckets)
129    }
130
131    fn reset(&mut self) {
132        // No-op: the hasher is entirely stateless.
133    }
134}
135
136// ---------------------------------------------------------------------------
137// Tests
138// ---------------------------------------------------------------------------
139
140#[cfg(test)]
141mod tests {
142    use super::*;
143
144    #[test]
145    fn output_dimension_matches_n_buckets() {
146        for &n in &[1, 4, 16, 128, 1024] {
147            let hasher = FeatureHasher::new(n);
148            assert_eq!(
149                hasher.output_dim(),
150                Some(n),
151                "output_dim should be Some({n}) for n_buckets={n}"
152            );
153
154            let input = vec![1.0; 50];
155            let out = hasher.transform(&input);
156            assert_eq!(
157                out.len(),
158                n,
159                "output length should be {n} for n_buckets={n}, got {}",
160                out.len()
161            );
162        }
163    }
164
165    #[test]
166    fn deterministic_hashing() {
167        let hasher = FeatureHasher::new(32);
168        let input = vec![1.0, -2.5, 3.3, 0.0, 7.7];
169
170        let out_a = hasher.transform(&input);
171        let out_b = hasher.transform(&input);
172
173        assert_eq!(
174            out_a, out_b,
175            "same input must always produce identical output"
176        );
177    }
178
179    #[test]
180    fn different_inputs_different_outputs() {
181        let hasher = FeatureHasher::new(16);
182
183        let out_a = hasher.transform(&[1.0, 0.0, 0.0]);
184        let out_b = hasher.transform(&[0.0, 0.0, 1.0]);
185
186        assert_ne!(
187            out_a, out_b,
188            "[1,0,0] and [0,0,1] should produce different hashed outputs"
189        );
190    }
191
192    #[test]
193    fn sign_preserves_inner_product_approx() {
194        // With a large number of buckets (>> input dim), the inner product of
195        // hashed vectors should approximate the inner product of the originals.
196        //
197        // We use vectors with a substantial inner product so that relative
198        // error is a meaningful measure. The hashing trick is an unbiased
199        // estimator -- collisions add noise, but with buckets >> dim the
200        // variance is low.
201        let n_buckets = 8192;
202        let hasher = FeatureHasher::new(n_buckets);
203
204        // Construct two vectors with a large, well-defined inner product.
205        let dim = 50;
206        let a: Vec<f64> = (0..dim).map(|i| i as f64 + 1.0).collect();
207        let b: Vec<f64> = (0..dim).map(|i| (i as f64 + 1.0) * 0.5).collect();
208
209        let true_ip: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
210        // true_ip = sum_{i=1}^{50} i * (i * 0.5) = 0.5 * sum(i^2) = 0.5 * 42925 = 21462.5
211
212        let ha = hasher.transform(&a);
213        let hb = hasher.transform(&b);
214        let hashed_ip: f64 = ha.iter().zip(hb.iter()).map(|(x, y)| x * y).sum();
215
216        let rel_error = ((hashed_ip - true_ip) / true_ip).abs();
217
218        assert!(
219            rel_error < 0.15,
220            "hashed inner product ({hashed_ip:.6}) should approximate true inner product \
221             ({true_ip:.6}), relative error = {rel_error:.6}"
222        );
223    }
224
225    #[test]
226    fn reset_is_noop() {
227        let mut hasher = FeatureHasher::new(16);
228        let input = vec![1.0, 2.0, 3.0, 4.0];
229
230        let before = hasher.transform(&input);
231        hasher.reset();
232        let after = hasher.transform(&input);
233
234        assert_eq!(
235            before, after,
236            "reset should not change transform output (hasher is stateless)"
237        );
238    }
239
240    #[test]
241    fn single_feature_maps_to_one_bucket() {
242        let hasher = FeatureHasher::new(32);
243        let out = hasher.transform(&[5.0]);
244
245        let nonzero_count = out.iter().filter(|&&v| v != 0.0).count();
246        assert_eq!(
247            nonzero_count, 1,
248            "a single input feature should map to exactly one bucket, \
249             but {nonzero_count} buckets were nonzero"
250        );
251
252        // The nonzero value should be +5.0 or -5.0 (sign hash).
253        let nonzero_val = out.iter().find(|&&v| v != 0.0).unwrap();
254        assert!(
255            (*nonzero_val - 5.0).abs() < 1e-12 || (*nonzero_val + 5.0).abs() < 1e-12,
256            "single feature value 5.0 should map to +5.0 or -5.0, got {nonzero_val}"
257        );
258    }
259
260    #[test]
261    fn update_and_transform_equals_transform() {
262        let mut hasher = FeatureHasher::new(64);
263        let input = vec![0.5, -1.2, 3.7, 2.1, -0.8, 4.4];
264
265        let out_transform = hasher.transform(&input);
266        let out_update = hasher.update_and_transform(&input);
267
268        assert_eq!(
269            out_transform, out_update,
270            "update_and_transform and transform must return identical output (stateless hasher)"
271        );
272    }
273
274    #[test]
275    #[should_panic(expected = "n_buckets must be > 0")]
276    fn zero_buckets_panics() {
277        FeatureHasher::new(0);
278    }
279
280    #[test]
281    fn empty_input_returns_zero_vector() {
282        let hasher = FeatureHasher::new(8);
283        let out = hasher.transform(&[]);
284        assert_eq!(out.len(), 8, "output length should match n_buckets");
285        assert!(
286            out.iter().all(|&v| v == 0.0),
287            "hashing an empty feature vector should produce all zeros"
288        );
289    }
290}