// ghostflow_ml/feature_engineering.rs

//! Feature Engineering Utilities
//!
//! Tools for creating and transforming features to improve model performance.

use ghostflow_core::Tensor;
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
9
/// Polynomial feature generator.
///
/// Expands each input row with polynomial and interaction terms.
/// Example: [a, b] with degree=2 -> [1, a, b, a², ab, b²]
pub struct PolynomialFeatures {
    /// Maximum total degree of the generated terms.
    pub degree: usize,
    /// When true, only products of *distinct* features are produced (no powers).
    pub interaction_only: bool,
    /// When true, a constant 1.0 bias column is included.
    pub include_bias: bool,
    // Learned by `fit`: number of columns in the input matrix.
    n_input_features: usize,
    // Learned by `fit`: expected number of columns after transformation.
    n_output_features: usize,
}
21
22impl PolynomialFeatures {
23    pub fn new(degree: usize) -> Self {
24        Self {
25            degree,
26            interaction_only: false,
27            include_bias: true,
28            n_input_features: 0,
29            n_output_features: 0,
30        }
31    }
32
33    pub fn interaction_only(mut self, value: bool) -> Self {
34        self.interaction_only = value;
35        self
36    }
37
38    pub fn include_bias(mut self, value: bool) -> Self {
39        self.include_bias = value;
40        self
41    }
42
43    /// Fit to determine output feature count
44    pub fn fit(&mut self, x: &Tensor) {
45        self.n_input_features = x.dims()[1];
46        self.n_output_features = self.calculate_n_output_features();
47    }
48
49    fn calculate_n_output_features(&self) -> usize {
50        let n = self.n_input_features;
51        let d = self.degree;
52        
53        let mut count = if self.include_bias { 1 } else { 0 };
54
55        if self.interaction_only {
56            // Only interaction terms
57            for degree in 1..=d {
58                count += Self::n_combinations(n, degree);
59            }
60        } else {
61            // All polynomial terms
62            count += Self::n_combinations_with_replacement(n + d, d) - 1;
63            if !self.include_bias {
64                count -= 1;
65            }
66        }
67
68        count
69    }
70
71    fn n_combinations(n: usize, k: usize) -> usize {
72        if k > n {
73            return 0;
74        }
75        let mut result = 1;
76        for i in 0..k {
77            result = result * (n - i) / (i + 1);
78        }
79        result
80    }
81
82    fn n_combinations_with_replacement(n: usize, k: usize) -> usize {
83        Self::n_combinations(n + k - 1, k)
84    }
85
86    /// Transform features to polynomial features
87    pub fn transform(&self, x: &Tensor) -> Tensor {
88        let n_samples = x.dims()[0];
89        let x_data = x.data_f32();
90        
91        let mut all_features = Vec::new();
92        let mut actual_n_features = 0;
93
94        for i in 0..n_samples {
95            let sample = &x_data[i * self.n_input_features..(i + 1) * self.n_input_features];
96            let poly_features = self.generate_polynomial_features(sample);
97            if i == 0 {
98                actual_n_features = poly_features.len();
99            }
100            all_features.extend(poly_features);
101        }
102
103        Tensor::from_slice(&all_features, &[n_samples, actual_n_features]).unwrap()
104    }
105
106    fn generate_polynomial_features(&self, sample: &[f32]) -> Vec<f32> {
107        let mut features = Vec::new();
108
109        if self.include_bias {
110            features.push(1.0);
111        }
112
113        // Generate all combinations up to degree
114        self.generate_combinations(sample, &mut features, &mut Vec::new(), 0, 0);
115
116        features
117    }
118
119    fn generate_combinations(
120        &self,
121        sample: &[f32],
122        features: &mut Vec<f32>,
123        current: &mut Vec<usize>,
124        start: usize,
125        current_degree: usize,
126    ) {
127        if current_degree > 0 {
128            // Calculate product of current combination
129            let mut product = 1.0;
130            for &idx in current.iter() {
131                product *= sample[idx];
132            }
133            features.push(product);
134        }
135
136        if current_degree >= self.degree {
137            return;
138        }
139
140        for i in start..self.n_input_features {
141            current.push(i);
142            
143            let next_start = if self.interaction_only { i + 1 } else { i };
144            self.generate_combinations(sample, features, current, next_start, current_degree + 1);
145            
146            current.pop();
147        }
148    }
149
150    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
151        self.fit(x);
152        self.transform(x)
153    }
154}
155
/// Feature hashing ("the hashing trick").
///
/// Projects arbitrarily many sparse features into a fixed number of buckets
/// via hashing — handy for high-dimensional sparse inputs.
pub struct FeatureHasher {
    /// Number of output buckets (columns of the hashed tensor).
    pub n_features: usize,
    /// When true, collisions are spread across +1/-1 contributions.
    pub alternate_sign: bool,
}
164
165impl FeatureHasher {
166    pub fn new(n_features: usize) -> Self {
167        Self {
168            n_features,
169            alternate_sign: true,
170        }
171    }
172
173    pub fn alternate_sign(mut self, value: bool) -> Self {
174        self.alternate_sign = value;
175        self
176    }
177
178    /// Transform string features to hashed features
179    pub fn transform_strings(&self, features: &[Vec<String>]) -> Tensor {
180        let n_samples = features.len();
181        let mut output = vec![0.0f32; n_samples * self.n_features];
182
183        for (i, sample_features) in features.iter().enumerate() {
184            for feature in sample_features {
185                let hash = self.hash_feature(feature);
186                let idx = (hash % self.n_features as u64) as usize;
187                let sign = if self.alternate_sign && (hash / self.n_features as u64) % 2 == 1 {
188                    -1.0
189                } else {
190                    1.0
191                };
192                output[i * self.n_features + idx] += sign;
193            }
194        }
195
196        Tensor::from_slice(&output, &[n_samples, self.n_features]).unwrap()
197    }
198
199    /// Transform feature-value pairs
200    pub fn transform_pairs(&self, features: &[Vec<(String, f32)>]) -> Tensor {
201        let n_samples = features.len();
202        let mut output = vec![0.0f32; n_samples * self.n_features];
203
204        for (i, sample_features) in features.iter().enumerate() {
205            for (feature, value) in sample_features {
206                let hash = self.hash_feature(feature);
207                let idx = (hash % self.n_features as u64) as usize;
208                let sign = if self.alternate_sign && (hash / self.n_features as u64) % 2 == 1 {
209                    -1.0
210                } else {
211                    1.0
212                };
213                output[i * self.n_features + idx] += sign * value;
214            }
215        }
216
217        Tensor::from_slice(&output, &[n_samples, self.n_features]).unwrap()
218    }
219
220    fn hash_feature(&self, feature: &str) -> u64 {
221        let mut hasher = DefaultHasher::new();
222        feature.hash(&mut hasher);
223        hasher.finish()
224    }
225}
226
/// Target (mean) encoder for categorical features.
///
/// Replaces each category with a smoothed mean of the target variable —
/// compact for high-cardinality categoricals, with smoothing toward the
/// global mean to damp rare-category noise.
pub struct TargetEncoder {
    /// Strength of the pull toward the global mean (higher = more smoothing).
    pub smoothing: f32,
    /// Categories seen fewer than this many times get no dedicated encoding
    /// and fall back to the global mean at transform time.
    pub min_samples_leaf: usize,
    // category -> smoothed target mean, learned by `fit`.
    encodings: HashMap<String, f32>,
    // Mean of the full target vector; fallback for unseen/rare categories.
    global_mean: f32,
}

impl Default for TargetEncoder {
    fn default() -> Self {
        Self::new()
    }
}

impl TargetEncoder {
    /// Create an encoder with smoothing = 1.0 and min_samples_leaf = 1.
    pub fn new() -> Self {
        Self {
            smoothing: 1.0,
            min_samples_leaf: 1,
            encodings: HashMap::new(),
            global_mean: 0.0,
        }
    }

    /// Builder-style setter for the smoothing strength.
    pub fn smoothing(mut self, value: f32) -> Self {
        self.smoothing = value;
        self
    }

    /// Builder-style setter for the minimum category frequency.
    pub fn min_samples_leaf(mut self, value: usize) -> Self {
        self.min_samples_leaf = value;
        self
    }

    /// Learn per-category smoothed target means.
    ///
    /// # Panics
    /// Panics if `categories` and `target` differ in length.
    pub fn fit(&mut self, categories: &[String], target: &[f32]) {
        assert_eq!(categories.len(), target.len());

        // BUG FIX: refitting previously accumulated stale encodings from the
        // prior fit; start from a clean slate.
        self.encodings.clear();

        // BUG FIX: an empty target used to yield 0.0 / 0.0 = NaN as the
        // global mean, which then leaked out of `transform` as the fallback.
        if target.is_empty() {
            self.global_mean = 0.0;
            return;
        }

        self.global_mean = target.iter().sum::<f32>() / target.len() as f32;

        // Accumulate (sum, count) per category.
        let mut category_stats: HashMap<String, (f32, usize)> = HashMap::new();
        for (cat, &tgt) in categories.iter().zip(target.iter()) {
            let entry = category_stats.entry(cat.clone()).or_insert((0.0, 0));
            entry.0 += tgt;
            entry.1 += 1;
        }

        // Shrink each category mean toward the global mean:
        // (count * category_mean + smoothing * global_mean) / (count + smoothing)
        for (category, (sum, count)) in category_stats {
            if count >= self.min_samples_leaf {
                let category_mean = sum / count as f32;
                let smoothed = (count as f32 * category_mean + self.smoothing * self.global_mean)
                    / (count as f32 + self.smoothing);
                self.encodings.insert(category, smoothed);
            }
        }
    }

    /// Map categories to their learned encodings; unseen or rare categories
    /// fall back to the global mean.
    pub fn transform(&self, categories: &[String]) -> Vec<f32> {
        categories
            .iter()
            .map(|cat| *self.encodings.get(cat).unwrap_or(&self.global_mean))
            .collect()
    }

    /// Convenience: `fit` followed by `transform` on the same data.
    pub fn fit_transform(&mut self, categories: &[String], target: &[f32]) -> Vec<f32> {
        self.fit(categories, target);
        self.transform(categories)
    }
}
301
/// One-hot encoder.
///
/// Turns columns of categorical strings into concatenated one-hot vectors.
pub struct OneHotEncoder {
    // Per input column: the sorted unique category values seen during `fit`.
    categories: Vec<Vec<String>>,
    // Total output width: sum of category counts over all columns.
    n_features: usize,
}
309
310impl OneHotEncoder {
311    pub fn new() -> Self {
312        Self {
313            categories: Vec::new(),
314            n_features: 0,
315        }
316    }
317
318    /// Fit encoder to learn categories
319    pub fn fit(&mut self, data: &[Vec<String>]) {
320        if data.is_empty() {
321            return;
322        }
323
324        let n_cols = data[0].len();
325        self.categories = vec![Vec::new(); n_cols];
326
327        // Collect unique categories for each column
328        for sample in data {
329            for (col_idx, value) in sample.iter().enumerate() {
330                if !self.categories[col_idx].contains(value) {
331                    self.categories[col_idx].push(value.clone());
332                }
333            }
334        }
335
336        // Sort categories for consistency
337        for cats in &mut self.categories {
338            cats.sort();
339        }
340
341        // Calculate total number of features
342        self.n_features = self.categories.iter().map(|cats| cats.len()).sum();
343    }
344
345    /// Transform categorical data to one-hot encoded
346    pub fn transform(&self, data: &[Vec<String>]) -> Tensor {
347        let n_samples = data.len();
348        let mut output = vec![0.0f32; n_samples * self.n_features];
349
350        for (sample_idx, sample) in data.iter().enumerate() {
351            let mut feature_offset = 0;
352
353            for (col_idx, value) in sample.iter().enumerate() {
354                if let Some(cat_idx) = self.categories[col_idx].iter().position(|c| c == value) {
355                    let output_idx = sample_idx * self.n_features + feature_offset + cat_idx;
356                    output[output_idx] = 1.0;
357                }
358                feature_offset += self.categories[col_idx].len();
359            }
360        }
361
362        Tensor::from_slice(&output, &[n_samples, self.n_features]).unwrap()
363    }
364
365    pub fn fit_transform(&mut self, data: &[Vec<String>]) -> Tensor {
366        self.fit(data);
367        self.transform(data)
368    }
369
370    /// Get feature names for the one-hot encoded output
371    pub fn get_feature_names(&self) -> Vec<String> {
372        let mut names = Vec::new();
373
374        for (col_idx, cats) in self.categories.iter().enumerate() {
375            for cat in cats {
376                names.push(format!("col{}_{}", col_idx, cat));
377            }
378        }
379
380        names
381    }
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_polynomial_features() {
        let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0], &[2, 2]).unwrap();

        let mut poly = PolynomialFeatures::new(2);
        let transformed = poly.fit_transform(&x);

        // 2 inputs at degree 2 with bias expand to exactly 6 columns:
        // [1, a, a², ab, b, b²].
        assert_eq!(transformed.dims(), &[2, 6]);
    }

    #[test]
    fn test_feature_hasher() {
        let features = vec![
            vec!["feature1".to_string(), "feature2".to_string()],
            vec!["feature3".to_string()],
        ];

        let hasher = FeatureHasher::new(10);
        let hashed = hasher.transform_strings(&features);

        assert_eq!(hashed.dims(), &[2, 10]);
    }

    #[test]
    fn test_target_encoder() {
        let categories = vec![
            "A".to_string(),
            "B".to_string(),
            "A".to_string(),
            "B".to_string(),
        ];
        let target = vec![1.0, 0.0, 1.0, 0.0];

        let mut encoder = TargetEncoder::new();
        let encoded = encoder.fit_transform(&categories, &target);

        assert_eq!(encoded.len(), 4);
        // global mean 0.5; default smoothing 1.0:
        // A -> (2*1.0 + 0.5)/3, B -> (2*0.0 + 0.5)/3
        assert!((encoded[0] - 2.5f32 / 3.0).abs() < 1e-6);
        assert!((encoded[1] - 0.5f32 / 3.0).abs() < 1e-6);
    }

    #[test]
    fn test_one_hot_encoder() {
        let data = vec![
            vec!["A".to_string(), "X".to_string()],
            vec!["B".to_string(), "Y".to_string()],
            vec!["A".to_string(), "X".to_string()],
        ];

        let mut encoder = OneHotEncoder::new();
        let encoded = encoder.fit_transform(&data);

        // 2 categories in col0 + 2 categories in col1 = 4 features.
        assert_eq!(encoded.dims(), &[3, 4]);

        // Sorted categories: col0 = [A, B], col1 = [X, Y].
        let values = encoded.data_f32();
        assert_eq!(&values[0..4], &[1.0, 0.0, 1.0, 0.0]); // [A, X]
        assert_eq!(&values[4..8], &[0.0, 1.0, 0.0, 1.0]); // [B, Y]
        assert_eq!(&values[8..12], &[1.0, 0.0, 1.0, 0.0]); // [A, X]
    }
}
444
445