// ghostflow_ml/ensemble.rs

1//! Ensemble methods - Random Forest and Gradient Boosting
2
3use ghostflow_core::Tensor;
4use crate::tree::{DecisionTreeClassifier, DecisionTreeRegressor, Criterion};
5use rayon::prelude::*;
6use rand::prelude::*;
7
/// Random Forest Classifier
///
/// An ensemble of [`DecisionTreeClassifier`]s, each trained on a bootstrap
/// sample of the data; predictions average the per-tree class probabilities.
pub struct RandomForestClassifier {
    /// Number of trees in the forest.
    pub n_estimators: usize,
    /// Maximum depth of each tree; `None` means unlimited.
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Minimum number of samples required at a leaf node.
    pub min_samples_leaf: usize,
    /// Features considered per split; `None` defaults to sqrt(n_features) in `fit`.
    pub max_features: Option<usize>,
    /// Sample with replacement when building each tree's training set.
    pub bootstrap: bool,
    /// NOTE(review): currently unused — training always runs on rayon's
    /// default thread pool; confirm intent before exposing this further.
    pub n_jobs: Option<usize>,
    // Fitted trees; populated by `fit`.
    trees: Vec<DecisionTreeClassifier>,
    // Number of classes inferred from the labels during `fit` (max label + 1).
    n_classes: usize,
}
20
21impl RandomForestClassifier {
22    pub fn new(n_estimators: usize) -> Self {
23        RandomForestClassifier {
24            n_estimators,
25            max_depth: None,
26            min_samples_split: 2,
27            min_samples_leaf: 1,
28            max_features: None, // sqrt(n_features) by default
29            bootstrap: true,
30            n_jobs: None,
31            trees: Vec::new(),
32            n_classes: 0,
33        }
34    }
35
36    pub fn max_depth(mut self, depth: usize) -> Self {
37        self.max_depth = Some(depth);
38        self
39    }
40
41    pub fn min_samples_split(mut self, n: usize) -> Self {
42        self.min_samples_split = n;
43        self
44    }
45
46    pub fn max_features(mut self, n: usize) -> Self {
47        self.max_features = Some(n);
48        self
49    }
50
51    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
52        let n_samples = x.dims()[0];
53        let n_features = x.dims()[1];
54        
55        // Determine number of classes
56        let y_data = y.data_f32();
57        self.n_classes = y_data.iter()
58            .map(|&v| v as usize)
59            .max()
60            .unwrap_or(0) + 1;
61        
62        // Default max_features = sqrt(n_features)
63        let max_features = self.max_features
64            .unwrap_or_else(|| (n_features as f32).sqrt() as usize);
65        
66        // Build trees in parallel
67        self.trees = (0..self.n_estimators)
68            .into_par_iter()
69            .map(|_| {
70                let mut rng = thread_rng();
71                
72                // Bootstrap sampling
73                let indices: Vec<usize> = if self.bootstrap {
74                    (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
75                } else {
76                    (0..n_samples).collect()
77                };
78                
79                // Create bootstrap sample
80                let x_data = x.data_f32();
81                let y_data = y.data_f32();
82                
83                let x_boot: Vec<f32> = indices.iter()
84                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
85                    .collect();
86                let y_boot: Vec<f32> = indices.iter()
87                    .map(|&i| y_data[i])
88                    .collect();
89                
90                let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
91                let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
92                
93                // Build tree
94                let mut tree = DecisionTreeClassifier::new()
95                    .criterion(Criterion::Gini);
96                
97                if let Some(depth) = self.max_depth {
98                    tree = tree.max_depth(depth);
99                }
100                tree = tree.min_samples_split(self.min_samples_split)
101                    .min_samples_leaf(self.min_samples_leaf);
102                
103                tree.max_features = Some(max_features);
104                tree.fit(&x_tensor, &y_tensor);
105                tree
106            })
107            .collect();
108    }
109
110    pub fn predict(&self, x: &Tensor) -> Tensor {
111        let proba = self.predict_proba(x);
112        let proba_data = proba.data_f32();
113        let n_samples = x.dims()[0];
114        
115        let predictions: Vec<f32> = (0..n_samples)
116            .map(|i| {
117                let start = i * self.n_classes;
118                let sample_probs = &proba_data[start..start + self.n_classes];
119                sample_probs.iter()
120                    .enumerate()
121                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
122                    .map(|(idx, _)| idx as f32)
123                    .unwrap_or(0.0)
124            })
125            .collect();
126        
127        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
128    }
129
130    pub fn predict_proba(&self, x: &Tensor) -> Tensor {
131        let n_samples = x.dims()[0];
132        
133        // Collect predictions from all trees
134        let all_probs: Vec<Vec<f32>> = self.trees.par_iter()
135            .map(|tree| tree.predict_proba(x).data_f32())
136            .collect();
137        
138        // Average probabilities
139        let mut avg_probs = vec![0.0f32; n_samples * self.n_classes];
140        
141        for probs in &all_probs {
142            for (i, &p) in probs.iter().enumerate() {
143                avg_probs[i] += p;
144            }
145        }
146        
147        let n_trees = self.trees.len() as f32;
148        for p in &mut avg_probs {
149            *p /= n_trees;
150        }
151        
152        Tensor::from_slice(&avg_probs, &[n_samples, self.n_classes]).unwrap()
153    }
154
155    /// Feature importance based on impurity decrease
156    pub fn feature_importances(&self, n_features: usize) -> Vec<f32> {
157        let mut importances = vec![0.0f32; n_features];
158        
159        // This would require tracking impurity decrease during tree building
160        // For now, return uniform importances
161        let uniform = 1.0 / n_features as f32;
162        importances.iter_mut().for_each(|x| *x = uniform);
163        
164        importances
165    }
166}
167
/// Random Forest Regressor
///
/// An ensemble of [`DecisionTreeRegressor`]s, each trained on a bootstrap
/// sample; predictions are the mean of the per-tree predictions.
pub struct RandomForestRegressor {
    /// Number of trees in the forest.
    pub n_estimators: usize,
    /// Maximum depth of each tree; `None` means unlimited.
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Minimum number of samples required at a leaf node.
    /// NOTE(review): not forwarded to the trees by `fit` — verify intent.
    pub min_samples_leaf: usize,
    /// Features considered per split; `None` defaults to n_features / 3 in `fit`.
    pub max_features: Option<usize>,
    /// Sample with replacement when building each tree's training set.
    pub bootstrap: bool,
    // Fitted trees; populated by `fit`.
    trees: Vec<DecisionTreeRegressor>,
}
178
179impl RandomForestRegressor {
180    pub fn new(n_estimators: usize) -> Self {
181        RandomForestRegressor {
182            n_estimators,
183            max_depth: None,
184            min_samples_split: 2,
185            min_samples_leaf: 1,
186            max_features: None,
187            bootstrap: true,
188            trees: Vec::new(),
189        }
190    }
191
192    pub fn max_depth(mut self, depth: usize) -> Self {
193        self.max_depth = Some(depth);
194        self
195    }
196
197    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
198        let n_samples = x.dims()[0];
199        let n_features = x.dims()[1];
200        
201        let max_features = self.max_features
202            .unwrap_or(n_features / 3);
203        
204        self.trees = (0..self.n_estimators)
205            .into_par_iter()
206            .map(|_| {
207                let mut rng = thread_rng();
208                
209                let indices: Vec<usize> = if self.bootstrap {
210                    (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
211                } else {
212                    (0..n_samples).collect()
213                };
214                
215                let x_data = x.data_f32();
216                let y_data = y.data_f32();
217                
218                let x_boot: Vec<f32> = indices.iter()
219                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
220                    .collect();
221                let y_boot: Vec<f32> = indices.iter()
222                    .map(|&i| y_data[i])
223                    .collect();
224                
225                let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
226                let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
227                
228                let mut tree = DecisionTreeRegressor::new();
229                if let Some(depth) = self.max_depth {
230                    tree = tree.max_depth(depth);
231                }
232                tree.max_features = Some(max_features);
233                tree.fit(&x_tensor, &y_tensor);
234                tree
235            })
236            .collect();
237    }
238
239    pub fn predict(&self, x: &Tensor) -> Tensor {
240        let n_samples = x.dims()[0];
241        
242        let all_preds: Vec<Vec<f32>> = self.trees.par_iter()
243            .map(|tree| tree.predict(x).data_f32())
244            .collect();
245        
246        let mut avg_preds = vec![0.0f32; n_samples];
247        
248        for preds in &all_preds {
249            for (i, &p) in preds.iter().enumerate() {
250                avg_preds[i] += p;
251            }
252        }
253        
254        let n_trees = self.trees.len() as f32;
255        for p in &mut avg_preds {
256            *p /= n_trees;
257        }
258        
259        Tensor::from_slice(&avg_preds, &[n_samples]).unwrap()
260    }
261}
262
/// Gradient Boosting Classifier
///
/// Multiclass gradient boosting: one stage-wise set of regression trees per
/// class, each fit to the negative gradient of the softmax cross-entropy loss.
pub struct GradientBoostingClassifier {
    /// Number of boosting stages (trees per class).
    pub n_estimators: usize,
    /// Shrinkage applied to each tree's contribution.
    pub learning_rate: f32,
    /// Maximum depth of each regression tree.
    pub max_depth: usize,
    /// Minimum number of samples required to split a node.
    pub min_samples_split: usize,
    /// Fraction of rows used per stage (stochastic boosting when < 1.0).
    pub subsample: f32,
    trees: Vec<Vec<DecisionTreeRegressor>>, // One set of trees per class
    // Per-class initial log-odds, computed from class frequencies in `fit`.
    initial_predictions: Vec<f32>,
    // Number of classes inferred from the labels during `fit` (max label + 1).
    n_classes: usize,
}
274
275impl GradientBoostingClassifier {
276    pub fn new(n_estimators: usize) -> Self {
277        GradientBoostingClassifier {
278            n_estimators,
279            learning_rate: 0.1,
280            max_depth: 3,
281            min_samples_split: 2,
282            subsample: 1.0,
283            trees: Vec::new(),
284            initial_predictions: Vec::new(),
285            n_classes: 0,
286        }
287    }
288
289    pub fn learning_rate(mut self, lr: f32) -> Self {
290        self.learning_rate = lr;
291        self
292    }
293
294    pub fn max_depth(mut self, depth: usize) -> Self {
295        self.max_depth = depth;
296        self
297    }
298
299    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
300        let n_samples = x.dims()[0];
301        let n_features = x.dims()[1];
302        let x_data = x.data_f32();
303        let y_data = y.data_f32();
304        
305        self.n_classes = y_data.iter()
306            .map(|&v| v as usize)
307            .max()
308            .unwrap_or(0) + 1;
309        
310        // Initialize with log-odds
311        self.initial_predictions = vec![0.0; self.n_classes];
312        for c in 0..self.n_classes {
313            let count = y_data.iter().filter(|&&v| v as usize == c).count();
314            let prob = count as f32 / n_samples as f32;
315            self.initial_predictions[c] = (prob / (1.0 - prob + 1e-10)).ln();
316        }
317        
318        // Initialize predictions
319        let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
320        #[allow(clippy::needless_range_loop)]
321        for c in 0..self.n_classes {
322            for i in 0..n_samples {
323                f[c][i] = self.initial_predictions[c];
324            }
325        }
326        
327        self.trees = (0..self.n_classes).map(|_| Vec::new()).collect();
328        
329        // Boosting iterations
330        for _ in 0..self.n_estimators {
331            // Compute probabilities using softmax
332            let probs = self.softmax_predictions(&f);
333            
334            // For each class, fit a tree to the negative gradient
335            for c in 0..self.n_classes {
336                // Compute residuals (negative gradient)
337                let residuals: Vec<f32> = (0..n_samples)
338                    .map(|i| {
339                        let y_true = if y_data[i] as usize == c { 1.0 } else { 0.0 };
340                        y_true - probs[c][i]
341                    })
342                    .collect();
343                
344                let _residual_tensor = Tensor::from_slice(&residuals, &[n_samples]).unwrap();
345                
346                // Subsample
347                let sample_indices: Vec<usize> = if self.subsample < 1.0 {
348                    let mut rng = thread_rng();
349                    let n_subsample = (n_samples as f32 * self.subsample) as usize;
350                    (0..n_samples).choose_multiple(&mut rng, n_subsample)
351                } else {
352                    (0..n_samples).collect()
353                };
354                
355                let x_sub: Vec<f32> = sample_indices.iter()
356                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
357                    .collect();
358                let r_sub: Vec<f32> = sample_indices.iter()
359                    .map(|&i| residuals[i])
360                    .collect();
361                
362                let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
363                let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
364                
365                // Fit tree
366                let mut tree = DecisionTreeRegressor::new()
367                    .max_depth(self.max_depth);
368                tree.min_samples_split = self.min_samples_split;
369                tree.fit(&x_tensor, &r_tensor);
370                
371                // Update predictions
372                let tree_preds = tree.predict(x);
373                let tree_pred_data = tree_preds.data_f32();
374                
375                #[allow(clippy::needless_range_loop)]
376                for i in 0..n_samples {
377                    f[c][i] += self.learning_rate * tree_pred_data[i];
378                }
379                
380                self.trees[c].push(tree);
381            }
382        }
383    }
384
385    fn softmax_predictions(&self, f: &[Vec<f32>]) -> Vec<Vec<f32>> {
386        let n_samples = f[0].len();
387        let mut probs = vec![vec![0.0f32; n_samples]; self.n_classes];
388        
389        for i in 0..n_samples {
390            let max_f = (0..self.n_classes).map(|c| f[c][i]).fold(f32::NEG_INFINITY, f32::max);
391            let sum_exp: f32 = (0..self.n_classes).map(|c| (f[c][i] - max_f).exp()).sum();
392            
393            for c in 0..self.n_classes {
394                probs[c][i] = (f[c][i] - max_f).exp() / sum_exp;
395            }
396        }
397        
398        probs
399    }
400
401    pub fn predict(&self, x: &Tensor) -> Tensor {
402        let proba = self.predict_proba(x);
403        let proba_data = proba.data_f32();
404        let n_samples = x.dims()[0];
405        
406        let predictions: Vec<f32> = (0..n_samples)
407            .map(|i| {
408                let start = i * self.n_classes;
409                let sample_probs = &proba_data[start..start + self.n_classes];
410                sample_probs.iter()
411                    .enumerate()
412                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
413                    .map(|(idx, _)| idx as f32)
414                    .unwrap_or(0.0)
415            })
416            .collect();
417        
418        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
419    }
420
421    pub fn predict_proba(&self, x: &Tensor) -> Tensor {
422        let n_samples = x.dims()[0];
423        
424        // Initialize with initial predictions
425        let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
426        #[allow(clippy::needless_range_loop)]
427        for c in 0..self.n_classes {
428            for i in 0..n_samples {
429                f[c][i] = self.initial_predictions[c];
430            }
431        }
432        
433        // Add tree predictions
434        for c in 0..self.n_classes {
435            for tree in &self.trees[c] {
436                let preds = tree.predict(x);
437                let pred_data = preds.data_f32();
438                for i in 0..n_samples {
439                    f[c][i] += self.learning_rate * pred_data[i];
440                }
441            }
442        }
443        
444        // Softmax
445        let probs = self.softmax_predictions(&f);
446        
447        let mut result = Vec::with_capacity(n_samples * self.n_classes);
448        for i in 0..n_samples {
449            for c in 0..self.n_classes {
450                result.push(probs[c][i]);
451            }
452        }
453        
454        Tensor::from_slice(&result, &[n_samples, self.n_classes]).unwrap()
455    }
456}
457
/// Gradient Boosting Regressor
///
/// Stage-wise additive model: each regression tree is fit to the
/// (pseudo-)residuals of the current predictions under the chosen loss.
pub struct GradientBoostingRegressor {
    /// Number of boosting stages (trees).
    pub n_estimators: usize,
    /// Shrinkage applied to each tree's contribution.
    pub learning_rate: f32,
    /// Maximum depth of each tree.
    pub max_depth: usize,
    /// Minimum number of samples required to split a node.
    pub min_samples_split: usize,
    /// Fraction of rows used per stage (stochastic boosting when < 1.0).
    pub subsample: f32,
    /// Loss function that defines the pseudo-residuals.
    pub loss: GBLoss,
    // Fitted trees; populated by `fit`.
    trees: Vec<DecisionTreeRegressor>,
    // Base prediction: mean of the training targets, set in `fit`.
    initial_prediction: f32,
}
469
/// Loss functions for [`GradientBoostingRegressor`].
#[derive(Clone, Copy)]
pub enum GBLoss {
    /// L2 loss: pseudo-residual is `y - f(x)`.
    SquaredError,
    /// L1 loss: pseudo-residual is `sign(y - f(x))`.
    AbsoluteError,
    /// Huber loss: L2 near zero, L1 in the tails (delta fixed at 1.0 in `fit`).
    Huber,
}
476
477impl GradientBoostingRegressor {
478    pub fn new(n_estimators: usize) -> Self {
479        GradientBoostingRegressor {
480            n_estimators,
481            learning_rate: 0.1,
482            max_depth: 3,
483            min_samples_split: 2,
484            subsample: 1.0,
485            loss: GBLoss::SquaredError,
486            trees: Vec::new(),
487            initial_prediction: 0.0,
488        }
489    }
490
491    pub fn learning_rate(mut self, lr: f32) -> Self {
492        self.learning_rate = lr;
493        self
494    }
495
496    pub fn max_depth(mut self, depth: usize) -> Self {
497        self.max_depth = depth;
498        self
499    }
500
501    pub fn loss(mut self, loss: GBLoss) -> Self {
502        self.loss = loss;
503        self
504    }
505
506    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
507        let n_samples = x.dims()[0];
508        let n_features = x.dims()[1];
509        let x_data = x.data_f32();
510        let y_data = y.data_f32();
511        
512        // Initialize with mean
513        self.initial_prediction = y_data.iter().sum::<f32>() / n_samples as f32;
514        
515        let mut predictions = vec![self.initial_prediction; n_samples];
516        
517        for _ in 0..self.n_estimators {
518            // Compute residuals
519            let residuals: Vec<f32> = match self.loss {
520                GBLoss::SquaredError => {
521                    (0..n_samples).map(|i| y_data[i] - predictions[i]).collect()
522                }
523                GBLoss::AbsoluteError => {
524                    (0..n_samples).map(|i| {
525                        let diff = y_data[i] - predictions[i];
526                        if diff > 0.0 { 1.0 } else if diff < 0.0 { -1.0 } else { 0.0 }
527                    }).collect()
528                }
529                GBLoss::Huber => {
530                    let delta = 1.0;
531                    (0..n_samples).map(|i| {
532                        let diff = y_data[i] - predictions[i];
533                        if diff.abs() <= delta {
534                            diff
535                        } else {
536                            delta * diff.signum()
537                        }
538                    }).collect()
539                }
540            };
541            
542            // Subsample
543            let sample_indices: Vec<usize> = if self.subsample < 1.0 {
544                let mut rng = thread_rng();
545                let n_subsample = (n_samples as f32 * self.subsample) as usize;
546                (0..n_samples).choose_multiple(&mut rng, n_subsample)
547            } else {
548                (0..n_samples).collect()
549            };
550            
551            let x_sub: Vec<f32> = sample_indices.iter()
552                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
553                .collect();
554            let r_sub: Vec<f32> = sample_indices.iter()
555                .map(|&i| residuals[i])
556                .collect();
557            
558            let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
559            let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
560            
561            // Fit tree
562            let mut tree = DecisionTreeRegressor::new()
563                .max_depth(self.max_depth);
564            tree.min_samples_split = self.min_samples_split;
565            tree.fit(&x_tensor, &r_tensor);
566            
567            // Update predictions
568            let tree_preds = tree.predict(x);
569            let tree_pred_data = tree_preds.data_f32();
570            
571            for i in 0..n_samples {
572                predictions[i] += self.learning_rate * tree_pred_data[i];
573            }
574            
575            self.trees.push(tree);
576        }
577    }
578
579    pub fn predict(&self, x: &Tensor) -> Tensor {
580        let n_samples = x.dims()[0];
581        let mut predictions = vec![self.initial_prediction; n_samples];
582        
583        for tree in &self.trees {
584            let tree_preds = tree.predict(x);
585            let tree_pred_data = tree_preds.data_f32();
586            
587            for i in 0..n_samples {
588                predictions[i] += self.learning_rate * tree_pred_data[i];
589            }
590        }
591        
592        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
593    }
594}
595
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a small forest fits and produces one label per sample.
    #[test]
    fn test_random_forest_classifier() {
        // Five 2-D points with XOR-ish labels.
        let features = [
            0.0f32, 0.0,
            0.0, 1.0,
            1.0, 0.0,
            1.0, 1.0,
            0.5, 0.5,
        ];
        let labels = [0.0f32, 1.0, 1.0, 0.0, 0.0];

        let x = Tensor::from_slice(&features, &[5, 2]).unwrap();
        let y = Tensor::from_slice(&labels, &[5]).unwrap();

        let mut model = RandomForestClassifier::new(10).max_depth(3);
        model.fit(&x, &y);

        let predictions = model.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }

    /// Smoke test: boosting fits y = 2x and yields one prediction per sample.
    #[test]
    fn test_gradient_boosting_regressor() {
        let inputs = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let targets = [2.0f32, 4.0, 6.0, 8.0, 10.0];

        let x = Tensor::from_slice(&inputs, &[5, 1]).unwrap();
        let y = Tensor::from_slice(&targets, &[5]).unwrap();

        let mut model = GradientBoostingRegressor::new(50)
            .learning_rate(0.1)
            .max_depth(3);
        model.fit(&x, &y);

        let predictions = model.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }
}
634
635