// ghostflow_ml/ensemble.rs

1//! Ensemble methods - Random Forest and Gradient Boosting
2
3use ghostflow_core::Tensor;
4use crate::tree::{DecisionTreeClassifier, DecisionTreeRegressor, Criterion};
5use rayon::prelude::*;
6use rand::prelude::*;
7
8/// Random Forest Classifier
9pub struct RandomForestClassifier {
10    pub n_estimators: usize,
11    pub max_depth: Option<usize>,
12    pub min_samples_split: usize,
13    pub min_samples_leaf: usize,
14    pub max_features: Option<usize>,
15    pub bootstrap: bool,
16    pub n_jobs: Option<usize>,
17    trees: Vec<DecisionTreeClassifier>,
18    n_classes: usize,
19}
20
21impl RandomForestClassifier {
22    pub fn new(n_estimators: usize) -> Self {
23        RandomForestClassifier {
24            n_estimators,
25            max_depth: None,
26            min_samples_split: 2,
27            min_samples_leaf: 1,
28            max_features: None, // sqrt(n_features) by default
29            bootstrap: true,
30            n_jobs: None,
31            trees: Vec::new(),
32            n_classes: 0,
33        }
34    }
35
36    pub fn max_depth(mut self, depth: usize) -> Self {
37        self.max_depth = Some(depth);
38        self
39    }
40
41    pub fn min_samples_split(mut self, n: usize) -> Self {
42        self.min_samples_split = n;
43        self
44    }
45
46    pub fn max_features(mut self, n: usize) -> Self {
47        self.max_features = Some(n);
48        self
49    }
50
51    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
52        let n_samples = x.dims()[0];
53        let n_features = x.dims()[1];
54        
55        // Determine number of classes
56        let y_data = y.data_f32();
57        self.n_classes = y_data.iter()
58            .map(|&v| v as usize)
59            .max()
60            .unwrap_or(0) + 1;
61        
62        // Default max_features = sqrt(n_features)
63        let max_features = self.max_features
64            .unwrap_or_else(|| (n_features as f32).sqrt() as usize);
65        
66        // Build trees in parallel
67        self.trees = (0..self.n_estimators)
68            .into_par_iter()
69            .map(|_| {
70                let mut rng = thread_rng();
71                
72                // Bootstrap sampling
73                let indices: Vec<usize> = if self.bootstrap {
74                    (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
75                } else {
76                    (0..n_samples).collect()
77                };
78                
79                // Create bootstrap sample
80                let x_data = x.data_f32();
81                let y_data = y.data_f32();
82                
83                let x_boot: Vec<f32> = indices.iter()
84                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
85                    .collect();
86                let y_boot: Vec<f32> = indices.iter()
87                    .map(|&i| y_data[i])
88                    .collect();
89                
90                let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
91                let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
92                
93                // Build tree
94                let mut tree = DecisionTreeClassifier::new()
95                    .criterion(Criterion::Gini);
96                
97                if let Some(depth) = self.max_depth {
98                    tree = tree.max_depth(depth);
99                }
100                tree = tree.min_samples_split(self.min_samples_split)
101                    .min_samples_leaf(self.min_samples_leaf);
102                
103                tree.max_features = Some(max_features);
104                tree.fit(&x_tensor, &y_tensor);
105                tree
106            })
107            .collect();
108    }
109
110    pub fn predict(&self, x: &Tensor) -> Tensor {
111        let proba = self.predict_proba(x);
112        let proba_data = proba.data_f32();
113        let n_samples = x.dims()[0];
114        
115        let predictions: Vec<f32> = (0..n_samples)
116            .map(|i| {
117                let start = i * self.n_classes;
118                let sample_probs = &proba_data[start..start + self.n_classes];
119                sample_probs.iter()
120                    .enumerate()
121                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
122                    .map(|(idx, _)| idx as f32)
123                    .unwrap_or(0.0)
124            })
125            .collect();
126        
127        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
128    }
129
130    pub fn predict_proba(&self, x: &Tensor) -> Tensor {
131        let n_samples = x.dims()[0];
132        
133        // Collect predictions from all trees
134        let all_probs: Vec<Vec<f32>> = self.trees.par_iter()
135            .map(|tree| tree.predict_proba(x).data_f32())
136            .collect();
137        
138        // Average probabilities
139        let mut avg_probs = vec![0.0f32; n_samples * self.n_classes];
140        
141        for probs in &all_probs {
142            for (i, &p) in probs.iter().enumerate() {
143                avg_probs[i] += p;
144            }
145        }
146        
147        let n_trees = self.trees.len() as f32;
148        for p in &mut avg_probs {
149            *p /= n_trees;
150        }
151        
152        Tensor::from_slice(&avg_probs, &[n_samples, self.n_classes]).unwrap()
153    }
154
155    /// Feature importance based on impurity decrease
156    pub fn feature_importances(&self, n_features: usize) -> Vec<f32> {
157        let mut importances = vec![0.0f32; n_features];
158        
159        // This would require tracking impurity decrease during tree building
160        // For now, return uniform importances
161        let uniform = 1.0 / n_features as f32;
162        importances.iter_mut().for_each(|x| *x = uniform);
163        
164        importances
165    }
166}
167
168/// Random Forest Regressor
169pub struct RandomForestRegressor {
170    pub n_estimators: usize,
171    pub max_depth: Option<usize>,
172    pub min_samples_split: usize,
173    pub min_samples_leaf: usize,
174    pub max_features: Option<usize>,
175    pub bootstrap: bool,
176    trees: Vec<DecisionTreeRegressor>,
177}
178
179impl RandomForestRegressor {
180    pub fn new(n_estimators: usize) -> Self {
181        RandomForestRegressor {
182            n_estimators,
183            max_depth: None,
184            min_samples_split: 2,
185            min_samples_leaf: 1,
186            max_features: None,
187            bootstrap: true,
188            trees: Vec::new(),
189        }
190    }
191
192    pub fn max_depth(mut self, depth: usize) -> Self {
193        self.max_depth = Some(depth);
194        self
195    }
196
197    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
198        let n_samples = x.dims()[0];
199        let n_features = x.dims()[1];
200        
201        let max_features = self.max_features
202            .unwrap_or_else(|| n_features / 3);
203        
204        self.trees = (0..self.n_estimators)
205            .into_par_iter()
206            .map(|_| {
207                let mut rng = thread_rng();
208                
209                let indices: Vec<usize> = if self.bootstrap {
210                    (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
211                } else {
212                    (0..n_samples).collect()
213                };
214                
215                let x_data = x.data_f32();
216                let y_data = y.data_f32();
217                
218                let x_boot: Vec<f32> = indices.iter()
219                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
220                    .collect();
221                let y_boot: Vec<f32> = indices.iter()
222                    .map(|&i| y_data[i])
223                    .collect();
224                
225                let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
226                let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
227                
228                let mut tree = DecisionTreeRegressor::new();
229                if let Some(depth) = self.max_depth {
230                    tree = tree.max_depth(depth);
231                }
232                tree.max_features = Some(max_features);
233                tree.fit(&x_tensor, &y_tensor);
234                tree
235            })
236            .collect();
237    }
238
239    pub fn predict(&self, x: &Tensor) -> Tensor {
240        let n_samples = x.dims()[0];
241        
242        let all_preds: Vec<Vec<f32>> = self.trees.par_iter()
243            .map(|tree| tree.predict(x).data_f32())
244            .collect();
245        
246        let mut avg_preds = vec![0.0f32; n_samples];
247        
248        for preds in &all_preds {
249            for (i, &p) in preds.iter().enumerate() {
250                avg_preds[i] += p;
251            }
252        }
253        
254        let n_trees = self.trees.len() as f32;
255        for p in &mut avg_preds {
256            *p /= n_trees;
257        }
258        
259        Tensor::from_slice(&avg_preds, &[n_samples]).unwrap()
260    }
261}
262
263/// Gradient Boosting Classifier
264pub struct GradientBoostingClassifier {
265    pub n_estimators: usize,
266    pub learning_rate: f32,
267    pub max_depth: usize,
268    pub min_samples_split: usize,
269    pub subsample: f32,
270    trees: Vec<Vec<DecisionTreeRegressor>>, // One set of trees per class
271    initial_predictions: Vec<f32>,
272    n_classes: usize,
273}
274
275impl GradientBoostingClassifier {
276    pub fn new(n_estimators: usize) -> Self {
277        GradientBoostingClassifier {
278            n_estimators,
279            learning_rate: 0.1,
280            max_depth: 3,
281            min_samples_split: 2,
282            subsample: 1.0,
283            trees: Vec::new(),
284            initial_predictions: Vec::new(),
285            n_classes: 0,
286        }
287    }
288
289    pub fn learning_rate(mut self, lr: f32) -> Self {
290        self.learning_rate = lr;
291        self
292    }
293
294    pub fn max_depth(mut self, depth: usize) -> Self {
295        self.max_depth = depth;
296        self
297    }
298
299    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
300        let n_samples = x.dims()[0];
301        let n_features = x.dims()[1];
302        let x_data = x.data_f32();
303        let y_data = y.data_f32();
304        
305        self.n_classes = y_data.iter()
306            .map(|&v| v as usize)
307            .max()
308            .unwrap_or(0) + 1;
309        
310        // Initialize with log-odds
311        self.initial_predictions = vec![0.0; self.n_classes];
312        for c in 0..self.n_classes {
313            let count = y_data.iter().filter(|&&v| v as usize == c).count();
314            let prob = count as f32 / n_samples as f32;
315            self.initial_predictions[c] = (prob / (1.0 - prob + 1e-10)).ln();
316        }
317        
318        // Initialize predictions
319        let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
320        for c in 0..self.n_classes {
321            for i in 0..n_samples {
322                f[c][i] = self.initial_predictions[c];
323            }
324        }
325        
326        self.trees = (0..self.n_classes).map(|_| Vec::new()).collect();
327        
328        // Boosting iterations
329        for _ in 0..self.n_estimators {
330            // Compute probabilities using softmax
331            let probs = self.softmax_predictions(&f);
332            
333            // For each class, fit a tree to the negative gradient
334            for c in 0..self.n_classes {
335                // Compute residuals (negative gradient)
336                let residuals: Vec<f32> = (0..n_samples)
337                    .map(|i| {
338                        let y_true = if y_data[i] as usize == c { 1.0 } else { 0.0 };
339                        y_true - probs[c][i]
340                    })
341                    .collect();
342                
343                let _residual_tensor = Tensor::from_slice(&residuals, &[n_samples]).unwrap();
344                
345                // Subsample
346                let sample_indices: Vec<usize> = if self.subsample < 1.0 {
347                    let mut rng = thread_rng();
348                    let n_subsample = (n_samples as f32 * self.subsample) as usize;
349                    (0..n_samples).choose_multiple(&mut rng, n_subsample)
350                } else {
351                    (0..n_samples).collect()
352                };
353                
354                let x_sub: Vec<f32> = sample_indices.iter()
355                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
356                    .collect();
357                let r_sub: Vec<f32> = sample_indices.iter()
358                    .map(|&i| residuals[i])
359                    .collect();
360                
361                let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
362                let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
363                
364                // Fit tree
365                let mut tree = DecisionTreeRegressor::new()
366                    .max_depth(self.max_depth);
367                tree.min_samples_split = self.min_samples_split;
368                tree.fit(&x_tensor, &r_tensor);
369                
370                // Update predictions
371                let tree_preds = tree.predict(x);
372                let tree_pred_data = tree_preds.data_f32();
373                
374                for i in 0..n_samples {
375                    f[c][i] += self.learning_rate * tree_pred_data[i];
376                }
377                
378                self.trees[c].push(tree);
379            }
380        }
381    }
382
383    fn softmax_predictions(&self, f: &[Vec<f32>]) -> Vec<Vec<f32>> {
384        let n_samples = f[0].len();
385        let mut probs = vec![vec![0.0f32; n_samples]; self.n_classes];
386        
387        for i in 0..n_samples {
388            let max_f = (0..self.n_classes).map(|c| f[c][i]).fold(f32::NEG_INFINITY, f32::max);
389            let sum_exp: f32 = (0..self.n_classes).map(|c| (f[c][i] - max_f).exp()).sum();
390            
391            for c in 0..self.n_classes {
392                probs[c][i] = (f[c][i] - max_f).exp() / sum_exp;
393            }
394        }
395        
396        probs
397    }
398
399    pub fn predict(&self, x: &Tensor) -> Tensor {
400        let proba = self.predict_proba(x);
401        let proba_data = proba.data_f32();
402        let n_samples = x.dims()[0];
403        
404        let predictions: Vec<f32> = (0..n_samples)
405            .map(|i| {
406                let start = i * self.n_classes;
407                let sample_probs = &proba_data[start..start + self.n_classes];
408                sample_probs.iter()
409                    .enumerate()
410                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
411                    .map(|(idx, _)| idx as f32)
412                    .unwrap_or(0.0)
413            })
414            .collect();
415        
416        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
417    }
418
419    pub fn predict_proba(&self, x: &Tensor) -> Tensor {
420        let n_samples = x.dims()[0];
421        
422        // Initialize with initial predictions
423        let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
424        for c in 0..self.n_classes {
425            for i in 0..n_samples {
426                f[c][i] = self.initial_predictions[c];
427            }
428        }
429        
430        // Add tree predictions
431        for c in 0..self.n_classes {
432            for tree in &self.trees[c] {
433                let preds = tree.predict(x);
434                let pred_data = preds.data_f32();
435                for i in 0..n_samples {
436                    f[c][i] += self.learning_rate * pred_data[i];
437                }
438            }
439        }
440        
441        // Softmax
442        let probs = self.softmax_predictions(&f);
443        
444        let mut result = Vec::with_capacity(n_samples * self.n_classes);
445        for i in 0..n_samples {
446            for c in 0..self.n_classes {
447                result.push(probs[c][i]);
448            }
449        }
450        
451        Tensor::from_slice(&result, &[n_samples, self.n_classes]).unwrap()
452    }
453}
454
455/// Gradient Boosting Regressor
456pub struct GradientBoostingRegressor {
457    pub n_estimators: usize,
458    pub learning_rate: f32,
459    pub max_depth: usize,
460    pub min_samples_split: usize,
461    pub subsample: f32,
462    pub loss: GBLoss,
463    trees: Vec<DecisionTreeRegressor>,
464    initial_prediction: f32,
465}
466
467#[derive(Clone, Copy)]
468pub enum GBLoss {
469    SquaredError,
470    AbsoluteError,
471    Huber,
472}
473
474impl GradientBoostingRegressor {
475    pub fn new(n_estimators: usize) -> Self {
476        GradientBoostingRegressor {
477            n_estimators,
478            learning_rate: 0.1,
479            max_depth: 3,
480            min_samples_split: 2,
481            subsample: 1.0,
482            loss: GBLoss::SquaredError,
483            trees: Vec::new(),
484            initial_prediction: 0.0,
485        }
486    }
487
488    pub fn learning_rate(mut self, lr: f32) -> Self {
489        self.learning_rate = lr;
490        self
491    }
492
493    pub fn max_depth(mut self, depth: usize) -> Self {
494        self.max_depth = depth;
495        self
496    }
497
498    pub fn loss(mut self, loss: GBLoss) -> Self {
499        self.loss = loss;
500        self
501    }
502
503    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
504        let n_samples = x.dims()[0];
505        let n_features = x.dims()[1];
506        let x_data = x.data_f32();
507        let y_data = y.data_f32();
508        
509        // Initialize with mean
510        self.initial_prediction = y_data.iter().sum::<f32>() / n_samples as f32;
511        
512        let mut predictions = vec![self.initial_prediction; n_samples];
513        
514        for _ in 0..self.n_estimators {
515            // Compute residuals
516            let residuals: Vec<f32> = match self.loss {
517                GBLoss::SquaredError => {
518                    (0..n_samples).map(|i| y_data[i] - predictions[i]).collect()
519                }
520                GBLoss::AbsoluteError => {
521                    (0..n_samples).map(|i| {
522                        let diff = y_data[i] - predictions[i];
523                        if diff > 0.0 { 1.0 } else if diff < 0.0 { -1.0 } else { 0.0 }
524                    }).collect()
525                }
526                GBLoss::Huber => {
527                    let delta = 1.0;
528                    (0..n_samples).map(|i| {
529                        let diff = y_data[i] - predictions[i];
530                        if diff.abs() <= delta {
531                            diff
532                        } else {
533                            delta * diff.signum()
534                        }
535                    }).collect()
536                }
537            };
538            
539            // Subsample
540            let sample_indices: Vec<usize> = if self.subsample < 1.0 {
541                let mut rng = thread_rng();
542                let n_subsample = (n_samples as f32 * self.subsample) as usize;
543                (0..n_samples).choose_multiple(&mut rng, n_subsample)
544            } else {
545                (0..n_samples).collect()
546            };
547            
548            let x_sub: Vec<f32> = sample_indices.iter()
549                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
550                .collect();
551            let r_sub: Vec<f32> = sample_indices.iter()
552                .map(|&i| residuals[i])
553                .collect();
554            
555            let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
556            let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
557            
558            // Fit tree
559            let mut tree = DecisionTreeRegressor::new()
560                .max_depth(self.max_depth);
561            tree.min_samples_split = self.min_samples_split;
562            tree.fit(&x_tensor, &r_tensor);
563            
564            // Update predictions
565            let tree_preds = tree.predict(x);
566            let tree_pred_data = tree_preds.data_f32();
567            
568            for i in 0..n_samples {
569                predictions[i] += self.learning_rate * tree_pred_data[i];
570            }
571            
572            self.trees.push(tree);
573        }
574    }
575
576    pub fn predict(&self, x: &Tensor) -> Tensor {
577        let n_samples = x.dims()[0];
578        let mut predictions = vec![self.initial_prediction; n_samples];
579        
580        for tree in &self.trees {
581            let tree_preds = tree.predict(x);
582            let tree_pred_data = tree_preds.data_f32();
583            
584            for i in 0..n_samples {
585                predictions[i] += self.learning_rate * tree_pred_data[i];
586            }
587        }
588        
589        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
590    }
591}
592
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the classifier forest fits a tiny 2-D dataset and
    /// returns one prediction per input row.
    #[test]
    fn test_random_forest_classifier() {
        // Four corner points plus the centre, flattened row-major.
        let features = [
            0.0, 0.0,
            0.0, 1.0,
            1.0, 0.0,
            1.0, 1.0,
            0.5, 0.5,
        ];
        let labels = [0.0, 1.0, 1.0, 0.0, 0.0];

        let x = Tensor::from_slice(&features, &[5, 2]).unwrap();
        let y = Tensor::from_slice(&labels, &[5]).unwrap();

        let mut forest = RandomForestClassifier::new(10).max_depth(3);
        forest.fit(&x, &y);

        let predictions = forest.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }

    /// Smoke test: the boosted regressor fits y = 2x and returns one
    /// prediction per input row.
    #[test]
    fn test_gradient_boosting_regressor() {
        let x = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0], &[5, 1]).unwrap();
        let y = Tensor::from_slice(&[2.0, 4.0, 6.0, 8.0, 10.0], &[5]).unwrap();

        let mut model = GradientBoostingRegressor::new(50)
            .learning_rate(0.1)
            .max_depth(3);
        model.fit(&x, &y);

        let predictions = model.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }
}