use crate::tree::{Criterion, DecisionTreeClassifier, DecisionTreeRegressor};
use ghostflow_core::Tensor;
use rand::prelude::*;
use rayon::prelude::*;

/// Random forest classifier: an ensemble of `DecisionTreeClassifier`s fit on
/// bootstrap resamples of the training data; class probabilities are averaged
/// across trees at prediction time.
pub struct RandomForestClassifier {
    /// Number of trees grown by `fit`.
    pub n_estimators: usize,
    /// Maximum depth of each tree (`None` = grow until other limits apply).
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Minimum number of samples required at a leaf node.
    pub min_samples_leaf: usize,
    /// Features considered per split; `None` = sqrt(n_features) at fit time.
    pub max_features: Option<usize>,
    /// Train each tree on a bootstrap resample when true; full set otherwise.
    pub bootstrap: bool,
    /// NOTE(review): not read by any visible method — rayon's global pool is
    /// used regardless. Confirm whether this is meant to size the pool.
    pub n_jobs: Option<usize>,
    // Fitted trees; populated by `fit`.
    trees: Vec<DecisionTreeClassifier>,
    // Class count inferred from training labels (max label + 1).
    n_classes: usize,
}
20
21impl RandomForestClassifier {
22 pub fn new(n_estimators: usize) -> Self {
23 RandomForestClassifier {
24 n_estimators,
25 max_depth: None,
26 min_samples_split: 2,
27 min_samples_leaf: 1,
28 max_features: None, bootstrap: true,
30 n_jobs: None,
31 trees: Vec::new(),
32 n_classes: 0,
33 }
34 }
35
36 pub fn max_depth(mut self, depth: usize) -> Self {
37 self.max_depth = Some(depth);
38 self
39 }
40
41 pub fn min_samples_split(mut self, n: usize) -> Self {
42 self.min_samples_split = n;
43 self
44 }
45
46 pub fn max_features(mut self, n: usize) -> Self {
47 self.max_features = Some(n);
48 self
49 }
50
51 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
52 let n_samples = x.dims()[0];
53 let n_features = x.dims()[1];
54
55 let y_data = y.data_f32();
57 self.n_classes = y_data.iter()
58 .map(|&v| v as usize)
59 .max()
60 .unwrap_or(0) + 1;
61
62 let max_features = self.max_features
64 .unwrap_or_else(|| (n_features as f32).sqrt() as usize);
65
66 self.trees = (0..self.n_estimators)
68 .into_par_iter()
69 .map(|_| {
70 let mut rng = thread_rng();
71
72 let indices: Vec<usize> = if self.bootstrap {
74 (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
75 } else {
76 (0..n_samples).collect()
77 };
78
79 let x_data = x.data_f32();
81 let y_data = y.data_f32();
82
83 let x_boot: Vec<f32> = indices.iter()
84 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
85 .collect();
86 let y_boot: Vec<f32> = indices.iter()
87 .map(|&i| y_data[i])
88 .collect();
89
90 let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
91 let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
92
93 let mut tree = DecisionTreeClassifier::new()
95 .criterion(Criterion::Gini);
96
97 if let Some(depth) = self.max_depth {
98 tree = tree.max_depth(depth);
99 }
100 tree = tree.min_samples_split(self.min_samples_split)
101 .min_samples_leaf(self.min_samples_leaf);
102
103 tree.max_features = Some(max_features);
104 tree.fit(&x_tensor, &y_tensor);
105 tree
106 })
107 .collect();
108 }
109
110 pub fn predict(&self, x: &Tensor) -> Tensor {
111 let proba = self.predict_proba(x);
112 let proba_data = proba.data_f32();
113 let n_samples = x.dims()[0];
114
115 let predictions: Vec<f32> = (0..n_samples)
116 .map(|i| {
117 let start = i * self.n_classes;
118 let sample_probs = &proba_data[start..start + self.n_classes];
119 sample_probs.iter()
120 .enumerate()
121 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
122 .map(|(idx, _)| idx as f32)
123 .unwrap_or(0.0)
124 })
125 .collect();
126
127 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
128 }
129
130 pub fn predict_proba(&self, x: &Tensor) -> Tensor {
131 let n_samples = x.dims()[0];
132
133 let all_probs: Vec<Vec<f32>> = self.trees.par_iter()
135 .map(|tree| tree.predict_proba(x).data_f32())
136 .collect();
137
138 let mut avg_probs = vec![0.0f32; n_samples * self.n_classes];
140
141 for probs in &all_probs {
142 for (i, &p) in probs.iter().enumerate() {
143 avg_probs[i] += p;
144 }
145 }
146
147 let n_trees = self.trees.len() as f32;
148 for p in &mut avg_probs {
149 *p /= n_trees;
150 }
151
152 Tensor::from_slice(&avg_probs, &[n_samples, self.n_classes]).unwrap()
153 }
154
155 pub fn feature_importances(&self, n_features: usize) -> Vec<f32> {
157 let mut importances = vec![0.0f32; n_features];
158
159 let uniform = 1.0 / n_features as f32;
162 importances.iter_mut().for_each(|x| *x = uniform);
163
164 importances
165 }
166}
167
/// Random forest regressor: bagged decision-tree regressors whose
/// per-tree predictions are averaged.
pub struct RandomForestRegressor {
    /// Number of trees grown by `fit`.
    pub n_estimators: usize,
    /// Maximum depth of each tree (`None` = unbounded).
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Minimum number of samples required at a leaf node.
    pub min_samples_leaf: usize,
    /// Features considered per split; `None` = n_features / 3 at fit time.
    pub max_features: Option<usize>,
    /// Train each tree on a bootstrap resample when true; full set otherwise.
    pub bootstrap: bool,
    // Fitted trees; populated by `fit`.
    trees: Vec<DecisionTreeRegressor>,
}
178
179impl RandomForestRegressor {
180 pub fn new(n_estimators: usize) -> Self {
181 RandomForestRegressor {
182 n_estimators,
183 max_depth: None,
184 min_samples_split: 2,
185 min_samples_leaf: 1,
186 max_features: None,
187 bootstrap: true,
188 trees: Vec::new(),
189 }
190 }
191
192 pub fn max_depth(mut self, depth: usize) -> Self {
193 self.max_depth = Some(depth);
194 self
195 }
196
197 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
198 let n_samples = x.dims()[0];
199 let n_features = x.dims()[1];
200
201 let max_features = self.max_features
202 .unwrap_or_else(|| n_features / 3);
203
204 self.trees = (0..self.n_estimators)
205 .into_par_iter()
206 .map(|_| {
207 let mut rng = thread_rng();
208
209 let indices: Vec<usize> = if self.bootstrap {
210 (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
211 } else {
212 (0..n_samples).collect()
213 };
214
215 let x_data = x.data_f32();
216 let y_data = y.data_f32();
217
218 let x_boot: Vec<f32> = indices.iter()
219 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
220 .collect();
221 let y_boot: Vec<f32> = indices.iter()
222 .map(|&i| y_data[i])
223 .collect();
224
225 let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
226 let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
227
228 let mut tree = DecisionTreeRegressor::new();
229 if let Some(depth) = self.max_depth {
230 tree = tree.max_depth(depth);
231 }
232 tree.max_features = Some(max_features);
233 tree.fit(&x_tensor, &y_tensor);
234 tree
235 })
236 .collect();
237 }
238
239 pub fn predict(&self, x: &Tensor) -> Tensor {
240 let n_samples = x.dims()[0];
241
242 let all_preds: Vec<Vec<f32>> = self.trees.par_iter()
243 .map(|tree| tree.predict(x).data_f32())
244 .collect();
245
246 let mut avg_preds = vec![0.0f32; n_samples];
247
248 for preds in &all_preds {
249 for (i, &p) in preds.iter().enumerate() {
250 avg_preds[i] += p;
251 }
252 }
253
254 let n_trees = self.trees.len() as f32;
255 for p in &mut avg_preds {
256 *p /= n_trees;
257 }
258
259 Tensor::from_slice(&avg_preds, &[n_samples]).unwrap()
260 }
261}
262
/// Multiclass gradient-boosting classifier: one regression-tree ensemble per
/// class, each fit round-by-round to the softmax negative gradient.
pub struct GradientBoostingClassifier {
    /// Number of boosting rounds.
    pub n_estimators: usize,
    /// Shrinkage factor applied to every tree's contribution.
    pub learning_rate: f32,
    /// Depth cap for each regression tree.
    pub max_depth: usize,
    /// Minimum samples required to split a tree node.
    pub min_samples_split: usize,
    /// Fraction of rows sampled (without replacement) per tree; 1.0 = all.
    pub subsample: f32,
    // trees[c] holds the boosted trees for class c, in round order.
    trees: Vec<Vec<DecisionTreeRegressor>>,
    // Per-class initial log-odds scores; set by `fit`.
    initial_predictions: Vec<f32>,
    // Class count inferred from training labels (max label + 1).
    n_classes: usize,
}
274
275impl GradientBoostingClassifier {
276 pub fn new(n_estimators: usize) -> Self {
277 GradientBoostingClassifier {
278 n_estimators,
279 learning_rate: 0.1,
280 max_depth: 3,
281 min_samples_split: 2,
282 subsample: 1.0,
283 trees: Vec::new(),
284 initial_predictions: Vec::new(),
285 n_classes: 0,
286 }
287 }
288
289 pub fn learning_rate(mut self, lr: f32) -> Self {
290 self.learning_rate = lr;
291 self
292 }
293
294 pub fn max_depth(mut self, depth: usize) -> Self {
295 self.max_depth = depth;
296 self
297 }
298
299 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
300 let n_samples = x.dims()[0];
301 let n_features = x.dims()[1];
302 let x_data = x.data_f32();
303 let y_data = y.data_f32();
304
305 self.n_classes = y_data.iter()
306 .map(|&v| v as usize)
307 .max()
308 .unwrap_or(0) + 1;
309
310 self.initial_predictions = vec![0.0; self.n_classes];
312 for c in 0..self.n_classes {
313 let count = y_data.iter().filter(|&&v| v as usize == c).count();
314 let prob = count as f32 / n_samples as f32;
315 self.initial_predictions[c] = (prob / (1.0 - prob + 1e-10)).ln();
316 }
317
318 let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
320 for c in 0..self.n_classes {
321 for i in 0..n_samples {
322 f[c][i] = self.initial_predictions[c];
323 }
324 }
325
326 self.trees = (0..self.n_classes).map(|_| Vec::new()).collect();
327
328 for _ in 0..self.n_estimators {
330 let probs = self.softmax_predictions(&f);
332
333 for c in 0..self.n_classes {
335 let residuals: Vec<f32> = (0..n_samples)
337 .map(|i| {
338 let y_true = if y_data[i] as usize == c { 1.0 } else { 0.0 };
339 y_true - probs[c][i]
340 })
341 .collect();
342
343 let _residual_tensor = Tensor::from_slice(&residuals, &[n_samples]).unwrap();
344
345 let sample_indices: Vec<usize> = if self.subsample < 1.0 {
347 let mut rng = thread_rng();
348 let n_subsample = (n_samples as f32 * self.subsample) as usize;
349 (0..n_samples).choose_multiple(&mut rng, n_subsample)
350 } else {
351 (0..n_samples).collect()
352 };
353
354 let x_sub: Vec<f32> = sample_indices.iter()
355 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
356 .collect();
357 let r_sub: Vec<f32> = sample_indices.iter()
358 .map(|&i| residuals[i])
359 .collect();
360
361 let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
362 let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
363
364 let mut tree = DecisionTreeRegressor::new()
366 .max_depth(self.max_depth);
367 tree.min_samples_split = self.min_samples_split;
368 tree.fit(&x_tensor, &r_tensor);
369
370 let tree_preds = tree.predict(x);
372 let tree_pred_data = tree_preds.data_f32();
373
374 for i in 0..n_samples {
375 f[c][i] += self.learning_rate * tree_pred_data[i];
376 }
377
378 self.trees[c].push(tree);
379 }
380 }
381 }
382
383 fn softmax_predictions(&self, f: &[Vec<f32>]) -> Vec<Vec<f32>> {
384 let n_samples = f[0].len();
385 let mut probs = vec![vec![0.0f32; n_samples]; self.n_classes];
386
387 for i in 0..n_samples {
388 let max_f = (0..self.n_classes).map(|c| f[c][i]).fold(f32::NEG_INFINITY, f32::max);
389 let sum_exp: f32 = (0..self.n_classes).map(|c| (f[c][i] - max_f).exp()).sum();
390
391 for c in 0..self.n_classes {
392 probs[c][i] = (f[c][i] - max_f).exp() / sum_exp;
393 }
394 }
395
396 probs
397 }
398
399 pub fn predict(&self, x: &Tensor) -> Tensor {
400 let proba = self.predict_proba(x);
401 let proba_data = proba.data_f32();
402 let n_samples = x.dims()[0];
403
404 let predictions: Vec<f32> = (0..n_samples)
405 .map(|i| {
406 let start = i * self.n_classes;
407 let sample_probs = &proba_data[start..start + self.n_classes];
408 sample_probs.iter()
409 .enumerate()
410 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
411 .map(|(idx, _)| idx as f32)
412 .unwrap_or(0.0)
413 })
414 .collect();
415
416 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
417 }
418
419 pub fn predict_proba(&self, x: &Tensor) -> Tensor {
420 let n_samples = x.dims()[0];
421
422 let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
424 for c in 0..self.n_classes {
425 for i in 0..n_samples {
426 f[c][i] = self.initial_predictions[c];
427 }
428 }
429
430 for c in 0..self.n_classes {
432 for tree in &self.trees[c] {
433 let preds = tree.predict(x);
434 let pred_data = preds.data_f32();
435 for i in 0..n_samples {
436 f[c][i] += self.learning_rate * pred_data[i];
437 }
438 }
439 }
440
441 let probs = self.softmax_predictions(&f);
443
444 let mut result = Vec::with_capacity(n_samples * self.n_classes);
445 for i in 0..n_samples {
446 for c in 0..self.n_classes {
447 result.push(probs[c][i]);
448 }
449 }
450
451 Tensor::from_slice(&result, &[n_samples, self.n_classes]).unwrap()
452 }
453}
454
/// Gradient-boosting regressor: a sequence of shallow regression trees, each
/// fit to the negative gradient of the chosen loss, summed with shrinkage.
pub struct GradientBoostingRegressor {
    /// Number of boosting rounds (trees).
    pub n_estimators: usize,
    /// Shrinkage factor applied to every tree's contribution.
    pub learning_rate: f32,
    /// Depth cap for each regression tree.
    pub max_depth: usize,
    /// Minimum samples required to split a tree node.
    pub min_samples_split: usize,
    /// Fraction of rows sampled (without replacement) per tree; 1.0 = all.
    pub subsample: f32,
    /// Loss whose negative gradient the trees are fit to.
    pub loss: GBLoss,
    // Fitted trees, in boosting order; populated by `fit`.
    trees: Vec<DecisionTreeRegressor>,
    // Base prediction (mean of the training targets); set by `fit`.
    initial_prediction: f32,
}
466
/// Loss functions available to `GradientBoostingRegressor`.
#[derive(Clone, Copy)]
pub enum GBLoss {
    /// L2 loss; negative gradient is the plain residual `y - f(x)`.
    SquaredError,
    /// L1 loss; negative gradient is the sign of the residual.
    AbsoluteError,
    /// Huber loss (quadratic near zero, linear beyond a fixed delta of 1.0).
    Huber,
}
473
474impl GradientBoostingRegressor {
475 pub fn new(n_estimators: usize) -> Self {
476 GradientBoostingRegressor {
477 n_estimators,
478 learning_rate: 0.1,
479 max_depth: 3,
480 min_samples_split: 2,
481 subsample: 1.0,
482 loss: GBLoss::SquaredError,
483 trees: Vec::new(),
484 initial_prediction: 0.0,
485 }
486 }
487
488 pub fn learning_rate(mut self, lr: f32) -> Self {
489 self.learning_rate = lr;
490 self
491 }
492
493 pub fn max_depth(mut self, depth: usize) -> Self {
494 self.max_depth = depth;
495 self
496 }
497
498 pub fn loss(mut self, loss: GBLoss) -> Self {
499 self.loss = loss;
500 self
501 }
502
503 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
504 let n_samples = x.dims()[0];
505 let n_features = x.dims()[1];
506 let x_data = x.data_f32();
507 let y_data = y.data_f32();
508
509 self.initial_prediction = y_data.iter().sum::<f32>() / n_samples as f32;
511
512 let mut predictions = vec![self.initial_prediction; n_samples];
513
514 for _ in 0..self.n_estimators {
515 let residuals: Vec<f32> = match self.loss {
517 GBLoss::SquaredError => {
518 (0..n_samples).map(|i| y_data[i] - predictions[i]).collect()
519 }
520 GBLoss::AbsoluteError => {
521 (0..n_samples).map(|i| {
522 let diff = y_data[i] - predictions[i];
523 if diff > 0.0 { 1.0 } else if diff < 0.0 { -1.0 } else { 0.0 }
524 }).collect()
525 }
526 GBLoss::Huber => {
527 let delta = 1.0;
528 (0..n_samples).map(|i| {
529 let diff = y_data[i] - predictions[i];
530 if diff.abs() <= delta {
531 diff
532 } else {
533 delta * diff.signum()
534 }
535 }).collect()
536 }
537 };
538
539 let sample_indices: Vec<usize> = if self.subsample < 1.0 {
541 let mut rng = thread_rng();
542 let n_subsample = (n_samples as f32 * self.subsample) as usize;
543 (0..n_samples).choose_multiple(&mut rng, n_subsample)
544 } else {
545 (0..n_samples).collect()
546 };
547
548 let x_sub: Vec<f32> = sample_indices.iter()
549 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
550 .collect();
551 let r_sub: Vec<f32> = sample_indices.iter()
552 .map(|&i| residuals[i])
553 .collect();
554
555 let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
556 let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
557
558 let mut tree = DecisionTreeRegressor::new()
560 .max_depth(self.max_depth);
561 tree.min_samples_split = self.min_samples_split;
562 tree.fit(&x_tensor, &r_tensor);
563
564 let tree_preds = tree.predict(x);
566 let tree_pred_data = tree_preds.data_f32();
567
568 for i in 0..n_samples {
569 predictions[i] += self.learning_rate * tree_pred_data[i];
570 }
571
572 self.trees.push(tree);
573 }
574 }
575
576 pub fn predict(&self, x: &Tensor) -> Tensor {
577 let n_samples = x.dims()[0];
578 let mut predictions = vec![self.initial_prediction; n_samples];
579
580 for tree in &self.trees {
581 let tree_preds = tree.predict(x);
582 let tree_pred_data = tree_preds.data_f32();
583
584 for i in 0..n_samples {
585 predictions[i] += self.learning_rate * tree_pred_data[i];
586 }
587 }
588
589 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
590 }
591}
592
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the classifier fits a tiny XOR-ish dataset and emits one
    /// prediction per input row.
    #[test]
    fn test_random_forest_classifier() {
        let features = [
            0.0, 0.0,
            0.0, 1.0,
            1.0, 0.0,
            1.0, 1.0,
            0.5, 0.5,
        ];
        let labels = [0.0, 1.0, 1.0, 0.0, 0.0];
        let x = Tensor::from_slice(&features, &[5, 2]).unwrap();
        let y = Tensor::from_slice(&labels, &[5]).unwrap();

        let mut forest = RandomForestClassifier::new(10).max_depth(3);
        forest.fit(&x, &y);

        let predictions = forest.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }

    /// Smoke test: the booster fits a linear relation and emits one
    /// prediction per input row.
    #[test]
    fn test_gradient_boosting_regressor() {
        let inputs = [1.0, 2.0, 3.0, 4.0, 5.0];
        let targets = [2.0, 4.0, 6.0, 8.0, 10.0];
        let x = Tensor::from_slice(&inputs, &[5, 1]).unwrap();
        let y = Tensor::from_slice(&targets, &[5]).unwrap();

        let mut booster = GradientBoostingRegressor::new(50)
            .learning_rate(0.1)
            .max_depth(3);
        booster.fit(&x, &y);

        let predictions = booster.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }
}