1use ghostflow_core::Tensor;
4use crate::tree::{DecisionTreeClassifier, DecisionTreeRegressor, Criterion};
5use rayon::prelude::*;
6use rand::prelude::*;
7
/// Random-forest classifier: an ensemble of Gini decision trees, each trained
/// on a bootstrap sample, predicting via averaged class probabilities.
pub struct RandomForestClassifier {
    /// Number of trees in the forest.
    pub n_estimators: usize,
    /// Maximum depth of each tree; `None` means unbounded.
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Minimum number of samples required at a leaf node.
    pub min_samples_leaf: usize,
    /// Features considered per split; `None` resolves to sqrt(n_features) at fit time.
    pub max_features: Option<usize>,
    /// Whether each tree trains on a with-replacement bootstrap sample.
    pub bootstrap: bool,
    /// Requested parallelism. NOTE(review): currently unread — training always
    /// uses rayon's global pool; confirm whether this should configure it.
    pub n_jobs: Option<usize>,
    // Fitted trees; empty until `fit` is called.
    trees: Vec<DecisionTreeClassifier>,
    // Number of classes inferred from the labels during `fit`.
    n_classes: usize,
}
20
21impl RandomForestClassifier {
22 pub fn new(n_estimators: usize) -> Self {
23 RandomForestClassifier {
24 n_estimators,
25 max_depth: None,
26 min_samples_split: 2,
27 min_samples_leaf: 1,
28 max_features: None, bootstrap: true,
30 n_jobs: None,
31 trees: Vec::new(),
32 n_classes: 0,
33 }
34 }
35
36 pub fn max_depth(mut self, depth: usize) -> Self {
37 self.max_depth = Some(depth);
38 self
39 }
40
41 pub fn min_samples_split(mut self, n: usize) -> Self {
42 self.min_samples_split = n;
43 self
44 }
45
46 pub fn max_features(mut self, n: usize) -> Self {
47 self.max_features = Some(n);
48 self
49 }
50
51 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
52 let n_samples = x.dims()[0];
53 let n_features = x.dims()[1];
54
55 let y_data = y.data_f32();
57 self.n_classes = y_data.iter()
58 .map(|&v| v as usize)
59 .max()
60 .unwrap_or(0) + 1;
61
62 let max_features = self.max_features
64 .unwrap_or_else(|| (n_features as f32).sqrt() as usize);
65
66 self.trees = (0..self.n_estimators)
68 .into_par_iter()
69 .map(|_| {
70 let mut rng = thread_rng();
71
72 let indices: Vec<usize> = if self.bootstrap {
74 (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
75 } else {
76 (0..n_samples).collect()
77 };
78
79 let x_data = x.data_f32();
81 let y_data = y.data_f32();
82
83 let x_boot: Vec<f32> = indices.iter()
84 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
85 .collect();
86 let y_boot: Vec<f32> = indices.iter()
87 .map(|&i| y_data[i])
88 .collect();
89
90 let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
91 let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
92
93 let mut tree = DecisionTreeClassifier::new()
95 .criterion(Criterion::Gini);
96
97 if let Some(depth) = self.max_depth {
98 tree = tree.max_depth(depth);
99 }
100 tree = tree.min_samples_split(self.min_samples_split)
101 .min_samples_leaf(self.min_samples_leaf);
102
103 tree.max_features = Some(max_features);
104 tree.fit(&x_tensor, &y_tensor);
105 tree
106 })
107 .collect();
108 }
109
110 pub fn predict(&self, x: &Tensor) -> Tensor {
111 let proba = self.predict_proba(x);
112 let proba_data = proba.data_f32();
113 let n_samples = x.dims()[0];
114
115 let predictions: Vec<f32> = (0..n_samples)
116 .map(|i| {
117 let start = i * self.n_classes;
118 let sample_probs = &proba_data[start..start + self.n_classes];
119 sample_probs.iter()
120 .enumerate()
121 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
122 .map(|(idx, _)| idx as f32)
123 .unwrap_or(0.0)
124 })
125 .collect();
126
127 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
128 }
129
130 pub fn predict_proba(&self, x: &Tensor) -> Tensor {
131 let n_samples = x.dims()[0];
132
133 let all_probs: Vec<Vec<f32>> = self.trees.par_iter()
135 .map(|tree| tree.predict_proba(x).data_f32())
136 .collect();
137
138 let mut avg_probs = vec![0.0f32; n_samples * self.n_classes];
140
141 for probs in &all_probs {
142 for (i, &p) in probs.iter().enumerate() {
143 avg_probs[i] += p;
144 }
145 }
146
147 let n_trees = self.trees.len() as f32;
148 for p in &mut avg_probs {
149 *p /= n_trees;
150 }
151
152 Tensor::from_slice(&avg_probs, &[n_samples, self.n_classes]).unwrap()
153 }
154
155 pub fn feature_importances(&self, n_features: usize) -> Vec<f32> {
157 let mut importances = vec![0.0f32; n_features];
158
159 let uniform = 1.0 / n_features as f32;
162 importances.iter_mut().for_each(|x| *x = uniform);
163
164 importances
165 }
166}
167
/// Random-forest regressor: an ensemble of regression trees trained on
/// bootstrap samples, predicting via the mean of the per-tree predictions.
pub struct RandomForestRegressor {
    /// Number of trees in the forest.
    pub n_estimators: usize,
    /// Maximum depth of each tree; `None` means unbounded.
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Minimum number of samples required at a leaf node.
    pub min_samples_leaf: usize,
    /// Features considered per split; `None` resolves to n_features / 3 at fit time.
    pub max_features: Option<usize>,
    /// Whether each tree trains on a with-replacement bootstrap sample.
    pub bootstrap: bool,
    // Fitted trees; empty until `fit` is called.
    trees: Vec<DecisionTreeRegressor>,
}
178
179impl RandomForestRegressor {
180 pub fn new(n_estimators: usize) -> Self {
181 RandomForestRegressor {
182 n_estimators,
183 max_depth: None,
184 min_samples_split: 2,
185 min_samples_leaf: 1,
186 max_features: None,
187 bootstrap: true,
188 trees: Vec::new(),
189 }
190 }
191
192 pub fn max_depth(mut self, depth: usize) -> Self {
193 self.max_depth = Some(depth);
194 self
195 }
196
197 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
198 let n_samples = x.dims()[0];
199 let n_features = x.dims()[1];
200
201 let max_features = self.max_features
202 .unwrap_or(n_features / 3);
203
204 self.trees = (0..self.n_estimators)
205 .into_par_iter()
206 .map(|_| {
207 let mut rng = thread_rng();
208
209 let indices: Vec<usize> = if self.bootstrap {
210 (0..n_samples).map(|_| rng.gen_range(0..n_samples)).collect()
211 } else {
212 (0..n_samples).collect()
213 };
214
215 let x_data = x.data_f32();
216 let y_data = y.data_f32();
217
218 let x_boot: Vec<f32> = indices.iter()
219 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
220 .collect();
221 let y_boot: Vec<f32> = indices.iter()
222 .map(|&i| y_data[i])
223 .collect();
224
225 let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
226 let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
227
228 let mut tree = DecisionTreeRegressor::new();
229 if let Some(depth) = self.max_depth {
230 tree = tree.max_depth(depth);
231 }
232 tree.max_features = Some(max_features);
233 tree.fit(&x_tensor, &y_tensor);
234 tree
235 })
236 .collect();
237 }
238
239 pub fn predict(&self, x: &Tensor) -> Tensor {
240 let n_samples = x.dims()[0];
241
242 let all_preds: Vec<Vec<f32>> = self.trees.par_iter()
243 .map(|tree| tree.predict(x).data_f32())
244 .collect();
245
246 let mut avg_preds = vec![0.0f32; n_samples];
247
248 for preds in &all_preds {
249 for (i, &p) in preds.iter().enumerate() {
250 avg_preds[i] += p;
251 }
252 }
253
254 let n_trees = self.trees.len() as f32;
255 for p in &mut avg_preds {
256 *p /= n_trees;
257 }
258
259 Tensor::from_slice(&avg_preds, &[n_samples]).unwrap()
260 }
261}
262
/// Multiclass gradient boosting: one boosted ensemble of regression trees per
/// class, trained on softmax cross-entropy pseudo-residuals.
pub struct GradientBoostingClassifier {
    /// Number of boosting rounds (one tree per class per round).
    pub n_estimators: usize,
    /// Shrinkage applied to each tree's contribution.
    pub learning_rate: f32,
    /// Maximum depth of each regression tree.
    pub max_depth: usize,
    /// Minimum number of samples required to split a tree node.
    pub min_samples_split: usize,
    /// Fraction of rows sampled per tree; 1.0 disables subsampling.
    pub subsample: f32,
    // trees[c] holds the boosted trees for class c.
    trees: Vec<Vec<DecisionTreeRegressor>>,
    // Per-class initial log-odds scores, set during `fit`.
    initial_predictions: Vec<f32>,
    // Number of classes inferred from the labels during `fit`.
    n_classes: usize,
}
274
275impl GradientBoostingClassifier {
276 pub fn new(n_estimators: usize) -> Self {
277 GradientBoostingClassifier {
278 n_estimators,
279 learning_rate: 0.1,
280 max_depth: 3,
281 min_samples_split: 2,
282 subsample: 1.0,
283 trees: Vec::new(),
284 initial_predictions: Vec::new(),
285 n_classes: 0,
286 }
287 }
288
289 pub fn learning_rate(mut self, lr: f32) -> Self {
290 self.learning_rate = lr;
291 self
292 }
293
294 pub fn max_depth(mut self, depth: usize) -> Self {
295 self.max_depth = depth;
296 self
297 }
298
299 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
300 let n_samples = x.dims()[0];
301 let n_features = x.dims()[1];
302 let x_data = x.data_f32();
303 let y_data = y.data_f32();
304
305 self.n_classes = y_data.iter()
306 .map(|&v| v as usize)
307 .max()
308 .unwrap_or(0) + 1;
309
310 self.initial_predictions = vec![0.0; self.n_classes];
312 for c in 0..self.n_classes {
313 let count = y_data.iter().filter(|&&v| v as usize == c).count();
314 let prob = count as f32 / n_samples as f32;
315 self.initial_predictions[c] = (prob / (1.0 - prob + 1e-10)).ln();
316 }
317
318 let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
320 #[allow(clippy::needless_range_loop)]
321 for c in 0..self.n_classes {
322 for i in 0..n_samples {
323 f[c][i] = self.initial_predictions[c];
324 }
325 }
326
327 self.trees = (0..self.n_classes).map(|_| Vec::new()).collect();
328
329 for _ in 0..self.n_estimators {
331 let probs = self.softmax_predictions(&f);
333
334 for c in 0..self.n_classes {
336 let residuals: Vec<f32> = (0..n_samples)
338 .map(|i| {
339 let y_true = if y_data[i] as usize == c { 1.0 } else { 0.0 };
340 y_true - probs[c][i]
341 })
342 .collect();
343
344 let _residual_tensor = Tensor::from_slice(&residuals, &[n_samples]).unwrap();
345
346 let sample_indices: Vec<usize> = if self.subsample < 1.0 {
348 let mut rng = thread_rng();
349 let n_subsample = (n_samples as f32 * self.subsample) as usize;
350 (0..n_samples).choose_multiple(&mut rng, n_subsample)
351 } else {
352 (0..n_samples).collect()
353 };
354
355 let x_sub: Vec<f32> = sample_indices.iter()
356 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
357 .collect();
358 let r_sub: Vec<f32> = sample_indices.iter()
359 .map(|&i| residuals[i])
360 .collect();
361
362 let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
363 let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
364
365 let mut tree = DecisionTreeRegressor::new()
367 .max_depth(self.max_depth);
368 tree.min_samples_split = self.min_samples_split;
369 tree.fit(&x_tensor, &r_tensor);
370
371 let tree_preds = tree.predict(x);
373 let tree_pred_data = tree_preds.data_f32();
374
375 #[allow(clippy::needless_range_loop)]
376 for i in 0..n_samples {
377 f[c][i] += self.learning_rate * tree_pred_data[i];
378 }
379
380 self.trees[c].push(tree);
381 }
382 }
383 }
384
385 fn softmax_predictions(&self, f: &[Vec<f32>]) -> Vec<Vec<f32>> {
386 let n_samples = f[0].len();
387 let mut probs = vec![vec![0.0f32; n_samples]; self.n_classes];
388
389 for i in 0..n_samples {
390 let max_f = (0..self.n_classes).map(|c| f[c][i]).fold(f32::NEG_INFINITY, f32::max);
391 let sum_exp: f32 = (0..self.n_classes).map(|c| (f[c][i] - max_f).exp()).sum();
392
393 for c in 0..self.n_classes {
394 probs[c][i] = (f[c][i] - max_f).exp() / sum_exp;
395 }
396 }
397
398 probs
399 }
400
401 pub fn predict(&self, x: &Tensor) -> Tensor {
402 let proba = self.predict_proba(x);
403 let proba_data = proba.data_f32();
404 let n_samples = x.dims()[0];
405
406 let predictions: Vec<f32> = (0..n_samples)
407 .map(|i| {
408 let start = i * self.n_classes;
409 let sample_probs = &proba_data[start..start + self.n_classes];
410 sample_probs.iter()
411 .enumerate()
412 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
413 .map(|(idx, _)| idx as f32)
414 .unwrap_or(0.0)
415 })
416 .collect();
417
418 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
419 }
420
421 pub fn predict_proba(&self, x: &Tensor) -> Tensor {
422 let n_samples = x.dims()[0];
423
424 let mut f = vec![vec![0.0f32; n_samples]; self.n_classes];
426 #[allow(clippy::needless_range_loop)]
427 for c in 0..self.n_classes {
428 for i in 0..n_samples {
429 f[c][i] = self.initial_predictions[c];
430 }
431 }
432
433 for c in 0..self.n_classes {
435 for tree in &self.trees[c] {
436 let preds = tree.predict(x);
437 let pred_data = preds.data_f32();
438 for i in 0..n_samples {
439 f[c][i] += self.learning_rate * pred_data[i];
440 }
441 }
442 }
443
444 let probs = self.softmax_predictions(&f);
446
447 let mut result = Vec::with_capacity(n_samples * self.n_classes);
448 for i in 0..n_samples {
449 for c in 0..self.n_classes {
450 result.push(probs[c][i]);
451 }
452 }
453
454 Tensor::from_slice(&result, &[n_samples, self.n_classes]).unwrap()
455 }
456}
457
/// Gradient-boosted regression: a sequence of shallow regression trees, each
/// fit to the negative gradient of the chosen loss at the current prediction.
pub struct GradientBoostingRegressor {
    /// Number of boosting rounds (trees).
    pub n_estimators: usize,
    /// Shrinkage applied to each tree's contribution.
    pub learning_rate: f32,
    /// Maximum depth of each regression tree.
    pub max_depth: usize,
    /// Minimum number of samples required to split a tree node.
    pub min_samples_split: usize,
    /// Fraction of rows sampled per tree; 1.0 disables subsampling.
    pub subsample: f32,
    /// Loss whose negative gradient the trees fit.
    pub loss: GBLoss,
    // Fitted trees in boosting order; empty until `fit` is called.
    trees: Vec<DecisionTreeRegressor>,
    // Constant base prediction (the target mean), set during `fit`.
    initial_prediction: f32,
}
469
/// Loss function minimized by [`GradientBoostingRegressor`]; the trees fit
/// its negative gradient (the pseudo-residuals).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GBLoss {
    /// L2 loss; residual is `y - prediction`.
    SquaredError,
    /// L1 loss; residual is the sign of `y - prediction`.
    AbsoluteError,
    /// Quadratic within a fixed delta of the target, linear outside.
    Huber,
}
476
477impl GradientBoostingRegressor {
478 pub fn new(n_estimators: usize) -> Self {
479 GradientBoostingRegressor {
480 n_estimators,
481 learning_rate: 0.1,
482 max_depth: 3,
483 min_samples_split: 2,
484 subsample: 1.0,
485 loss: GBLoss::SquaredError,
486 trees: Vec::new(),
487 initial_prediction: 0.0,
488 }
489 }
490
491 pub fn learning_rate(mut self, lr: f32) -> Self {
492 self.learning_rate = lr;
493 self
494 }
495
496 pub fn max_depth(mut self, depth: usize) -> Self {
497 self.max_depth = depth;
498 self
499 }
500
501 pub fn loss(mut self, loss: GBLoss) -> Self {
502 self.loss = loss;
503 self
504 }
505
506 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
507 let n_samples = x.dims()[0];
508 let n_features = x.dims()[1];
509 let x_data = x.data_f32();
510 let y_data = y.data_f32();
511
512 self.initial_prediction = y_data.iter().sum::<f32>() / n_samples as f32;
514
515 let mut predictions = vec![self.initial_prediction; n_samples];
516
517 for _ in 0..self.n_estimators {
518 let residuals: Vec<f32> = match self.loss {
520 GBLoss::SquaredError => {
521 (0..n_samples).map(|i| y_data[i] - predictions[i]).collect()
522 }
523 GBLoss::AbsoluteError => {
524 (0..n_samples).map(|i| {
525 let diff = y_data[i] - predictions[i];
526 if diff > 0.0 { 1.0 } else if diff < 0.0 { -1.0 } else { 0.0 }
527 }).collect()
528 }
529 GBLoss::Huber => {
530 let delta = 1.0;
531 (0..n_samples).map(|i| {
532 let diff = y_data[i] - predictions[i];
533 if diff.abs() <= delta {
534 diff
535 } else {
536 delta * diff.signum()
537 }
538 }).collect()
539 }
540 };
541
542 let sample_indices: Vec<usize> = if self.subsample < 1.0 {
544 let mut rng = thread_rng();
545 let n_subsample = (n_samples as f32 * self.subsample) as usize;
546 (0..n_samples).choose_multiple(&mut rng, n_subsample)
547 } else {
548 (0..n_samples).collect()
549 };
550
551 let x_sub: Vec<f32> = sample_indices.iter()
552 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
553 .collect();
554 let r_sub: Vec<f32> = sample_indices.iter()
555 .map(|&i| residuals[i])
556 .collect();
557
558 let x_tensor = Tensor::from_slice(&x_sub, &[sample_indices.len(), n_features]).unwrap();
559 let r_tensor = Tensor::from_slice(&r_sub, &[sample_indices.len()]).unwrap();
560
561 let mut tree = DecisionTreeRegressor::new()
563 .max_depth(self.max_depth);
564 tree.min_samples_split = self.min_samples_split;
565 tree.fit(&x_tensor, &r_tensor);
566
567 let tree_preds = tree.predict(x);
569 let tree_pred_data = tree_preds.data_f32();
570
571 for i in 0..n_samples {
572 predictions[i] += self.learning_rate * tree_pred_data[i];
573 }
574
575 self.trees.push(tree);
576 }
577 }
578
579 pub fn predict(&self, x: &Tensor) -> Tensor {
580 let n_samples = x.dims()[0];
581 let mut predictions = vec![self.initial_prediction; n_samples];
582
583 for tree in &self.trees {
584 let tree_preds = tree.predict(x);
585 let tree_pred_data = tree_preds.data_f32();
586
587 for i in 0..n_samples {
588 predictions[i] += self.learning_rate * tree_pred_data[i];
589 }
590 }
591
592 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
593 }
594}
595
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: fit a small forest on a 5-sample, 2-feature data set and
    // check only the output shape (tree training is randomized, so exact
    // predictions are not asserted).
    #[test]
    fn test_random_forest_classifier() {
        let x = Tensor::from_slice(&[0.0f32, 0.0,
            0.0, 1.0,
            1.0, 0.0,
            1.0, 1.0,
            0.5, 0.5,
        ], &[5, 2]).unwrap();

        let y = Tensor::from_slice(&[0.0f32, 1.0, 1.0, 0.0, 0.0], &[5]).unwrap();

        let mut rf = RandomForestClassifier::new(10).max_depth(3);
        rf.fit(&x, &y);

        let predictions = rf.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }

    // Smoke test: boost on the linear target y = 2x and check only that one
    // prediction per input row comes back.
    #[test]
    fn test_gradient_boosting_regressor() {
        let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0,
        ], &[5, 1]).unwrap();

        let y = Tensor::from_slice(&[2.0f32, 4.0, 6.0, 8.0, 10.0], &[5]).unwrap();

        let mut gb = GradientBoostingRegressor::new(50)
            .learning_rate(0.1)
            .max_depth(3);
        gb.fit(&x, &y);

        let predictions = gb.predict(&x);
        assert_eq!(predictions.dims(), &[5]);
    }
}
634
635