1use ghostflow_core::Tensor;
4use crate::tree::{DecisionTreeClassifier, Criterion};
5use rayon::prelude::*;
6use rand::prelude::*;
7
/// AdaBoost multi-class classifier built from depth-1 decision-tree stumps.
pub struct AdaBoostClassifier {
    /// Number of boosting rounds (one stump is trained per round).
    pub n_estimators: usize,
    /// Shrinkage applied to each stump's vote weight (alpha).
    pub learning_rate: f32,
    /// Requested boosting variant.
    /// NOTE(review): stored but never consulted by `fit`, which always runs
    /// the discrete SAMME update — confirm whether SAMME.R support is planned.
    pub algorithm: AdaBoostAlgorithm,
    // Fitted stumps, one per completed boosting round.
    estimators_: Vec<DecisionTreeClassifier>,
    // Vote weight (alpha) for each fitted stump, parallel to `estimators_`.
    estimator_weights_: Vec<f32>,
    // Number of classes inferred from the training labels (max label + 1).
    n_classes_: usize,
}
17
/// Boosting-variant selector for [`AdaBoostClassifier`].
#[derive(Clone, Copy, Debug)]
pub enum AdaBoostAlgorithm {
    /// Discrete SAMME: each stump votes with its predicted class label.
    Samme,
    /// Real SAMME.R: stumps vote with class probabilities.
    SammeR,
}
23
24impl AdaBoostClassifier {
25 pub fn new(n_estimators: usize) -> Self {
26 AdaBoostClassifier {
27 n_estimators,
28 learning_rate: 1.0,
29 algorithm: AdaBoostAlgorithm::SammeR,
30 estimators_: Vec::new(),
31 estimator_weights_: Vec::new(),
32 n_classes_: 0,
33 }
34 }
35
36 pub fn learning_rate(mut self, lr: f32) -> Self {
37 self.learning_rate = lr;
38 self
39 }
40
41 pub fn algorithm(mut self, algo: AdaBoostAlgorithm) -> Self {
42 self.algorithm = algo;
43 self
44 }
45
46 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
47 let x_data = x.data_f32();
48 let y_data = y.data_f32();
49 let n_samples = x.dims()[0];
50 let n_features = x.dims()[1];
51
52 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
53
54 let mut sample_weights = vec![1.0f32 / n_samples as f32; n_samples];
56
57 self.estimators_.clear();
58 self.estimator_weights_.clear();
59
60 for _ in 0..self.n_estimators {
61 let mut tree = DecisionTreeClassifier::new()
63 .max_depth(1)
64 .criterion(Criterion::Gini);
65
66 let mut rng = thread_rng();
68 let indices: Vec<usize> = (0..n_samples)
69 .map(|_| {
70 let r: f32 = rng.gen();
71 let mut cumsum = 0.0f32;
72 for (i, &w) in sample_weights.iter().enumerate() {
73 cumsum += w;
74 if r < cumsum {
75 return i;
76 }
77 }
78 n_samples - 1
79 })
80 .collect();
81
82 let x_boot: Vec<f32> = indices.iter()
83 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
84 .collect();
85 let y_boot: Vec<f32> = indices.iter().map(|&i| y_data[i]).collect();
86
87 let x_tensor = Tensor::from_slice(&x_boot, &[indices.len(), n_features]).unwrap();
88 let y_tensor = Tensor::from_slice(&y_boot, &[indices.len()]).unwrap();
89
90 tree.fit(&x_tensor, &y_tensor);
91
92 let predictions = tree.predict(x);
94 let pred_data = predictions.data_f32();
95
96 let mut weighted_error = 0.0f32;
97 for i in 0..n_samples {
98 if (pred_data[i] - y_data[i]).abs() > 0.5 {
99 weighted_error += sample_weights[i];
100 }
101 }
102
103 weighted_error = weighted_error.clamp(1e-10, 1.0 - 1e-10);
105
106 let estimator_weight = self.learning_rate *
108 ((1.0 - weighted_error) / weighted_error).ln() +
109 (self.n_classes_ as f32 - 1.0).ln();
110
111 for i in 0..n_samples {
113 if (pred_data[i] - y_data[i]).abs() > 0.5 {
114 sample_weights[i] *= (estimator_weight).exp();
115 }
116 }
117
118 let sum: f32 = sample_weights.iter().sum();
120 for w in &mut sample_weights {
121 *w /= sum;
122 }
123
124 self.estimators_.push(tree);
125 self.estimator_weights_.push(estimator_weight);
126 }
127 }
128
129 pub fn predict(&self, x: &Tensor) -> Tensor {
130 let n_samples = x.dims()[0];
131
132 let mut class_scores = vec![vec![0.0f32; self.n_classes_]; n_samples];
134
135 for (tree, &weight) in self.estimators_.iter().zip(self.estimator_weights_.iter()) {
136 let predictions = tree.predict(x);
137 let pred_data = predictions.data_f32();
138
139 for i in 0..n_samples {
140 let class = pred_data[i] as usize;
141 if class < self.n_classes_ {
142 class_scores[i][class] += weight;
143 }
144 }
145 }
146
147 let predictions: Vec<f32> = class_scores.iter()
148 .map(|scores| {
149 scores.iter()
150 .enumerate()
151 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
152 .map(|(c, _)| c as f32)
153 .unwrap_or(0.0)
154 })
155 .collect();
156
157 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
158 }
159
160 pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
161 let predictions = self.predict(x);
162 let pred_data = predictions.data_f32();
163 let y_data = y.data_f32();
164
165 let correct: usize = pred_data.iter()
166 .zip(y_data.iter())
167 .filter(|(&p, &y)| (p - y).abs() < 0.5)
168 .count();
169
170 correct as f32 / y_data.len() as f32
171 }
172}
173
/// Bagging (bootstrap-aggregating) ensemble of decision trees.
pub struct BaggingClassifier {
    /// Number of trees in the ensemble.
    pub n_estimators: usize,
    /// Fraction of training rows drawn for each tree (clamped to 0.1..=1.0).
    pub max_samples: f32,
    /// Fraction of feature columns used per tree (clamped to 0.1..=1.0).
    pub max_features: f32,
    /// Draw rows with replacement when true, without replacement otherwise.
    pub bootstrap: bool,
    /// Draw feature columns with replacement when true.
    pub bootstrap_features: bool,
    // Fitted trees, in training order.
    estimators_: Vec<DecisionTreeClassifier>,
    // Number of classes inferred from the training labels (max label + 1).
    n_classes_: usize,
}
184
185impl BaggingClassifier {
186 pub fn new(n_estimators: usize) -> Self {
187 BaggingClassifier {
188 n_estimators,
189 max_samples: 1.0,
190 max_features: 1.0,
191 bootstrap: true,
192 bootstrap_features: false,
193 estimators_: Vec::new(),
194 n_classes_: 0,
195 }
196 }
197
198 pub fn max_samples(mut self, ratio: f32) -> Self {
199 self.max_samples = ratio.clamp(0.1, 1.0);
200 self
201 }
202
203 pub fn max_features(mut self, ratio: f32) -> Self {
204 self.max_features = ratio.clamp(0.1, 1.0);
205 self
206 }
207
208 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
209 let x_data = x.data_f32();
210 let y_data = y.data_f32();
211 let n_samples = x.dims()[0];
212 let n_features = x.dims()[1];
213
214 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
215
216 let n_samples_bootstrap = (n_samples as f32 * self.max_samples) as usize;
217 let n_features_bootstrap = (n_features as f32 * self.max_features) as usize;
218
219 self.estimators_ = (0..self.n_estimators)
220 .into_par_iter()
221 .map(|_| {
222 let mut rng = thread_rng();
223
224 let sample_indices: Vec<usize> = if self.bootstrap {
226 (0..n_samples_bootstrap).map(|_| rng.gen_range(0..n_samples)).collect()
227 } else {
228 (0..n_samples).choose_multiple(&mut rng, n_samples_bootstrap)
229 };
230
231 let feature_indices: Vec<usize> = if self.bootstrap_features {
233 (0..n_features_bootstrap).map(|_| rng.gen_range(0..n_features)).collect()
234 } else {
235 (0..n_features).choose_multiple(&mut rng, n_features_bootstrap)
236 };
237
238 let x_boot: Vec<f32> = sample_indices.iter()
240 .flat_map(|&i| {
241 feature_indices.iter().map(|&j| x_data[i * n_features + j]).collect::<Vec<_>>()
242 })
243 .collect();
244 let y_boot: Vec<f32> = sample_indices.iter().map(|&i| y_data[i]).collect();
245
246 let x_tensor = Tensor::from_slice(&x_boot, &[sample_indices.len(), feature_indices.len()]).unwrap();
247 let y_tensor = Tensor::from_slice(&y_boot, &[sample_indices.len()]).unwrap();
248
249 let mut tree = DecisionTreeClassifier::new();
250 tree.fit(&x_tensor, &y_tensor);
251 tree
252 })
253 .collect();
254 }
255
256 pub fn predict(&self, x: &Tensor) -> Tensor {
257 let n_samples = x.dims()[0];
258
259 let mut class_votes = vec![vec![0usize; self.n_classes_]; n_samples];
260
261 for tree in &self.estimators_ {
262 let predictions = tree.predict(x);
263 let pred_data = predictions.data_f32();
264
265 for i in 0..n_samples {
266 let class = pred_data[i] as usize;
267 if class < self.n_classes_ {
268 class_votes[i][class] += 1;
269 }
270 }
271 }
272
273 let predictions: Vec<f32> = class_votes.iter()
274 .map(|votes| {
275 votes.iter()
276 .enumerate()
277 .max_by_key(|(_, &v)| v)
278 .map(|(c, _)| c as f32)
279 .unwrap_or(0.0)
280 })
281 .collect();
282
283 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
284 }
285
286 pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
287 let predictions = self.predict(x);
288 let pred_data = predictions.data_f32();
289 let y_data = y.data_f32();
290
291 let correct: usize = pred_data.iter()
292 .zip(y_data.iter())
293 .filter(|(&p, &y)| (p - y).abs() < 0.5)
294 .count();
295
296 correct as f32 / y_data.len() as f32
297 }
298}
299
300
/// Forest of randomized decision trees, each trained on the full dataset
/// with a random feature subset per split.
/// NOTE(review): unlike canonical Extra-Trees, split thresholds come from
/// `DecisionTreeClassifier`'s best-split search rather than being drawn at
/// random — confirm this is intended.
pub struct ExtraTreesClassifier {
    /// Number of trees in the forest.
    pub n_estimators: usize,
    /// Optional depth cap applied to every tree.
    pub max_depth: Option<usize>,
    /// Minimum number of samples required to split an internal node.
    pub min_samples_split: usize,
    /// Features examined per split; defaults to sqrt(n_features) when `None`.
    pub max_features: Option<usize>,
    // Fitted trees.
    trees_: Vec<DecisionTreeClassifier>,
    // Number of classes inferred from the training labels (max label + 1).
    n_classes_: usize,
}
310
311impl ExtraTreesClassifier {
312 pub fn new(n_estimators: usize) -> Self {
313 ExtraTreesClassifier {
314 n_estimators,
315 max_depth: None,
316 min_samples_split: 2,
317 max_features: None,
318 trees_: Vec::new(),
319 n_classes_: 0,
320 }
321 }
322
323 pub fn max_depth(mut self, depth: usize) -> Self {
324 self.max_depth = Some(depth);
325 self
326 }
327
328 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
329 let _n_samples = x.dims()[0];
330 let n_features = x.dims()[1];
331 let y_data = y.data_f32();
332
333 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
334
335 let max_features = self.max_features.unwrap_or_else(|| (n_features as f32).sqrt() as usize);
336
337 self.trees_ = (0..self.n_estimators)
339 .into_par_iter()
340 .map(|_| {
341 let mut tree = DecisionTreeClassifier::new()
342 .criterion(Criterion::Gini)
343 .min_samples_split(self.min_samples_split);
344
345 if let Some(depth) = self.max_depth {
346 tree = tree.max_depth(depth);
347 }
348 tree.max_features = Some(max_features);
349
350 tree.fit(x, y);
352 tree
353 })
354 .collect();
355 }
356
357 pub fn predict(&self, x: &Tensor) -> Tensor {
358 let n_samples = x.dims()[0];
359
360 let mut class_votes = vec![vec![0usize; self.n_classes_]; n_samples];
361
362 for tree in &self.trees_ {
363 let predictions = tree.predict(x);
364 let pred_data = predictions.data_f32();
365
366 for i in 0..n_samples {
367 let class = pred_data[i] as usize;
368 if class < self.n_classes_ {
369 class_votes[i][class] += 1;
370 }
371 }
372 }
373
374 let predictions: Vec<f32> = class_votes.iter()
375 .map(|votes| {
376 votes.iter()
377 .enumerate()
378 .max_by_key(|(_, &v)| v)
379 .map(|(c, _)| c as f32)
380 .unwrap_or(0.0)
381 })
382 .collect();
383
384 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
385 }
386
387 pub fn predict_proba(&self, x: &Tensor) -> Tensor {
388 let n_samples = x.dims()[0];
389
390 let mut class_probs = vec![vec![0.0f32; self.n_classes_]; n_samples];
391
392 for tree in &self.trees_ {
393 let proba = tree.predict_proba(x);
394 let proba_data = proba.data_f32();
395
396 for i in 0..n_samples {
397 for c in 0..self.n_classes_ {
398 class_probs[i][c] += proba_data[i * self.n_classes_ + c];
399 }
400 }
401 }
402
403 let n_trees = self.trees_.len() as f32;
405 let mut result = Vec::with_capacity(n_samples * self.n_classes_);
406 for i in 0..n_samples {
407 for c in 0..self.n_classes_ {
408 result.push(class_probs[i][c] / n_trees);
409 }
410 }
411
412 Tensor::from_slice(&result, &[n_samples, self.n_classes_]).unwrap()
413 }
414
415 pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
416 let predictions = self.predict(x);
417 let pred_data = predictions.data_f32();
418 let y_data = y.data_f32();
419
420 let correct: usize = pred_data.iter()
421 .zip(y_data.iter())
422 .filter(|(&p, &y)| (p - y).abs() < 0.5)
423 .count();
424
425 correct as f32 / y_data.len() as f32
426 }
427}
428
/// Ensemble that combines heterogeneous classifiers by (weighted) voting.
pub struct VotingClassifier {
    /// Hard (label) or soft (probability) vote combination.
    pub voting: VotingType,
    /// Optional per-estimator weights; uniform weights are used when `None`.
    pub weights: Option<Vec<f32>>,
    // Registered base estimators, in insertion order.
    estimators_: Vec<Box<dyn Classifier + Send + Sync>>,
    // Number of classes inferred from the training labels (max label + 1).
    n_classes_: usize,
}
436
/// Vote-combination scheme for [`VotingClassifier`].
#[derive(Clone, Copy, Debug)]
pub enum VotingType {
    /// Weighted majority vote over predicted labels.
    Hard,
    /// Argmax of the weighted sum of per-class probabilities.
    Soft,
}
442
/// Minimal interface an estimator must expose to join a [`VotingClassifier`].
pub trait Classifier {
    /// Trains the estimator on features `x` and labels `y`.
    fn fit(&mut self, x: &Tensor, y: &Tensor);
    /// Returns one predicted class label per sample.
    fn predict(&self, x: &Tensor) -> Tensor;
    /// Returns per-class probabilities, or `None` if unsupported.
    fn predict_proba(&self, x: &Tensor) -> Option<Tensor>;
}
448
449impl VotingClassifier {
450 pub fn new(voting: VotingType) -> Self {
451 VotingClassifier {
452 voting,
453 weights: None,
454 estimators_: Vec::new(),
455 n_classes_: 0,
456 }
457 }
458
459 pub fn weights(mut self, w: Vec<f32>) -> Self {
460 self.weights = Some(w);
461 self
462 }
463
464 pub fn add_estimator(&mut self, estimator: Box<dyn Classifier + Send + Sync>) {
465 self.estimators_.push(estimator);
466 }
467
468 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
469 let y_data = y.data_f32();
470 self.n_classes_ = y_data.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
471
472 for estimator in &mut self.estimators_ {
473 estimator.fit(x, y);
474 }
475 }
476
477 pub fn predict(&self, x: &Tensor) -> Tensor {
478 let n_samples = x.dims()[0];
479 let weights = self.weights.clone().unwrap_or_else(|| vec![1.0; self.estimators_.len()]);
480
481 match self.voting {
482 VotingType::Hard => {
483 let mut class_votes = vec![vec![0.0f32; self.n_classes_]; n_samples];
484
485 for (estimator, &weight) in self.estimators_.iter().zip(weights.iter()) {
486 let predictions = estimator.predict(x);
487 let pred_data = predictions.data_f32();
488
489 for i in 0..n_samples {
490 let class = pred_data[i] as usize;
491 if class < self.n_classes_ {
492 class_votes[i][class] += weight;
493 }
494 }
495 }
496
497 let predictions: Vec<f32> = class_votes.iter()
498 .map(|votes| {
499 votes.iter()
500 .enumerate()
501 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
502 .map(|(c, _)| c as f32)
503 .unwrap_or(0.0)
504 })
505 .collect();
506
507 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
508 }
509 VotingType::Soft => {
510 let mut class_probs = vec![vec![0.0f32; self.n_classes_]; n_samples];
511
512 for (estimator, &weight) in self.estimators_.iter().zip(weights.iter()) {
513 if let Some(proba) = estimator.predict_proba(x) {
514 let proba_data = proba.data_f32();
515
516 for i in 0..n_samples {
517 for c in 0..self.n_classes_ {
518 class_probs[i][c] += weight * proba_data[i * self.n_classes_ + c];
519 }
520 }
521 }
522 }
523
524 let predictions: Vec<f32> = class_probs.iter()
525 .map(|probs| {
526 probs.iter()
527 .enumerate()
528 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
529 .map(|(c, _)| c as f32)
530 .unwrap_or(0.0)
531 })
532 .collect();
533
534 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
535 }
536 }
537 }
538}
539
/// Isolation Forest anomaly detector (Liu et al., 2008).
pub struct IsolationForest {
    /// Number of isolation trees.
    pub n_estimators: usize,
    /// Maximum rows subsampled per tree (paper default: 256).
    pub max_samples: usize,
    /// Expected fraction of anomalies in the data (clamped to 0.0..=0.5).
    pub contamination: f32,
    // Fitted isolation trees.
    trees_: Vec<IsolationTree>,
    // Score threshold derived from `contamination` during `fit`.
    threshold_: f32,
}
548
/// A single randomized isolation tree.
struct IsolationTree {
    /// Root node; populated by `IsolationForest::fit`.
    root: Option<IsolationNode>,
    #[allow(dead_code)]
    max_depth: usize,
}
554
/// One node of an isolation tree; a node with `feature == None` is a leaf.
struct IsolationNode {
    /// Split feature index; `None` marks a leaf.
    feature: Option<usize>,
    /// Split threshold; `None` for leaves.
    threshold: Option<f32>,
    /// Subtree with samples where `x[feature] < threshold`.
    left: Option<Box<IsolationNode>>,
    /// Subtree with the remaining samples.
    right: Option<Box<IsolationNode>>,
    /// Number of training samples that reached this node.
    size: usize,
}
562impl IsolationForest {
563 pub fn new(n_estimators: usize) -> Self {
564 IsolationForest {
565 n_estimators,
566 max_samples: 256,
567 contamination: 0.1,
568 trees_: Vec::new(),
569 threshold_: 0.0,
570 }
571 }
572
573 pub fn max_samples(mut self, n: usize) -> Self {
574 self.max_samples = n;
575 self
576 }
577
578 pub fn contamination(mut self, c: f32) -> Self {
579 self.contamination = c.clamp(0.0, 0.5);
580 self
581 }
582
583 fn build_tree(&self, x: &[f32], indices: &[usize], n_features: usize, depth: usize, max_depth: usize) -> IsolationNode {
584 let n = indices.len();
585
586 if depth >= max_depth || n <= 1 {
587 return IsolationNode {
588 feature: None,
589 threshold: None,
590 left: None,
591 right: None,
592 size: n,
593 };
594 }
595
596 let mut rng = thread_rng();
597 let feature = rng.gen_range(0..n_features);
598
599 let values: Vec<f32> = indices.iter()
601 .map(|&i| x[i * n_features + feature])
602 .collect();
603
604 let min_val = values.iter().cloned().fold(f32::INFINITY, f32::min);
605 let max_val = values.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
606
607 if (max_val - min_val).abs() < 1e-10 {
608 return IsolationNode {
609 feature: None,
610 threshold: None,
611 left: None,
612 right: None,
613 size: n,
614 };
615 }
616
617 let threshold = rng.gen_range(min_val..max_val);
618
619 let (left_indices, right_indices): (Vec<_>, Vec<_>) = indices.iter()
620 .partition(|&&i| x[i * n_features + feature] < threshold);
621
622 IsolationNode {
623 feature: Some(feature),
624 threshold: Some(threshold),
625 left: Some(Box::new(self.build_tree(x, &left_indices, n_features, depth + 1, max_depth))),
626 right: Some(Box::new(self.build_tree(x, &right_indices, n_features, depth + 1, max_depth))),
627 size: n,
628 }
629 }
630
631 fn path_length(&self, node: &IsolationNode, sample: &[f32], depth: usize) -> f32 {
632 if node.feature.is_none() {
633 let c = self.c_factor(node.size);
634 return depth as f32 + c;
635 }
636
637 let feature = node.feature.unwrap();
638 let threshold = node.threshold.unwrap();
639
640 if sample[feature] < threshold {
641 if let Some(ref left) = node.left {
642 return self.path_length(left, sample, depth + 1);
643 }
644 } else {
645 if let Some(ref right) = node.right {
646 return self.path_length(right, sample, depth + 1);
647 }
648 }
649
650 depth as f32
651 }
652
653 fn c_factor(&self, n: usize) -> f32 {
654 if n <= 1 {
655 return 0.0;
656 }
657 let n = n as f32;
658 2.0 * ((n - 1.0).ln() + 0.5772156649) - 2.0 * (n - 1.0) / n
659 }
660
661 pub fn fit(&mut self, x: &Tensor) {
662 let x_data = x.data_f32();
663 let n_samples = x.dims()[0];
664 let n_features = x.dims()[1];
665
666 let subsample_size = self.max_samples.min(n_samples);
667 let max_depth = (subsample_size as f32).log2().ceil() as usize;
668
669 self.trees_ = (0..self.n_estimators)
670 .map(|_| {
671 let mut rng = thread_rng();
672 let indices: Vec<usize> = (0..n_samples)
673 .choose_multiple(&mut rng, subsample_size);
674
675 let root = self.build_tree(&x_data, &indices, n_features, 0, max_depth);
676 IsolationTree { root: Some(root), max_depth }
677 })
678 .collect();
679
680 let scores = self.decision_function(x);
682 let mut score_data = scores.data_f32();
683 score_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
684
685 let threshold_idx = ((1.0 - self.contamination) * n_samples as f32) as usize;
686 self.threshold_ = score_data[threshold_idx.min(n_samples - 1)];
687 }
688
689 pub fn decision_function(&self, x: &Tensor) -> Tensor {
690 let x_data = x.data_f32();
691 let n_samples = x.dims()[0];
692 let n_features = x.dims()[1];
693
694 let c = self.c_factor(self.max_samples);
695
696 let scores: Vec<f32> = (0..n_samples)
697 .map(|i| {
698 let sample = &x_data[i * n_features..(i + 1) * n_features];
699
700 let avg_path_length: f32 = self.trees_.iter()
701 .map(|tree| {
702 if let Some(ref root) = tree.root {
703 self.path_length(root, sample, 0)
704 } else {
705 0.0
706 }
707 })
708 .sum::<f32>() / self.trees_.len() as f32;
709
710 -2.0f32.powf(-avg_path_length / c)
712 })
713 .collect();
714
715 Tensor::from_slice(&scores, &[n_samples]).unwrap()
716 }
717
718 pub fn predict(&self, x: &Tensor) -> Tensor {
719 let scores = self.decision_function(x);
720 let score_data = scores.data_f32();
721 let n_samples = x.dims()[0];
722
723 let predictions: Vec<f32> = score_data.iter()
724 .map(|&s| if s < self.threshold_ { 1.0 } else { -1.0 }) .collect();
726
727 Tensor::from_slice(&predictions, &[n_samples]).unwrap()
728 }
729}
730
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: boosting on a tiny AND-gate dataset should produce one
    // prediction per sample (no accuracy requirement).
    #[test]
    fn test_adaboost() {
        let x = Tensor::from_slice(&[
            0.0, 0.0,
            0.0, 1.0,
            1.0, 0.0,
            1.0, 1.0,
        ], &[4, 2]).unwrap();

        let y = Tensor::from_slice(&[0.0, 0.0, 0.0, 1.0], &[4]).unwrap();

        let mut ada = AdaBoostClassifier::new(10);
        ada.fit(&x, &y);

        let predictions = ada.predict(&x);
        assert_eq!(predictions.dims(), &[4]);
    }

    // Bagging on a linearly separable toy problem should beat chance on the
    // training set.
    #[test]
    fn test_bagging() {
        let x = Tensor::from_slice(&[
            0.0, 0.0,
            0.0, 1.0,
            1.0, 0.0,
            1.0, 1.0,
        ], &[4, 2]).unwrap();

        let y = Tensor::from_slice(&[0.0, 0.0, 1.0, 1.0], &[4]).unwrap();

        let mut bag = BaggingClassifier::new(10);
        bag.fit(&x, &y);

        let score = bag.score(&x, &y);
        assert!(score >= 0.5);
    }

    // Three clustered points plus one far outlier; only the output shape is
    // checked because tree construction is randomized.
    #[test]
    fn test_isolation_forest() {
        let x = Tensor::from_slice(&[
            0.0, 0.0,
            0.1, 0.1,
            0.2, 0.0,
            10.0, 10.0, ], &[4, 2]).unwrap();

        let mut iso = IsolationForest::new(10).contamination(0.25);
        iso.fit(&x);

        let predictions = iso.predict(&x);
        assert_eq!(predictions.dims(), &[4]);
    }
786}