// File: sklears_python/tree.rs
//! Python bindings for tree-based algorithms
//!
//! This module provides PyO3-based Python bindings for sklears tree algorithms,
//! including Decision Trees, Random Forest, and Extra Trees.

6use crate::linear::common::core_array1_to_py;
7use crate::utils::{numpy_to_ndarray1, numpy_to_ndarray2};
8use numpy::{PyArray1, PyArray2};
9use pyo3::exceptions::{PyRuntimeError, PyValueError};
10use pyo3::prelude::*;
11use scirs2_core::ndarray::{Array1, Array2};
12use sklears_core::traits::{Fit, Predict, Trained};
13use sklears_tree::random_forest::RandomForestRegressor;
14use sklears_tree::{DecisionTree, MaxFeatures, RandomForestClassifier, SplitCriterion};
15
/// Python wrapper for Decision Tree Classifier
#[pyclass(name = "DecisionTreeClassifier")]
pub struct PyDecisionTreeClassifier {
    // Unfitted builder-state model; `fit` takes it (leaving None) so a
    // second call to `fit` can be detected and reported.
    inner: Option<DecisionTree>,
    // Trained model, populated only by a successful `fit`.
    trained: Option<DecisionTree<Trained>>,
}
22
23#[pymethods]
24impl PyDecisionTreeClassifier {
25    #[new]
26    #[allow(clippy::too_many_arguments)]
27    #[pyo3(signature = (
28        criterion="gini",
29        _splitter="best",
30        max_depth=None,
31        min_samples_split=2,
32        min_samples_leaf=1,
33        _min_weight_fraction_leaf=0.0,
34        _max_features=None,
35        random_state=None,
36        _max_leaf_nodes=None,
37        _min_impurity_decrease=0.0,
38        _class_weight=None,
39        _ccp_alpha=0.0
40    ))]
41    fn new(
42        criterion: &str,
43        _splitter: &str,
44        max_depth: Option<usize>,
45        min_samples_split: usize,
46        min_samples_leaf: usize,
47        _min_weight_fraction_leaf: f64,
48        _max_features: Option<&str>,
49        random_state: Option<u64>,
50        _max_leaf_nodes: Option<usize>,
51        _min_impurity_decrease: f64,
52        _class_weight: Option<&str>,
53        _ccp_alpha: f64,
54    ) -> PyResult<Self> {
55        let split_criterion = match criterion {
56            "gini" => SplitCriterion::Gini,
57            "entropy" => SplitCriterion::Entropy,
58            "log_loss" => SplitCriterion::LogLoss,
59            _ => {
60                return Err(PyValueError::new_err(format!(
61                    "Unknown criterion: {}",
62                    criterion
63                )))
64            }
65        };
66
67        let mut tree = DecisionTree::new()
68            .criterion(split_criterion)
69            .min_samples_split(min_samples_split)
70            .min_samples_leaf(min_samples_leaf);
71
72        if let Some(depth) = max_depth {
73            tree = tree.max_depth(depth);
74        }
75
76        if let Some(seed) = random_state {
77            tree = tree.random_state(Some(seed));
78        }
79
80        Ok(Self {
81            inner: Some(tree),
82            trained: None,
83        })
84    }
85
86    /// Fit the decision tree classifier
87    fn fit<'py>(
88        &mut self,
89        x: &Bound<'py, PyArray2<f64>>,
90        y: &Bound<'py, PyArray1<f64>>,
91    ) -> PyResult<()> {
92        let x_array = numpy_to_ndarray2(x)?;
93        let y_array = numpy_to_ndarray1(y)?;
94
95        let model = self.inner.take().ok_or_else(|| {
96            PyRuntimeError::new_err("Model has already been fitted or was not initialized")
97        })?;
98
99        match model.fit(&x_array, &y_array) {
100            Ok(trained_model) => {
101                self.trained = Some(trained_model);
102                Ok(())
103            }
104            Err(e) => Err(PyRuntimeError::new_err(format!(
105                "Failed to fit model: {}",
106                e
107            ))),
108        }
109    }
110
111    /// Make predictions using the fitted model
112    fn predict<'py>(
113        &self,
114        py: Python<'py>,
115        x: &Bound<'py, PyArray2<f64>>,
116    ) -> PyResult<Py<PyArray1<f64>>> {
117        let trained_model = self.trained.as_ref().ok_or_else(|| {
118            PyRuntimeError::new_err("Model must be fitted before making predictions")
119        })?;
120
121        let x_array = numpy_to_ndarray2(x)?;
122
123        let predictions: Array1<f64> =
124            Predict::<Array2<f64>, Array1<f64>>::predict(trained_model, &x_array)
125                .map_err(|e| PyRuntimeError::new_err(format!("Prediction failed: {}", e)))?;
126        Ok(core_array1_to_py(py, &predictions))
127    }
128
129    /// Get feature importances
130    fn feature_importances_<'py>(&self, py: Python<'py>) -> PyResult<Py<PyArray1<f64>>> {
131        let trained_model = self.trained.as_ref().ok_or_else(|| {
132            PyRuntimeError::new_err("Model must be fitted before accessing feature importances")
133        })?;
134
135        match trained_model.feature_importances() {
136            Some(importances) => Ok(core_array1_to_py(py, importances)),
137            None => Err(PyRuntimeError::new_err("Feature importances not available")),
138        }
139    }
140
141    fn __repr__(&self) -> String {
142        if self.trained.is_some() {
143            "DecisionTreeClassifier(fitted=True)".to_string()
144        } else {
145            "DecisionTreeClassifier(fitted=False)".to_string()
146        }
147    }
148}
149
/// Python wrapper for Decision Tree Regressor
#[pyclass(name = "DecisionTreeRegressor")]
pub struct PyDecisionTreeRegressor {
    // Unfitted builder-state model; `fit` takes it (leaving None) so a
    // second call to `fit` can be detected and reported.
    inner: Option<DecisionTree>,
    // Trained model, populated only by a successful `fit`.
    trained: Option<DecisionTree<Trained>>,
}
156
157#[pymethods]
158impl PyDecisionTreeRegressor {
159    #[new]
160    #[allow(clippy::too_many_arguments)]
161    #[pyo3(signature = (
162        criterion="squared_error",
163        _splitter="best",
164        max_depth=None,
165        min_samples_split=2,
166        min_samples_leaf=1,
167        _min_weight_fraction_leaf=0.0,
168        _max_features=None,
169        random_state=None,
170        _max_leaf_nodes=None,
171        _min_impurity_decrease=0.0,
172        _ccp_alpha=0.0
173    ))]
174    fn new(
175        criterion: &str,
176        _splitter: &str,
177        max_depth: Option<usize>,
178        min_samples_split: usize,
179        min_samples_leaf: usize,
180        _min_weight_fraction_leaf: f64,
181        _max_features: Option<&str>,
182        random_state: Option<u64>,
183        _max_leaf_nodes: Option<usize>,
184        _min_impurity_decrease: f64,
185        _ccp_alpha: f64,
186    ) -> PyResult<Self> {
187        let split_criterion = match criterion {
188            "squared_error" | "mse" => SplitCriterion::MSE,
189            "mae" | "absolute_error" => SplitCriterion::MAE,
190            _ => {
191                return Err(PyValueError::new_err(format!(
192                    "Unknown criterion: {}",
193                    criterion
194                )))
195            }
196        };
197
198        let mut tree = DecisionTree::new()
199            .criterion(split_criterion)
200            .min_samples_split(min_samples_split)
201            .min_samples_leaf(min_samples_leaf);
202
203        if let Some(depth) = max_depth {
204            tree = tree.max_depth(depth);
205        }
206
207        if let Some(seed) = random_state {
208            tree = tree.random_state(Some(seed));
209        }
210
211        Ok(Self {
212            inner: Some(tree),
213            trained: None,
214        })
215    }
216
217    /// Fit the decision tree regressor
218    fn fit(&mut self, x: &Bound<'_, PyArray2<f64>>, y: &Bound<'_, PyArray1<f64>>) -> PyResult<()> {
219        let x_array = numpy_to_ndarray2(x)?;
220        let y_array = numpy_to_ndarray1(y)?;
221
222        let model = self.inner.take().ok_or_else(|| {
223            PyRuntimeError::new_err("Model has already been fitted or was not initialized")
224        })?;
225
226        match model.fit(&x_array, &y_array) {
227            Ok(trained_model) => {
228                self.trained = Some(trained_model);
229                Ok(())
230            }
231            Err(e) => Err(PyRuntimeError::new_err(format!(
232                "Failed to fit model: {}",
233                e
234            ))),
235        }
236    }
237
238    /// Make predictions using the fitted model
239    fn predict<'py>(
240        &self,
241        py: Python<'py>,
242        x: &Bound<'py, PyArray2<f64>>,
243    ) -> PyResult<Py<PyArray1<f64>>> {
244        let trained_model = self.trained.as_ref().ok_or_else(|| {
245            PyRuntimeError::new_err("Model must be fitted before making predictions")
246        })?;
247
248        let x_array = numpy_to_ndarray2(x)?;
249
250        let predictions: Array1<f64> =
251            Predict::<Array2<f64>, Array1<f64>>::predict(trained_model, &x_array)
252                .map_err(|e| PyRuntimeError::new_err(format!("Prediction failed: {}", e)))?;
253        Ok(core_array1_to_py(py, &predictions))
254    }
255
256    /// Get feature importances
257    fn feature_importances_<'py>(&self, py: Python<'py>) -> PyResult<Py<PyArray1<f64>>> {
258        let trained_model = self.trained.as_ref().ok_or_else(|| {
259            PyRuntimeError::new_err("Model must be fitted before accessing feature importances")
260        })?;
261
262        match trained_model.feature_importances() {
263            Some(importances) => Ok(core_array1_to_py(py, importances)),
264            None => Err(PyRuntimeError::new_err("Feature importances not available")),
265        }
266    }
267
268    fn __repr__(&self) -> String {
269        if self.trained.is_some() {
270            "DecisionTreeRegressor(fitted=True)".to_string()
271        } else {
272            "DecisionTreeRegressor(fitted=False)".to_string()
273        }
274    }
275}
276
/// Python wrapper for Random Forest Classifier
#[pyclass(name = "RandomForestClassifier")]
pub struct PyRandomForestClassifier {
    // Unfitted builder-state forest; `fit` takes it (leaving None) so a
    // second call to `fit` can be detected and reported.
    inner: Option<RandomForestClassifier>,
    // Trained forest, populated only by a successful `fit`.
    trained: Option<RandomForestClassifier<Trained>>,
}
283
284#[pymethods]
285impl PyRandomForestClassifier {
286    #[new]
287    #[allow(clippy::too_many_arguments)]
288    #[pyo3(signature = (
289        n_estimators=100,
290        criterion="gini",
291        max_depth=None,
292        min_samples_split=2,
293        min_samples_leaf=1,
294        _min_weight_fraction_leaf=0.0,
295        max_features="sqrt",
296        _max_leaf_nodes=None,
297        _min_impurity_decrease=0.0,
298        bootstrap=true,
299        _oob_score=false,
300        n_jobs=None,
301        random_state=None,
302        _verbose=0,
303        _warm_start=false,
304        _class_weight=None,
305        _ccp_alpha=0.0,
306        _max_samples=None
307    ))]
308    fn new(
309        n_estimators: usize,
310        criterion: &str,
311        max_depth: Option<usize>,
312        min_samples_split: usize,
313        min_samples_leaf: usize,
314        _min_weight_fraction_leaf: f64,
315        max_features: &str,
316        _max_leaf_nodes: Option<usize>,
317        _min_impurity_decrease: f64,
318        bootstrap: bool,
319        _oob_score: bool,
320        n_jobs: Option<i32>,
321        random_state: Option<u64>,
322        _verbose: i32,
323        _warm_start: bool,
324        _class_weight: Option<&str>,
325        _ccp_alpha: f64,
326        _max_samples: Option<f64>,
327    ) -> PyResult<Self> {
328        let split_criterion = match criterion {
329            "gini" => SplitCriterion::Gini,
330            "entropy" => SplitCriterion::Entropy,
331            "log_loss" => SplitCriterion::LogLoss,
332            _ => {
333                return Err(PyValueError::new_err(format!(
334                    "Unknown criterion: {}",
335                    criterion
336                )))
337            }
338        };
339
340        let max_features_strategy = match max_features {
341            "auto" | "sqrt" => MaxFeatures::Sqrt,
342            "log2" => MaxFeatures::Log2,
343            _ => {
344                return Err(PyValueError::new_err(format!(
345                    "Unknown max_features: {}",
346                    max_features
347                )))
348            }
349        };
350
351        let mut forest = RandomForestClassifier::new()
352            .n_estimators(n_estimators)
353            .criterion(split_criterion)
354            .min_samples_split(min_samples_split)
355            .min_samples_leaf(min_samples_leaf)
356            .max_features(max_features_strategy)
357            .bootstrap(bootstrap);
358
359        if let Some(depth) = max_depth {
360            forest = forest.max_depth(depth);
361        }
362
363        if let Some(seed) = random_state {
364            forest = forest.random_state(seed);
365        }
366
367        if let Some(jobs) = n_jobs {
368            forest = forest.n_jobs(jobs);
369        }
370
371        Ok(Self {
372            inner: Some(forest),
373            trained: None,
374        })
375    }
376
377    /// Fit the random forest classifier
378    fn fit(&mut self, x: &Bound<'_, PyArray2<f64>>, y: &Bound<'_, PyArray1<f64>>) -> PyResult<()> {
379        let x_array = numpy_to_ndarray2(x)?;
380        let y_array = numpy_to_ndarray1(y)?;
381
382        let y_int: Array1<i32> = y_array.mapv(|val| val as i32);
383
384        let model = self.inner.take().ok_or_else(|| {
385            PyRuntimeError::new_err("Model has already been fitted or was not initialized")
386        })?;
387
388        match model.fit(&x_array, &y_int) {
389            Ok(trained_model) => {
390                self.trained = Some(trained_model);
391                Ok(())
392            }
393            Err(e) => Err(PyRuntimeError::new_err(format!(
394                "Failed to fit model: {}",
395                e
396            ))),
397        }
398    }
399
400    /// Make predictions using the fitted model
401    fn predict<'py>(
402        &self,
403        py: Python<'py>,
404        x: &Bound<'py, PyArray2<f64>>,
405    ) -> PyResult<Py<PyArray1<f64>>> {
406        let trained_model = self.trained.as_ref().ok_or_else(|| {
407            PyRuntimeError::new_err("Model must be fitted before making predictions")
408        })?;
409
410        let x_array = numpy_to_ndarray2(x)?;
411
412        let predictions: Array1<i32> =
413            Predict::<Array2<f64>, Array1<i32>>::predict(trained_model, &x_array)
414                .map_err(|e| PyRuntimeError::new_err(format!("Prediction failed: {}", e)))?;
415        let predictions_f64: Vec<f64> = predictions.iter().map(|&v| v as f64).collect();
416        Ok(PyArray1::from_vec(py, predictions_f64).unbind())
417    }
418
419    /// Get feature importances
420    fn feature_importances_<'py>(&self, py: Python<'py>) -> PyResult<Py<PyArray1<f64>>> {
421        let trained_model = self.trained.as_ref().ok_or_else(|| {
422            PyRuntimeError::new_err("Model must be fitted before accessing feature importances")
423        })?;
424
425        match trained_model.feature_importances() {
426            Ok(importances) => Ok(core_array1_to_py(py, &importances)),
427            Err(e) => Err(PyRuntimeError::new_err(format!(
428                "Failed to compute feature importances: {}",
429                e
430            ))),
431        }
432    }
433
434    fn __repr__(&self) -> String {
435        if self.trained.is_some() {
436            "RandomForestClassifier(fitted=True)".to_string()
437        } else {
438            "RandomForestClassifier(fitted=False)".to_string()
439        }
440    }
441}
442
/// Python wrapper for Random Forest Regressor
#[pyclass(name = "RandomForestRegressor")]
pub struct PyRandomForestRegressor {
    // Unfitted builder-state forest; `fit` takes it (leaving None) so a
    // second call to `fit` can be detected and reported.
    inner: Option<RandomForestRegressor>,
    // Trained forest, populated only by a successful `fit`.
    trained: Option<RandomForestRegressor<Trained>>,
}
449
450#[pymethods]
451impl PyRandomForestRegressor {
452    #[new]
453    #[allow(clippy::too_many_arguments)]
454    #[pyo3(signature = (
455        n_estimators=100,
456        criterion="squared_error",
457        max_depth=None,
458        min_samples_split=2,
459        min_samples_leaf=1,
460        _min_weight_fraction_leaf=0.0,
461        max_features=1.0,
462        _max_leaf_nodes=None,
463        _min_impurity_decrease=0.0,
464        bootstrap=true,
465        _oob_score=false,
466        n_jobs=None,
467        random_state=None,
468        _verbose=0,
469        _warm_start=false,
470        _ccp_alpha=0.0,
471        _max_samples=None
472    ))]
473    fn new(
474        n_estimators: usize,
475        criterion: &str,
476        max_depth: Option<usize>,
477        min_samples_split: usize,
478        min_samples_leaf: usize,
479        _min_weight_fraction_leaf: f64,
480        max_features: f64,
481        _max_leaf_nodes: Option<usize>,
482        _min_impurity_decrease: f64,
483        bootstrap: bool,
484        _oob_score: bool,
485        n_jobs: Option<i32>,
486        random_state: Option<u64>,
487        _verbose: i32,
488        _warm_start: bool,
489        _ccp_alpha: f64,
490        _max_samples: Option<f64>,
491    ) -> PyResult<Self> {
492        let split_criterion = match criterion {
493            "squared_error" | "mse" => SplitCriterion::MSE,
494            "mae" | "absolute_error" => SplitCriterion::MAE,
495            _ => {
496                return Err(PyValueError::new_err(format!(
497                    "Unknown criterion: {}",
498                    criterion
499                )))
500            }
501        };
502
503        let max_features_strategy = if (max_features - 1.0).abs() < f64::EPSILON {
504            MaxFeatures::All
505        } else {
506            MaxFeatures::Fraction(max_features)
507        };
508
509        let mut forest = RandomForestRegressor::new()
510            .n_estimators(n_estimators)
511            .criterion(split_criterion)
512            .min_samples_split(min_samples_split)
513            .min_samples_leaf(min_samples_leaf)
514            .max_features(max_features_strategy)
515            .bootstrap(bootstrap);
516
517        if let Some(depth) = max_depth {
518            forest = forest.max_depth(depth);
519        }
520
521        if let Some(seed) = random_state {
522            forest = forest.random_state(seed);
523        }
524
525        if let Some(jobs) = n_jobs {
526            forest = forest.n_jobs(jobs);
527        }
528
529        Ok(Self {
530            inner: Some(forest),
531            trained: None,
532        })
533    }
534
535    /// Fit the random forest regressor
536    fn fit(&mut self, x: &Bound<'_, PyArray2<f64>>, y: &Bound<'_, PyArray1<f64>>) -> PyResult<()> {
537        let x_array = numpy_to_ndarray2(x)?;
538        let y_array = numpy_to_ndarray1(y)?;
539
540        let model = self.inner.take().ok_or_else(|| {
541            PyRuntimeError::new_err("Model has already been fitted or was not initialized")
542        })?;
543
544        match model.fit(&x_array, &y_array) {
545            Ok(trained_model) => {
546                self.trained = Some(trained_model);
547                Ok(())
548            }
549            Err(e) => Err(PyRuntimeError::new_err(format!(
550                "Failed to fit model: {}",
551                e
552            ))),
553        }
554    }
555
556    /// Make predictions using the fitted model
557    fn predict<'py>(
558        &self,
559        py: Python<'py>,
560        x: &Bound<'py, PyArray2<f64>>,
561    ) -> PyResult<Py<PyArray1<f64>>> {
562        let trained_model = self.trained.as_ref().ok_or_else(|| {
563            PyRuntimeError::new_err("Model must be fitted before making predictions")
564        })?;
565
566        let x_array = numpy_to_ndarray2(x)?;
567
568        let predictions: Array1<f64> =
569            Predict::<Array2<f64>, Array1<f64>>::predict(trained_model, &x_array)
570                .map_err(|e| PyRuntimeError::new_err(format!("Prediction failed: {}", e)))?;
571        Ok(core_array1_to_py(py, &predictions))
572    }
573
574    /// Get feature importances
575    fn feature_importances_<'py>(&self, py: Python<'py>) -> PyResult<Py<PyArray1<f64>>> {
576        let trained_model = self.trained.as_ref().ok_or_else(|| {
577            PyRuntimeError::new_err("Model must be fitted before accessing feature importances")
578        })?;
579
580        match trained_model.feature_importances() {
581            Ok(importances) => Ok(core_array1_to_py(py, &importances)),
582            Err(e) => Err(PyRuntimeError::new_err(format!(
583                "Failed to compute feature importances: {}",
584                e
585            ))),
586        }
587    }
588
589    fn __repr__(&self) -> String {
590        if self.trained.is_some() {
591            "RandomForestRegressor(fitted=True)".to_string()
592        } else {
593            "RandomForestRegressor(fitted=False)".to_string()
594        }
595    }
596}