use crate::decision_tree::{
ClassificationCriterion, DecisionTreeClassifier, DecisionTreeRegressor,
FittedDecisionTreeClassifier, FittedDecisionTreeRegressor,
};
use ferrolearn_core::error::FerroError;
use ferrolearn_core::introspection::HasClasses;
use ferrolearn_core::pipeline::{FittedPipelineEstimator, PipelineEstimator};
use ferrolearn_core::traits::{Fit, Predict};
use ndarray::{Array1, Array2};
use num_traits::{Float, FromPrimitive, ToPrimitive};
use serde::{Deserialize, Serialize};
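/// A voting ensemble of decision-tree classifiers.
///
/// One `DecisionTreeClassifier` is trained per entry in `max_depths`, all on the
/// full training data. `predict` takes a majority vote across the trees, and
/// `predict_proba` averages their per-class probabilities.
///
/// A minimal usage sketch (marked `ignore` because it omits the crate-specific
/// import of `VotingClassifier`):
///
/// ```ignore
/// use ferrolearn_core::traits::{Fit, Predict};
/// use ndarray::array;
///
/// let x = array![[1.0_f64, 2.0], [2.0, 3.0], [5.0, 6.0], [6.0, 7.0]];
/// let y = array![0_usize, 0, 1, 1];
/// let fitted = VotingClassifier::<f64>::new()
///     .with_max_depths(vec![Some(2), None])
///     .fit(&x, &y)
///     .unwrap();
/// let preds = fitted.predict(&x).unwrap();
/// assert_eq!(preds.len(), 4);
/// ```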
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VotingClassifier<F> {
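/// Maximum depth for each tree in the ensemble; one tree is trained per entry,
/// and `None` means the depth is unlimited.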
pub max_depths: Vec<Option<usize>>,
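/// Minimum number of samples required to split an internal node, passed to every tree.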
pub min_samples_split: usize,
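/// Minimum number of samples required at a leaf node, passed to every tree.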
pub min_samples_leaf: usize,
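/// Split-quality criterion (e.g. Gini impurity or entropy) shared by all trees.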
pub criterion: ClassificationCriterion,
_marker: std::marker::PhantomData<F>,
}
impl<F: Float> VotingClassifier<F> {
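/// Creates a classifier with default settings: depths `[Some(2), Some(4), Some(6), None]`,
/// `min_samples_split = 2`, `min_samples_leaf = 1`, and the Gini criterion.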
#[must_use]
pub fn new() -> Self {
Self {
max_depths: vec![Some(2), Some(4), Some(6), None],
min_samples_split: 2,
min_samples_leaf: 1,
criterion: ClassificationCriterion::Gini,
_marker: std::marker::PhantomData,
}
}
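/// Sets the list of maximum depths; one tree is trained per entry.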
#[must_use]
pub fn with_max_depths(mut self, max_depths: Vec<Option<usize>>) -> Self {
self.max_depths = max_depths;
self
}
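/// Sets the minimum number of samples required to split an internal node.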
#[must_use]
pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
self.min_samples_split = min_samples_split;
self
}
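/// Sets the minimum number of samples required at a leaf node.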
#[must_use]
pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
self.min_samples_leaf = min_samples_leaf;
self
}
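/// Sets the split-quality criterion used by every tree.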
#[must_use]
pub fn with_criterion(mut self, criterion: ClassificationCriterion) -> Self {
self.criterion = criterion;
self
}
}
impl<F: Float> Default for VotingClassifier<F> {
fn default() -> Self {
Self::new()
}
}
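/// A fitted voting ensemble of decision-tree classifiers, produced by
/// `VotingClassifier::fit`.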
#[derive(Debug, Clone)]
pub struct FittedVotingClassifier<F> {
trees: Vec<FittedDecisionTreeClassifier<F>>,
classes: Vec<usize>,
}
impl<F: Float + Send + Sync + 'static> FittedVotingClassifier<F> {
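/// Returns the number of fitted trees in the ensemble.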
#[must_use]
pub fn n_estimators(&self) -> usize {
self.trees.len()
}
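/// Returns the mean accuracy of the ensemble's predictions on `x` against `y`.
///
/// # Errors
///
/// Returns `FerroError::ShapeMismatch` if `y.len() != x.nrows()`, and propagates
/// any error produced while predicting with the underlying trees.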
pub fn score(&self, x: &Array2<F>, y: &Array1<usize>) -> Result<F, FerroError> {
if x.nrows() != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![x.nrows()],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
let preds = self.predict(x)?;
Ok(crate::mean_accuracy(&preds, y))
}
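/// Returns class probabilities averaged over all trees, with shape
/// `(n_samples, n_classes)`.
///
/// # Errors
///
/// Propagates any error produced by `predict_proba` on the underlying trees.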
pub fn predict_proba(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
let n_samples = x.nrows();
let n_classes = self.classes.len();
let n_trees_f = F::from(self.trees.len()).unwrap();
let mut proba = Array2::<F>::zeros((n_samples, n_classes));
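// Accumulate per-tree class probabilities, then divide by the number of trees
// to average them. Column order is assumed to match `self.classes`, since
// every tree is fitted on the same label set.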
for tree in &self.trees {
let tree_proba = tree.predict_proba(x)?;
for i in 0..n_samples {
for j in 0..n_classes {
proba[[i, j]] = proba[[i, j]] + tree_proba[[i, j]];
}
}
}
for i in 0..n_samples {
for j in 0..n_classes {
proba[[i, j]] = proba[[i, j]] / n_trees_f;
}
}
Ok(proba)
}
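/// Returns the logarithm of the averaged class probabilities
/// (see [`Self::predict_proba`]).
///
/// # Errors
///
/// Propagates any error produced by `predict_proba` on the underlying trees.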
pub fn predict_log_proba(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
let proba = self.predict_proba(x)?;
Ok(crate::log_proba(&proba))
}
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, Array1<usize>> for VotingClassifier<F> {
type Fitted = FittedVotingClassifier<F>;
type Error = FerroError;
fn fit(
&self,
x: &Array2<F>,
y: &Array1<usize>,
) -> Result<FittedVotingClassifier<F>, FerroError> {
let n_samples = x.nrows();
if n_samples != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![n_samples],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
if n_samples == 0 {
return Err(FerroError::InsufficientSamples {
required: 1,
actual: 0,
context: "VotingClassifier requires at least one sample".into(),
});
}
if self.max_depths.is_empty() {
return Err(FerroError::InvalidParameter {
name: "max_depths".into(),
reason: "must contain at least one entry".into(),
});
}
let mut classes: Vec<usize> = y.iter().copied().collect();
classes.sort_unstable();
classes.dedup();
let mut trees = Vec::with_capacity(self.max_depths.len());
for &max_depth in &self.max_depths {
let tree = DecisionTreeClassifier::<F>::new()
.with_max_depth(max_depth)
.with_min_samples_split(self.min_samples_split)
.with_min_samples_leaf(self.min_samples_leaf)
.with_criterion(self.criterion);
let fitted = tree.fit(x, y)?;
trees.push(fitted);
}
Ok(FittedVotingClassifier { trees, classes })
}
}
impl<F: Float + Send + Sync + 'static> Predict<Array2<F>> for FittedVotingClassifier<F> {
type Output = Array1<usize>;
type Error = FerroError;
fn predict(&self, x: &Array2<F>) -> Result<Array1<usize>, FerroError> {
let n_samples = x.nrows();
let n_classes = self.classes.len();
let all_preds: Vec<Array1<usize>> = self
.trees
.iter()
.map(|tree| tree.predict(x))
.collect::<Result<Vec<_>, _>>()?;
let mut predictions = Array1::zeros(n_samples);
for i in 0..n_samples {
let mut votes = vec![0usize; n_classes];
for tree_preds in &all_preds {
let pred = tree_preds[i];
if let Some(class_idx) = self.classes.iter().position(|&c| c == pred) {
votes[class_idx] += 1;
}
}
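// Majority vote: pick the class with the most votes. `max_by_key` returns the
// last maximum, so ties resolve to the largest class label.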
let winner = votes
.iter()
.enumerate()
.max_by_key(|&(_, &count)| count)
.map_or(0, |(idx, _)| idx);
predictions[i] = self.classes[winner];
}
Ok(predictions)
}
}
impl<F: Float + Send + Sync + 'static> HasClasses for FittedVotingClassifier<F> {
fn classes(&self) -> &[usize] {
&self.classes
}
fn n_classes(&self) -> usize {
self.classes.len()
}
}
impl<F: Float + ToPrimitive + FromPrimitive + Send + Sync + 'static> PipelineEstimator<F>
for VotingClassifier<F>
{
fn fit_pipeline(
&self,
x: &Array2<F>,
y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineEstimator<F>>, FerroError> {
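// Labels arrive as floats through the pipeline API; values that cannot be
// converted to `usize` (negative, non-finite, or too large) are mapped to class 0.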
let y_usize: Array1<usize> = y.mapv(|v| v.to_usize().unwrap_or(0));
let fitted = self.fit(x, &y_usize)?;
Ok(Box::new(FittedVotingClassifierPipelineAdapter(fitted)))
}
}
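/// Adapter that exposes a [`FittedVotingClassifier`] through the pipeline API by
/// casting its `usize` class predictions back to `F`.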
struct FittedVotingClassifierPipelineAdapter<F: Float + Send + Sync + 'static>(
FittedVotingClassifier<F>,
);
impl<F: Float + ToPrimitive + FromPrimitive + Send + Sync + 'static> FittedPipelineEstimator<F>
for FittedVotingClassifierPipelineAdapter<F>
{
fn predict_pipeline(&self, x: &Array2<F>) -> Result<Array1<F>, FerroError> {
let preds = self.0.predict(x)?;
Ok(preds.mapv(|v| F::from_usize(v).unwrap_or_else(F::nan)))
}
}
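/// A voting ensemble of decision-tree regressors.
///
/// One `DecisionTreeRegressor` is trained per entry in `max_depths`, all on the
/// full training data; `predict` returns the mean of the per-tree predictions.
///
/// A minimal usage sketch (marked `ignore` because it omits the crate-specific
/// import of `VotingRegressor`):
///
/// ```ignore
/// use ferrolearn_core::traits::{Fit, Predict};
/// use ndarray::array;
///
/// let x = array![[1.0_f64, 2.0], [2.0, 3.0], [5.0, 6.0], [6.0, 7.0]];
/// let y = array![1.0_f64, 2.0, 5.0, 6.0];
/// let fitted = VotingRegressor::<f64>::new().fit(&x, &y).unwrap();
/// let preds = fitted.predict(&x).unwrap();
/// assert_eq!(preds.len(), 4);
/// ```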
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VotingRegressor<F> {
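/// Maximum depth for each tree in the ensemble; one tree is trained per entry,
/// and `None` means the depth is unlimited.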
pub max_depths: Vec<Option<usize>>,
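/// Minimum number of samples required to split an internal node, passed to every tree.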
pub min_samples_split: usize,
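/// Minimum number of samples required at a leaf node, passed to every tree.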
pub min_samples_leaf: usize,
_marker: std::marker::PhantomData<F>,
}
impl<F: Float> VotingRegressor<F> {
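/// Creates a regressor with default settings: depths `[Some(2), Some(4), Some(6), None]`,
/// `min_samples_split = 2`, and `min_samples_leaf = 1`.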
#[must_use]
pub fn new() -> Self {
Self {
max_depths: vec![Some(2), Some(4), Some(6), None],
min_samples_split: 2,
min_samples_leaf: 1,
_marker: std::marker::PhantomData,
}
}
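/// Sets the list of maximum depths; one tree is trained per entry.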
#[must_use]
pub fn with_max_depths(mut self, max_depths: Vec<Option<usize>>) -> Self {
self.max_depths = max_depths;
self
}
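/// Sets the minimum number of samples required to split an internal node.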
#[must_use]
pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
self.min_samples_split = min_samples_split;
self
}
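/// Sets the minimum number of samples required at a leaf node.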
#[must_use]
pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
self.min_samples_leaf = min_samples_leaf;
self
}
}
impl<F: Float> Default for VotingRegressor<F> {
fn default() -> Self {
Self::new()
}
}
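/// A fitted voting ensemble of decision-tree regressors, produced by
/// `VotingRegressor::fit`.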
#[derive(Debug, Clone)]
pub struct FittedVotingRegressor<F> {
trees: Vec<FittedDecisionTreeRegressor<F>>,
}
impl<F: Float + Send + Sync + 'static> FittedVotingRegressor<F> {
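/// Returns the number of fitted trees in the ensemble.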
#[must_use]
pub fn n_estimators(&self) -> usize {
self.trees.len()
}
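/// Returns the coefficient of determination (R²) of the ensemble's predictions
/// on `x` against `y`.
///
/// # Errors
///
/// Returns `FerroError::ShapeMismatch` if `y.len() != x.nrows()`, and propagates
/// any error produced while predicting with the underlying trees.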
pub fn score(&self, x: &Array2<F>, y: &Array1<F>) -> Result<F, FerroError> {
if x.nrows() != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![x.nrows()],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
let preds = self.predict(x)?;
Ok(crate::r2_score(&preds, y))
}
}
impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, Array1<F>> for VotingRegressor<F> {
type Fitted = FittedVotingRegressor<F>;
type Error = FerroError;
fn fit(&self, x: &Array2<F>, y: &Array1<F>) -> Result<FittedVotingRegressor<F>, FerroError> {
let n_samples = x.nrows();
if n_samples != y.len() {
return Err(FerroError::ShapeMismatch {
expected: vec![n_samples],
actual: vec![y.len()],
context: "y length must match number of samples in X".into(),
});
}
if n_samples == 0 {
return Err(FerroError::InsufficientSamples {
required: 1,
actual: 0,
context: "VotingRegressor requires at least one sample".into(),
});
}
if self.max_depths.is_empty() {
return Err(FerroError::InvalidParameter {
name: "max_depths".into(),
reason: "must contain at least one entry".into(),
});
}
let mut trees = Vec::with_capacity(self.max_depths.len());
for &max_depth in &self.max_depths {
let tree = DecisionTreeRegressor::<F>::new()
.with_max_depth(max_depth)
.with_min_samples_split(self.min_samples_split)
.with_min_samples_leaf(self.min_samples_leaf);
let fitted = tree.fit(x, y)?;
trees.push(fitted);
}
Ok(FittedVotingRegressor { trees })
}
}
impl<F: Float + Send + Sync + 'static> Predict<Array2<F>> for FittedVotingRegressor<F> {
type Output = Array1<F>;
type Error = FerroError;
fn predict(&self, x: &Array2<F>) -> Result<Array1<F>, FerroError> {
let n_samples = x.nrows();
let n_trees_f = F::from(self.trees.len()).unwrap();
let all_preds: Vec<Array1<F>> = self
.trees
.iter()
.map(|tree| tree.predict(x))
.collect::<Result<Vec<_>, _>>()?;
let mut predictions = Array1::zeros(n_samples);
for i in 0..n_samples {
let mut sum = F::zero();
for tree_preds in &all_preds {
sum = sum + tree_preds[i];
}
predictions[i] = sum / n_trees_f;
}
Ok(predictions)
}
}
impl<F: Float + Send + Sync + 'static> PipelineEstimator<F> for VotingRegressor<F> {
fn fit_pipeline(
&self,
x: &Array2<F>,
y: &Array1<F>,
) -> Result<Box<dyn FittedPipelineEstimator<F>>, FerroError> {
let fitted = self.fit(x, y)?;
Ok(Box::new(fitted))
}
}
impl<F: Float + Send + Sync + 'static> FittedPipelineEstimator<F> for FittedVotingRegressor<F> {
fn predict_pipeline(&self, x: &Array2<F>) -> Result<Array1<F>, FerroError> {
self.predict(x)
}
}
#[cfg(test)]
mod tests {
use super::*;
use ndarray::array;
fn make_classification_data() -> (Array2<f64>, Array1<usize>) {
let x = Array2::from_shape_vec(
(8, 2),
vec![
1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 9.0,
],
)
.unwrap();
let y = array![0, 0, 0, 0, 1, 1, 1, 1];
(x, y)
}
fn make_regression_data() -> (Array2<f64>, Array1<f64>) {
let x = Array2::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0],
)
.unwrap();
let y = array![1.0, 2.0, 3.0, 5.0, 6.0, 7.0];
(x, y)
}
#[test]
fn test_voting_classifier_default() {
let model = VotingClassifier::<f64>::new();
assert_eq!(model.max_depths.len(), 4);
assert_eq!(model.min_samples_split, 2);
assert_eq!(model.min_samples_leaf, 1);
}
#[test]
fn test_voting_classifier_builder() {
let model = VotingClassifier::<f64>::new()
.with_max_depths(vec![Some(1), Some(3)])
.with_min_samples_split(5)
.with_min_samples_leaf(2)
.with_criterion(ClassificationCriterion::Entropy);
assert_eq!(model.max_depths.len(), 2);
assert_eq!(model.min_samples_split, 5);
assert_eq!(model.min_samples_leaf, 2);
assert_eq!(model.criterion, ClassificationCriterion::Entropy);
}
#[test]
fn test_voting_classifier_fit_predict() {
let (x, y) = make_classification_data();
let model = VotingClassifier::<f64>::new();
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 8);
for i in 0..4 {
assert_eq!(preds[i], 0, "sample {i} should be class 0");
}
for i in 4..8 {
assert_eq!(preds[i], 1, "sample {i} should be class 1");
}
}
#[test]
fn test_voting_classifier_has_classes() {
let (x, y) = make_classification_data();
let model = VotingClassifier::<f64>::new();
let fitted = model.fit(&x, &y).unwrap();
assert_eq!(fitted.classes(), &[0, 1]);
assert_eq!(fitted.n_classes(), 2);
}
#[test]
fn test_voting_classifier_n_estimators() {
let (x, y) = make_classification_data();
let model = VotingClassifier::<f64>::new().with_max_depths(vec![Some(2), Some(4), None]);
let fitted = model.fit(&x, &y).unwrap();
assert_eq!(fitted.n_estimators(), 3);
}
#[test]
fn test_voting_classifier_empty_data_error() {
let x = Array2::<f64>::zeros((0, 2));
let y = Array1::<usize>::zeros(0);
let model = VotingClassifier::<f64>::new();
let result = model.fit(&x, &y);
assert!(result.is_err());
}
#[test]
fn test_voting_classifier_shape_mismatch_error() {
let x = Array2::<f64>::zeros((5, 2));
let y = Array1::<usize>::zeros(3);
let model = VotingClassifier::<f64>::new();
let result = model.fit(&x, &y);
assert!(result.is_err());
}
#[test]
fn test_voting_classifier_empty_depths_error() {
let (x, y) = make_classification_data();
let model = VotingClassifier::<f64>::new().with_max_depths(vec![]);
let result = model.fit(&x, &y);
assert!(result.is_err());
}
#[test]
fn test_voting_classifier_multiclass() {
let x = Array2::from_shape_vec(
(9, 2),
vec![
1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 4.0, 4.0, 5.0, 4.0, 4.0, 5.0, 8.0, 8.0, 9.0, 8.0,
8.0, 9.0,
],
)
.unwrap();
let y = array![0, 0, 0, 1, 1, 1, 2, 2, 2];
let model = VotingClassifier::<f64>::new();
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 9);
assert_eq!(fitted.n_classes(), 3);
}
#[test]
fn test_voting_regressor_default() {
let model = VotingRegressor::<f64>::new();
assert_eq!(model.max_depths.len(), 4);
assert_eq!(model.min_samples_split, 2);
assert_eq!(model.min_samples_leaf, 1);
}
#[test]
fn test_voting_regressor_builder() {
let model = VotingRegressor::<f64>::new()
.with_max_depths(vec![Some(1), Some(5)])
.with_min_samples_split(3)
.with_min_samples_leaf(2);
assert_eq!(model.max_depths.len(), 2);
assert_eq!(model.min_samples_split, 3);
assert_eq!(model.min_samples_leaf, 2);
}
#[test]
fn test_voting_regressor_fit_predict() {
let (x, y) = make_regression_data();
let model = VotingRegressor::<f64>::new();
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 6);
for i in 0..6 {
let err = (preds[i] - y[i]).abs();
assert!(
err < 3.0,
"prediction {:.2} should be close to target {:.2}",
preds[i],
y[i]
);
}
}
#[test]
fn test_voting_regressor_n_estimators() {
let (x, y) = make_regression_data();
let model = VotingRegressor::<f64>::new().with_max_depths(vec![Some(2), None]);
let fitted = model.fit(&x, &y).unwrap();
assert_eq!(fitted.n_estimators(), 2);
}
#[test]
fn test_voting_regressor_empty_data_error() {
let x = Array2::<f64>::zeros((0, 2));
let y = Array1::<f64>::zeros(0);
let model = VotingRegressor::<f64>::new();
let result = model.fit(&x, &y);
assert!(result.is_err());
}
#[test]
fn test_voting_regressor_shape_mismatch_error() {
let x = Array2::<f64>::zeros((5, 2));
let y = Array1::<f64>::zeros(3);
let model = VotingRegressor::<f64>::new();
let result = model.fit(&x, &y);
assert!(result.is_err());
}
#[test]
fn test_voting_regressor_empty_depths_error() {
let (x, y) = make_regression_data();
let model = VotingRegressor::<f64>::new().with_max_depths(vec![]);
let result = model.fit(&x, &y);
assert!(result.is_err());
}
#[test]
fn test_voting_regressor_averaging() {
let (x, y) = make_regression_data();
let model = VotingRegressor::<f64>::new().with_max_depths(vec![None]);
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
for i in 0..6 {
assert!(
(preds[i] - y[i]).abs() < 1e-10,
"single unlimited tree should overfit training data"
);
}
}
#[test]
fn test_voting_classifier_f32() {
let x = Array2::<f32>::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0],
)
.unwrap();
let y = array![0, 0, 0, 1, 1, 1];
let model = VotingClassifier::<f32>::new();
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 6);
}
#[test]
fn test_voting_regressor_f32() {
let x = Array2::<f32>::from_shape_vec(
(6, 2),
vec![1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 6.0, 7.0, 7.0, 8.0],
)
.unwrap();
let y = array![1.0_f32, 2.0, 3.0, 5.0, 6.0, 7.0];
let model = VotingRegressor::<f32>::new();
let fitted = model.fit(&x, &y).unwrap();
let preds = fitted.predict(&x).unwrap();
assert_eq!(preds.len(), 6);
}
}