use ferrolearn_core::error::FerroError;
use ferrolearn_core::pipeline::{FittedPipelineTransformer, PipelineTransformer};
use ferrolearn_core::traits::{Fit, Transform};
use ndarray::{Array1, Array2};
/// Describes which input columns a `ColumnTransformer` step operates on.
#[derive(Debug, Clone)]
pub enum ColumnSelector {
/// Explicit zero-based column positions. Each index must be smaller than
/// the number of input columns when the selector is resolved; the given
/// order is kept as-is.
Indices(Vec<usize>),
}
impl ColumnSelector {
    /// Turns the selector into a concrete list of column indices for an
    /// input that has `n_features` columns.
    ///
    /// The indices are returned in the order they were specified.
    ///
    /// # Errors
    ///
    /// Returns [`FerroError::InvalidParameter`] naming the first index that
    /// is `>= n_features`.
    fn resolve(&self, n_features: usize) -> Result<Vec<usize>, FerroError> {
        match self {
            Self::Indices(indices) => {
                // Only the first offending index is reported.
                let first_out_of_range = indices.iter().copied().find(|&idx| idx >= n_features);
                match first_out_of_range {
                    Some(idx) => Err(FerroError::InvalidParameter {
                        name: "ColumnSelector::Indices".into(),
                        reason: format!(
                            "column index {idx} is out of range for input with {n_features} features"
                        ),
                    }),
                    None => Ok(indices.clone()),
                }
            }
        }
    }
}
/// Policy for input columns that are not selected by any transformer step.
#[derive(Debug, Clone)]
pub enum Remainder {
/// Leave unselected columns out of the transformed output.
Drop,
/// Append unselected columns, unmodified, after the transformer outputs.
Passthrough,
}
/// Copies the columns of `x` listed in `indices` into a new matrix.
///
/// Output column `k` is a copy of input column `indices[k]`, so the caller's
/// ordering (and any repeated index) is preserved. An empty `indices` slice
/// produces a matrix with `x.nrows()` rows and zero columns.
fn select_columns(x: &Array2<f64>, indices: &[usize]) -> Array2<f64> {
    // A zero-width selection needs no special case: the loop simply never runs.
    let mut selected = Array2::zeros((x.nrows(), indices.len()));
    for (dst, src) in indices.iter().copied().enumerate() {
        selected.column_mut(dst).assign(&x.column(src));
    }
    selected
}
/// Concatenates `matrices` side by side (along the column axis).
///
/// An empty slice yields a `0 x 0` matrix. Otherwise the result has as many
/// rows as the first matrix and the sum of all column counts; zero-column
/// parts contribute nothing.
///
/// # Errors
///
/// Returns [`FerroError::ShapeMismatch`] for the first matrix whose row count
/// differs from that of the first matrix. The check runs before anything is
/// allocated and also covers the case where every part has zero columns.
fn hstack(matrices: &[Array2<f64>]) -> Result<Array2<f64>, FerroError> {
    if matrices.is_empty() {
        return Ok(Array2::zeros((0, 0)));
    }
    let nrows = matrices[0].nrows();

    // Validate up front so a mismatch is never masked by the zero-width
    // shortcut and no output buffer is allocated for invalid input.
    if let Some(bad) = matrices.iter().find(|m| m.nrows() != nrows) {
        return Err(FerroError::ShapeMismatch {
            expected: vec![nrows, bad.ncols()],
            actual: vec![bad.nrows(), bad.ncols()],
            context: "ColumnTransformer hstack: row count mismatch".into(),
        });
    }

    let total_cols: usize = matrices.iter().map(ndarray::ArrayBase::ncols).sum();
    if total_cols == 0 {
        return Ok(Array2::zeros((nrows, 0)));
    }

    let mut out = Array2::zeros((nrows, total_cols));
    let mut col_offset = 0;
    for m in matrices {
        let end = col_offset + m.ncols();
        // Skip zero-width parts; there is nothing to copy for them.
        if m.ncols() > 0 {
            out.slice_mut(ndarray::s![.., col_offset..end]).assign(m);
        }
        col_offset = end;
    }
    Ok(out)
}
/// Unfitted transformer that applies separate sub-transformers to separate
/// column subsets and concatenates their outputs column-wise.
pub struct ColumnTransformer {
// One `(name, transformer, selector)` entry per step, kept in the order given.
transformers: Vec<(String, Box<dyn PipelineTransformer<f64>>, ColumnSelector)>,
// What to do with columns that no step selects.
remainder: Remainder,
}
impl ColumnTransformer {
#[must_use]
pub fn new(
transformers: Vec<(String, Box<dyn PipelineTransformer<f64>>, ColumnSelector)>,
remainder: Remainder,
) -> Self {
Self {
transformers,
remainder,
}
}
}
impl Fit<Array2<f64>, ()> for ColumnTransformer {
    type Fitted = FittedColumnTransformer;
    type Error = FerroError;

    /// Fits every sub-transformer on its selected columns of `x`.
    ///
    /// All selectors are resolved (and therefore validated) before any
    /// sub-transformer is fitted. Columns selected by no step are recorded,
    /// in ascending order, as the remainder. The `_y` argument is ignored;
    /// sub-transformers receive an all-zero target of length `x.nrows()`.
    ///
    /// # Errors
    ///
    /// Returns [`FerroError::InvalidParameter`] tagged with the step name if
    /// a selector refers to a column outside `x`, and propagates any error
    /// raised while fitting a sub-transformer.
    fn fit(&self, x: &Array2<f64>, _y: &()) -> Result<FittedColumnTransformer, FerroError> {
        let n_features = x.ncols();

        // Resolve all selectors first so an invalid step fails before any fitting.
        let resolved: Vec<Vec<usize>> = self
            .transformers
            .iter()
            .map(|(name, _, selector)| {
                selector
                    .resolve(n_features)
                    .map_err(|err| FerroError::InvalidParameter {
                        name: format!("ColumnTransformer step '{name}'"),
                        reason: err.to_string(),
                    })
            })
            .collect::<Result<Vec<_>, FerroError>>()?;

        // Mark every column claimed by at least one step; indices are already
        // known to be in range, so direct indexing cannot panic.
        let mut is_covered = vec![false; n_features];
        for &col in resolved.iter().flatten() {
            is_covered[col] = true;
        }
        let remainder_indices: Vec<usize> = is_covered
            .iter()
            .enumerate()
            .filter_map(|(col, &covered)| if covered { None } else { Some(col) })
            .collect();

        // Sub-transformers need a target argument; they get a zero vector.
        let placeholder_y = Array1::<f64>::zeros(x.nrows());
        let fitted_transformers = self
            .transformers
            .iter()
            .zip(resolved)
            .map(
                |((name, transformer, _), indices)| -> Result<FittedSubTransformer, FerroError> {
                    let sub_x = select_columns(x, &indices);
                    let fitted = transformer.fit_pipeline(&sub_x, &placeholder_y)?;
                    Ok((name.clone(), fitted, indices))
                },
            )
            .collect::<Result<Vec<FittedSubTransformer>, FerroError>>()?;

        Ok(FittedColumnTransformer {
            fitted_transformers,
            remainder: self.remainder.clone(),
            remainder_indices,
            n_features_in: n_features,
        })
    }
}
impl PipelineTransformer<f64> for ColumnTransformer {
    /// Pipeline entry point: fits via [`Fit::fit`] and boxes the result.
    ///
    /// The target vector `_y` is not used.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `fit`.
    fn fit_pipeline(
        &self,
        x: &Array2<f64>,
        _y: &Array1<f64>,
    ) -> Result<Box<dyn FittedPipelineTransformer<f64>>, FerroError> {
        let boxed: Box<dyn FittedPipelineTransformer<f64>> = Box::new(self.fit(x, &())?);
        Ok(boxed)
    }
}
// A fitted step: its name, the fitted transformer, and the resolved column
// indices it was fitted on.
type FittedSubTransformer = (String, Box<dyn FittedPipelineTransformer<f64>>, Vec<usize>);
/// Result of fitting a `ColumnTransformer`: the fitted steps plus the
/// bookkeeping needed to transform new data.
pub struct FittedColumnTransformer {
// Fitted steps, in the order they were supplied.
fitted_transformers: Vec<FittedSubTransformer>,
// Policy applied to columns in `remainder_indices` during transform.
remainder: Remainder,
// Columns selected by no step, in ascending order.
remainder_indices: Vec<usize>,
// Column count of the matrix seen at fit time; transform inputs must match.
n_features_in: usize,
}
impl FittedColumnTransformer {
    /// Number of columns in the matrix this transformer was fitted on.
    #[must_use]
    pub fn n_features_in(&self) -> usize {
        self.n_features_in
    }

    /// Names of the fitted steps, in the order they were supplied.
    #[must_use]
    pub fn transformer_names(&self) -> Vec<&str> {
        let mut names = Vec::with_capacity(self.fitted_transformers.len());
        for (name, _, _) in &self.fitted_transformers {
            names.push(name.as_str());
        }
        names
    }

    /// Indices of the input columns that no step selected, in ascending order.
    #[must_use]
    pub fn remainder_indices(&self) -> &[usize] {
        self.remainder_indices.as_slice()
    }
}
impl Transform<Array2<f64>> for FittedColumnTransformer {
    type Output = Array2<f64>;
    type Error = FerroError;

    /// Applies each fitted step to its columns of `x` and concatenates the
    /// results column-wise, in step order.
    ///
    /// With [`Remainder::Passthrough`], the unselected columns are appended
    /// unchanged after the step outputs. When no part is produced at all
    /// (no steps and nothing passed through), the result is a `0 x 0` matrix.
    ///
    /// # Errors
    ///
    /// Returns [`FerroError::ShapeMismatch`] if `x` does not have the column
    /// count seen at fit time, and propagates errors from the sub-transformers
    /// or from the final concatenation.
    fn transform(&self, x: &Array2<f64>) -> Result<Array2<f64>, FerroError> {
        let expected_cols = self.n_features_in;
        if x.ncols() != expected_cols {
            return Err(FerroError::ShapeMismatch {
                expected: vec![x.nrows(), expected_cols],
                actual: vec![x.nrows(), x.ncols()],
                context: "FittedColumnTransformer::transform".into(),
            });
        }

        // One slot per step plus one for a possible passthrough block.
        let mut parts: Vec<Array2<f64>> = Vec::with_capacity(self.fitted_transformers.len() + 1);
        for (_, fitted, indices) in &self.fitted_transformers {
            let step_input = select_columns(x, indices);
            parts.push(fitted.transform_pipeline(&step_input)?);
        }

        let append_remainder = match self.remainder {
            Remainder::Passthrough => !self.remainder_indices.is_empty(),
            Remainder::Drop => false,
        };
        if append_remainder {
            parts.push(select_columns(x, &self.remainder_indices));
        }

        hstack(&parts)
    }
}
impl FittedPipelineTransformer<f64> for FittedColumnTransformer {
fn transform_pipeline(&self, x: &Array2<f64>) -> Result<Array2<f64>, FerroError> {
self.transform(x)
}
}
/// Convenience constructor that names the steps automatically.
///
/// The step at position `i` (zero-based) is named `transformer-{i}`; the
/// steps and `remainder` are then handed to [`ColumnTransformer::new`].
#[must_use]
pub fn make_column_transformer(
    transformers: Vec<(Box<dyn PipelineTransformer<f64>>, ColumnSelector)>,
    remainder: Remainder,
) -> ColumnTransformer {
    let mut named: Vec<(String, Box<dyn PipelineTransformer<f64>>, ColumnSelector)> =
        Vec::with_capacity(transformers.len());
    for (i, (transformer, selector)) in transformers.into_iter().enumerate() {
        named.push((format!("transformer-{i}"), transformer, selector));
    }
    ColumnTransformer::new(named, remainder)
}
// Unit tests for `ColumnTransformer`, `FittedColumnTransformer` and
// `make_column_transformer`, using the crate's `StandardScaler` and
// `MinMaxScaler` as sub-transformers.
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use ferrolearn_core::pipeline::{Pipeline, PipelineEstimator};
use ndarray::{Array2, array};
use crate::{MinMaxScaler, StandardScaler};
// Shared 4x4 fixture matrix used by most tests.
fn make_x() -> Array2<f64> {
array![
[1.0, 2.0, 10.0, 20.0],
[2.0, 4.0, 20.0, 40.0],
[3.0, 6.0, 30.0, 60.0],
[4.0, 8.0, 40.0, 80.0],
]
}
// Two steps that together cover all four columns keep the 4x4 shape.
#[test]
fn test_basic_two_transformers_drop_remainder() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![
(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
),
(
"mm".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![2, 3]),
),
],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 4);
}
// With `Remainder::Drop`, the two unselected columns are absent from the output.
#[test]
fn test_remainder_drop() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 2, "uncovered cols should be dropped");
}
// With `Remainder::Passthrough`, unselected columns 2 and 3 are appended
// unchanged after the transformed ones.
#[test]
fn test_remainder_passthrough() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Passthrough,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 4, "passthrough: 2 transformed + 2 remainder");
for i in 0..4 {
assert_abs_diff_eq!(out[[i, 2]], x[[i, 2]], epsilon = 1e-12);
assert_abs_diff_eq!(out[[i, 3]], x[[i, 3]], epsilon = 1e-12);
}
}
// Index 99 does not exist in a 4-column input, so fitting must fail.
#[test]
fn test_invalid_column_index_out_of_range() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 99]), )],
Remainder::Drop,
);
let result = ct.fit(&x, &());
assert!(result.is_err(), "expected error for out-of-range index");
}
// No steps and `Remainder::Drop` produce no parts at all, so `hstack`
// returns its empty-input result; note the row count is 0, not the input's 4.
#[test]
fn test_empty_transformer_list_drop() {
let x = make_x();
let ct = ColumnTransformer::new(vec![], Remainder::Drop);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 0, "hstack of nothing with no passthrough");
}
// No steps and `Remainder::Passthrough` reproduce the input exactly.
#[test]
fn test_empty_transformer_list_passthrough() {
let x = make_x(); let ct = ColumnTransformer::new(vec![], Remainder::Passthrough);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 4);
for i in 0..4 {
for j in 0..4 {
assert_abs_diff_eq!(out[[i, j]], x[[i, j]], epsilon = 1e-12);
}
}
}
// Column 0 is selected by both steps; each step emits its own copy, so the
// output has 2 + 2 columns.
#[test]
fn test_overlapping_column_selections() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![
(
"std1".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
),
(
"mm1".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 2]), ),
],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 4);
}
// A single min-max step maps each column's first row to 0 and last row to 1.
#[test]
fn test_single_transformer() {
let x = array![[1.0_f64, 2.0], [3.0, 4.0], [5.0, 6.0]];
let ct = ColumnTransformer::new(
vec![(
"mm".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 3);
assert_eq!(out.ncols(), 2);
assert_abs_diff_eq!(out[[0, 0]], 0.0, epsilon = 1e-10);
assert_abs_diff_eq!(out[[2, 0]], 1.0, epsilon = 1e-10);
assert_abs_diff_eq!(out[[0, 1]], 0.0, epsilon = 1e-10);
assert_abs_diff_eq!(out[[2, 1]], 1.0, epsilon = 1e-10);
}
// `make_column_transformer` names steps `transformer-0`, `transformer-1`, ...
#[test]
fn test_make_column_transformer_auto_names() {
let x = make_x();
let ct = make_column_transformer(
vec![
(
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
),
(
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![2, 3]),
),
],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
assert_eq!(
fitted.transformer_names(),
vec!["transformer-0", "transformer-1"]
);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 4);
}
// The transformer can be used as a `Pipeline` step ahead of an estimator;
// a minimal row-sum estimator is defined locally for this purpose.
#[test]
fn test_pipeline_integration() {
let x = make_x();
let y = Array1::<f64>::zeros(4);
let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1, 2, 3]),
)],
Remainder::Drop,
);
// Estimator stub whose fitted form predicts the sum of each row.
struct SumEstimator;
impl PipelineEstimator<f64> for SumEstimator {
fn fit_pipeline(
&self,
_x: &Array2<f64>,
_y: &Array1<f64>,
) -> Result<Box<dyn ferrolearn_core::pipeline::FittedPipelineEstimator<f64>>, FerroError>
{
Ok(Box::new(FittedSum))
}
}
struct FittedSum;
impl ferrolearn_core::pipeline::FittedPipelineEstimator<f64> for FittedSum {
fn predict_pipeline(&self, x: &Array2<f64>) -> Result<Array1<f64>, FerroError> {
let sums: Vec<f64> = x.rows().into_iter().map(|r| r.sum()).collect();
Ok(Array1::from_vec(sums))
}
}
let pipeline = Pipeline::new()
.transform_step("ct", Box::new(ct))
.estimator_step("sum", Box::new(SumEstimator));
use ferrolearn_core::Fit as _;
let fitted_pipeline = pipeline.fit(&x, &y).unwrap();
use ferrolearn_core::Predict as _;
let preds = fitted_pipeline.predict(&x).unwrap();
assert_eq!(preds.len(), 4);
}
// Steps covering columns {0} and {1, 2} of a 2x3 input yield a 2x3 output.
#[test]
fn test_output_shape_all_selected_drop() {
let x = array![[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
let ct = ColumnTransformer::new(
vec![
(
"s".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0]),
),
(
"m".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![1, 2]),
),
],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[2, 3]);
}
// Two transformed columns plus three passed-through columns keep the 3x5 shape.
#[test]
fn test_output_shape_partial_passthrough() {
let x = Array2::<f64>::from_shape_vec((3, 5), (1..=15).map(f64::from).collect()).unwrap();
let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Passthrough,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[3, 5]);
}
// `n_features_in` reports the fit-time column count, not the selected count.
#[test]
fn test_n_features_in() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
assert_eq!(fitted.n_features_in(), 4);
}
// Transforming a 2-column matrix after fitting on 4 columns must fail.
#[test]
fn test_shape_mismatch_on_transform() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let x_bad = array![[1.0_f64, 2.0], [3.0, 4.0]];
let result = fitted.transform(&x_bad);
assert!(result.is_err(), "expected shape mismatch error");
}
// Selecting columns 0 and 2 leaves columns 1 and 3 as the remainder.
#[test]
fn test_remainder_indices_accessor() {
let x = make_x(); let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 2]),
)],
Remainder::Passthrough,
);
let fitted = ct.fit(&x, &()).unwrap();
assert_eq!(fitted.remainder_indices(), &[1, 3]);
}
// Standard-scaled output columns have (numerically) zero mean.
#[test]
fn test_standard_scaler_zero_mean_in_output() {
let x = array![[1.0_f64, 100.0, 0.5], [2.0, 200.0, 1.5], [3.0, 300.0, 2.5],];
let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
for j in 0..2 {
let mean: f64 = out.column(j).iter().sum::<f64>() / 3.0;
assert_abs_diff_eq!(mean, 0.0, epsilon = 1e-10);
}
}
// Min-max-scaled output columns span exactly [0, 1] on the training data.
#[test]
fn test_min_max_values_in_range() {
let x = make_x();
let ct = ColumnTransformer::new(
vec![(
"mm".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1, 2, 3]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
for j in 0..4 {
let col_min = out.column(j).iter().copied().fold(f64::INFINITY, f64::min);
let col_max = out
.column(j)
.iter()
.copied()
.fold(f64::NEG_INFINITY, f64::max);
assert_abs_diff_eq!(col_min, 0.0, epsilon = 1e-10);
assert_abs_diff_eq!(col_max, 1.0, epsilon = 1e-10);
}
}
// Exercises the boxed `fit_pipeline` / `transform_pipeline` trait-object path.
#[test]
fn test_pipeline_transformer_interface() {
let x = make_x();
let y = Array1::<f64>::zeros(4);
let ct = ColumnTransformer::new(
vec![(
"std".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Passthrough,
);
let fitted_box = ct.fit_pipeline(&x, &y).unwrap();
let out = fitted_box.transform_pipeline(&x).unwrap();
assert_eq!(out.nrows(), 4);
assert_eq!(out.ncols(), 4);
}
// Passed-through columns land after the transformed column with their
// original values.
#[test]
fn test_passthrough_values_are_exact() {
let x = array![[10.0_f64, 20.0, 30.0], [40.0, 50.0, 60.0],];
let ct = ColumnTransformer::new(
vec![(
"mm".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![0]),
)],
Remainder::Passthrough,
);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_abs_diff_eq!(out[[0, 1]], 20.0, epsilon = 1e-12);
assert_abs_diff_eq!(out[[1, 1]], 50.0, epsilon = 1e-12);
assert_abs_diff_eq!(out[[0, 2]], 30.0, epsilon = 1e-12);
assert_abs_diff_eq!(out[[1, 2]], 60.0, epsilon = 1e-12);
}
// Explicitly supplied step names are reported back in order.
#[test]
fn test_transformer_names_explicit() {
let x = make_x();
let ct = ColumnTransformer::new(
vec![
(
"alpha".into(),
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0]),
),
(
"beta".into(),
Box::new(MinMaxScaler::<f64>::new()),
ColumnSelector::Indices(vec![1]),
),
],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
assert_eq!(fitted.transformer_names(), vec!["alpha", "beta"]);
}
// A single auto-named step is called `transformer-0`.
#[test]
fn test_make_column_transformer_single() {
let x = array![[1.0_f64, 2.0], [3.0, 4.0]];
let ct = make_column_transformer(
vec![(
Box::new(StandardScaler::<f64>::new()),
ColumnSelector::Indices(vec![0, 1]),
)],
Remainder::Drop,
);
let fitted = ct.fit(&x, &()).unwrap();
assert_eq!(fitted.transformer_names(), vec!["transformer-0"]);
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[2, 2]);
}
// With every column in the remainder, passthrough returns the input unchanged.
#[test]
fn test_all_remainder_passthrough_unchanged() {
let x = array![[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
let ct = ColumnTransformer::new(vec![], Remainder::Passthrough);
let fitted = ct.fit(&x, &()).unwrap();
let out = fitted.transform(&x).unwrap();
assert_eq!(out.shape(), &[2, 3]);
for i in 0..2 {
for j in 0..3 {
assert_abs_diff_eq!(out[[i, j]], x[[i, j]], epsilon = 1e-12);
}
}
}
}