ferrolearn_preprocess/
iterative_imputer.rs

1//! Iterative imputer (MICE): fill missing values by modeling each feature as a
2//! regression on all the other features, in a round-robin fashion.
3//!
4//! [`IterativeImputer`] mirrors scikit-learn's `IterativeImputer`
5//! (`sklearn/impute/_iterative.py:51`, EXPERIMENTAL). For each feature with
6//! missing values — visited in `imputation_order` (default `Ascending`,
7//! fewest-missing first) — it fits the default per-feature estimator
8//! [`ferrolearn_linear::BayesianRidge`] on the rows where the feature is
9//! OBSERVED (predictors = the other features' current filled values), predicts
10//! the missing rows, and clips the predictions to `[min_value, max_value]`. The
11//! round-robin repeats for `max_iter` rounds or until the inf-norm of the change
12//! falls below `tol * max|X_observed|`.
13//!
14//! # Initial Imputation
15//!
16//! Before the iterative process begins, missing values are filled using a simple
17//! strategy (mean by default). This initial imputation provides the starting
18//! point for the regression models (sklearn `_initial_imputation`,
19//! `_iterative.py:743`).
20//!
21//! ## REQ status
22//!
23//! Translation target: scikit-learn 1.5.2 `class IterativeImputer`
24//! (`sklearn/impute/_iterative.py:51`, EXPERIMENTAL). Tracking: #1403. Each REQ
25//! is BINARY — SHIPPED (impl + non-test consumer + tests + green verification)
26//! or NOT-STARTED (with a concrete open blocker). The round-robin now routes
27//! through the real `BayesianRidge` default estimator with ascending order,
28//! inf-norm convergence, and min/max clip, so the iterated imputed VALUES match
29//! sklearn (~1e-6).
30//!
31//! | REQ | Scope | Status | Evidence / Blocker |
32//! |-----|-------|--------|--------------------|
33//! | REQ-1 | Round-robin STRUCTURE + initial fill (mean/median == `SimpleImputer`) + non-missing values preserved + output shape | SHIPPED | [`IterativeImputer`] `fit`/`transform`; initial-fill VALUES match `SimpleImputer` (oracle); non-missing-preserved + shape + no-NaN tests in `tests/divergence_iterative_imputer.rs`. Consumer: re-export `lib.rs:150` |
34//! | REQ-2 | Determinism (no RNG) + termination (≤ max_iter, tol break) + `n_iter` accessor | SHIPPED | [`FittedIterativeImputer::n_iter`]; determinism + bounded-termination tests |
35//! | REQ-3 | Error/parameter contracts (n_samples==0, transform ncols, unfitted) + `max_iter==0` → initial fill | SHIPPED | [`IterativeImputer::fit`] accepts `max_iter==0` (returns initial fill, `n_iter=0`, matches sklearn `_iterative.py:750-752`); divergence + error tests |
36//! | REQ-4 | Exact iterated imputed-VALUE parity (BayesianRidge default + ascending order + inf-norm tol + min/max clip) | SHIPPED | round-robin routes through [`ferrolearn_linear::BayesianRidge`] (`fn impute_one_feature`), ascending order, inf-norm convergence, min/max clip — matches sklearn `_iterative.py:454,732-735` (~1e-6). Verified by `divergence_round_robin_values_small`/`_three_features`/`min_max_clip_bound` (`tests/divergence_iterative_imputer_values.rs`, live sklearn oracle). Closes #1405 |
37//! | REQ-5 | `estimator` param (pluggable; default BayesianRidge) + `sample_posterior` | SHIPPED (default only) | default per-feature estimator is `BayesianRidge` (`fn impute_one_feature`, sklearn `_iterative.py:74,732-735`); pluggable `estimator` + `sample_posterior` posterior sampling stay NOT-STARTED — blocker #1406 |
38//! | REQ-6 | `imputation_order` param (ascending/descending/roman/arabic/random) | SHIPPED (ascending/descending/roman/arabic) | [`ImputationOrder`] enum + `imputation_order` field; default `Ascending` orders features by ascending missing-count (sklearn `_get_ordered_idx:533-535`). `Random` (RNG) stays NOT-STARTED — blocker #1407 |
39//! | REQ-7 | `min_value`/`max_value` clipping | SHIPPED | `min_value`/`max_value` fields (default ±inf); per-iteration clip in `fn impute_one_feature` (sklearn `_iterative.py:455-457`); array-like per-feature limits stay scalar-only — blocker #1408 |
40//! | REQ-8 | `n_nearest_features` + abs-correlation feature selection | NOT-STARTED | uses all other features; sklearn `:468-502` — blocker #1409 |
41//! | REQ-9 | `initial_strategy` most_frequent/constant + `fill_value` + non-NaN `missing_values` | NOT-STARTED | Mean/Median only, NaN-only; sklearn `:112,183,743` — blocker #1410 |
42//! | REQ-10 | `random_state` + `skip_complete` + `add_indicator` + `keep_empty_features` + `verbose` | NOT-STARTED | sklearn `:305-343` — blocker #1411 |
43//! | REQ-11 | inf-norm convergence (`tol·max\|X\|`) | SHIPPED | `fn fit`/`fn transform` converge on `max\|Xt-Xt_prev\| < tol·max\|X_observed\|` (sklearn `_iterative.py:780,811,818`); `ConvergenceWarning` emission stays NOT-STARTED — blocker #1412 |
44//! | REQ-12 | `get_feature_names_out` + `imputation_sequence_`/`n_iter_`/`n_features_in_`/`random_state_` attrs | NOT-STARTED | sklearn `:739` — blocker #1413 |
45//! | REQ-13 | PyO3 binding | NOT-STARTED | no `ferrolearn-python` registration — blocker #1414 |
46//! | REQ-14 | ferray substrate | NOT-STARTED | dense `Array2` + `num_traits::Float` only — blocker #1415 |
47
48use ferray::linalg::LinalgFloat;
49use ferrolearn_core::error::FerroError;
50use ferrolearn_core::traits::{Fit, FitTransform, Predict, Transform};
51use ferrolearn_linear::{BayesianRidge, FittedBayesianRidge};
52use ndarray::{Array1, Array2, ScalarOperand};
53use num_traits::{Float, FromPrimitive};
54
55/// Float bound under which the round-robin can route through
56/// [`ferrolearn_linear::BayesianRidge`]: the exact bound `BayesianRidge::fit` /
57/// `FittedBayesianRidge::predict` require (`LinalgFloat + ScalarOperand +
58/// FromPrimitive`, sklearn default estimator `_iterative.py:732-735`). A blanket
59/// impl provides it for every such type (`f32`/`f64`), so the rest of the file
60/// names a single tidy bound. The `ferray::linalg::LinalgFloat` bound enters via
61/// `BayesianRidge` (which already sits on the ferray SVD substrate); the
62/// `IterativeImputer`'s OWN compute remains `ndarray`/`num_traits` (REQ-14
63/// substrate migration stays NOT-STARTED — blocker #1415).
64pub trait ImputerFloat:
65    LinalgFloat + ScalarOperand + FromPrimitive + Send + Sync + 'static
66{
67}
68
69impl<F> ImputerFloat for F where
70    F: LinalgFloat + ScalarOperand + FromPrimitive + Send + Sync + 'static
71{
72}
73
74// ---------------------------------------------------------------------------
75// InitialStrategy
76// ---------------------------------------------------------------------------
77
78/// Strategy for the initial imputation before iterative refinement.
79#[derive(Debug, Clone, Copy, PartialEq, Eq)]
80pub enum InitialStrategy {
81    /// Replace NaN with the column mean.
82    Mean,
83    /// Replace NaN with the column median.
84    Median,
85}
86
87// ---------------------------------------------------------------------------
88// ImputationOrder
89// ---------------------------------------------------------------------------
90
91/// Order in which features are imputed each round, mirroring scikit-learn's
92/// `imputation_order` (`sklearn/impute/_iterative.py:126-134`,
93/// `_get_ordered_idx:504-542`). The `'random'` variant (which requires a seeded
94/// RNG, `random_state`) is not yet modeled — blocker #1407.
95#[derive(Debug, Clone, Copy, PartialEq, Eq)]
96pub enum ImputationOrder {
97    /// From features with fewest missing values to most (sklearn default,
98    /// `argsort(frac_of_missing, kind="mergesort")`, ties keep column order).
99    Ascending,
100    /// From features with most missing values to fewest (reversed ascending).
101    Descending,
102    /// Left to right (column-index order, sklearn `'roman'`).
103    Roman,
104    /// Right to left (reversed column-index order, sklearn `'arabic'`).
105    Arabic,
106}
107
108// ---------------------------------------------------------------------------
109// IterativeImputer (unfitted)
110// ---------------------------------------------------------------------------
111
112/// An unfitted iterative imputer.
113///
114/// Calling [`Fit::fit`] learns the per-feature [`BayesianRidge`] models and
115/// returns a [`FittedIterativeImputer`] that can impute missing values in new
116/// data by deterministically replaying the learned imputation sequence.
117///
118/// # Parameters
119///
120/// - `max_iter` — maximum number of imputation rounds (default 10).
121/// - `tol` — convergence tolerance on the inf-norm of the change scaled by
122///   `max|X_observed|` (default 1e-3).
123/// - `initial_strategy` — strategy for the initial fill (default `Mean`).
124/// - `imputation_order` — feature visit order (default `Ascending`).
125/// - `min_value` / `max_value` — imputed-value clip bounds (default ±∞).
126///
127/// # Examples
128///
129/// ```
130/// use ferrolearn_preprocess::iterative_imputer::{IterativeImputer, InitialStrategy};
131/// use ferrolearn_core::traits::{Fit, Transform};
132/// use ndarray::array;
133///
134/// let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
135/// let x = array![[1.0, 2.0], [3.0, f64::NAN], [f64::NAN, 6.0]];
136/// let fitted = imputer.fit(&x, &()).unwrap();
137/// let out = fitted.transform(&x).unwrap();
138/// assert!(!out[[1, 1]].is_nan());
139/// assert!(!out[[2, 0]].is_nan());
140/// ```
141#[must_use]
142#[derive(Debug, Clone)]
143pub struct IterativeImputer<F> {
144    /// Maximum number of imputation rounds.
145    max_iter: usize,
146    /// Convergence tolerance.
147    tol: F,
148    /// Initial imputation strategy.
149    initial_strategy: InitialStrategy,
150    /// Order in which features are imputed (default `Ascending`).
151    imputation_order: ImputationOrder,
152    /// Minimum imputed value (default `-inf`, sklearn `_iterative.py:318`).
153    min_value: F,
154    /// Maximum imputed value (default `+inf`, sklearn `_iterative.py:319`).
155    max_value: F,
156}
157
158impl<F: Float + Send + Sync + 'static> IterativeImputer<F> {
159    /// Create a new `IterativeImputer` with the given core parameters and the
160    /// sklearn defaults for `imputation_order` (`Ascending`) and the
161    /// `min_value`/`max_value` clip (`-inf`/`+inf`).
162    pub fn new(max_iter: usize, tol: F, initial_strategy: InitialStrategy) -> Self {
163        Self {
164            max_iter,
165            tol,
166            initial_strategy,
167            imputation_order: ImputationOrder::Ascending,
168            min_value: F::neg_infinity(),
169            max_value: F::infinity(),
170        }
171    }
172
173    /// Set the feature imputation order (default `Ascending`), mirroring
174    /// sklearn's `imputation_order` (`_iterative.py:316`).
175    pub fn with_imputation_order(mut self, order: ImputationOrder) -> Self {
176        self.imputation_order = order;
177        self
178    }
179
180    /// Set the minimum imputed value (the clip lower bound), mirroring sklearn's
181    /// `min_value` (`_iterative.py:318`, default `-inf`).
182    pub fn with_min_value(mut self, min_value: F) -> Self {
183        self.min_value = min_value;
184        self
185    }
186
187    /// Set the maximum imputed value (the clip upper bound), mirroring sklearn's
188    /// `max_value` (`_iterative.py:319`, default `+inf`).
189    pub fn with_max_value(mut self, max_value: F) -> Self {
190        self.max_value = max_value;
191        self
192    }
193
194    /// Return the maximum number of iterations.
195    #[must_use]
196    pub fn max_iter(&self) -> usize {
197        self.max_iter
198    }
199
200    /// Return the convergence tolerance.
201    #[must_use]
202    pub fn tol(&self) -> F {
203        self.tol
204    }
205
206    /// Return the initial imputation strategy.
207    #[must_use]
208    pub fn initial_strategy(&self) -> InitialStrategy {
209        self.initial_strategy
210    }
211
212    /// Return the feature imputation order.
213    #[must_use]
214    pub fn imputation_order(&self) -> ImputationOrder {
215        self.imputation_order
216    }
217
218    /// Return the minimum imputed value (clip lower bound).
219    #[must_use]
220    pub fn min_value(&self) -> F {
221        self.min_value
222    }
223
224    /// Return the maximum imputed value (clip upper bound).
225    #[must_use]
226    pub fn max_value(&self) -> F {
227        self.max_value
228    }
229}
230
231impl<F: Float + Send + Sync + 'static> Default for IterativeImputer<F> {
232    fn default() -> Self {
233        Self::new(
234            10,
235            F::from(1e-3).unwrap_or_else(F::epsilon),
236            InitialStrategy::Mean,
237        )
238    }
239}
240
241// ---------------------------------------------------------------------------
242// FittedIterativeImputer
243// ---------------------------------------------------------------------------
244
245/// A fitted iterative imputer that stores the ordered sequence of per-feature
246/// [`BayesianRidge`] models learned during fitting, mirroring scikit-learn's
247/// `imputation_sequence_` (`sklearn/impute/_iterative.py:739,798-801`).
248///
249/// Created by calling [`Fit::fit`] on an [`IterativeImputer`]. `transform`
250/// replays this sequence deterministically (no re-fitting), exactly as sklearn's
251/// inductive `transform` does (`_iterative.py:865-873`).
252#[derive(Debug, Clone)]
253pub struct FittedIterativeImputer<F> {
254    /// Per-feature initial fill values (used for initial imputation of transform data).
255    initial_fill: Array1<F>,
256    /// Ordered imputation sequence: `(feat_idx, fitted BayesianRidge)` per round
257    /// per feature, mirroring sklearn's `imputation_sequence_`.
258    imputation_sequence: Vec<ImputationStep<F>>,
259    /// Number of rounds performed (sklearn `n_iter_`).
260    n_iter: usize,
261    /// Minimum imputed value (clip lower bound).
262    min_value: F,
263    /// Maximum imputed value (clip upper bound).
264    max_value: F,
265    /// Initial strategy.
266    initial_strategy: InitialStrategy,
267}
268
269/// One step of the imputation sequence: the feature being imputed plus the
270/// fitted [`BayesianRidge`] model that imputes it.
271#[derive(Debug, Clone)]
272struct ImputationStep<F> {
273    /// Index of the feature this step imputes.
274    feat_idx: usize,
275    /// The fitted per-feature regression model.
276    model: FittedBayesianRidge<F>,
277}
278
279impl<F: Float + Send + Sync + 'static> FittedIterativeImputer<F> {
280    /// Return the number of iterations (rounds) performed during fitting
281    /// (sklearn `n_iter_`).
282    #[must_use]
283    pub fn n_iter(&self) -> usize {
284        self.n_iter
285    }
286
287    /// Return the initial fill values per feature.
288    #[must_use]
289    pub fn initial_fill(&self) -> &Array1<F> {
290        &self.initial_fill
291    }
292
293    /// Return the initial imputation strategy used during fitting.
294    #[must_use]
295    pub fn initial_strategy(&self) -> InitialStrategy {
296        self.initial_strategy
297    }
298}
299
300// ---------------------------------------------------------------------------
301// Helpers
302// ---------------------------------------------------------------------------
303
304/// Compute column means, ignoring NaN values.
305fn column_means_nan<F: Float>(x: &Array2<F>) -> Array1<F> {
306    let n_features = x.ncols();
307    let mut means = Array1::zeros(n_features);
308    for j in 0..n_features {
309        let col = x.column(j);
310        let mut sum = F::zero();
311        let mut count = 0usize;
312        for &v in col {
313            if !v.is_nan() {
314                sum = sum + v;
315                count += 1;
316            }
317        }
318        means[j] = if count > 0 {
319            sum / F::from(count).unwrap_or_else(F::one)
320        } else {
321            F::zero()
322        };
323    }
324    means
325}
326
327/// Compute column medians, ignoring NaN values.
328fn column_medians_nan<F: Float>(x: &Array2<F>) -> Array1<F> {
329    let n_features = x.ncols();
330    let mut medians = Array1::zeros(n_features);
331    for j in 0..n_features {
332        let col = x.column(j);
333        let mut vals: Vec<F> = col.iter().copied().filter(|v| !v.is_nan()).collect();
334        if vals.is_empty() {
335            medians[j] = F::zero();
336        } else {
337            vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
338            let n = vals.len();
339            medians[j] = if n % 2 == 1 {
340                vals[n / 2]
341            } else {
342                (vals[n / 2 - 1] + vals[n / 2]) / (F::one() + F::one())
343            };
344        }
345    }
346    medians
347}
348
349/// Fill NaN values in a matrix with the given fill values.
350fn initial_fill<F: Float>(x: &Array2<F>, fill: &Array1<F>) -> Array2<F> {
351    let mut out = x.to_owned();
352    for (mut col, &f) in out.columns_mut().into_iter().zip(fill.iter()) {
353        for v in &mut col {
354            if v.is_nan() {
355                *v = f;
356            }
357        }
358    }
359    out
360}
361
362/// Compute the visit order of features, mirroring scikit-learn's
363/// `_get_ordered_idx` (`sklearn/impute/_iterative.py:504-542`) with
364/// `skip_complete=False` (so `missing_values_idx = arange(n_features)`).
365///
366/// Returns ALL feature indices in the requested order. Features with no missing
367/// values are kept in the order (matching sklearn) but produce no imputation
368/// (the caller skips them when there are no missing rows). `Ascending` is a
369/// STABLE sort by missing fraction (sklearn's `kind="mergesort"`): ties keep
370/// ascending column order. `frac_of_missing[j]` is `missing_count[j] / n_samples`
371/// but the per-sample denominator is identical across features, so ordering by
372/// the raw missing count is identical to ordering by the fraction.
373fn ordered_feature_idx(missing_counts: &[usize], order: ImputationOrder) -> Vec<usize> {
374    let n_features = missing_counts.len();
375    match order {
376        ImputationOrder::Roman => (0..n_features).collect(),
377        ImputationOrder::Arabic => (0..n_features).rev().collect(),
378        ImputationOrder::Ascending => {
379            let mut idx: Vec<usize> = (0..n_features).collect();
380            // STABLE sort by missing count ascending (sklearn mergesort).
381            idx.sort_by_key(|&j| missing_counts[j]);
382            idx
383        }
384        ImputationOrder::Descending => {
385            // sklearn `descending` is `ascending[::-1]` — the reverse of the
386            // stable ascending order (NOT a stable descending sort).
387            let mut idx: Vec<usize> = (0..n_features).collect();
388            idx.sort_by_key(|&j| missing_counts[j]);
389            idx.reverse();
390            idx
391        }
392    }
393}
394
395/// Clip `v` into `[min_value, max_value]`, mirroring `np.clip` (sklearn
396/// `_iterative.py:455-457`). With the default ±∞ bounds this is the identity.
397fn clip<F: Float>(v: F, min_value: F, max_value: F) -> F {
398    if v < min_value {
399        min_value
400    } else if v > max_value {
401        max_value
402    } else {
403        v
404    }
405}
406
407/// Build the predictor column indices for feature `feat_idx`: all other features
408/// in ascending column order, mirroring scikit-learn's default
409/// `_get_neighbor_feat_idx` (`concatenate((arange(feat_idx), arange(feat_idx+1,
410/// n_features)))`, `_iterative.py:499-501`).
411fn neighbor_feat_idx(n_features: usize, feat_idx: usize) -> Vec<usize> {
412    (0..n_features).filter(|&k| k != feat_idx).collect()
413}
414
415/// Impute one feature `feat_idx` from the others, mirroring scikit-learn's
416/// `_impute_one_feature` (`sklearn/impute/_iterative.py:345-466`) in the
417/// `fit_mode=True`, `sample_posterior=False` path: fit the estimator on the rows
418/// where the feature is OBSERVED (`X = the predictor columns' current filled
419/// values, y = this feature's observed values`, `:408-418`), predict the missing
420/// rows (`:454`), clip the predictions to `[min_value, max_value]` (`:455-457`),
421/// and write them back into `imputed` (`:460-465`). Returns the fitted model.
422///
423/// `mask` marks missing entries of the ORIGINAL `x`. When the feature has no
424/// missing rows the estimator is still fit (matching sklearn) but nothing is
425/// written; `None` is returned so the caller records no replay step.
426fn impute_one_feature<F>(
427    imputed: &mut Array2<F>,
428    mask: &Array2<bool>,
429    feat_idx: usize,
430    predictors: &[usize],
431    min_value: F,
432    max_value: F,
433) -> Result<Option<FittedBayesianRidge<F>>, FerroError>
434where
435    F: ImputerFloat,
436{
437    let n_samples = imputed.nrows();
438    let n_predictors = predictors.len();
439    if n_predictors == 0 {
440        return Ok(None);
441    }
442
443    // Observed (training) rows: feature is NOT missing.
444    let observed_rows: Vec<usize> = (0..n_samples).filter(|&i| !mask[[i, feat_idx]]).collect();
445    if observed_rows.is_empty() {
446        return Ok(None);
447    }
448
449    let mut x_train = Array2::zeros((observed_rows.len(), n_predictors));
450    let mut y_train = Array1::zeros(observed_rows.len());
451    for (r, &i) in observed_rows.iter().enumerate() {
452        for (c, &k) in predictors.iter().enumerate() {
453            x_train[[r, c]] = imputed[[i, k]];
454        }
455        y_train[r] = imputed[[i, feat_idx]];
456    }
457
458    // Default per-feature estimator: BayesianRidge (sklearn `_iterative.py:732-735`).
459    let model = BayesianRidge::<F>::new().fit(&x_train, &y_train)?;
460
461    // Missing rows: predict + clip + write back.
462    let missing_rows: Vec<usize> = (0..n_samples).filter(|&i| mask[[i, feat_idx]]).collect();
463    if !missing_rows.is_empty() {
464        let mut x_test = Array2::zeros((missing_rows.len(), n_predictors));
465        for (r, &i) in missing_rows.iter().enumerate() {
466            for (c, &k) in predictors.iter().enumerate() {
467                x_test[[r, c]] = imputed[[i, k]];
468            }
469        }
470        let preds = model.predict(&x_test)?;
471        for (r, &i) in missing_rows.iter().enumerate() {
472            imputed[[i, feat_idx]] = clip(preds[r], min_value, max_value);
473        }
474    }
475
476    Ok(Some(model))
477}
478
479/// Predict + clip + write back using an already-fitted per-feature model,
480/// mirroring scikit-learn's `transform`-time `_impute_one_feature` with
481/// `fit_mode=False` (`_iterative.py:865-873`).
482fn replay_one_feature<F>(
483    imputed: &mut Array2<F>,
484    mask: &Array2<bool>,
485    feat_idx: usize,
486    predictors: &[usize],
487    model: &FittedBayesianRidge<F>,
488    min_value: F,
489    max_value: F,
490) -> Result<(), FerroError>
491where
492    F: ImputerFloat,
493{
494    let n_samples = imputed.nrows();
495    let missing_rows: Vec<usize> = (0..n_samples).filter(|&i| mask[[i, feat_idx]]).collect();
496    if missing_rows.is_empty() {
497        return Ok(());
498    }
499    let mut x_test = Array2::zeros((missing_rows.len(), predictors.len()));
500    for (r, &i) in missing_rows.iter().enumerate() {
501        for (c, &k) in predictors.iter().enumerate() {
502            x_test[[r, c]] = imputed[[i, k]];
503        }
504    }
505    let preds = model.predict(&x_test)?;
506    for (r, &i) in missing_rows.iter().enumerate() {
507        imputed[[i, feat_idx]] = clip(preds[r], min_value, max_value);
508    }
509    Ok(())
510}
511
512/// Inf-norm of `a - b` over all entries, mirroring `np.linalg.norm(Xt -
513/// Xt_previous, ord=np.inf, axis=None)` (sklearn `_iterative.py:811`): the
514/// maximum absolute element-wise difference.
515fn inf_norm_diff<F: Float>(a: &Array2<F>, b: &Array2<F>) -> F {
516    let mut m = F::zero();
517    for (&x, &y) in a.iter().zip(b.iter()) {
518        let d = (x - y).abs();
519        if d > m {
520            m = d;
521        }
522    }
523    m
524}
525
526/// Maximum absolute value of the OBSERVED entries of the original `x` (entries
527/// where `mask` is `false`), mirroring `np.max(np.abs(X[~mask_missing_values]))`
528/// (sklearn `_iterative.py:780`). Returns `0` when every entry is missing.
529fn max_abs_observed<F: Float>(x: &Array2<F>, mask: &Array2<bool>) -> F {
530    let mut m = F::zero();
531    for (&v, &is_missing) in x.iter().zip(mask.iter()) {
532        if !is_missing {
533            let a = v.abs();
534            if a > m {
535                m = a;
536            }
537        }
538    }
539    m
540}
541
542/// Build the boolean missing mask (`v.is_nan()`) and the per-feature missing
543/// counts for `x`.
544fn missing_mask_and_counts<F: Float>(x: &Array2<F>) -> (Array2<bool>, Vec<usize>) {
545    let (n_samples, n_features) = x.dim();
546    let mut mask = Array2::from_elem((n_samples, n_features), false);
547    let mut counts = vec![0usize; n_features];
548    for j in 0..n_features {
549        for i in 0..n_samples {
550            if x[[i, j]].is_nan() {
551                mask[[i, j]] = true;
552                counts[j] += 1;
553            }
554        }
555    }
556    (mask, counts)
557}
558
559// ---------------------------------------------------------------------------
560// Trait implementations
561// ---------------------------------------------------------------------------
562
563impl<F: ImputerFloat> Fit<Array2<F>, ()> for IterativeImputer<F> {
564    type Fitted = FittedIterativeImputer<F>;
565    type Error = FerroError;
566
567    /// Fit the iterative imputer by round-robin [`BayesianRidge`] regression,
568    /// mirroring `sklearn.impute.IterativeImputer.fit_transform`
569    /// (`sklearn/impute/_iterative.py:693-831`) in the `sample_posterior=False`
570    /// path.
571    ///
572    /// Features with missing values are visited in `imputation_order` (default
573    /// `Ascending`); each is fit on the rows where it is observed and its
574    /// missing rows are predicted and clipped to `[min_value, max_value]`. The
575    /// round-robin repeats up to `max_iter` rounds, breaking early when
576    /// `max|Xt - Xt_prev| < tol * max|X_observed|` (inf-norm convergence,
577    /// `:780,811,818`).
578    ///
579    /// # Errors
580    ///
581    /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
582    /// - Propagates [`FerroError`] from the per-feature `BayesianRidge` fit.
583    ///
584    /// `max_iter == 0` is valid (matching sklearn `_iterative.py:750-752`): the
585    /// iteration loop runs zero times and `fit` returns the initial fill with
586    /// `n_iter() == 0`.
587    fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedIterativeImputer<F>, FerroError> {
588        let n_samples = x.nrows();
589        if n_samples == 0 {
590            return Err(FerroError::InsufficientSamples {
591                required: 1,
592                actual: 0,
593                context: "IterativeImputer::fit".into(),
594            });
595        }
596
597        let n_features = x.ncols();
598
599        // Initial fill values (sklearn `SimpleImputer(strategy=...)`, `:743`).
600        let fill_values = match self.initial_strategy {
601            InitialStrategy::Mean => column_means_nan(x),
602            InitialStrategy::Median => column_medians_nan(x),
603        };
604
605        let (mask, missing_counts) = missing_mask_and_counts(x);
606        let n_features_with_missing = missing_counts.iter().filter(|&&c| c > 0).count();
607
608        // Initial imputation.
609        let mut imputed = initial_fill(x, &fill_values);
610
611        let mut imputation_sequence: Vec<ImputationStep<F>> = Vec::new();
612        let mut n_iter = 0usize;
613
614        // max_iter == 0, all-missing, or single-feature short-circuit
615        // (sklearn `:750-757`).
616        if self.max_iter == 0 || n_features_with_missing == 0 || n_features <= 1 {
617            return Ok(FittedIterativeImputer {
618                initial_fill: fill_values,
619                imputation_sequence,
620                n_iter: 0,
621                min_value: self.min_value,
622                max_value: self.max_value,
623                initial_strategy: self.initial_strategy,
624            });
625        }
626
627        // normalized_tol = tol * max|X_observed| (sklearn `:780`).
628        let normalized_tol = self.tol * max_abs_observed(x, &mask);
629
630        // Feature visit order (sklearn `_get_ordered_idx`, `:769`).
631        let order = ordered_feature_idx(&missing_counts, self.imputation_order);
632
633        let mut prev = imputed.clone();
634
635        // Round-robin loop (sklearn `for self.n_iter_ in range(1, max_iter+1)`).
636        for round in 1..=self.max_iter {
637            n_iter = round;
638            // APPEND every round's fitted models to `imputation_sequence`, matching
639            // sklearn's `self.imputation_sequence_.append(estimator_triplet)` inside
640            // the round loop (`_iterative.py:781,801`): the stored sequence holds
641            // `n_features_with_missing * n_iter` models, in round-then-feature order.
642            // `transform` then replays ALL of them from the initial fill
643            // (`_iterative.py:865-873`), so the inductive transform and the
644            // non-converged fit_transform match sklearn — not just the converged case.
645            for &feat_idx in &order {
646                let predictors = neighbor_feat_idx(n_features, feat_idx);
647                if let Some(model) = impute_one_feature(
648                    &mut imputed,
649                    &mask,
650                    feat_idx,
651                    &predictors,
652                    self.min_value,
653                    self.max_value,
654                )? {
655                    imputation_sequence.push(ImputationStep { feat_idx, model });
656                }
657            }
658
659            // Inf-norm convergence (sklearn `:811,818`).
660            let inf_norm = inf_norm_diff(&imputed, &prev);
661            if inf_norm < normalized_tol {
662                break;
663            }
664            prev = imputed.clone();
665        }
666
667        Ok(FittedIterativeImputer {
668            initial_fill: fill_values,
669            imputation_sequence,
670            n_iter,
671            min_value: self.min_value,
672            max_value: self.max_value,
673            initial_strategy: self.initial_strategy,
674        })
675    }
676}
677
678impl<F: ImputerFloat> Transform<Array2<F>> for FittedIterativeImputer<F> {
679    type Output = Array2<F>;
680    type Error = FerroError;
681
682    /// Impute missing values in `x` by replaying the learned imputation sequence
683    /// without re-fitting, mirroring scikit-learn's inductive `transform`
684    /// (`sklearn/impute/_iterative.py:833-885`): the initial fill is applied,
685    /// then each stored `(feat_idx, estimator)` predicts and clips its feature's
686    /// missing rows in order.
687    ///
688    /// # Errors
689    ///
690    /// Returns [`FerroError::ShapeMismatch`] if the number of columns does not
691    /// match the training data; propagates [`FerroError`] from `predict`.
692    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
693        let n_features = self.initial_fill.len();
694        if x.ncols() != n_features {
695            return Err(FerroError::ShapeMismatch {
696                expected: vec![x.nrows(), n_features],
697                actual: vec![x.nrows(), x.ncols()],
698                context: "FittedIterativeImputer::transform".into(),
699            });
700        }
701
702        // Initial imputation.
703        let mut imputed = initial_fill(x, &self.initial_fill);
704
705        // n_iter_ == 0 (or no recorded steps) → return the initial fill
706        // (sklearn `:857-858`).
707        if self.n_iter == 0 || self.imputation_sequence.is_empty() {
708            return Ok(imputed);
709        }
710
711        let (mask, _) = missing_mask_and_counts(x);
712
713        for step in &self.imputation_sequence {
714            let predictors = neighbor_feat_idx(n_features, step.feat_idx);
715            replay_one_feature(
716                &mut imputed,
717                &mask,
718                step.feat_idx,
719                &predictors,
720                &step.model,
721                self.min_value,
722                self.max_value,
723            )?;
724        }
725
726        Ok(imputed)
727    }
728}
729
730/// Implement `Transform` on the unfitted imputer to satisfy the
731/// `FitTransform: Transform` supertrait bound.
732impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for IterativeImputer<F> {
733    type Output = Array2<F>;
734    type Error = FerroError;
735
736    /// Always returns an error — the imputer must be fitted first.
737    fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
738        Err(FerroError::InvalidParameter {
739            name: "IterativeImputer".into(),
740            reason: "imputer must be fitted before calling transform; use fit() first".into(),
741        })
742    }
743}
744
745impl<F: ImputerFloat> FitTransform<Array2<F>> for IterativeImputer<F> {
746    type FitError = FerroError;
747
748    /// Fit the imputer on `x` and return the imputed output in one step.
749    ///
750    /// # Errors
751    ///
752    /// Returns an error if fitting fails.
753    fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
754        let fitted = self.fit(x, &())?;
755        fitted.transform(x)
756    }
757}
758
759// ---------------------------------------------------------------------------
760// Tests
761// ---------------------------------------------------------------------------
762
763#[cfg(test)]
764mod tests {
765    use super::*;
766    use ndarray::array;
767
768    #[test]
769    fn test_iterative_imputer_basic() {
770        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
771        let x = array![[1.0, 2.0], [3.0, f64::NAN], [f64::NAN, 6.0]];
772        let fitted = imputer.fit(&x, &()).unwrap();
773        let out = fitted.transform(&x).unwrap();
774        // All values should be non-NaN.
775        for v in &out {
776            assert!(!v.is_nan(), "Output contains NaN");
777        }
778    }
779
780    #[test]
781    fn test_iterative_imputer_no_missing() {
782        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
783        let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
784        let fitted = imputer.fit(&x, &()).unwrap();
785        let out = fitted.transform(&x).unwrap();
786        for (a, b) in x.iter().zip(out.iter()) {
787            assert!((a - b).abs() < 1e-10);
788        }
789    }
790
791    #[test]
792    fn test_iterative_imputer_convergence() {
793        let imputer = IterativeImputer::<f64>::new(100, 1e-6, InitialStrategy::Mean);
794        // Correlated features: feature 1 ≈ 2 * feature 0.
795        let x = array![
796            [1.0, 2.0],
797            [2.0, 4.0],
798            [3.0, 6.0],
799            [4.0, f64::NAN],
800            [f64::NAN, 10.0]
801        ];
802        let fitted = imputer.fit(&x, &()).unwrap();
803        let out = fitted.transform(&x).unwrap();
804        // Feature 1 of row 3 should be close to 8.0 (2 * 4.0).
805        assert!(
806            (out[[3, 1]] - 8.0).abs() < 2.0,
807            "Expected ~8.0, got {}",
808            out[[3, 1]]
809        );
810        // Feature 0 of row 4 should be close to 5.0 (10.0 / 2).
811        assert!(
812            (out[[4, 0]] - 5.0).abs() < 2.0,
813            "Expected ~5.0, got {}",
814            out[[4, 0]]
815        );
816    }
817
818    #[test]
819    fn test_iterative_imputer_median_strategy() {
820        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Median);
821        let x = array![[1.0, 10.0], [2.0, 20.0], [3.0, f64::NAN]];
822        let fitted = imputer.fit(&x, &()).unwrap();
823        let out = fitted.transform(&x).unwrap();
824        assert!(!out[[2, 1]].is_nan());
825    }
826
827    #[test]
828    fn test_iterative_imputer_fit_transform() {
829        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
830        let x = array![[1.0, 2.0], [3.0, f64::NAN], [f64::NAN, 6.0]];
831        let out = imputer.fit_transform(&x).unwrap();
832        for v in &out {
833            assert!(!v.is_nan());
834        }
835    }
836
837    #[test]
838    fn test_iterative_imputer_zero_rows_error() {
839        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
840        let x: Array2<f64> = Array2::zeros((0, 3));
841        assert!(imputer.fit(&x, &()).is_err());
842    }
843
844    #[test]
845    fn test_iterative_imputer_zero_max_iter_returns_initial_fill() {
846        // sklearn `_iterative.py:750-752`: max_iter == 0 is VALID — fit_transform
847        // sets n_iter_ = 0 and returns the initial SimpleImputer fill with NO
848        // regression rounds.
849        let imputer = IterativeImputer::<f64>::new(0, 1e-3, InitialStrategy::Mean);
850        let x = array![[1.0, 2.0], [f64::NAN, 3.0], [5.0, f64::NAN], [7.0, 8.0]];
851
852        let fit_res = imputer.fit(&x, &());
853        assert!(
854            fit_res.is_ok(),
855            "max_iter=0 must be accepted (sklearn parity), got {fit_res:?}"
856        );
857        let Ok(fitted) = fit_res else { return };
858        assert_eq!(fitted.n_iter(), 0);
859
860        let out_res = fitted.transform(&x);
861        assert!(
862            out_res.is_ok(),
863            "transform after max_iter=0 fit failed: {out_res:?}"
864        );
865        let Ok(out) = out_res else { return };
866        // Per-column mean initial fill (cols share mean 13/3), observed preserved.
867        let mean_fill = 13.0 / 3.0;
868        let expected = array![[1.0, 2.0], [mean_fill, 3.0], [5.0, mean_fill], [7.0, 8.0]];
869        for (got, want) in out.iter().zip(expected.iter()) {
870            assert!(
871                (got - want).abs() < 1e-9,
872                "max_iter=0 output {got} != initial fill {want}"
873            );
874        }
875    }
876
877    #[test]
878    fn test_iterative_imputer_shape_mismatch_error() {
879        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
880        let x_train = array![[1.0, 2.0], [3.0, 4.0]];
881        let fitted = imputer.fit(&x_train, &()).unwrap();
882        let x_bad = array![[1.0, 2.0, 3.0]];
883        assert!(fitted.transform(&x_bad).is_err());
884    }
885
886    #[test]
887    fn test_iterative_imputer_unfitted_transform_error() {
888        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
889        let x = array![[1.0, 2.0]];
890        assert!(imputer.transform(&x).is_err());
891    }
892
893    #[test]
894    fn test_iterative_imputer_default() {
895        let imputer = IterativeImputer::<f64>::default();
896        assert_eq!(imputer.max_iter(), 10);
897        assert_eq!(imputer.initial_strategy(), InitialStrategy::Mean);
898        // sklearn defaults: ascending order, ±inf clip (`_iterative.py:316,318-319`).
899        assert_eq!(imputer.imputation_order(), ImputationOrder::Ascending);
900        assert!(imputer.min_value().is_infinite() && imputer.min_value() < 0.0);
901        assert!(imputer.max_value().is_infinite() && imputer.max_value() > 0.0);
902    }
903
904    #[test]
905    fn test_iterative_imputer_n_iter_accessor() {
906        let imputer = IterativeImputer::<f64>::new(10, 1e-3, InitialStrategy::Mean);
907        let x = array![[1.0, 2.0], [3.0, f64::NAN], [5.0, 6.0]];
908        let fitted = imputer.fit(&x, &()).unwrap();
909        assert!(fitted.n_iter() > 0);
910        assert!(fitted.n_iter() <= 10);
911    }
912
913    #[test]
914    fn test_iterative_imputer_f32() {
915        let imputer = IterativeImputer::<f32>::new(10, 1e-3, InitialStrategy::Mean);
916        let x: Array2<f32> = array![[1.0f32, 2.0], [3.0, f32::NAN], [5.0, 6.0]];
917        let fitted = imputer.fit(&x, &()).unwrap();
918        let out = fitted.transform(&x).unwrap();
919        assert!(!out[[1, 1]].is_nan());
920    }
921
922    #[test]
923    fn test_ordered_feature_idx_ascending_stable() {
924        // counts [1,2,1] -> ascending stable [0,2,1] (ties keep column order),
925        // matching sklearn argsort(frac, kind="mergesort") (Probe D).
926        assert_eq!(
927            ordered_feature_idx(&[1, 2, 1], ImputationOrder::Ascending),
928            vec![0, 2, 1]
929        );
930        assert_eq!(
931            ordered_feature_idx(&[1, 2, 1], ImputationOrder::Roman),
932            vec![0, 1, 2]
933        );
934        assert_eq!(
935            ordered_feature_idx(&[1, 2, 1], ImputationOrder::Descending),
936            vec![1, 2, 0]
937        );
938        assert_eq!(
939            ordered_feature_idx(&[1, 2, 1], ImputationOrder::Arabic),
940            vec![2, 1, 0]
941        );
942    }
943
944    #[test]
945    fn test_clip_bounds() {
946        assert_eq!(clip(10.0, 0.0, 5.0), 5.0);
947        assert_eq!(clip(-2.0, 0.0, 5.0), 0.0);
948        assert_eq!(clip(3.0, 0.0, 5.0), 3.0);
949        assert_eq!(clip(3.0, f64::NEG_INFINITY, f64::INFINITY), 3.0);
950    }
951}
ferrolearn_preprocess/iterative_imputer.rs

ferrolearn_preprocess/
iterative_imputer.rs