ferrolearn-preprocess 0.5.0

//! Target encoder: encode categorical features using target statistics.
//!
//! [`TargetEncoder`] replaces each category with the mean of the target variable
//! for that category, regularised toward the global mean using smoothing.
//!
//! This is especially useful for high-cardinality categorical features where
//! one-hot encoding would produce too many columns.
//!
//! # Smoothing
//!
//! The encoded value for category `c` is (matching scikit-learn
//! `_target_encoder_fast.pyx:60-75` — the accumulator is seeded with
//! `smooth * global_mean` then the category's targets are added, divided by
//! `smooth + count(c)`):
//!
//! ```text
//! encoded(c) = (smooth * global_mean + sum_of_targets(c)) / (smooth + count(c))
//! ```
//!
//! where `smooth` controls the degree of regularisation.
//!
//! Translation target: scikit-learn 1.5.2 `class TargetEncoder`
//! (`sklearn/preprocessing/_target_encoder.py`). Design:
//! `.design/preprocess/target_encoder.md`. Tracking: #1260.
//!
//! `## REQ status`
//!
//! | REQ | Status | Anchor |
//! |---|---|---|
//! | REQ-1 manual-`smooth` m-estimate value match (f64, bit-exact) | SHIPPED | `TargetEncoder::fit` / `transform`; sklearn `_target_encoder_fast.pyx:60-75`, `_target_encoder.py:289`,`:383` (#1261 pairwise sum, #1262 formula) |
//! | REQ-2 unseen category → `target_mean_` (global mean) | SHIPPED | `transform` `unwrap_or(global_mean)`; sklearn `_target_encoder.py:324-345` |
//! | REQ-3 InsufficientSamples / ShapeMismatch / InvalidParameter errors | SHIPPED | `fit` / `transform` guards; sklearn `_target_encoder.py:189` |
//! | REQ-4 `smooth="auto"` empirical-Bayes encoding + DEFAULT | SHIPPED | `Smooth` enum `{ Auto, Fixed(F) }` (`Default`/`TargetEncoder::default` → `Auto`); `fit_feature_encoding` Auto branch (two-pass means/ssd, `lambda_ = y_variance*count/(y_variance*count+ssd/count)`, NaN→y_mean), `population_variance_f64` (ddof=0) computed once in `fit`; sklearn `_target_encoder_fast.pyx:140-165`, `_target_encoder.py:199`,`:416`. Consumer: `TargetEncoder::fit`/`fit_transform`/`default` (the `Smooth` field drives the encoding branch) + the public module path `ferrolearn_preprocess::target_encoder::Smooth` (`pub mod target_encoder` in `lib.rs`). Verify: pins `divergence_default_smooth_is_auto`/`divergence_smooth_auto_empirical_bayes` green (#2342 #2343) |
//! | REQ-5 cross-fitting `fit_transform` (deterministic KFold) | SHIPPED | `TargetEncoder::fit_transform` cross-fits over `kfold_test_ranges` (contiguous no-shuffle folds, `cv` default 5), per-fold `fit_feature_encoding` on TRAIN rows → encode TEST rows (unseen-in-train → `y_train_mean`); sklearn `_target_encoder.py:232`,`:254-303`, `_split.py:521-534`. Consumer: crate re-export (`lib.rs`). Verify: pin `divergence_crossfit_fit_transform` green (#2344). NOTE: `shuffle`/`random_state` (REQ-8 NOT-STARTED) absent → deterministic `shuffle=False` KFold only |
//! | REQ-6 `target_type` binary/multiclass | NOT-STARTED (#1266) | sklearn `_target_encoder.py:269-273`,`:376-379` |
//! | REQ-7 `categories` param + `categories_`/`target_type_`/`classes_` | NOT-STARTED (#1267) | sklearn `_target_encoder.py:197`,`:358-381` |
//! | REQ-8 `cv`/`shuffle`/`random_state` params | NOT-STARTED (#1268) | sklearn `_target_encoder.py:200-209` |
//! | REQ-9 string/object categories | NOT-STARTED (#1269) | usize-only, R-DEV-3 |
//! | REQ-10 `get_feature_names_out`/`n_features_in_` | NOT-STARTED (#1270) | sklearn `OneToOneFeatureMixin` |
//! | REQ-11 PyO3 binding | NOT-STARTED (#1271) | `ferrolearn-python/src/` (absent) |
//! | REQ-12 ferray substrate | NOT-STARTED (#1272) | R-SUBSTRATE |
//! | REQ-13 per-category sums accumulate in f64 (always), matching sklearn's C `double` | SHIPPED | `fit` accumulates `cat_stats: HashMap<usize,(f64,usize)>` seeded with `smooth_f64*global_mean_f64`, `+= y[i].to_f64()`, then `F::from(sum/(smooth_f64+count))`; sklearn `_target_encoder_fast.pyx:42,44,68` (`double sums[]`/`counts[]`, `sums[cat]+=y[i]` regardless of `Y_DTYPE`), `encodings_` always float64 (`_target_encoder.py:335`). f64 path identity (bit-exact unchanged); `TargetEncoder<f32>` now captures `2^24+1` (#1263) |

use ferrolearn_core::error::FerroError;
use ferrolearn_core::traits::{Fit, Transform};
use ndarray::{Array1, Array2};
use num_traits::Float;
use std::collections::HashMap;

/// The smoothing strategy for [`TargetEncoder`].
///
/// Mirrors scikit-learn's `smooth` parameter
/// (`sklearn/preprocessing/_target_encoder.py:189`,
/// `"smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")]`),
/// whose DEFAULT is the string `"auto"` (an empirical-Bayes estimate,
/// `_target_encoder.py:85-89`) rather than a fixed numeric value.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Smooth<F> {
    /// `smooth="auto"` — the empirical-Bayes shrinkage estimate
    /// (`_target_encoder_fast.pyx:140-165`): per category blend the category
    /// mean toward the global mean by a `lambda_` derived from the
    /// within-category sum-of-squared deviations vs the overall target variance.
    Auto,
    /// A fixed numeric smoothing factor `m` driving the m-estimate
    /// `(smooth * y_mean + Σyᵢ) / (smooth + count)`
    /// (`_target_encoder_fast.pyx:60-75`). Must be non-negative.
    Fixed(F),
}

impl<F: Float> Default for Smooth<F> {
    /// The default matches scikit-learn's constructor default `smooth="auto"`
    /// (`_target_encoder.py:199`).
    fn default() -> Self {
        Smooth::Auto
    }
}

/// Sum a slice reproducing NumPy's pairwise summation (the algorithm behind
/// `np.add.reduce` / `np.mean`), so a ferrolearn mean bit-matches sklearn's
/// `np.mean` on ill-conditioned mixed-magnitude inputs.
///
/// sklearn sets `target_mean_ = np.mean(y, axis=0)`
/// (`sklearn/preprocessing/_target_encoder.py:383`), and `np.mean` reduces via
/// NumPy pairwise summation, which rounds differently from a naive left-fold on
/// targets that mix magnitudes.
///
/// Mirrors NumPy `pairwise_sum` (numpy/_core/src/umath/loops_utils.h.src):
/// - `n < 8`        : straight sequential sum seeded from the first element.
/// - `8 <= n <= 128`: 8 partial accumulators, unrolled by 8, combined as
///   `((r0+r1)+(r2+r3)) + ((r4+r5)+(r6+r7))`, then the tail.
/// - `n > 128`      : split at `n2 = (n/2)` rounded DOWN to a multiple of 8, recurse.
fn pairwise_sum<F: Float>(data: &[F]) -> F {
    let n = data.len();
    if n == 0 {
        return F::zero();
    }
    if n < 8 {
        // Seed from the first element, then fold the rest left-to-right (numpy).
        data[1..].iter().fold(data[0], |a, &v| a + v)
    } else if n <= 128 {
        let mut r0 = data[0];
        let mut r1 = data[1];
        let mut r2 = data[2];
        let mut r3 = data[3];
        let mut r4 = data[4];
        let mut r5 = data[5];
        let mut r6 = data[6];
        let mut r7 = data[7];
        let bound = n - (n % 8);
        let mut i = 8;
        while i < bound {
            r0 = r0 + data[i];
            r1 = r1 + data[i + 1];
            r2 = r2 + data[i + 2];
            r3 = r3 + data[i + 3];
            r4 = r4 + data[i + 4];
            r5 = r5 + data[i + 5];
            r6 = r6 + data[i + 6];
            r7 = r7 + data[i + 7];
            i += 8;
        }
        let res = ((r0 + r1) + (r2 + r3)) + ((r4 + r5) + (r6 + r7));
        // Add the remainder (indices `bound..n`) left-to-right (numpy tail).
        data[bound..].iter().fold(res, |a, &v| a + v)
    } else {
        let mut n2 = n / 2;
        n2 -= n2 % 8;
        pairwise_sum(&data[..n2]) + pairwise_sum(&data[n2..])
    }
}

/// `np.mean(y)` over the first `n` elements via NumPy pairwise summation
/// (`_target_encoder.py:383` `target_mean_ = np.mean(y, axis=0)`).
fn mean_pairwise<F: Float>(y: &Array1<F>, n: usize) -> F {
    let total = if let Some(slice) = y.as_slice() {
        pairwise_sum(slice)
    } else {
        let v: Vec<F> = y.iter().copied().collect();
        pairwise_sum(&v)
    };
    total / F::from(n).unwrap_or_else(F::one)
}

/// The POPULATION variance of `y` (`np.var(y)`, ddof=0), computed in f64 to
/// match scikit-learn's C `double` accumulation. sklearn evaluates
/// `y_variance = np.var(y)` once per fit (`_target_encoder.py:416`) and feeds it
/// into the empirical-Bayes `lambda_` (`_target_encoder_fast.pyx:152-156`).
///
/// `mean_f64` is the already-computed `np.mean(y)` (`np.var` subtracts the same
/// mean); the squared deviations are reduced via NumPy pairwise summation, which
/// `np.var` uses internally.
fn population_variance_f64<F: Float>(y: &Array1<F>, mean_f64: f64) -> f64 {
    let n = y.len();
    if n == 0 {
        return 0.0;
    }
    let sq: Vec<f64> = y
        .iter()
        .map(|&v| {
            let d = v.to_f64().unwrap_or(0.0) - mean_f64;
            d * d
        })
        .collect();
    pairwise_sum(&sq) / n as f64
}

/// Learn the per-category encoding for ONE feature column.
///
/// Dispatches on the [`Smooth`] strategy. All arithmetic is done in f64
/// (matching sklearn's C `double` accumulators, `_target_encoder_fast.pyx:42,44`)
/// then cast to `F`; for `F = f64` the round-trip is the identity.
///
/// - [`Smooth::Fixed`] reproduces `_fit_encoding_fast` (`:55-77`): seed each
///   category with `(smooth*y_mean, smooth)`, add `(yᵢ, 1)` per sample, then
///   `encoding = sum/count`, or `y_mean` when `count == 0`.
/// - [`Smooth::Auto`] reproduces `_fit_encoding_fast_auto_smooth`
///   (`:120-165`): two passes (mean, then sum-of-squared-diffs), a per-category
///   `lambda_ = y_variance*count / (y_variance*count + ssd/count)`, blended as
///   `lambda_*mean + (1-lambda_)*y_mean`; a NaN `lambda_` (count 0, or
///   `y_variance == 0 && ssd == 0`) falls back to `y_mean`.
fn fit_feature_encoding<F: Float>(
    col: &[usize],
    y: &Array1<F>,
    smooth: Smooth<F>,
    y_mean_f64: f64,
    y_variance_f64: Option<f64>,
) -> HashMap<usize, F> {
    match smooth {
        Smooth::Fixed(s) => {
            let smooth_f64 = s.to_f64().unwrap_or(0.0);
            // Seed each category's accumulator with `(smooth*y_mean, smooth)`,
            // add each sample's `(yᵢ, 1)` in row order, then `sum/count`
            // (`_target_encoder_fast.pyx:60-75`).
            let mut stats: HashMap<usize, (f64, f64)> = HashMap::new();
            for (i, &cat) in col.iter().enumerate() {
                let entry = stats
                    .entry(cat)
                    .or_insert((smooth_f64 * y_mean_f64, smooth_f64));
                entry.0 += y[i].to_f64().unwrap_or(0.0);
                entry.1 += 1.0;
            }
            let mut map: HashMap<usize, F> = HashMap::new();
            for (&cat, &(sum, count)) in &stats {
                // `count` is `smooth + n_cat`; it is 0 only when smooth==0 AND
                // the category has no rows — which cannot happen here since a
                // category key exists only if a sample produced it. Guard anyway
                // to mirror sklearn's `if counts[cat]==0 -> y_mean` (`:72-73`).
                let encoded = if count == 0.0 {
                    y_mean_f64
                } else {
                    sum / count
                };
                map.insert(cat, F::from(encoded).unwrap_or_else(F::zero));
            }
            map
        }
        Smooth::Auto => {
            let y_variance = y_variance_f64.unwrap_or(0.0);
            // First pass: per-category sum + count (-> means).
            let mut sums: HashMap<usize, f64> = HashMap::new();
            let mut counts: HashMap<usize, f64> = HashMap::new();
            for (i, &cat) in col.iter().enumerate() {
                *sums.entry(cat).or_insert(0.0) += y[i].to_f64().unwrap_or(0.0);
                *counts.entry(cat).or_insert(0.0) += 1.0;
            }
            let means: HashMap<usize, f64> = sums
                .iter()
                .map(|(&cat, &s)| (cat, s / counts[&cat]))
                .collect();
            // Second pass: per-category sum of squared deviations from the mean
            // (`_target_encoder_fast.pyx:143-149`).
            let mut ssd: HashMap<usize, f64> = HashMap::new();
            for (i, &cat) in col.iter().enumerate() {
                let diff = y[i].to_f64().unwrap_or(0.0) - means[&cat];
                *ssd.entry(cat).or_insert(0.0) += diff * diff;
            }
            let mut map: HashMap<usize, F> = HashMap::new();
            for (&cat, &mean) in &means {
                let count = counts[&cat];
                let ssd_cat = ssd[&cat];
                // lambda_ = y_variance*count / (y_variance*count + ssd/count)
                // (`_target_encoder_fast.pyx:152-156`).
                let denom = y_variance * count + ssd_cat / count;
                let lambda = (y_variance * count) / denom;
                let encoded = if lambda.is_nan() {
                    // NaN when count==0 OR (y_variance==0 AND ssd==0): -> y_mean
                    // (`_target_encoder_fast.pyx:157-161`).
                    y_mean_f64
                } else {
                    lambda * mean + (1.0 - lambda) * y_mean_f64
                };
                map.insert(cat, F::from(encoded).unwrap_or_else(F::zero));
            }
            map
        }
    }
}

// ---------------------------------------------------------------------------
// TargetEncoder (unfitted)
// ---------------------------------------------------------------------------

/// An unfitted target encoder.
///
/// Takes a matrix of categorical integer features and a continuous (or binary)
/// target vector at fit time. Each category is encoded as the smoothed mean of
/// the target for that category.
///
/// # Parameters
///
/// - `smooth` — the smoothing strategy ([`Smooth`]). The DEFAULT is
///   [`Smooth::Auto`] (empirical Bayes), matching scikit-learn's constructor
///   default `smooth="auto"` (`_target_encoder.py:199`). [`Smooth::Fixed`]
///   selects the fixed m-estimate; higher values regularise more toward the
///   global mean, `Fixed(0)` is no smoothing.
/// - `cv` — the number of cross-fitting folds used by
///   [`fit_transform`](TargetEncoder::fit_transform) (default 5, matching
///   scikit-learn's `cv=5`, `_target_encoder.py:200`).
///
/// # Examples
///
/// ```
/// use ferrolearn_preprocess::target_encoder::TargetEncoder;
/// use ferrolearn_core::traits::{Fit, Transform};
/// use ndarray::array;
///
/// let enc = TargetEncoder::<f64>::new(1.0);
/// let x = array![[0usize, 1], [0, 0], [1, 1], [1, 0]];
/// let y = array![1.0, 2.0, 3.0, 4.0];
/// let fitted = enc.fit(&x, &y).unwrap();
/// let out = fitted.transform(&x).unwrap();
/// assert_eq!(out.shape(), &[4, 2]);
/// ```
#[must_use]
#[derive(Debug, Clone)]
pub struct TargetEncoder<F> {
    /// Smoothing strategy.
    smooth: Smooth<F>,
    /// Number of cross-fitting folds for `fit_transform`.
    cv: usize,
}

impl<F: Float + Send + Sync + 'static> TargetEncoder<F> {
    /// Create a new `TargetEncoder` with a FIXED smoothing factor.
    ///
    /// This is shorthand for [`with_smooth`](Self::with_smooth) with
    /// [`Smooth::Fixed`] and `cv = 5`.
    pub fn new(smooth: F) -> Self {
        Self {
            smooth: Smooth::Fixed(smooth),
            cv: 5,
        }
    }

    /// Create a new `TargetEncoder` with the given smoothing strategy and
    /// `cv = 5` (matching scikit-learn's default).
    pub fn with_smooth(smooth: Smooth<F>) -> Self {
        Self { smooth, cv: 5 }
    }

    /// Set the number of cross-fitting folds used by
    /// [`fit_transform`](Self::fit_transform).
    pub fn with_cv(mut self, cv: usize) -> Self {
        self.cv = cv;
        self
    }

    /// Return the smoothing strategy.
    #[must_use]
    pub fn smooth(&self) -> Smooth<F> {
        self.smooth
    }

    /// Return the number of cross-fitting folds.
    #[must_use]
    pub fn cv(&self) -> usize {
        self.cv
    }
}

impl<F: Float + Send + Sync + 'static> Default for TargetEncoder<F> {
    /// The default uses [`Smooth::Auto`] (empirical Bayes) and `cv = 5`,
    /// matching scikit-learn's `TargetEncoder()` (`smooth="auto"`, `cv=5`,
    /// `_target_encoder.py:199-200`).
    fn default() -> Self {
        Self {
            smooth: Smooth::Auto,
            cv: 5,
        }
    }
}

// ---------------------------------------------------------------------------
// FittedTargetEncoder
// ---------------------------------------------------------------------------

/// A fitted target encoder holding per-feature, per-category encoding values.
///
/// Created by calling [`Fit::fit`] on a [`TargetEncoder`].
#[derive(Debug, Clone)]
pub struct FittedTargetEncoder<F> {
    /// Per-feature mapping from category → encoded value.
    category_maps: Vec<HashMap<usize, F>>,
    /// Global target mean (used for unseen categories).
    global_mean: F,
}

impl<F: Float + Send + Sync + 'static> FittedTargetEncoder<F> {
    /// Return the encoding maps per feature.
    #[must_use]
    pub fn category_maps(&self) -> &[HashMap<usize, F>] {
        &self.category_maps
    }

    /// Return the global target mean.
    #[must_use]
    pub fn global_mean(&self) -> F {
        self.global_mean
    }
}

// ---------------------------------------------------------------------------
// Trait implementations
// ---------------------------------------------------------------------------

impl<F: Float + Send + Sync + 'static> Fit<Array2<usize>, Array1<F>> for TargetEncoder<F> {
    type Fitted = FittedTargetEncoder<F>;
    type Error = FerroError;

    /// Fit the encoder by computing smoothed target means per category.
    ///
    /// # Errors
    ///
    /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
    /// - [`FerroError::ShapeMismatch`] if `x` rows and `y` length differ.
    /// - [`FerroError::InvalidParameter`] if `smooth` is negative.
    fn fit(&self, x: &Array2<usize>, y: &Array1<F>) -> Result<FittedTargetEncoder<F>, FerroError> {
        let n_samples = x.nrows();
        if n_samples == 0 {
            return Err(FerroError::InsufficientSamples {
                required: 1,
                actual: 0,
                context: "TargetEncoder::fit".into(),
            });
        }
        if y.len() != n_samples {
            return Err(FerroError::ShapeMismatch {
                expected: vec![n_samples],
                actual: vec![y.len()],
                context: "TargetEncoder::fit — y must have same length as x rows".into(),
            });
        }
        if let Smooth::Fixed(s) = self.smooth
            && s < F::zero()
        {
            return Err(FerroError::InvalidParameter {
                name: "smooth".into(),
                reason: "smoothing factor must be non-negative".into(),
            });
        }

        let n_features = x.ncols();
        // sklearn: target_mean_ = np.mean(y, axis=0) (_target_encoder.py:383),
        // which reduces via NumPy pairwise summation. Reproduce it bit-for-bit so
        // the mean matches on mixed-magnitude targets.
        let global_mean = mean_pairwise(y, n_samples);
        let global_mean_f64 = global_mean.to_f64().unwrap_or(0.0);

        // For `smooth="auto"` (empirical Bayes) sklearn needs the POPULATION
        // variance of the full target, computed once per fit
        // (`_target_encoder.py:416` `y_variance = np.var(y)`).
        let y_variance_f64 = match self.smooth {
            Smooth::Auto => Some(population_variance_f64(y, global_mean_f64)),
            Smooth::Fixed(_) => None,
        };

        let mut category_maps = Vec::with_capacity(n_features);
        for j in 0..n_features {
            let col: Vec<usize> = (0..n_samples).map(|i| x[[i, j]]).collect();
            category_maps.push(fit_feature_encoding(
                &col,
                y,
                self.smooth,
                global_mean_f64,
                y_variance_f64,
            ));
        }

        Ok(FittedTargetEncoder {
            category_maps,
            global_mean,
        })
    }
}

/// The contiguous (un-shuffled) KFold test-index folds over `n` samples.
///
/// Mirrors scikit-learn's `KFold._iter_test_indices`
/// (`sklearn/model_selection/_split.py:521-534`) with `shuffle=False`: the
/// indices are `0..n` in order, split into `k` consecutive folds where the
/// first `n % k` folds have size `n // k + 1` and the rest `n // k`. Returns a
/// vec of `(test_start, test_end)` half-open ranges.
fn kfold_test_ranges(n: usize, k: usize) -> Vec<(usize, usize)> {
    let base = n / k;
    let rem = n % k;
    let mut ranges = Vec::with_capacity(k);
    let mut current = 0usize;
    for fold in 0..k {
        let size = base + usize::from(fold < rem);
        ranges.push((current, current + size));
        current += size;
    }
    ranges
}

impl<F: Float + Send + Sync + 'static> TargetEncoder<F> {
    /// Cross-fitting `fit_transform`: encode each row using encodings learned on
    /// the OTHER folds, preventing target leakage.
    ///
    /// Mirrors scikit-learn's `TargetEncoder.fit_transform`
    /// (`sklearn/preprocessing/_target_encoder.py:232-303`): for the
    /// continuous/binary single-output case it uses a deterministic `KFold`
    /// (`cv` folds, NO shuffle — ferrolearn exposes no `shuffle`/`random_state`,
    /// so this is sklearn's reproducible `shuffle=False` path, `:262`); for each
    /// `(train, test)` fold it fits the per-feature encodings on the TRAIN rows
    /// (with that fold's `y_train_mean`) and writes the TEST rows through those
    /// train-encodings (`:277-302`). A category unseen in the train fold encodes
    /// to `y_train_mean` (the `count == 0 -> y_mean` rule, mirroring
    /// `_transform_X_ordinal`'s unknown-category fallback, `:494-497`).
    ///
    /// Note `fit(X,y).transform(X)` does NOT equal `fit_transform(X,y)`
    /// (`:235-238`): `transform` uses the full-data `encodings_`, `fit_transform`
    /// is cross-fit.
    ///
    /// # Errors
    ///
    /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
    /// - [`FerroError::ShapeMismatch`] if `x` rows and `y` length differ.
    /// - [`FerroError::InvalidParameter`] if a [`Smooth::Fixed`] factor is
    ///   negative, or if `cv < 2` / `cv` exceeds the sample count (sklearn
    ///   requires `cv >= 2`, `_target_encoder.py:190`, and `KFold` rejects more
    ///   splits than samples, `_split.py:408-414`).
    pub fn fit_transform(&self, x: &Array2<usize>, y: &Array1<F>) -> Result<Array2<F>, FerroError> {
        let n_samples = x.nrows();
        if n_samples == 0 {
            return Err(FerroError::InsufficientSamples {
                required: 1,
                actual: 0,
                context: "TargetEncoder::fit_transform".into(),
            });
        }
        if y.len() != n_samples {
            return Err(FerroError::ShapeMismatch {
                expected: vec![n_samples],
                actual: vec![y.len()],
                context: "TargetEncoder::fit_transform — y must have same length as x rows".into(),
            });
        }
        if let Smooth::Fixed(s) = self.smooth
            && s < F::zero()
        {
            return Err(FerroError::InvalidParameter {
                name: "smooth".into(),
                reason: "smoothing factor must be non-negative".into(),
            });
        }
        // sklearn `_parameter_constraints` requires `cv >= 2`
        // (`_target_encoder.py:190`); `KFold` additionally rejects more splits
        // than samples (`_split.py:408-414`).
        if self.cv < 2 {
            return Err(FerroError::InvalidParameter {
                name: "cv".into(),
                reason: "cv must be at least 2".into(),
            });
        }
        if self.cv > n_samples {
            return Err(FerroError::InvalidParameter {
                name: "cv".into(),
                reason: "cv cannot exceed the number of samples".into(),
            });
        }

        let n_features = x.ncols();
        let mut out = Array2::zeros((n_samples, n_features));

        for (test_start, test_end) in kfold_test_ranges(n_samples, self.cv) {
            // Train indices are everything OUTSIDE the contiguous test fold.
            let train_idx: Vec<usize> = (0..n_samples)
                .filter(|&i| i < test_start || i >= test_end)
                .collect();

            // y_train_mean = np.mean(y[train]) (`_target_encoder.py:279`).
            let y_train: Vec<F> = train_idx.iter().map(|&i| y[i]).collect();
            let y_train_arr = Array1::from(y_train);
            let train_mean = mean_pairwise(&y_train_arr, train_idx.len());
            let train_mean_f64 = train_mean.to_f64().unwrap_or(0.0);
            let train_var_f64 = match self.smooth {
                Smooth::Auto => Some(population_variance_f64(&y_train_arr, train_mean_f64)),
                Smooth::Fixed(_) => None,
            };

            for j in 0..n_features {
                // Fit this fold's per-feature encoding on the TRAIN rows.
                let train_col: Vec<usize> = train_idx.iter().map(|&i| x[[i, j]]).collect();
                let enc = fit_feature_encoding(
                    &train_col,
                    &y_train_arr,
                    self.smooth,
                    train_mean_f64,
                    train_var_f64,
                );
                // Encode the TEST rows; a category unseen in the train fold ->
                // the train y_mean (`_transform_X_ordinal`, `:494-497`).
                for i in test_start..test_end {
                    let cat = x[[i, j]];
                    out[[i, j]] = *enc.get(&cat).unwrap_or(&train_mean);
                }
            }
        }

        Ok(out)
    }
}

impl<F: Float + Send + Sync + 'static> Transform<Array2<usize>> for FittedTargetEncoder<F> {
    type Output = Array2<F>;
    type Error = FerroError;

    /// Encode categorical features using the learned target statistics.
    ///
    /// Unseen categories are encoded as the global target mean.
    ///
    /// # Errors
    ///
    /// Returns [`FerroError::ShapeMismatch`] if the number of columns differs
    /// from the number of features seen during fitting.
    fn transform(&self, x: &Array2<usize>) -> Result<Array2<F>, FerroError> {
        let n_features = self.category_maps.len();
        if x.ncols() != n_features {
            return Err(FerroError::ShapeMismatch {
                expected: vec![x.nrows(), n_features],
                actual: vec![x.nrows(), x.ncols()],
                context: "FittedTargetEncoder::transform".into(),
            });
        }

        let n_samples = x.nrows();
        let mut out = Array2::zeros((n_samples, n_features));

        for j in 0..n_features {
            let cat_map = &self.category_maps[j];
            for i in 0..n_samples {
                let cat = x[[i, j]];
                out[[i, j]] = *cat_map.get(&cat).unwrap_or(&self.global_mean);
            }
        }

        Ok(out)
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use ndarray::array;

    #[test]
    fn test_target_encoder_basic() {
        let enc = TargetEncoder::<f64>::new(0.0); // no smoothing
        // Category 0: targets [1.0, 2.0], mean = 1.5
        // Category 1: targets [3.0, 4.0], mean = 3.5
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_abs_diff_eq!(out[[0, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[1, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[2, 0]], 3.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[3, 0]], 3.5, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_smoothing() {
        let enc = TargetEncoder::<f64>::new(2.0);
        // Category 0: targets [1.0], mean = 1.0, count = 1
        // Category 1: targets [3.0, 5.0], mean = 4.0, count = 2
        // Global mean = (1 + 3 + 5) / 3 = 3.0
        let x = array![[0usize], [1], [1]];
        let y = array![1.0, 3.0, 5.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        // Cat 0: (1 * 1.0 + 2 * 3.0) / (1 + 2) = 7/3 ≈ 2.333
        let expected_0 = (1.0 * 1.0 + 2.0 * 3.0) / (1.0 + 2.0);
        assert_abs_diff_eq!(out[[0, 0]], expected_0, epsilon = 1e-10);
        // Cat 1: (2 * 4.0 + 2 * 3.0) / (2 + 2) = 14/4 = 3.5
        let expected_1 = (2.0 * 4.0 + 2.0 * 3.0) / (2.0 + 2.0);
        assert_abs_diff_eq!(out[[1, 0]], expected_1, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_unseen_category() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        // Transform with unseen category 2
        let x_new = array![[2usize]];
        let out = fitted.transform(&x_new).unwrap();
        // Unseen category → global mean = 2.5
        assert_abs_diff_eq!(out[[0, 0]], 2.5, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_multi_feature() {
        let enc = TargetEncoder::<f64>::new(0.0);
        let x = array![[0usize, 1], [0, 0], [1, 1], [1, 0]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_eq!(out.shape(), &[4, 2]);
    }

    #[test]
    fn test_target_encoder_zero_rows_error() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x: Array2<usize> = Array2::zeros((0, 2));
        let y: Array1<f64> = Array1::zeros(0);
        assert!(enc.fit(&x, &y).is_err());
    }

    #[test]
    fn test_target_encoder_shape_mismatch_fit() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize], [1]];
        let y = array![1.0]; // wrong length
        assert!(enc.fit(&x, &y).is_err());
    }

    #[test]
    fn test_target_encoder_shape_mismatch_transform() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize, 1], [1, 0]];
        let y = array![1.0, 2.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let x_bad = array![[0usize]]; // wrong number of columns
        assert!(fitted.transform(&x_bad).is_err());
    }

    #[test]
    fn test_target_encoder_negative_smooth_error() {
        let enc = TargetEncoder::<f64>::new(-1.0);
        let x = array![[0usize]];
        let y = array![1.0];
        assert!(enc.fit(&x, &y).is_err());
    }

    #[test]
    fn test_target_encoder_default() {
        // sklearn's DEFAULT is smooth="auto" (`_target_encoder.py:199`), NOT a
        // fixed value; `new(F)` is the explicit fixed-smooth constructor.
        let enc = TargetEncoder::<f64>::default();
        assert_eq!(enc.smooth(), Smooth::Auto);
        assert_eq!(enc.cv(), 5);
        let fixed = TargetEncoder::<f64>::new(1.0);
        assert_eq!(fixed.smooth(), Smooth::Fixed(1.0));
    }

    #[test]
    fn test_target_encoder_global_mean_accessor() {
        let enc = TargetEncoder::<f64>::new(0.0);
        let x = array![[0usize], [1]];
        let y = array![2.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        assert_abs_diff_eq!(fitted.global_mean(), 3.0, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_f32() {
        let enc = TargetEncoder::<f32>::new(1.0f32);
        let x = array![[0usize], [0], [1]];
        let y: Array1<f32> = array![1.0f32, 2.0, 3.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert!(!out[[0, 0]].is_nan());
    }
}