ferrolearn_preprocess/
target_encoder.rs

1//! Target encoder: encode categorical features using target statistics.
2//!
3//! [`TargetEncoder`] replaces each category with the mean of the target variable
4//! for that category, regularised toward the global mean using smoothing.
5//!
6//! This is especially useful for high-cardinality categorical features where
7//! one-hot encoding would produce too many columns.
8//!
9//! # Smoothing
10//!
11//! The encoded value for category `c` is (matching scikit-learn
12//! `_target_encoder_fast.pyx:60-75` — the accumulator is seeded with
13//! `smooth * global_mean` then the category's targets are added, divided by
14//! `smooth + count(c)`):
15//!
16//! ```text
17//! encoded(c) = (smooth * global_mean + sum_of_targets(c)) / (smooth + count(c))
18//! ```
19//!
20//! where `smooth` controls the degree of regularisation.
21//!
22//! Translation target: scikit-learn 1.5.2 `class TargetEncoder`
23//! (`sklearn/preprocessing/_target_encoder.py`). Design:
24//! `.design/preprocess/target_encoder.md`. Tracking: #1260.
25//!
26//! `## REQ status`
27//!
28//! | REQ | Status | Anchor |
29//! |---|---|---|
30//! | REQ-1 manual-`smooth` m-estimate value match (f64, bit-exact) | SHIPPED | `TargetEncoder::fit` / `transform`; sklearn `_target_encoder_fast.pyx:60-75`, `_target_encoder.py:289`,`:383` (#1261 pairwise sum, #1262 formula) |
31//! | REQ-2 unseen category → `target_mean_` (global mean) | SHIPPED | `transform` `unwrap_or(global_mean)`; sklearn `_target_encoder.py:324-345` |
32//! | REQ-3 InsufficientSamples / ShapeMismatch / InvalidParameter errors | SHIPPED | `fit` / `transform` guards; sklearn `_target_encoder.py:189` |
33//! | REQ-4 `smooth="auto"` empirical-Bayes encoding + DEFAULT | SHIPPED | `Smooth` enum `{ Auto, Fixed(F) }` (`Default`/`TargetEncoder::default` → `Auto`); `fit_feature_encoding` Auto branch (two-pass means/ssd, `lambda_ = y_variance*count/(y_variance*count+ssd/count)`, NaN→y_mean), `population_variance_f64` (ddof=0) computed once in `fit`; sklearn `_target_encoder_fast.pyx:140-165`, `_target_encoder.py:199`,`:416`. Consumer: `TargetEncoder::fit`/`fit_transform`/`default` (the `Smooth` field drives the encoding branch) + the public module path `ferrolearn_preprocess::target_encoder::Smooth` (`pub mod target_encoder` in `lib.rs`). Verify: pins `divergence_default_smooth_is_auto`/`divergence_smooth_auto_empirical_bayes` green (#2342 #2343) |
34//! | REQ-5 cross-fitting `fit_transform` (deterministic KFold) | SHIPPED | `TargetEncoder::fit_transform` cross-fits over `kfold_test_ranges` (contiguous no-shuffle folds, `cv` default 5), per-fold `fit_feature_encoding` on TRAIN rows → encode TEST rows (unseen-in-train → `y_train_mean`); sklearn `_target_encoder.py:232`,`:254-303`, `_split.py:521-534`. Consumer: crate re-export (`lib.rs`). Verify: pin `divergence_crossfit_fit_transform` green (#2344). NOTE: `shuffle`/`random_state` (REQ-8 NOT-STARTED) absent → deterministic `shuffle=False` KFold only |
35//! | REQ-6 `target_type` binary/multiclass | NOT-STARTED (#1266) | sklearn `_target_encoder.py:269-273`,`:376-379` |
36//! | REQ-7 `categories` param + `categories_`/`target_type_`/`classes_` | NOT-STARTED (#1267) | sklearn `_target_encoder.py:197`,`:358-381` |
//! | REQ-8 `shuffle`/`random_state` params (`cv` itself is already configurable via `TargetEncoder::with_cv`, default 5) | NOT-STARTED (#1268) | sklearn `_target_encoder.py:200-209` |
38//! | REQ-9 string/object categories | NOT-STARTED (#1269) | usize-only, R-DEV-3 |
39//! | REQ-10 `get_feature_names_out`/`n_features_in_` | NOT-STARTED (#1270) | sklearn `OneToOneFeatureMixin` |
40//! | REQ-11 PyO3 binding | NOT-STARTED (#1271) | `ferrolearn-python/src/` (absent) |
41//! | REQ-12 ferray substrate | NOT-STARTED (#1272) | R-SUBSTRATE |
//! | REQ-13 per-category sums accumulate in f64 (always), matching sklearn's C `double` | SHIPPED | `fit_feature_encoding` accumulates `stats: HashMap<usize, (f64, f64)>` seeded with `(smooth_f64 * y_mean_f64, smooth_f64)`, `+= y[i].to_f64()`, then `F::from(sum / count)`; sklearn `_target_encoder_fast.pyx:42,44,68` (`double sums[]`/`counts[]`, `sums[cat]+=y[i]` regardless of `Y_DTYPE`), `encodings_` always float64 (`_target_encoder.py:335`). f64 path identity (bit-exact unchanged); `TargetEncoder<f32>` now captures `2^24+1` (#1263) |
43
44use ferrolearn_core::error::FerroError;
45use ferrolearn_core::traits::{Fit, Transform};
46use ndarray::{Array1, Array2};
47use num_traits::Float;
48use std::collections::HashMap;
49
/// The smoothing strategy for [`TargetEncoder`].
///
/// Mirrors scikit-learn's `smooth` parameter
/// (`sklearn/preprocessing/_target_encoder.py:189`,
/// `"smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")]`),
/// whose DEFAULT is the string `"auto"` (an empirical-Bayes estimate,
/// `_target_encoder.py:85-89`) rather than a fixed numeric value.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Smooth<F> {
    /// `smooth="auto"` — the empirical-Bayes shrinkage estimate
    /// (`_target_encoder_fast.pyx:140-165`): per category blend the category
    /// mean toward the global mean by a `lambda_` derived from the
    /// within-category sum-of-squared deviations vs the overall target variance.
    Auto,
    /// A fixed numeric smoothing factor `m` driving the m-estimate
    /// `(smooth * y_mean + Σyᵢ) / (smooth + count)`
    /// (`_target_encoder_fast.pyx:60-75`). Must be non-negative.
    Fixed(F),
}

// No bound on `F`: the default variant carries no payload, so `Default` is
// available for every instantiation, not only floating-point ones.
impl<F> Default for Smooth<F> {
    /// The default matches scikit-learn's constructor default `smooth="auto"`
    /// (`_target_encoder.py:199`).
    fn default() -> Self {
        Smooth::Auto
    }
}
77
78/// Sum a slice reproducing NumPy's pairwise summation (the algorithm behind
79/// `np.add.reduce` / `np.mean`), so a ferrolearn mean bit-matches sklearn's
80/// `np.mean` on ill-conditioned mixed-magnitude inputs.
81///
82/// sklearn sets `target_mean_ = np.mean(y, axis=0)`
83/// (`sklearn/preprocessing/_target_encoder.py:383`), and `np.mean` reduces via
84/// NumPy pairwise summation, which rounds differently from a naive left-fold on
85/// targets that mix magnitudes.
86///
87/// Mirrors NumPy `pairwise_sum` (numpy/_core/src/umath/loops_utils.h.src):
88/// - `n < 8`        : straight sequential sum seeded from the first element.
89/// - `8 <= n <= 128`: 8 partial accumulators, unrolled by 8, combined as
90///   `((r0+r1)+(r2+r3)) + ((r4+r5)+(r6+r7))`, then the tail.
91/// - `n > 128`      : split at `n2 = (n/2)` rounded DOWN to a multiple of 8, recurse.
92fn pairwise_sum<F: Float>(data: &[F]) -> F {
93    let n = data.len();
94    if n == 0 {
95        return F::zero();
96    }
97    if n < 8 {
98        // Seed from the first element, then fold the rest left-to-right (numpy).
99        data[1..].iter().fold(data[0], |a, &v| a + v)
100    } else if n <= 128 {
101        let mut r0 = data[0];
102        let mut r1 = data[1];
103        let mut r2 = data[2];
104        let mut r3 = data[3];
105        let mut r4 = data[4];
106        let mut r5 = data[5];
107        let mut r6 = data[6];
108        let mut r7 = data[7];
109        let bound = n - (n % 8);
110        let mut i = 8;
111        while i < bound {
112            r0 = r0 + data[i];
113            r1 = r1 + data[i + 1];
114            r2 = r2 + data[i + 2];
115            r3 = r3 + data[i + 3];
116            r4 = r4 + data[i + 4];
117            r5 = r5 + data[i + 5];
118            r6 = r6 + data[i + 6];
119            r7 = r7 + data[i + 7];
120            i += 8;
121        }
122        let res = ((r0 + r1) + (r2 + r3)) + ((r4 + r5) + (r6 + r7));
123        // Add the remainder (indices `bound..n`) left-to-right (numpy tail).
124        data[bound..].iter().fold(res, |a, &v| a + v)
125    } else {
126        let mut n2 = n / 2;
127        n2 -= n2 % 8;
128        pairwise_sum(&data[..n2]) + pairwise_sum(&data[n2..])
129    }
130}
131
132/// `np.mean(y)` over the first `n` elements via NumPy pairwise summation
133/// (`_target_encoder.py:383` `target_mean_ = np.mean(y, axis=0)`).
134fn mean_pairwise<F: Float>(y: &Array1<F>, n: usize) -> F {
135    let total = if let Some(slice) = y.as_slice() {
136        pairwise_sum(slice)
137    } else {
138        let v: Vec<F> = y.iter().copied().collect();
139        pairwise_sum(&v)
140    };
141    total / F::from(n).unwrap_or_else(F::one)
142}
143
144/// The POPULATION variance of `y` (`np.var(y)`, ddof=0), computed in f64 to
145/// match scikit-learn's C `double` accumulation. sklearn evaluates
146/// `y_variance = np.var(y)` once per fit (`_target_encoder.py:416`) and feeds it
147/// into the empirical-Bayes `lambda_` (`_target_encoder_fast.pyx:152-156`).
148///
149/// `mean_f64` is the already-computed `np.mean(y)` (`np.var` subtracts the same
150/// mean); the squared deviations are reduced via NumPy pairwise summation, which
151/// `np.var` uses internally.
152fn population_variance_f64<F: Float>(y: &Array1<F>, mean_f64: f64) -> f64 {
153    let n = y.len();
154    if n == 0 {
155        return 0.0;
156    }
157    let sq: Vec<f64> = y
158        .iter()
159        .map(|&v| {
160            let d = v.to_f64().unwrap_or(0.0) - mean_f64;
161            d * d
162        })
163        .collect();
164    pairwise_sum(&sq) / n as f64
165}
166
167/// Learn the per-category encoding for ONE feature column.
168///
169/// Dispatches on the [`Smooth`] strategy. All arithmetic is done in f64
170/// (matching sklearn's C `double` accumulators, `_target_encoder_fast.pyx:42,44`)
171/// then cast to `F`; for `F = f64` the round-trip is the identity.
172///
173/// - [`Smooth::Fixed`] reproduces `_fit_encoding_fast` (`:55-77`): seed each
174///   category with `(smooth*y_mean, smooth)`, add `(yᵢ, 1)` per sample, then
175///   `encoding = sum/count`, or `y_mean` when `count == 0`.
176/// - [`Smooth::Auto`] reproduces `_fit_encoding_fast_auto_smooth`
177///   (`:120-165`): two passes (mean, then sum-of-squared-diffs), a per-category
178///   `lambda_ = y_variance*count / (y_variance*count + ssd/count)`, blended as
179///   `lambda_*mean + (1-lambda_)*y_mean`; a NaN `lambda_` (count 0, or
180///   `y_variance == 0 && ssd == 0`) falls back to `y_mean`.
181fn fit_feature_encoding<F: Float>(
182    col: &[usize],
183    y: &Array1<F>,
184    smooth: Smooth<F>,
185    y_mean_f64: f64,
186    y_variance_f64: Option<f64>,
187) -> HashMap<usize, F> {
188    match smooth {
189        Smooth::Fixed(s) => {
190            let smooth_f64 = s.to_f64().unwrap_or(0.0);
191            // Seed each category's accumulator with `(smooth*y_mean, smooth)`,
192            // add each sample's `(yᵢ, 1)` in row order, then `sum/count`
193            // (`_target_encoder_fast.pyx:60-75`).
194            let mut stats: HashMap<usize, (f64, f64)> = HashMap::new();
195            for (i, &cat) in col.iter().enumerate() {
196                let entry = stats
197                    .entry(cat)
198                    .or_insert((smooth_f64 * y_mean_f64, smooth_f64));
199                entry.0 += y[i].to_f64().unwrap_or(0.0);
200                entry.1 += 1.0;
201            }
202            let mut map: HashMap<usize, F> = HashMap::new();
203            for (&cat, &(sum, count)) in &stats {
204                // `count` is `smooth + n_cat`; it is 0 only when smooth==0 AND
205                // the category has no rows — which cannot happen here since a
206                // category key exists only if a sample produced it. Guard anyway
207                // to mirror sklearn's `if counts[cat]==0 -> y_mean` (`:72-73`).
208                let encoded = if count == 0.0 {
209                    y_mean_f64
210                } else {
211                    sum / count
212                };
213                map.insert(cat, F::from(encoded).unwrap_or_else(F::zero));
214            }
215            map
216        }
217        Smooth::Auto => {
218            let y_variance = y_variance_f64.unwrap_or(0.0);
219            // First pass: per-category sum + count (-> means).
220            let mut sums: HashMap<usize, f64> = HashMap::new();
221            let mut counts: HashMap<usize, f64> = HashMap::new();
222            for (i, &cat) in col.iter().enumerate() {
223                *sums.entry(cat).or_insert(0.0) += y[i].to_f64().unwrap_or(0.0);
224                *counts.entry(cat).or_insert(0.0) += 1.0;
225            }
226            let means: HashMap<usize, f64> = sums
227                .iter()
228                .map(|(&cat, &s)| (cat, s / counts[&cat]))
229                .collect();
230            // Second pass: per-category sum of squared deviations from the mean
231            // (`_target_encoder_fast.pyx:143-149`).
232            let mut ssd: HashMap<usize, f64> = HashMap::new();
233            for (i, &cat) in col.iter().enumerate() {
234                let diff = y[i].to_f64().unwrap_or(0.0) - means[&cat];
235                *ssd.entry(cat).or_insert(0.0) += diff * diff;
236            }
237            let mut map: HashMap<usize, F> = HashMap::new();
238            for (&cat, &mean) in &means {
239                let count = counts[&cat];
240                let ssd_cat = ssd[&cat];
241                // lambda_ = y_variance*count / (y_variance*count + ssd/count)
242                // (`_target_encoder_fast.pyx:152-156`).
243                let denom = y_variance * count + ssd_cat / count;
244                let lambda = (y_variance * count) / denom;
245                let encoded = if lambda.is_nan() {
246                    // NaN when count==0 OR (y_variance==0 AND ssd==0): -> y_mean
247                    // (`_target_encoder_fast.pyx:157-161`).
248                    y_mean_f64
249                } else {
250                    lambda * mean + (1.0 - lambda) * y_mean_f64
251                };
252                map.insert(cat, F::from(encoded).unwrap_or_else(F::zero));
253            }
254            map
255        }
256    }
257}
258
259// ---------------------------------------------------------------------------
260// TargetEncoder (unfitted)
261// ---------------------------------------------------------------------------
262
/// An unfitted target encoder.
///
/// Holds only configuration. Fitting takes a matrix of categorical integer
/// (`usize`) features and a continuous (or binary) target vector, and encodes
/// each category as the smoothed mean of the target for that category.
///
/// # Parameters
///
/// - `smooth` — the smoothing strategy ([`Smooth`]). The DEFAULT is
///   [`Smooth::Auto`] (empirical Bayes), matching scikit-learn's constructor
///   default `smooth="auto"` (`_target_encoder.py:199`). [`Smooth::Fixed`]
///   selects the fixed m-estimate; higher values regularise more toward the
///   global mean, `Fixed(0)` is no smoothing.
/// - `cv` — the number of cross-fitting folds used by
///   [`fit_transform`](TargetEncoder::fit_transform) (default 5, matching
///   scikit-learn's `cv=5`, `_target_encoder.py:200`).
///
/// # Examples
///
/// ```
/// use ferrolearn_preprocess::target_encoder::TargetEncoder;
/// use ferrolearn_core::traits::{Fit, Transform};
/// use ndarray::array;
///
/// // `new` selects a FIXED smoothing factor (here 1.0).
/// let enc = TargetEncoder::<f64>::new(1.0);
/// let x = array![[0usize, 1], [0, 0], [1, 1], [1, 0]];
/// let y = array![1.0, 2.0, 3.0, 4.0];
/// let fitted = enc.fit(&x, &y).unwrap();
/// let out = fitted.transform(&x).unwrap();
/// assert_eq!(out.shape(), &[4, 2]);
/// ```
#[must_use]
#[derive(Debug, Clone)]
pub struct TargetEncoder<F> {
    /// Smoothing strategy (automatic or a fixed factor).
    smooth: Smooth<F>,
    /// Number of cross-fitting folds for `fit_transform`; not read by `fit`.
    cv: usize,
}
302
303impl<F: Float + Send + Sync + 'static> TargetEncoder<F> {
304    /// Create a new `TargetEncoder` with a FIXED smoothing factor.
305    ///
306    /// This is shorthand for [`with_smooth`](Self::with_smooth) with
307    /// [`Smooth::Fixed`] and `cv = 5`.
308    pub fn new(smooth: F) -> Self {
309        Self {
310            smooth: Smooth::Fixed(smooth),
311            cv: 5,
312        }
313    }
314
315    /// Create a new `TargetEncoder` with the given smoothing strategy and
316    /// `cv = 5` (matching scikit-learn's default).
317    pub fn with_smooth(smooth: Smooth<F>) -> Self {
318        Self { smooth, cv: 5 }
319    }
320
321    /// Set the number of cross-fitting folds used by
322    /// [`fit_transform`](Self::fit_transform).
323    pub fn with_cv(mut self, cv: usize) -> Self {
324        self.cv = cv;
325        self
326    }
327
328    /// Return the smoothing strategy.
329    #[must_use]
330    pub fn smooth(&self) -> Smooth<F> {
331        self.smooth
332    }
333
334    /// Return the number of cross-fitting folds.
335    #[must_use]
336    pub fn cv(&self) -> usize {
337        self.cv
338    }
339}
340
341impl<F: Float + Send + Sync + 'static> Default for TargetEncoder<F> {
342    /// The default uses [`Smooth::Auto`] (empirical Bayes) and `cv = 5`,
343    /// matching scikit-learn's `TargetEncoder()` (`smooth="auto"`, `cv=5`,
344    /// `_target_encoder.py:199-200`).
345    fn default() -> Self {
346        Self {
347            smooth: Smooth::Auto,
348            cv: 5,
349        }
350    }
351}
352
353// ---------------------------------------------------------------------------
354// FittedTargetEncoder
355// ---------------------------------------------------------------------------
356
/// A fitted target encoder holding per-feature, per-category encoding values.
///
/// Created by calling [`Fit::fit`] on a [`TargetEncoder`]. Use it through
/// [`Transform::transform`] to encode new data.
#[derive(Debug, Clone)]
pub struct FittedTargetEncoder<F> {
    /// Per-feature mapping from category → encoded value; the vector index is
    /// the feature (column) index.
    category_maps: Vec<HashMap<usize, F>>,
    /// Global target mean (used for unseen categories).
    global_mean: F,
}
367
368impl<F: Float + Send + Sync + 'static> FittedTargetEncoder<F> {
369    /// Return the encoding maps per feature.
370    #[must_use]
371    pub fn category_maps(&self) -> &[HashMap<usize, F>] {
372        &self.category_maps
373    }
374
375    /// Return the global target mean.
376    #[must_use]
377    pub fn global_mean(&self) -> F {
378        self.global_mean
379    }
380}
381
382// ---------------------------------------------------------------------------
383// Trait implementations
384// ---------------------------------------------------------------------------
385
386impl<F: Float + Send + Sync + 'static> Fit<Array2<usize>, Array1<F>> for TargetEncoder<F> {
387    type Fitted = FittedTargetEncoder<F>;
388    type Error = FerroError;
389
390    /// Fit the encoder by computing smoothed target means per category.
391    ///
392    /// # Errors
393    ///
394    /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
395    /// - [`FerroError::ShapeMismatch`] if `x` rows and `y` length differ.
396    /// - [`FerroError::InvalidParameter`] if `smooth` is negative.
397    fn fit(&self, x: &Array2<usize>, y: &Array1<F>) -> Result<FittedTargetEncoder<F>, FerroError> {
398        let n_samples = x.nrows();
399        if n_samples == 0 {
400            return Err(FerroError::InsufficientSamples {
401                required: 1,
402                actual: 0,
403                context: "TargetEncoder::fit".into(),
404            });
405        }
406        if y.len() != n_samples {
407            return Err(FerroError::ShapeMismatch {
408                expected: vec![n_samples],
409                actual: vec![y.len()],
410                context: "TargetEncoder::fit — y must have same length as x rows".into(),
411            });
412        }
413        if let Smooth::Fixed(s) = self.smooth
414            && s < F::zero()
415        {
416            return Err(FerroError::InvalidParameter {
417                name: "smooth".into(),
418                reason: "smoothing factor must be non-negative".into(),
419            });
420        }
421
422        let n_features = x.ncols();
423        // sklearn: target_mean_ = np.mean(y, axis=0) (_target_encoder.py:383),
424        // which reduces via NumPy pairwise summation. Reproduce it bit-for-bit so
425        // the mean matches on mixed-magnitude targets.
426        let global_mean = mean_pairwise(y, n_samples);
427        let global_mean_f64 = global_mean.to_f64().unwrap_or(0.0);
428
429        // For `smooth="auto"` (empirical Bayes) sklearn needs the POPULATION
430        // variance of the full target, computed once per fit
431        // (`_target_encoder.py:416` `y_variance = np.var(y)`).
432        let y_variance_f64 = match self.smooth {
433            Smooth::Auto => Some(population_variance_f64(y, global_mean_f64)),
434            Smooth::Fixed(_) => None,
435        };
436
437        let mut category_maps = Vec::with_capacity(n_features);
438        for j in 0..n_features {
439            let col: Vec<usize> = (0..n_samples).map(|i| x[[i, j]]).collect();
440            category_maps.push(fit_feature_encoding(
441                &col,
442                y,
443                self.smooth,
444                global_mean_f64,
445                y_variance_f64,
446            ));
447        }
448
449        Ok(FittedTargetEncoder {
450            category_maps,
451            global_mean,
452        })
453    }
454}
455
/// The contiguous (un-shuffled) KFold test-index folds over `n` samples.
///
/// Mirrors scikit-learn's `KFold._iter_test_indices`
/// (`sklearn/model_selection/_split.py:521-534`) with `shuffle=False`: the
/// indices are `0..n` in order, split into `k` consecutive folds where the
/// first `n % k` folds have size `n // k + 1` and the rest `n // k`. Returns a
/// vec of `(test_start, test_end)` half-open ranges, one per fold.
///
/// `k == 0` yields an empty vec (zero folds → zero ranges) instead of
/// panicking on a division by zero.
fn kfold_test_ranges(n: usize, k: usize) -> Vec<(usize, usize)> {
    if k == 0 {
        return Vec::new();
    }
    let base = n / k;
    let rem = n % k;
    let mut start = 0usize;
    (0..k)
        .map(|fold| {
            // The first `rem` folds absorb one leftover sample each.
            let size = base + usize::from(fold < rem);
            let range = (start, start + size);
            start += size;
            range
        })
        .collect()
}
475
476impl<F: Float + Send + Sync + 'static> TargetEncoder<F> {
477    /// Cross-fitting `fit_transform`: encode each row using encodings learned on
478    /// the OTHER folds, preventing target leakage.
479    ///
480    /// Mirrors scikit-learn's `TargetEncoder.fit_transform`
481    /// (`sklearn/preprocessing/_target_encoder.py:232-303`): for the
482    /// continuous/binary single-output case it uses a deterministic `KFold`
483    /// (`cv` folds, NO shuffle — ferrolearn exposes no `shuffle`/`random_state`,
484    /// so this is sklearn's reproducible `shuffle=False` path, `:262`); for each
485    /// `(train, test)` fold it fits the per-feature encodings on the TRAIN rows
486    /// (with that fold's `y_train_mean`) and writes the TEST rows through those
487    /// train-encodings (`:277-302`). A category unseen in the train fold encodes
488    /// to `y_train_mean` (the `count == 0 -> y_mean` rule, mirroring
489    /// `_transform_X_ordinal`'s unknown-category fallback, `:494-497`).
490    ///
491    /// Note `fit(X,y).transform(X)` does NOT equal `fit_transform(X,y)`
492    /// (`:235-238`): `transform` uses the full-data `encodings_`, `fit_transform`
493    /// is cross-fit.
494    ///
495    /// # Errors
496    ///
497    /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
498    /// - [`FerroError::ShapeMismatch`] if `x` rows and `y` length differ.
499    /// - [`FerroError::InvalidParameter`] if a [`Smooth::Fixed`] factor is
500    ///   negative, or if `cv < 2` / `cv` exceeds the sample count (sklearn
501    ///   requires `cv >= 2`, `_target_encoder.py:190`, and `KFold` rejects more
502    ///   splits than samples, `_split.py:408-414`).
503    pub fn fit_transform(&self, x: &Array2<usize>, y: &Array1<F>) -> Result<Array2<F>, FerroError> {
504        let n_samples = x.nrows();
505        if n_samples == 0 {
506            return Err(FerroError::InsufficientSamples {
507                required: 1,
508                actual: 0,
509                context: "TargetEncoder::fit_transform".into(),
510            });
511        }
512        if y.len() != n_samples {
513            return Err(FerroError::ShapeMismatch {
514                expected: vec![n_samples],
515                actual: vec![y.len()],
516                context: "TargetEncoder::fit_transform — y must have same length as x rows".into(),
517            });
518        }
519        if let Smooth::Fixed(s) = self.smooth
520            && s < F::zero()
521        {
522            return Err(FerroError::InvalidParameter {
523                name: "smooth".into(),
524                reason: "smoothing factor must be non-negative".into(),
525            });
526        }
527        // sklearn `_parameter_constraints` requires `cv >= 2`
528        // (`_target_encoder.py:190`); `KFold` additionally rejects more splits
529        // than samples (`_split.py:408-414`).
530        if self.cv < 2 {
531            return Err(FerroError::InvalidParameter {
532                name: "cv".into(),
533                reason: "cv must be at least 2".into(),
534            });
535        }
536        if self.cv > n_samples {
537            return Err(FerroError::InvalidParameter {
538                name: "cv".into(),
539                reason: "cv cannot exceed the number of samples".into(),
540            });
541        }
542
543        let n_features = x.ncols();
544        let mut out = Array2::zeros((n_samples, n_features));
545
546        for (test_start, test_end) in kfold_test_ranges(n_samples, self.cv) {
547            // Train indices are everything OUTSIDE the contiguous test fold.
548            let train_idx: Vec<usize> = (0..n_samples)
549                .filter(|&i| i < test_start || i >= test_end)
550                .collect();
551
552            // y_train_mean = np.mean(y[train]) (`_target_encoder.py:279`).
553            let y_train: Vec<F> = train_idx.iter().map(|&i| y[i]).collect();
554            let y_train_arr = Array1::from(y_train);
555            let train_mean = mean_pairwise(&y_train_arr, train_idx.len());
556            let train_mean_f64 = train_mean.to_f64().unwrap_or(0.0);
557            let train_var_f64 = match self.smooth {
558                Smooth::Auto => Some(population_variance_f64(&y_train_arr, train_mean_f64)),
559                Smooth::Fixed(_) => None,
560            };
561
562            for j in 0..n_features {
563                // Fit this fold's per-feature encoding on the TRAIN rows.
564                let train_col: Vec<usize> = train_idx.iter().map(|&i| x[[i, j]]).collect();
565                let enc = fit_feature_encoding(
566                    &train_col,
567                    &y_train_arr,
568                    self.smooth,
569                    train_mean_f64,
570                    train_var_f64,
571                );
572                // Encode the TEST rows; a category unseen in the train fold ->
573                // the train y_mean (`_transform_X_ordinal`, `:494-497`).
574                for i in test_start..test_end {
575                    let cat = x[[i, j]];
576                    out[[i, j]] = *enc.get(&cat).unwrap_or(&train_mean);
577                }
578            }
579        }
580
581        Ok(out)
582    }
583}
584
585impl<F: Float + Send + Sync + 'static> Transform<Array2<usize>> for FittedTargetEncoder<F> {
586    type Output = Array2<F>;
587    type Error = FerroError;
588
589    /// Encode categorical features using the learned target statistics.
590    ///
591    /// Unseen categories are encoded as the global target mean.
592    ///
593    /// # Errors
594    ///
595    /// Returns [`FerroError::ShapeMismatch`] if the number of columns differs
596    /// from the number of features seen during fitting.
597    fn transform(&self, x: &Array2<usize>) -> Result<Array2<F>, FerroError> {
598        let n_features = self.category_maps.len();
599        if x.ncols() != n_features {
600            return Err(FerroError::ShapeMismatch {
601                expected: vec![x.nrows(), n_features],
602                actual: vec![x.nrows(), x.ncols()],
603                context: "FittedTargetEncoder::transform".into(),
604            });
605        }
606
607        let n_samples = x.nrows();
608        let mut out = Array2::zeros((n_samples, n_features));
609
610        for j in 0..n_features {
611            let cat_map = &self.category_maps[j];
612            for i in 0..n_samples {
613                let cat = x[[i, j]];
614                out[[i, j]] = *cat_map.get(&cat).unwrap_or(&self.global_mean);
615            }
616        }
617
618        Ok(out)
619    }
620}
621
622// ---------------------------------------------------------------------------
623// Tests
624// ---------------------------------------------------------------------------
625
// Unit tests for the fixed-smooth `fit`/`transform` path, the error guards and
// the constructor defaults.
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use ndarray::array;

    /// With `smooth = 0` the encoding is the plain per-category target mean.
    #[test]
    fn test_target_encoder_basic() {
        let enc = TargetEncoder::<f64>::new(0.0); // no smoothing
        // Category 0: targets [1.0, 2.0], mean = 1.5
        // Category 1: targets [3.0, 4.0], mean = 3.5
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_abs_diff_eq!(out[[0, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[1, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[2, 0]], 3.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[3, 0]], 3.5, epsilon = 1e-10);
    }

    /// A fixed `smooth` pulls each category mean toward the global mean:
    /// `(count * cat_mean + smooth * global_mean) / (count + smooth)`.
    #[test]
    fn test_target_encoder_smoothing() {
        let enc = TargetEncoder::<f64>::new(2.0);
        // Category 0: targets [1.0], mean = 1.0, count = 1
        // Category 1: targets [3.0, 5.0], mean = 4.0, count = 2
        // Global mean = (1 + 3 + 5) / 3 = 3.0
        let x = array![[0usize], [1], [1]];
        let y = array![1.0, 3.0, 5.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        // Cat 0: (1 * 1.0 + 2 * 3.0) / (1 + 2) = 7/3 ≈ 2.333
        let expected_0 = (1.0 * 1.0 + 2.0 * 3.0) / (1.0 + 2.0);
        assert_abs_diff_eq!(out[[0, 0]], expected_0, epsilon = 1e-10);
        // Cat 1: (2 * 4.0 + 2 * 3.0) / (2 + 2) = 14/4 = 3.5
        let expected_1 = (2.0 * 4.0 + 2.0 * 3.0) / (2.0 + 2.0);
        assert_abs_diff_eq!(out[[1, 0]], expected_1, epsilon = 1e-10);
    }

    /// A category absent at fit time is encoded as the global target mean.
    #[test]
    fn test_target_encoder_unseen_category() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        // Transform with unseen category 2
        let x_new = array![[2usize]];
        let out = fitted.transform(&x_new).unwrap();
        // Unseen category → global mean = 2.5
        assert_abs_diff_eq!(out[[0, 0]], 2.5, epsilon = 1e-10);
    }

    /// Two feature columns in, two encoded columns out (shape check only).
    #[test]
    fn test_target_encoder_multi_feature() {
        let enc = TargetEncoder::<f64>::new(0.0);
        let x = array![[0usize, 1], [0, 0], [1, 1], [1, 0]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_eq!(out.shape(), &[4, 2]);
    }

    /// Fitting on zero rows is rejected.
    #[test]
    fn test_target_encoder_zero_rows_error() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x: Array2<usize> = Array2::zeros((0, 2));
        let y: Array1<f64> = Array1::zeros(0);
        assert!(enc.fit(&x, &y).is_err());
    }

    /// `y` must have exactly one entry per row of `x`.
    #[test]
    fn test_target_encoder_shape_mismatch_fit() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize], [1]];
        let y = array![1.0]; // wrong length
        assert!(enc.fit(&x, &y).is_err());
    }

    /// `transform` rejects input whose column count differs from the fit data.
    #[test]
    fn test_target_encoder_shape_mismatch_transform() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize, 1], [1, 0]];
        let y = array![1.0, 2.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let x_bad = array![[0usize]]; // wrong number of columns
        assert!(fitted.transform(&x_bad).is_err());
    }

    /// A negative fixed smoothing factor is rejected at fit time.
    #[test]
    fn test_target_encoder_negative_smooth_error() {
        let enc = TargetEncoder::<f64>::new(-1.0);
        let x = array![[0usize]];
        let y = array![1.0];
        assert!(enc.fit(&x, &y).is_err());
    }

    /// `default()` selects `Smooth::Auto` with 5 folds; `new` selects a fixed
    /// factor.
    #[test]
    fn test_target_encoder_default() {
        // sklearn's DEFAULT is smooth="auto" (`_target_encoder.py:199`), NOT a
        // fixed value; `new(F)` is the explicit fixed-smooth constructor.
        let enc = TargetEncoder::<f64>::default();
        assert_eq!(enc.smooth(), Smooth::Auto);
        assert_eq!(enc.cv(), 5);
        let fixed = TargetEncoder::<f64>::new(1.0);
        assert_eq!(fixed.smooth(), Smooth::Fixed(1.0));
    }

    /// The accessor exposes the mean of the training target.
    #[test]
    fn test_target_encoder_global_mean_accessor() {
        let enc = TargetEncoder::<f64>::new(0.0);
        let x = array![[0usize], [1]];
        let y = array![2.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        assert_abs_diff_eq!(fitted.global_mean(), 3.0, epsilon = 1e-10);
    }

    /// Smoke test for `f32`: only checks that the first output is not NaN.
    #[test]
    fn test_target_encoder_f32() {
        let enc = TargetEncoder::<f32>::new(1.0f32);
        let x = array![[0usize], [0], [1]];
        let y: Array1<f32> = array![1.0f32, 2.0, 3.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert!(!out[[0, 0]].is_nan());
    }
}
ferrolearn_preprocess/target_encoder.rs

ferrolearn_preprocess/
target_encoder.rs