ferrolearn_preprocess/target_encoder.rs
1//! Target encoder: encode categorical features using target statistics.
2//!
3//! [`TargetEncoder`] replaces each category with the mean of the target variable
4//! for that category, regularised toward the global mean using smoothing.
5//!
6//! This is especially useful for high-cardinality categorical features where
7//! one-hot encoding would produce too many columns.
8//!
9//! # Smoothing
10//!
11//! The encoded value for category `c` is (matching scikit-learn
12//! `_target_encoder_fast.pyx:60-75` — the accumulator is seeded with
13//! `smooth * global_mean` then the category's targets are added, divided by
14//! `smooth + count(c)`):
15//!
16//! ```text
17//! encoded(c) = (smooth * global_mean + sum_of_targets(c)) / (smooth + count(c))
18//! ```
19//!
20//! where `smooth` controls the degree of regularisation.
21//!
22//! Translation target: scikit-learn 1.5.2 `class TargetEncoder`
23//! (`sklearn/preprocessing/_target_encoder.py`). Design:
24//! `.design/preprocess/target_encoder.md`. Tracking: #1260.
25//!
26//! `## REQ status`
27//!
28//! | REQ | Status | Anchor |
29//! |---|---|---|
30//! | REQ-1 manual-`smooth` m-estimate value match (f64, bit-exact) | SHIPPED | `TargetEncoder::fit` / `transform`; sklearn `_target_encoder_fast.pyx:60-75`, `_target_encoder.py:289`,`:383` (#1261 pairwise sum, #1262 formula) |
31//! | REQ-2 unseen category → `target_mean_` (global mean) | SHIPPED | `transform` `unwrap_or(global_mean)`; sklearn `_target_encoder.py:324-345` |
32//! | REQ-3 InsufficientSamples / ShapeMismatch / InvalidParameter errors | SHIPPED | `fit` / `transform` guards; sklearn `_target_encoder.py:189` |
33//! | REQ-4 `smooth="auto"` empirical-Bayes encoding + DEFAULT | SHIPPED | `Smooth` enum `{ Auto, Fixed(F) }` (`Default`/`TargetEncoder::default` → `Auto`); `fit_feature_encoding` Auto branch (two-pass means/ssd, `lambda_ = y_variance*count/(y_variance*count+ssd/count)`, NaN→y_mean), `population_variance_f64` (ddof=0) computed once in `fit`; sklearn `_target_encoder_fast.pyx:140-165`, `_target_encoder.py:199`,`:416`. Consumer: `TargetEncoder::fit`/`fit_transform`/`default` (the `Smooth` field drives the encoding branch) + the public module path `ferrolearn_preprocess::target_encoder::Smooth` (`pub mod target_encoder` in `lib.rs`). Verify: pins `divergence_default_smooth_is_auto`/`divergence_smooth_auto_empirical_bayes` green (#2342 #2343) |
34//! | REQ-5 cross-fitting `fit_transform` (deterministic KFold) | SHIPPED | `TargetEncoder::fit_transform` cross-fits over `kfold_test_ranges` (contiguous no-shuffle folds, `cv` default 5), per-fold `fit_feature_encoding` on TRAIN rows → encode TEST rows (unseen-in-train → `y_train_mean`); sklearn `_target_encoder.py:232`,`:254-303`, `_split.py:521-534`. Consumer: crate re-export (`lib.rs`). Verify: pin `divergence_crossfit_fit_transform` green (#2344). NOTE: `shuffle`/`random_state` (REQ-8 NOT-STARTED) absent → deterministic `shuffle=False` KFold only |
35//! | REQ-6 `target_type` binary/multiclass | NOT-STARTED (#1266) | sklearn `_target_encoder.py:269-273`,`:376-379` |
36//! | REQ-7 `categories` param + `categories_`/`target_type_`/`classes_` | NOT-STARTED (#1267) | sklearn `_target_encoder.py:197`,`:358-381` |
37//! | REQ-8 `cv`/`shuffle`/`random_state` params | NOT-STARTED (#1268) | sklearn `_target_encoder.py:200-209` |
38//! | REQ-9 string/object categories | NOT-STARTED (#1269) | usize-only, R-DEV-3 |
39//! | REQ-10 `get_feature_names_out`/`n_features_in_` | NOT-STARTED (#1270) | sklearn `OneToOneFeatureMixin` |
40//! | REQ-11 PyO3 binding | NOT-STARTED (#1271) | `ferrolearn-python/src/` (absent) |
41//! | REQ-12 ferray substrate | NOT-STARTED (#1272) | R-SUBSTRATE |
//! | REQ-13 per-category sums accumulate in f64 (always), matching sklearn's C `double` | SHIPPED | `fit_feature_encoding` (`Smooth::Fixed` branch) accumulates `stats: HashMap<usize,(f64,f64)>` seeded with `(smooth_f64*y_mean_f64, smooth_f64)`, `+= y[i].to_f64()` / `+= 1.0` per sample, then `F::from(sum/count)` (where `count` is `smooth + n_cat`); sklearn `_target_encoder_fast.pyx:42,44,68` (`double sums[]`/`counts[]`, `sums[cat]+=y[i]` regardless of `Y_DTYPE`), `encodings_` always float64 (`_target_encoder.py:335`). f64 path identity (bit-exact unchanged); `TargetEncoder<f32>` now captures `2^24+1` (#1263) |
43
44use ferrolearn_core::error::FerroError;
45use ferrolearn_core::traits::{Fit, Transform};
46use ndarray::{Array1, Array2};
47use num_traits::Float;
48use std::collections::HashMap;
49
/// The smoothing strategy for [`TargetEncoder`].
///
/// Mirrors scikit-learn's `smooth` parameter
/// (`sklearn/preprocessing/_target_encoder.py:189`,
/// `"smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")]`),
/// whose DEFAULT is the string `"auto"` (an empirical-Bayes estimate,
/// `_target_encoder.py:85-89`) rather than a fixed numeric value.
///
/// The enum itself places no bound on `F`; numeric requirements are imposed by
/// the `impl` blocks that actually do arithmetic with the payload.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Smooth<F> {
    /// `smooth="auto"` — the empirical-Bayes shrinkage estimate
    /// (`_target_encoder_fast.pyx:140-165`): per category, blend the category
    /// mean toward the global mean by a `lambda_` derived from the
    /// within-category sum of squared deviations vs the overall target variance.
    Auto,
    /// A fixed numeric smoothing factor `m` driving the m-estimate
    /// `(smooth * y_mean + Σyᵢ) / (smooth + count)`
    /// (`_target_encoder_fast.pyx:60-75`). Must be non-negative.
    Fixed(F),
}

// No `F: Float` bound here: constructing `Smooth::Auto` never touches `F`, so
// requiring it would only prevent generic containers from deriving `Default`.
impl<F> Default for Smooth<F> {
    /// The default matches scikit-learn's constructor default `smooth="auto"`
    /// (`_target_encoder.py:199`).
    fn default() -> Self {
        Self::Auto
    }
}
77
78/// Sum a slice reproducing NumPy's pairwise summation (the algorithm behind
79/// `np.add.reduce` / `np.mean`), so a ferrolearn mean bit-matches sklearn's
80/// `np.mean` on ill-conditioned mixed-magnitude inputs.
81///
82/// sklearn sets `target_mean_ = np.mean(y, axis=0)`
83/// (`sklearn/preprocessing/_target_encoder.py:383`), and `np.mean` reduces via
84/// NumPy pairwise summation, which rounds differently from a naive left-fold on
85/// targets that mix magnitudes.
86///
87/// Mirrors NumPy `pairwise_sum` (numpy/_core/src/umath/loops_utils.h.src):
88/// - `n < 8` : straight sequential sum seeded from the first element.
89/// - `8 <= n <= 128`: 8 partial accumulators, unrolled by 8, combined as
90/// `((r0+r1)+(r2+r3)) + ((r4+r5)+(r6+r7))`, then the tail.
91/// - `n > 128` : split at `n2 = (n/2)` rounded DOWN to a multiple of 8, recurse.
92fn pairwise_sum<F: Float>(data: &[F]) -> F {
93 let n = data.len();
94 if n == 0 {
95 return F::zero();
96 }
97 if n < 8 {
98 // Seed from the first element, then fold the rest left-to-right (numpy).
99 data[1..].iter().fold(data[0], |a, &v| a + v)
100 } else if n <= 128 {
101 let mut r0 = data[0];
102 let mut r1 = data[1];
103 let mut r2 = data[2];
104 let mut r3 = data[3];
105 let mut r4 = data[4];
106 let mut r5 = data[5];
107 let mut r6 = data[6];
108 let mut r7 = data[7];
109 let bound = n - (n % 8);
110 let mut i = 8;
111 while i < bound {
112 r0 = r0 + data[i];
113 r1 = r1 + data[i + 1];
114 r2 = r2 + data[i + 2];
115 r3 = r3 + data[i + 3];
116 r4 = r4 + data[i + 4];
117 r5 = r5 + data[i + 5];
118 r6 = r6 + data[i + 6];
119 r7 = r7 + data[i + 7];
120 i += 8;
121 }
122 let res = ((r0 + r1) + (r2 + r3)) + ((r4 + r5) + (r6 + r7));
123 // Add the remainder (indices `bound..n`) left-to-right (numpy tail).
124 data[bound..].iter().fold(res, |a, &v| a + v)
125 } else {
126 let mut n2 = n / 2;
127 n2 -= n2 % 8;
128 pairwise_sum(&data[..n2]) + pairwise_sum(&data[n2..])
129 }
130}
131
132/// `np.mean(y)` over the first `n` elements via NumPy pairwise summation
133/// (`_target_encoder.py:383` `target_mean_ = np.mean(y, axis=0)`).
134fn mean_pairwise<F: Float>(y: &Array1<F>, n: usize) -> F {
135 let total = if let Some(slice) = y.as_slice() {
136 pairwise_sum(slice)
137 } else {
138 let v: Vec<F> = y.iter().copied().collect();
139 pairwise_sum(&v)
140 };
141 total / F::from(n).unwrap_or_else(F::one)
142}
143
144/// The POPULATION variance of `y` (`np.var(y)`, ddof=0), computed in f64 to
145/// match scikit-learn's C `double` accumulation. sklearn evaluates
146/// `y_variance = np.var(y)` once per fit (`_target_encoder.py:416`) and feeds it
147/// into the empirical-Bayes `lambda_` (`_target_encoder_fast.pyx:152-156`).
148///
149/// `mean_f64` is the already-computed `np.mean(y)` (`np.var` subtracts the same
150/// mean); the squared deviations are reduced via NumPy pairwise summation, which
151/// `np.var` uses internally.
152fn population_variance_f64<F: Float>(y: &Array1<F>, mean_f64: f64) -> f64 {
153 let n = y.len();
154 if n == 0 {
155 return 0.0;
156 }
157 let sq: Vec<f64> = y
158 .iter()
159 .map(|&v| {
160 let d = v.to_f64().unwrap_or(0.0) - mean_f64;
161 d * d
162 })
163 .collect();
164 pairwise_sum(&sq) / n as f64
165}
166
167/// Learn the per-category encoding for ONE feature column.
168///
169/// Dispatches on the [`Smooth`] strategy. All arithmetic is done in f64
170/// (matching sklearn's C `double` accumulators, `_target_encoder_fast.pyx:42,44`)
171/// then cast to `F`; for `F = f64` the round-trip is the identity.
172///
173/// - [`Smooth::Fixed`] reproduces `_fit_encoding_fast` (`:55-77`): seed each
174/// category with `(smooth*y_mean, smooth)`, add `(yᵢ, 1)` per sample, then
175/// `encoding = sum/count`, or `y_mean` when `count == 0`.
176/// - [`Smooth::Auto`] reproduces `_fit_encoding_fast_auto_smooth`
177/// (`:120-165`): two passes (mean, then sum-of-squared-diffs), a per-category
178/// `lambda_ = y_variance*count / (y_variance*count + ssd/count)`, blended as
179/// `lambda_*mean + (1-lambda_)*y_mean`; a NaN `lambda_` (count 0, or
180/// `y_variance == 0 && ssd == 0`) falls back to `y_mean`.
181fn fit_feature_encoding<F: Float>(
182 col: &[usize],
183 y: &Array1<F>,
184 smooth: Smooth<F>,
185 y_mean_f64: f64,
186 y_variance_f64: Option<f64>,
187) -> HashMap<usize, F> {
188 match smooth {
189 Smooth::Fixed(s) => {
190 let smooth_f64 = s.to_f64().unwrap_or(0.0);
191 // Seed each category's accumulator with `(smooth*y_mean, smooth)`,
192 // add each sample's `(yᵢ, 1)` in row order, then `sum/count`
193 // (`_target_encoder_fast.pyx:60-75`).
194 let mut stats: HashMap<usize, (f64, f64)> = HashMap::new();
195 for (i, &cat) in col.iter().enumerate() {
196 let entry = stats
197 .entry(cat)
198 .or_insert((smooth_f64 * y_mean_f64, smooth_f64));
199 entry.0 += y[i].to_f64().unwrap_or(0.0);
200 entry.1 += 1.0;
201 }
202 let mut map: HashMap<usize, F> = HashMap::new();
203 for (&cat, &(sum, count)) in &stats {
204 // `count` is `smooth + n_cat`; it is 0 only when smooth==0 AND
205 // the category has no rows — which cannot happen here since a
206 // category key exists only if a sample produced it. Guard anyway
207 // to mirror sklearn's `if counts[cat]==0 -> y_mean` (`:72-73`).
208 let encoded = if count == 0.0 {
209 y_mean_f64
210 } else {
211 sum / count
212 };
213 map.insert(cat, F::from(encoded).unwrap_or_else(F::zero));
214 }
215 map
216 }
217 Smooth::Auto => {
218 let y_variance = y_variance_f64.unwrap_or(0.0);
219 // First pass: per-category sum + count (-> means).
220 let mut sums: HashMap<usize, f64> = HashMap::new();
221 let mut counts: HashMap<usize, f64> = HashMap::new();
222 for (i, &cat) in col.iter().enumerate() {
223 *sums.entry(cat).or_insert(0.0) += y[i].to_f64().unwrap_or(0.0);
224 *counts.entry(cat).or_insert(0.0) += 1.0;
225 }
226 let means: HashMap<usize, f64> = sums
227 .iter()
228 .map(|(&cat, &s)| (cat, s / counts[&cat]))
229 .collect();
230 // Second pass: per-category sum of squared deviations from the mean
231 // (`_target_encoder_fast.pyx:143-149`).
232 let mut ssd: HashMap<usize, f64> = HashMap::new();
233 for (i, &cat) in col.iter().enumerate() {
234 let diff = y[i].to_f64().unwrap_or(0.0) - means[&cat];
235 *ssd.entry(cat).or_insert(0.0) += diff * diff;
236 }
237 let mut map: HashMap<usize, F> = HashMap::new();
238 for (&cat, &mean) in &means {
239 let count = counts[&cat];
240 let ssd_cat = ssd[&cat];
241 // lambda_ = y_variance*count / (y_variance*count + ssd/count)
242 // (`_target_encoder_fast.pyx:152-156`).
243 let denom = y_variance * count + ssd_cat / count;
244 let lambda = (y_variance * count) / denom;
245 let encoded = if lambda.is_nan() {
246 // NaN when count==0 OR (y_variance==0 AND ssd==0): -> y_mean
247 // (`_target_encoder_fast.pyx:157-161`).
248 y_mean_f64
249 } else {
250 lambda * mean + (1.0 - lambda) * y_mean_f64
251 };
252 map.insert(cat, F::from(encoded).unwrap_or_else(F::zero));
253 }
254 map
255 }
256 }
257}
258
259// ---------------------------------------------------------------------------
260// TargetEncoder (unfitted)
261// ---------------------------------------------------------------------------
262
/// An unfitted target encoder (configuration only; holds no learned state).
///
/// At fit time it takes a matrix of categorical features encoded as `usize`
/// and a target vector of `F`. Each category is replaced by a smoothed mean of
/// the targets observed for that category.
///
/// Two entry points consume this configuration:
///
/// - [`Fit::fit`] learns encodings on all rows and returns a
///   [`FittedTargetEncoder`].
/// - [`fit_transform`](TargetEncoder::fit_transform) cross-fits over `cv`
///   folds and returns the encoded matrix directly.
///
/// # Parameters
///
/// - `smooth` — the smoothing strategy ([`Smooth`]). The DEFAULT is
///   [`Smooth::Auto`] (empirical Bayes), matching scikit-learn's constructor
///   default `smooth="auto"` (`_target_encoder.py:199`). [`Smooth::Fixed`]
///   selects the fixed m-estimate; higher values regularise more toward the
///   global mean, `Fixed(0)` is no smoothing.
/// - `cv` — the number of cross-fitting folds used by
///   [`fit_transform`](TargetEncoder::fit_transform) (default 5, matching
///   scikit-learn's `cv=5`, `_target_encoder.py:200`). It is not read by
///   [`Fit::fit`].
///
/// # Examples
///
/// ```
/// use ferrolearn_preprocess::target_encoder::TargetEncoder;
/// use ferrolearn_core::traits::{Fit, Transform};
/// use ndarray::array;
///
/// let enc = TargetEncoder::<f64>::new(1.0);
/// let x = array![[0usize, 1], [0, 0], [1, 1], [1, 0]];
/// let y = array![1.0, 2.0, 3.0, 4.0];
/// let fitted = enc.fit(&x, &y).unwrap();
/// let out = fitted.transform(&x).unwrap();
/// assert_eq!(out.shape(), &[4, 2]);
/// ```
#[must_use]
#[derive(Debug, Clone)]
pub struct TargetEncoder<F> {
    /// Smoothing strategy (validated when fitting, not at construction).
    smooth: Smooth<F>,
    /// Number of cross-fitting folds for `fit_transform`.
    cv: usize,
}
302
303impl<F: Float + Send + Sync + 'static> TargetEncoder<F> {
304 /// Create a new `TargetEncoder` with a FIXED smoothing factor.
305 ///
306 /// This is shorthand for [`with_smooth`](Self::with_smooth) with
307 /// [`Smooth::Fixed`] and `cv = 5`.
308 pub fn new(smooth: F) -> Self {
309 Self {
310 smooth: Smooth::Fixed(smooth),
311 cv: 5,
312 }
313 }
314
315 /// Create a new `TargetEncoder` with the given smoothing strategy and
316 /// `cv = 5` (matching scikit-learn's default).
317 pub fn with_smooth(smooth: Smooth<F>) -> Self {
318 Self { smooth, cv: 5 }
319 }
320
321 /// Set the number of cross-fitting folds used by
322 /// [`fit_transform`](Self::fit_transform).
323 pub fn with_cv(mut self, cv: usize) -> Self {
324 self.cv = cv;
325 self
326 }
327
328 /// Return the smoothing strategy.
329 #[must_use]
330 pub fn smooth(&self) -> Smooth<F> {
331 self.smooth
332 }
333
334 /// Return the number of cross-fitting folds.
335 #[must_use]
336 pub fn cv(&self) -> usize {
337 self.cv
338 }
339}
340
341impl<F: Float + Send + Sync + 'static> Default for TargetEncoder<F> {
342 /// The default uses [`Smooth::Auto`] (empirical Bayes) and `cv = 5`,
343 /// matching scikit-learn's `TargetEncoder()` (`smooth="auto"`, `cv=5`,
344 /// `_target_encoder.py:199-200`).
345 fn default() -> Self {
346 Self {
347 smooth: Smooth::Auto,
348 cv: 5,
349 }
350 }
351}
352
353// ---------------------------------------------------------------------------
354// FittedTargetEncoder
355// ---------------------------------------------------------------------------
356
/// A fitted target encoder holding per-feature, per-category encoding values.
///
/// Created by calling [`Fit::fit`] on a [`TargetEncoder`]. Applying it through
/// `Transform::transform` looks each cell up in the map for its column and
/// falls back to the global mean for categories that were not seen at fit time.
#[derive(Debug, Clone)]
pub struct FittedTargetEncoder<F> {
    /// Per-feature mapping from category → encoded value; index `j` belongs to
    /// input column `j`, so the length equals the number of fitted features.
    category_maps: Vec<HashMap<usize, F>>,
    /// Global target mean (used for unseen categories).
    global_mean: F,
}
367
368impl<F: Float + Send + Sync + 'static> FittedTargetEncoder<F> {
369 /// Return the encoding maps per feature.
370 #[must_use]
371 pub fn category_maps(&self) -> &[HashMap<usize, F>] {
372 &self.category_maps
373 }
374
375 /// Return the global target mean.
376 #[must_use]
377 pub fn global_mean(&self) -> F {
378 self.global_mean
379 }
380}
381
382// ---------------------------------------------------------------------------
383// Trait implementations
384// ---------------------------------------------------------------------------
385
386impl<F: Float + Send + Sync + 'static> Fit<Array2<usize>, Array1<F>> for TargetEncoder<F> {
387 type Fitted = FittedTargetEncoder<F>;
388 type Error = FerroError;
389
390 /// Fit the encoder by computing smoothed target means per category.
391 ///
392 /// # Errors
393 ///
394 /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
395 /// - [`FerroError::ShapeMismatch`] if `x` rows and `y` length differ.
396 /// - [`FerroError::InvalidParameter`] if `smooth` is negative.
397 fn fit(&self, x: &Array2<usize>, y: &Array1<F>) -> Result<FittedTargetEncoder<F>, FerroError> {
398 let n_samples = x.nrows();
399 if n_samples == 0 {
400 return Err(FerroError::InsufficientSamples {
401 required: 1,
402 actual: 0,
403 context: "TargetEncoder::fit".into(),
404 });
405 }
406 if y.len() != n_samples {
407 return Err(FerroError::ShapeMismatch {
408 expected: vec![n_samples],
409 actual: vec![y.len()],
410 context: "TargetEncoder::fit — y must have same length as x rows".into(),
411 });
412 }
413 if let Smooth::Fixed(s) = self.smooth
414 && s < F::zero()
415 {
416 return Err(FerroError::InvalidParameter {
417 name: "smooth".into(),
418 reason: "smoothing factor must be non-negative".into(),
419 });
420 }
421
422 let n_features = x.ncols();
423 // sklearn: target_mean_ = np.mean(y, axis=0) (_target_encoder.py:383),
424 // which reduces via NumPy pairwise summation. Reproduce it bit-for-bit so
425 // the mean matches on mixed-magnitude targets.
426 let global_mean = mean_pairwise(y, n_samples);
427 let global_mean_f64 = global_mean.to_f64().unwrap_or(0.0);
428
429 // For `smooth="auto"` (empirical Bayes) sklearn needs the POPULATION
430 // variance of the full target, computed once per fit
431 // (`_target_encoder.py:416` `y_variance = np.var(y)`).
432 let y_variance_f64 = match self.smooth {
433 Smooth::Auto => Some(population_variance_f64(y, global_mean_f64)),
434 Smooth::Fixed(_) => None,
435 };
436
437 let mut category_maps = Vec::with_capacity(n_features);
438 for j in 0..n_features {
439 let col: Vec<usize> = (0..n_samples).map(|i| x[[i, j]]).collect();
440 category_maps.push(fit_feature_encoding(
441 &col,
442 y,
443 self.smooth,
444 global_mean_f64,
445 y_variance_f64,
446 ));
447 }
448
449 Ok(FittedTargetEncoder {
450 category_maps,
451 global_mean,
452 })
453 }
454}
455
/// Half-open `(test_start, test_end)` ranges of an un-shuffled KFold split.
///
/// Follows scikit-learn's `KFold._iter_test_indices`
/// (`sklearn/model_selection/_split.py:521-534`) with `shuffle=False`: samples
/// `0..n` are cut into `k` consecutive folds; the first `n % k` folds hold
/// `n / k + 1` samples and the remaining ones `n / k`. The ranges are returned
/// in fold order and together tile `0..n`.
///
/// `k` must be non-zero (the division below panics otherwise).
fn kfold_test_ranges(n: usize, k: usize) -> Vec<(usize, usize)> {
    let (quotient, extra) = (n / k, n % k);
    let mut start = 0usize;
    (0..k)
        .map(|fold| {
            // The leading `extra` folds absorb one leftover sample each.
            let end = start + quotient + usize::from(fold < extra);
            let range = (start, end);
            start = end;
            range
        })
        .collect()
}
475
476impl<F: Float + Send + Sync + 'static> TargetEncoder<F> {
477 /// Cross-fitting `fit_transform`: encode each row using encodings learned on
478 /// the OTHER folds, preventing target leakage.
479 ///
480 /// Mirrors scikit-learn's `TargetEncoder.fit_transform`
481 /// (`sklearn/preprocessing/_target_encoder.py:232-303`): for the
482 /// continuous/binary single-output case it uses a deterministic `KFold`
483 /// (`cv` folds, NO shuffle — ferrolearn exposes no `shuffle`/`random_state`,
484 /// so this is sklearn's reproducible `shuffle=False` path, `:262`); for each
485 /// `(train, test)` fold it fits the per-feature encodings on the TRAIN rows
486 /// (with that fold's `y_train_mean`) and writes the TEST rows through those
487 /// train-encodings (`:277-302`). A category unseen in the train fold encodes
488 /// to `y_train_mean` (the `count == 0 -> y_mean` rule, mirroring
489 /// `_transform_X_ordinal`'s unknown-category fallback, `:494-497`).
490 ///
491 /// Note `fit(X,y).transform(X)` does NOT equal `fit_transform(X,y)`
492 /// (`:235-238`): `transform` uses the full-data `encodings_`, `fit_transform`
493 /// is cross-fit.
494 ///
495 /// # Errors
496 ///
497 /// - [`FerroError::InsufficientSamples`] if the input has zero rows.
498 /// - [`FerroError::ShapeMismatch`] if `x` rows and `y` length differ.
499 /// - [`FerroError::InvalidParameter`] if a [`Smooth::Fixed`] factor is
500 /// negative, or if `cv < 2` / `cv` exceeds the sample count (sklearn
501 /// requires `cv >= 2`, `_target_encoder.py:190`, and `KFold` rejects more
502 /// splits than samples, `_split.py:408-414`).
503 pub fn fit_transform(&self, x: &Array2<usize>, y: &Array1<F>) -> Result<Array2<F>, FerroError> {
504 let n_samples = x.nrows();
505 if n_samples == 0 {
506 return Err(FerroError::InsufficientSamples {
507 required: 1,
508 actual: 0,
509 context: "TargetEncoder::fit_transform".into(),
510 });
511 }
512 if y.len() != n_samples {
513 return Err(FerroError::ShapeMismatch {
514 expected: vec![n_samples],
515 actual: vec![y.len()],
516 context: "TargetEncoder::fit_transform — y must have same length as x rows".into(),
517 });
518 }
519 if let Smooth::Fixed(s) = self.smooth
520 && s < F::zero()
521 {
522 return Err(FerroError::InvalidParameter {
523 name: "smooth".into(),
524 reason: "smoothing factor must be non-negative".into(),
525 });
526 }
527 // sklearn `_parameter_constraints` requires `cv >= 2`
528 // (`_target_encoder.py:190`); `KFold` additionally rejects more splits
529 // than samples (`_split.py:408-414`).
530 if self.cv < 2 {
531 return Err(FerroError::InvalidParameter {
532 name: "cv".into(),
533 reason: "cv must be at least 2".into(),
534 });
535 }
536 if self.cv > n_samples {
537 return Err(FerroError::InvalidParameter {
538 name: "cv".into(),
539 reason: "cv cannot exceed the number of samples".into(),
540 });
541 }
542
543 let n_features = x.ncols();
544 let mut out = Array2::zeros((n_samples, n_features));
545
546 for (test_start, test_end) in kfold_test_ranges(n_samples, self.cv) {
547 // Train indices are everything OUTSIDE the contiguous test fold.
548 let train_idx: Vec<usize> = (0..n_samples)
549 .filter(|&i| i < test_start || i >= test_end)
550 .collect();
551
552 // y_train_mean = np.mean(y[train]) (`_target_encoder.py:279`).
553 let y_train: Vec<F> = train_idx.iter().map(|&i| y[i]).collect();
554 let y_train_arr = Array1::from(y_train);
555 let train_mean = mean_pairwise(&y_train_arr, train_idx.len());
556 let train_mean_f64 = train_mean.to_f64().unwrap_or(0.0);
557 let train_var_f64 = match self.smooth {
558 Smooth::Auto => Some(population_variance_f64(&y_train_arr, train_mean_f64)),
559 Smooth::Fixed(_) => None,
560 };
561
562 for j in 0..n_features {
563 // Fit this fold's per-feature encoding on the TRAIN rows.
564 let train_col: Vec<usize> = train_idx.iter().map(|&i| x[[i, j]]).collect();
565 let enc = fit_feature_encoding(
566 &train_col,
567 &y_train_arr,
568 self.smooth,
569 train_mean_f64,
570 train_var_f64,
571 );
572 // Encode the TEST rows; a category unseen in the train fold ->
573 // the train y_mean (`_transform_X_ordinal`, `:494-497`).
574 for i in test_start..test_end {
575 let cat = x[[i, j]];
576 out[[i, j]] = *enc.get(&cat).unwrap_or(&train_mean);
577 }
578 }
579 }
580
581 Ok(out)
582 }
583}
584
585impl<F: Float + Send + Sync + 'static> Transform<Array2<usize>> for FittedTargetEncoder<F> {
586 type Output = Array2<F>;
587 type Error = FerroError;
588
589 /// Encode categorical features using the learned target statistics.
590 ///
591 /// Unseen categories are encoded as the global target mean.
592 ///
593 /// # Errors
594 ///
595 /// Returns [`FerroError::ShapeMismatch`] if the number of columns differs
596 /// from the number of features seen during fitting.
597 fn transform(&self, x: &Array2<usize>) -> Result<Array2<F>, FerroError> {
598 let n_features = self.category_maps.len();
599 if x.ncols() != n_features {
600 return Err(FerroError::ShapeMismatch {
601 expected: vec![x.nrows(), n_features],
602 actual: vec![x.nrows(), x.ncols()],
603 context: "FittedTargetEncoder::transform".into(),
604 });
605 }
606
607 let n_samples = x.nrows();
608 let mut out = Array2::zeros((n_samples, n_features));
609
610 for j in 0..n_features {
611 let cat_map = &self.category_maps[j];
612 for i in 0..n_samples {
613 let cat = x[[i, j]];
614 out[[i, j]] = *cat_map.get(&cat).unwrap_or(&self.global_mean);
615 }
616 }
617
618 Ok(out)
619 }
620}
621
622// ---------------------------------------------------------------------------
623// Tests
624// ---------------------------------------------------------------------------
625
#[cfg(test)]
mod tests {
    //! Unit tests for the target encoder: fixed and automatic smoothing values,
    //! unseen-category fallback, input validation, and cross-fitted
    //! `fit_transform`.

    use super::*;
    use approx::assert_abs_diff_eq;
    use ndarray::array;

    #[test]
    fn test_target_encoder_basic() {
        let enc = TargetEncoder::<f64>::new(0.0); // no smoothing
        // Category 0: targets [1.0, 2.0], mean = 1.5
        // Category 1: targets [3.0, 4.0], mean = 3.5
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_abs_diff_eq!(out[[0, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[1, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[2, 0]], 3.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[3, 0]], 3.5, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_smoothing() {
        let enc = TargetEncoder::<f64>::new(2.0);
        // Category 0: targets [1.0], mean = 1.0, count = 1
        // Category 1: targets [3.0, 5.0], mean = 4.0, count = 2
        // Global mean = (1 + 3 + 5) / 3 = 3.0
        let x = array![[0usize], [1], [1]];
        let y = array![1.0, 3.0, 5.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        // Cat 0: (1 * 1.0 + 2 * 3.0) / (1 + 2) = 7/3 ≈ 2.333
        let expected_0 = (1.0 * 1.0 + 2.0 * 3.0) / (1.0 + 2.0);
        assert_abs_diff_eq!(out[[0, 0]], expected_0, epsilon = 1e-10);
        // Cat 1: (2 * 4.0 + 2 * 3.0) / (2 + 2) = 14/4 = 3.5
        let expected_1 = (2.0 * 4.0 + 2.0 * 3.0) / (2.0 + 2.0);
        assert_abs_diff_eq!(out[[1, 0]], expected_1, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_unseen_category() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        // Transform with unseen category 2
        let x_new = array![[2usize]];
        let out = fitted.transform(&x_new).unwrap();
        // Unseen category → global mean = 2.5
        assert_abs_diff_eq!(out[[0, 0]], 2.5, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_multi_feature() {
        let enc = TargetEncoder::<f64>::new(0.0);
        let x = array![[0usize, 1], [0, 0], [1, 1], [1, 0]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_eq!(out.shape(), &[4, 2]);
        assert_eq!(fitted.category_maps().len(), 2);
        // Feature 0: cat 0 -> mean(1, 2) = 1.5, cat 1 -> mean(3, 4) = 3.5.
        assert_abs_diff_eq!(out[[0, 0]], 1.5, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[2, 0]], 3.5, epsilon = 1e-10);
        // Feature 1 is encoded independently:
        // cat 1 -> mean(1, 3) = 2.0, cat 0 -> mean(2, 4) = 3.0.
        assert_abs_diff_eq!(out[[0, 1]], 2.0, epsilon = 1e-10);
        assert_abs_diff_eq!(out[[1, 1]], 3.0, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_zero_rows_error() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x: Array2<usize> = Array2::zeros((0, 2));
        let y: Array1<f64> = Array1::zeros(0);
        assert!(enc.fit(&x, &y).is_err());
    }

    #[test]
    fn test_target_encoder_shape_mismatch_fit() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize], [1]];
        let y = array![1.0]; // wrong length
        assert!(enc.fit(&x, &y).is_err());
    }

    #[test]
    fn test_target_encoder_shape_mismatch_transform() {
        let enc = TargetEncoder::<f64>::new(1.0);
        let x = array![[0usize, 1], [1, 0]];
        let y = array![1.0, 2.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let x_bad = array![[0usize]]; // wrong number of columns
        assert!(fitted.transform(&x_bad).is_err());
    }

    #[test]
    fn test_target_encoder_negative_smooth_error() {
        let enc = TargetEncoder::<f64>::new(-1.0);
        let x = array![[0usize]];
        let y = array![1.0];
        assert!(enc.fit(&x, &y).is_err());
    }

    #[test]
    fn test_target_encoder_default() {
        // sklearn's DEFAULT is smooth="auto" (`_target_encoder.py:199`), NOT a
        // fixed value; `new(F)` is the explicit fixed-smooth constructor.
        let enc = TargetEncoder::<f64>::default();
        assert_eq!(enc.smooth(), Smooth::Auto);
        assert_eq!(enc.cv(), 5);
        let fixed = TargetEncoder::<f64>::new(1.0);
        assert_eq!(fixed.smooth(), Smooth::Fixed(1.0));
    }

    #[test]
    fn test_target_encoder_global_mean_accessor() {
        let enc = TargetEncoder::<f64>::new(0.0);
        let x = array![[0usize], [1]];
        let y = array![2.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        assert_abs_diff_eq!(fitted.global_mean(), 3.0, epsilon = 1e-10);
    }

    #[test]
    fn test_target_encoder_f32() {
        let enc = TargetEncoder::<f32>::new(1.0f32);
        let x = array![[0usize], [0], [1]];
        let y: Array1<f32> = array![1.0f32, 2.0, 3.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert!(!out[[0, 0]].is_nan());
        // Global mean = 2.0.
        // Cat 0: (1 * 2.0 + 1 + 2) / (1 + 2) = 5/3; cat 1: (1 * 2.0 + 3) / (1 + 1) = 2.5.
        assert_abs_diff_eq!(out[[0, 0]], 5.0f32 / 3.0, epsilon = 1e-6);
        assert_abs_diff_eq!(out[[2, 0]], 2.5f32, epsilon = 1e-6);
    }

    #[test]
    fn test_target_encoder_auto_smooth_values() {
        // y_mean = 2.5, population variance = 1.25.
        // Each category: count = 2, ssd = 0.5
        //   lambda = 1.25*2 / (1.25*2 + 0.5/2) = 2.5 / 2.75 = 10/11
        // Cat 0 (mean 1.5): 10/11 * 1.5 + 1/11 * 2.5 = 35/22
        // Cat 1 (mean 3.5): 10/11 * 3.5 + 1/11 * 2.5 = 75/22
        let enc = TargetEncoder::<f64>::default();
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_abs_diff_eq!(out[[0, 0]], 35.0 / 22.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[1, 0]], 35.0 / 22.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[2, 0]], 75.0 / 22.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[3, 0]], 75.0 / 22.0, epsilon = 1e-12);
    }

    #[test]
    fn test_target_encoder_auto_constant_target_falls_back_to_mean() {
        // Zero variance and zero within-category deviation give lambda = 0/0,
        // which must resolve to the global mean rather than NaN.
        let enc = TargetEncoder::<f64>::with_smooth(Smooth::Auto);
        let x = array![[0usize], [1]];
        let y = array![2.0, 2.0];
        let fitted = enc.fit(&x, &y).unwrap();
        let out = fitted.transform(&x).unwrap();
        assert_abs_diff_eq!(out[[0, 0]], 2.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[1, 0]], 2.0, epsilon = 1e-12);
    }

    #[test]
    fn test_fit_transform_cross_fit_values() {
        // cv = 2 on 4 rows -> test folds [0, 2) and [2, 4).
        // Fold 1 trains on rows 2,3 (cat 0 -> 3, cat 1 -> 4);
        // fold 2 trains on rows 0,1 (cat 0 -> 1, cat 1 -> 2).
        let enc = TargetEncoder::<f64>::new(0.0).with_cv(2);
        let x = array![[0usize], [1], [0], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let out = enc.fit_transform(&x, &y).unwrap();
        assert_eq!(out.shape(), &[4, 1]);
        assert_abs_diff_eq!(out[[0, 0]], 3.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[1, 0]], 4.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[2, 0]], 1.0, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[3, 0]], 2.0, epsilon = 1e-12);

        // The full-data encoding differs: cat 0 -> 2.0, cat 1 -> 3.0.
        let full = enc.fit(&x, &y).unwrap().transform(&x).unwrap();
        assert_abs_diff_eq!(full[[0, 0]], 2.0, epsilon = 1e-12);
        assert_abs_diff_eq!(full[[1, 0]], 3.0, epsilon = 1e-12);
    }

    #[test]
    fn test_fit_transform_unseen_in_train_fold_uses_train_mean() {
        // Each test fold holds a category that never appears in its train
        // fold, so every row falls back to the TRAIN fold's mean.
        let enc = TargetEncoder::<f64>::new(0.0).with_cv(2);
        let x = array![[0usize], [0], [1], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        let out = enc.fit_transform(&x, &y).unwrap();
        assert_abs_diff_eq!(out[[0, 0]], 3.5, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[1, 0]], 3.5, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[2, 0]], 1.5, epsilon = 1e-12);
        assert_abs_diff_eq!(out[[3, 0]], 1.5, epsilon = 1e-12);
    }

    #[test]
    fn test_fit_transform_cv_validation() {
        let x = array![[0usize], [1], [0], [1]];
        let y = array![1.0, 2.0, 3.0, 4.0];
        // cv below 2 is rejected.
        assert!(TargetEncoder::<f64>::new(1.0).with_cv(1).fit_transform(&x, &y).is_err());
        assert!(TargetEncoder::<f64>::new(1.0).with_cv(0).fit_transform(&x, &y).is_err());
        // More folds than samples is rejected (the default cv is 5, n = 4).
        assert!(TargetEncoder::<f64>::new(1.0).fit_transform(&x, &y).is_err());
        // cv equal to the sample count is accepted.
        assert!(TargetEncoder::<f64>::new(1.0).with_cv(4).fit_transform(&x, &y).is_ok());
    }

    #[test]
    fn test_fit_transform_input_errors() {
        let enc = TargetEncoder::<f64>::new(1.0).with_cv(2);
        let empty_x: Array2<usize> = Array2::zeros((0, 1));
        let empty_y: Array1<f64> = Array1::zeros(0);
        assert!(enc.fit_transform(&empty_x, &empty_y).is_err());

        let x = array![[0usize], [1]];
        let short_y = array![1.0];
        assert!(enc.fit_transform(&x, &short_y).is_err());

        let negative = TargetEncoder::<f64>::new(-1.0).with_cv(2);
        let y = array![1.0, 2.0];
        assert!(negative.fit_transform(&x, &y).is_err());
    }
}