ferrolearn_preprocess/
kbins_discretizer.rs

1//! K-bins discretizer: bin continuous features into discrete intervals.
2//!
3//! [`KBinsDiscretizer`] transforms continuous features into discrete bins.
4//! Each feature is independently binned according to one of the following
5//! strategies:
6//!
7//! - [`BinStrategy::Uniform`] — equal-width bins.
8//! - [`BinStrategy::Quantile`] — bins with equal numbers of samples.
9//! - [`BinStrategy::KMeans`] — bins based on 1D k-means clustering.
10//!
11//! The output can be ordinal-encoded (integers 0..k-1) or one-hot encoded.
12//!
13//! ## REQ status
14//!
15//! Translation target: scikit-learn 1.5.2 `class KBinsDiscretizer`
16//! (`sklearn/preprocessing/_discretization.py:184`). Tracking: #1375. Each REQ
17//! is BINARY — SHIPPED (impl + non-test consumer + tests + green verification)
18//! or NOT-STARTED (with a concrete open blocker).
19//!
20//! | REQ | Scope | Status | Evidence / Blocker |
21//! |-----|-------|--------|--------------------|
22//! | REQ-1 | Uniform + Quantile bin EDGES + ordinal/onehot transform VALUES (non-degenerate features) | SHIPPED | [`KBinsDiscretizer`] `fit` — Uniform=`np.linspace` (`_discretization.py:271`), Quantile=`np.percentile` (`:276`); `assign_bin` ≡ `searchsorted(edges[1:-1], side="right")` (`:377`); oracle value tests in `tests/divergence_kbins_discretizer.rs`. Consumer: re-export `lib.rs:151` |
23//! | REQ-2 | KMeans bin edges/transform via faithful sklearn `KMeans` Lloyd | SHIPPED | `kmeans_1d` replicates sklearn `KMeans` Lloyd incl. mean-centering (`_kmeans.py:1486-1546`), `\|\|C\|\|²-2xC` assignment + lowest-index tie-break (`_k_means_lloyd.pyx:196-213`), empty-cluster RELOCATION (`_k_means_common.pyx:_relocate_empty_clusters_dense`), var-scaled `tol=mean(var)*1e-4` (`_tolerance`), strict/center-shift convergence, max_iter=300, deterministic uniform-center init (`_discretization.py:289-300`); matches sklearn bit-for-bit on well-separated + moderately-separated + empty-init-cluster data (km1 #2321, km2 #2322, 3 green oracle fixtures). RESIDUAL: ~0.1% of well-spread continuous data converges to a different valid Lloyd local optimum (BLAS-gemm vs scalar float tie-break) — honestly pinned `divergence_km3_blas_gemm_local_optimum` (#2321 follow-up) |
24//! | REQ-3 | Error/parameter contracts (n_samples<2, n_bins<2, transform ncols, unfitted) | SHIPPED (scoped) | [`KBinsDiscretizer::fit`]/[`FittedKBinsDiscretizer`] `transform`; in-module + divergence error tests |
25//! | REQ-4 | Constant feature → bin 0 + per-feature `n_bins_=1` (`col_min==col_max`) | SHIPPED | `fit` sets `bin_edges=[-inf,+inf]` + `n_bins_per_feature[j]=1`; `assign_bin` → bin 0 (mirrors `_discretization.py:262-268`); 3 oracle tests — was DIV-1 #1376, fixed |
26//! | REQ-5 | Small-bin removal (quantile/kmeans near-duplicate edge collapse → per-feature `n_bins_`) + onehot variable width | SHIPPED | `fit` collapse `gap > 1e-8` (mirrors `ediff1d > 1e-8` `:302-312`); `transform` onehot width = `sum(n_bins_per_feature)` cumulative offsets; oracle tests (quantile collapse, onehot variable width, threshold boundary) — was DIV-2 #1377, fixed |
27//! | REQ-6 | `subsample` (default 200000) + `random_state` resample for quantile/kmeans | NOT-STARTED | sklearn `_discretization.py:242-249` — blocker #1379 |
28//! | REQ-7 | `n_bins` as per-feature array + `_validate_n_bins` | NOT-STARTED | scalar only; sklearn `_discretization.py:329-352` — blocker #1380 |
29//! | REQ-8 | encode='onehot' SPARSE default + sklearn ctor defaults (encode=onehot, strategy=quantile) | NOT-STARTED | dense only, defaults Ordinal/Uniform; sklearn `_discretization.py:185,320` — blocker #1381 |
30//! | REQ-9 | `dtype` param + `sample_weight` (weighted percentile/kmeans) | NOT-STARTED | sklearn `_discretization.py:228,235,295` — blocker #1382 |
31//! | REQ-10 | `inverse_transform` | NOT-STARTED | sklearn `_discretization.py:393` — blocker #1383 |
32//! | REQ-11 | `get_feature_names_out` + `bin_edges_`/`n_bins_` attr names + `PipelineTransformer` impl | NOT-STARTED | absent — blocker #1384 |
33//! | REQ-12 | PyO3 binding | NOT-STARTED | no `ferrolearn-python` registration — blocker #1385 |
34//! | REQ-13 | ferray substrate | NOT-STARTED | dense `Array2` + `num_traits::Float` only — blocker #1386 |
35//! | REQ-14 | KMeans empty-cluster relocation (degenerate/duplicate-heavy data) | SHIPPED | `kmeans_1d` now RELOCATES empty clusters (translates `_relocate_empty_clusters_dense`, `_k_means_common.pyx:170-214`): farthest-point reassignment + heaviest-cluster fallback in `_average_centers`; km2 (#2322) + `green_kmeans_empty_cluster_relocation_k3`. Residual degenerate carve-out (more clusters than distinct samples / multi-empty argpartition tie order) folded into REQ-2's pinned BLAS-gemm residual — was carve-out #1378 |
36
37use ferrolearn_core::error::FerroError;
38use ferrolearn_core::traits::{Fit, FitTransform, Transform};
39use ndarray::Array2;
40use num_traits::Float;
41
42// ---------------------------------------------------------------------------
43// BinStrategy
44// ---------------------------------------------------------------------------
45
46/// Strategy for computing bin edges.
47#[derive(Debug, Clone, Copy, PartialEq, Eq)]
48pub enum BinStrategy {
49    /// Equal-width bins.
50    Uniform,
51    /// Equal-frequency bins (quantile-based).
52    Quantile,
53    /// Bins based on 1D k-means clustering.
54    KMeans,
55}
56
57/// Encoding method for the output.
58#[derive(Debug, Clone, Copy, PartialEq, Eq)]
59pub enum BinEncoding {
60    /// Ordinal encoding: each value is replaced by its bin index (0..n_bins-1).
61    Ordinal,
62    /// One-hot encoding: each bin becomes a separate binary column.
63    OneHot,
64}
65
66// ---------------------------------------------------------------------------
67// KBinsDiscretizer (unfitted)
68// ---------------------------------------------------------------------------
69
70/// An unfitted K-bins discretizer.
71///
72/// Calling [`Fit::fit`] computes the bin edges for each feature and returns a
73/// [`FittedKBinsDiscretizer`].
74///
75/// # Examples
76///
77/// ```
78/// use ferrolearn_preprocess::kbins_discretizer::{KBinsDiscretizer, BinStrategy, BinEncoding};
79/// use ferrolearn_core::traits::{Fit, Transform};
80/// use ndarray::array;
81///
82/// let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
83/// let x = array![[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]];
84/// let fitted = disc.fit(&x, &()).unwrap();
85/// let out = fitted.transform(&x).unwrap();
86/// // Values should be in {0.0, 1.0, 2.0}
87/// for v in out.iter() {
88///     assert!(*v >= 0.0 && *v < 3.0);
89/// }
90/// ```
91#[must_use]
92#[derive(Debug, Clone)]
93pub struct KBinsDiscretizer<F> {
94    /// Number of bins.
95    n_bins: usize,
96    /// Encoding method.
97    encode: BinEncoding,
98    /// Binning strategy.
99    strategy: BinStrategy,
100    _marker: std::marker::PhantomData<F>,
101}
102
103impl<F: Float + Send + Sync + 'static> KBinsDiscretizer<F> {
104    /// Create a new `KBinsDiscretizer`.
105    pub fn new(n_bins: usize, encode: BinEncoding, strategy: BinStrategy) -> Self {
106        Self {
107            n_bins,
108            encode,
109            strategy,
110            _marker: std::marker::PhantomData,
111        }
112    }
113
114    /// Return the number of bins.
115    #[must_use]
116    pub fn n_bins(&self) -> usize {
117        self.n_bins
118    }
119
120    /// Return the encoding method.
121    #[must_use]
122    pub fn encode(&self) -> BinEncoding {
123        self.encode
124    }
125
126    /// Return the binning strategy.
127    #[must_use]
128    pub fn strategy(&self) -> BinStrategy {
129        self.strategy
130    }
131}
132
133impl<F: Float + Send + Sync + 'static> Default for KBinsDiscretizer<F> {
134    fn default() -> Self {
135        Self::new(5, BinEncoding::Ordinal, BinStrategy::Uniform)
136    }
137}
138
139// ---------------------------------------------------------------------------
140// FittedKBinsDiscretizer
141// ---------------------------------------------------------------------------
142
143/// A fitted K-bins discretizer holding per-feature bin edges.
144///
145/// Created by calling [`Fit::fit`] on a [`KBinsDiscretizer`].
146#[derive(Debug, Clone)]
147pub struct FittedKBinsDiscretizer<F> {
148    /// Bin edges per feature. `bin_edges[j]` has `n_bins_per_feature[j] + 1`
149    /// edges (which may be fewer than `n_bins + 1` for constant or collapsed
150    /// features).
151    bin_edges: Vec<Vec<F>>,
152    /// Per-feature bin count (mirrors sklearn `n_bins_`). A constant feature
153    /// collapses to 1 bin; quantile/kmeans features may shrink when
154    /// near-duplicate edges are removed.
155    n_bins_per_feature: Vec<usize>,
156    /// Requested number of bins (the global `n_bins` argument).
157    n_bins: usize,
158    /// Encoding method.
159    encode: BinEncoding,
160}
161
162impl<F: Float + Send + Sync + 'static> FittedKBinsDiscretizer<F> {
163    /// Return the bin edges per feature.
164    #[must_use]
165    pub fn bin_edges(&self) -> &[Vec<F>] {
166        &self.bin_edges
167    }
168
169    /// Return the per-feature bin count (sklearn `n_bins_`).
170    #[must_use]
171    pub fn n_bins_per_feature(&self) -> &[usize] {
172        &self.n_bins_per_feature
173    }
174
175    /// Return the requested number of bins.
176    #[must_use]
177    pub fn n_bins(&self) -> usize {
178        self.n_bins
179    }
180
181    /// Return the encoding method.
182    #[must_use]
183    pub fn encode(&self) -> BinEncoding {
184        self.encode
185    }
186}
187
188// ---------------------------------------------------------------------------
189// Helpers
190// ---------------------------------------------------------------------------
191
192/// Assign a value to a bin index given sorted bin edges.
193fn assign_bin<F: Float>(value: F, edges: &[F]) -> usize {
194    let n_bins = edges.len() - 1;
195    if n_bins == 0 {
196        return 0;
197    }
198    // Binary search for the bin
199    for (i, edge) in edges.iter().enumerate().skip(1) {
200        if value < *edge {
201            return i - 1;
202        }
203    }
204    // Last bin for values >= last edge
205    n_bins - 1
206}
207
208/// 1-D k-means bin edges, faithfully replicating scikit-learn 1.5.2's
209/// `KBinsDiscretizer(strategy="kmeans")` path (`sklearn/preprocessing/_discretization.py:285-300`).
210///
211/// sklearn runs ONE `KMeans(n_clusters=n_bins, init=uniform-bin-centers, n_init=1)`
212/// Lloyd fit on the column, sorts the resulting centers, and builds
213/// `bin_edges = np.r_[col_min, (centers[1:]+centers[:-1])*0.5, col_max]`.
214///
215/// This reproduces the full Lloyd machinery used by `KMeans.fit`
216/// (`sklearn/cluster/_kmeans.py` + `_k_means_lloyd.pyx` + `_k_means_common.pyx`):
217///
218/// - **Mean-centering** (`_kmeans.py:1486-1493,1543-1546`): `KMeans.fit` subtracts
219///   `X.mean(axis=0)` from both the data and the init "for more accurate distance
220///   computations", runs Lloyd on the centered data, then adds the mean back to the
221///   centers. The distance argmin is computed via `||C||² - 2·x·C` which is NOT
222///   translation-invariant in floating point, so this shift is load-bearing for
223///   the converged local optimum (not just numerical hygiene).
224/// - **Assignment** (`_k_means_lloyd.pyx:196-213`): each point goes to the center
225///   minimizing `pairwise[j] = ||C_j||² - 2·x·C_j` (the `x²` term is dropped since it
226///   is constant per point); ties resolve to the LOWEST center index (strict `<`).
227/// - **Center update** (`_k_means_lloyd.pyx:215-218`, `_k_means_common.pyx:_average_centers`):
228///   each new center is the mean of its assigned points.
229/// - **Empty-cluster relocation** (`_k_means_common.pyx:_relocate_empty_clusters_dense`,
230///   `:170-214`): for each empty cluster, take the points FARTHEST from their own
231///   assigned center (largest squared distance, descending) and move one into the
232///   empty cluster; the donor loses it. Skipped when `max(distances) == 0` (more
233///   clusters than distinct samples). Any cluster still empty after relocation is
234///   placed at the location of the heaviest cluster (`_average_centers` else-branch).
235/// - **Convergence** (`_kmeans.py:704-755`): stop on strict convergence (no label
236///   changed) OR when `center_shift_total = Σ_j (C_new[j] - C_old[j])² <= tol`, with
237///   `tol = mean(var(column)) * 1e-4` (`_tolerance`, `_kmeans.py:286-294`,
238///   population variance), OR `max_iter = 300`.
239///
240/// All intermediate arithmetic is done in `f64` (matching numpy's float64 default)
241/// regardless of `F`, then converted back to `F` for the edges.
242fn kmeans_1d<F: Float>(values: &[F], n_bins: usize) -> Vec<F> {
243    let n = values.len();
244    // Column min/max in F (the outer edges; sklearn uses the un-centered col_min/col_max).
245    let min_v = values
246        .iter()
247        .copied()
248        .fold(F::infinity(), num_traits::Float::min);
249    let max_v = values
250        .iter()
251        .copied()
252        .fold(F::neg_infinity(), num_traits::Float::max);
253
254    if n == 0 || n_bins == 0 {
255        // Degenerate: fall back to a uniform partition over [min, max].
256        return (0..=n_bins)
257            .map(|i| {
258                min_v
259                    + (max_v - min_v) * F::from(i).unwrap_or_else(F::zero)
260                        / fdiv_or_one::<F>(n_bins)
261            })
262            .collect();
263    }
264
265    // Work entirely in f64 (numpy float64 default).
266    let col: Vec<f64> = values.iter().map(|&v| v.to_f64().unwrap_or(0.0)).collect();
267    let col_min = col.iter().copied().fold(f64::INFINITY, f64::min);
268    let col_max = col.iter().copied().fold(f64::NEG_INFINITY, f64::max);
269
270    // Variance-scaled tolerance: tol = mean(var(column)) * 1e-4 (population variance,
271    // ddof=0), == sklearn `_tolerance(X, 1e-4)` (`_kmeans.py:286-294`).
272    let mean_all: f64 = col.iter().sum::<f64>() / (n as f64);
273    let var: f64 = col
274        .iter()
275        .map(|&x| (x - mean_all) * (x - mean_all))
276        .sum::<f64>()
277        / (n as f64);
278    let tol = var * 1e-4;
279
280    // KMeans.fit mean-centers X and the init (`_kmeans.py:1486-1493`).
281    let x_mean = mean_all;
282    let xc: Vec<f64> = col.iter().map(|&x| x - x_mean).collect();
283
284    // Uniform-bin-centers init (`_discretization.py:289-290`), then shifted by -x_mean.
285    let mut centers: Vec<f64> = (0..n_bins)
286        .map(|i| {
287            let lo = col_min + (col_max - col_min) * (i as f64) / (n_bins as f64);
288            let hi = col_min + (col_max - col_min) * ((i + 1) as f64) / (n_bins as f64);
289            (lo + hi) * 0.5 - x_mean
290        })
291        .collect();
292
293    let mut labels = vec![usize::MAX; n];
294    let mut labels_old = vec![usize::MAX; n];
295    let max_iter = 300usize;
296
297    for _ in 0..max_iter {
298        let centers_old = centers.clone();
299
300        // --- Assignment: argmin_j (||C_j||² - 2·x·C_j), ties -> lowest index. ---
301        let csq: Vec<f64> = centers_old.iter().map(|&c| c * c).collect();
302        for i in 0..n {
303            let xi = xc[i];
304            let mut best_j = 0usize;
305            let mut best = csq[0] - 2.0 * xi * centers_old[0];
306            for j in 1..n_bins {
307                let d = csq[j] - 2.0 * xi * centers_old[j];
308                if d < best {
309                    best = d;
310                    best_j = j;
311                }
312            }
313            labels[i] = best_j;
314        }
315
316        // --- Accumulate per-cluster sum and weight (count). ---
317        let mut acc = vec![0.0f64; n_bins];
318        let mut wic = vec![0.0f64; n_bins];
319        for i in 0..n {
320            acc[labels[i]] += xc[i];
321            wic[labels[i]] += 1.0;
322        }
323
324        // --- Empty-cluster relocation (`_relocate_empty_clusters_dense`). ---
325        let empty: Vec<usize> = (0..n_bins).filter(|&j| wic[j] == 0.0).collect();
326        if !empty.is_empty() {
327            // distances[i] = (xc[i] - centers_old[labels[i]])²
328            let distances: Vec<f64> = (0..n)
329                .map(|i| {
330                    let d = xc[i] - centers_old[labels[i]];
331                    d * d
332                })
333                .collect();
334            let max_dist = distances.iter().copied().fold(0.0f64, f64::max);
335            if max_dist != 0.0 {
336                let n_empty = empty.len();
337                // far_from_centers: the n_empty points with the largest distance,
338                // in descending order. sklearn uses
339                // `np.argpartition(distances, -n_empty)[:-n_empty-1:-1]`
340                // (`_k_means_common.pyx:190`), whose introselect partition + reverse
341                // slice resolves equal-distance ties toward the HIGHEST original index
342                // (e.g. distances `[.04,.04,0,0,.04,.04]`, n_empty=2 -> far `[5, 4]`).
343                // Match that by breaking distance ties on DESCENDING index, so the
344                // duplicate-heavy `n_bins > n_distinct` relocation dumps the same
345                // points onto the empty clusters as sklearn (the centers then coincide
346                // and collapse under small-bin removal to sklearn's `n_bins_`).
347                let mut order: Vec<usize> = (0..n).collect();
348                order.sort_by(|&a, &b| {
349                    distances[b]
350                        .partial_cmp(&distances[a])
351                        .unwrap_or(std::cmp::Ordering::Equal)
352                        .then(b.cmp(&a))
353                });
354                for idx in 0..n_empty {
355                    let new_cluster = empty[idx];
356                    let far = order[idx];
357                    let old_cluster = labels[far];
358                    acc[old_cluster] -= xc[far];
359                    acc[new_cluster] = xc[far];
360                    wic[new_cluster] = 1.0;
361                    wic[old_cluster] -= 1.0;
362                }
363            }
364        }
365
366        // --- Average; clusters still empty -> location of the heaviest cluster
367        //     (`_average_centers` else-branch). ---
368        let mut argmax_w = 0usize;
369        for j in 1..n_bins {
370            if wic[j] > wic[argmax_w] {
371                argmax_w = j;
372            }
373        }
374        for j in 0..n_bins {
375            if wic[j] > 0.0 {
376                centers[j] = acc[j] / wic[j];
377            } else if wic[argmax_w] > 0.0 {
378                centers[j] = acc[argmax_w] / wic[argmax_w];
379            } else {
380                centers[j] = centers_old[j];
381            }
382        }
383
384        // --- Convergence (`_kmeans.py:724-739`). ---
385        if labels == labels_old {
386            // Strict convergence: no label changed.
387            break;
388        }
389        let center_shift_tot: f64 = (0..n_bins)
390            .map(|j| {
391                let d = centers[j] - centers_old[j];
392                d * d
393            })
394            .sum();
395        if center_shift_tot <= tol {
396            break;
397        }
398        labels_old.copy_from_slice(&labels);
399    }
400
401    // Shift centers back (`_kmeans.py:1546`) and sort (`_discretization.py:298`).
402    let mut centers_out: Vec<f64> = centers.iter().map(|&c| c + x_mean).collect();
403    centers_out.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
404
405    // edges = [col_min, midpoints.., col_max] (`_discretization.py:299-300`).
406    let mut edges = Vec::with_capacity(n_bins + 1);
407    edges.push(min_v);
408    for i in 0..n_bins.saturating_sub(1) {
409        let mid = (centers_out[i] + centers_out[i + 1]) * 0.5;
410        edges.push(F::from(mid).unwrap_or(min_v));
411    }
412    edges.push(max_v);
413
414    edges
415}
416
417/// `F::from(n)` as a divisor, falling back to `F::one()` when `n == 0` to avoid a
418/// division by zero in the degenerate fallback path.
419fn fdiv_or_one<F: Float>(n: usize) -> F {
420    if n == 0 {
421        F::one()
422    } else {
423        F::from(n).unwrap_or_else(F::one)
424    }
425}
426
427// ---------------------------------------------------------------------------
428// Trait implementations
429// ---------------------------------------------------------------------------
430
431impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for KBinsDiscretizer<F> {
432    type Fitted = FittedKBinsDiscretizer<F>;
433    type Error = FerroError;
434
435    /// Fit by computing bin edges for each feature.
436    ///
437    /// # Errors
438    ///
439    /// - [`FerroError::InsufficientSamples`] if the input has fewer than 2 rows.
440    /// - [`FerroError::InvalidParameter`] if `n_bins` < 2.
441    fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedKBinsDiscretizer<F>, FerroError> {
442        let n_samples = x.nrows();
443        if n_samples < 2 {
444            return Err(FerroError::InsufficientSamples {
445                required: 2,
446                actual: n_samples,
447                context: "KBinsDiscretizer::fit".into(),
448            });
449        }
450        if self.n_bins < 2 {
451            return Err(FerroError::InvalidParameter {
452                name: "n_bins".into(),
453                reason: "n_bins must be at least 2".into(),
454            });
455        }
456
457        let n_features = x.ncols();
458        let mut bin_edges = Vec::with_capacity(n_features);
459        let mut n_bins_per_feature = Vec::with_capacity(n_features);
460
461        for j in 0..n_features {
462            let mut col_vals: Vec<F> = x.column(j).iter().copied().collect();
463            col_vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
464
465            let min_val = col_vals[0];
466            let max_val = col_vals[col_vals.len() - 1];
467
468            // Constant feature (sklearn :262-268): collapse to a single bin
469            // spanning [-inf, +inf] so transform maps every value to bin 0.
470            if min_val == max_val {
471                bin_edges.push(vec![F::neg_infinity(), F::infinity()]);
472                n_bins_per_feature.push(1);
473                continue;
474            }
475
476            let edges: Vec<F> = match self.strategy {
477                BinStrategy::Uniform => (0..=self.n_bins)
478                    .map(|i| {
479                        min_val
480                            + (max_val - min_val) * F::from(i).unwrap()
481                                / F::from(self.n_bins).unwrap()
482                    })
483                    .collect(),
484                BinStrategy::Quantile => {
485                    let n = col_vals.len();
486                    (0..=self.n_bins)
487                        .map(|i| {
488                            let frac = F::from(i).unwrap() / F::from(self.n_bins).unwrap();
489                            let pos = frac * F::from(n.saturating_sub(1)).unwrap();
490                            let lo = pos.floor().to_usize().unwrap_or(0).min(n - 1);
491                            let hi = pos.ceil().to_usize().unwrap_or(0).min(n - 1);
492                            let f = pos - F::from(lo).unwrap();
493                            col_vals[lo] * (F::one() - f) + col_vals[hi] * f
494                        })
495                        .collect()
496                }
497                BinStrategy::KMeans => kmeans_1d(&col_vals, self.n_bins),
498            };
499
500            // Small-bin removal for quantile and kmeans only (sklearn
501            // :302-312): keep the first edge, then keep each subsequent edge
502            // only if its gap to the previously kept edge exceeds 1e-8.
503            // Uniform is never collapsed.
504            match self.strategy {
505                BinStrategy::Quantile | BinStrategy::KMeans => {
506                    let tol = F::from(1e-8).unwrap_or_else(F::epsilon);
507                    let mut kept: Vec<F> = Vec::with_capacity(edges.len());
508                    for &edge in &edges {
509                        match kept.last() {
510                            None => kept.push(edge),
511                            Some(&last) if edge - last > tol => kept.push(edge),
512                            Some(_) => {}
513                        }
514                    }
515                    n_bins_per_feature.push(kept.len() - 1);
516                    bin_edges.push(kept);
517                }
518                BinStrategy::Uniform => {
519                    n_bins_per_feature.push(self.n_bins);
520                    bin_edges.push(edges);
521                }
522            }
523        }
524
525        Ok(FittedKBinsDiscretizer {
526            bin_edges,
527            n_bins_per_feature,
528            n_bins: self.n_bins,
529            encode: self.encode,
530        })
531    }
532}
533
534impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedKBinsDiscretizer<F> {
535    type Output = Array2<F>;
536    type Error = FerroError;
537
538    /// Discretize features into bin indices or one-hot vectors.
539    ///
540    /// # Errors
541    ///
542    /// Returns [`FerroError::ShapeMismatch`] if the number of columns differs
543    /// from the number of features seen during fitting.
544    fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
545        let n_features = self.bin_edges.len();
546        if x.ncols() != n_features {
547            return Err(FerroError::ShapeMismatch {
548                expected: vec![x.nrows(), n_features],
549                actual: vec![x.nrows(), x.ncols()],
550                context: "FittedKBinsDiscretizer::transform".into(),
551            });
552        }
553
554        let n_samples = x.nrows();
555
556        match self.encode {
557            BinEncoding::Ordinal => {
558                let mut out = Array2::zeros((n_samples, n_features));
559                for j in 0..n_features {
560                    let edges = &self.bin_edges[j];
561                    for i in 0..n_samples {
562                        let bin = assign_bin(x[[i, j]], edges);
563                        out[[i, j]] = F::from(bin).unwrap_or_else(F::zero);
564                    }
565                }
566                Ok(out)
567            }
568            BinEncoding::OneHot => {
569                // Output width is the sum of the per-feature bin counts, and
570                // feature `j`'s columns start at the cumulative sum of the
571                // preceding features' bin counts (sklearn one-hot over
572                // `n_bins_`).
573                let mut offsets = Vec::with_capacity(n_features + 1);
574                let mut acc = 0usize;
575                for &nb in &self.n_bins_per_feature {
576                    offsets.push(acc);
577                    acc += nb;
578                }
579                let n_out = acc;
580                let mut out = Array2::zeros((n_samples, n_out));
581                for j in 0..n_features {
582                    let edges = &self.bin_edges[j];
583                    let col_offset = offsets[j];
584                    for i in 0..n_samples {
585                        let bin = assign_bin(x[[i, j]], edges);
586                        out[[i, col_offset + bin]] = F::one();
587                    }
588                }
589                Ok(out)
590            }
591        }
592    }
593}
594
595/// Implement `Transform` on the unfitted discretizer.
596impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for KBinsDiscretizer<F> {
597    type Output = Array2<F>;
598    type Error = FerroError;
599
600    /// Always returns an error — must be fitted first.
601    fn transform(&self, _x: &Array2<F>) -> Result<Array2<F>, FerroError> {
602        Err(FerroError::InvalidParameter {
603            name: "KBinsDiscretizer".into(),
604            reason: "discretizer must be fitted before calling transform; use fit() first".into(),
605        })
606    }
607}
608
609impl<F: Float + Send + Sync + 'static> FitTransform<Array2<F>> for KBinsDiscretizer<F> {
610    type FitError = FerroError;
611
612    /// Fit and transform in one step.
613    ///
614    /// # Errors
615    ///
616    /// Returns an error if fitting fails.
617    fn fit_transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
618        let fitted = self.fit(x, &())?;
619        fitted.transform(x)
620    }
621}
622
623// ---------------------------------------------------------------------------
624// Tests
625// ---------------------------------------------------------------------------
626
627#[cfg(test)]
628mod tests {
629    use super::*;
630    use approx::assert_abs_diff_eq;
631    use ndarray::array;
632
633    #[test]
634    fn test_kbins_ordinal_uniform() {
635        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
636        let x = array![[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]];
637        let fitted = disc.fit(&x, &()).unwrap();
638        let out = fitted.transform(&x).unwrap();
639        assert_eq!(out.ncols(), 1);
640        // Check bin assignments
641        assert_abs_diff_eq!(out[[0, 0]], 0.0, epsilon = 1e-10); // 0.0 → bin 0
642        assert_abs_diff_eq!(out[[5, 0]], 2.0, epsilon = 1e-10); // 5.0 → bin 2 (last)
643    }
644
645    #[test]
646    fn test_kbins_onehot_uniform() {
647        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::OneHot, BinStrategy::Uniform);
648        let x = array![[0.0], [2.5], [5.0]];
649        let fitted = disc.fit(&x, &()).unwrap();
650        let out = fitted.transform(&x).unwrap();
651        // 3 bins → 3 columns per feature
652        assert_eq!(out.ncols(), 3);
653        // Each row should have exactly one 1.0
654        for i in 0..out.nrows() {
655            let row_sum: f64 = out.row(i).iter().sum();
656            assert_abs_diff_eq!(row_sum, 1.0, epsilon = 1e-10);
657        }
658    }
659
660    #[test]
661    fn test_kbins_quantile_strategy() {
662        let disc = KBinsDiscretizer::<f64>::new(4, BinEncoding::Ordinal, BinStrategy::Quantile);
663        let x = array![[0.0], [1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0]];
664        let fitted = disc.fit(&x, &()).unwrap();
665        let out = fitted.transform(&x).unwrap();
666        // All values should be valid bin indices
667        for v in &out {
668            assert!(*v >= 0.0 && *v < 4.0);
669        }
670    }
671
672    #[test]
673    fn test_kbins_kmeans_strategy() {
674        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::KMeans);
675        let x = array![
676            [0.0],
677            [0.1],
678            [0.2],
679            [5.0],
680            [5.1],
681            [5.2],
682            [10.0],
683            [10.1],
684            [10.2]
685        ];
686        let fitted = disc.fit(&x, &()).unwrap();
687        let out = fitted.transform(&x).unwrap();
688        // Values should be valid bin indices
689        for v in &out {
690            assert!(*v >= 0.0 && *v < 3.0);
691        }
692    }
693
694    #[test]
695    fn test_kbins_multi_feature() {
696        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
697        let x = array![[0.0, 10.0], [2.5, 15.0], [5.0, 20.0]];
698        let fitted = disc.fit(&x, &()).unwrap();
699        let out = fitted.transform(&x).unwrap();
700        assert_eq!(out.ncols(), 2);
701    }
702
703    #[test]
704    fn test_kbins_bin_edges() {
705        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
706        let x = array![[0.0], [3.0], [6.0]];
707        let fitted = disc.fit(&x, &()).unwrap();
708        let edges = &fitted.bin_edges()[0];
709        // 4 edges for 3 bins: [0, 2, 4, 6]
710        assert_eq!(edges.len(), 4);
711        assert_abs_diff_eq!(edges[0], 0.0, epsilon = 1e-10);
712        assert_abs_diff_eq!(edges[3], 6.0, epsilon = 1e-10);
713    }
714
715    #[test]
716    fn test_kbins_fit_transform() {
717        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
718        let x = array![[0.0], [2.5], [5.0]];
719        let out = disc.fit_transform(&x).unwrap();
720        assert_eq!(out.ncols(), 1);
721    }
722
723    #[test]
724    fn test_kbins_insufficient_samples_error() {
725        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
726        let x = array![[1.0]];
727        assert!(disc.fit(&x, &()).is_err());
728    }
729
730    #[test]
731    fn test_kbins_too_few_bins_error() {
732        let disc = KBinsDiscretizer::<f64>::new(1, BinEncoding::Ordinal, BinStrategy::Uniform);
733        let x = array![[0.0], [1.0]];
734        assert!(disc.fit(&x, &()).is_err());
735    }
736
737    #[test]
738    fn test_kbins_shape_mismatch_error() {
739        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
740        let x_train = array![[0.0, 1.0], [2.0, 3.0]];
741        let fitted = disc.fit(&x_train, &()).unwrap();
742        let x_bad = array![[1.0, 2.0, 3.0]];
743        assert!(fitted.transform(&x_bad).is_err());
744    }
745
746    #[test]
747    fn test_kbins_unfitted_error() {
748        let disc = KBinsDiscretizer::<f64>::new(3, BinEncoding::Ordinal, BinStrategy::Uniform);
749        let x = array![[0.0]];
750        assert!(disc.transform(&x).is_err());
751    }
752
753    #[test]
754    fn test_kbins_default() {
755        let disc = KBinsDiscretizer::<f64>::default();
756        assert_eq!(disc.n_bins(), 5);
757        assert_eq!(disc.encode(), BinEncoding::Ordinal);
758        assert_eq!(disc.strategy(), BinStrategy::Uniform);
759    }
760
761    #[test]
762    fn test_kbins_ordinal_values_in_range() {
763        let disc = KBinsDiscretizer::<f64>::new(5, BinEncoding::Ordinal, BinStrategy::Uniform);
764        let x = array![[0.0], [2.5], [5.0], [7.5], [10.0]];
765        let fitted = disc.fit(&x, &()).unwrap();
766        let out = fitted.transform(&x).unwrap();
767        for v in &out {
768            assert!(*v >= 0.0 && *v < 5.0, "Bin index {v} out of range");
769        }
770    }
771}
ferrolearn_preprocess/kbins_discretizer.rs

ferrolearn_preprocess/
kbins_discretizer.rs