Skip to main content

ferrolearn_preprocess/
label_encoder.rs

1//! Label encoder: maps string labels to integer indices.
2//!
3//! Learns an ordered mapping from unique string labels to consecutive integers
4//! `0, 1, ..., n_classes - 1`. Supports forward (`label → int`) and reverse
5//! (`int → label`) transformation.
6//!
7//! # `## REQ status`
8//!
9//! Binary (R-DEFER-2), translating `sklearn/preprocessing/_label.py` (`class LabelEncoder`
10//! `:34`). Design doc: `.design/preprocess/label_encoder.md`. Expected values from the live
11//! sklearn 1.5.2 oracle (R-CHAR-3). Consumer: crate re-export (`lib.rs:116`, grandfathered S5).
12//! HONEST (R-HONEST-3): ferrolearn is `Array1<String>`-only; sklearn `LabelEncoder` accepts any
13//! hashable+comparable dtype. The non-empty string path value-matches the oracle exactly.
14//!
15//! | REQ | Status | Evidence |
16//! |---|---|---|
17//! | REQ-1 (string fit → sorted-unique classes_) | SHIPPED | `Fit::fit` collects unique labels, `Vec<String>::sort` (lexicographic), builds `label_to_index`; mirrors sklearn `classes_ = _unique(y)` (`_label.py:98`). Critic-verified vs live oracle: `green_fit_classes_sorted` (`["bird","cat","dog"]`), `green_sort_order_mixed_ascii_matches_numpy` (`["10","2","A","B","a","b"]` == np.unique). Consumer: crate re-export `lib.rs:116`. |
18//! | REQ-2 (inverse_transform) | SHIPPED | `FittedLabelEncoder::inverse_transform` = `classes[idx]` with out-of-range → `InvalidParameter`; mirrors sklearn `classes_[y]` + `setdiff1d` guard (`:158-162`). Critic-verified: `green_inverse_transform_roundtrip`, out-of-range rejected. |
19//! | REQ-3 (transform + fit_transform) | SHIPPED | `transform` = `label_to_index.get` (unknown → `InvalidParameter`), mirrors `_encode` (`:137`); `fit_transform` mirrors `_unique(return_inverse=True)` (`:115`). Critic-verified: `green_transform` (`[1,2,1,0]`), `green_fit_transform_equals_fit_then_transform`, empty transform/inverse → empty (`:134-135`,`:155-156`). |
20//! | REQ-5 (empty-fit parity) | SHIPPED | FIXED #1134. Removed the `if x.is_empty()` → `InsufficientSamples` guard; `fit([])` now yields an empty `FittedLabelEncoder` matching sklearn `_unique([])` (`:98`). Critic-verified: `divergence_empty_fit_succeeds` + 4 post-empty-fit guards; in-module `test_empty_fit_yields_empty_classes` (R-HONEST-4). |
21//! | REQ-4 (numeric/generic dtype) | NOT-STARTED | open prereq blocker #1135. `Array1<String>`-only; sklearn accepts any dtype, numeric sort `[10,2,1]→[1,2,10]` (`np.unique`) unrepresentable (R-DEV-3). |
22//! | REQ-6 (error-contract parity, R-DEV-2) | NOT-STARTED | open prereq blocker #1136. Unseen-label message ("unknown label" vs "y contains previously unseen labels", `:137,160`) + unfitted-transform `InvalidParameter` vs `NotFittedError` (`:131`). Both REJECT (type maps to FerroError); message/NotFitted-analog gap. |
23//! | REQ-7 (PyO3 binding) | SHIPPED | FIXED #1137. `_RsLabelEncoder` (hand `#[pyclass]` in `ferrolearn-python/src/extras.rs`, the 1-D string-input analog of `_RsOrdinalEncoder`) over `FittedLabelEncoder` exposes `fit(Vec<String>)` (→ `Fit::fit` on `Array1<String>`), `transform(Vec<String>)` → numpy `int64` codes (unknown label → `PyValueError` "y contains previously unseen labels", `_label.py:137`), `inverse_transform(Vec<i64>)` → `Vec<String>` (negative code rejected pre-cast, out-of-range → `PyValueError`, `_label.py:158-160`), and `#[getter] classes_` (sorted-unique str list, `_label.py:98`); registered in `lib.rs` (`add_class::<extras::RsLabelEncoder>`). The `_extras.py::LabelEncoder(BaseEstimator)` wrapper mirrors sklearn's no-param ctor (`get_params() == {}`, `_label.py:34`), a `_to_labels(y)` helper that requires 1-D (`column_or_1d`, `_label.py:97`; 2-D → `ValueError`) and REJECTS numeric-dtype input (`NotImplementedError` — string-only core sorts lexicographically; #2230 lesson), `check_is_fitted`→`NotFittedError` pre-fit, and an explicit `fit_transform` (LabelEncoder is not a `TransformerMixin`, `_label.py:101`). Non-test consumer: `_extras.py::LabelEncoder` + `lib.rs` registration + `__init__.py` re-export (R-DEFER-1). Verification (model B): `tests/divergence_label_encoder_py.py` (20 pass, live sklearn 1.5.2 oracle) — `fit_transform(['b','a','c','a'])==[1,0,2,0]`, `classes_==['a','b','c']`, inverse roundtrip, unknown-label/out-of-range/negative `ValueError`, numeric-input `NotImplementedError` (vs sklearn numeric sort), numeric-looking-string string-sorted parity, pre-fit `NotFittedError`, 2-D `ValueError`, `get_params()=={}`, `clone`, numpy str/object array input. LabelEncoder has no `n_features_in_`/`get_feature_names_out` (target encoder). |
24//! | REQ-8 (ferray substrate) | NOT-STARTED | open prereq blocker #1138. `ndarray::Array1<String>` + `std::HashMap`, not `ferray-core` (R-SUBSTRATE-1/2). |
25
26use ferrolearn_core::error::FerroError;
27use ferrolearn_core::traits::{Fit, FitTransform, Transform};
28use ndarray::Array1;
29use std::collections::HashMap;
30
31// ---------------------------------------------------------------------------
32// LabelEncoder (unfitted)
33// ---------------------------------------------------------------------------
34
/// Label encoder in its unfitted state.
///
/// The type itself is stateless: everything learned during fitting lives in
/// the [`FittedLabelEncoder`] returned by [`Fit::fit`]. Fitting on an
/// `Array1<String>` collects the distinct labels, sorts them, and assigns the
/// codes `0, 1, ..., n_classes - 1` in that sorted order.
///
/// Sorting uses the `Ord` of `String`, i.e. byte-wise lexicographic order, so
/// ASCII digits come before uppercase letters, which come before lowercase
/// letters (for example `"10" < "2" < "A" < "a"`).
///
/// # Examples
///
/// ```
/// use ferrolearn_preprocess::LabelEncoder;
/// use ferrolearn_core::traits::{Fit, Transform};
/// use ndarray::array;
///
/// let labels = array!["cat".to_string(), "dog".to_string(), "cat".to_string()];
/// let fitted = LabelEncoder::new().fit(&labels, &()).unwrap();
/// let encoded = fitted.transform(&labels).unwrap();
/// assert_eq!(encoded[0], 0); // "cat" → 0
/// assert_eq!(encoded[1], 1); // "dog" → 1
/// assert_eq!(encoded[2], 0); // "cat" → 0
/// ```
#[derive(Clone, Debug, Default)]
pub struct LabelEncoder;
57
58impl LabelEncoder {
59    /// Create a new `LabelEncoder`.
60    #[must_use]
61    pub fn new() -> Self {
62        Self
63    }
64}
65
66// ---------------------------------------------------------------------------
67// FittedLabelEncoder
68// ---------------------------------------------------------------------------
69
/// Result of fitting a [`LabelEncoder`]: the learned mapping between string
/// labels and integer codes, stored in both directions.
///
/// Obtained by calling [`Fit::fit`] on a [`LabelEncoder`].
#[derive(Clone, Debug)]
pub struct FittedLabelEncoder {
    /// Distinct class labels; the position of a label is its integer code.
    pub(crate) classes: Vec<String>,
    /// Lookup from a label to its integer code.
    pub(crate) label_to_index: HashMap<String, usize>,
}
80
81impl FittedLabelEncoder {
82    /// Return the ordered list of class labels.
83    ///
84    /// `classes[i]` is the label corresponding to integer `i`.
85    #[must_use]
86    pub fn classes(&self) -> &[String] {
87        &self.classes
88    }
89
90    /// Return the number of unique classes.
91    #[must_use]
92    pub fn n_classes(&self) -> usize {
93        self.classes.len()
94    }
95
96    /// Map integer indices back to the original string labels.
97    ///
98    /// # Errors
99    ///
100    /// Returns [`FerroError::InvalidParameter`] if any index is out of range.
101    pub fn inverse_transform(&self, y: &Array1<usize>) -> Result<Array1<String>, FerroError> {
102        let n_classes = self.classes.len();
103        let mut out = Vec::with_capacity(y.len());
104        for (i, &idx) in y.iter().enumerate() {
105            if idx >= n_classes {
106                return Err(FerroError::InvalidParameter {
107                    name: format!("y[{i}]"),
108                    reason: format!("y contains previously unseen labels: [{idx}]"),
109                });
110            }
111            out.push(self.classes[idx].clone());
112        }
113        Ok(Array1::from_vec(out))
114    }
115}
116
117// ---------------------------------------------------------------------------
118// Trait implementations
119// ---------------------------------------------------------------------------
120
121impl Fit<Array1<String>, ()> for LabelEncoder {
122    type Fitted = FittedLabelEncoder;
123    type Error = FerroError;
124
125    /// Fit the encoder by learning the sorted set of unique labels.
126    ///
127    /// Labels are sorted alphabetically; the first label maps to `0`.
128    ///
129    /// Empty input is accepted (matching scikit-learn): it yields a fitted
130    /// encoder with an empty `classes_` (`n_classes == 0`).
131    ///
132    /// # Errors
133    ///
134    /// This method does not currently return an error.
135    fn fit(&self, x: &Array1<String>, _y: &()) -> Result<FittedLabelEncoder, FerroError> {
136        let mut unique: Vec<String> = x
137            .iter()
138            .cloned()
139            .collect::<std::collections::HashSet<_>>()
140            .into_iter()
141            .collect();
142        unique.sort();
143
144        let label_to_index: HashMap<String, usize> = unique
145            .iter()
146            .enumerate()
147            .map(|(i, label)| (label.clone(), i))
148            .collect();
149
150        Ok(FittedLabelEncoder {
151            classes: unique,
152            label_to_index,
153        })
154    }
155}
156
157impl Transform<Array1<String>> for FittedLabelEncoder {
158    type Output = Array1<usize>;
159    type Error = FerroError;
160
161    /// Transform string labels to integer indices.
162    ///
163    /// # Errors
164    ///
165    /// Returns [`FerroError::InvalidParameter`] if any label was not seen during fitting.
166    fn transform(&self, x: &Array1<String>) -> Result<Array1<usize>, FerroError> {
167        let mut out = Vec::with_capacity(x.len());
168        for (i, label) in x.iter().enumerate() {
169            match self.label_to_index.get(label) {
170                Some(&idx) => out.push(idx),
171                None => {
172                    return Err(FerroError::InvalidParameter {
173                        name: format!("x[{i}]"),
174                        reason: format!("y contains previously unseen labels: \"{label}\""),
175                    });
176                }
177            }
178        }
179        Ok(Array1::from_vec(out))
180    }
181}
182
183/// Implement `Transform` on the unfitted encoder to satisfy the `FitTransform: Transform`
184/// supertrait bound. Calling `transform` on an unfitted encoder always returns an error.
185impl Transform<Array1<String>> for LabelEncoder {
186    type Output = Array1<usize>;
187    type Error = FerroError;
188
189    /// Always returns an error — the encoder must be fitted first.
190    ///
191    /// Use [`Fit::fit`] to produce a [`FittedLabelEncoder`], then call
192    /// [`Transform::transform`] on that.
193    fn transform(&self, _x: &Array1<String>) -> Result<Array1<usize>, FerroError> {
194        Err(FerroError::InvalidParameter {
195            name: "LabelEncoder".into(),
196            reason: "encoder must be fitted before calling transform; use fit() first".into(),
197        })
198    }
199}
200
201impl FitTransform<Array1<String>> for LabelEncoder {
202    type FitError = FerroError;
203
204    /// Fit the encoder on `x` and return the encoded output in one step.
205    ///
206    /// # Errors
207    ///
208    /// Returns an error if fitting or transformation fails.
209    fn fit_transform(&self, x: &Array1<String>) -> Result<Array1<usize>, FerroError> {
210        let fitted = self.fit(x, &())?;
211        fitted.transform(x)
212    }
213}
214
215// ---------------------------------------------------------------------------
216// Tests
217// ---------------------------------------------------------------------------
218
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    /// Build an owned `Array1<String>` from borrowed string slices.
    fn str_arr(v: &[&str]) -> Array1<String> {
        Array1::from_vec(v.iter().copied().map(String::from).collect())
    }

    /// Fitting sorts the distinct labels and `transform` returns their codes.
    #[test]
    fn test_label_encoder_basic() {
        let labels = str_arr(&["cat", "dog", "cat", "bird"]);
        let fitted = LabelEncoder::new().fit(&labels, &()).unwrap();

        // Three distinct labels, held in sorted order.
        assert_eq!(fitted.n_classes(), 3);
        assert_eq!(fitted.classes(), &["bird", "cat", "dog"]);

        // "cat" → 1, "dog" → 2, "cat" → 1, "bird" → 0
        let encoded = fitted.transform(&labels).unwrap();
        for (pos, &want) in [1usize, 2, 1, 0].iter().enumerate() {
            assert_eq!(encoded[pos], want);
        }
    }

    /// Encoding and then decoding gives back the original labels.
    #[test]
    fn test_inverse_transform_roundtrip() {
        let labels = str_arr(&["a", "b", "c", "a", "b"]);
        let fitted = LabelEncoder::new().fit(&labels, &()).unwrap();
        let recovered = fitted
            .inverse_transform(&fitted.transform(&labels).unwrap())
            .unwrap();
        assert!(labels
            .iter()
            .zip(recovered.iter())
            .all(|(orig, rec)| orig == rec));
    }

    /// A label that was absent at fit time is rejected by `transform`.
    #[test]
    fn test_unknown_label_error() {
        let fitted = LabelEncoder::new().fit(&str_arr(&["a", "b"]), &()).unwrap();
        let outcome = fitted.transform(&str_arr(&["c"]));
        assert!(outcome.is_err());
    }

    /// A code at or beyond `n_classes` is rejected by `inverse_transform`.
    #[test]
    fn test_inverse_transform_out_of_range() {
        let fitted = LabelEncoder::new().fit(&str_arr(&["x", "y"]), &()).unwrap();
        let outcome = fitted.inverse_transform(&array![5usize]);
        assert!(outcome.is_err());
    }

    /// `fit_transform` agrees with `fit` followed by `transform`.
    #[test]
    fn test_fit_transform_equivalence() {
        let labels = str_arr(&["foo", "bar", "foo", "baz"]);
        let enc = LabelEncoder::new();
        let via_separate = enc.fit(&labels, &()).unwrap().transform(&labels).unwrap();
        let via_fit_transform = enc.fit_transform(&labels).unwrap();
        assert_eq!(via_fit_transform, via_separate);
    }

    /// Fitting on empty input succeeds and learns no classes.
    #[test]
    fn test_empty_fit_yields_empty_classes() {
        // sklearn LabelEncoder().fit([]) succeeds with classes_ == [] (shape (0,)).
        let empty: Array1<String> = Array1::from_vec(Vec::new());
        let outcome = LabelEncoder::new().fit(&empty, &());
        assert!(matches!(
            &outcome,
            Ok(f) if f.n_classes() == 0 && f.classes().is_empty()
        ));
    }

    /// With a single distinct label, every input encodes to code `0`.
    #[test]
    fn test_single_class() {
        let labels = str_arr(&["only", "only", "only"]);
        let fitted = LabelEncoder::new().fit(&labels, &()).unwrap();
        assert_eq!(fitted.n_classes(), 1);
        let encoded = fitted.transform(&labels).unwrap();
        assert_eq!(encoded.iter().filter(|&&code| code != 0).count(), 0);
    }
}