// sklears_python/preprocessing/label_encoder.rs

//! Python bindings for LabelEncoder
//!
//! This module provides Python bindings for LabelEncoder,
//! offering scikit-learn compatible label encoding for categorical features.

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use std::collections::{BTreeSet, HashMap};

/// LabelEncoder state after fitting.
///
/// The two fields describe the same set of classes from both directions:
/// `classes[i]` is the label encoded as `i`, and `class_to_index` is the
/// reverse lookup used when encoding.
#[derive(Debug, Clone)]
struct LabelEncoderState {
    /// Unique labels seen during fitting, sorted in ascending order.
    classes: Vec<String>,
    /// Maps each label in `classes` to its position in that vector.
    class_to_index: HashMap<String, usize>,
}

17/// Encode target labels with value between 0 and n_classes-1.
18///
19/// This transformer should be used to encode target values, *i.e.* `y`, and
20/// not the input `X`.
21///
22/// Attributes
23/// ----------
24/// classes_ : list of shape (n_classes,)
25///     Holds the label for each class.
26///
27/// Examples
28/// --------
29/// >>> from sklears_python import LabelEncoder
30/// >>> le = LabelEncoder()
31/// >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
32/// LabelEncoder()
33/// >>> list(le.classes_)
34/// ['amsterdam', 'paris', 'tokyo']
35/// >>> le.transform(["tokyo", "tokyo", "paris"])
36/// [2, 2, 1]
37/// >>> list(le.inverse_transform([2, 2, 1]))
38/// ['tokyo', 'tokyo', 'paris']
39#[pyclass(name = "LabelEncoder")]
40pub struct PyLabelEncoder {
41    state: Option<LabelEncoderState>,
42}
43
44#[pymethods]
45impl PyLabelEncoder {
46    #[new]
47    fn new() -> Self {
48        Self { state: None }
49    }
50
51    /// Fit label encoder.
52    ///
53    /// Parameters
54    /// ----------
55    /// y : array-like of shape (n_samples,)
56    ///     Target values.
57    ///
58    /// Returns
59    /// -------
60    /// self : returns an instance of self
61    ///     Fitted label encoder.
62    fn fit(&mut self, y: Vec<String>) -> PyResult<()> {
63        if y.is_empty() {
64            return Err(PyValueError::new_err("y cannot be empty"));
65        }
66
67        // Get unique classes and sort them
68        let mut classes: Vec<String> = y
69            .into_iter()
70            .collect::<std::collections::HashSet<_>>()
71            .into_iter()
72            .collect();
73        classes.sort();
74
75        // Create mapping from class to index
76        let class_to_index: HashMap<String, usize> = classes
77            .iter()
78            .enumerate()
79            .map(|(i, c)| (c.clone(), i))
80            .collect();
81
82        self.state = Some(LabelEncoderState {
83            classes,
84            class_to_index,
85        });
86
87        Ok(())
88    }
89
90    /// Fit label encoder and return encoded labels.
91    ///
92    /// Parameters
93    /// ----------
94    /// y : array-like of shape (n_samples,)
95    ///     Target values.
96    ///
97    /// Returns
98    /// -------
99    /// y : array-like of shape (n_samples,)
100    ///     Encoded labels.
101    fn fit_transform(&mut self, y: Vec<String>) -> PyResult<Vec<i64>> {
102        self.fit(y.clone())?;
103        self.transform(y)
104    }
105
106    /// Transform labels to normalized encoding.
107    ///
108    /// Parameters
109    /// ----------
110    /// y : array-like of shape (n_samples,)
111    ///     Target values.
112    ///
113    /// Returns
114    /// -------
115    /// y : array-like of shape (n_samples,)
116    ///     Labels as normalized encodings.
117    fn transform(&self, y: Vec<String>) -> PyResult<Vec<i64>> {
118        let state = self
119            .state
120            .as_ref()
121            .ok_or_else(|| PyValueError::new_err("LabelEncoder not fitted. Call fit() first."))?;
122
123        let mut encoded = Vec::with_capacity(y.len());
124
125        for label in y.iter() {
126            match state.class_to_index.get(label) {
127                Some(&index) => encoded.push(index as i64),
128                None => {
129                    return Err(PyValueError::new_err(format!(
130                        "Unknown label '{}'. Label encoder has only seen: {:?}",
131                        label, state.classes
132                    )));
133                }
134            }
135        }
136
137        Ok(encoded)
138    }
139
140    /// Transform labels back to original encoding.
141    ///
142    /// Parameters
143    /// ----------
144    /// y : array-like of shape (n_samples,)
145    ///     Target values.
146    ///
147    /// Returns
148    /// -------
149    /// y : array-like of shape (n_samples,)
150    ///     Original encoding.
151    fn inverse_transform(&self, y: Vec<i64>) -> PyResult<Vec<String>> {
152        let state = self
153            .state
154            .as_ref()
155            .ok_or_else(|| PyValueError::new_err("LabelEncoder not fitted. Call fit() first."))?;
156
157        let mut decoded = Vec::with_capacity(y.len());
158
159        for &index in y.iter() {
160            if index < 0 || index >= state.classes.len() as i64 {
161                return Err(PyValueError::new_err(format!(
162                    "Index {} is out of bounds for {} classes",
163                    index,
164                    state.classes.len()
165                )));
166            }
167            decoded.push(state.classes[index as usize].clone());
168        }
169
170        Ok(decoded)
171    }
172
173    /// Get the classes seen during fit
174    #[getter]
175    fn classes_(&self) -> PyResult<Vec<String>> {
176        let state = self
177            .state
178            .as_ref()
179            .ok_or_else(|| PyValueError::new_err("LabelEncoder not fitted. Call fit() first."))?;
180
181        Ok(state.classes.clone())
182    }
183
184    /// String representation
185    fn __repr__(&self) -> String {
186        "LabelEncoder()".to_string()
187    }
188}