sklears_python/preprocessing/label_encoder.rs
1//! Python bindings for LabelEncoder
2//!
3//! This module provides Python bindings for LabelEncoder,
4//! offering scikit-learn compatible label encoding for categorical features.
5
6use pyo3::exceptions::PyValueError;
7use pyo3::prelude::*;
8use std::collections::HashMap;
9
/// Everything a `LabelEncoder` remembers once it has been fitted.
#[derive(Clone, Debug)]
struct LabelEncoderState {
    /// Class labels; a label's position in this vector is its integer code.
    classes: Vec<String>,
    /// Lookup table from a class label to its integer code.
    class_to_index: HashMap<String, usize>,
}
16
17/// Encode target labels with value between 0 and n_classes-1.
18///
19/// This transformer should be used to encode target values, *i.e.* `y`, and
20/// not the input `X`.
21///
22/// Attributes
23/// ----------
24/// classes_ : list of shape (n_classes,)
25/// Holds the label for each class.
26///
27/// Examples
28/// --------
29/// >>> from sklears_python import LabelEncoder
30/// >>> le = LabelEncoder()
31/// >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
32/// LabelEncoder()
33/// >>> list(le.classes_)
34/// ['amsterdam', 'paris', 'tokyo']
35/// >>> le.transform(["tokyo", "tokyo", "paris"])
36/// [2, 2, 1]
37/// >>> list(le.inverse_transform([2, 2, 1]))
38/// ['tokyo', 'tokyo', 'paris']
39#[pyclass(name = "LabelEncoder")]
40pub struct PyLabelEncoder {
41 state: Option<LabelEncoderState>,
42}
43
44#[pymethods]
45impl PyLabelEncoder {
46 #[new]
47 fn new() -> Self {
48 Self { state: None }
49 }
50
51 /// Fit label encoder.
52 ///
53 /// Parameters
54 /// ----------
55 /// y : array-like of shape (n_samples,)
56 /// Target values.
57 ///
58 /// Returns
59 /// -------
60 /// self : returns an instance of self
61 /// Fitted label encoder.
62 fn fit(&mut self, y: Vec<String>) -> PyResult<()> {
63 if y.is_empty() {
64 return Err(PyValueError::new_err("y cannot be empty"));
65 }
66
67 // Get unique classes and sort them
68 let mut classes: Vec<String> = y
69 .into_iter()
70 .collect::<std::collections::HashSet<_>>()
71 .into_iter()
72 .collect();
73 classes.sort();
74
75 // Create mapping from class to index
76 let class_to_index: HashMap<String, usize> = classes
77 .iter()
78 .enumerate()
79 .map(|(i, c)| (c.clone(), i))
80 .collect();
81
82 self.state = Some(LabelEncoderState {
83 classes,
84 class_to_index,
85 });
86
87 Ok(())
88 }
89
90 /// Fit label encoder and return encoded labels.
91 ///
92 /// Parameters
93 /// ----------
94 /// y : array-like of shape (n_samples,)
95 /// Target values.
96 ///
97 /// Returns
98 /// -------
99 /// y : array-like of shape (n_samples,)
100 /// Encoded labels.
101 fn fit_transform(&mut self, y: Vec<String>) -> PyResult<Vec<i64>> {
102 self.fit(y.clone())?;
103 self.transform(y)
104 }
105
106 /// Transform labels to normalized encoding.
107 ///
108 /// Parameters
109 /// ----------
110 /// y : array-like of shape (n_samples,)
111 /// Target values.
112 ///
113 /// Returns
114 /// -------
115 /// y : array-like of shape (n_samples,)
116 /// Labels as normalized encodings.
117 fn transform(&self, y: Vec<String>) -> PyResult<Vec<i64>> {
118 let state = self
119 .state
120 .as_ref()
121 .ok_or_else(|| PyValueError::new_err("LabelEncoder not fitted. Call fit() first."))?;
122
123 let mut encoded = Vec::with_capacity(y.len());
124
125 for label in y.iter() {
126 match state.class_to_index.get(label) {
127 Some(&index) => encoded.push(index as i64),
128 None => {
129 return Err(PyValueError::new_err(format!(
130 "Unknown label '{}'. Label encoder has only seen: {:?}",
131 label, state.classes
132 )));
133 }
134 }
135 }
136
137 Ok(encoded)
138 }
139
140 /// Transform labels back to original encoding.
141 ///
142 /// Parameters
143 /// ----------
144 /// y : array-like of shape (n_samples,)
145 /// Target values.
146 ///
147 /// Returns
148 /// -------
149 /// y : array-like of shape (n_samples,)
150 /// Original encoding.
151 fn inverse_transform(&self, y: Vec<i64>) -> PyResult<Vec<String>> {
152 let state = self
153 .state
154 .as_ref()
155 .ok_or_else(|| PyValueError::new_err("LabelEncoder not fitted. Call fit() first."))?;
156
157 let mut decoded = Vec::with_capacity(y.len());
158
159 for &index in y.iter() {
160 if index < 0 || index >= state.classes.len() as i64 {
161 return Err(PyValueError::new_err(format!(
162 "Index {} is out of bounds for {} classes",
163 index,
164 state.classes.len()
165 )));
166 }
167 decoded.push(state.classes[index as usize].clone());
168 }
169
170 Ok(decoded)
171 }
172
173 /// Get the classes seen during fit
174 #[getter]
175 fn classes_(&self) -> PyResult<Vec<String>> {
176 let state = self
177 .state
178 .as_ref()
179 .ok_or_else(|| PyValueError::new_err("LabelEncoder not fitted. Call fit() first."))?;
180
181 Ok(state.classes.clone())
182 }
183
184 /// String representation
185 fn __repr__(&self) -> String {
186 "LabelEncoder()".to_string()
187 }
188}