1use ghostflow_core::Tensor;
6use std::collections::HashMap;
7use std::hash::{Hash, Hasher};
8use std::collections::hash_map::DefaultHasher;
9
/// Generates polynomial terms (and optionally only interaction terms) from
/// the columns of a 2-D input tensor.
pub struct PolynomialFeatures {
    /// Maximum total degree of the generated terms.
    pub degree: usize,
    /// When true, only products of *distinct* features are generated
    /// (no powers such as x^2).
    pub interaction_only: bool,
    /// When true, a constant 1.0 column is prepended to the output.
    pub include_bias: bool,
    // Input column count, recorded by `fit`.
    n_input_features: usize,
    // Expected output column count, computed by `fit`.
    n_output_features: usize,
}
21
22impl PolynomialFeatures {
23 pub fn new(degree: usize) -> Self {
24 Self {
25 degree,
26 interaction_only: false,
27 include_bias: true,
28 n_input_features: 0,
29 n_output_features: 0,
30 }
31 }
32
33 pub fn interaction_only(mut self, value: bool) -> Self {
34 self.interaction_only = value;
35 self
36 }
37
38 pub fn include_bias(mut self, value: bool) -> Self {
39 self.include_bias = value;
40 self
41 }
42
43 pub fn fit(&mut self, x: &Tensor) {
45 self.n_input_features = x.dims()[1];
46 self.n_output_features = self.calculate_n_output_features();
47 }
48
49 fn calculate_n_output_features(&self) -> usize {
50 let n = self.n_input_features;
51 let d = self.degree;
52
53 let mut count = if self.include_bias { 1 } else { 0 };
54
55 if self.interaction_only {
56 for degree in 1..=d {
58 count += Self::n_combinations(n, degree);
59 }
60 } else {
61 count += Self::n_combinations_with_replacement(n + d, d) - 1;
63 if !self.include_bias {
64 count -= 1;
65 }
66 }
67
68 count
69 }
70
71 fn n_combinations(n: usize, k: usize) -> usize {
72 if k > n {
73 return 0;
74 }
75 let mut result = 1;
76 for i in 0..k {
77 result = result * (n - i) / (i + 1);
78 }
79 result
80 }
81
82 fn n_combinations_with_replacement(n: usize, k: usize) -> usize {
83 Self::n_combinations(n + k - 1, k)
84 }
85
86 pub fn transform(&self, x: &Tensor) -> Tensor {
88 let n_samples = x.dims()[0];
89 let x_data = x.data_f32();
90
91 let mut all_features = Vec::new();
92 let mut actual_n_features = 0;
93
94 for i in 0..n_samples {
95 let sample = &x_data[i * self.n_input_features..(i + 1) * self.n_input_features];
96 let poly_features = self.generate_polynomial_features(sample);
97 if i == 0 {
98 actual_n_features = poly_features.len();
99 }
100 all_features.extend(poly_features);
101 }
102
103 Tensor::from_slice(&all_features, &[n_samples, actual_n_features]).unwrap()
104 }
105
106 fn generate_polynomial_features(&self, sample: &[f32]) -> Vec<f32> {
107 let mut features = Vec::new();
108
109 if self.include_bias {
110 features.push(1.0);
111 }
112
113 self.generate_combinations(sample, &mut features, &mut Vec::new(), 0, 0);
115
116 features
117 }
118
119 fn generate_combinations(
120 &self,
121 sample: &[f32],
122 features: &mut Vec<f32>,
123 current: &mut Vec<usize>,
124 start: usize,
125 current_degree: usize,
126 ) {
127 if current_degree > 0 {
128 let mut product = 1.0;
130 for &idx in current.iter() {
131 product *= sample[idx];
132 }
133 features.push(product);
134 }
135
136 if current_degree >= self.degree {
137 return;
138 }
139
140 for i in start..self.n_input_features {
141 current.push(i);
142
143 let next_start = if self.interaction_only { i + 1 } else { i };
144 self.generate_combinations(sample, features, current, next_start, current_degree + 1);
145
146 current.pop();
147 }
148 }
149
150 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
151 self.fit(x);
152 self.transform(x)
153 }
154}
155
/// Maps string features into a fixed-width vector with the hashing trick;
/// no fitted vocabulary is required.
pub struct FeatureHasher {
    /// Width of the output vector (number of hash buckets).
    pub n_features: usize,
    /// When true, a hash-derived sign of -1.0 is applied to some features,
    /// so colliding features may cancel instead of always accumulating.
    pub alternate_sign: bool,
}
164
165impl FeatureHasher {
166 pub fn new(n_features: usize) -> Self {
167 Self {
168 n_features,
169 alternate_sign: true,
170 }
171 }
172
173 pub fn alternate_sign(mut self, value: bool) -> Self {
174 self.alternate_sign = value;
175 self
176 }
177
178 pub fn transform_strings(&self, features: &[Vec<String>]) -> Tensor {
180 let n_samples = features.len();
181 let mut output = vec![0.0f32; n_samples * self.n_features];
182
183 for (i, sample_features) in features.iter().enumerate() {
184 for feature in sample_features {
185 let hash = self.hash_feature(feature);
186 let idx = (hash % self.n_features as u64) as usize;
187 let sign = if self.alternate_sign && (hash / self.n_features as u64) % 2 == 1 {
188 -1.0
189 } else {
190 1.0
191 };
192 output[i * self.n_features + idx] += sign;
193 }
194 }
195
196 Tensor::from_slice(&output, &[n_samples, self.n_features]).unwrap()
197 }
198
199 pub fn transform_pairs(&self, features: &[Vec<(String, f32)>]) -> Tensor {
201 let n_samples = features.len();
202 let mut output = vec![0.0f32; n_samples * self.n_features];
203
204 for (i, sample_features) in features.iter().enumerate() {
205 for (feature, value) in sample_features {
206 let hash = self.hash_feature(feature);
207 let idx = (hash % self.n_features as u64) as usize;
208 let sign = if self.alternate_sign && (hash / self.n_features as u64) % 2 == 1 {
209 -1.0
210 } else {
211 1.0
212 };
213 output[i * self.n_features + idx] += sign * value;
214 }
215 }
216
217 Tensor::from_slice(&output, &[n_samples, self.n_features]).unwrap()
218 }
219
220 fn hash_feature(&self, feature: &str) -> u64 {
221 let mut hasher = DefaultHasher::new();
222 feature.hash(&mut hasher);
223 hasher.finish()
224 }
225}
226
/// Encodes categorical values as the smoothed mean of the target variable,
/// blending each category's mean with the global target mean.
pub struct TargetEncoder {
    /// Strength of the pull toward the global mean (larger = more shrinkage).
    pub smoothing: f32,
    /// Categories seen fewer than this many times are not encoded and fall
    /// back to the global mean at transform time.
    pub min_samples_leaf: usize,
    // Learned category -> smoothed target mean.
    encodings: HashMap<String, f32>,
    // Mean of the full target vector; fallback for unseen/rare categories.
    global_mean: f32,
}

impl Default for TargetEncoder {
    fn default() -> Self {
        Self::new()
    }
}

impl TargetEncoder {
    /// Creates an encoder with smoothing 1.0 and no minimum-count filtering.
    pub fn new() -> Self {
        Self {
            smoothing: 1.0,
            min_samples_leaf: 1,
            encodings: HashMap::new(),
            global_mean: 0.0,
        }
    }

    /// Builder: set the smoothing strength.
    pub fn smoothing(mut self, value: f32) -> Self {
        self.smoothing = value;
        self
    }

    /// Builder: set the minimum occurrences a category needs to be encoded.
    pub fn min_samples_leaf(mut self, value: usize) -> Self {
        self.min_samples_leaf = value;
        self
    }

    /// Learns the smoothed per-category encodings from `categories`/`target`.
    ///
    /// # Panics
    /// Panics if `categories` and `target` have different lengths.
    pub fn fit(&mut self, categories: &[String], target: &[f32]) {
        assert_eq!(categories.len(), target.len());

        // Refitting must not retain encodings from a previous fit.
        self.encodings.clear();

        // Empty input: avoid 0/0 producing a NaN global mean.
        if target.is_empty() {
            self.global_mean = 0.0;
            return;
        }

        self.global_mean = target.iter().sum::<f32>() / target.len() as f32;

        // Accumulate per-category (sum, count).
        let mut category_stats: HashMap<String, (f32, usize)> = HashMap::new();
        for (cat, &tgt) in categories.iter().zip(target.iter()) {
            let entry = category_stats.entry(cat.clone()).or_insert((0.0, 0));
            entry.0 += tgt;
            entry.1 += 1;
        }

        for (category, (sum, count)) in category_stats {
            if count >= self.min_samples_leaf {
                let category_mean = sum / count as f32;
                // Shrink the category mean toward the global mean; the pull
                // weakens as the category's sample count grows.
                let smoothed = (count as f32 * category_mean + self.smoothing * self.global_mean)
                    / (count as f32 + self.smoothing);
                self.encodings.insert(category, smoothed);
            }
        }
    }

    /// Maps each category to its learned encoding; unseen or filtered-out
    /// categories get the global target mean.
    pub fn transform(&self, categories: &[String]) -> Vec<f32> {
        categories
            .iter()
            .map(|cat| {
                *self.encodings.get(cat).unwrap_or(&self.global_mean)
            })
            .collect()
    }

    /// Convenience: `fit` followed by `transform` on the same data.
    pub fn fit_transform(&mut self, categories: &[String], target: &[f32]) -> Vec<f32> {
        self.fit(categories, target);
        self.transform(categories)
    }
}
301
/// One-hot encodes columns of categorical string data.
pub struct OneHotEncoder {
    // Sorted unique categories per input column, learned by `fit`.
    categories: Vec<Vec<String>>,
    // Total output width: sum of category counts over all columns.
    n_features: usize,
}
309
310impl OneHotEncoder {
311 pub fn new() -> Self {
312 Self {
313 categories: Vec::new(),
314 n_features: 0,
315 }
316 }
317
318 pub fn fit(&mut self, data: &[Vec<String>]) {
320 if data.is_empty() {
321 return;
322 }
323
324 let n_cols = data[0].len();
325 self.categories = vec![Vec::new(); n_cols];
326
327 for sample in data {
329 for (col_idx, value) in sample.iter().enumerate() {
330 if !self.categories[col_idx].contains(value) {
331 self.categories[col_idx].push(value.clone());
332 }
333 }
334 }
335
336 for cats in &mut self.categories {
338 cats.sort();
339 }
340
341 self.n_features = self.categories.iter().map(|cats| cats.len()).sum();
343 }
344
345 pub fn transform(&self, data: &[Vec<String>]) -> Tensor {
347 let n_samples = data.len();
348 let mut output = vec![0.0f32; n_samples * self.n_features];
349
350 for (sample_idx, sample) in data.iter().enumerate() {
351 let mut feature_offset = 0;
352
353 for (col_idx, value) in sample.iter().enumerate() {
354 if let Some(cat_idx) = self.categories[col_idx].iter().position(|c| c == value) {
355 let output_idx = sample_idx * self.n_features + feature_offset + cat_idx;
356 output[output_idx] = 1.0;
357 }
358 feature_offset += self.categories[col_idx].len();
359 }
360 }
361
362 Tensor::from_slice(&output, &[n_samples, self.n_features]).unwrap()
363 }
364
365 pub fn fit_transform(&mut self, data: &[Vec<String>]) -> Tensor {
366 self.fit(data);
367 self.transform(data)
368 }
369
370 pub fn get_feature_names(&self) -> Vec<String> {
372 let mut names = Vec::new();
373
374 for (col_idx, cats) in self.categories.iter().enumerate() {
375 for cat in cats {
376 names.push(format!("col{}_{}", col_idx, cat));
377 }
378 }
379
380 names
381 }
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_polynomial_features() {
        let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0], &[2, 2]).unwrap();

        let mut poly = PolynomialFeatures::new(2);
        let transformed = poly.fit_transform(&x);

        // degree 2, 2 inputs, with bias: 1, x1, x1^2, x1*x2, x2, x2^2 -> 6 columns.
        assert_eq!(transformed.dims(), &[2, 6]);
    }

    #[test]
    fn test_feature_hasher() {
        let features = vec![
            vec!["feature1".to_string(), "feature2".to_string()],
            vec!["feature3".to_string()],
        ];

        let hasher = FeatureHasher::new(10);
        let hashed = hasher.transform_strings(&features);

        // Output shape is (n_samples, n_features) regardless of token count.
        assert_eq!(hashed.dims(), &[2, 10]);
    }

    #[test]
    fn test_target_encoder() {
        let categories = vec![
            "A".to_string(),
            "B".to_string(),
            "A".to_string(),
            "B".to_string(),
        ];
        let target = vec![1.0, 0.0, 1.0, 0.0];

        let mut encoder = TargetEncoder::new();
        let encoded = encoder.fit_transform(&categories, &target);

        assert_eq!(encoded.len(), 4);
        // global mean 0.5; A -> (2*1.0 + 0.5)/3, B -> (2*0.0 + 0.5)/3.
        assert!((encoded[0] - 2.5 / 3.0).abs() < 1e-6);
        assert!((encoded[1] - 0.5 / 3.0).abs() < 1e-6);
    }

    #[test]
    fn test_one_hot_encoder() {
        let data = vec![
            vec!["A".to_string(), "X".to_string()],
            vec!["B".to_string(), "Y".to_string()],
            vec!["A".to_string(), "X".to_string()],
        ];

        let mut encoder = OneHotEncoder::new();
        let encoded = encoder.fit_transform(&data);

        // Columns are [col0_A, col0_B, col1_X, col1_Y].
        assert_eq!(encoded.dims(), &[3, 4]);
        let values = encoded.data_f32();
        assert_eq!(&values[0..4], &[1.0, 0.0, 1.0, 0.0]);
        assert_eq!(&values[4..8], &[0.0, 1.0, 0.0, 1.0]);
        assert_eq!(
            encoder.get_feature_names(),
            vec!["col0_A", "col0_B", "col1_X", "col1_Y"]
        );
    }
}
444
445