1use ghostflow_core::Tensor;
4
/// Removes features whose (population) variance does not exceed a threshold.
///
/// Mirrors scikit-learn's `VarianceThreshold`: `fit` computes per-feature
/// variances over a `[n_samples, n_features]` tensor, and `transform` keeps
/// only the columns whose variance is strictly greater than `threshold`.
pub struct VarianceThreshold {
    /// Variance cut-off; features with variance <= this value are dropped.
    pub threshold: f32,
    // Per-feature variances computed by the last `fit`.
    variances_: Option<Vec<f32>>,
    // Boolean keep-mask over the input features (true = keep).
    mask_: Option<Vec<bool>>,
    // Number of features seen during `fit`.
    n_features_in_: usize,
}
12
13impl VarianceThreshold {
14 pub fn new(threshold: f32) -> Self {
15 VarianceThreshold {
16 threshold,
17 variances_: None,
18 mask_: None,
19 n_features_in_: 0,
20 }
21 }
22
23 pub fn fit(&mut self, x: &Tensor) {
24 let x_data = x.data_f32();
25 let n_samples = x.dims()[0];
26 let n_features = x.dims()[1];
27
28 self.n_features_in_ = n_features;
29
30 let variances: Vec<f32> = (0..n_features)
32 .map(|j| {
33 let mean: f32 = (0..n_samples)
34 .map(|i| x_data[i * n_features + j])
35 .sum::<f32>() / n_samples as f32;
36
37 (0..n_samples)
38 .map(|i| (x_data[i * n_features + j] - mean).powi(2))
39 .sum::<f32>() / n_samples as f32
40 })
41 .collect();
42
43 let mask: Vec<bool> = variances.iter().map(|&v| v > self.threshold).collect();
44
45 self.variances_ = Some(variances);
46 self.mask_ = Some(mask);
47 }
48
49 pub fn transform(&self, x: &Tensor) -> Tensor {
50 let x_data = x.data_f32();
51 let n_samples = x.dims()[0];
52 let n_features = x.dims()[1];
53
54 let mask = self.mask_.as_ref().expect("Model not fitted");
55 let n_selected: usize = mask.iter().filter(|&&m| m).count();
56
57 let mut result = Vec::with_capacity(n_samples * n_selected);
58
59 for i in 0..n_samples {
60 for (j, &keep) in mask.iter().enumerate() {
61 if keep {
62 result.push(x_data[i * n_features + j]);
63 }
64 }
65 }
66
67 Tensor::from_slice(&result, &[n_samples, n_selected]).unwrap()
68 }
69
70 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
71 self.fit(x);
72 self.transform(x)
73 }
74
75 pub fn get_support(&self) -> Vec<bool> {
76 self.mask_.clone().unwrap_or_default()
77 }
78}
79
/// Selects the `k` highest-scoring features according to a univariate
/// statistical test (see [`ScoreFunction`]).
pub struct SelectKBest {
    /// Number of features to keep (clamped to the feature count at fit time).
    pub k: usize,
    /// Univariate scoring function used to rank features.
    pub score_func: ScoreFunction,
    // Per-feature scores computed by the last `fit`.
    scores_: Option<Vec<f32>>,
    // Reserved for p-values; never populated by the current implementation.
    #[allow(dead_code)]
    pvalues_: Option<Vec<f32>>,
    // Boolean keep-mask over the input features (true = keep).
    mask_: Option<Vec<bool>>,
    // Number of features seen during `fit`.
    n_features_in_: usize,
}
90
/// Univariate scoring functions available to [`SelectKBest`].
#[derive(Clone, Copy, Debug)]
pub enum ScoreFunction {
    /// One-way ANOVA F-statistic (classification target).
    FClassif,
    /// Histogram-based mutual information (classification target).
    MutualInfoClassif,
    /// Chi-squared statistic on binarized features (classification target).
    Chi2,
    /// F-statistic of a univariate linear fit (regression target).
    FRegression,
    /// Histogram-based mutual information (regression target).
    MutualInfoRegression,
}
99
100impl SelectKBest {
101 pub fn new(k: usize, score_func: ScoreFunction) -> Self {
102 SelectKBest {
103 k,
104 score_func,
105 scores_: None,
106 pvalues_: None,
107 mask_: None,
108 n_features_in_: 0,
109 }
110 }
111
112 fn f_classif(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
113 let n_classes = y.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
114
115 (0..n_features)
116 .map(|j| {
117 let mut class_sums = vec![0.0f32; n_classes];
119 let mut class_counts = vec![0usize; n_classes];
120 let mut total_sum = 0.0f32;
121
122 for i in 0..n_samples {
123 let class = y[i] as usize;
124 let val = x[i * n_features + j];
125 class_sums[class] += val;
126 class_counts[class] += 1;
127 total_sum += val;
128 }
129
130 let overall_mean = total_sum / n_samples as f32;
131 let class_means: Vec<f32> = class_sums.iter()
132 .zip(class_counts.iter())
133 .map(|(&sum, &count)| if count > 0 { sum / count as f32 } else { 0.0 })
134 .collect();
135
136 let ss_between: f32 = class_means.iter()
138 .zip(class_counts.iter())
139 .map(|(&mean, &count)| count as f32 * (mean - overall_mean).powi(2))
140 .sum();
141
142 let mut ss_within = 0.0f32;
144 for i in 0..n_samples {
145 let class = y[i] as usize;
146 let val = x[i * n_features + j];
147 ss_within += (val - class_means[class]).powi(2);
148 }
149
150 let df_between = (n_classes - 1) as f32;
152 let df_within = (n_samples - n_classes) as f32;
153
154 if ss_within > 1e-10 && df_within > 0.0 {
155 (ss_between / df_between) / (ss_within / df_within)
156 } else {
157 0.0
158 }
159 })
160 .collect()
161 }
162
163 fn chi2(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
164 let n_classes = y.iter().map(|&v| v as usize).max().unwrap_or(0) + 1;
165
166 (0..n_features)
167 .map(|j| {
168 let mut observed = vec![vec![0.0f32; 2]; n_classes]; let mut class_totals = vec![0.0f32; n_classes];
171 let mut feature_totals = [0.0f32; 2];
172
173 for i in 0..n_samples {
174 let class = y[i] as usize;
175 let val = if x[i * n_features + j] > 0.0 { 1 } else { 0 };
176 observed[class][val] += 1.0;
177 class_totals[class] += 1.0;
178 feature_totals[val] += 1.0;
179 }
180
181 let total = n_samples as f32;
182 let mut chi2 = 0.0f32;
183
184 for c in 0..n_classes {
185 for f in 0..2 {
186 let expected = class_totals[c] * feature_totals[f] / total;
187 if expected > 0.0 {
188 chi2 += (observed[c][f] - expected).powi(2) / expected;
189 }
190 }
191 }
192
193 chi2
194 })
195 .collect()
196 }
197
198 fn f_regression(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize) -> Vec<f32> {
199 let y_mean: f32 = y.iter().sum::<f32>() / n_samples as f32;
200 let ss_tot: f32 = y.iter().map(|&yi| (yi - y_mean).powi(2)).sum();
201
202 (0..n_features)
203 .map(|j| {
204 let x_mean: f32 = (0..n_samples)
206 .map(|i| x[i * n_features + j])
207 .sum::<f32>() / n_samples as f32;
208
209 let mut cov = 0.0f32;
210 let mut var_x = 0.0f32;
211
212 for i in 0..n_samples {
213 let xi = x[i * n_features + j] - x_mean;
214 let yi = y[i] - y_mean;
215 cov += xi * yi;
216 var_x += xi * xi;
217 }
218
219 if var_x < 1e-10 {
220 return 0.0;
221 }
222
223 let slope = cov / var_x;
224 let intercept = y_mean - slope * x_mean;
225
226 let ss_res: f32 = (0..n_samples)
228 .map(|i| {
229 let pred = slope * x[i * n_features + j] + intercept;
230 (y[i] - pred).powi(2)
231 })
232 .sum();
233
234 let r2 = 1.0 - ss_res / ss_tot.max(1e-10);
235 let df1 = 1.0f32;
236 let df2 = (n_samples - 2) as f32;
237
238 if r2 < 1.0 && df2 > 0.0 {
239 (r2 / df1) / ((1.0 - r2) / df2)
240 } else {
241 0.0
242 }
243 })
244 .collect()
245 }
246
247 fn mutual_info(&self, x: &[f32], y: &[f32], n_samples: usize, n_features: usize, is_classification: bool) -> Vec<f32> {
248 let n_bins = 10;
250
251 (0..n_features)
252 .map(|j| {
253 let feature_vals: Vec<f32> = (0..n_samples)
254 .map(|i| x[i * n_features + j])
255 .collect();
256
257 let min_val = feature_vals.iter().cloned().fold(f32::INFINITY, f32::min);
258 let max_val = feature_vals.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
259 let range = (max_val - min_val).max(1e-10);
260
261 let x_binned: Vec<usize> = feature_vals.iter()
263 .map(|&v| (((v - min_val) / range * (n_bins - 1) as f32) as usize).min(n_bins - 1))
264 .collect();
265
266 let y_binned: Vec<usize> = if is_classification {
268 y.iter().map(|&v| v as usize).collect()
269 } else {
270 let y_min = y.iter().cloned().fold(f32::INFINITY, f32::min);
271 let y_max = y.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
272 let y_range = (y_max - y_min).max(1e-10);
273 y.iter()
274 .map(|&v| (((v - y_min) / y_range * (n_bins - 1) as f32) as usize).min(n_bins - 1))
275 .collect()
276 };
277
278 let n_y_bins = if is_classification {
279 y.iter().map(|&v| v as usize).max().unwrap_or(0) + 1
280 } else {
281 n_bins
282 };
283
284 let mut joint = vec![vec![0.0f32; n_y_bins]; n_bins];
286 let mut p_x = vec![0.0f32; n_bins];
287 let mut p_y = vec![0.0f32; n_y_bins];
288
289 for i in 0..n_samples {
290 joint[x_binned[i]][y_binned[i]] += 1.0;
291 p_x[x_binned[i]] += 1.0;
292 p_y[y_binned[i]] += 1.0;
293 }
294
295 let n = n_samples as f32;
296 for p in &mut p_x { *p /= n; }
297 for p in &mut p_y { *p /= n; }
298 for row in &mut joint {
299 for p in row { *p /= n; }
300 }
301
302 let mut mi = 0.0f32;
304 for xi in 0..n_bins {
305 for yi in 0..n_y_bins {
306 if joint[xi][yi] > 1e-10 && p_x[xi] > 1e-10 && p_y[yi] > 1e-10 {
307 mi += joint[xi][yi] * (joint[xi][yi] / (p_x[xi] * p_y[yi])).ln();
308 }
309 }
310 }
311
312 mi.max(0.0)
313 })
314 .collect()
315 }
316
317 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
318 let x_data = x.data_f32();
319 let y_data = y.data_f32();
320 let n_samples = x.dims()[0];
321 let n_features = x.dims()[1];
322
323 self.n_features_in_ = n_features;
324
325 let scores = match self.score_func {
326 ScoreFunction::FClassif => self.f_classif(&x_data, &y_data, n_samples, n_features),
327 ScoreFunction::Chi2 => self.chi2(&x_data, &y_data, n_samples, n_features),
328 ScoreFunction::FRegression => self.f_regression(&x_data, &y_data, n_samples, n_features),
329 ScoreFunction::MutualInfoClassif => self.mutual_info(&x_data, &y_data, n_samples, n_features, true),
330 ScoreFunction::MutualInfoRegression => self.mutual_info(&x_data, &y_data, n_samples, n_features, false),
331 };
332
333 let mut indexed_scores: Vec<(usize, f32)> = scores.iter().cloned().enumerate().collect();
335 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
336
337 let k = self.k.min(n_features);
338 let selected_indices: Vec<usize> = indexed_scores.iter().take(k).map(|(i, _)| *i).collect();
339
340 let mut mask = vec![false; n_features];
341 for &idx in &selected_indices {
342 mask[idx] = true;
343 }
344
345 self.scores_ = Some(scores);
346 self.mask_ = Some(mask);
347 }
348
349 pub fn transform(&self, x: &Tensor) -> Tensor {
350 let x_data = x.data_f32();
351 let n_samples = x.dims()[0];
352 let n_features = x.dims()[1];
353
354 let mask = self.mask_.as_ref().expect("Model not fitted");
355 let n_selected: usize = mask.iter().filter(|&&m| m).count();
356
357 let mut result = Vec::with_capacity(n_samples * n_selected);
358
359 for i in 0..n_samples {
360 for (j, &keep) in mask.iter().enumerate() {
361 if keep {
362 result.push(x_data[i * n_features + j]);
363 }
364 }
365 }
366
367 Tensor::from_slice(&result, &[n_samples, n_selected]).unwrap()
368 }
369
370 pub fn fit_transform(&mut self, x: &Tensor, y: &Tensor) -> Tensor {
371 self.fit(x, y);
372 self.transform(x)
373 }
374
375 pub fn get_support(&self) -> Vec<bool> {
376 self.mask_.clone().unwrap_or_default()
377 }
378
379 pub fn scores(&self) -> Vec<f32> {
380 self.scores_.clone().unwrap_or_default()
381 }
382}
383
/// Recursive feature elimination: repeatedly drops the least important
/// features (by absolute Pearson correlation with the target) until only
/// `n_features_to_select` remain.
pub struct RFE {
    /// Number of features to retain.
    pub n_features_to_select: usize,
    /// Number of features removed per elimination round (minimum 1).
    pub step: usize,
    // Keep-mask over the input features after fitting (true = keep).
    support_: Option<Vec<bool>>,
    // Elimination ranking: kept features stay 1; a feature removed in the
    // r-th elimination round gets rank r + 1 (so earlier removals get
    // LOWER ranks — note this is inverted relative to sklearn's RFE).
    ranking_: Option<Vec<usize>>,
    // Number of features seen during `fit`.
    n_features_in_: usize,
}
392
393impl RFE {
394 pub fn new(n_features_to_select: usize) -> Self {
395 RFE {
396 n_features_to_select,
397 step: 1,
398 support_: None,
399 ranking_: None,
400 n_features_in_: 0,
401 }
402 }
403
404 pub fn step(mut self, step: usize) -> Self {
405 self.step = step.max(1);
406 self
407 }
408
409 pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
411 let x_data = x.data_f32();
412 let y_data = y.data_f32();
413 let n_samples = x.dims()[0];
414 let n_features = x.dims()[1];
415
416 self.n_features_in_ = n_features;
417
418 let mut mask = vec![true; n_features];
419 let mut ranking = vec![1usize; n_features];
420 let mut current_rank = 1;
421
422 while mask.iter().filter(|&&m| m).count() > self.n_features_to_select {
423 let importances: Vec<(usize, f32)> = (0..n_features)
425 .filter(|&j| mask[j])
426 .map(|j| {
427 let x_mean: f32 = (0..n_samples)
428 .map(|i| x_data[i * n_features + j])
429 .sum::<f32>() / n_samples as f32;
430 let y_mean: f32 = y_data.iter().sum::<f32>() / n_samples as f32;
431
432 let mut cov = 0.0f32;
433 let mut var_x = 0.0f32;
434 let mut var_y = 0.0f32;
435
436 for i in 0..n_samples {
437 let xi = x_data[i * n_features + j] - x_mean;
438 let yi = y_data[i] - y_mean;
439 cov += xi * yi;
440 var_x += xi * xi;
441 var_y += yi * yi;
442 }
443
444 let corr = if var_x > 1e-10 && var_y > 1e-10 {
445 (cov / (var_x.sqrt() * var_y.sqrt())).abs()
446 } else {
447 0.0
448 };
449
450 (j, corr)
451 })
452 .collect();
453
454 let mut sorted_importances = importances.clone();
456 sorted_importances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
457
458 let n_to_remove = self.step.min(mask.iter().filter(|&&m| m).count() - self.n_features_to_select);
459
460 current_rank += 1;
461 for (idx, _) in sorted_importances.iter().take(n_to_remove) {
462 mask[*idx] = false;
463 ranking[*idx] = current_rank;
464 }
465 }
466
467 self.support_ = Some(mask);
468 self.ranking_ = Some(ranking);
469 }
470
471 pub fn transform(&self, x: &Tensor) -> Tensor {
472 let x_data = x.data_f32();
473 let n_samples = x.dims()[0];
474 let n_features = x.dims()[1];
475
476 let mask = self.support_.as_ref().expect("Model not fitted");
477 let n_selected: usize = mask.iter().filter(|&&m| m).count();
478
479 let mut result = Vec::with_capacity(n_samples * n_selected);
480
481 for i in 0..n_samples {
482 for (j, &keep) in mask.iter().enumerate() {
483 if keep {
484 result.push(x_data[i * n_features + j]);
485 }
486 }
487 }
488
489 Tensor::from_slice(&result, &[n_samples, n_selected]).unwrap()
490 }
491
492 pub fn fit_transform(&mut self, x: &Tensor, y: &Tensor) -> Tensor {
493 self.fit(x, y);
494 self.transform(x)
495 }
496
497 pub fn get_support(&self) -> Vec<bool> {
498 self.support_.clone().unwrap_or_default()
499 }
500
501 pub fn ranking(&self) -> Vec<usize> {
502 self.ranking_.clone().unwrap_or_default()
503 }
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Columns 0 and 2 are constant (zero variance); with threshold 0.0
    /// only the middle column survives.
    #[test]
    fn test_variance_threshold() {
        let x = Tensor::from_slice(&[
            0.0, 1.0, 0.0,
            0.0, 2.0, 0.0,
            0.0, 3.0, 0.0,
            0.0, 4.0, 0.0,
        ], &[4, 3]).unwrap();

        let mut vt = VarianceThreshold::new(0.0);
        let transformed = vt.fit_transform(&x);

        // Exactly one column has variance > 0.0 (the old `<= 3` assertion
        // was vacuously true).
        assert_eq!(transformed.dims()[1], 1);
        assert_eq!(vt.get_support(), vec![false, true, false]);
    }

    /// The constant middle column scores 0 under F-classif and must not be
    /// among the top-2 features.
    #[test]
    fn test_select_k_best() {
        let x = Tensor::from_slice(&[
            1.0, 0.0, 2.0,
            2.0, 0.0, 4.0,
            3.0, 0.0, 6.0,
            4.0, 0.0, 8.0,
        ], &[4, 3]).unwrap();

        let y = Tensor::from_slice(&[0.0, 0.0, 1.0, 1.0], &[4]).unwrap();

        let mut skb = SelectKBest::new(2, ScoreFunction::FClassif);
        let transformed = skb.fit_transform(&x, &y);

        assert_eq!(transformed.dims()[1], 2);
        assert_eq!(skb.get_support(), vec![true, false, true]);
    }

    /// Constant columns 1 and 3 have zero correlation with `y` and are
    /// eliminated first.
    #[test]
    fn test_rfe() {
        let x = Tensor::from_slice(&[
            1.0, 0.0, 2.0, 0.5,
            2.0, 0.0, 4.0, 0.5,
            3.0, 0.0, 6.0, 0.5,
            4.0, 0.0, 8.0, 0.5,
        ], &[4, 4]).unwrap();

        let y = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0], &[4]).unwrap();

        let mut rfe = RFE::new(2);
        let transformed = rfe.fit_transform(&x, &y);

        assert_eq!(transformed.dims()[1], 2);
        assert_eq!(rfe.get_support(), vec![true, false, true, false]);
    }
}