// ghostflow_ml/imbalanced.rs

//! Imbalanced Learning - SMOTE, Random Over/Under Sampling

use ghostflow_core::Tensor;
use rand::prelude::*;
/// Random Over Sampler - duplicates minority class samples
pub struct RandomOverSampler {
    /// Policy that determines the per-class target counts.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible resampling; `None` uses OS entropy.
    pub random_state: Option<u64>,
}
/// How per-class resampling targets are chosen.
///
/// `Debug` and `PartialEq` are derived so strategies can be logged and
/// compared in tests, consistent with idiomatic public Rust types.
#[derive(Debug, Clone, PartialEq)]
pub enum SamplingStrategy {
    /// Balance all classes to the majority class count
    Auto,
    /// Specific ratio of minority to majority
    Ratio(f32),
    /// Target count for each class
    Counts(Vec<(usize, usize)>),
}
22impl RandomOverSampler {
23    pub fn new() -> Self {
24        RandomOverSampler {
25            sampling_strategy: SamplingStrategy::Auto,
26            random_state: None,
27        }
28    }
29
30    pub fn sampling_strategy(mut self, s: SamplingStrategy) -> Self {
31        self.sampling_strategy = s;
32        self
33    }
34
35    pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
36        let x_data = x.data_f32();
37        let y_data = y.data_f32();
38        let _n_samples = x.dims()[0];
39        let n_features = x.dims()[1];
40
41        // Count classes
42        let mut class_counts: std::collections::HashMap<i32, Vec<usize>> = std::collections::HashMap::new();
43        for (i, &label) in y_data.iter().enumerate() {
44            class_counts.entry(label as i32).or_default().push(i);
45        }
46
47        let max_count = class_counts.values().map(|v| v.len()).max().unwrap_or(0);
48
49        let mut rng = match self.random_state {
50            Some(seed) => StdRng::seed_from_u64(seed),
51            None => StdRng::from_entropy(),
52        };
53
54        let mut new_x = x_data.clone();
55        let mut new_y = y_data.clone();
56
57        for (class, indices) in &class_counts {
58            let target_count = match &self.sampling_strategy {
59                SamplingStrategy::Auto => max_count,
60                SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
61                SamplingStrategy::Counts(counts) => {
62                    counts.iter().find(|(c, _)| *c == *class as usize)
63                        .map(|(_, count)| *count)
64                        .unwrap_or(indices.len())
65                }
66            };
67
68            let n_to_add = target_count.saturating_sub(indices.len());
69            
70            for _ in 0..n_to_add {
71                let idx = indices[rng.gen_range(0..indices.len())];
72                new_x.extend_from_slice(&x_data[idx * n_features..(idx + 1) * n_features]);
73                new_y.push(*class as f32);
74            }
75        }
76
77        let new_n_samples = new_y.len();
78        (
79            Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
80            Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
81        )
82    }
83}
84
85impl Default for RandomOverSampler {
86    fn default() -> Self { Self::new() }
87}
88
/// Random Under Sampler - removes majority class samples
pub struct RandomUnderSampler {
    /// Policy that determines the per-class target counts.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible resampling; `None` uses OS entropy.
    pub random_state: Option<u64>,
    /// Whether selection should be done with replacement.
    pub replacement: bool,
}
96impl RandomUnderSampler {
97    pub fn new() -> Self {
98        RandomUnderSampler {
99            sampling_strategy: SamplingStrategy::Auto,
100            random_state: None,
101            replacement: false,
102        }
103    }
104
105    pub fn sampling_strategy(mut self, s: SamplingStrategy) -> Self {
106        self.sampling_strategy = s;
107        self
108    }
109
110    pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
111        let x_data = x.data_f32();
112        let y_data = y.data_f32();
113        let _n_samples = x.dims()[0];
114        let n_features = x.dims()[1];
115
116        let mut class_counts: std::collections::HashMap<i32, Vec<usize>> = std::collections::HashMap::new();
117        for (i, &label) in y_data.iter().enumerate() {
118            class_counts.entry(label as i32).or_default().push(i);
119        }
120
121        let min_count = class_counts.values().map(|v| v.len()).min().unwrap_or(0);
122
123        let mut rng = match self.random_state {
124            Some(seed) => StdRng::seed_from_u64(seed),
125            None => StdRng::from_entropy(),
126        };
127
128        let mut selected_indices = Vec::new();
129
130        for (class, indices) in &class_counts {
131            let target_count = match &self.sampling_strategy {
132                SamplingStrategy::Auto => min_count,
133                SamplingStrategy::Ratio(r) => (min_count as f32 / r) as usize,
134                SamplingStrategy::Counts(counts) => {
135                    counts.iter().find(|(c, _)| *c == *class as usize)
136                        .map(|(_, count)| *count)
137                        .unwrap_or(indices.len())
138                }
139            };
140
141            let n_to_select = target_count.min(indices.len());
142            let mut class_indices = indices.clone();
143            class_indices.shuffle(&mut rng);
144            selected_indices.extend(class_indices.into_iter().take(n_to_select));
145        }
146
147        selected_indices.sort();
148
149        let new_x: Vec<f32> = selected_indices.iter()
150            .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
151            .collect();
152        let new_y: Vec<f32> = selected_indices.iter().map(|&i| y_data[i]).collect();
153
154        let new_n_samples = new_y.len();
155        (
156            Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
157            Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
158        )
159    }
160}
161
162impl Default for RandomUnderSampler {
163    fn default() -> Self { Self::new() }
164}
165
/// SMOTE - Synthetic Minority Over-sampling Technique
pub struct SMOTE {
    /// Number of nearest neighbors considered when synthesizing samples.
    pub k_neighbors: usize,
    /// Policy that determines the per-class target counts.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible resampling; `None` uses OS entropy.
    pub random_state: Option<u64>,
}
173impl SMOTE {
174    pub fn new() -> Self {
175        SMOTE {
176            k_neighbors: 5,
177            sampling_strategy: SamplingStrategy::Auto,
178            random_state: None,
179        }
180    }
181
182    pub fn k_neighbors(mut self, k: usize) -> Self {
183        self.k_neighbors = k;
184        self
185    }
186
187    pub fn sampling_strategy(mut self, s: SamplingStrategy) -> Self {
188        self.sampling_strategy = s;
189        self
190    }
191
192    fn find_k_neighbors(&self, point: &[f32], candidates: &[Vec<f32>], k: usize) -> Vec<usize> {
193        let mut distances: Vec<(usize, f32)> = candidates.iter()
194            .enumerate()
195            .map(|(i, c)| {
196                let dist: f32 = point.iter().zip(c.iter())
197                    .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
198                (i, dist)
199            })
200            .collect();
201        
202        distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
203        distances.into_iter().skip(1).take(k).map(|(i, _)| i).collect()
204    }
205
206    pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
207        let x_data = x.data_f32();
208        let y_data = y.data_f32();
209        let _n_samples = x.dims()[0];
210        let n_features = x.dims()[1];
211
212        let mut class_samples: std::collections::HashMap<i32, Vec<Vec<f32>>> = std::collections::HashMap::new();
213        for (i, &label) in y_data.iter().enumerate() {
214            let sample: Vec<f32> = x_data[i * n_features..(i + 1) * n_features].to_vec();
215            class_samples.entry(label as i32).or_default().push(sample);
216        }
217
218        let max_count = class_samples.values().map(|v| v.len()).max().unwrap_or(0);
219
220        let mut rng = match self.random_state {
221            Some(seed) => StdRng::seed_from_u64(seed),
222            None => StdRng::from_entropy(),
223        };
224
225        let mut new_x = x_data.clone();
226        let mut new_y = y_data.clone();
227
228        for (class, samples) in &class_samples {
229            let target_count = match &self.sampling_strategy {
230                SamplingStrategy::Auto => max_count,
231                SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
232                SamplingStrategy::Counts(counts) => {
233                    counts.iter().find(|(c, _)| *c == *class as usize)
234                        .map(|(_, count)| *count)
235                        .unwrap_or(samples.len())
236                }
237            };
238
239            let n_to_generate = target_count.saturating_sub(samples.len());
240            let k = self.k_neighbors.min(samples.len() - 1).max(1);
241
242            for _ in 0..n_to_generate {
243                // Pick a random sample
244                let idx = rng.gen_range(0..samples.len());
245                let sample = &samples[idx];
246
247                // Find k nearest neighbors
248                let neighbors = self.find_k_neighbors(sample, samples, k);
249                
250                if neighbors.is_empty() {
251                    // Just duplicate if no neighbors
252                    new_x.extend_from_slice(sample);
253                } else {
254                    // Pick a random neighbor
255                    let neighbor_idx = neighbors[rng.gen_range(0..neighbors.len())];
256                    let neighbor = &samples[neighbor_idx];
257
258                    // Generate synthetic sample
259                    let lambda: f32 = rng.gen();
260                    let synthetic: Vec<f32> = sample.iter().zip(neighbor.iter())
261                        .map(|(&s, &n)| s + lambda * (n - s))
262                        .collect();
263                    
264                    new_x.extend_from_slice(&synthetic);
265                }
266                new_y.push(*class as f32);
267            }
268        }
269
270        let new_n_samples = new_y.len();
271        (
272            Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
273            Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
274        )
275    }
276}
277
278impl Default for SMOTE {
279    fn default() -> Self { Self::new() }
280}
281
/// BorderlineSMOTE - SMOTE variant focusing on borderline samples
pub struct BorderlineSMOTE {
    /// Number of minority-class neighbors used when synthesizing samples.
    pub k_neighbors: usize,
    /// Number of neighbors (over all classes) used to decide whether a
    /// minority sample lies on the class border.
    pub m_neighbors: usize,
    /// Policy that determines the target count.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible resampling; `None` uses OS entropy.
    pub random_state: Option<u64>,
}
290impl BorderlineSMOTE {
291    pub fn new() -> Self {
292        BorderlineSMOTE {
293            k_neighbors: 5,
294            m_neighbors: 10,
295            sampling_strategy: SamplingStrategy::Auto,
296            random_state: None,
297        }
298    }
299
300    fn find_k_neighbors_with_labels(&self, point: &[f32], all_samples: &[(Vec<f32>, i32)], k: usize) 
301        -> Vec<(usize, i32)> 
302    {
303        let mut distances: Vec<(usize, f32, i32)> = all_samples.iter()
304            .enumerate()
305            .map(|(i, (s, label))| {
306                let dist: f32 = point.iter().zip(s.iter())
307                    .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
308                (i, dist, *label)
309            })
310            .collect();
311        
312        distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
313        distances.into_iter().skip(1).take(k).map(|(i, _, l)| (i, l)).collect()
314    }
315
316    pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
317        let x_data = x.data_f32();
318        let y_data = y.data_f32();
319        let n_samples = x.dims()[0];
320        let n_features = x.dims()[1];
321
322        // Prepare all samples with labels
323        let all_samples: Vec<(Vec<f32>, i32)> = (0..n_samples)
324            .map(|i| {
325                let sample: Vec<f32> = x_data[i * n_features..(i + 1) * n_features].to_vec();
326                (sample, y_data[i] as i32)
327            })
328            .collect();
329
330        let mut class_counts: std::collections::HashMap<i32, usize> = std::collections::HashMap::new();
331        for &label in &y_data {
332            *class_counts.entry(label as i32).or_default() += 1;
333        }
334
335        let (minority_class, _) = class_counts.iter()
336            .min_by_key(|(_, &count)| count)
337            .unwrap();
338        let max_count = *class_counts.values().max().unwrap();
339
340        let mut rng = match self.random_state {
341            Some(seed) => StdRng::seed_from_u64(seed),
342            None => StdRng::from_entropy(),
343        };
344
345        // Find borderline samples
346        let mut borderline_samples: Vec<Vec<f32>> = Vec::new();
347        let minority_samples: Vec<Vec<f32>> = all_samples.iter()
348            .filter(|(_, l)| l == minority_class)
349            .map(|(s, _)| s.clone())
350            .collect();
351
352        for (sample, label) in &all_samples {
353            if *label != *minority_class { continue; }
354
355            let neighbors = self.find_k_neighbors_with_labels(sample, &all_samples, self.m_neighbors);
356            let n_majority = neighbors.iter().filter(|(_, l)| l != minority_class).count();
357
358            // Borderline if half or more neighbors are majority class
359            if n_majority >= self.m_neighbors / 2 && n_majority < self.m_neighbors {
360                borderline_samples.push(sample.clone());
361            }
362        }
363
364        // If no borderline samples, fall back to regular SMOTE
365        if borderline_samples.is_empty() {
366            borderline_samples = minority_samples.clone();
367        }
368
369        let mut new_x = x_data.clone();
370        let mut new_y = y_data.clone();
371
372        let target_count = match &self.sampling_strategy {
373            SamplingStrategy::Auto => max_count,
374            SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
375            _ => max_count,
376        };
377
378        let n_to_generate = target_count.saturating_sub(class_counts[minority_class]);
379        let k = self.k_neighbors.min(minority_samples.len() - 1).max(1);
380
381        for _ in 0..n_to_generate {
382            let idx = rng.gen_range(0..borderline_samples.len());
383            let sample = &borderline_samples[idx];
384
385            // Find k nearest neighbors from minority class
386            let mut distances: Vec<(usize, f32)> = minority_samples.iter()
387                .enumerate()
388                .map(|(i, s)| {
389                    let dist: f32 = sample.iter().zip(s.iter())
390                        .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
391                    (i, dist)
392                })
393                .collect();
394            distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
395            let neighbors: Vec<usize> = distances.into_iter().skip(1).take(k).map(|(i, _)| i).collect();
396
397            if !neighbors.is_empty() {
398                let neighbor_idx = neighbors[rng.gen_range(0..neighbors.len())];
399                let neighbor = &minority_samples[neighbor_idx];
400
401                let lambda: f32 = rng.gen();
402                let synthetic: Vec<f32> = sample.iter().zip(neighbor.iter())
403                    .map(|(&s, &n)| s + lambda * (n - s))
404                    .collect();
405                
406                new_x.extend_from_slice(&synthetic);
407                new_y.push(*minority_class as f32);
408            }
409        }
410
411        let new_n_samples = new_y.len();
412        (
413            Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
414            Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
415        )
416    }
417}
418
/// ADASYN - Adaptive Synthetic Sampling
pub struct ADASYN {
    /// Number of nearest neighbors used for density estimation and synthesis.
    pub k_neighbors: usize,
    /// Policy that determines the target count.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible resampling; `None` uses OS entropy.
    pub random_state: Option<u64>,
}
426impl ADASYN {
427    pub fn new() -> Self {
428        ADASYN {
429            k_neighbors: 5,
430            sampling_strategy: SamplingStrategy::Auto,
431            random_state: None,
432        }
433    }
434
435    pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
436        let x_data = x.data_f32();
437        let y_data = y.data_f32();
438        let n_samples = x.dims()[0];
439        let n_features = x.dims()[1];
440
441        let all_samples: Vec<(Vec<f32>, i32)> = (0..n_samples)
442            .map(|i| {
443                let sample: Vec<f32> = x_data[i * n_features..(i + 1) * n_features].to_vec();
444                (sample, y_data[i] as i32)
445            })
446            .collect();
447
448        let mut class_counts: std::collections::HashMap<i32, usize> = std::collections::HashMap::new();
449        for &label in &y_data {
450            *class_counts.entry(label as i32).or_default() += 1;
451        }
452
453        let (minority_class, minority_count) = class_counts.iter()
454            .min_by_key(|(_, &count)| count)
455            .map(|(&c, &count)| (c, count))
456            .unwrap();
457        let max_count = *class_counts.values().max().unwrap();
458
459        let minority_samples: Vec<Vec<f32>> = all_samples.iter()
460            .filter(|(_, l)| *l == minority_class)
461            .map(|(s, _)| s.clone())
462            .collect();
463
464        let mut rng = match self.random_state {
465            Some(seed) => StdRng::seed_from_u64(seed),
466            None => StdRng::from_entropy(),
467        };
468
469        // Compute density ratio for each minority sample
470        let k = self.k_neighbors.min(n_samples - 1).max(1);
471        let mut ratios: Vec<f32> = Vec::new();
472
473        for sample in &minority_samples {
474            let mut distances: Vec<(f32, i32)> = all_samples.iter()
475                .map(|(s, label)| {
476                    let dist: f32 = sample.iter().zip(s.iter())
477                        .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
478                    (dist, *label)
479                })
480                .collect();
481            distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
482            
483            let neighbors: Vec<i32> = distances.into_iter().skip(1).take(k).map(|(_, l)| l).collect();
484            let n_majority = neighbors.iter().filter(|&&l| l != minority_class).count();
485            ratios.push(n_majority as f32 / k as f32);
486        }
487
488        // Normalize ratios
489        let ratio_sum: f32 = ratios.iter().sum();
490        if ratio_sum > 0.0 {
491            for r in &mut ratios {
492                *r /= ratio_sum;
493            }
494        }
495
496        let target_count = match &self.sampling_strategy {
497            SamplingStrategy::Auto => max_count,
498            SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
499            _ => max_count,
500        };
501
502        let g = target_count.saturating_sub(minority_count);
503
504        let mut new_x = x_data.clone();
505        let mut new_y = y_data.clone();
506
507        let k_synth = self.k_neighbors.min(minority_samples.len() - 1).max(1);
508
509        for (i, sample) in minority_samples.iter().enumerate() {
510            let n_to_generate = (ratios[i] * g as f32).round() as usize;
511
512            // Find k nearest minority neighbors
513            let mut distances: Vec<(usize, f32)> = minority_samples.iter()
514                .enumerate()
515                .map(|(j, s)| {
516                    let dist: f32 = sample.iter().zip(s.iter())
517                        .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
518                    (j, dist)
519                })
520                .collect();
521            distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
522            let neighbors: Vec<usize> = distances.into_iter().skip(1).take(k_synth).map(|(j, _)| j).collect();
523
524            for _ in 0..n_to_generate {
525                if !neighbors.is_empty() {
526                    let neighbor_idx = neighbors[rng.gen_range(0..neighbors.len())];
527                    let neighbor = &minority_samples[neighbor_idx];
528
529                    let lambda: f32 = rng.gen();
530                    let synthetic: Vec<f32> = sample.iter().zip(neighbor.iter())
531                        .map(|(&s, &n)| s + lambda * (n - s))
532                        .collect();
533                    
534                    new_x.extend_from_slice(&synthetic);
535                    new_y.push(minority_class as f32);
536                }
537            }
538        }
539
540        let new_n_samples = new_y.len();
541        (
542            Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
543            Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
544        )
545    }
546}
547
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_random_over_sampler() {
        let features =
            Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], &[4, 2]).unwrap();
        let labels = Tensor::from_slice(&[0.0, 0.0, 0.0, 1.0], &[4]).unwrap();

        let (_x_resampled, y_resampled) =
            RandomOverSampler::new().fit_resample(&features, &labels);

        // Oversampling never shrinks the dataset.
        assert!(y_resampled.dims()[0] >= 4);
    }

    #[test]
    fn test_smote() {
        let features = Tensor::from_slice(
            &[1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 10.0, 11.0, 11.0],
            &[6, 2],
        )
        .unwrap();
        let labels = Tensor::from_slice(&[0.0, 0.0, 0.0, 0.0, 1.0, 1.0], &[6]).unwrap();

        let (_x_resampled, y_resampled) =
            SMOTE::new().k_neighbors(1).fit_resample(&features, &labels);

        // SMOTE only appends synthetic rows.
        assert!(y_resampled.dims()[0] >= 6);
    }
}