1use ghostflow_core::Tensor;
4use rand::prelude::*;
5
/// Balances a dataset by duplicating randomly chosen rows of
/// under-represented classes until each reaches its target count.
pub struct RandomOverSampler {
    /// Rule used to derive the per-class target counts.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible draws; `None` seeds from OS entropy.
    pub random_state: Option<u64>,
}
11
/// Controls how many rows each class should end up with after resampling.
#[derive(Clone)]
pub enum SamplingStrategy {
    /// Balance every class to the reference count (the majority-class count
    /// when over-sampling, the minority-class count when under-sampling).
    Auto,
    /// Scale the reference count by this factor.
    Ratio(f32),
    /// Explicit `(class_label, target_count)` pairs; classes not listed
    /// keep their current count.
    Counts(Vec<(usize, usize)>),
}
21
22impl RandomOverSampler {
23 pub fn new() -> Self {
24 RandomOverSampler {
25 sampling_strategy: SamplingStrategy::Auto,
26 random_state: None,
27 }
28 }
29
30 pub fn sampling_strategy(mut self, s: SamplingStrategy) -> Self {
31 self.sampling_strategy = s;
32 self
33 }
34
35 pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
36 let x_data = x.data_f32();
37 let y_data = y.data_f32();
38 let _n_samples = x.dims()[0];
39 let n_features = x.dims()[1];
40
41 let mut class_counts: std::collections::HashMap<i32, Vec<usize>> = std::collections::HashMap::new();
43 for (i, &label) in y_data.iter().enumerate() {
44 class_counts.entry(label as i32).or_default().push(i);
45 }
46
47 let max_count = class_counts.values().map(|v| v.len()).max().unwrap_or(0);
48
49 let mut rng = match self.random_state {
50 Some(seed) => StdRng::seed_from_u64(seed),
51 None => StdRng::from_entropy(),
52 };
53
54 let mut new_x = x_data.clone();
55 let mut new_y = y_data.clone();
56
57 for (class, indices) in &class_counts {
58 let target_count = match &self.sampling_strategy {
59 SamplingStrategy::Auto => max_count,
60 SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
61 SamplingStrategy::Counts(counts) => {
62 counts.iter().find(|(c, _)| *c == *class as usize)
63 .map(|(_, count)| *count)
64 .unwrap_or(indices.len())
65 }
66 };
67
68 let n_to_add = target_count.saturating_sub(indices.len());
69
70 for _ in 0..n_to_add {
71 let idx = indices[rng.gen_range(0..indices.len())];
72 new_x.extend_from_slice(&x_data[idx * n_features..(idx + 1) * n_features]);
73 new_y.push(*class as f32);
74 }
75 }
76
77 let new_n_samples = new_y.len();
78 (
79 Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
80 Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
81 )
82 }
83}
84
impl Default for RandomOverSampler {
    /// Same configuration as [`RandomOverSampler::new`].
    fn default() -> Self { Self::new() }
}
88
/// Balances a dataset by discarding randomly chosen rows of
/// over-represented classes.
pub struct RandomUnderSampler {
    /// Rule used to derive the per-class target counts.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible draws; `None` seeds from OS entropy.
    pub random_state: Option<u64>,
    /// Whether kept rows may be drawn with replacement (i.e. the same row
    /// selected more than once).
    pub replacement: bool,
}
95
96impl RandomUnderSampler {
97 pub fn new() -> Self {
98 RandomUnderSampler {
99 sampling_strategy: SamplingStrategy::Auto,
100 random_state: None,
101 replacement: false,
102 }
103 }
104
105 pub fn sampling_strategy(mut self, s: SamplingStrategy) -> Self {
106 self.sampling_strategy = s;
107 self
108 }
109
110 pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
111 let x_data = x.data_f32();
112 let y_data = y.data_f32();
113 let _n_samples = x.dims()[0];
114 let n_features = x.dims()[1];
115
116 let mut class_counts: std::collections::HashMap<i32, Vec<usize>> = std::collections::HashMap::new();
117 for (i, &label) in y_data.iter().enumerate() {
118 class_counts.entry(label as i32).or_default().push(i);
119 }
120
121 let min_count = class_counts.values().map(|v| v.len()).min().unwrap_or(0);
122
123 let mut rng = match self.random_state {
124 Some(seed) => StdRng::seed_from_u64(seed),
125 None => StdRng::from_entropy(),
126 };
127
128 let mut selected_indices = Vec::new();
129
130 for (class, indices) in &class_counts {
131 let target_count = match &self.sampling_strategy {
132 SamplingStrategy::Auto => min_count,
133 SamplingStrategy::Ratio(r) => (min_count as f32 / r) as usize,
134 SamplingStrategy::Counts(counts) => {
135 counts.iter().find(|(c, _)| *c == *class as usize)
136 .map(|(_, count)| *count)
137 .unwrap_or(indices.len())
138 }
139 };
140
141 let n_to_select = target_count.min(indices.len());
142 let mut class_indices = indices.clone();
143 class_indices.shuffle(&mut rng);
144 selected_indices.extend(class_indices.into_iter().take(n_to_select));
145 }
146
147 selected_indices.sort();
148
149 let new_x: Vec<f32> = selected_indices.iter()
150 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
151 .collect();
152 let new_y: Vec<f32> = selected_indices.iter().map(|&i| y_data[i]).collect();
153
154 let new_n_samples = new_y.len();
155 (
156 Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
157 Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
158 )
159 }
160}
161
impl Default for RandomUnderSampler {
    /// Same configuration as [`RandomUnderSampler::new`].
    fn default() -> Self { Self::new() }
}
165
/// SMOTE (Synthetic Minority Over-sampling Technique): generates synthetic
/// minority-class rows by interpolating between an existing row and one of
/// its nearest same-class neighbours.
pub struct SMOTE {
    /// Number of nearest same-class neighbours considered for interpolation.
    pub k_neighbors: usize,
    /// Rule used to derive the per-class target counts.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible draws; `None` seeds from OS entropy.
    pub random_state: Option<u64>,
}
172
173impl SMOTE {
174 pub fn new() -> Self {
175 SMOTE {
176 k_neighbors: 5,
177 sampling_strategy: SamplingStrategy::Auto,
178 random_state: None,
179 }
180 }
181
182 pub fn k_neighbors(mut self, k: usize) -> Self {
183 self.k_neighbors = k;
184 self
185 }
186
187 pub fn sampling_strategy(mut self, s: SamplingStrategy) -> Self {
188 self.sampling_strategy = s;
189 self
190 }
191
192 fn find_k_neighbors(&self, point: &[f32], candidates: &[Vec<f32>], k: usize) -> Vec<usize> {
193 let mut distances: Vec<(usize, f32)> = candidates.iter()
194 .enumerate()
195 .map(|(i, c)| {
196 let dist: f32 = point.iter().zip(c.iter())
197 .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
198 (i, dist)
199 })
200 .collect();
201
202 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
203 distances.into_iter().skip(1).take(k).map(|(i, _)| i).collect()
204 }
205
206 pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
207 let x_data = x.data_f32();
208 let y_data = y.data_f32();
209 let _n_samples = x.dims()[0];
210 let n_features = x.dims()[1];
211
212 let mut class_samples: std::collections::HashMap<i32, Vec<Vec<f32>>> = std::collections::HashMap::new();
213 for (i, &label) in y_data.iter().enumerate() {
214 let sample: Vec<f32> = x_data[i * n_features..(i + 1) * n_features].to_vec();
215 class_samples.entry(label as i32).or_default().push(sample);
216 }
217
218 let max_count = class_samples.values().map(|v| v.len()).max().unwrap_or(0);
219
220 let mut rng = match self.random_state {
221 Some(seed) => StdRng::seed_from_u64(seed),
222 None => StdRng::from_entropy(),
223 };
224
225 let mut new_x = x_data.clone();
226 let mut new_y = y_data.clone();
227
228 for (class, samples) in &class_samples {
229 let target_count = match &self.sampling_strategy {
230 SamplingStrategy::Auto => max_count,
231 SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
232 SamplingStrategy::Counts(counts) => {
233 counts.iter().find(|(c, _)| *c == *class as usize)
234 .map(|(_, count)| *count)
235 .unwrap_or(samples.len())
236 }
237 };
238
239 let n_to_generate = target_count.saturating_sub(samples.len());
240 let k = self.k_neighbors.min(samples.len() - 1).max(1);
241
242 for _ in 0..n_to_generate {
243 let idx = rng.gen_range(0..samples.len());
245 let sample = &samples[idx];
246
247 let neighbors = self.find_k_neighbors(sample, samples, k);
249
250 if neighbors.is_empty() {
251 new_x.extend_from_slice(sample);
253 } else {
254 let neighbor_idx = neighbors[rng.gen_range(0..neighbors.len())];
256 let neighbor = &samples[neighbor_idx];
257
258 let lambda: f32 = rng.gen();
260 let synthetic: Vec<f32> = sample.iter().zip(neighbor.iter())
261 .map(|(&s, &n)| s + lambda * (n - s))
262 .collect();
263
264 new_x.extend_from_slice(&synthetic);
265 }
266 new_y.push(*class as f32);
267 }
268 }
269
270 let new_n_samples = new_y.len();
271 (
272 Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
273 Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
274 )
275 }
276}
277
impl Default for SMOTE {
    /// Same configuration as [`SMOTE::new`].
    fn default() -> Self { Self::new() }
}
281
/// Borderline-SMOTE: like SMOTE, but synthetic points are only seeded from
/// minority samples that lie near the decision boundary.
pub struct BorderlineSMOTE {
    /// Number of minority neighbours considered for interpolation.
    pub k_neighbors: usize,
    /// Number of neighbours (across all classes) used to decide whether a
    /// minority sample is "borderline".
    pub m_neighbors: usize,
    /// Rule used to derive the target count for the minority class.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible draws; `None` seeds from OS entropy.
    pub random_state: Option<u64>,
}
289
290impl BorderlineSMOTE {
291 pub fn new() -> Self {
292 BorderlineSMOTE {
293 k_neighbors: 5,
294 m_neighbors: 10,
295 sampling_strategy: SamplingStrategy::Auto,
296 random_state: None,
297 }
298 }
299
300 fn find_k_neighbors_with_labels(&self, point: &[f32], all_samples: &[(Vec<f32>, i32)], k: usize)
301 -> Vec<(usize, i32)>
302 {
303 let mut distances: Vec<(usize, f32, i32)> = all_samples.iter()
304 .enumerate()
305 .map(|(i, (s, label))| {
306 let dist: f32 = point.iter().zip(s.iter())
307 .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
308 (i, dist, *label)
309 })
310 .collect();
311
312 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
313 distances.into_iter().skip(1).take(k).map(|(i, _, l)| (i, l)).collect()
314 }
315
316 pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
317 let x_data = x.data_f32();
318 let y_data = y.data_f32();
319 let n_samples = x.dims()[0];
320 let n_features = x.dims()[1];
321
322 let all_samples: Vec<(Vec<f32>, i32)> = (0..n_samples)
324 .map(|i| {
325 let sample: Vec<f32> = x_data[i * n_features..(i + 1) * n_features].to_vec();
326 (sample, y_data[i] as i32)
327 })
328 .collect();
329
330 let mut class_counts: std::collections::HashMap<i32, usize> = std::collections::HashMap::new();
331 for &label in &y_data {
332 *class_counts.entry(label as i32).or_default() += 1;
333 }
334
335 let (minority_class, _) = class_counts.iter()
336 .min_by_key(|(_, &count)| count)
337 .unwrap();
338 let max_count = *class_counts.values().max().unwrap();
339
340 let mut rng = match self.random_state {
341 Some(seed) => StdRng::seed_from_u64(seed),
342 None => StdRng::from_entropy(),
343 };
344
345 let mut borderline_samples: Vec<Vec<f32>> = Vec::new();
347 let minority_samples: Vec<Vec<f32>> = all_samples.iter()
348 .filter(|(_, l)| l == minority_class)
349 .map(|(s, _)| s.clone())
350 .collect();
351
352 for (sample, label) in &all_samples {
353 if *label != *minority_class { continue; }
354
355 let neighbors = self.find_k_neighbors_with_labels(sample, &all_samples, self.m_neighbors);
356 let n_majority = neighbors.iter().filter(|(_, l)| l != minority_class).count();
357
358 if n_majority >= self.m_neighbors / 2 && n_majority < self.m_neighbors {
360 borderline_samples.push(sample.clone());
361 }
362 }
363
364 if borderline_samples.is_empty() {
366 borderline_samples = minority_samples.clone();
367 }
368
369 let mut new_x = x_data.clone();
370 let mut new_y = y_data.clone();
371
372 let target_count = match &self.sampling_strategy {
373 SamplingStrategy::Auto => max_count,
374 SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
375 _ => max_count,
376 };
377
378 let n_to_generate = target_count.saturating_sub(class_counts[minority_class]);
379 let k = self.k_neighbors.min(minority_samples.len() - 1).max(1);
380
381 for _ in 0..n_to_generate {
382 let idx = rng.gen_range(0..borderline_samples.len());
383 let sample = &borderline_samples[idx];
384
385 let mut distances: Vec<(usize, f32)> = minority_samples.iter()
387 .enumerate()
388 .map(|(i, s)| {
389 let dist: f32 = sample.iter().zip(s.iter())
390 .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
391 (i, dist)
392 })
393 .collect();
394 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
395 let neighbors: Vec<usize> = distances.into_iter().skip(1).take(k).map(|(i, _)| i).collect();
396
397 if !neighbors.is_empty() {
398 let neighbor_idx = neighbors[rng.gen_range(0..neighbors.len())];
399 let neighbor = &minority_samples[neighbor_idx];
400
401 let lambda: f32 = rng.gen();
402 let synthetic: Vec<f32> = sample.iter().zip(neighbor.iter())
403 .map(|(&s, &n)| s + lambda * (n - s))
404 .collect();
405
406 new_x.extend_from_slice(&synthetic);
407 new_y.push(*minority_class as f32);
408 }
409 }
410
411 let new_n_samples = new_y.len();
412 (
413 Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
414 Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
415 )
416 }
417}
418
/// ADASYN (Adaptive Synthetic Sampling): like SMOTE, but the number of
/// synthetic points per minority sample is weighted by how many of its
/// neighbours belong to other classes, so harder-to-learn samples receive
/// more synthetic neighbours.
pub struct ADASYN {
    /// Number of neighbours used both for the density ratio and for
    /// interpolation.
    pub k_neighbors: usize,
    /// Rule used to derive the target count for the minority class.
    pub sampling_strategy: SamplingStrategy,
    /// Seed for reproducible draws; `None` seeds from OS entropy.
    pub random_state: Option<u64>,
}
425
426impl ADASYN {
427 pub fn new() -> Self {
428 ADASYN {
429 k_neighbors: 5,
430 sampling_strategy: SamplingStrategy::Auto,
431 random_state: None,
432 }
433 }
434
435 pub fn fit_resample(&self, x: &Tensor, y: &Tensor) -> (Tensor, Tensor) {
436 let x_data = x.data_f32();
437 let y_data = y.data_f32();
438 let n_samples = x.dims()[0];
439 let n_features = x.dims()[1];
440
441 let all_samples: Vec<(Vec<f32>, i32)> = (0..n_samples)
442 .map(|i| {
443 let sample: Vec<f32> = x_data[i * n_features..(i + 1) * n_features].to_vec();
444 (sample, y_data[i] as i32)
445 })
446 .collect();
447
448 let mut class_counts: std::collections::HashMap<i32, usize> = std::collections::HashMap::new();
449 for &label in &y_data {
450 *class_counts.entry(label as i32).or_default() += 1;
451 }
452
453 let (minority_class, minority_count) = class_counts.iter()
454 .min_by_key(|(_, &count)| count)
455 .map(|(&c, &count)| (c, count))
456 .unwrap();
457 let max_count = *class_counts.values().max().unwrap();
458
459 let minority_samples: Vec<Vec<f32>> = all_samples.iter()
460 .filter(|(_, l)| *l == minority_class)
461 .map(|(s, _)| s.clone())
462 .collect();
463
464 let mut rng = match self.random_state {
465 Some(seed) => StdRng::seed_from_u64(seed),
466 None => StdRng::from_entropy(),
467 };
468
469 let k = self.k_neighbors.min(n_samples - 1).max(1);
471 let mut ratios: Vec<f32> = Vec::new();
472
473 for sample in &minority_samples {
474 let mut distances: Vec<(f32, i32)> = all_samples.iter()
475 .map(|(s, label)| {
476 let dist: f32 = sample.iter().zip(s.iter())
477 .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
478 (dist, *label)
479 })
480 .collect();
481 distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
482
483 let neighbors: Vec<i32> = distances.into_iter().skip(1).take(k).map(|(_, l)| l).collect();
484 let n_majority = neighbors.iter().filter(|&&l| l != minority_class).count();
485 ratios.push(n_majority as f32 / k as f32);
486 }
487
488 let ratio_sum: f32 = ratios.iter().sum();
490 if ratio_sum > 0.0 {
491 for r in &mut ratios {
492 *r /= ratio_sum;
493 }
494 }
495
496 let target_count = match &self.sampling_strategy {
497 SamplingStrategy::Auto => max_count,
498 SamplingStrategy::Ratio(r) => (max_count as f32 * r) as usize,
499 _ => max_count,
500 };
501
502 let g = target_count.saturating_sub(minority_count);
503
504 let mut new_x = x_data.clone();
505 let mut new_y = y_data.clone();
506
507 let k_synth = self.k_neighbors.min(minority_samples.len() - 1).max(1);
508
509 for (i, sample) in minority_samples.iter().enumerate() {
510 let n_to_generate = (ratios[i] * g as f32).round() as usize;
511
512 let mut distances: Vec<(usize, f32)> = minority_samples.iter()
514 .enumerate()
515 .map(|(j, s)| {
516 let dist: f32 = sample.iter().zip(s.iter())
517 .map(|(&a, &b)| (a - b).powi(2)).sum::<f32>().sqrt();
518 (j, dist)
519 })
520 .collect();
521 distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
522 let neighbors: Vec<usize> = distances.into_iter().skip(1).take(k_synth).map(|(j, _)| j).collect();
523
524 for _ in 0..n_to_generate {
525 if !neighbors.is_empty() {
526 let neighbor_idx = neighbors[rng.gen_range(0..neighbors.len())];
527 let neighbor = &minority_samples[neighbor_idx];
528
529 let lambda: f32 = rng.gen();
530 let synthetic: Vec<f32> = sample.iter().zip(neighbor.iter())
531 .map(|(&s, &n)| s + lambda * (n - s))
532 .collect();
533
534 new_x.extend_from_slice(&synthetic);
535 new_y.push(minority_class as f32);
536 }
537 }
538 }
539
540 let new_n_samples = new_y.len();
541 (
542 Tensor::from_slice(&new_x, &[new_n_samples, n_features]).unwrap(),
543 Tensor::from_slice(&new_y, &[new_n_samples]).unwrap()
544 )
545 }
546}
547
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_random_over_sampler() {
        // Two features, 3-vs-1 class imbalance.
        let features =
            Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], &[4, 2]).unwrap();
        let labels = Tensor::from_slice(&[0.0f32, 0.0, 0.0, 1.0], &[4]).unwrap();

        let (_resampled_x, resampled_y) =
            RandomOverSampler::new().fit_resample(&features, &labels);

        // Over-sampling never drops rows.
        assert!(resampled_y.dims()[0] >= 4);
    }

    #[test]
    fn test_smote() {
        // Two tight clusters: four minority-cluster rows vs two at (10, 11).
        let features = Tensor::from_slice(
            &[1.0f32, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 10.0, 11.0, 11.0],
            &[6, 2],
        )
        .unwrap();
        let labels = Tensor::from_slice(&[0.0f32, 0.0, 0.0, 0.0, 1.0, 1.0], &[6]).unwrap();

        let (_resampled_x, resampled_y) =
            SMOTE::new().k_neighbors(1).fit_resample(&features, &labels);

        // Synthetic rows only ever get appended.
        assert!(resampled_y.dims()[0] >= 6);
    }
}
576
577