1use crate::error::{DatasetsError, Result};
4use crate::gpu::{GpuContext, GpuDeviceInfo};
5use crate::utils::Dataset;
6use ndarray::{Array1, Array2};
7use rand::prelude::*;
8use rand::rngs::StdRng;
9use rand_distr::Distribution;
10use rand_distr::Uniform;
11use crate::gpu::GpuBackend as LocalGpuBackend;
13use rand::seq::SliceRandom;
17use scirs2_core::rng;
18use std::f64::consts::PI;
19
20#[allow(dead_code)]
22#[allow(clippy::too_many_arguments)]
23pub fn make_classification(
24 n_samples: usize,
25 n_features: usize,
26 n_classes: usize,
27 n_clusters_per_class: usize,
28 n_informative: usize,
29 randomseed: Option<u64>,
30) -> Result<Dataset> {
31 if n_samples == 0 {
33 return Err(DatasetsError::InvalidFormat(
34 "n_samples must be > 0".to_string(),
35 ));
36 }
37
38 if n_features == 0 {
39 return Err(DatasetsError::InvalidFormat(
40 "n_features must be > 0".to_string(),
41 ));
42 }
43
44 if n_informative == 0 {
45 return Err(DatasetsError::InvalidFormat(
46 "n_informative must be > 0".to_string(),
47 ));
48 }
49
50 if n_features < n_informative {
51 return Err(DatasetsError::InvalidFormat(format!(
52 "n_features ({n_features}) must be >= n_informative ({n_informative})"
53 )));
54 }
55
56 if n_classes < 2 {
57 return Err(DatasetsError::InvalidFormat(
58 "n_classes must be >= 2".to_string(),
59 ));
60 }
61
62 if n_clusters_per_class == 0 {
63 return Err(DatasetsError::InvalidFormat(
64 "n_clusters_per_class must be > 0".to_string(),
65 ));
66 }
67
68 let mut rng = match randomseed {
69 Some(_seed) => StdRng::seed_from_u64(_seed),
70 None => {
71 let mut r = rng();
72 StdRng::seed_from_u64(r.next_u64())
73 }
74 };
75
76 let n_centroids = n_classes * n_clusters_per_class;
78 let mut centroids = Array2::zeros((n_centroids, n_informative));
79 let scale = 2.0;
80
81 for i in 0..n_centroids {
82 for j in 0..n_informative {
83 centroids[[i, j]] = scale * rng.gen_range(-1.0f64..1.0f64);
84 }
85 }
86
87 let mut data = Array2::zeros((n_samples, n_features));
89 let mut target = Array1::zeros(n_samples);
90
91 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
92
93 let samples_per_class = n_samples / n_classes;
95 let remainder = n_samples % n_classes;
96
97 let mut sample_idx = 0;
98
99 for _class in 0..n_classes {
100 let n_samples_class = if _class < remainder {
101 samples_per_class + 1
102 } else {
103 samples_per_class
104 };
105
106 let samples_per_cluster = n_samples_class / n_clusters_per_class;
108 let cluster_remainder = n_samples_class % n_clusters_per_class;
109
110 for cluster in 0..n_clusters_per_class {
111 let n_samples_cluster = if cluster < cluster_remainder {
112 samples_per_cluster + 1
113 } else {
114 samples_per_cluster
115 };
116
117 let centroid_idx = _class * n_clusters_per_class + cluster;
118
119 for _ in 0..n_samples_cluster {
120 for j in 0..n_informative {
122 data[[sample_idx, j]] =
123 centroids[[centroid_idx, j]] + 0.3 * normal.sample(&mut rng);
124 }
125
126 for j in n_informative..n_features {
128 data[[sample_idx, j]] = normal.sample(&mut rng);
129 }
130
131 target[sample_idx] = _class as f64;
132 sample_idx += 1;
133 }
134 }
135 }
136
137 let mut dataset = Dataset::new(data, Some(target));
139
140 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
142
143 let classnames: Vec<String> = (0..n_classes).map(|i| format!("class_{i}")).collect();
145
146 dataset = dataset
147 .with_featurenames(featurenames)
148 .with_targetnames(classnames)
149 .with_description(format!(
150 "Synthetic classification dataset with {n_classes} _classes and {n_features} _features"
151 ));
152
153 Ok(dataset)
154}
155
156#[allow(dead_code)]
158pub fn make_regression(
159 n_samples: usize,
160 n_features: usize,
161 n_informative: usize,
162 noise: f64,
163 randomseed: Option<u64>,
164) -> Result<Dataset> {
165 if n_samples == 0 {
167 return Err(DatasetsError::InvalidFormat(
168 "n_samples must be > 0".to_string(),
169 ));
170 }
171
172 if n_features == 0 {
173 return Err(DatasetsError::InvalidFormat(
174 "n_features must be > 0".to_string(),
175 ));
176 }
177
178 if n_informative == 0 {
179 return Err(DatasetsError::InvalidFormat(
180 "n_informative must be > 0".to_string(),
181 ));
182 }
183
184 if n_features < n_informative {
185 return Err(DatasetsError::InvalidFormat(format!(
186 "n_features ({n_features}) must be >= n_informative ({n_informative})"
187 )));
188 }
189
190 if noise < 0.0 {
191 return Err(DatasetsError::InvalidFormat(
192 "noise must be >= 0.0".to_string(),
193 ));
194 }
195
196 let mut rng = match randomseed {
197 Some(_seed) => StdRng::seed_from_u64(_seed),
198 None => {
199 let mut r = rng();
200 StdRng::seed_from_u64(r.next_u64())
201 }
202 };
203
204 let mut coef = Array1::zeros(n_features);
206 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
207
208 for i in 0..n_informative {
209 coef[i] = 100.0 * normal.sample(&mut rng);
210 }
211
212 let mut data = Array2::zeros((n_samples, n_features));
214
215 for i in 0..n_samples {
216 for j in 0..n_features {
217 data[[i, j]] = normal.sample(&mut rng);
218 }
219 }
220
221 let mut target = Array1::zeros(n_samples);
223
224 for i in 0..n_samples {
225 let mut y = 0.0;
226 for j in 0..n_features {
227 y += data[[i, j]] * coef[j];
228 }
229
230 if noise > 0.0 {
232 y += normal.sample(&mut rng) * noise;
233 }
234
235 target[i] = y;
236 }
237
238 let mut dataset = Dataset::new(data, Some(target));
240
241 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
243
244 dataset = dataset
245 .with_featurenames(featurenames)
246 .with_description(format!(
247 "Synthetic regression dataset with {n_features} _features ({n_informative} informative)"
248 ))
249 .with_metadata("noise", &noise.to_string())
250 .with_metadata("coefficients", &format!("{coef:?}"));
251
252 Ok(dataset)
253}
254
255#[allow(dead_code)]
257pub fn make_time_series(
258 n_samples: usize,
259 n_features: usize,
260 trend: bool,
261 seasonality: bool,
262 noise: f64,
263 randomseed: Option<u64>,
264) -> Result<Dataset> {
265 if n_samples == 0 {
267 return Err(DatasetsError::InvalidFormat(
268 "n_samples must be > 0".to_string(),
269 ));
270 }
271
272 if n_features == 0 {
273 return Err(DatasetsError::InvalidFormat(
274 "n_features must be > 0".to_string(),
275 ));
276 }
277
278 if noise < 0.0 {
279 return Err(DatasetsError::InvalidFormat(
280 "noise must be >= 0.0".to_string(),
281 ));
282 }
283
284 let mut rng = match randomseed {
285 Some(_seed) => StdRng::seed_from_u64(_seed),
286 None => {
287 let mut r = rng();
288 StdRng::seed_from_u64(r.next_u64())
289 }
290 };
291
292 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
293 let mut data = Array2::zeros((n_samples, n_features));
294
295 for feature in 0..n_features {
296 let trend_coef = if trend {
297 rng.gen_range(0.01f64..0.1f64)
298 } else {
299 0.0
300 };
301 let seasonality_period = rng.sample(Uniform::new(10, 50).unwrap()) as f64;
302 let seasonality_amplitude = if seasonality {
303 rng.gen_range(1.0f64..5.0f64)
304 } else {
305 0.0
306 };
307
308 let base_value = rng.gen_range(-10.0f64..10.0f64);
309
310 for i in 0..n_samples {
311 let t = i as f64;
312
313 let mut value = base_value;
315
316 if trend {
318 value += trend_coef * t;
319 }
320
321 if seasonality {
323 value += seasonality_amplitude * (2.0 * PI * t / seasonality_period).sin();
324 }
325
326 if noise > 0.0 {
328 value += normal.sample(&mut rng) * noise;
329 }
330
331 data[[i, feature]] = value;
332 }
333 }
334
335 let time_index: Vec<f64> = (0..n_samples).map(|i| i as f64).collect();
337 let _time_array = Array1::from(time_index);
338
339 let mut dataset = Dataset::new(data, None);
341
342 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
344
345 dataset = dataset
346 .with_featurenames(featurenames)
347 .with_description(format!(
348 "Synthetic time series dataset with {n_features} _features"
349 ))
350 .with_metadata("trend", &trend.to_string())
351 .with_metadata("seasonality", &seasonality.to_string())
352 .with_metadata("noise", &noise.to_string());
353
354 Ok(dataset)
355}
356
357#[allow(dead_code)]
359pub fn make_blobs(
360 n_samples: usize,
361 n_features: usize,
362 centers: usize,
363 cluster_std: f64,
364 randomseed: Option<u64>,
365) -> Result<Dataset> {
366 if n_samples == 0 {
368 return Err(DatasetsError::InvalidFormat(
369 "n_samples must be > 0".to_string(),
370 ));
371 }
372
373 if n_features == 0 {
374 return Err(DatasetsError::InvalidFormat(
375 "n_features must be > 0".to_string(),
376 ));
377 }
378
379 if centers == 0 {
380 return Err(DatasetsError::InvalidFormat(
381 "centers must be > 0".to_string(),
382 ));
383 }
384
385 if cluster_std <= 0.0 {
386 return Err(DatasetsError::InvalidFormat(
387 "cluster_std must be > 0.0".to_string(),
388 ));
389 }
390
391 let mut rng = match randomseed {
392 Some(_seed) => StdRng::seed_from_u64(_seed),
393 None => {
394 let mut r = rng();
395 StdRng::seed_from_u64(r.next_u64())
396 }
397 };
398
399 let mut cluster_centers = Array2::zeros((centers, n_features));
401 let center_box = 10.0;
402
403 for i in 0..centers {
404 for j in 0..n_features {
405 cluster_centers[[i, j]] = rng.gen_range(-center_box..center_box);
406 }
407 }
408
409 let mut data = Array2::zeros((n_samples, n_features));
411 let mut target = Array1::zeros(n_samples);
412
413 let normal = rand_distr::Normal::new(0.0, cluster_std).unwrap();
414
415 let samples_per_center = n_samples / centers;
417 let remainder = n_samples % centers;
418
419 let mut sample_idx = 0;
420
421 for center_idx in 0..centers {
422 let n_samples_center = if center_idx < remainder {
423 samples_per_center + 1
424 } else {
425 samples_per_center
426 };
427
428 for _ in 0..n_samples_center {
429 for j in 0..n_features {
430 data[[sample_idx, j]] = cluster_centers[[center_idx, j]] + normal.sample(&mut rng);
431 }
432
433 target[sample_idx] = center_idx as f64;
434 sample_idx += 1;
435 }
436 }
437
438 let mut dataset = Dataset::new(data, Some(target));
440
441 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
443
444 dataset = dataset
445 .with_featurenames(featurenames)
446 .with_description(format!(
447 "Synthetic clustering dataset with {centers} clusters and {n_features} _features"
448 ))
449 .with_metadata("centers", ¢ers.to_string())
450 .with_metadata("cluster_std", &cluster_std.to_string());
451
452 Ok(dataset)
453}
454
455#[allow(dead_code)]
457pub fn make_spirals(
458 n_samples: usize,
459 n_spirals: usize,
460 noise: f64,
461 randomseed: Option<u64>,
462) -> Result<Dataset> {
463 if n_samples == 0 {
465 return Err(DatasetsError::InvalidFormat(
466 "n_samples must be > 0".to_string(),
467 ));
468 }
469
470 if n_spirals == 0 {
471 return Err(DatasetsError::InvalidFormat(
472 "n_spirals must be > 0".to_string(),
473 ));
474 }
475
476 if noise < 0.0 {
477 return Err(DatasetsError::InvalidFormat(
478 "noise must be >= 0.0".to_string(),
479 ));
480 }
481
482 let mut rng = match randomseed {
483 Some(_seed) => StdRng::seed_from_u64(_seed),
484 None => {
485 let mut r = rng();
486 StdRng::seed_from_u64(r.next_u64())
487 }
488 };
489
490 let mut data = Array2::zeros((n_samples, 2));
491 let mut target = Array1::zeros(n_samples);
492
493 let normal = if noise > 0.0 {
494 Some(rand_distr::Normal::new(0.0, noise).unwrap())
495 } else {
496 None
497 };
498
499 let samples_per_spiral = n_samples / n_spirals;
500 let remainder = n_samples % n_spirals;
501
502 let mut sample_idx = 0;
503
504 for spiral in 0..n_spirals {
505 let n_samples_spiral = if spiral < remainder {
506 samples_per_spiral + 1
507 } else {
508 samples_per_spiral
509 };
510
511 let spiral_offset = 2.0 * PI * spiral as f64 / n_spirals as f64;
512
513 for i in 0..n_samples_spiral {
514 let t = 2.0 * PI * i as f64 / n_samples_spiral as f64;
515 let radius = t / (2.0 * PI);
516
517 let mut x = radius * (t + spiral_offset).cos();
518 let mut y = radius * (t + spiral_offset).sin();
519
520 if let Some(ref normal_dist) = normal {
522 x += normal_dist.sample(&mut rng);
523 y += normal_dist.sample(&mut rng);
524 }
525
526 data[[sample_idx, 0]] = x;
527 data[[sample_idx, 1]] = y;
528 target[sample_idx] = spiral as f64;
529 sample_idx += 1;
530 }
531 }
532
533 let mut dataset = Dataset::new(data, Some(target));
534 dataset = dataset
535 .with_featurenames(vec!["x".to_string(), "y".to_string()])
536 .with_targetnames((0..n_spirals).map(|i| format!("spiral_{i}")).collect())
537 .with_description(format!("Spiral dataset with {n_spirals} _spirals"))
538 .with_metadata("noise", &noise.to_string());
539
540 Ok(dataset)
541}
542
543#[allow(dead_code)]
545pub fn make_moons(n_samples: usize, noise: f64, randomseed: Option<u64>) -> Result<Dataset> {
546 if n_samples == 0 {
548 return Err(DatasetsError::InvalidFormat(
549 "n_samples must be > 0".to_string(),
550 ));
551 }
552
553 if noise < 0.0 {
554 return Err(DatasetsError::InvalidFormat(
555 "noise must be >= 0.0".to_string(),
556 ));
557 }
558
559 let mut rng = match randomseed {
560 Some(_seed) => StdRng::seed_from_u64(_seed),
561 None => {
562 let mut r = rng();
563 StdRng::seed_from_u64(r.next_u64())
564 }
565 };
566
567 let mut data = Array2::zeros((n_samples, 2));
568 let mut target = Array1::zeros(n_samples);
569
570 let normal = if noise > 0.0 {
571 Some(rand_distr::Normal::new(0.0, noise).unwrap())
572 } else {
573 None
574 };
575
576 let samples_per_moon = n_samples / 2;
577 let remainder = n_samples % 2;
578
579 let mut sample_idx = 0;
580
581 for i in 0..(samples_per_moon + remainder) {
583 let t = PI * i as f64 / (samples_per_moon + remainder) as f64;
584
585 let mut x = t.cos();
586 let mut y = t.sin();
587
588 if let Some(ref normal_dist) = normal {
590 x += normal_dist.sample(&mut rng);
591 y += normal_dist.sample(&mut rng);
592 }
593
594 data[[sample_idx, 0]] = x;
595 data[[sample_idx, 1]] = y;
596 target[sample_idx] = 0.0;
597 sample_idx += 1;
598 }
599
600 for i in 0..samples_per_moon {
602 let t = PI * i as f64 / samples_per_moon as f64;
603
604 let mut x = 1.0 - t.cos();
605 let mut y = 0.5 - t.sin(); if let Some(ref normal_dist) = normal {
609 x += normal_dist.sample(&mut rng);
610 y += normal_dist.sample(&mut rng);
611 }
612
613 data[[sample_idx, 0]] = x;
614 data[[sample_idx, 1]] = y;
615 target[sample_idx] = 1.0;
616 sample_idx += 1;
617 }
618
619 let mut dataset = Dataset::new(data, Some(target));
620 dataset = dataset
621 .with_featurenames(vec!["x".to_string(), "y".to_string()])
622 .with_targetnames(vec!["moon_0".to_string(), "moon_1".to_string()])
623 .with_description("Two moons dataset for non-linear classification".to_string())
624 .with_metadata("noise", &noise.to_string());
625
626 Ok(dataset)
627}
628
629#[allow(dead_code)]
631pub fn make_circles(
632 n_samples: usize,
633 factor: f64,
634 noise: f64,
635 randomseed: Option<u64>,
636) -> Result<Dataset> {
637 if n_samples == 0 {
639 return Err(DatasetsError::InvalidFormat(
640 "n_samples must be > 0".to_string(),
641 ));
642 }
643
644 if factor <= 0.0 || factor >= 1.0 {
645 return Err(DatasetsError::InvalidFormat(
646 "factor must be between 0.0 and 1.0".to_string(),
647 ));
648 }
649
650 if noise < 0.0 {
651 return Err(DatasetsError::InvalidFormat(
652 "noise must be >= 0.0".to_string(),
653 ));
654 }
655
656 let mut rng = match randomseed {
657 Some(_seed) => StdRng::seed_from_u64(_seed),
658 None => {
659 let mut r = rng();
660 StdRng::seed_from_u64(r.next_u64())
661 }
662 };
663
664 let mut data = Array2::zeros((n_samples, 2));
665 let mut target = Array1::zeros(n_samples);
666
667 let normal = if noise > 0.0 {
668 Some(rand_distr::Normal::new(0.0, noise).unwrap())
669 } else {
670 None
671 };
672
673 let samples_per_circle = n_samples / 2;
674 let remainder = n_samples % 2;
675
676 let mut sample_idx = 0;
677
678 for i in 0..(samples_per_circle + remainder) {
680 let angle = 2.0 * PI * i as f64 / (samples_per_circle + remainder) as f64;
681
682 let mut x = angle.cos();
683 let mut y = angle.sin();
684
685 if let Some(ref normal_dist) = normal {
687 x += normal_dist.sample(&mut rng);
688 y += normal_dist.sample(&mut rng);
689 }
690
691 data[[sample_idx, 0]] = x;
692 data[[sample_idx, 1]] = y;
693 target[sample_idx] = 0.0;
694 sample_idx += 1;
695 }
696
697 for i in 0..samples_per_circle {
699 let angle = 2.0 * PI * i as f64 / samples_per_circle as f64;
700
701 let mut x = factor * angle.cos();
702 let mut y = factor * angle.sin();
703
704 if let Some(ref normal_dist) = normal {
706 x += normal_dist.sample(&mut rng);
707 y += normal_dist.sample(&mut rng);
708 }
709
710 data[[sample_idx, 0]] = x;
711 data[[sample_idx, 1]] = y;
712 target[sample_idx] = 1.0;
713 sample_idx += 1;
714 }
715
716 let mut dataset = Dataset::new(data, Some(target));
717 dataset = dataset
718 .with_featurenames(vec!["x".to_string(), "y".to_string()])
719 .with_targetnames(vec!["outer_circle".to_string(), "inner_circle".to_string()])
720 .with_description("Concentric circles dataset for non-linear classification".to_string())
721 .with_metadata("factor", &factor.to_string())
722 .with_metadata("noise", &noise.to_string());
723
724 Ok(dataset)
725}
726
727#[allow(dead_code)]
729pub fn make_swiss_roll(n_samples: usize, noise: f64, randomseed: Option<u64>) -> Result<Dataset> {
730 if n_samples == 0 {
732 return Err(DatasetsError::InvalidFormat(
733 "n_samples must be > 0".to_string(),
734 ));
735 }
736
737 if noise < 0.0 {
738 return Err(DatasetsError::InvalidFormat(
739 "noise must be >= 0.0".to_string(),
740 ));
741 }
742
743 let mut rng = match randomseed {
744 Some(_seed) => StdRng::seed_from_u64(_seed),
745 None => {
746 let mut r = rng();
747 StdRng::seed_from_u64(r.next_u64())
748 }
749 };
750
751 let mut data = Array2::zeros((n_samples, 3));
752 let mut color = Array1::zeros(n_samples); let normal = if noise > 0.0 {
755 Some(rand_distr::Normal::new(0.0, noise).unwrap())
756 } else {
757 None
758 };
759
760 for i in 0..n_samples {
761 let t = 1.5 * PI * (1.0 + 2.0 * i as f64 / n_samples as f64);
763
764 let height = 21.0 * i as f64 / n_samples as f64;
766
767 let mut x = t * t.cos();
768 let mut y = height;
769 let mut z = t * t.sin();
770
771 if let Some(ref normal_dist) = normal {
773 x += normal_dist.sample(&mut rng);
774 y += normal_dist.sample(&mut rng);
775 z += normal_dist.sample(&mut rng);
776 }
777
778 data[[i, 0]] = x;
779 data[[i, 1]] = y;
780 data[[i, 2]] = z;
781 color[i] = t; }
783
784 let mut dataset = Dataset::new(data, Some(color));
785 dataset = dataset
786 .with_featurenames(vec!["x".to_string(), "y".to_string(), "z".to_string()])
787 .with_description("Swiss roll manifold dataset for dimensionality reduction".to_string())
788 .with_metadata("noise", &noise.to_string())
789 .with_metadata("dimensions", "3")
790 .with_metadata("manifold_dim", "2");
791
792 Ok(dataset)
793}
794
795#[allow(clippy::too_many_arguments)]
797#[allow(dead_code)]
798pub fn make_anisotropic_blobs(
799 n_samples: usize,
800 n_features: usize,
801 centers: usize,
802 cluster_std: f64,
803 anisotropy_factor: f64,
804 randomseed: Option<u64>,
805) -> Result<Dataset> {
806 if n_samples == 0 {
808 return Err(DatasetsError::InvalidFormat(
809 "n_samples must be > 0".to_string(),
810 ));
811 }
812
813 if n_features < 2 {
814 return Err(DatasetsError::InvalidFormat(
815 "n_features must be >= 2 for anisotropic clusters".to_string(),
816 ));
817 }
818
819 if centers == 0 {
820 return Err(DatasetsError::InvalidFormat(
821 "centers must be > 0".to_string(),
822 ));
823 }
824
825 if cluster_std <= 0.0 {
826 return Err(DatasetsError::InvalidFormat(
827 "cluster_std must be > 0.0".to_string(),
828 ));
829 }
830
831 if anisotropy_factor <= 0.0 {
832 return Err(DatasetsError::InvalidFormat(
833 "anisotropy_factor must be > 0.0".to_string(),
834 ));
835 }
836
837 let mut rng = match randomseed {
838 Some(_seed) => StdRng::seed_from_u64(_seed),
839 None => {
840 let mut r = rng();
841 StdRng::seed_from_u64(r.next_u64())
842 }
843 };
844
845 let mut cluster_centers = Array2::zeros((centers, n_features));
847 let center_box = 10.0;
848
849 for i in 0..centers {
850 for j in 0..n_features {
851 cluster_centers[[i, j]] = rng.gen_range(-center_box..center_box);
852 }
853 }
854
855 let mut data = Array2::zeros((n_samples, n_features));
857 let mut target = Array1::zeros(n_samples);
858
859 let normal = rand_distr::Normal::new(0.0, cluster_std).unwrap();
860
861 let samples_per_center = n_samples / centers;
862 let remainder = n_samples % centers;
863
864 let mut sample_idx = 0;
865
866 for center_idx in 0..centers {
867 let n_samples_center = if center_idx < remainder {
868 samples_per_center + 1
869 } else {
870 samples_per_center
871 };
872
873 let rotation_angle = rng.gen_range(0.0..(2.0 * PI));
875
876 for _ in 0..n_samples_center {
877 let mut point = vec![0.0; n_features];
879
880 point[0] = normal.sample(&mut rng);
882 point[1] = normal.sample(&mut rng) / anisotropy_factor;
883
884 for item in point.iter_mut().take(n_features).skip(2) {
886 *item = normal.sample(&mut rng);
887 }
888
889 if n_features >= 2 {
891 let cos_theta = rotation_angle.cos();
892 let sin_theta = rotation_angle.sin();
893
894 let x_rot = cos_theta * point[0] - sin_theta * point[1];
895 let y_rot = sin_theta * point[0] + cos_theta * point[1];
896
897 point[0] = x_rot;
898 point[1] = y_rot;
899 }
900
901 for j in 0..n_features {
903 data[[sample_idx, j]] = cluster_centers[[center_idx, j]] + point[j];
904 }
905
906 target[sample_idx] = center_idx as f64;
907 sample_idx += 1;
908 }
909 }
910
911 let mut dataset = Dataset::new(data, Some(target));
912 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
913
914 dataset = dataset
915 .with_featurenames(featurenames)
916 .with_description(format!(
917 "Anisotropic clustering dataset with {centers} elongated clusters and {n_features} _features"
918 ))
919 .with_metadata("centers", ¢ers.to_string())
920 .with_metadata("cluster_std", &cluster_std.to_string())
921 .with_metadata("anisotropy_factor", &anisotropy_factor.to_string());
922
923 Ok(dataset)
924}
925
926#[allow(clippy::too_many_arguments)]
928#[allow(dead_code)]
929pub fn make_hierarchical_clusters(
930 n_samples: usize,
931 n_features: usize,
932 n_main_clusters: usize,
933 n_sub_clusters: usize,
934 main_cluster_std: f64,
935 sub_cluster_std: f64,
936 randomseed: Option<u64>,
937) -> Result<Dataset> {
938 if n_samples == 0 {
940 return Err(DatasetsError::InvalidFormat(
941 "n_samples must be > 0".to_string(),
942 ));
943 }
944
945 if n_features == 0 {
946 return Err(DatasetsError::InvalidFormat(
947 "n_features must be > 0".to_string(),
948 ));
949 }
950
951 if n_main_clusters == 0 {
952 return Err(DatasetsError::InvalidFormat(
953 "n_main_clusters must be > 0".to_string(),
954 ));
955 }
956
957 if n_sub_clusters == 0 {
958 return Err(DatasetsError::InvalidFormat(
959 "n_sub_clusters must be > 0".to_string(),
960 ));
961 }
962
963 if main_cluster_std <= 0.0 {
964 return Err(DatasetsError::InvalidFormat(
965 "main_cluster_std must be > 0.0".to_string(),
966 ));
967 }
968
969 if sub_cluster_std <= 0.0 {
970 return Err(DatasetsError::InvalidFormat(
971 "sub_cluster_std must be > 0.0".to_string(),
972 ));
973 }
974
975 let mut rng = match randomseed {
976 Some(_seed) => StdRng::seed_from_u64(_seed),
977 None => {
978 let mut r = rng();
979 StdRng::seed_from_u64(r.next_u64())
980 }
981 };
982
983 let mut main_centers = Array2::zeros((n_main_clusters, n_features));
985 let center_box = 20.0;
986
987 for i in 0..n_main_clusters {
988 for j in 0..n_features {
989 main_centers[[i, j]] = rng.gen_range(-center_box..center_box);
990 }
991 }
992
993 let mut data = Array2::zeros((n_samples, n_features));
994 let mut main_target = Array1::zeros(n_samples);
995 let mut sub_target = Array1::zeros(n_samples);
996
997 let main_normal = rand_distr::Normal::new(0.0, main_cluster_std).unwrap();
998 let sub_normal = rand_distr::Normal::new(0.0, sub_cluster_std).unwrap();
999
1000 let samples_per_main = n_samples / n_main_clusters;
1001 let remainder = n_samples % n_main_clusters;
1002
1003 let mut sample_idx = 0;
1004
1005 for main_idx in 0..n_main_clusters {
1006 let n_samples_main = if main_idx < remainder {
1007 samples_per_main + 1
1008 } else {
1009 samples_per_main
1010 };
1011
1012 let mut sub_centers = Array2::zeros((n_sub_clusters, n_features));
1014 for i in 0..n_sub_clusters {
1015 for j in 0..n_features {
1016 sub_centers[[i, j]] = main_centers[[main_idx, j]] + main_normal.sample(&mut rng);
1017 }
1018 }
1019
1020 let samples_per_sub = n_samples_main / n_sub_clusters;
1021 let sub_remainder = n_samples_main % n_sub_clusters;
1022
1023 for sub_idx in 0..n_sub_clusters {
1024 let n_samples_sub = if sub_idx < sub_remainder {
1025 samples_per_sub + 1
1026 } else {
1027 samples_per_sub
1028 };
1029
1030 for _ in 0..n_samples_sub {
1031 for j in 0..n_features {
1032 data[[sample_idx, j]] = sub_centers[[sub_idx, j]] + sub_normal.sample(&mut rng);
1033 }
1034
1035 main_target[sample_idx] = main_idx as f64;
1036 sub_target[sample_idx] = (main_idx * n_sub_clusters + sub_idx) as f64;
1037 sample_idx += 1;
1038 }
1039 }
1040 }
1041
1042 let mut dataset = Dataset::new(data, Some(main_target));
1043 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
1044
1045 dataset = dataset
1046 .with_featurenames(featurenames)
1047 .with_description(format!(
1048 "Hierarchical clustering dataset with {n_main_clusters} main clusters, {n_sub_clusters} sub-_clusters each"
1049 ))
1050 .with_metadata("n_main_clusters", &n_main_clusters.to_string())
1051 .with_metadata("n_sub_clusters", &n_sub_clusters.to_string())
1052 .with_metadata("main_cluster_std", &main_cluster_std.to_string())
1053 .with_metadata("sub_cluster_std", &sub_cluster_std.to_string());
1054
1055 let sub_target_vec = sub_target.to_vec();
1056 dataset = dataset.with_metadata("sub_cluster_labels", &format!("{sub_target_vec:?}"));
1057
1058 Ok(dataset)
1059}
1060
/// Missingness mechanisms understood by `inject_missing_data`.
#[derive(Debug, Clone, Copy)]
pub enum MissingPattern {
    /// Missing Completely At Random: every cell is dropped independently
    /// with the same probability.
    MCAR,
    /// Missing At Random: the drop probability of columns 1.. depends on
    /// the row's (observed) first-column value.
    MAR,
    /// Missing Not At Random: the drop probability of a cell depends on
    /// that cell's own value.
    MNAR,
    /// Contiguous rectangular blocks of cells are dropped.
    Block,
}
1073
/// Kinds of outliers that `inject_outliers` can add to a dataset.
#[derive(Debug, Clone, Copy)]
pub enum OutlierType {
    /// Individual samples shifted far from the feature means in every
    /// dimension.
    Point,
    /// Samples shifted only along a random subset of the features.
    Contextual,
    /// Small groups of samples placed around a shared far-away center.
    Collective,
}
1084
1085#[allow(dead_code)]
1087pub fn inject_missing_data(
1088 data: &mut Array2<f64>,
1089 missing_rate: f64,
1090 pattern: MissingPattern,
1091 randomseed: Option<u64>,
1092) -> Result<Array2<bool>> {
1093 if !(0.0..=1.0).contains(&missing_rate) {
1095 return Err(DatasetsError::InvalidFormat(
1096 "missing_rate must be between 0.0 and 1.0".to_string(),
1097 ));
1098 }
1099
1100 let mut rng = match randomseed {
1101 Some(_seed) => StdRng::seed_from_u64(_seed),
1102 None => {
1103 let mut r = rng();
1104 StdRng::seed_from_u64(r.next_u64())
1105 }
1106 };
1107
1108 let (n_samples, n_features) = data.dim();
1109 let mut missing_mask = Array2::from_elem((n_samples, n_features), false);
1110
1111 match pattern {
1112 MissingPattern::MCAR => {
1113 for i in 0..n_samples {
1115 for j in 0..n_features {
1116 if rng.gen_range(0.0f64..1.0) < missing_rate {
1117 missing_mask[[i, j]] = true;
1118 data[[i, j]] = f64::NAN;
1119 }
1120 }
1121 }
1122 }
1123 MissingPattern::MAR => {
1124 for i in 0..n_samples {
1126 let first_feature_val = data[[i, 0]];
1127 let normalized_val = (first_feature_val + 10.0) / 20.0; let adjusted_rate = missing_rate * normalized_val.clamp(0.1, 2.0);
1129
1130 for j in 1..n_features {
1131 if rng.gen_range(0.0f64..1.0) < adjusted_rate {
1133 missing_mask[[i, j]] = true;
1134 data[[i, j]] = f64::NAN;
1135 }
1136 }
1137 }
1138 }
1139 MissingPattern::MNAR => {
1140 for i in 0..n_samples {
1142 for j in 0..n_features {
1143 let value = data[[i, j]];
1144 let normalized_val = (value + 10.0) / 20.0; let adjusted_rate = missing_rate * normalized_val.clamp(0.1, 3.0);
1146
1147 if rng.gen_range(0.0f64..1.0) < adjusted_rate {
1148 missing_mask[[i, j]] = true;
1149 data[[i, j]] = f64::NAN;
1150 }
1151 }
1152 }
1153 }
1154 MissingPattern::Block => {
1155 let block_size = (n_features as f64 * missing_rate).ceil() as usize;
1157 let n_blocks = (missing_rate * n_samples as f64).ceil() as usize;
1158
1159 for _ in 0..n_blocks {
1160 let start_row = rng.sample(Uniform::new(0, n_samples).unwrap());
1161 let start_col =
1162 rng.sample(Uniform::new(0, n_features.saturating_sub(block_size)).unwrap());
1163
1164 for i in start_row..n_samples.min(start_row + block_size) {
1165 for j in start_col..n_features.min(start_col + block_size) {
1166 missing_mask[[i, j]] = true;
1167 data[[i, j]] = f64::NAN;
1168 }
1169 }
1170 }
1171 }
1172 }
1173
1174 Ok(missing_mask)
1175}
1176
1177#[allow(dead_code)]
1179pub fn inject_outliers(
1180 data: &mut Array2<f64>,
1181 outlier_rate: f64,
1182 outlier_type: OutlierType,
1183 outlier_strength: f64,
1184 randomseed: Option<u64>,
1185) -> Result<Array1<bool>> {
1186 if !(0.0..=1.0).contains(&outlier_rate) {
1188 return Err(DatasetsError::InvalidFormat(
1189 "outlier_rate must be between 0.0 and 1.0".to_string(),
1190 ));
1191 }
1192
1193 if outlier_strength <= 0.0 {
1194 return Err(DatasetsError::InvalidFormat(
1195 "outlier_strength must be > 0.0".to_string(),
1196 ));
1197 }
1198
1199 let mut rng = match randomseed {
1200 Some(_seed) => StdRng::seed_from_u64(_seed),
1201 None => {
1202 let mut r = rng();
1203 StdRng::seed_from_u64(r.next_u64())
1204 }
1205 };
1206
1207 let (n_samples, n_features) = data.dim();
1208 let n_outliers = (n_samples as f64 * outlier_rate).ceil() as usize;
1209 let mut outlier_mask = Array1::from_elem(n_samples, false);
1210
1211 let mut feature_means = vec![0.0; n_features];
1213 let mut feature_stds = vec![0.0; n_features];
1214
1215 for j in 0..n_features {
1216 let column = data.column(j);
1217 let valid_values: Vec<f64> = column.iter().filter(|&&x| !x.is_nan()).cloned().collect();
1218
1219 if !valid_values.is_empty() {
1220 feature_means[j] = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
1221 let variance = valid_values
1222 .iter()
1223 .map(|&x| (x - feature_means[j]).powi(2))
1224 .sum::<f64>()
1225 / valid_values.len() as f64;
1226 feature_stds[j] = variance.sqrt().max(1.0); }
1228 }
1229
1230 match outlier_type {
1231 OutlierType::Point => {
1232 for _ in 0..n_outliers {
1234 let outlier_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
1235 outlier_mask[outlier_idx] = true;
1236
1237 for j in 0..n_features {
1239 let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
1240 -1.0
1241 } else {
1242 1.0
1243 };
1244 data[[outlier_idx, j]] =
1245 feature_means[j] + direction * outlier_strength * feature_stds[j];
1246 }
1247 }
1248 }
1249 OutlierType::Contextual => {
1250 for _ in 0..n_outliers {
1252 let outlier_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
1253 outlier_mask[outlier_idx] = true;
1254
1255 let n_features_to_modify =
1257 rng.sample(Uniform::new(1, (n_features / 2).max(1) + 1).unwrap());
1258 let mut features_to_modify: Vec<usize> = (0..n_features).collect();
1259 features_to_modify.shuffle(&mut rng);
1260 features_to_modify.truncate(n_features_to_modify);
1261
1262 for &j in &features_to_modify {
1263 let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
1264 -1.0
1265 } else {
1266 1.0
1267 };
1268 data[[outlier_idx, j]] =
1269 feature_means[j] + direction * outlier_strength * feature_stds[j];
1270 }
1271 }
1272 }
1273 OutlierType::Collective => {
1274 let outliers_per_group = (n_outliers / 3).max(2); let n_groups = (n_outliers / outliers_per_group).max(1);
1277
1278 for _ in 0..n_groups {
1279 let mut outlier_center = vec![0.0; n_features];
1281 for j in 0..n_features {
1282 let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
1283 -1.0
1284 } else {
1285 1.0
1286 };
1287 outlier_center[j] =
1288 feature_means[j] + direction * outlier_strength * feature_stds[j];
1289 }
1290
1291 for _ in 0..outliers_per_group {
1293 let outlier_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
1294 outlier_mask[outlier_idx] = true;
1295
1296 for j in 0..n_features {
1297 let noise = rng.gen_range(-0.5f64..0.5f64) * feature_stds[j];
1298 data[[outlier_idx, j]] = outlier_center[j] + noise;
1299 }
1300 }
1301 }
1302 }
1303 }
1304
1305 Ok(outlier_mask)
1306}
1307
1308#[allow(dead_code)]
1310pub fn add_time_series_noise(
1311 data: &mut Array2<f64>,
1312 noise_types: &[(&str, f64)], randomseed: Option<u64>,
1314) -> Result<()> {
1315 let mut rng = match randomseed {
1316 Some(_seed) => StdRng::seed_from_u64(_seed),
1317 None => {
1318 let mut r = rng();
1319 StdRng::seed_from_u64(r.next_u64())
1320 }
1321 };
1322
1323 let (n_samples, n_features) = data.dim();
1324 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
1325
1326 for &(noise_type, strength) in noise_types {
1327 match noise_type {
1328 "gaussian" => {
1329 for i in 0..n_samples {
1331 for j in 0..n_features {
1332 data[[i, j]] += strength * normal.sample(&mut rng);
1333 }
1334 }
1335 }
1336 "spikes" => {
1337 let n_spikes = (n_samples as f64 * strength * 0.1).ceil() as usize;
1339 for _ in 0..n_spikes {
1340 let spike_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
1341 let feature_idx = rng.sample(Uniform::new(0, n_features).unwrap());
1342 let spike_magnitude = rng.gen_range(5.0..=15.0) * strength;
1343 let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
1344 -1.0
1345 } else {
1346 1.0
1347 };
1348
1349 data[[spike_idx, feature_idx]] += direction * spike_magnitude;
1350 }
1351 }
1352 "drift" => {
1353 for i in 0..n_samples {
1355 let drift_amount = strength * (i as f64 / n_samples as f64);
1356 for j in 0..n_features {
1357 data[[i, j]] += drift_amount;
1358 }
1359 }
1360 }
1361 "seasonal" => {
1362 let period = n_samples as f64 / 4.0; for i in 0..n_samples {
1365 let seasonal_component = strength * (2.0 * PI * i as f64 / period).sin();
1366 for j in 0..n_features {
1367 data[[i, j]] += seasonal_component;
1368 }
1369 }
1370 }
1371 "autocorrelated" => {
1372 let ar_coeff = 0.7; for j in 0..n_features {
1375 let mut prev_noise = 0.0;
1376 for i in 0..n_samples {
1377 let new_noise = ar_coeff * prev_noise + strength * normal.sample(&mut rng);
1378 data[[i, j]] += new_noise;
1379 prev_noise = new_noise;
1380 }
1381 }
1382 }
1383 "heteroscedastic" => {
1384 for i in 0..n_samples {
1386 let variance_factor = 1.0 + strength * (i as f64 / n_samples as f64);
1387 for j in 0..n_features {
1388 data[[i, j]] += variance_factor * strength * normal.sample(&mut rng);
1389 }
1390 }
1391 }
1392 _ => {
1393 return Err(DatasetsError::InvalidFormat(format!(
1394 "Unknown noise type: {noise_type}. Supported , types: gaussian, spikes, drift, seasonal, autocorrelated, heteroscedastic"
1395 )));
1396 }
1397 }
1398 }
1399
1400 Ok(())
1401}
1402
1403#[allow(dead_code)]
1405pub fn make_corrupted_dataset(
1406 base_dataset: &Dataset,
1407 missing_rate: f64,
1408 missing_pattern: MissingPattern,
1409 outlier_rate: f64,
1410 outlier_type: OutlierType,
1411 outlier_strength: f64,
1412 randomseed: Option<u64>,
1413) -> Result<Dataset> {
1414 if !(0.0..=1.0).contains(&missing_rate) {
1416 return Err(DatasetsError::InvalidFormat(
1417 "missing_rate must be between 0.0 and 1.0".to_string(),
1418 ));
1419 }
1420
1421 if !(0.0..=1.0).contains(&outlier_rate) {
1422 return Err(DatasetsError::InvalidFormat(
1423 "outlier_rate must be between 0.0 and 1.0".to_string(),
1424 ));
1425 }
1426
1427 let mut corrupted_data = base_dataset.data.clone();
1429 let corrupted_target = base_dataset.target.clone();
1430
1431 let missing_mask = inject_missing_data(
1433 &mut corrupted_data,
1434 missing_rate,
1435 missing_pattern,
1436 randomseed,
1437 )?;
1438
1439 let outlier_mask = inject_outliers(
1441 &mut corrupted_data,
1442 outlier_rate,
1443 outlier_type,
1444 outlier_strength,
1445 randomseed,
1446 )?;
1447
1448 let mut corrupted_dataset = Dataset::new(corrupted_data, corrupted_target);
1450
1451 if let Some(featurenames) = &base_dataset.featurenames {
1452 corrupted_dataset = corrupted_dataset.with_featurenames(featurenames.clone());
1453 }
1454
1455 if let Some(targetnames) = &base_dataset.targetnames {
1456 corrupted_dataset = corrupted_dataset.with_targetnames(targetnames.clone());
1457 }
1458
1459 corrupted_dataset = corrupted_dataset
1460 .with_description(format!(
1461 "Corrupted version of: {}",
1462 base_dataset
1463 .description
1464 .as_deref()
1465 .unwrap_or("Unknown _dataset")
1466 ))
1467 .with_metadata("missing_rate", &missing_rate.to_string())
1468 .with_metadata("missing_pattern", &format!("{missing_pattern:?}"))
1469 .with_metadata("outlier_rate", &outlier_rate.to_string())
1470 .with_metadata("outlier_type", &format!("{outlier_type:?}"))
1471 .with_metadata("outlier_strength", &outlier_strength.to_string())
1472 .with_metadata(
1473 "missing_count",
1474 &missing_mask.iter().filter(|&&x| x).count().to_string(),
1475 )
1476 .with_metadata(
1477 "outlier_count",
1478 &outlier_mask.iter().filter(|&&x| x).count().to_string(),
1479 );
1480
1481 Ok(corrupted_dataset)
1482}
1483
/// Configuration for GPU-accelerated dataset generation.
#[derive(Debug, Clone)]
pub struct GpuConfig {
    /// Attempt GPU execution; generators fall back to the CPU path when
    /// this is false or no GPU context can be created.
    pub use_gpu: bool,
    /// Index of the GPU device to target (passed to the CUDA backend).
    pub device_id: usize,
    /// Use single precision on device (disables double precision in the
    /// GPU context).
    pub use_single_precision: bool,
    /// Maximum number of samples generated per chunk (capped at n_samples).
    pub chunk_size: usize,
}
1496
1497impl Default for GpuConfig {
1498 fn default() -> Self {
1499 Self {
1500 use_gpu: true,
1501 device_id: 0,
1502 use_single_precision: false,
1503 chunk_size: 10000,
1504 }
1505 }
1506}
1507
1508impl GpuConfig {
1509 pub fn new() -> Self {
1511 Self::default()
1512 }
1513
1514 pub fn with_gpu(mut self, use_gpu: bool) -> Self {
1516 self.use_gpu = use_gpu;
1517 self
1518 }
1519
1520 pub fn with_device(mut self, device_id: usize) -> Self {
1522 self.device_id = device_id;
1523 self
1524 }
1525
1526 pub fn with_single_precision(mut self, single_precision: bool) -> Self {
1528 self.use_single_precision = single_precision;
1529 self
1530 }
1531
1532 pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
1534 self.chunk_size = chunk_size;
1535 self
1536 }
1537}
1538
1539#[allow(dead_code)]
1541#[allow(clippy::too_many_arguments)]
1542pub fn make_classification_gpu(
1543 n_samples: usize,
1544 n_features: usize,
1545 n_classes: usize,
1546 n_clusters_per_class: usize,
1547 n_informative: usize,
1548 randomseed: Option<u64>,
1549 gpuconfig: GpuConfig,
1550) -> Result<Dataset> {
1551 if gpuconfig.use_gpu && gpu_is_available() {
1553 make_classification_gpu_impl(
1554 n_samples,
1555 n_features,
1556 n_classes,
1557 n_clusters_per_class,
1558 n_informative,
1559 randomseed,
1560 gpuconfig,
1561 )
1562 } else {
1563 make_classification(
1565 n_samples,
1566 n_features,
1567 n_classes,
1568 n_clusters_per_class,
1569 n_informative,
1570 randomseed,
1571 )
1572 }
1573}
1574
1575#[allow(dead_code)]
1577fn make_classification_gpu_impl(
1578 n_samples: usize,
1579 n_features: usize,
1580 n_classes: usize,
1581 n_clusters_per_class: usize,
1582 n_informative: usize,
1583 randomseed: Option<u64>,
1584 gpuconfig: GpuConfig,
1585) -> Result<Dataset> {
1586 if n_samples == 0 || n_features == 0 || n_informative == 0 {
1588 return Err(DatasetsError::InvalidFormat(
1589 "n_samples, n_features, and n_informative must be > 0".to_string(),
1590 ));
1591 }
1592
1593 if n_features < n_informative {
1594 return Err(DatasetsError::InvalidFormat(format!(
1595 "n_features ({n_features}) must be >= n_informative ({n_informative})"
1596 )));
1597 }
1598
1599 if n_classes < 2 || n_clusters_per_class == 0 {
1600 return Err(DatasetsError::InvalidFormat(
1601 "n_classes must be >= 2 and n_clusters_per_class must be > 0".to_string(),
1602 ));
1603 }
1604
1605 let gpu_context = GpuContext::new(crate::gpu::GpuConfig {
1607 backend: crate::gpu::GpuBackend::Cuda {
1608 device_id: gpuconfig.device_id as u32,
1609 },
1610 memory: crate::gpu::GpuMemoryConfig::default(),
1611 threads_per_block: 256,
1612 enable_double_precision: !gpuconfig.use_single_precision,
1613 use_fast_math: false,
1614 random_seed: None,
1615 })
1616 .map_err(|e| DatasetsError::Other(format!("Failed to create GPU context: {e}")))?;
1617
1618 let chunk_size = std::cmp::min(gpuconfig.chunk_size, n_samples);
1620 let num_chunks = n_samples.div_ceil(chunk_size);
1621
1622 let mut all_data = Vec::new();
1623 let mut all_targets = Vec::new();
1624
1625 for chunk_idx in 0..num_chunks {
1626 let start_idx = chunk_idx * chunk_size;
1627 let end_idx = std::cmp::min(start_idx + chunk_size, n_samples);
1628 let chunk_samples = end_idx - start_idx;
1629
1630 let (chunk_data, chunk_targets) = generate_classification_chunk_gpu(
1632 &gpu_context,
1633 chunk_samples,
1634 n_features,
1635 n_classes,
1636 n_clusters_per_class,
1637 n_informative,
1638 randomseed.map(|s| s + chunk_idx as u64),
1639 gpuconfig.use_single_precision,
1640 )?;
1641
1642 all_data.extend(chunk_data);
1643 all_targets.extend(chunk_targets);
1644 }
1645
1646 let data = Array2::from_shape_vec((n_samples, n_features), all_data)
1648 .map_err(|e| DatasetsError::Other(format!("Failed to create data array: {e}")))?;
1649
1650 let target = Array1::from_vec(all_targets);
1651
1652 let mut dataset = Dataset::new(data, Some(target));
1654
1655 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
1657 let classnames: Vec<String> = (0..n_classes).map(|i| format!("class_{i}")).collect();
1658
1659 dataset = dataset
1660 .with_featurenames(featurenames)
1661 .with_targetnames(classnames)
1662 .with_description(format!(
1663 "GPU-accelerated synthetic classification dataset with {n_classes} _classes and {n_features} _features"
1664 ));
1665
1666 Ok(dataset)
1667}
1668
1669#[allow(dead_code)]
1671#[allow(clippy::too_many_arguments)]
1672fn generate_classification_chunk_gpu(
1673 gpu_context: &GpuContext,
1674 n_samples: usize,
1675 n_features: usize,
1676 n_classes: usize,
1677 n_clusters_per_class: usize,
1678 n_informative: usize,
1679 randomseed: Option<u64>,
1680 _use_single_precision: bool,
1681) -> Result<(Vec<f64>, Vec<f64>)> {
1682 let _seed = randomseed.unwrap_or(42);
1686 let mut rng = StdRng::seed_from_u64(_seed);
1687
1688 let n_centroids = n_classes * n_clusters_per_class;
1690 let mut centroids = vec![0.0; n_centroids * n_informative];
1691
1692 for i in 0..n_centroids {
1693 for j in 0..n_informative {
1694 centroids[i * n_informative + j] = 2.0 * rng.gen_range(-1.0f64..1.0f64);
1695 }
1696 }
1697
1698 let mut data = vec![0.0; n_samples * n_features];
1700 let mut targets = vec![0.0; n_samples];
1701
1702 if *gpu_context.backend() != LocalGpuBackend::Cpu {
1704 return generate_classification_gpu_optimized(
1705 gpu_context,
1706 ¢roids,
1707 n_samples,
1708 n_features,
1709 n_classes,
1710 n_clusters_per_class,
1711 n_informative,
1712 &mut rng,
1713 );
1714 }
1715
1716 let samples_per_class = n_samples / n_classes;
1718 let remainder = n_samples % n_classes;
1719
1720 let mut sample_idx = 0;
1721 for _class in 0..n_classes {
1722 let n_samples_class = if _class < remainder {
1723 samples_per_class + 1
1724 } else {
1725 samples_per_class
1726 };
1727
1728 let samples_per_cluster = n_samples_class / n_clusters_per_class;
1729 let cluster_remainder = n_samples_class % n_clusters_per_class;
1730
1731 for cluster in 0..n_clusters_per_class {
1732 let n_samples_cluster = if cluster < cluster_remainder {
1733 samples_per_cluster + 1
1734 } else {
1735 samples_per_cluster
1736 };
1737
1738 let centroid_idx = _class * n_clusters_per_class + cluster;
1739
1740 for _ in 0..n_samples_cluster {
1741 for j in 0..n_informative {
1743 let centroid_val = centroids[centroid_idx * n_informative + j];
1744 let noise = rand_distr::Normal::new(0.0, 0.3).unwrap().sample(&mut rng);
1745 data[sample_idx * n_features + j] = centroid_val + noise;
1746 }
1747
1748 for j in n_informative..n_features {
1750 data[sample_idx * n_features + j] =
1751 rand_distr::Normal::new(0.0, 1.0).unwrap().sample(&mut rng);
1752 }
1753
1754 targets[sample_idx] = _class as f64;
1755 sample_idx += 1;
1756 }
1757 }
1758 }
1759
1760 Ok((data, targets))
1761}
1762
1763#[allow(clippy::too_many_arguments)]
1765#[allow(dead_code)]
1766fn generate_classification_gpu_optimized(
1767 _gpu_context: &GpuContext,
1768 centroids: &[f64],
1769 n_samples: usize,
1770 n_features: usize,
1771 n_classes: usize,
1772 n_clusters_per_class: usize,
1773 n_informative: usize,
1774 rng: &mut StdRng,
1775) -> Result<(Vec<f64>, Vec<f64>)> {
1776 use rand_distr::Distribution;
1781 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
1782
1783 let mut data = vec![0.0; n_samples * n_features];
1784 let mut targets = vec![0.0; n_samples];
1785
1786 let samples_per_class = n_samples / n_classes;
1788 let remainder = n_samples % n_classes;
1789
1790 let mut sample_idx = 0;
1791
1792 for _class in 0..n_classes {
1793 let n_samples_class = if _class < remainder {
1794 samples_per_class + 1
1795 } else {
1796 samples_per_class
1797 };
1798
1799 let samples_per_cluster = n_samples_class / n_clusters_per_class;
1801 let cluster_remainder = n_samples_class % n_clusters_per_class;
1802
1803 for cluster in 0..n_clusters_per_class {
1804 let n_samples_cluster = if cluster < cluster_remainder {
1805 samples_per_cluster + 1
1806 } else {
1807 samples_per_cluster
1808 };
1809
1810 let centroid_idx = _class * n_clusters_per_class + cluster;
1811
1812 for _ in 0..n_samples_cluster {
1813 for j in 0..n_informative {
1815 let centroid_val = centroids[centroid_idx * n_informative + j];
1816 data[sample_idx * n_features + j] = centroid_val + 0.3 * normal.sample(rng);
1817 }
1818
1819 for j in n_informative..n_features {
1821 data[sample_idx * n_features + j] = normal.sample(rng);
1822 }
1823
1824 targets[sample_idx] = _class as f64;
1825 sample_idx += 1;
1826 }
1827 }
1828 }
1829
1830 Ok((data, targets))
1833}
1834
1835#[allow(dead_code)]
1837#[allow(clippy::too_many_arguments)]
1838pub fn make_regression_gpu(
1839 n_samples: usize,
1840 n_features: usize,
1841 n_informative: usize,
1842 noise: f64,
1843 randomseed: Option<u64>,
1844 gpuconfig: GpuConfig,
1845) -> Result<Dataset> {
1846 if gpuconfig.use_gpu && gpu_is_available() {
1848 make_regression_gpu_impl(
1849 n_samples,
1850 n_features,
1851 n_informative,
1852 noise,
1853 randomseed,
1854 gpuconfig,
1855 )
1856 } else {
1857 make_regression(n_samples, n_features, n_informative, noise, randomseed)
1859 }
1860}
1861
1862#[allow(dead_code)]
1864fn make_regression_gpu_impl(
1865 n_samples: usize,
1866 n_features: usize,
1867 n_informative: usize,
1868 noise: f64,
1869 randomseed: Option<u64>,
1870 gpuconfig: GpuConfig,
1871) -> Result<Dataset> {
1872 if n_samples == 0 || n_features == 0 || n_informative == 0 {
1874 return Err(DatasetsError::InvalidFormat(
1875 "n_samples, n_features, and n_informative must be > 0".to_string(),
1876 ));
1877 }
1878
1879 if n_features < n_informative {
1880 return Err(DatasetsError::InvalidFormat(format!(
1881 "n_features ({n_features}) must be >= n_informative ({n_informative})"
1882 )));
1883 }
1884
1885 let gpu_context = GpuContext::new(crate::gpu::GpuConfig {
1887 backend: crate::gpu::GpuBackend::Cuda {
1888 device_id: gpuconfig.device_id as u32,
1889 },
1890 memory: crate::gpu::GpuMemoryConfig::default(),
1891 threads_per_block: 256,
1892 enable_double_precision: !gpuconfig.use_single_precision,
1893 use_fast_math: false,
1894 random_seed: None,
1895 })
1896 .map_err(|e| DatasetsError::Other(format!("Failed to create GPU context: {e}")))?;
1897
1898 let _seed = randomseed.unwrap_or(42);
1899 let mut rng = StdRng::seed_from_u64(_seed);
1900
1901 let mut coefficients = vec![0.0; n_informative];
1903 for coeff in coefficients.iter_mut().take(n_informative) {
1904 *coeff = rng.gen_range(-2.0f64..2.0f64);
1905 }
1906
1907 let chunk_size = std::cmp::min(gpuconfig.chunk_size, n_samples);
1909 let num_chunks = n_samples.div_ceil(chunk_size);
1910
1911 let mut all_data = Vec::new();
1912 let mut all_targets = Vec::new();
1913
1914 for chunk_idx in 0..num_chunks {
1915 let start_idx = chunk_idx * chunk_size;
1916 let end_idx = std::cmp::min(start_idx + chunk_size, n_samples);
1917 let chunk_samples = end_idx - start_idx;
1918
1919 let (chunk_data, chunk_targets) = generate_regression_chunk_gpu(
1921 &gpu_context,
1922 chunk_samples,
1923 n_features,
1924 n_informative,
1925 &coefficients,
1926 noise,
1927 randomseed.map(|s| s + chunk_idx as u64),
1928 )?;
1929
1930 all_data.extend(chunk_data);
1931 all_targets.extend(chunk_targets);
1932 }
1933
1934 let data = Array2::from_shape_vec((n_samples, n_features), all_data)
1936 .map_err(|e| DatasetsError::Other(format!("Failed to create data array: {e}")))?;
1937
1938 let target = Array1::from_vec(all_targets);
1939
1940 let mut dataset = Dataset::new(data, Some(target));
1942
1943 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
1945
1946 dataset = dataset
1947 .with_featurenames(featurenames)
1948 .with_description(format!(
1949 "GPU-accelerated synthetic regression dataset with {n_features} _features"
1950 ));
1951
1952 Ok(dataset)
1953}
1954
1955#[allow(dead_code)]
1957fn generate_regression_chunk_gpu(
1958 gpu_context: &GpuContext,
1959 n_samples: usize,
1960 n_features: usize,
1961 n_informative: usize,
1962 coefficients: &[f64],
1963 noise: f64,
1964 randomseed: Option<u64>,
1965) -> Result<(Vec<f64>, Vec<f64>)> {
1966 let _seed = randomseed.unwrap_or(42);
1967 let mut rng = StdRng::seed_from_u64(_seed);
1968
1969 let mut data = vec![0.0; n_samples * n_features];
1971 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
1972
1973 for i in 0..n_samples {
1975 for j in 0..n_features {
1976 data[i * n_features + j] = normal.sample(&mut rng);
1977 }
1978 }
1979
1980 let mut targets = vec![0.0; n_samples];
1982 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
1983
1984 if *gpu_context.backend() != LocalGpuBackend::Cpu {
1986 return generate_regression_gpu_optimized(
1987 gpu_context,
1988 &data,
1989 coefficients,
1990 n_samples,
1991 n_features,
1992 n_informative,
1993 noise,
1994 &mut rng,
1995 );
1996 }
1997
1998 for i in 0..n_samples {
2000 let mut target_val = 0.0;
2001 for j in 0..n_informative {
2002 target_val += data[i * n_features + j] * coefficients[j];
2003 }
2004
2005 target_val += noise_dist.sample(&mut rng);
2007 targets[i] = target_val;
2008 }
2009
2010 Ok((data, targets))
2011}
2012
2013#[allow(clippy::too_many_arguments)]
2015#[allow(dead_code)]
2016fn generate_regression_gpu_optimized(
2017 _gpu_context: &GpuContext,
2018 data: &[f64],
2019 coefficients: &[f64],
2020 n_samples: usize,
2021 n_features: usize,
2022 n_informative: usize,
2023 noise: f64,
2024 rng: &mut StdRng,
2025) -> Result<(Vec<f64>, Vec<f64>)> {
2026 use rand_distr::Distribution;
2031 let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
2032
2033 let mut targets = vec![0.0; n_samples];
2034
2035 for i in 0..n_samples {
2037 let mut target = 0.0;
2038 for j in 0..n_informative {
2039 target += data[i * n_features + j] * coefficients[j];
2040 }
2041
2042 if noise > 0.0 {
2044 target += noise * normal.sample(rng);
2045 }
2046
2047 targets[i] = target;
2048 }
2049
2050 let data_vec = data.to_vec();
2051
2052 Ok((data_vec, targets))
2055}
2056
2057#[allow(dead_code)]
2059pub fn make_blobs_gpu(
2060 n_samples: usize,
2061 n_features: usize,
2062 n_centers: usize,
2063 cluster_std: f64,
2064 randomseed: Option<u64>,
2065 gpuconfig: GpuConfig,
2066) -> Result<Dataset> {
2067 if gpuconfig.use_gpu && gpu_is_available() {
2069 make_blobs_gpu_impl(
2070 n_samples,
2071 n_features,
2072 n_centers,
2073 cluster_std,
2074 randomseed,
2075 gpuconfig,
2076 )
2077 } else {
2078 make_blobs(n_samples, n_features, n_centers, cluster_std, randomseed)
2080 }
2081}
2082
2083#[allow(dead_code)]
2085fn make_blobs_gpu_impl(
2086 n_samples: usize,
2087 n_features: usize,
2088 n_centers: usize,
2089 cluster_std: f64,
2090 randomseed: Option<u64>,
2091 gpuconfig: GpuConfig,
2092) -> Result<Dataset> {
2093 if n_samples == 0 || n_features == 0 || n_centers == 0 {
2095 return Err(DatasetsError::InvalidFormat(
2096 "n_samples, n_features, and n_centers must be > 0".to_string(),
2097 ));
2098 }
2099
2100 if cluster_std <= 0.0 {
2101 return Err(DatasetsError::InvalidFormat(
2102 "cluster_std must be > 0".to_string(),
2103 ));
2104 }
2105
2106 let gpu_context = GpuContext::new(crate::gpu::GpuConfig {
2108 backend: crate::gpu::GpuBackend::Cuda {
2109 device_id: gpuconfig.device_id as u32,
2110 },
2111 memory: crate::gpu::GpuMemoryConfig::default(),
2112 threads_per_block: 256,
2113 enable_double_precision: !gpuconfig.use_single_precision,
2114 use_fast_math: false,
2115 random_seed: None,
2116 })
2117 .map_err(|e| DatasetsError::Other(format!("Failed to create GPU context: {e}")))?;
2118
2119 let _seed = randomseed.unwrap_or(42);
2120 let mut rng = StdRng::seed_from_u64(_seed);
2121
2122 let mut centers = Array2::zeros((n_centers, n_features));
2124 let center_dist = rand_distr::Normal::new(0.0, 10.0).unwrap();
2125
2126 for i in 0..n_centers {
2127 for j in 0..n_features {
2128 centers[[i, j]] = center_dist.sample(&mut rng);
2129 }
2130 }
2131
2132 let samples_per_center = n_samples / n_centers;
2134 let remainder = n_samples % n_centers;
2135
2136 let mut data = Array2::zeros((n_samples, n_features));
2137 let mut target = Array1::zeros(n_samples);
2138
2139 let mut sample_idx = 0;
2140 let noise_dist = rand_distr::Normal::new(0.0, cluster_std).unwrap();
2141
2142 for center_idx in 0..n_centers {
2143 let n_samples_center = if center_idx < remainder {
2144 samples_per_center + 1
2145 } else {
2146 samples_per_center
2147 };
2148
2149 if *gpu_context.backend() != LocalGpuBackend::Cpu {
2151 let gpu_generated = generate_blobs_center_gpu(
2153 &gpu_context,
2154 ¢ers,
2155 center_idx,
2156 n_samples_center,
2157 n_features,
2158 cluster_std,
2159 &mut rng,
2160 )?;
2161
2162 for (local_idx, sample) in gpu_generated.iter().enumerate() {
2164 for j in 0..n_features {
2165 data[[sample_idx + local_idx, j]] = sample[j];
2166 }
2167 target[sample_idx + local_idx] = center_idx as f64;
2168 }
2169 sample_idx += n_samples_center;
2170 } else {
2171 for _ in 0..n_samples_center {
2173 for j in 0..n_features {
2174 data[[sample_idx, j]] = centers[[center_idx, j]] + noise_dist.sample(&mut rng);
2175 }
2176 target[sample_idx] = center_idx as f64;
2177 sample_idx += 1;
2178 }
2179 }
2180 }
2181
2182 let mut dataset = Dataset::new(data, Some(target));
2184
2185 let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
2187 let centernames: Vec<String> = (0..n_centers).map(|i| format!("center_{i}")).collect();
2188
2189 dataset = dataset
2190 .with_featurenames(featurenames)
2191 .with_targetnames(centernames)
2192 .with_description(format!(
2193 "GPU-accelerated synthetic blob dataset with {n_centers} _centers and {n_features} _features"
2194 ));
2195
2196 Ok(dataset)
2197}
2198
2199#[allow(dead_code)]
2201fn generate_blobs_center_gpu(
2202 _gpu_context: &GpuContext,
2203 centers: &Array2<f64>,
2204 center_idx: usize,
2205 n_samples_center: usize,
2206 n_features: usize,
2207 cluster_std: f64,
2208 rng: &mut StdRng,
2209) -> Result<Vec<Vec<f64>>> {
2210 let _center_coords: Vec<f64> = (0..n_features).map(|j| centers[[center_idx, j]]).collect();
2215
2216 use rand_distr::Distribution;
2218 let normal = rand_distr::Normal::new(0.0, cluster_std).unwrap();
2219
2220 let mut result = Vec::with_capacity(n_samples_center);
2221
2222 for _ in 0..n_samples_center {
2223 let mut sample = Vec::with_capacity(n_features);
2224 for j in 0..n_features {
2225 let center_val = centers[[center_idx, j]];
2226 let noise = normal.sample(rng);
2227 sample.push(center_val + noise);
2228 }
2229 result.push(sample);
2230 }
2231
2232 Ok(result)
2233
2234 }
2349
2350#[allow(dead_code)]
2352pub fn gpu_is_available() -> bool {
2353 GpuContext::new(crate::gpu::GpuConfig::default()).is_ok()
2355}
2356
2357#[allow(dead_code)]
2359pub fn get_gpu_info() -> Result<Vec<GpuDeviceInfo>> {
2360 crate::gpu::list_gpu_devices()
2361 .map_err(|e| DatasetsError::Other(format!("Failed to get GPU info: {e}")))
2362}
2363
2364#[allow(dead_code)]
2366pub fn benchmark_gpu_vs_cpu(
2367 n_samples: usize,
2368 n_features: usize,
2369 iterations: usize,
2370) -> Result<(f64, f64)> {
2371 use std::time::Instant;
2372
2373 let cpu_start = Instant::now();
2375 for _ in 0..iterations {
2376 let _result = make_classification(n_samples, n_features, 3, 2, n_features, Some(42))?;
2377 }
2378 let cpu_time = cpu_start.elapsed().as_secs_f64() / iterations as f64;
2379
2380 let gpuconfig = GpuConfig::default();
2382 let gpu_start = Instant::now();
2383 for _ in 0..iterations {
2384 let _result = make_classification_gpu(
2385 n_samples,
2386 n_features,
2387 3,
2388 2,
2389 n_features,
2390 Some(42),
2391 gpuconfig.clone(),
2392 )?;
2393 }
2394 let gpu_time = gpu_start.elapsed().as_secs_f64() / iterations as f64;
2395
2396 Ok((cpu_time, gpu_time))
2397}
2398
2399#[allow(dead_code)]
2403pub fn make_s_curve(n_samples: usize, noise: f64, randomseed: Option<u64>) -> Result<Dataset> {
2404 if n_samples == 0 {
2405 return Err(DatasetsError::InvalidFormat(
2406 "n_samples must be > 0".to_string(),
2407 ));
2408 }
2409
2410 if noise < 0.0 {
2411 return Err(DatasetsError::InvalidFormat(
2412 "noise must be >= 0".to_string(),
2413 ));
2414 }
2415
2416 let mut rng = match randomseed {
2417 Some(_seed) => StdRng::seed_from_u64(_seed),
2418 None => {
2419 let mut r = rng();
2420 StdRng::seed_from_u64(r.next_u64())
2421 }
2422 };
2423
2424 let mut data = Array2::zeros((n_samples, 3));
2425 let mut color = Array1::zeros(n_samples);
2426
2427 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2428
2429 for i in 0..n_samples {
2430 let t = 4.0 * PI * (i as f64) / (n_samples as f64 - 1.0);
2432
2433 data[[i, 0]] = t.sin() + noise_dist.sample(&mut rng);
2435 data[[i, 1]] = 2.0 * t + noise_dist.sample(&mut rng);
2436 data[[i, 2]] = (t / 2.0).sin() + noise_dist.sample(&mut rng);
2437
2438 color[i] = t;
2440 }
2441
2442 let mut dataset = Dataset::new(data, Some(color));
2443 dataset = dataset
2444 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2445 .with_description("S-curve manifold embedded in 3D space".to_string());
2446
2447 Ok(dataset)
2448}
2449
2450#[allow(dead_code)]
2452pub fn make_swiss_roll_advanced(
2453 n_samples: usize,
2454 noise: f64,
2455 hole: bool,
2456 randomseed: Option<u64>,
2457) -> Result<Dataset> {
2458 if n_samples == 0 {
2459 return Err(DatasetsError::InvalidFormat(
2460 "n_samples must be > 0".to_string(),
2461 ));
2462 }
2463
2464 if noise < 0.0 {
2465 return Err(DatasetsError::InvalidFormat(
2466 "noise must be >= 0".to_string(),
2467 ));
2468 }
2469
2470 let mut rng = match randomseed {
2471 Some(_seed) => StdRng::seed_from_u64(_seed),
2472 None => {
2473 let mut r = rng();
2474 StdRng::seed_from_u64(r.next_u64())
2475 }
2476 };
2477
2478 let mut data = Array2::zeros((n_samples, 3));
2479 let mut color = Array1::zeros(n_samples);
2480
2481 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2482 let uniform = rand_distr::Uniform::new(0.0, 1.0).unwrap();
2483
2484 for i in 0..n_samples {
2485 let mut t = uniform.sample(&mut rng) * 3.0 * PI / 2.0;
2487 let mut y = uniform.sample(&mut rng) * 20.0;
2488
2489 if hole {
2491 while t > PI / 2.0 && t < PI && y > 8.0 && y < 12.0 {
2493 t = uniform.sample(&mut rng) * 3.0 * PI / 2.0;
2494 y = uniform.sample(&mut rng) * 20.0;
2495 }
2496 }
2497
2498 data[[i, 0]] = t * t.cos() + noise_dist.sample(&mut rng);
2500 data[[i, 1]] = y + noise_dist.sample(&mut rng);
2501 data[[i, 2]] = t * t.sin() + noise_dist.sample(&mut rng);
2502
2503 color[i] = t;
2505 }
2506
2507 let mut dataset = Dataset::new(data, Some(color));
2508 let description = if hole {
2509 "Swiss roll manifold with hole embedded in 3D space"
2510 } else {
2511 "Swiss roll manifold embedded in 3D space"
2512 };
2513
2514 dataset = dataset
2515 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2516 .with_description(description.to_string());
2517
2518 Ok(dataset)
2519}
2520
2521#[allow(dead_code)]
2523pub fn make_severed_sphere(
2524 n_samples: usize,
2525 noise: f64,
2526 randomseed: Option<u64>,
2527) -> Result<Dataset> {
2528 if n_samples == 0 {
2529 return Err(DatasetsError::InvalidFormat(
2530 "n_samples must be > 0".to_string(),
2531 ));
2532 }
2533
2534 if noise < 0.0 {
2535 return Err(DatasetsError::InvalidFormat(
2536 "noise must be >= 0".to_string(),
2537 ));
2538 }
2539
2540 let mut rng = match randomseed {
2541 Some(_seed) => StdRng::seed_from_u64(_seed),
2542 None => {
2543 let mut r = rng();
2544 StdRng::seed_from_u64(r.next_u64())
2545 }
2546 };
2547
2548 let mut data = Array2::zeros((n_samples, 3));
2549 let mut color = Array1::zeros(n_samples);
2550
2551 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2552 let uniform = rand_distr::Uniform::new(0.0, 1.0).unwrap();
2553
2554 for i in 0..n_samples {
2555 let mut phi = uniform.sample(&mut rng) * 2.0 * PI; let mut theta = uniform.sample(&mut rng) * PI; while phi > PI / 3.0 && phi < 2.0 * PI / 3.0 && theta > PI / 3.0 && theta < 2.0 * PI / 3.0 {
2561 phi = uniform.sample(&mut rng) * 2.0 * PI;
2562 theta = uniform.sample(&mut rng) * PI;
2563 }
2564
2565 let radius = 1.0; data[[i, 0]] = radius * theta.sin() * phi.cos() + noise_dist.sample(&mut rng);
2569 data[[i, 1]] = radius * theta.sin() * phi.sin() + noise_dist.sample(&mut rng);
2570 data[[i, 2]] = radius * theta.cos() + noise_dist.sample(&mut rng);
2571
2572 color[i] = phi;
2574 }
2575
2576 let mut dataset = Dataset::new(data, Some(color));
2577 dataset = dataset
2578 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2579 .with_description("Severed sphere manifold with discontinuities".to_string());
2580
2581 Ok(dataset)
2582}
2583
2584#[allow(dead_code)]
2586pub fn make_twin_peaks(n_samples: usize, noise: f64, randomseed: Option<u64>) -> Result<Dataset> {
2587 if n_samples == 0 {
2588 return Err(DatasetsError::InvalidFormat(
2589 "n_samples must be > 0".to_string(),
2590 ));
2591 }
2592
2593 if noise < 0.0 {
2594 return Err(DatasetsError::InvalidFormat(
2595 "noise must be >= 0".to_string(),
2596 ));
2597 }
2598
2599 let mut rng = match randomseed {
2600 Some(_seed) => StdRng::seed_from_u64(_seed),
2601 None => {
2602 let mut r = rng();
2603 StdRng::seed_from_u64(r.next_u64())
2604 }
2605 };
2606
2607 let mut data = Array2::zeros((n_samples, 3));
2608 let mut labels = Array1::zeros(n_samples);
2609
2610 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2611 let uniform = rand_distr::Uniform::new(-2.0, 2.0).unwrap();
2612
2613 for i in 0..n_samples {
2614 let x = uniform.sample(&mut rng);
2615 let y = uniform.sample(&mut rng);
2616
2617 let peak1 = (-(((x as f64) - 1.0).powi(2) + ((y as f64) - 1.0).powi(2))).exp();
2619 let peak2 = (-(((x as f64) + 1.0).powi(2) + ((y as f64) + 1.0).powi(2))).exp();
2620 let z = peak1 + peak2 + noise_dist.sample(&mut rng);
2621
2622 data[[i, 0]] = x;
2623 data[[i, 1]] = y;
2624 data[[i, 2]] = z;
2625
2626 labels[i] = if ((x as f64) - 1.0).powi(2) + ((y as f64) - 1.0).powi(2)
2628 < ((x as f64) + 1.0).powi(2) + ((y as f64) + 1.0).powi(2)
2629 {
2630 0.0
2631 } else {
2632 1.0
2633 };
2634 }
2635
2636 let mut dataset = Dataset::new(data, Some(labels));
2637 dataset = dataset
2638 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2639 .with_targetnames(vec!["peak_0".to_string(), "peak_1".to_string()])
2640 .with_description("Twin peaks manifold with two connected Gaussian peaks".to_string());
2641
2642 Ok(dataset)
2643}
2644
2645#[allow(dead_code)]
2647pub fn make_helix(
2648 n_samples: usize,
2649 n_turns: f64,
2650 noise: f64,
2651 randomseed: Option<u64>,
2652) -> Result<Dataset> {
2653 if n_samples == 0 {
2654 return Err(DatasetsError::InvalidFormat(
2655 "n_samples must be > 0".to_string(),
2656 ));
2657 }
2658
2659 if n_turns <= 0.0 {
2660 return Err(DatasetsError::InvalidFormat(
2661 "n_turns must be > 0".to_string(),
2662 ));
2663 }
2664
2665 if noise < 0.0 {
2666 return Err(DatasetsError::InvalidFormat(
2667 "noise must be >= 0".to_string(),
2668 ));
2669 }
2670
2671 let mut rng = match randomseed {
2672 Some(_seed) => StdRng::seed_from_u64(_seed),
2673 None => {
2674 let mut r = rng();
2675 StdRng::seed_from_u64(r.next_u64())
2676 }
2677 };
2678
2679 let mut data = Array2::zeros((n_samples, 3));
2680 let mut color = Array1::zeros(n_samples);
2681
2682 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2683
2684 for i in 0..n_samples {
2685 let t = n_turns * 2.0 * PI * (i as f64) / (n_samples as f64 - 1.0);
2687
2688 data[[i, 0]] = t.cos() + noise_dist.sample(&mut rng);
2690 data[[i, 1]] = t.sin() + noise_dist.sample(&mut rng);
2691 data[[i, 2]] = t / (n_turns * 2.0 * PI) + noise_dist.sample(&mut rng); color[i] = t;
2695 }
2696
2697 let mut dataset = Dataset::new(data, Some(color));
2698 dataset = dataset
2699 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2700 .with_description(format!("Helix manifold with {n_turns} _turns in 3D space"));
2701
2702 Ok(dataset)
2703}
2704
2705#[allow(dead_code)]
2707pub fn make_intersecting_manifolds(
2708 n_samples: usize,
2709 noise: f64,
2710 randomseed: Option<u64>,
2711) -> Result<Dataset> {
2712 if n_samples == 0 {
2713 return Err(DatasetsError::InvalidFormat(
2714 "n_samples must be > 0".to_string(),
2715 ));
2716 }
2717
2718 if noise < 0.0 {
2719 return Err(DatasetsError::InvalidFormat(
2720 "noise must be >= 0".to_string(),
2721 ));
2722 }
2723
2724 let mut rng = match randomseed {
2725 Some(_seed) => StdRng::seed_from_u64(_seed),
2726 None => {
2727 let mut r = rng();
2728 StdRng::seed_from_u64(r.next_u64())
2729 }
2730 };
2731
2732 let samples_per_manifold = n_samples / 2;
2733 let remainder = n_samples % 2;
2734
2735 let mut data = Array2::zeros((n_samples, 3));
2736 let mut labels = Array1::zeros(n_samples);
2737
2738 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2739 let uniform = rand_distr::Uniform::new(-2.0, 2.0).unwrap();
2740
2741 let mut sample_idx = 0;
2742
2743 for _ in 0..samples_per_manifold + remainder {
2745 let x = uniform.sample(&mut rng);
2746 let y = uniform.sample(&mut rng);
2747 let z = x + noise_dist.sample(&mut rng);
2748
2749 data[[sample_idx, 0]] = x;
2750 data[[sample_idx, 1]] = y;
2751 data[[sample_idx, 2]] = z;
2752 labels[sample_idx] = 0.0;
2753 sample_idx += 1;
2754 }
2755
2756 for _ in 0..samples_per_manifold {
2758 let x = uniform.sample(&mut rng);
2759 let y = uniform.sample(&mut rng);
2760 let z = -x + noise_dist.sample(&mut rng);
2761
2762 data[[sample_idx, 0]] = x;
2763 data[[sample_idx, 1]] = y;
2764 data[[sample_idx, 2]] = z;
2765 labels[sample_idx] = 1.0;
2766 sample_idx += 1;
2767 }
2768
2769 let mut dataset = Dataset::new(data, Some(labels));
2770 dataset = dataset
2771 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2772 .with_targetnames(vec!["manifold_0".to_string(), "manifold_1".to_string()])
2773 .with_description("Two intersecting plane manifolds in 3D space".to_string());
2774
2775 Ok(dataset)
2776}
2777
2778#[allow(dead_code)]
2780pub fn make_torus(
2781 n_samples: usize,
2782 major_radius: f64,
2783 minor_radius: f64,
2784 noise: f64,
2785 randomseed: Option<u64>,
2786) -> Result<Dataset> {
2787 if n_samples == 0 {
2788 return Err(DatasetsError::InvalidFormat(
2789 "n_samples must be > 0".to_string(),
2790 ));
2791 }
2792
2793 if major_radius <= 0.0 || minor_radius <= 0.0 {
2794 return Err(DatasetsError::InvalidFormat(
2795 "major_radius and minor_radius must be > 0".to_string(),
2796 ));
2797 }
2798
2799 if minor_radius >= major_radius {
2800 return Err(DatasetsError::InvalidFormat(
2801 "minor_radius must be < major_radius".to_string(),
2802 ));
2803 }
2804
2805 if noise < 0.0 {
2806 return Err(DatasetsError::InvalidFormat(
2807 "noise must be >= 0".to_string(),
2808 ));
2809 }
2810
2811 let mut rng = match randomseed {
2812 Some(_seed) => StdRng::seed_from_u64(_seed),
2813 None => {
2814 let mut r = rng();
2815 StdRng::seed_from_u64(r.next_u64())
2816 }
2817 };
2818
2819 let mut data = Array2::zeros((n_samples, 3));
2820 let mut color = Array1::zeros(n_samples);
2821
2822 let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
2823 let uniform = rand_distr::Uniform::new(0.0, 2.0 * PI).unwrap();
2824
2825 for i in 0..n_samples {
2826 let theta = uniform.sample(&mut rng); let phi = uniform.sample(&mut rng); data[[i, 0]] =
2831 (major_radius + minor_radius * phi.cos()) * theta.cos() + noise_dist.sample(&mut rng);
2832 data[[i, 1]] =
2833 (major_radius + minor_radius * phi.cos()) * theta.sin() + noise_dist.sample(&mut rng);
2834 data[[i, 2]] = minor_radius * phi.sin() + noise_dist.sample(&mut rng);
2835
2836 color[i] = theta;
2838 }
2839
2840 let mut dataset = Dataset::new(data, Some(color));
2841 dataset = dataset
2842 .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
2843 .with_description(format!(
2844 "Torus manifold with major _radius {major_radius} and minor _radius {minor_radius}"
2845 ));
2846
2847 Ok(dataset)
2848}
2849
/// Configuration for generating a synthetic manifold dataset via `make_manifold`.
#[derive(Debug, Clone)]
pub struct ManifoldConfig {
    /// Which manifold family to generate (carries shape-specific parameters).
    pub manifold_type: ManifoldType,
    /// Number of samples to draw.
    pub n_samples: usize,
    /// Standard deviation of the additive Gaussian noise.
    pub noise: f64,
    /// Optional RNG seed for reproducible generation.
    pub randomseed: Option<u64>,
    /// Extra named parameters set via `with_parameter`; note that
    /// `make_manifold` itself does not read this map.
    pub parameters: std::collections::HashMap<String, f64>,
}
2864
/// The kind of synthetic manifold to generate; variants carry their
/// shape-specific parameters. Dispatched by `make_manifold`.
#[derive(Debug, Clone)]
pub enum ManifoldType {
    /// S-shaped curve (generated by `make_s_curve`).
    SCurve,
    /// Swiss-roll surface (generated by `make_swiss_roll_advanced`).
    SwissRoll {
        /// Whether to cut a hole out of the roll.
        hole: bool,
    },
    /// Sphere with a section removed (generated by `make_severed_sphere`).
    SeveredSphere,
    /// Twin-peaks surface (generated by `make_twin_peaks`).
    TwinPeaks,
    /// 3D helix (generated by `make_helix`).
    Helix {
        /// Number of full turns of the helix.
        n_turns: f64,
    },
    /// Two intersecting planes (generated by `make_intersecting_manifolds`).
    IntersectingManifolds,
    /// Torus surface (generated by `make_torus`).
    Torus {
        /// Distance from the torus center to the center of the tube.
        major_radius: f64,
        /// Radius of the tube; must be smaller than `major_radius`.
        minor_radius: f64,
    },
}
2894
2895impl Default for ManifoldConfig {
2896 fn default() -> Self {
2897 Self {
2898 manifold_type: ManifoldType::SCurve,
2899 n_samples: 1000,
2900 noise: 0.1,
2901 randomseed: None,
2902 parameters: std::collections::HashMap::new(),
2903 }
2904 }
2905}
2906
2907impl ManifoldConfig {
2908 pub fn new(manifold_type: ManifoldType) -> Self {
2910 Self {
2911 manifold_type,
2912 ..Default::default()
2913 }
2914 }
2915
2916 pub fn with_samples(mut self, n_samples: usize) -> Self {
2918 self.n_samples = n_samples;
2919 self
2920 }
2921
2922 pub fn with_noise(mut self, noise: f64) -> Self {
2924 self.noise = noise;
2925 self
2926 }
2927
2928 pub fn with_seed(mut self, seed: u64) -> Self {
2930 self.randomseed = Some(seed);
2931 self
2932 }
2933
2934 pub fn with_parameter(mut self, name: String, value: f64) -> Self {
2936 self.parameters.insert(name, value);
2937 self
2938 }
2939}
2940
2941#[allow(dead_code)]
2943pub fn make_manifold(config: ManifoldConfig) -> Result<Dataset> {
2944 match config.manifold_type {
2945 ManifoldType::SCurve => make_s_curve(config.n_samples, config.noise, config.randomseed),
2946 ManifoldType::SwissRoll { hole } => {
2947 make_swiss_roll_advanced(config.n_samples, config.noise, hole, config.randomseed)
2948 }
2949 ManifoldType::SeveredSphere => {
2950 make_severed_sphere(config.n_samples, config.noise, config.randomseed)
2951 }
2952 ManifoldType::TwinPeaks => {
2953 make_twin_peaks(config.n_samples, config.noise, config.randomseed)
2954 }
2955 ManifoldType::Helix { n_turns } => {
2956 make_helix(config.n_samples, n_turns, config.noise, config.randomseed)
2957 }
2958 ManifoldType::IntersectingManifolds => {
2959 make_intersecting_manifolds(config.n_samples, config.noise, config.randomseed)
2960 }
2961 ManifoldType::Torus {
2962 major_radius,
2963 minor_radius,
2964 } => make_torus(
2965 config.n_samples,
2966 major_radius,
2967 minor_radius,
2968 config.noise,
2969 config.randomseed,
2970 ),
2971 }
2972}
2973
#[cfg(test)]
mod tests {
    // `use super::*` already brings every generator (and the re-exported
    // rand types) into scope; the previous explicit `use rand_distr::Uniform`
    // was unused here and only produced an `unused_imports` warning.
    use super::*;

    // ---- Parameter validation for the basic generators ----

    #[test]
    fn test_make_classification_invalid_params() {
        // Zero samples.
        assert!(make_classification(0, 5, 2, 1, 3, None).is_err());
        // Zero features.
        assert!(make_classification(10, 0, 2, 1, 3, None).is_err());
        // Zero informative features.
        assert!(make_classification(10, 5, 2, 1, 0, None).is_err());
        // More informative features than total features.
        assert!(make_classification(10, 3, 2, 1, 5, None).is_err());
        // Fewer than two classes.
        assert!(make_classification(10, 5, 1, 1, 3, None).is_err());
        // Zero clusters per class.
        assert!(make_classification(10, 5, 2, 0, 3, None).is_err());
    }

    #[test]
    fn test_make_regression_invalid_params() {
        assert!(make_regression(0, 5, 3, 1.0, None).is_err());
        assert!(make_regression(10, 0, 3, 1.0, None).is_err());
        assert!(make_regression(10, 5, 0, 1.0, None).is_err());
        assert!(make_regression(10, 3, 5, 1.0, None).is_err());
        // Negative noise is rejected.
        assert!(make_regression(10, 5, 3, -1.0, None).is_err());
    }

    #[test]
    fn test_make_time_series_invalid_params() {
        assert!(make_time_series(0, 3, false, false, 1.0, None).is_err());
        assert!(make_time_series(10, 0, false, false, 1.0, None).is_err());
        assert!(make_time_series(10, 3, false, false, -1.0, None).is_err());
    }

    #[test]
    fn test_make_blobs_invalid_params() {
        assert!(make_blobs(0, 3, 2, 1.0, None).is_err());
        assert!(make_blobs(10, 0, 2, 1.0, None).is_err());
        assert!(make_blobs(10, 3, 0, 1.0, None).is_err());
        // Cluster standard deviation must be strictly positive.
        assert!(make_blobs(10, 3, 2, 0.0, None).is_err());
        assert!(make_blobs(10, 3, 2, -1.0, None).is_err());
    }

    // ---- Happy-path shape checks for the basic generators ----

    #[test]
    fn test_make_classification_valid_params() {
        let dataset = make_classification(20, 5, 3, 2, 4, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 20);
        assert_eq!(dataset.n_features(), 5);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());
        assert!(dataset.targetnames.is_some());
    }

    #[test]
    fn test_make_regression_valid_params() {
        let dataset = make_regression(15, 4, 3, 0.5, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 15);
        assert_eq!(dataset.n_features(), 4);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());
    }

    #[test]
    fn test_make_time_series_valid_params() {
        let dataset = make_time_series(25, 3, true, true, 0.1, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 25);
        assert_eq!(dataset.n_features(), 3);
        assert!(dataset.featurenames.is_some());
        // Time-series generation produces no target column.
        assert!(dataset.target.is_none());
    }

    #[test]
    fn test_make_blobs_valid_params() {
        let dataset = make_blobs(30, 4, 3, 1.5, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 30);
        assert_eq!(dataset.n_features(), 4);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());
    }

    // ---- Nonlinear 2D/3D pattern generators ----

    #[test]
    fn test_make_spirals_invalid_params() {
        assert!(make_spirals(0, 2, 0.1, None).is_err());
        assert!(make_spirals(100, 0, 0.1, None).is_err());
        assert!(make_spirals(100, 2, -0.1, None).is_err());
    }

    #[test]
    fn test_make_spirals_valid_params() {
        let dataset = make_spirals(100, 2, 0.1, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 2);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        // Both spiral classes must be represented in the labels.
        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2);
        }
    }

    #[test]
    fn test_make_moons_invalid_params() {
        assert!(make_moons(0, 0.1, None).is_err());
        assert!(make_moons(100, -0.1, None).is_err());
    }

    #[test]
    fn test_make_moons_valid_params() {
        let dataset = make_moons(100, 0.1, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 2);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        // Both moons must be represented in the labels.
        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2);
        }
    }

    #[test]
    fn test_make_circles_invalid_params() {
        assert!(make_circles(0, 0.5, 0.1, None).is_err());
        // The radius factor must lie strictly between 0 and 1.
        assert!(make_circles(100, 0.0, 0.1, None).is_err());
        assert!(make_circles(100, 1.0, 0.1, None).is_err());
        assert!(make_circles(100, 1.5, 0.1, None).is_err());
        assert!(make_circles(100, 0.5, -0.1, None).is_err());
    }

    #[test]
    fn test_make_circles_valid_params() {
        let dataset = make_circles(100, 0.5, 0.1, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 2);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        // Inner and outer circles must both appear in the labels.
        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2);
        }
    }

    #[test]
    fn test_make_swiss_roll_invalid_params() {
        assert!(make_swiss_roll(0, 0.1, None).is_err());
        assert!(make_swiss_roll(100, -0.1, None).is_err());
    }

    #[test]
    fn test_make_swiss_roll_valid_params() {
        let dataset = make_swiss_roll(100, 0.1, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 3);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());
    }

    #[test]
    fn test_make_anisotropic_blobs_invalid_params() {
        assert!(make_anisotropic_blobs(0, 3, 2, 1.0, 2.0, None).is_err());
        assert!(make_anisotropic_blobs(100, 1, 2, 1.0, 2.0, None).is_err());
        assert!(make_anisotropic_blobs(100, 3, 0, 1.0, 2.0, None).is_err());
        assert!(make_anisotropic_blobs(100, 3, 2, 0.0, 2.0, None).is_err());
        assert!(make_anisotropic_blobs(100, 3, 2, 1.0, 0.0, None).is_err());
    }

    #[test]
    fn test_make_anisotropic_blobs_valid_params() {
        let dataset = make_anisotropic_blobs(100, 3, 2, 1.0, 3.0, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 3);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2);
        }
    }

    #[test]
    fn test_make_hierarchical_clusters_invalid_params() {
        assert!(make_hierarchical_clusters(0, 3, 2, 3, 1.0, 0.5, None).is_err());
        assert!(make_hierarchical_clusters(100, 0, 2, 3, 1.0, 0.5, None).is_err());
        assert!(make_hierarchical_clusters(100, 3, 0, 3, 1.0, 0.5, None).is_err());
        assert!(make_hierarchical_clusters(100, 3, 2, 0, 1.0, 0.5, None).is_err());
        assert!(make_hierarchical_clusters(100, 3, 2, 3, 0.0, 0.5, None).is_err());
        assert!(make_hierarchical_clusters(100, 3, 2, 3, 1.0, 0.0, None).is_err());
    }

    #[test]
    fn test_make_hierarchical_clusters_valid_params() {
        let dataset = make_hierarchical_clusters(120, 3, 2, 3, 2.0, 0.5, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 120);
        assert_eq!(dataset.n_features(), 3);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        // Targets carry the top-level cluster labels only.
        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2);
        }

        // The fine-grained labels are stashed in the metadata map.
        assert!(dataset.metadata.contains_key("sub_cluster_labels"));
    }

    // ---- Data-corruption utilities ----

    #[test]
    fn test_inject_missing_data_invalid_params() {
        let mut data = Array2::from_shape_vec((5, 3), vec![1.0; 15]).unwrap();

        // The missing rate must lie inside [0, 1].
        assert!(inject_missing_data(&mut data, -0.1, MissingPattern::MCAR, None).is_err());
        assert!(inject_missing_data(&mut data, 1.5, MissingPattern::MCAR, None).is_err());
    }

    #[test]
    fn test_inject_missing_data_mcar() {
        let mut data =
            Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let original_data = data.clone();

        let missing_mask =
            inject_missing_data(&mut data, 0.3, MissingPattern::MCAR, Some(42)).unwrap();

        let missing_count = missing_mask.iter().filter(|&&x| x).count();
        assert!(missing_count > 0);

        // Masked cells become NaN; every other cell is untouched.
        for ((i, j), &is_missing) in missing_mask.indexed_iter() {
            if is_missing {
                assert!(data[[i, j]].is_nan());
            } else {
                assert_eq!(data[[i, j]], original_data[[i, j]]);
            }
        }
    }

    #[test]
    fn test_inject_outliers_invalid_params() {
        let mut data = Array2::from_shape_vec((5, 3), vec![1.0; 15]).unwrap();

        // The outlier rate must lie inside [0, 1].
        assert!(inject_outliers(&mut data, -0.1, OutlierType::Point, 2.0, None).is_err());
        assert!(inject_outliers(&mut data, 1.5, OutlierType::Point, 2.0, None).is_err());
        // The outlier strength must be strictly positive.
        assert!(inject_outliers(&mut data, 0.1, OutlierType::Point, 0.0, None).is_err());
        assert!(inject_outliers(&mut data, 0.1, OutlierType::Point, -1.0, None).is_err());
    }

    #[test]
    fn test_inject_outliers_point() {
        let mut data = Array2::from_shape_vec((20, 2), vec![1.0; 40]).unwrap();

        let outlier_mask =
            inject_outliers(&mut data, 0.2, OutlierType::Point, 3.0, Some(42)).unwrap();

        let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
        assert!(outlier_count > 0);

        // Every flagged row must have been pushed away from the constant 1.0.
        for (i, &is_outlier) in outlier_mask.iter().enumerate() {
            if is_outlier {
                let row = data.row(i);
                assert!(row.iter().any(|&x| (x - 1.0).abs() > 1.0));
            }
        }
    }

    #[test]
    fn test_add_time_series_noise() {
        let mut data = Array2::zeros((100, 2));

        let noise_types = [("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)];

        let original_data = data.clone();
        add_time_series_noise(&mut data, &noise_types, Some(42)).unwrap();

        // At least one element must have changed.
        assert!(!data
            .iter()
            .zip(original_data.iter())
            .all(|(&a, &b)| (a - b).abs() < 1e-10));

        // Unknown noise names are rejected.
        let invalid_noise = [("invalid_type", 0.1)];
        let mut test_data = Array2::zeros((10, 2));
        assert!(add_time_series_noise(&mut test_data, &invalid_noise, Some(42)).is_err());
    }

    #[test]
    fn test_make_corrupted_dataset() {
        let base_dataset = make_blobs(50, 3, 2, 1.0, Some(42)).unwrap();

        let corrupted = make_corrupted_dataset(
            &base_dataset,
            0.1,
            MissingPattern::MCAR,
            0.05,
            OutlierType::Point,
            2.0,
            Some(42),
        )
        .unwrap();

        // Corruption preserves the dataset's shape.
        assert_eq!(corrupted.n_samples(), base_dataset.n_samples());
        assert_eq!(corrupted.n_features(), base_dataset.n_features());

        // The corruption parameters are recorded in the metadata.
        assert!(corrupted.metadata.contains_key("missing_rate"));
        assert!(corrupted.metadata.contains_key("outlier_rate"));
        assert!(corrupted.metadata.contains_key("missing_count"));
        assert!(corrupted.metadata.contains_key("outlier_count"));

        let has_nan = corrupted.data.iter().any(|&x| x.is_nan());
        assert!(has_nan, "Dataset should have some missing values");
    }

    #[test]
    fn test_make_corrupted_dataset_invalid_params() {
        let base_dataset = make_blobs(20, 2, 2, 1.0, Some(42)).unwrap();

        // Missing rate outside [0, 1].
        assert!(make_corrupted_dataset(
            &base_dataset,
            -0.1,
            MissingPattern::MCAR,
            0.0,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());
        assert!(make_corrupted_dataset(
            &base_dataset,
            1.5,
            MissingPattern::MCAR,
            0.0,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());

        // Outlier rate outside [0, 1].
        assert!(make_corrupted_dataset(
            &base_dataset,
            0.0,
            MissingPattern::MCAR,
            -0.1,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());
        assert!(make_corrupted_dataset(
            &base_dataset,
            0.0,
            MissingPattern::MCAR,
            1.5,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());
    }

    #[test]
    fn test_missing_patterns() {
        let data = Array2::from_shape_vec((20, 4), (0..80).map(|x| x as f64).collect()).unwrap();

        // Every supported pattern should remove at least one value.
        for pattern in [
            MissingPattern::MCAR,
            MissingPattern::MAR,
            MissingPattern::MNAR,
            MissingPattern::Block,
        ] {
            let mut test_data = data.clone();
            let missing_mask = inject_missing_data(&mut test_data, 0.2, pattern, Some(42)).unwrap();

            let missing_count = missing_mask.iter().filter(|&&x| x).count();
            assert!(
                missing_count > 0,
                "Pattern {pattern:?} should create some missing values"
            );
        }
    }

    #[test]
    fn test_outlier_types() {
        let data = Array2::ones((30, 3));

        // Every supported outlier type should flag at least one row.
        for outlier_type in [
            OutlierType::Point,
            OutlierType::Contextual,
            OutlierType::Collective,
        ] {
            let mut test_data = data.clone();
            let outlier_mask =
                inject_outliers(&mut test_data, 0.2, outlier_type, 3.0, Some(42)).unwrap();

            let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
            assert!(
                outlier_count > 0,
                "Outlier type {outlier_type:?} should create some outliers"
            );
        }
    }
}