scirs2_datasets/generators.rs

//! Synthetic dataset generators: classification, regression, time series,
//! clustering, and manifold data, plus noise/corruption injection utilities.

use crate::error::{DatasetsError, Result};
use crate::gpu::{GpuContext, GpuDeviceInfo};
use crate::utils::Dataset;
use ndarray::{Array1, Array2};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand_distr::{Distribution, Uniform};
// Use local GPU implementation instead of core to avoid feature flag issues
use crate::gpu::GpuBackend as LocalGpuBackend;
// Parallel operations will be added as needed
// #[cfg(feature = "parallel")]
// use scirs2_core::parallel_ops::*;
use rand::seq::SliceRandom;
use scirs2_core::rng;
use std::f64::consts::PI;

/// Generate a random classification dataset with clusters
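///
/// # Example
///
/// A minimal usage sketch (the `scirs2_datasets::generators` path assumes this
/// module is exported under that name):
///
/// ```ignore
/// use scirs2_datasets::generators::make_classification;
///
/// // 100 samples, 5 features (4 informative), 3 classes, 1 cluster per class,
/// // with a fixed seed for reproducibility.
/// let dataset = make_classification(100, 5, 3, 1, 4, Some(42)).unwrap();
/// assert_eq!(dataset.data.nrows(), 100);
/// assert_eq!(dataset.data.ncols(), 5);
/// ```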
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn make_classification(
    n_samples: usize,
    n_features: usize,
    n_classes: usize,
    n_clusters_per_class: usize,
    n_informative: usize,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_features must be > 0".to_string(),
        ));
    }

    if n_informative == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_informative must be > 0".to_string(),
        ));
    }

    if n_features < n_informative {
        return Err(DatasetsError::InvalidFormat(format!(
            "n_features ({n_features}) must be >= n_informative ({n_informative})"
        )));
    }

    if n_classes < 2 {
        return Err(DatasetsError::InvalidFormat(
            "n_classes must be >= 2".to_string(),
        ));
    }

    if n_clusters_per_class == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_clusters_per_class must be > 0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    // Generate centroids for each class and cluster
    let n_centroids = n_classes * n_clusters_per_class;
    let mut centroids = Array2::zeros((n_centroids, n_informative));
    let scale = 2.0;

    for i in 0..n_centroids {
        for j in 0..n_informative {
            centroids[[i, j]] = scale * rng.gen_range(-1.0f64..1.0f64);
        }
    }

    // Generate samples
    let mut data = Array2::zeros((n_samples, n_features));
    let mut target = Array1::zeros(n_samples);

    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();

    // Samples per class
    let samples_per_class = n_samples / n_classes;
    let remainder = n_samples % n_classes;

    let mut sample_idx = 0;

    for class_idx in 0..n_classes {
        let n_samples_class = if class_idx < remainder {
            samples_per_class + 1
        } else {
            samples_per_class
        };

        // Assign clusters within this class
        let samples_per_cluster = n_samples_class / n_clusters_per_class;
        let cluster_remainder = n_samples_class % n_clusters_per_class;

        for cluster in 0..n_clusters_per_class {
            let n_samples_cluster = if cluster < cluster_remainder {
                samples_per_cluster + 1
            } else {
                samples_per_cluster
            };

            let centroid_idx = class_idx * n_clusters_per_class + cluster;

            for _ in 0..n_samples_cluster {
                // Randomly select a point near the cluster centroid
                for j in 0..n_informative {
                    data[[sample_idx, j]] =
                        centroids[[centroid_idx, j]] + 0.3 * normal.sample(&mut rng);
                }

                // Add noise features
                for j in n_informative..n_features {
                    data[[sample_idx, j]] = normal.sample(&mut rng);
                }

                target[sample_idx] = class_idx as f64;
                sample_idx += 1;
            }
        }
    }

    // Create dataset
    let mut dataset = Dataset::new(data, Some(target));

    // Create feature names
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    // Create class names
    let classnames: Vec<String> = (0..n_classes).map(|i| format!("class_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_targetnames(classnames)
        .with_description(format!(
            "Synthetic classification dataset with {n_classes} classes and {n_features} features"
        ));

    Ok(dataset)
}

/// Generate a random regression dataset
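///
/// # Example
///
/// A minimal sketch (module path assumed as in the other examples):
///
/// ```ignore
/// use scirs2_datasets::generators::make_regression;
///
/// // 200 samples, 10 features of which 3 carry signal, Gaussian noise with std 0.5.
/// let dataset = make_regression(200, 10, 3, 0.5, Some(7)).unwrap();
/// assert_eq!(dataset.data.dim(), (200, 10));
/// ```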
#[allow(dead_code)]
pub fn make_regression(
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    noise: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_features must be > 0".to_string(),
        ));
    }

    if n_informative == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_informative must be > 0".to_string(),
        ));
    }

    if n_features < n_informative {
        return Err(DatasetsError::InvalidFormat(format!(
            "n_features ({n_features}) must be >= n_informative ({n_informative})"
        )));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    // Generate the coefficients for the informative features
    let mut coef = Array1::zeros(n_features);
    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();

    for i in 0..n_informative {
        coef[i] = 100.0 * normal.sample(&mut rng);
    }

    // Generate the features
    let mut data = Array2::zeros((n_samples, n_features));

    for i in 0..n_samples {
        for j in 0..n_features {
            data[[i, j]] = normal.sample(&mut rng);
        }
    }

    // Generate the target
    let mut target = Array1::zeros(n_samples);

    for i in 0..n_samples {
        let mut y = 0.0;
        for j in 0..n_features {
            y += data[[i, j]] * coef[j];
        }

        // Add noise
        if noise > 0.0 {
            y += normal.sample(&mut rng) * noise;
        }

        target[i] = y;
    }

    // Create dataset
    let mut dataset = Dataset::new(data, Some(target));

    // Create feature names
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_description(format!(
            "Synthetic regression dataset with {n_features} features ({n_informative} informative)"
        ))
        .with_metadata("noise", &noise.to_string())
        .with_metadata("coefficients", &format!("{coef:?}"));

    Ok(dataset)
}

/// Generate a random time series dataset
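///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_time_series;
///
/// // 500 steps of 3 series with trend and seasonality, noise std 0.1.
/// let dataset = make_time_series(500, 3, true, true, 0.1, Some(0)).unwrap();
/// assert!(dataset.target.is_none()); // time series are unlabeled
/// ```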
#[allow(dead_code)]
pub fn make_time_series(
    n_samples: usize,
    n_features: usize,
    trend: bool,
    seasonality: bool,
    noise: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_features must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
    let mut data = Array2::zeros((n_samples, n_features));

    for feature in 0..n_features {
        let trend_coef = if trend {
            rng.gen_range(0.01f64..0.1f64)
        } else {
            0.0
        };
        let seasonality_period = rng.sample(Uniform::new(10, 50).unwrap()) as f64;
        let seasonality_amplitude = if seasonality {
            rng.gen_range(1.0f64..5.0f64)
        } else {
            0.0
        };

        let base_value = rng.gen_range(-10.0f64..10.0f64);

        for i in 0..n_samples {
            let t = i as f64;

            // Add base value
            let mut value = base_value;

            // Add trend
            if trend {
                value += trend_coef * t;
            }

            // Add seasonality
            if seasonality {
                value += seasonality_amplitude * (2.0 * PI * t / seasonality_period).sin();
            }

            // Add noise
            if noise > 0.0 {
                value += normal.sample(&mut rng) * noise;
            }

            data[[i, feature]] = value;
        }
    }

    // Create time index (unused for now but can be useful for plotting)
    let time_index: Vec<f64> = (0..n_samples).map(|i| i as f64).collect();
    let _time_array = Array1::from(time_index);

    // Create dataset
    let mut dataset = Dataset::new(data, None);

    // Create feature names
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_description(format!(
            "Synthetic time series dataset with {n_features} features"
        ))
        .with_metadata("trend", &trend.to_string())
        .with_metadata("seasonality", &seasonality.to_string())
        .with_metadata("noise", &noise.to_string());

    Ok(dataset)
}

/// Generate a random blobs dataset for clustering
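///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_blobs;
///
/// // 300 points in 2D around 4 centers with std 1.5.
/// let dataset = make_blobs(300, 2, 4, 1.5, Some(42)).unwrap();
/// let labels = dataset.target.as_ref().unwrap();
/// assert_eq!(labels.len(), 300);
/// ```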
#[allow(dead_code)]
pub fn make_blobs(
    n_samples: usize,
    n_features: usize,
    centers: usize,
    cluster_std: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_features must be > 0".to_string(),
        ));
    }

    if centers == 0 {
        return Err(DatasetsError::InvalidFormat(
            "centers must be > 0".to_string(),
        ));
    }

    if cluster_std <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "cluster_std must be > 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    // Generate random centers
    let mut cluster_centers = Array2::zeros((centers, n_features));
    let center_box = 10.0;

    for i in 0..centers {
        for j in 0..n_features {
            cluster_centers[[i, j]] = rng.gen_range(-center_box..center_box);
        }
    }

    // Generate samples around centers
    let mut data = Array2::zeros((n_samples, n_features));
    let mut target = Array1::zeros(n_samples);

    let normal = rand_distr::Normal::new(0.0, cluster_std).unwrap();

    // Samples per center
    let samples_per_center = n_samples / centers;
    let remainder = n_samples % centers;

    let mut sample_idx = 0;

    for center_idx in 0..centers {
        let n_samples_center = if center_idx < remainder {
            samples_per_center + 1
        } else {
            samples_per_center
        };

        for _ in 0..n_samples_center {
            for j in 0..n_features {
                data[[sample_idx, j]] = cluster_centers[[center_idx, j]] + normal.sample(&mut rng);
            }

            target[sample_idx] = center_idx as f64;
            sample_idx += 1;
        }
    }

    // Create dataset
    let mut dataset = Dataset::new(data, Some(target));

    // Create feature names
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_description(format!(
            "Synthetic clustering dataset with {centers} clusters and {n_features} features"
        ))
        .with_metadata("centers", &centers.to_string())
        .with_metadata("cluster_std", &cluster_std.to_string());

    Ok(dataset)
}

/// Generate a spiral dataset for non-linear classification
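///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_spirals;
///
/// // Two interleaved spirals of 100 points total with light noise.
/// let dataset = make_spirals(100, 2, 0.05, Some(1)).unwrap();
/// assert_eq!(dataset.data.ncols(), 2); // (x, y) coordinates
/// ```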
#[allow(dead_code)]
pub fn make_spirals(
    n_samples: usize,
    n_spirals: usize,
    noise: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_spirals == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_spirals must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 2));
    let mut target = Array1::zeros(n_samples);

    let normal = if noise > 0.0 {
        Some(rand_distr::Normal::new(0.0, noise).unwrap())
    } else {
        None
    };

    let samples_per_spiral = n_samples / n_spirals;
    let remainder = n_samples % n_spirals;

    let mut sample_idx = 0;

    for spiral in 0..n_spirals {
        let n_samples_spiral = if spiral < remainder {
            samples_per_spiral + 1
        } else {
            samples_per_spiral
        };

        let spiral_offset = 2.0 * PI * spiral as f64 / n_spirals as f64;

        for i in 0..n_samples_spiral {
            let t = 2.0 * PI * i as f64 / n_samples_spiral as f64;
            let radius = t / (2.0 * PI);

            let mut x = radius * (t + spiral_offset).cos();
            let mut y = radius * (t + spiral_offset).sin();

            // Add noise if specified
            if let Some(ref normal_dist) = normal {
                x += normal_dist.sample(&mut rng);
                y += normal_dist.sample(&mut rng);
            }

            data[[sample_idx, 0]] = x;
            data[[sample_idx, 1]] = y;
            target[sample_idx] = spiral as f64;
            sample_idx += 1;
        }
    }

    let mut dataset = Dataset::new(data, Some(target));
    dataset = dataset
        .with_featurenames(vec!["x".to_string(), "y".to_string()])
        .with_targetnames((0..n_spirals).map(|i| format!("spiral_{i}")).collect())
        .with_description(format!("Spiral dataset with {n_spirals} spirals"))
        .with_metadata("noise", &noise.to_string());

    Ok(dataset)
}

/// Generate a moons dataset for non-linear classification
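///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_moons;
///
/// // Two interleaving half-circles, a classic non-linear benchmark.
/// let dataset = make_moons(200, 0.1, Some(42)).unwrap();
/// assert_eq!(dataset.data.dim(), (200, 2));
/// ```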
#[allow(dead_code)]
pub fn make_moons(n_samples: usize, noise: f64, random_seed: Option<u64>) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 2));
    let mut target = Array1::zeros(n_samples);

    let normal = if noise > 0.0 {
        Some(rand_distr::Normal::new(0.0, noise).unwrap())
    } else {
        None
    };

    let samples_per_moon = n_samples / 2;
    let remainder = n_samples % 2;

    let mut sample_idx = 0;

    // Generate first moon (upper crescent)
    for i in 0..(samples_per_moon + remainder) {
        let t = PI * i as f64 / (samples_per_moon + remainder) as f64;

        let mut x = t.cos();
        let mut y = t.sin();

        // Add noise if specified
        if let Some(ref normal_dist) = normal {
            x += normal_dist.sample(&mut rng);
            y += normal_dist.sample(&mut rng);
        }

        data[[sample_idx, 0]] = x;
        data[[sample_idx, 1]] = y;
        target[sample_idx] = 0.0;
        sample_idx += 1;
    }

    // Generate second moon (lower crescent, flipped)
    for i in 0..samples_per_moon {
        let t = PI * i as f64 / samples_per_moon as f64;

        let mut x = 1.0 - t.cos();
        let mut y = 0.5 - t.sin(); // Offset vertically and flip

        // Add noise if specified
        if let Some(ref normal_dist) = normal {
            x += normal_dist.sample(&mut rng);
            y += normal_dist.sample(&mut rng);
        }

        data[[sample_idx, 0]] = x;
        data[[sample_idx, 1]] = y;
        target[sample_idx] = 1.0;
        sample_idx += 1;
    }

    let mut dataset = Dataset::new(data, Some(target));
    dataset = dataset
        .with_featurenames(vec!["x".to_string(), "y".to_string()])
        .with_targetnames(vec!["moon_0".to_string(), "moon_1".to_string()])
        .with_description("Two moons dataset for non-linear classification".to_string())
        .with_metadata("noise", &noise.to_string());

    Ok(dataset)
}

/// Generate a circles dataset for non-linear classification
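///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_circles;
///
/// // Inner circle at half the radius of the outer one (factor = 0.5).
/// let dataset = make_circles(200, 0.5, 0.05, Some(42)).unwrap();
/// assert_eq!(dataset.data.nrows(), 200);
/// ```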
#[allow(dead_code)]
pub fn make_circles(
    n_samples: usize,
    factor: f64,
    noise: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if factor <= 0.0 || factor >= 1.0 {
        return Err(DatasetsError::InvalidFormat(
            "factor must be strictly between 0.0 and 1.0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 2));
    let mut target = Array1::zeros(n_samples);

    let normal = if noise > 0.0 {
        Some(rand_distr::Normal::new(0.0, noise).unwrap())
    } else {
        None
    };

    let samples_per_circle = n_samples / 2;
    let remainder = n_samples % 2;

    let mut sample_idx = 0;

    // Generate outer circle
    for i in 0..(samples_per_circle + remainder) {
        let angle = 2.0 * PI * i as f64 / (samples_per_circle + remainder) as f64;

        let mut x = angle.cos();
        let mut y = angle.sin();

        // Add noise if specified
        if let Some(ref normal_dist) = normal {
            x += normal_dist.sample(&mut rng);
            y += normal_dist.sample(&mut rng);
        }

        data[[sample_idx, 0]] = x;
        data[[sample_idx, 1]] = y;
        target[sample_idx] = 0.0;
        sample_idx += 1;
    }

    // Generate inner circle (scaled by factor)
    for i in 0..samples_per_circle {
        let angle = 2.0 * PI * i as f64 / samples_per_circle as f64;

        let mut x = factor * angle.cos();
        let mut y = factor * angle.sin();

        // Add noise if specified
        if let Some(ref normal_dist) = normal {
            x += normal_dist.sample(&mut rng);
            y += normal_dist.sample(&mut rng);
        }

        data[[sample_idx, 0]] = x;
        data[[sample_idx, 1]] = y;
        target[sample_idx] = 1.0;
        sample_idx += 1;
    }

    let mut dataset = Dataset::new(data, Some(target));
    dataset = dataset
        .with_featurenames(vec!["x".to_string(), "y".to_string()])
        .with_targetnames(vec!["outer_circle".to_string(), "inner_circle".to_string()])
        .with_description("Concentric circles dataset for non-linear classification".to_string())
        .with_metadata("factor", &factor.to_string())
        .with_metadata("noise", &noise.to_string());

    Ok(dataset)
}

/// Generate a Swiss roll dataset for dimensionality reduction
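///
/// # Example
///
/// A minimal sketch (module path assumed). The returned target holds the roll
/// parameter `t`, useful as a color map when plotting:
///
/// ```ignore
/// use scirs2_datasets::generators::make_swiss_roll;
///
/// let dataset = make_swiss_roll(1000, 0.05, Some(42)).unwrap();
/// assert_eq!(dataset.data.ncols(), 3); // 3D points on a 2D manifold
/// ```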
#[allow(dead_code)]
pub fn make_swiss_roll(n_samples: usize, noise: f64, random_seed: Option<u64>) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut color = Array1::zeros(n_samples); // Color parameter for visualization

    let normal = if noise > 0.0 {
        Some(rand_distr::Normal::new(0.0, noise).unwrap())
    } else {
        None
    };

    for i in 0..n_samples {
        // Parameter along the roll
        let t = 1.5 * PI * (1.0 + 2.0 * i as f64 / n_samples as f64);

        // Height parameter
        let height = 21.0 * i as f64 / n_samples as f64;

        let mut x = t * t.cos();
        let mut y = height;
        let mut z = t * t.sin();

        // Add noise if specified
        if let Some(ref normal_dist) = normal {
            x += normal_dist.sample(&mut rng);
            y += normal_dist.sample(&mut rng);
            z += normal_dist.sample(&mut rng);
        }

        data[[i, 0]] = x;
        data[[i, 1]] = y;
        data[[i, 2]] = z;
        color[i] = t; // Color based on parameter for visualization
    }

    let mut dataset = Dataset::new(data, Some(color));
    dataset = dataset
        .with_featurenames(vec!["x".to_string(), "y".to_string(), "z".to_string()])
        .with_description("Swiss roll manifold dataset for dimensionality reduction".to_string())
        .with_metadata("noise", &noise.to_string())
        .with_metadata("dimensions", "3")
        .with_metadata("manifold_dim", "2");

    Ok(dataset)
}

/// Generate anisotropic (elongated) clusters
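///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_anisotropic_blobs;
///
/// // 3 clusters in 2D, each stretched 4x along a random direction.
/// let dataset = make_anisotropic_blobs(300, 2, 3, 1.0, 4.0, Some(42)).unwrap();
/// assert_eq!(dataset.data.nrows(), 300);
/// ```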
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub fn make_anisotropic_blobs(
    n_samples: usize,
    n_features: usize,
    centers: usize,
    cluster_std: f64,
    anisotropy_factor: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_features < 2 {
        return Err(DatasetsError::InvalidFormat(
            "n_features must be >= 2 for anisotropic clusters".to_string(),
        ));
    }

    if centers == 0 {
        return Err(DatasetsError::InvalidFormat(
            "centers must be > 0".to_string(),
        ));
    }

    if cluster_std <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "cluster_std must be > 0.0".to_string(),
        ));
    }

    if anisotropy_factor <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "anisotropy_factor must be > 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    // Generate random centers
    let mut cluster_centers = Array2::zeros((centers, n_features));
    let center_box = 10.0;

    for i in 0..centers {
        for j in 0..n_features {
            cluster_centers[[i, j]] = rng.gen_range(-center_box..center_box);
        }
    }

    // Generate samples around centers with anisotropic distribution
    let mut data = Array2::zeros((n_samples, n_features));
    let mut target = Array1::zeros(n_samples);

    let normal = rand_distr::Normal::new(0.0, cluster_std).unwrap();

    let samples_per_center = n_samples / centers;
    let remainder = n_samples % centers;

    let mut sample_idx = 0;

    for center_idx in 0..centers {
        let n_samples_center = if center_idx < remainder {
            samples_per_center + 1
        } else {
            samples_per_center
        };

        // Generate a random rotation angle for this cluster
        let rotation_angle = rng.gen_range(0.0..(2.0 * PI));

        for _ in 0..n_samples_center {
            // Generate point with anisotropic distribution (elongated along first axis)
            let mut point = vec![0.0; n_features];

            // First axis keeps the full std; second axis gets reduced std (anisotropy)
            point[0] = normal.sample(&mut rng);
            point[1] = normal.sample(&mut rng) / anisotropy_factor;

            // Remaining axes keep the full std
            for item in point.iter_mut().take(n_features).skip(2) {
                *item = normal.sample(&mut rng);
            }

            // Apply rotation in the first two dimensions
            if n_features >= 2 {
                let cos_theta = rotation_angle.cos();
                let sin_theta = rotation_angle.sin();

                let x_rot = cos_theta * point[0] - sin_theta * point[1];
                let y_rot = sin_theta * point[0] + cos_theta * point[1];

                point[0] = x_rot;
                point[1] = y_rot;
            }

            // Translate to cluster center
            for j in 0..n_features {
                data[[sample_idx, j]] = cluster_centers[[center_idx, j]] + point[j];
            }

            target[sample_idx] = center_idx as f64;
            sample_idx += 1;
        }
    }

    let mut dataset = Dataset::new(data, Some(target));
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_description(format!(
            "Anisotropic clustering dataset with {centers} elongated clusters and {n_features} features"
        ))
        .with_metadata("centers", &centers.to_string())
        .with_metadata("cluster_std", &cluster_std.to_string())
        .with_metadata("anisotropy_factor", &anisotropy_factor.to_string());

    Ok(dataset)
}

/// Generate hierarchical clusters (clusters within clusters)
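///
/// # Example
///
/// A minimal sketch (module path assumed). The main-cluster label is the
/// target; sub-cluster labels are stored in the metadata:
///
/// ```ignore
/// use scirs2_datasets::generators::make_hierarchical_clusters;
///
/// // 3 main clusters, each split into 2 sub-clusters.
/// let dataset =
///     make_hierarchical_clusters(600, 2, 3, 2, 3.0, 0.5, Some(42)).unwrap();
/// assert_eq!(dataset.data.nrows(), 600);
/// ```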
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub fn make_hierarchical_clusters(
    n_samples: usize,
    n_features: usize,
    n_main_clusters: usize,
    n_sub_clusters: usize,
    main_cluster_std: f64,
    sub_cluster_std: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate input parameters
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_features == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_features must be > 0".to_string(),
        ));
    }

    if n_main_clusters == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_main_clusters must be > 0".to_string(),
        ));
    }

    if n_sub_clusters == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_sub_clusters must be > 0".to_string(),
        ));
    }

    if main_cluster_std <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "main_cluster_std must be > 0.0".to_string(),
        ));
    }

    if sub_cluster_std <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "sub_cluster_std must be > 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    // Generate main cluster centers
    let mut main_centers = Array2::zeros((n_main_clusters, n_features));
    let center_box = 20.0;

    for i in 0..n_main_clusters {
        for j in 0..n_features {
            main_centers[[i, j]] = rng.gen_range(-center_box..center_box);
        }
    }

    let mut data = Array2::zeros((n_samples, n_features));
    let mut main_target = Array1::zeros(n_samples);
    let mut sub_target = Array1::zeros(n_samples);

    let main_normal = rand_distr::Normal::new(0.0, main_cluster_std).unwrap();
    let sub_normal = rand_distr::Normal::new(0.0, sub_cluster_std).unwrap();

    let samples_per_main = n_samples / n_main_clusters;
    let remainder = n_samples % n_main_clusters;

    let mut sample_idx = 0;

    for main_idx in 0..n_main_clusters {
        let n_samples_main = if main_idx < remainder {
            samples_per_main + 1
        } else {
            samples_per_main
        };

        // Generate sub-cluster centers within this main cluster
        let mut sub_centers = Array2::zeros((n_sub_clusters, n_features));
        for i in 0..n_sub_clusters {
            for j in 0..n_features {
                sub_centers[[i, j]] = main_centers[[main_idx, j]] + main_normal.sample(&mut rng);
            }
        }

        let samples_per_sub = n_samples_main / n_sub_clusters;
        let sub_remainder = n_samples_main % n_sub_clusters;

        for sub_idx in 0..n_sub_clusters {
            let n_samples_sub = if sub_idx < sub_remainder {
                samples_per_sub + 1
            } else {
                samples_per_sub
            };

            for _ in 0..n_samples_sub {
                for j in 0..n_features {
                    data[[sample_idx, j]] = sub_centers[[sub_idx, j]] + sub_normal.sample(&mut rng);
                }

                main_target[sample_idx] = main_idx as f64;
                sub_target[sample_idx] = (main_idx * n_sub_clusters + sub_idx) as f64;
                sample_idx += 1;
            }
        }
    }

    let mut dataset = Dataset::new(data, Some(main_target));
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_description(format!(
            "Hierarchical clustering dataset with {n_main_clusters} main clusters, {n_sub_clusters} sub-clusters each"
        ))
        .with_metadata("n_main_clusters", &n_main_clusters.to_string())
        .with_metadata("n_sub_clusters", &n_sub_clusters.to_string())
        .with_metadata("main_cluster_std", &main_cluster_std.to_string())
        .with_metadata("sub_cluster_std", &sub_cluster_std.to_string());

    let sub_target_vec = sub_target.to_vec();
    dataset = dataset.with_metadata("sub_cluster_labels", &format!("{sub_target_vec:?}"));

    Ok(dataset)
}

/// Missing data patterns for noise injection
#[derive(Debug, Clone, Copy)]
pub enum MissingPattern {
    /// Missing Completely at Random - uniform probability across all features
    MCAR,
    /// Missing at Random - probability depends on observed values
    MAR,
    /// Missing Not at Random - probability depends on the missing values themselves
    MNAR,
    /// Block-wise missing - entire blocks of consecutive features/samples missing
    Block,
}

/// Outlier types for injection
#[derive(Debug, Clone, Copy)]
pub enum OutlierType {
    /// Point outliers - individual data points that are anomalous
    Point,
    /// Contextual outliers - points anomalous in specific contexts
    Contextual,
    /// Collective outliers - groups of points that together form an anomaly
    Collective,
}

/// Inject missing data into a dataset with realistic patterns
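///
/// # Example
///
/// A minimal sketch (module path assumed). The data is modified in place and
/// the returned mask marks which cells were set to NaN:
///
/// ```ignore
/// use scirs2_datasets::generators::{inject_missing_data, MissingPattern};
/// use ndarray::Array2;
///
/// let mut data = Array2::<f64>::ones((100, 5));
/// let mask = inject_missing_data(&mut data, 0.1, MissingPattern::MCAR, Some(42)).unwrap();
/// let n_missing = mask.iter().filter(|&&m| m).count();
/// assert_eq!(n_missing, data.iter().filter(|x| x.is_nan()).count());
/// ```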
#[allow(dead_code)]
pub fn inject_missing_data(
    data: &mut Array2<f64>,
    missing_rate: f64,
    pattern: MissingPattern,
    random_seed: Option<u64>,
) -> Result<Array2<bool>> {
    // Validate input parameters
    if !(0.0..=1.0).contains(&missing_rate) {
        return Err(DatasetsError::InvalidFormat(
            "missing_rate must be between 0.0 and 1.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let (n_samples, n_features) = data.dim();
    let mut missing_mask = Array2::from_elem((n_samples, n_features), false);

    match pattern {
        MissingPattern::MCAR => {
            // Missing Completely at Random - uniform probability
            for i in 0..n_samples {
                for j in 0..n_features {
                    if rng.gen_range(0.0f64..1.0) < missing_rate {
                        missing_mask[[i, j]] = true;
                        data[[i, j]] = f64::NAN;
                    }
                }
            }
        }
        MissingPattern::MAR => {
            // Missing at Random - probability depends on the first feature
            for i in 0..n_samples {
                let first_feature_val = data[[i, 0]];
                let normalized_val = (first_feature_val + 10.0) / 20.0; // Normalize roughly to [0, 1]
                let adjusted_rate = missing_rate * normalized_val.clamp(0.1, 2.0);

                // Skip the first feature, since the mechanism conditions on it
                for j in 1..n_features {
                    if rng.gen_range(0.0f64..1.0) < adjusted_rate {
                        missing_mask[[i, j]] = true;
                        data[[i, j]] = f64::NAN;
                    }
                }
            }
        }
        MissingPattern::MNAR => {
            // Missing Not at Random - higher values are more likely to be missing
            for i in 0..n_samples {
                for j in 0..n_features {
                    let value = data[[i, j]];
                    let normalized_val = (value + 10.0) / 20.0; // Normalize roughly to [0, 1]
                    let adjusted_rate = missing_rate * normalized_val.clamp(0.1, 3.0);

                    if rng.gen_range(0.0f64..1.0) < adjusted_rate {
                        missing_mask[[i, j]] = true;
                        data[[i, j]] = f64::NAN;
                    }
                }
            }
        }
        MissingPattern::Block => {
            // Block-wise missing - entire blocks are missing
            let block_size = (n_features as f64 * missing_rate).ceil() as usize;
            let n_blocks = (missing_rate * n_samples as f64).ceil() as usize;

            for _ in 0..n_blocks {
                let start_row = rng.sample(Uniform::new(0, n_samples).unwrap());
                // Clamp the upper bound to at least 1 so the sampling range is
                // never empty when block_size >= n_features
                let start_col = rng.sample(
                    Uniform::new(0, n_features.saturating_sub(block_size).max(1)).unwrap(),
                );

                for i in start_row..n_samples.min(start_row + block_size) {
                    for j in start_col..n_features.min(start_col + block_size) {
                        missing_mask[[i, j]] = true;
                        data[[i, j]] = f64::NAN;
                    }
                }
            }
        }
    }

    Ok(missing_mask)
}

/// Inject outliers into a dataset
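///
/// # Example
///
/// A minimal sketch (module path assumed). Rows flagged in the returned mask
/// were overwritten with anomalous values:
///
/// ```ignore
/// use scirs2_datasets::generators::{inject_outliers, OutlierType};
/// use ndarray::Array2;
///
/// let mut data = Array2::<f64>::zeros((100, 4));
/// // Turn ~5% of rows into point outliers, 3 standard deviations out.
/// let mask = inject_outliers(&mut data, 0.05, OutlierType::Point, 3.0, Some(42)).unwrap();
/// assert!(mask.iter().any(|&m| m));
/// ```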
#[allow(dead_code)]
pub fn inject_outliers(
    data: &mut Array2<f64>,
    outlier_rate: f64,
    outlier_type: OutlierType,
    outlier_strength: f64,
    random_seed: Option<u64>,
) -> Result<Array1<bool>> {
    // Validate input parameters
    if !(0.0..=1.0).contains(&outlier_rate) {
        return Err(DatasetsError::InvalidFormat(
            "outlier_rate must be between 0.0 and 1.0".to_string(),
        ));
    }

    if outlier_strength <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "outlier_strength must be > 0.0".to_string(),
        ));
    }

    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let (n_samples, n_features) = data.dim();
    let n_outliers = (n_samples as f64 * outlier_rate).ceil() as usize;
    let mut outlier_mask = Array1::from_elem(n_samples, false);

    // Calculate data statistics for outlier generation
    let mut feature_means = vec![0.0; n_features];
    let mut feature_stds = vec![0.0; n_features];

    for j in 0..n_features {
        let column = data.column(j);
        let valid_values: Vec<f64> = column.iter().filter(|&&x| !x.is_nan()).cloned().collect();

        if !valid_values.is_empty() {
            feature_means[j] = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
            let variance = valid_values
                .iter()
                .map(|&x| (x - feature_means[j]).powi(2))
                .sum::<f64>()
                / valid_values.len() as f64;
            feature_stds[j] = variance.sqrt().max(1.0); // Use a minimum std of 1.0 so outliers can always be created
        }
    }

    match outlier_type {
        OutlierType::Point => {
            // Point outliers - individual anomalous points
            for _ in 0..n_outliers {
                let outlier_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
                outlier_mask[outlier_idx] = true;

                // Modify each feature to be an outlier
                for j in 0..n_features {
                    let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
                        -1.0
                    } else {
                        1.0
                    };
                    data[[outlier_idx, j]] =
                        feature_means[j] + direction * outlier_strength * feature_stds[j];
                }
            }
        }
        OutlierType::Contextual => {
            // Contextual outliers - anomalous in specific feature combinations
            for _ in 0..n_outliers {
                let outlier_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
                outlier_mask[outlier_idx] = true;

                // Only modify a subset of features to create a contextual anomaly
                let n_features_to_modify =
                    rng.sample(Uniform::new(1, (n_features / 2).max(1) + 1).unwrap());
                let mut features_to_modify: Vec<usize> = (0..n_features).collect();
                features_to_modify.shuffle(&mut rng);
                features_to_modify.truncate(n_features_to_modify);

                for &j in &features_to_modify {
                    let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
                        -1.0
                    } else {
                        1.0
                    };
                    data[[outlier_idx, j]] =
                        feature_means[j] + direction * outlier_strength * feature_stds[j];
                }
            }
        }
        OutlierType::Collective => {
            // Collective outliers - groups of points that together form anomalies
            let outliers_per_group = (n_outliers / 3).max(2); // At least 2 per group
            let n_groups = (n_outliers / outliers_per_group).max(1);

            for _ in 0..n_groups {
                // Generate a cluster center for this collective outlier
                let mut outlier_center = vec![0.0; n_features];
                for j in 0..n_features {
                    let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
                        -1.0
                    } else {
                        1.0
                    };
                    outlier_center[j] =
                        feature_means[j] + direction * outlier_strength * feature_stds[j];
                }

                // Generate points around this center
                for _ in 0..outliers_per_group {
                    let outlier_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
                    outlier_mask[outlier_idx] = true;

                    for j in 0..n_features {
                        let noise = rng.gen_range(-0.5f64..0.5f64) * feature_stds[j];
                        data[[outlier_idx, j]] = outlier_center[j] + noise;
                    }
                }
            }
        }
    }

    Ok(outlier_mask)
}

/// Add realistic noise patterns to time series data
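///
/// # Example
///
/// A minimal sketch (module path assumed). Several noise types can be layered
/// in one call; each entry is a `(noise_type, strength)` pair:
///
/// ```ignore
/// use scirs2_datasets::generators::add_time_series_noise;
/// use ndarray::Array2;
///
/// let mut series = Array2::<f64>::zeros((500, 2));
/// add_time_series_noise(
///     &mut series,
///     &[("gaussian", 0.1), ("drift", 0.5), ("spikes", 0.2)],
///     Some(42),
/// )
/// .unwrap();
/// ```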
#[allow(dead_code)]
pub fn add_time_series_noise(
    data: &mut Array2<f64>,
    noise_types: &[(&str, f64)], // (noise_type, strength)
    random_seed: Option<u64>,
) -> Result<()> {
    let mut rng = match random_seed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let (n_samples, n_features) = data.dim();
    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();

    for &(noise_type, strength) in noise_types {
        match noise_type {
            "gaussian" => {
                // Add Gaussian white noise
                for i in 0..n_samples {
                    for j in 0..n_features {
                        data[[i, j]] += strength * normal.sample(&mut rng);
                    }
                }
            }
            "spikes" => {
                // Add random spikes (impulse noise)
                let n_spikes = (n_samples as f64 * strength * 0.1).ceil() as usize;
                for _ in 0..n_spikes {
                    let spike_idx = rng.sample(Uniform::new(0, n_samples).unwrap());
                    let feature_idx = rng.sample(Uniform::new(0, n_features).unwrap());
                    let spike_magnitude = rng.gen_range(5.0..=15.0) * strength;
                    let direction = if rng.gen_range(0.0f64..1.0) < 0.5 {
                        -1.0
                    } else {
                        1.0
                    };

                    data[[spike_idx, feature_idx]] += direction * spike_magnitude;
                }
            }
            "drift" => {
                // Add gradual drift over time
                for i in 0..n_samples {
                    let drift_amount = strength * (i as f64 / n_samples as f64);
                    for j in 0..n_features {
                        data[[i, j]] += drift_amount;
                    }
                }
            }
            "seasonal" => {
                // Add seasonal pattern noise
                let period = n_samples as f64 / 4.0; // 4 seasons
                for i in 0..n_samples {
                    let seasonal_component = strength * (2.0 * PI * i as f64 / period).sin();
                    for j in 0..n_features {
                        data[[i, j]] += seasonal_component;
                    }
                }
            }
            "autocorrelated" => {
                // Add autocorrelated noise (AR(1) process)
                let ar_coeff = 0.7; // Autocorrelation coefficient
                for j in 0..n_features {
                    let mut prev_noise = 0.0;
                    for i in 0..n_samples {
                        let new_noise = ar_coeff * prev_noise + strength * normal.sample(&mut rng);
                        data[[i, j]] += new_noise;
                        prev_noise = new_noise;
                    }
                }
            }
            "heteroscedastic" => {
                // Add heteroscedastic noise (variance grows over time)
                for i in 0..n_samples {
                    let variance_factor = 1.0 + strength * (i as f64 / n_samples as f64);
                    for j in 0..n_features {
                        data[[i, j]] += variance_factor * strength * normal.sample(&mut rng);
                    }
                }
            }
            _ => {
                return Err(DatasetsError::InvalidFormat(format!(
                    "Unknown noise type: {noise_type}. Supported types: gaussian, spikes, drift, seasonal, autocorrelated, heteroscedastic"
                )));
            }
        }
    }

    Ok(())
}

/// Generate a dataset with controlled corruption patterns
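///
/// # Example
///
/// A minimal sketch (module path assumed) that corrupts a clean blobs dataset
/// with MCAR missing values and point outliers:
///
/// ```ignore
/// use scirs2_datasets::generators::{
///     make_blobs, make_corrupted_dataset, MissingPattern, OutlierType,
/// };
///
/// let clean = make_blobs(200, 4, 3, 1.0, Some(42)).unwrap();
/// let corrupted = make_corrupted_dataset(
///     &clean,
///     0.1,                  // missing_rate
///     MissingPattern::MCAR,
///     0.05,                 // outlier_rate
///     OutlierType::Point,
///     3.0,                  // outlier_strength
///     Some(42),
/// )
/// .unwrap();
/// assert_eq!(corrupted.data.dim(), clean.data.dim());
/// ```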
#[allow(dead_code)]
pub fn make_corrupted_dataset(
    base_dataset: &Dataset,
    missing_rate: f64,
    missing_pattern: MissingPattern,
    outlier_rate: f64,
    outlier_type: OutlierType,
    outlier_strength: f64,
    random_seed: Option<u64>,
) -> Result<Dataset> {
    // Validate inputs
    if !(0.0..=1.0).contains(&missing_rate) {
        return Err(DatasetsError::InvalidFormat(
            "missing_rate must be between 0.0 and 1.0".to_string(),
        ));
    }

    if !(0.0..=1.0).contains(&outlier_rate) {
        return Err(DatasetsError::InvalidFormat(
            "outlier_rate must be between 0.0 and 1.0".to_string(),
        ));
    }

    // Clone the base dataset
    let mut corrupted_data = base_dataset.data.clone();
    let corrupted_target = base_dataset.target.clone();

    // Apply missing data
    let missing_mask = inject_missing_data(
        &mut corrupted_data,
        missing_rate,
        missing_pattern,
        random_seed,
    )?;

    // Apply outliers
    let outlier_mask = inject_outliers(
        &mut corrupted_data,
        outlier_rate,
        outlier_type,
        outlier_strength,
        random_seed,
    )?;

    // Create a new dataset with corruption metadata
    let mut corrupted_dataset = Dataset::new(corrupted_data, corrupted_target);

    if let Some(featurenames) = &base_dataset.featurenames {
        corrupted_dataset = corrupted_dataset.with_featurenames(featurenames.clone());
    }

    if let Some(targetnames) = &base_dataset.targetnames {
        corrupted_dataset = corrupted_dataset.with_targetnames(targetnames.clone());
    }

    corrupted_dataset = corrupted_dataset
        .with_description(format!(
            "Corrupted version of: {}",
            base_dataset
                .description
                .as_deref()
                .unwrap_or("Unknown dataset")
        ))
        .with_metadata("missing_rate", &missing_rate.to_string())
        .with_metadata("missing_pattern", &format!("{missing_pattern:?}"))
        .with_metadata("outlier_rate", &outlier_rate.to_string())
        .with_metadata("outlier_type", &format!("{outlier_type:?}"))
        .with_metadata("outlier_strength", &outlier_strength.to_string())
        .with_metadata(
            "missing_count",
            &missing_mask.iter().filter(|&&x| x).count().to_string(),
        )
        .with_metadata(
            "outlier_count",
            &outlier_mask.iter().filter(|&&x| x).count().to_string(),
        );

    Ok(corrupted_dataset)
}

/// GPU-accelerated data generation configuration
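///
/// # Example
///
/// A minimal sketch of the builder-style configuration:
///
/// ```ignore
/// use scirs2_datasets::generators::GpuConfig;
///
/// let config = GpuConfig::new()
///     .with_device(0)
///     .with_single_precision(true)
///     .with_chunk_size(50_000);
/// assert!(config.use_gpu); // GPU is enabled by default
/// ```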
#[derive(Debug, Clone)]
pub struct GpuConfig {
    /// Whether to use GPU acceleration
    pub use_gpu: bool,
    /// GPU device index (0 for default)
    pub device_id: usize,
    /// Whether to use single precision (f32) instead of double (f64)
    pub use_single_precision: bool,
    /// Chunk size for GPU operations
    pub chunk_size: usize,
}

impl Default for GpuConfig {
    fn default() -> Self {
        Self {
            use_gpu: true,
            device_id: 0,
            use_single_precision: false,
            chunk_size: 10000,
        }
    }
}

impl GpuConfig {
    /// Create a new GPU configuration
    pub fn new() -> Self {
        Self::default()
    }

    /// Set whether to use GPU
    pub fn with_gpu(mut self, use_gpu: bool) -> Self {
        self.use_gpu = use_gpu;
        self
    }

    /// Set GPU device ID
    pub fn with_device(mut self, device_id: usize) -> Self {
        self.device_id = device_id;
        self
    }

    /// Set precision mode
    pub fn with_single_precision(mut self, single_precision: bool) -> Self {
        self.use_single_precision = single_precision;
        self
    }

    /// Set chunk size
    pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
        self.chunk_size = chunk_size;
        self
    }
}

/// GPU-accelerated classification dataset generation
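///
/// Falls back to the CPU implementation when the GPU is unavailable or
/// disabled in the config.
///
/// # Example
///
/// A minimal sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::{make_classification_gpu, GpuConfig};
///
/// let config = GpuConfig::new().with_chunk_size(10_000);
/// // Same parameters as make_classification, plus the GPU config.
/// let dataset = make_classification_gpu(1_000_000, 20, 5, 2, 15, Some(42), config).unwrap();
/// ```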
1540#[allow(dead_code)]
1541#[allow(clippy::too_many_arguments)]
1542pub fn make_classification_gpu(
1543    n_samples: usize,
1544    n_features: usize,
1545    n_classes: usize,
1546    n_clusters_per_class: usize,
1547    n_informative: usize,
1548    randomseed: Option<u64>,
1549    gpuconfig: GpuConfig,
1550) -> Result<Dataset> {
1551    // Check if GPU is available and requested
1552    if gpuconfig.use_gpu && gpu_is_available() {
1553        make_classification_gpu_impl(
1554            n_samples,
1555            n_features,
1556            n_classes,
1557            n_clusters_per_class,
1558            n_informative,
1559            randomseed,
1560            gpuconfig,
1561        )
1562    } else {
1563        // Fallback to CPU implementation
1564        make_classification(
1565            n_samples,
1566            n_features,
1567            n_classes,
1568            n_clusters_per_class,
1569            n_informative,
1570            randomseed,
1571        )
1572    }
1573}

/// Internal GPU implementation for classification data generation
#[allow(dead_code)]
fn make_classification_gpu_impl(
    n_samples: usize,
    n_features: usize,
    n_classes: usize,
    n_clusters_per_class: usize,
    n_informative: usize,
    randomseed: Option<u64>,
    gpuconfig: GpuConfig,
) -> Result<Dataset> {
    // Input validation
    if n_samples == 0 || n_features == 0 || n_informative == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples, n_features, and n_informative must be > 0".to_string(),
        ));
    }

    if n_features < n_informative {
        return Err(DatasetsError::InvalidFormat(format!(
            "n_features ({n_features}) must be >= n_informative ({n_informative})"
        )));
    }

    if n_classes < 2 || n_clusters_per_class == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_classes must be >= 2 and n_clusters_per_class must be > 0".to_string(),
        ));
    }

    // Create GPU context
    let gpu_context = GpuContext::new(crate::gpu::GpuConfig {
        backend: crate::gpu::GpuBackend::Cuda {
            device_id: gpuconfig.device_id as u32,
        },
        memory: crate::gpu::GpuMemoryConfig::default(),
        threads_per_block: 256,
        enable_double_precision: !gpuconfig.use_single_precision,
        use_fast_math: false,
        random_seed: None,
    })
    .map_err(|e| DatasetsError::Other(format!("Failed to create GPU context: {e}")))?;

    // Generate data in chunks to avoid memory issues
    let chunk_size = std::cmp::min(gpuconfig.chunk_size, n_samples);
    let num_chunks = n_samples.div_ceil(chunk_size);

    let mut all_data = Vec::new();
    let mut all_targets = Vec::new();

    for chunk_idx in 0..num_chunks {
        let start_idx = chunk_idx * chunk_size;
        let end_idx = std::cmp::min(start_idx + chunk_size, n_samples);
        let chunk_samples = end_idx - start_idx;

        // Generate the chunk, deriving a distinct seed per chunk so that
        // chunks differ even when no explicit seed was provided
        let (chunk_data, chunk_targets) = generate_classification_chunk_gpu(
            &gpu_context,
            chunk_samples,
            n_features,
            n_classes,
            n_clusters_per_class,
            n_informative,
            Some(randomseed.unwrap_or(42).wrapping_add(chunk_idx as u64)),
            gpuconfig.use_single_precision,
        )?;

        all_data.extend(chunk_data);
        all_targets.extend(chunk_targets);
    }

    // Convert to ndarray
    let data = Array2::from_shape_vec((n_samples, n_features), all_data)
        .map_err(|e| DatasetsError::Other(format!("Failed to create data array: {e}")))?;

    let target = Array1::from_vec(all_targets);

    // Create dataset
    let mut dataset = Dataset::new(data, Some(target));

    // Add metadata
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
    let classnames: Vec<String> = (0..n_classes).map(|i| format!("class_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_targetnames(classnames)
        .with_description(format!(
            "GPU-accelerated synthetic classification dataset with {n_classes} classes and {n_features} features"
        ));

    Ok(dataset)
}

/// Generate a chunk of classification data on GPU
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
fn generate_classification_chunk_gpu(
    gpu_context: &GpuContext,
    n_samples: usize,
    n_features: usize,
    n_classes: usize,
    n_clusters_per_class: usize,
    n_informative: usize,
    randomseed: Option<u64>,
    _use_single_precision: bool,
) -> Result<(Vec<f64>, Vec<f64>)> {
    // For now, implement using GPU matrix operations
    // In a real implementation, this would use custom GPU kernels

    let seed = randomseed.unwrap_or(42);
    let mut rng = StdRng::seed_from_u64(seed);

    // Generate centroids
    let n_centroids = n_classes * n_clusters_per_class;
    let mut centroids = vec![0.0; n_centroids * n_informative];

    for i in 0..n_centroids {
        for j in 0..n_informative {
            centroids[i * n_informative + j] = 2.0 * rng.gen_range(-1.0f64..1.0f64);
        }
    }

    // Use GPU buffer operations for accelerated data generation when a
    // non-CPU backend is available
    if *gpu_context.backend() != LocalGpuBackend::Cpu {
        return generate_classification_gpu_optimized(
            gpu_context,
            &centroids,
            n_samples,
            n_features,
            n_classes,
            n_clusters_per_class,
            n_informative,
            &mut rng,
        );
    }

    // CPU fallback: generate samples sequentially
    let mut data = vec![0.0; n_samples * n_features];
    let mut targets = vec![0.0; n_samples];

    let informative_noise = rand_distr::Normal::new(0.0, 0.3).unwrap();
    let feature_noise = rand_distr::Normal::new(0.0, 1.0).unwrap();

    let samples_per_class = n_samples / n_classes;
    let remainder = n_samples % n_classes;

    let mut sample_idx = 0;
    for class in 0..n_classes {
        let n_samples_class = if class < remainder {
            samples_per_class + 1
        } else {
            samples_per_class
        };

        let samples_per_cluster = n_samples_class / n_clusters_per_class;
        let cluster_remainder = n_samples_class % n_clusters_per_class;

        for cluster in 0..n_clusters_per_class {
            let n_samples_cluster = if cluster < cluster_remainder {
                samples_per_cluster + 1
            } else {
                samples_per_cluster
            };

            let centroid_idx = class * n_clusters_per_class + cluster;

            for _ in 0..n_samples_cluster {
                // Generate informative features around the cluster centroid
                for j in 0..n_informative {
                    let centroid_val = centroids[centroid_idx * n_informative + j];
                    data[sample_idx * n_features + j] =
                        centroid_val + informative_noise.sample(&mut rng);
                }

                // Add noise features
                for j in n_informative..n_features {
                    data[sample_idx * n_features + j] = feature_noise.sample(&mut rng);
                }

                targets[sample_idx] = class as f64;
                sample_idx += 1;
            }
        }
    }

    Ok((data, targets))
}

/// GPU-optimized classification data generation using buffer operations
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
fn generate_classification_gpu_optimized(
    _gpu_context: &GpuContext,
    centroids: &[f64],
    n_samples: usize,
    n_features: usize,
    n_classes: usize,
    n_clusters_per_class: usize,
    n_informative: usize,
    rng: &mut StdRng,
) -> Result<(Vec<f64>, Vec<f64>)> {
    // For now, use a CPU-based implementation since the core GPU features
    // are not yet available.
    // TODO: Implement proper GPU acceleration when the core GPU features
    // are stabilized.
    use rand_distr::Distribution;
    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();

    let mut data = vec![0.0; n_samples * n_features];
    let mut targets = vec![0.0; n_samples];

    // Samples per class
    let samples_per_class = n_samples / n_classes;
    let remainder = n_samples % n_classes;

    let mut sample_idx = 0;

    for class in 0..n_classes {
        let n_samples_class = if class < remainder {
            samples_per_class + 1
        } else {
            samples_per_class
        };

        // Samples per cluster within this class
        let samples_per_cluster = n_samples_class / n_clusters_per_class;
        let cluster_remainder = n_samples_class % n_clusters_per_class;

        for cluster in 0..n_clusters_per_class {
            let n_samples_cluster = if cluster < cluster_remainder {
                samples_per_cluster + 1
            } else {
                samples_per_cluster
            };

            let centroid_idx = class * n_clusters_per_class + cluster;

            for _ in 0..n_samples_cluster {
                // Generate informative features around the cluster centroid
                for j in 0..n_informative {
                    let centroid_val = centroids[centroid_idx * n_informative + j];
                    data[sample_idx * n_features + j] = centroid_val + 0.3 * normal.sample(rng);
                }

                // Generate noise features
                for j in n_informative..n_features {
                    data[sample_idx * n_features + j] = normal.sample(rng);
                }

                targets[sample_idx] = class as f64;
                sample_idx += 1;
            }
        }
    }

    Ok((data, targets))
}

/// GPU-accelerated regression dataset generation
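///
/// # Examples
///
/// A minimal usage sketch (assumes `make_regression_gpu` and `GpuConfig`
/// are re-exported from this crate's `generators` module):
///
/// ```ignore
/// use scirs2_datasets::generators::{make_regression_gpu, GpuConfig};
///
/// // Falls back to the CPU path automatically when no GPU is available.
/// let dataset =
///     make_regression_gpu(1_000, 10, 5, 0.1, Some(42), GpuConfig::default()).unwrap();
/// assert_eq!(dataset.n_samples(), 1_000);
/// ```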
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn make_regression_gpu(
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    noise: f64,
    randomseed: Option<u64>,
    gpuconfig: GpuConfig,
) -> Result<Dataset> {
    // Check if GPU is available and requested
    if gpuconfig.use_gpu && gpu_is_available() {
        make_regression_gpu_impl(
            n_samples,
            n_features,
            n_informative,
            noise,
            randomseed,
            gpuconfig,
        )
    } else {
        // Fallback to CPU implementation
        make_regression(n_samples, n_features, n_informative, noise, randomseed)
    }
}

/// Internal GPU implementation for regression data generation
#[allow(dead_code)]
fn make_regression_gpu_impl(
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    noise: f64,
    randomseed: Option<u64>,
    gpuconfig: GpuConfig,
) -> Result<Dataset> {
    // Input validation
    if n_samples == 0 || n_features == 0 || n_informative == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples, n_features, and n_informative must be > 0".to_string(),
        ));
    }

    if n_features < n_informative {
        return Err(DatasetsError::InvalidFormat(format!(
            "n_features ({n_features}) must be >= n_informative ({n_informative})"
        )));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    // Create GPU context
    let gpu_context = GpuContext::new(crate::gpu::GpuConfig {
        backend: crate::gpu::GpuBackend::Cuda {
            device_id: gpuconfig.device_id as u32,
        },
        memory: crate::gpu::GpuMemoryConfig::default(),
        threads_per_block: 256,
        enable_double_precision: !gpuconfig.use_single_precision,
        use_fast_math: false,
        random_seed: None,
    })
    .map_err(|e| DatasetsError::Other(format!("Failed to create GPU context: {e}")))?;

    let seed = randomseed.unwrap_or(42);
    let mut rng = StdRng::seed_from_u64(seed);

    // Generate the regression coefficients on the CPU; only the bulk data
    // generation below is chunked for the GPU
    let mut coefficients = vec![0.0; n_informative];
    for coeff in coefficients.iter_mut() {
        *coeff = rng.gen_range(-2.0f64..2.0f64);
    }

    // Generate the data matrix in chunks
    let chunk_size = std::cmp::min(gpuconfig.chunk_size, n_samples);
    let num_chunks = n_samples.div_ceil(chunk_size);

    let mut all_data = Vec::new();
    let mut all_targets = Vec::new();

    for chunk_idx in 0..num_chunks {
        let start_idx = chunk_idx * chunk_size;
        let end_idx = std::cmp::min(start_idx + chunk_size, n_samples);
        let chunk_samples = end_idx - start_idx;

        // Generate the chunk, deriving a distinct seed per chunk so that
        // chunks differ even when no explicit seed was provided
        let (chunk_data, chunk_targets) = generate_regression_chunk_gpu(
            &gpu_context,
            chunk_samples,
            n_features,
            n_informative,
            &coefficients,
            noise,
            Some(seed.wrapping_add(chunk_idx as u64)),
        )?;

        all_data.extend(chunk_data);
        all_targets.extend(chunk_targets);
    }

    // Convert to ndarray
    let data = Array2::from_shape_vec((n_samples, n_features), all_data)
        .map_err(|e| DatasetsError::Other(format!("Failed to create data array: {e}")))?;

    let target = Array1::from_vec(all_targets);

    // Create dataset
    let mut dataset = Dataset::new(data, Some(target));

    // Add metadata
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_description(format!(
            "GPU-accelerated synthetic regression dataset with {n_features} features"
        ));

    Ok(dataset)
}

/// Generate a chunk of regression data on GPU
#[allow(dead_code)]
fn generate_regression_chunk_gpu(
    gpu_context: &GpuContext,
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    coefficients: &[f64],
    noise: f64,
    randomseed: Option<u64>,
) -> Result<(Vec<f64>, Vec<f64>)> {
    let seed = randomseed.unwrap_or(42);
    let mut rng = StdRng::seed_from_u64(seed);

    // Generate the random feature matrix (standard normal entries)
    let mut data = vec![0.0; n_samples * n_features];
    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();

    for i in 0..n_samples {
        for j in 0..n_features {
            data[i * n_features + j] = normal.sample(&mut rng);
        }
    }

    // Use GPU buffer operations for the target computation when a non-CPU
    // backend is available
    if *gpu_context.backend() != LocalGpuBackend::Cpu {
        return generate_regression_gpu_optimized(
            gpu_context,
            &data,
            coefficients,
            n_samples,
            n_features,
            n_informative,
            noise,
            &mut rng,
        );
    }

    // CPU fallback: matrix multiplication using nested loops
    let mut targets = vec![0.0; n_samples];
    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();

    for i in 0..n_samples {
        let mut target_val = 0.0;
        for j in 0..n_informative {
            target_val += data[i * n_features + j] * coefficients[j];
        }

        // Add noise
        target_val += noise_dist.sample(&mut rng);
        targets[i] = target_val;
    }

    Ok((data, targets))
}

/// GPU-optimized regression data generation using buffer operations and matrix multiplication
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
fn generate_regression_gpu_optimized(
    _gpu_context: &GpuContext,
    data: &[f64],
    coefficients: &[f64],
    n_samples: usize,
    n_features: usize,
    n_informative: usize,
    noise: f64,
    rng: &mut StdRng,
) -> Result<(Vec<f64>, Vec<f64>)> {
    // For now, use a CPU-based implementation since the core GPU features
    // are not yet available.
    // TODO: Implement proper GPU acceleration when the core GPU features
    // are stabilized.
    use rand_distr::Distribution;
    let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();

    let mut targets = vec![0.0; n_samples];

    // Matrix multiplication for regression targets
    for i in 0..n_samples {
        let mut target = 0.0;
        for j in 0..n_informative {
            target += data[i * n_features + j] * coefficients[j];
        }

        // Add noise
        if noise > 0.0 {
            target += noise * normal.sample(rng);
        }

        targets[i] = target;
    }

    let data_vec = data.to_vec();

    Ok((data_vec, targets))
}

/// GPU-accelerated blob generation
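///
/// # Examples
///
/// A hedged sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::{make_blobs_gpu, GpuConfig};
///
/// // Three Gaussian blobs in 2D; the GPU path is used only when available.
/// let dataset = make_blobs_gpu(600, 2, 3, 0.75, Some(7), GpuConfig::default()).unwrap();
/// assert_eq!(dataset.n_features(), 2);
/// ```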
#[allow(dead_code)]
pub fn make_blobs_gpu(
    n_samples: usize,
    n_features: usize,
    n_centers: usize,
    cluster_std: f64,
    randomseed: Option<u64>,
    gpuconfig: GpuConfig,
) -> Result<Dataset> {
    // Check if GPU is available and requested
    if gpuconfig.use_gpu && gpu_is_available() {
        make_blobs_gpu_impl(
            n_samples,
            n_features,
            n_centers,
            cluster_std,
            randomseed,
            gpuconfig,
        )
    } else {
        // Fallback to CPU implementation
        make_blobs(n_samples, n_features, n_centers, cluster_std, randomseed)
    }
}

/// Internal GPU implementation for blob generation
#[allow(dead_code)]
fn make_blobs_gpu_impl(
    n_samples: usize,
    n_features: usize,
    n_centers: usize,
    cluster_std: f64,
    randomseed: Option<u64>,
    gpuconfig: GpuConfig,
) -> Result<Dataset> {
    // Input validation
    if n_samples == 0 || n_features == 0 || n_centers == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples, n_features, and n_centers must be > 0".to_string(),
        ));
    }

    if cluster_std <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "cluster_std must be > 0".to_string(),
        ));
    }

    // Create GPU context
    let gpu_context = GpuContext::new(crate::gpu::GpuConfig {
        backend: crate::gpu::GpuBackend::Cuda {
            device_id: gpuconfig.device_id as u32,
        },
        memory: crate::gpu::GpuMemoryConfig::default(),
        threads_per_block: 256,
        enable_double_precision: !gpuconfig.use_single_precision,
        use_fast_math: false,
        random_seed: None,
    })
    .map_err(|e| DatasetsError::Other(format!("Failed to create GPU context: {e}")))?;

    let seed = randomseed.unwrap_or(42);
    let mut rng = StdRng::seed_from_u64(seed);

    // Generate cluster centers
    let mut centers = Array2::zeros((n_centers, n_features));
    let center_dist = rand_distr::Normal::new(0.0, 10.0).unwrap();

    for i in 0..n_centers {
        for j in 0..n_features {
            centers[[i, j]] = center_dist.sample(&mut rng);
        }
    }

    // Generate samples around the centers
    let samples_per_center = n_samples / n_centers;
    let remainder = n_samples % n_centers;

    let mut data = Array2::zeros((n_samples, n_features));
    let mut target = Array1::zeros(n_samples);

    let mut sample_idx = 0;
    let noise_dist = rand_distr::Normal::new(0.0, cluster_std).unwrap();

    for center_idx in 0..n_centers {
        let n_samples_center = if center_idx < remainder {
            samples_per_center + 1
        } else {
            samples_per_center
        };

        // Generate samples for this center using GPU acceleration
        if *gpu_context.backend() != LocalGpuBackend::Cpu {
            // Use GPU kernel for parallel sample generation
            let gpu_generated = generate_blobs_center_gpu(
                &gpu_context,
                &centers,
                center_idx,
                n_samples_center,
                n_features,
                cluster_std,
                &mut rng,
            )?;

            // Copy GPU-generated data to the main arrays
            for (local_idx, sample) in gpu_generated.iter().enumerate() {
                for j in 0..n_features {
                    data[[sample_idx + local_idx, j]] = sample[j];
                }
                target[sample_idx + local_idx] = center_idx as f64;
            }
            sample_idx += n_samples_center;
        } else {
            // CPU fallback: generate sequentially
            for _ in 0..n_samples_center {
                for j in 0..n_features {
                    data[[sample_idx, j]] = centers[[center_idx, j]] + noise_dist.sample(&mut rng);
                }
                target[sample_idx] = center_idx as f64;
                sample_idx += 1;
            }
        }
    }

    // Create dataset
    let mut dataset = Dataset::new(data, Some(target));

    // Add metadata
    let featurenames: Vec<String> = (0..n_features).map(|i| format!("feature_{i}")).collect();
    let centernames: Vec<String> = (0..n_centers).map(|i| format!("center_{i}")).collect();

    dataset = dataset
        .with_featurenames(featurenames)
        .with_targetnames(centernames)
        .with_description(format!(
            "GPU-accelerated synthetic blob dataset with {n_centers} centers and {n_features} features"
        ));

    Ok(dataset)
}

/// GPU-optimized blob center generation using parallel kernels
#[allow(dead_code)]
fn generate_blobs_center_gpu(
    _gpu_context: &GpuContext,
    centers: &Array2<f64>,
    center_idx: usize,
    n_samples_center: usize,
    n_features: usize,
    cluster_std: f64,
    rng: &mut StdRng,
) -> Result<Vec<Vec<f64>>> {
    // For now, use a CPU-based implementation since the core GPU features
    // are not yet available.
    // TODO: Implement proper GPU acceleration when the core GPU features
    // are stabilized.

    // Extract the coordinates of this specific center (kept for the GPU
    // kernel sketched in the comment below)
    let _center_coords: Vec<f64> = (0..n_features).map(|j| centers[[center_idx, j]]).collect();

    // CPU fallback implementation
    use rand_distr::Distribution;
    let normal = rand_distr::Normal::new(0.0, cluster_std).unwrap();

    let mut result = Vec::with_capacity(n_samples_center);

    for _ in 0..n_samples_center {
        let mut sample = Vec::with_capacity(n_features);
        for j in 0..n_features {
            let center_val = centers[[center_idx, j]];
            let noise = normal.sample(rng);
            sample.push(center_val + noise);
        }
        result.push(sample);
    }

    Ok(result)

    // TODO: GPU implementation placeholder - using the CPU fallback above
    /*
    let _center_buffer = (); // gpu_context.create_buffer_from_slice(&center_coords);
    let _data_buffer = (); // gpu_context.create_buffer::<f64>(n_samples_center * n_features);

    // Generate random seeds for each sample
    let seeds: Vec<u64> = (0..n_samples_center).map(|_| rng.random::<u64>()).collect();
    let seeds_buffer = gpu_context.create_buffer_from_slice(&seeds);

    // Use GPU kernel for parallel sample generation around the center
    gpu_context
        .execute(|compiler| {
            let kernel_source = r#"
            #version 450
            layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

            layout(set = 0, binding = 0) buffer CenterBuffer {
                double center_coords[];
            };

            layout(set = 0, binding = 1) buffer DataBuffer {
                double data[];
            };

            layout(set = 0, binding = 2) buffer SeedsBuffer {
                uint64_t seeds[];
            };

            layout(push_constant) uniform Params {
                uint n_samples_center;
                uint n_features;
                double cluster_std;
            } params;

            // High-quality pseudo-random number generator
            uint wang_hash(uint seed) {
                seed = (seed ^ 61u) ^ (seed >> 16u);
                seed *= 9u;
                seed = seed ^ (seed >> 4u);
                seed *= 0x27d4eb2du;
                seed = seed ^ (seed >> 15u);
                return seed;
            }

            // Generate Gaussian random numbers using the Box-Muller transform
            double random_normal(uint seed, uint index) {
                uint h1 = wang_hash(seed + index * 2u);
                uint h2 = wang_hash(seed + index * 2u + 1u);
                double u1 = double(h1) / double(0xFFFFFFFFu);
                double u2 = double(h2) / double(0xFFFFFFFFu);

                // Ensure u1 is not zero to avoid log(0)
                u1 = max(u1, 1e-8);

                // Box-Muller transform
                return sqrt(-2.0 * log(u1)) * cos(6.28318530718 * u2);
            }

            void main() {
                uint sample_idx = gl_GlobalInvocationID.x;
                if (sample_idx >= params.n_samples_center) return;

                uint seed = uint(seeds[sample_idx]);

                // Generate all features for this sample
                for (uint j = 0; j < params.n_features; j++) {
                    // Get the center coordinate for this feature
                    double center_val = center_coords[j];

                    // Generate Gaussian noise
                    double noise = random_normal(seed, j) * params.cluster_std;

                    // Set the data point
                    data[sample_idx * params.n_features + j] = center_val + noise;
                }
            }
        "#;

            let kernel = compiler.compile(kernel_source)?;

            // Set kernel parameters
            kernel.set_buffer("center_coords", &center_buffer);
            kernel.set_buffer("data", &data_buffer);
            kernel.set_buffer("seeds", &seeds_buffer);
            kernel.set_u32("n_samples_center", n_samples_center as u32);
            kernel.set_u32("n_features", n_features as u32);
            kernel.set_f64("cluster_std", cluster_std);

            // Dispatch the kernel with an appropriate work group size
            let work_groups = [(n_samples_center + 255) / 256, 1, 1];
            kernel.dispatch(work_groups);

            Ok(())
        })
        .map_err(|e| {
            DatasetsError::Other(format!(
                "GPU blob generation kernel execution failed: {}",
                e
            ))
        })?;

    // Copy results back to the CPU and restructure as Vec<Vec<f64>>
    let flat_data = data_buffer.to_vec();
    let mut result = Vec::with_capacity(n_samples_center);

    for i in 0..n_samples_center {
        let mut sample = Vec::with_capacity(n_features);
        for j in 0..n_features {
            sample.push(flat_data[i * n_features + j]);
        }
        result.push(sample);
    }

    */
}

/// Check if GPU is available for acceleration
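///
/// # Examples
///
/// Sketch of a typical availability probe (module path assumed, and it is
/// assumed here that `GpuDeviceInfo` implements `Debug`):
///
/// ```ignore
/// use scirs2_datasets::generators::{get_gpu_info, gpu_is_available};
///
/// if gpu_is_available() {
///     // Inspect the detected devices before launching large jobs.
///     for device in get_gpu_info().unwrap() {
///         println!("{device:?}");
///     }
/// }
/// ```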
#[allow(dead_code)]
pub fn gpu_is_available() -> bool {
    // Try to create a GPU context to check availability
    GpuContext::new(crate::gpu::GpuConfig::default()).is_ok()
}

/// Get GPU device information
#[allow(dead_code)]
pub fn get_gpu_info() -> Result<Vec<GpuDeviceInfo>> {
    crate::gpu::list_gpu_devices()
        .map_err(|e| DatasetsError::Other(format!("Failed to get GPU info: {e}")))
}

/// Benchmark GPU vs CPU performance for data generation
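///
/// # Examples
///
/// Sketch (module path assumed); the returned times are per-iteration
/// averages in seconds:
///
/// ```ignore
/// use scirs2_datasets::generators::benchmark_gpu_vs_cpu;
///
/// let (cpu_secs, gpu_secs) = benchmark_gpu_vs_cpu(10_000, 20, 3).unwrap();
/// println!("CPU: {cpu_secs:.4}s, GPU: {gpu_secs:.4}s per iteration");
/// ```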
#[allow(dead_code)]
pub fn benchmark_gpu_vs_cpu(
    n_samples: usize,
    n_features: usize,
    iterations: usize,
) -> Result<(f64, f64)> {
    use std::time::Instant;

    // Benchmark the CPU implementation
    let cpu_start = Instant::now();
    for _ in 0..iterations {
        let _result = make_classification(n_samples, n_features, 3, 2, n_features, Some(42))?;
    }
    let cpu_time = cpu_start.elapsed().as_secs_f64() / iterations as f64;

    // Benchmark the GPU implementation
    let gpuconfig = GpuConfig::default();
    let gpu_start = Instant::now();
    for _ in 0..iterations {
        let _result = make_classification_gpu(
            n_samples,
            n_features,
            3,
            2,
            n_features,
            Some(42),
            gpuconfig.clone(),
        )?;
    }
    let gpu_time = gpu_start.elapsed().as_secs_f64() / iterations as f64;

    Ok((cpu_time, gpu_time))
}

// Advanced manifold learning datasets

/// Generate a dataset with an S-curve manifold embedded in 3D space
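///
/// # Examples
///
/// Sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_s_curve;
///
/// // 500 points along the curve; the target stores the curve parameter t.
/// let dataset = make_s_curve(500, 0.05, Some(0)).unwrap();
/// assert_eq!(dataset.n_features(), 3);
/// ```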
#[allow(dead_code)]
pub fn make_s_curve(n_samples: usize, noise: f64, randomseed: Option<u64>) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut color = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();

    for i in 0..n_samples {
        // Parameter t ranges from 0 to 4π
        let t = 4.0 * PI * (i as f64) / (n_samples as f64 - 1.0);

        // S-curve parametric equations
        data[[i, 0]] = t.sin() + noise_dist.sample(&mut rng);
        data[[i, 1]] = 2.0 * t + noise_dist.sample(&mut rng);
        data[[i, 2]] = (t / 2.0).sin() + noise_dist.sample(&mut rng);

        // Color represents the position along the curve
        color[i] = t;
    }

    let mut dataset = Dataset::new(data, Some(color));
    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_description("S-curve manifold embedded in 3D space".to_string());

    Ok(dataset)
}

/// Generate a dataset sampling from a Swiss roll manifold
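///
/// # Examples
///
/// Sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_swiss_roll_advanced;
///
/// // A Swiss roll with a rectangular hole cut out of the sheet.
/// let dataset = make_swiss_roll_advanced(1_000, 0.05, true, Some(1)).unwrap();
/// assert_eq!(dataset.n_features(), 3);
/// ```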
#[allow(dead_code)]
pub fn make_swiss_roll_advanced(
    n_samples: usize,
    noise: f64,
    hole: bool,
    randomseed: Option<u64>,
) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut color = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
    let uniform = rand_distr::Uniform::new(0.0, 1.0).unwrap();

    for i in 0..n_samples {
        // Sample parameters
        let mut t = uniform.sample(&mut rng) * 3.0 * PI / 2.0;
        let mut y = uniform.sample(&mut rng) * 20.0;

        // Create a hole if requested
        if hole {
            // Create a hole by rejecting samples in the middle region
            while t > PI / 2.0 && t < PI && y > 8.0 && y < 12.0 {
                t = uniform.sample(&mut rng) * 3.0 * PI / 2.0;
                y = uniform.sample(&mut rng) * 20.0;
            }
        }

        // Swiss roll parametric equations
        data[[i, 0]] = t * t.cos() + noise_dist.sample(&mut rng);
        data[[i, 1]] = y + noise_dist.sample(&mut rng);
        data[[i, 2]] = t * t.sin() + noise_dist.sample(&mut rng);

        // Color represents position along the roll
        color[i] = t;
    }

    let mut dataset = Dataset::new(data, Some(color));
    let description = if hole {
        "Swiss roll manifold with hole embedded in 3D space"
    } else {
        "Swiss roll manifold embedded in 3D space"
    };

    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_description(description.to_string());

    Ok(dataset)
}

/// Generate a dataset from a severed sphere (broken manifold)
#[allow(dead_code)]
pub fn make_severed_sphere(
    n_samples: usize,
    noise: f64,
    randomseed: Option<u64>,
) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut color = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
    let uniform = rand_distr::Uniform::new(0.0, 1.0).unwrap();

    for i in 0..n_samples {
        // Sample spherical coordinates, but exclude a region to "sever" the sphere
        let mut phi = uniform.sample(&mut rng) * 2.0 * PI; // azimuthal angle
        let mut theta = uniform.sample(&mut rng) * PI; // polar angle

        // Create the severed region by rejecting certain angle combinations
        while phi > PI / 3.0 && phi < 2.0 * PI / 3.0 && theta > PI / 3.0 && theta < 2.0 * PI / 3.0 {
            phi = uniform.sample(&mut rng) * 2.0 * PI;
            theta = uniform.sample(&mut rng) * PI;
        }

        let radius = 1.0; // Unit sphere

        // Convert to Cartesian coordinates
        data[[i, 0]] = radius * theta.sin() * phi.cos() + noise_dist.sample(&mut rng);
        data[[i, 1]] = radius * theta.sin() * phi.sin() + noise_dist.sample(&mut rng);
        data[[i, 2]] = radius * theta.cos() + noise_dist.sample(&mut rng);

        // Color based on the azimuthal angle
        color[i] = phi;
    }

    let mut dataset = Dataset::new(data, Some(color));
    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_description("Severed sphere manifold with discontinuities".to_string());

    Ok(dataset)
}

/// Generate a dataset from a twin peaks manifold (two connected peaks)
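///
/// # Examples
///
/// Sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_twin_peaks;
///
/// // Labels are 0.0 or 1.0 depending on which peak is nearer.
/// let dataset = make_twin_peaks(400, 0.02, Some(3)).unwrap();
/// assert!(dataset.target.is_some());
/// ```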
#[allow(dead_code)]
pub fn make_twin_peaks(n_samples: usize, noise: f64, randomseed: Option<u64>) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut labels = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
    let uniform = rand_distr::Uniform::new(-2.0, 2.0).unwrap();

    for i in 0..n_samples {
        let x = uniform.sample(&mut rng);
        let y = uniform.sample(&mut rng);

        // Twin peaks function: two Gaussian peaks centered at (1, 1) and (-1, -1)
        let peak1 = (-((x - 1.0).powi(2) + (y - 1.0).powi(2))).exp();
        let peak2 = (-((x + 1.0).powi(2) + (y + 1.0).powi(2))).exp();
        let z = peak1 + peak2 + noise_dist.sample(&mut rng);

        data[[i, 0]] = x;
        data[[i, 1]] = y;
        data[[i, 2]] = z;

        // Label based on which peak is closer
        labels[i] = if (x - 1.0).powi(2) + (y - 1.0).powi(2)
            < (x + 1.0).powi(2) + (y + 1.0).powi(2)
        {
            0.0
        } else {
            1.0
        };
    }

    let mut dataset = Dataset::new(data, Some(labels));
    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_targetnames(vec!["peak_0".to_string(), "peak_1".to_string()])
        .with_description("Twin peaks manifold with two connected Gaussian peaks".to_string());

    Ok(dataset)
}

/// Generate a dataset from a helix manifold in 3D space
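///
/// # Examples
///
/// Sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::make_helix;
///
/// // A helix with 3 full turns and normalized height in the Z column.
/// let dataset = make_helix(300, 3.0, 0.01, Some(5)).unwrap();
/// assert_eq!(dataset.n_features(), 3);
/// ```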
#[allow(dead_code)]
pub fn make_helix(
    n_samples: usize,
    n_turns: f64,
    noise: f64,
    randomseed: Option<u64>,
) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if n_turns <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "n_turns must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut color = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();

    for i in 0..n_samples {
        // Parameter t ranges from 0 to n_turns * 2π
        let t = n_turns * 2.0 * PI * (i as f64) / (n_samples as f64 - 1.0);

        // Helix parametric equations
        data[[i, 0]] = t.cos() + noise_dist.sample(&mut rng);
        data[[i, 1]] = t.sin() + noise_dist.sample(&mut rng);
        data[[i, 2]] = t / (n_turns * 2.0 * PI) + noise_dist.sample(&mut rng); // Normalized height

        // Color represents position along the helix
        color[i] = t;
    }

    let mut dataset = Dataset::new(data, Some(color));
    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_description(format!("Helix manifold with {n_turns} turns in 3D space"));

    Ok(dataset)
}

/// Generate a dataset from intersecting manifolds (two intersecting planes)
#[allow(dead_code)]
pub fn make_intersecting_manifolds(
    n_samples: usize,
    noise: f64,
    randomseed: Option<u64>,
) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let samples_per_manifold = n_samples / 2;
    let remainder = n_samples % 2;

    let mut data = Array2::zeros((n_samples, 3));
    let mut labels = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
    let uniform = rand_distr::Uniform::new(-2.0, 2.0).unwrap();

    let mut sample_idx = 0;

    // First manifold: plane z = x
    for _ in 0..samples_per_manifold + remainder {
        let x = uniform.sample(&mut rng);
        let y = uniform.sample(&mut rng);
        let z = x + noise_dist.sample(&mut rng);

        data[[sample_idx, 0]] = x;
        data[[sample_idx, 1]] = y;
        data[[sample_idx, 2]] = z;
        labels[sample_idx] = 0.0;
        sample_idx += 1;
    }

    // Second manifold: plane z = -x
    for _ in 0..samples_per_manifold {
        let x = uniform.sample(&mut rng);
        let y = uniform.sample(&mut rng);
        let z = -x + noise_dist.sample(&mut rng);

        data[[sample_idx, 0]] = x;
        data[[sample_idx, 1]] = y;
        data[[sample_idx, 2]] = z;
        labels[sample_idx] = 1.0;
        sample_idx += 1;
    }

    let mut dataset = Dataset::new(data, Some(labels));
    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_targetnames(vec!["manifold_0".to_string(), "manifold_1".to_string()])
        .with_description("Two intersecting plane manifolds in 3D space".to_string());

    Ok(dataset)
}

/// Generate a dataset from a torus manifold in 3D space
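///
/// # Examples
///
/// Sketch (module path assumed); note that `minor_radius` must be strictly
/// smaller than `major_radius`:
///
/// ```ignore
/// use scirs2_datasets::generators::make_torus;
///
/// let dataset = make_torus(800, 3.0, 1.0, 0.05, Some(9)).unwrap();
/// assert_eq!(dataset.n_samples(), 800);
/// ```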
#[allow(dead_code)]
pub fn make_torus(
    n_samples: usize,
    major_radius: f64,
    minor_radius: f64,
    noise: f64,
    randomseed: Option<u64>,
) -> Result<Dataset> {
    if n_samples == 0 {
        return Err(DatasetsError::InvalidFormat(
            "n_samples must be > 0".to_string(),
        ));
    }

    if major_radius <= 0.0 || minor_radius <= 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "major_radius and minor_radius must be > 0".to_string(),
        ));
    }

    if minor_radius >= major_radius {
        return Err(DatasetsError::InvalidFormat(
            "minor_radius must be < major_radius".to_string(),
        ));
    }

    if noise < 0.0 {
        return Err(DatasetsError::InvalidFormat(
            "noise must be >= 0".to_string(),
        ));
    }

    let mut rng = match randomseed {
        Some(seed) => StdRng::seed_from_u64(seed),
        None => {
            let mut r = rng();
            StdRng::seed_from_u64(r.next_u64())
        }
    };

    let mut data = Array2::zeros((n_samples, 3));
    let mut color = Array1::zeros(n_samples);

    let noise_dist = rand_distr::Normal::new(0.0, noise).unwrap();
    let uniform = rand_distr::Uniform::new(0.0, 2.0 * PI).unwrap();

    for i in 0..n_samples {
        let theta = uniform.sample(&mut rng); // Major angle
        let phi = uniform.sample(&mut rng); // Minor angle

        // Torus parametric equations
        data[[i, 0]] =
            (major_radius + minor_radius * phi.cos()) * theta.cos() + noise_dist.sample(&mut rng);
        data[[i, 1]] =
            (major_radius + minor_radius * phi.cos()) * theta.sin() + noise_dist.sample(&mut rng);
        data[[i, 2]] = minor_radius * phi.sin() + noise_dist.sample(&mut rng);

        // Color based on the major angle
        color[i] = theta;
    }

    let mut dataset = Dataset::new(data, Some(color));
    dataset = dataset
        .with_featurenames(vec!["X".to_string(), "Y".to_string(), "Z".to_string()])
        .with_description(format!(
            "Torus manifold with major radius {major_radius} and minor radius {minor_radius}"
        ));

    Ok(dataset)
}

/// Advanced manifold configuration for complex datasets
#[derive(Debug, Clone)]
pub struct ManifoldConfig {
    /// Type of manifold to generate
    pub manifold_type: ManifoldType,
    /// Number of samples
    pub n_samples: usize,
    /// Noise level
    pub noise: f64,
    /// Random seed
    pub randomseed: Option<u64>,
    /// Manifold-specific parameters
    pub parameters: std::collections::HashMap<String, f64>,
}

/// Types of manifolds that can be generated
#[derive(Debug, Clone)]
pub enum ManifoldType {
    /// S-curve manifold
    SCurve,
    /// Swiss roll (with optional hole)
    SwissRoll {
        /// Whether to create a hole in the middle
        hole: bool,
    },
    /// Severed sphere
    SeveredSphere,
    /// Twin peaks
    TwinPeaks,
    /// Helix with specified turns
    Helix {
        /// Number of turns in the helix
        n_turns: f64,
    },
    /// Intersecting manifolds
    IntersectingManifolds,
    /// Torus with major and minor radii
    Torus {
        /// Major radius of the torus
        major_radius: f64,
        /// Minor radius of the torus
        minor_radius: f64,
    },
}

impl Default for ManifoldConfig {
    fn default() -> Self {
        Self {
            manifold_type: ManifoldType::SCurve,
            n_samples: 1000,
            noise: 0.1,
            randomseed: None,
            parameters: std::collections::HashMap::new(),
        }
    }
}

impl ManifoldConfig {
    /// Create a new manifold configuration
    pub fn new(manifold_type: ManifoldType) -> Self {
        Self {
            manifold_type,
            ..Default::default()
        }
    }

    /// Set the number of samples
    pub fn with_samples(mut self, n_samples: usize) -> Self {
        self.n_samples = n_samples;
        self
    }

    /// Set the noise level
    pub fn with_noise(mut self, noise: f64) -> Self {
        self.noise = noise;
        self
    }

    /// Set the random seed
    pub fn with_seed(mut self, seed: u64) -> Self {
        self.randomseed = Some(seed);
        self
    }

    /// Add a manifold-specific parameter
    pub fn with_parameter(mut self, name: String, value: f64) -> Self {
        self.parameters.insert(name, value);
        self
    }
}

/// Generate a manifold dataset based on configuration
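///
/// # Examples
///
/// Sketch of the builder-style configuration (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::generators::{make_manifold, ManifoldConfig, ManifoldType};
///
/// let config = ManifoldConfig::new(ManifoldType::Torus {
///     major_radius: 3.0,
///     minor_radius: 1.0,
/// })
/// .with_samples(2_000)
/// .with_noise(0.02)
/// .with_seed(42);
///
/// let dataset = make_manifold(config).unwrap();
/// assert_eq!(dataset.n_samples(), 2_000);
/// ```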
#[allow(dead_code)]
pub fn make_manifold(config: ManifoldConfig) -> Result<Dataset> {
    match config.manifold_type {
        ManifoldType::SCurve => make_s_curve(config.n_samples, config.noise, config.randomseed),
        ManifoldType::SwissRoll { hole } => {
            make_swiss_roll_advanced(config.n_samples, config.noise, hole, config.randomseed)
        }
        ManifoldType::SeveredSphere => {
            make_severed_sphere(config.n_samples, config.noise, config.randomseed)
        }
        ManifoldType::TwinPeaks => {
            make_twin_peaks(config.n_samples, config.noise, config.randomseed)
        }
        ManifoldType::Helix { n_turns } => {
            make_helix(config.n_samples, n_turns, config.noise, config.randomseed)
        }
        ManifoldType::IntersectingManifolds => {
            make_intersecting_manifolds(config.n_samples, config.noise, config.randomseed)
        }
        ManifoldType::Torus {
            major_radius,
            minor_radius,
        } => make_torus(
            config.n_samples,
            major_radius,
            minor_radius,
            config.noise,
            config.randomseed,
        ),
    }
}
2973
2974#[cfg(test)]
2975mod tests {
2976    use super::*;
2977    use rand_distr::Uniform;
2978
2979    #[test]
2980    fn test_make_classification_invalid_params() {
2981        // Test zero n_samples
2982        assert!(make_classification(0, 5, 2, 1, 3, None).is_err());
2983
2984        // Test zero n_features
2985        assert!(make_classification(10, 0, 2, 1, 3, None).is_err());
2986
2987        // Test zero n_informative
2988        assert!(make_classification(10, 5, 2, 1, 0, None).is_err());
2989
2990        // Test n_features < n_informative
2991        assert!(make_classification(10, 3, 2, 1, 5, None).is_err());
2992
2993        // Test n_classes < 2
2994        assert!(make_classification(10, 5, 1, 1, 3, None).is_err());
2995
2996        // Test zero n_clusters_per_class
2997        assert!(make_classification(10, 5, 2, 0, 3, None).is_err());
2998    }
2999
3000    #[test]
3001    fn test_make_regression_invalid_params() {
3002        // Test zero n_samples
3003        assert!(make_regression(0, 5, 3, 1.0, None).is_err());
3004
3005        // Test zero n_features
3006        assert!(make_regression(10, 0, 3, 1.0, None).is_err());
3007
3008        // Test zero n_informative
3009        assert!(make_regression(10, 5, 0, 1.0, None).is_err());
3010
3011        // Test n_features < n_informative
3012        assert!(make_regression(10, 3, 5, 1.0, None).is_err());
3013
3014        // Test negative noise
3015        assert!(make_regression(10, 5, 3, -1.0, None).is_err());
3016    }
3017
3018    #[test]
3019    fn test_make_time_series_invalid_params() {
3020        // Test zero n_samples
3021        assert!(make_time_series(0, 3, false, false, 1.0, None).is_err());
3022
3023        // Test zero n_features
3024        assert!(make_time_series(10, 0, false, false, 1.0, None).is_err());
3025
3026        // Test negative noise
3027        assert!(make_time_series(10, 3, false, false, -1.0, None).is_err());
3028    }
3029
3030    #[test]
3031    fn test_make_blobs_invalid_params() {
3032        // Test zero n_samples
3033        assert!(make_blobs(0, 3, 2, 1.0, None).is_err());
3034
3035        // Test zero n_features
3036        assert!(make_blobs(10, 0, 2, 1.0, None).is_err());
3037
3038        // Test zero centers
3039        assert!(make_blobs(10, 3, 0, 1.0, None).is_err());
3040
3041        // Test zero or negative cluster_std
3042        assert!(make_blobs(10, 3, 2, 0.0, None).is_err());
3043        assert!(make_blobs(10, 3, 2, -1.0, None).is_err());
3044    }
3045
3046    #[test]
3047    fn test_make_classification_valid_params() {
3048        let dataset = make_classification(20, 5, 3, 2, 4, Some(42)).unwrap();
3049        assert_eq!(dataset.n_samples(), 20);
3050        assert_eq!(dataset.n_features(), 5);
3051        assert!(dataset.target.is_some());
3052        assert!(dataset.featurenames.is_some());
3053        assert!(dataset.targetnames.is_some());
3054    }
3055
3056    #[test]
3057    fn test_make_regression_valid_params() {
3058        let dataset = make_regression(15, 4, 3, 0.5, Some(42)).unwrap();
3059        assert_eq!(dataset.n_samples(), 15);
3060        assert_eq!(dataset.n_features(), 4);
3061        assert!(dataset.target.is_some());
3062        assert!(dataset.featurenames.is_some());
3063    }
3064
3065    #[test]
3066    fn test_make_time_series_valid_params() {
3067        let dataset = make_time_series(25, 3, true, true, 0.1, Some(42)).unwrap();
3068        assert_eq!(dataset.n_samples(), 25);
3069        assert_eq!(dataset.n_features(), 3);
3070        assert!(dataset.featurenames.is_some());
3071        // Time series doesn't have targets by default
3072        assert!(dataset.target.is_none());
3073    }
3074
3075    #[test]
3076    fn test_make_blobs_valid_params() {
3077        let dataset = make_blobs(30, 4, 3, 1.5, Some(42)).unwrap();
3078        assert_eq!(dataset.n_samples(), 30);
3079        assert_eq!(dataset.n_features(), 4);
3080        assert!(dataset.target.is_some());
3081        assert!(dataset.featurenames.is_some());
3082    }
3083
3084    #[test]
3085    fn test_make_spirals_invalid_params() {
3086        // Test zero n_samples
3087        assert!(make_spirals(0, 2, 0.1, None).is_err());
3088
3089        // Test zero n_spirals
3090        assert!(make_spirals(100, 0, 0.1, None).is_err());
3091
3092        // Test negative noise
3093        assert!(make_spirals(100, 2, -0.1, None).is_err());
3094    }
3095
3096    #[test]
3097    fn test_make_spirals_valid_params() {
3098        let dataset = make_spirals(100, 2, 0.1, Some(42)).unwrap();
3099        assert_eq!(dataset.n_samples(), 100);
3100        assert_eq!(dataset.n_features(), 2);
3101        assert!(dataset.target.is_some());
3102        assert!(dataset.featurenames.is_some());
3103
3104        // Check that we have the right number of spirals
3105        if let Some(target) = &dataset.target {
3106            let unique_labels: std::collections::HashSet<_> =
3107                target.iter().map(|&x| x as i32).collect();
3108            assert_eq!(unique_labels.len(), 2);
3109        }
3110    }
3111
3112    #[test]
3113    fn test_make_moons_invalid_params() {
3114        // Test zero n_samples
3115        assert!(make_moons(0, 0.1, None).is_err());
3116
3117        // Test negative noise
3118        assert!(make_moons(100, -0.1, None).is_err());
3119    }
3120
3121    #[test]
3122    fn test_make_moons_valid_params() {
3123        let dataset = make_moons(100, 0.1, Some(42)).unwrap();
3124        assert_eq!(dataset.n_samples(), 100);
3125        assert_eq!(dataset.n_features(), 2);
3126        assert!(dataset.target.is_some());
3127        assert!(dataset.featurenames.is_some());
3128
3129        // Check that we have exactly 2 classes (2 moons)
3130        if let Some(target) = &dataset.target {
3131            let unique_labels: std::collections::HashSet<_> =
3132                target.iter().map(|&x| x as i32).collect();
3133            assert_eq!(unique_labels.len(), 2);
3134        }
3135    }
3136
3137    #[test]
3138    fn test_make_circles_invalid_params() {
3139        // Test zero n_samples
3140        assert!(make_circles(0, 0.5, 0.1, None).is_err());
3141
3142        // Test invalid factor (must be between 0 and 1)
3143        assert!(make_circles(100, 0.0, 0.1, None).is_err());
3144        assert!(make_circles(100, 1.0, 0.1, None).is_err());
3145        assert!(make_circles(100, 1.5, 0.1, None).is_err());
3146
3147        // Test negative noise
3148        assert!(make_circles(100, 0.5, -0.1, None).is_err());
3149    }
3150
3151    #[test]
3152    fn test_make_circles_valid_params() {
3153        let dataset = make_circles(100, 0.5, 0.1, Some(42)).unwrap();
3154        assert_eq!(dataset.n_samples(), 100);
3155        assert_eq!(dataset.n_features(), 2);
3156        assert!(dataset.target.is_some());
3157        assert!(dataset.featurenames.is_some());
3158
3159        // Check that we have exactly 2 classes (inner and outer circle)
3160        if let Some(target) = &dataset.target {
3161            let unique_labels: std::collections::HashSet<_> =
3162                target.iter().map(|&x| x as i32).collect();
3163            assert_eq!(unique_labels.len(), 2);
3164        }
3165    }
3166
3167    #[test]
3168    fn test_make_swiss_roll_invalid_params() {
3169        // Test zero n_samples
3170        assert!(make_swiss_roll(0, 0.1, None).is_err());
3171
3172        // Test negative noise
3173        assert!(make_swiss_roll(100, -0.1, None).is_err());
3174    }
3175
3176    #[test]
3177    fn test_make_swiss_roll_valid_params() {
3178        let dataset = make_swiss_roll(100, 0.1, Some(42)).unwrap();
3179        assert_eq!(dataset.n_samples(), 100);
3180        assert_eq!(dataset.n_features(), 3);
3181        assert!(dataset.target.is_some()); // Color parameter
3182        assert!(dataset.featurenames.is_some());
3183    }

    #[test]
    fn test_make_anisotropic_blobs_invalid_params() {
        // Test zero n_samples
        assert!(make_anisotropic_blobs(0, 3, 2, 1.0, 2.0, None).is_err());

        // Test insufficient features
        assert!(make_anisotropic_blobs(100, 1, 2, 1.0, 2.0, None).is_err());

        // Test zero centers
        assert!(make_anisotropic_blobs(100, 3, 0, 1.0, 2.0, None).is_err());

        // Test invalid std
        assert!(make_anisotropic_blobs(100, 3, 2, 0.0, 2.0, None).is_err());

        // Test invalid anisotropy factor
        assert!(make_anisotropic_blobs(100, 3, 2, 1.0, 0.0, None).is_err());
    }

    #[test]
    fn test_make_anisotropic_blobs_valid_params() {
        let dataset = make_anisotropic_blobs(100, 3, 2, 1.0, 3.0, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 3);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        // Check that we have the right number of clusters
        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2);
        }
    }

    #[test]
    fn test_make_hierarchical_clusters_invalid_params() {
        // Test zero n_samples
        assert!(make_hierarchical_clusters(0, 3, 2, 3, 1.0, 0.5, None).is_err());

        // Test zero features
        assert!(make_hierarchical_clusters(100, 0, 2, 3, 1.0, 0.5, None).is_err());

        // Test zero main clusters
        assert!(make_hierarchical_clusters(100, 3, 0, 3, 1.0, 0.5, None).is_err());

        // Test zero sub clusters
        assert!(make_hierarchical_clusters(100, 3, 2, 0, 1.0, 0.5, None).is_err());

        // Test invalid main cluster std
        assert!(make_hierarchical_clusters(100, 3, 2, 3, 0.0, 0.5, None).is_err());

        // Test invalid sub cluster std
        assert!(make_hierarchical_clusters(100, 3, 2, 3, 1.0, 0.0, None).is_err());
    }

    #[test]
    fn test_make_hierarchical_clusters_valid_params() {
        let dataset = make_hierarchical_clusters(120, 3, 2, 3, 2.0, 0.5, Some(42)).unwrap();
        assert_eq!(dataset.n_samples(), 120);
        assert_eq!(dataset.n_features(), 3);
        assert!(dataset.target.is_some());
        assert!(dataset.featurenames.is_some());

        // Check that we have the right number of main clusters
        if let Some(target) = &dataset.target {
            let unique_labels: std::collections::HashSet<_> =
                target.iter().map(|&x| x as i32).collect();
            assert_eq!(unique_labels.len(), 2); // 2 main clusters
        }

        // Check metadata contains sub-cluster information
        assert!(dataset.metadata.contains_key("sub_cluster_labels"));
    }

    #[test]
    fn test_inject_missing_data_invalid_params() {
        let mut data = Array2::from_shape_vec((5, 3), vec![1.0; 15]).unwrap();

        // Test invalid missing rate (must lie within [0, 1])
        assert!(inject_missing_data(&mut data, -0.1, MissingPattern::MCAR, None).is_err());
        assert!(inject_missing_data(&mut data, 1.5, MissingPattern::MCAR, None).is_err());
    }

    #[test]
    fn test_inject_missing_data_mcar() {
        let mut data =
            Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let original_data = data.clone();

        let missing_mask =
            inject_missing_data(&mut data, 0.3, MissingPattern::MCAR, Some(42)).unwrap();

        // Check that some data is missing
        let missing_count = missing_mask.iter().filter(|&&x| x).count();
        assert!(missing_count > 0);

        // Check that missing values are NaN and untouched cells are unchanged
        for ((i, j), &is_missing) in missing_mask.indexed_iter() {
            if is_missing {
                assert!(data[[i, j]].is_nan());
            } else {
                assert_eq!(data[[i, j]], original_data[[i, j]]);
            }
        }
    }
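
    // Shape sketch: the mask returned by `inject_missing_data` is indexed
    // with the same (row, col) pairs as the data in the test above, so it is
    // assumed to share the data's shape. This makes that assumption explicit.
    #[test]
    fn test_inject_missing_data_mask_shape() {
        let mut data =
            Array2::from_shape_vec((10, 3), (0..30).map(|x| x as f64).collect()).unwrap();
        let missing_mask =
            inject_missing_data(&mut data, 0.3, MissingPattern::MCAR, Some(42)).unwrap();
        assert_eq!(missing_mask.dim(), data.dim());
    }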

    #[test]
    fn test_inject_outliers_invalid_params() {
        let mut data = Array2::from_shape_vec((5, 3), vec![1.0; 15]).unwrap();

        // Test invalid outlier rate
        assert!(inject_outliers(&mut data, -0.1, OutlierType::Point, 2.0, None).is_err());
        assert!(inject_outliers(&mut data, 1.5, OutlierType::Point, 2.0, None).is_err());

        // Test invalid outlier strength (must be > 0)
        assert!(inject_outliers(&mut data, 0.1, OutlierType::Point, 0.0, None).is_err());
        assert!(inject_outliers(&mut data, 0.1, OutlierType::Point, -1.0, None).is_err());
    }

    #[test]
    fn test_inject_outliers_point() {
        let mut data = Array2::from_shape_vec((20, 2), vec![1.0; 40]).unwrap();

        let outlier_mask =
            inject_outliers(&mut data, 0.2, OutlierType::Point, 3.0, Some(42)).unwrap();

        // Check that some outliers were created
        let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
        assert!(outlier_count > 0);

        // Check that outlier rows deviate noticeably from the original values
        for (i, &is_outlier) in outlier_mask.iter().enumerate() {
            if is_outlier {
                // At least one feature should deviate from the base value 1.0 by more than 1.0
                let row = data.row(i);
                assert!(row.iter().any(|&x| (x - 1.0).abs() > 1.0));
            }
        }
    }
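
    // Convention sketch: `test_inject_outliers_point` indexes the mask by row,
    // so the mask is assumed to hold one entry per sample rather than one per
    // cell. This pins that row-wise convention down.
    #[test]
    fn test_inject_outliers_mask_is_per_row() {
        let mut data = Array2::from_shape_vec((20, 2), vec![1.0; 40]).unwrap();
        let outlier_mask =
            inject_outliers(&mut data, 0.2, OutlierType::Point, 3.0, Some(42)).unwrap();
        assert_eq!(outlier_mask.len(), data.nrows());
    }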

    #[test]
    fn test_add_time_series_noise() {
        let mut data = Array2::zeros((100, 2));

        let noise_types = [("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)];

        let original_data = data.clone();
        add_time_series_noise(&mut data, &noise_types, Some(42)).unwrap();

        // Check that data has been modified
        assert!(!data
            .iter()
            .zip(original_data.iter())
            .all(|(&a, &b)| (a - b).abs() < 1e-10));

        // Test invalid noise type
        let invalid_noise = [("invalid_type", 0.1)];
        let mut test_data = Array2::zeros((10, 2));
        assert!(add_time_series_noise(&mut test_data, &invalid_noise, Some(42)).is_err());
    }
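
    // Reproducibility sketch: like the dataset generators, `add_time_series_noise`
    // takes an optional seed, so the same seed is assumed to reproduce the same
    // noise. Verified here on two identical zero-valued inputs.
    #[test]
    fn test_add_time_series_noise_reproducible_with_seed() {
        let noise_types = [("gaussian", 0.1), ("spikes", 0.05), ("drift", 0.2)];
        let mut a = Array2::zeros((50, 2));
        let mut b = Array2::zeros((50, 2));
        add_time_series_noise(&mut a, &noise_types, Some(42)).unwrap();
        add_time_series_noise(&mut b, &noise_types, Some(42)).unwrap();
        assert_eq!(a, b);
    }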

    #[test]
    fn test_make_corrupted_dataset() {
        let base_dataset = make_blobs(50, 3, 2, 1.0, Some(42)).unwrap();

        let corrupted = make_corrupted_dataset(
            &base_dataset,
            0.1, // 10% missing
            MissingPattern::MCAR,
            0.05, // 5% outliers
            OutlierType::Point,
            2.0, // outlier strength
            Some(42),
        )
        .unwrap();

        // Check basic properties
        assert_eq!(corrupted.n_samples(), base_dataset.n_samples());
        assert_eq!(corrupted.n_features(), base_dataset.n_features());

        // Check metadata
        assert!(corrupted.metadata.contains_key("missing_rate"));
        assert!(corrupted.metadata.contains_key("outlier_rate"));
        assert!(corrupted.metadata.contains_key("missing_count"));
        assert!(corrupted.metadata.contains_key("outlier_count"));

        // Check some data is corrupted
        let has_nan = corrupted.data.iter().any(|&x| x.is_nan());
        assert!(has_nan, "Dataset should have some missing values");
    }

    #[test]
    fn test_make_corrupted_dataset_invalid_params() {
        let base_dataset = make_blobs(20, 2, 2, 1.0, Some(42)).unwrap();

        // Test invalid missing rate
        assert!(make_corrupted_dataset(
            &base_dataset,
            -0.1,
            MissingPattern::MCAR,
            0.0,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());
        assert!(make_corrupted_dataset(
            &base_dataset,
            1.5,
            MissingPattern::MCAR,
            0.0,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());

        // Test invalid outlier rate
        assert!(make_corrupted_dataset(
            &base_dataset,
            0.0,
            MissingPattern::MCAR,
            -0.1,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());
        assert!(make_corrupted_dataset(
            &base_dataset,
            0.0,
            MissingPattern::MCAR,
            1.5,
            OutlierType::Point,
            1.0,
            None
        )
        .is_err());
    }

    #[test]
    fn test_missing_patterns() {
        let data = Array2::from_shape_vec((20, 4), (0..80).map(|x| x as f64).collect()).unwrap();

        // Test different missing patterns
        for pattern in [
            MissingPattern::MCAR,
            MissingPattern::MAR,
            MissingPattern::MNAR,
            MissingPattern::Block,
        ] {
            let mut test_data = data.clone();
            let missing_mask = inject_missing_data(&mut test_data, 0.2, pattern, Some(42)).unwrap();

            let missing_count = missing_mask.iter().filter(|&&x| x).count();
            assert!(
                missing_count > 0,
                "Pattern {pattern:?} should create some missing values"
            );
        }
    }

    #[test]
    fn test_outlier_types() {
        let data = Array2::ones((30, 3));

        // Test different outlier types
        for outlier_type in [
            OutlierType::Point,
            OutlierType::Contextual,
            OutlierType::Collective,
        ] {
            let mut test_data = data.clone();
            let outlier_mask =
                inject_outliers(&mut test_data, 0.2, outlier_type, 3.0, Some(42)).unwrap();

            let outlier_count = outlier_mask.iter().filter(|&&x| x).count();
            assert!(
                outlier_count > 0,
                "Outlier type {outlier_type:?} should create some outliers"
            );
        }
    }
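
    // Rate sketch: a loose sanity bound on the MCAR missing rate. Whether the
    // implementation rounds the count deterministically or samples per cell is
    // not specified here, so the bounds are deliberately wide: on 300 cells at
    // a 20% rate, anything between 5% and 60% passes.
    #[test]
    fn test_inject_missing_data_rate_is_roughly_respected() {
        let mut data = Array2::from_shape_vec((100, 3), vec![1.0; 300]).unwrap();
        let missing_mask =
            inject_missing_data(&mut data, 0.2, MissingPattern::MCAR, Some(42)).unwrap();
        let missing_count = missing_mask.iter().filter(|&&x| x).count();
        assert!(missing_count >= 15, "too few missing cells: {missing_count}");
        assert!(missing_count <= 180, "too many missing cells: {missing_count}");
    }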
}