pub fn make_classification(
n_samples: usize,
n_features: usize,
n_classes: usize,
n_clusters_per_class: usize,
n_informative: usize,
randomseed: Option<u64>,
) -> Result<Dataset>
Generate a random classification dataset with clusters
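A minimal usage sketch (an illustration, not scraped from the repository: it assumes the `Dataset` accessors `n_samples()`, `n_features()`, and the `target` field that appear in the examples below, and that the returned error converts into `Box<dyn std::error::Error>` the way those examples rely on):

use scirs2_datasets::make_classification;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 200 samples, 8 features (4 of them informative), 2 classes,
    // 1 cluster per class, fixed seed for reproducibility.
    let dataset = make_classification(200, 8, 2, 1, 4, Some(42))?;

    assert_eq!(dataset.n_samples(), 200);
    assert_eq!(dataset.n_features(), 8);

    // Class labels are stored as f64 values in `target`.
    if let Some(target) = &dataset.target {
        let class_1 = target.iter().filter(|&&y| y == 1.0).count();
        println!("{} of {} samples in class 1", class_1, target.len());
    }
    Ok(())
}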
Examples found in repository
examples/advanced_showcase.rs (lines 60-67)
57 fn create_sampledataset() -> Result<Dataset, Box<dyn std::error::Error>> {
58 println!("🔧 Generating sample classification dataset...");
59
60 let dataset = make_classification(
61 1000, // n_samples
62 10, // n_features
63 3, // n_classes
64 2, // n_clusters_per_class
65 5, // n_informative
66 Some(42), // random_state
67 )?;
68
69 Ok(dataset)
70 }
More examples
examples/gpu_acceleration.rs (line 254)
242 fn demonstrate_cpu_gpu_comparison() -> Result<(), Box<dyn std::error::Error>> {
243 let dataset_sizes = vec![10_000, 50_000, 100_000];
244
245 println!(
246 "{:<12} {:<15} {:<15} {:<10}",
247 "Size", "CPU Time", "GPU Time", "Speedup"
248 );
249 println!("{}", "-".repeat(55));
250
251 for &size in &dataset_sizes {
252 // CPU benchmark
253 let cpu_start = Instant::now();
254 let _cpudataset = make_classification(size, 20, 5, 2, 15, Some(42))?;
255 let cpu_time = cpu_start.elapsed();
256
257 // GPU benchmark
258 let gpu_start = Instant::now();
259 let _gpudataset = make_classification_auto_gpu(size, 20, 5, 2, 15, Some(42))?;
260 let gpu_time = gpu_start.elapsed();
261
262 let speedup = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
263
264 println!(
265 "{:<12} {:<15} {:<15} {:<10.1}x",
266 size,
267            format!("{:.1}ms", cpu_time.as_secs_f64() * 1000.0),
268            format!("{:.1}ms", gpu_time.as_secs_f64() * 1000.0),
269 speedup
270 );
271 }
272
273 Ok(())
274 }
examples/advanced_generators_demo.rs (line 55)
50 fn demonstrate_adversarial_examples() -> Result<(), Box<dyn std::error::Error>> {
51 println!("🛡️ ADVERSARIAL EXAMPLES GENERATION");
52 println!("{}", "-".repeat(45));
53
54 // Create a base classification dataset
55 let basedataset = make_classification(1000, 20, 5, 2, 15, Some(42))?;
56 println!(
57 "Base dataset: {} samples, {} features, {} classes",
58 basedataset.n_samples(),
59 basedataset.n_features(),
60 5
61 );
62
63 // Test different attack methods
64 let attack_methods = vec![
65 ("FGSM", AttackMethod::FGSM, 0.1),
66 ("PGD", AttackMethod::PGD, 0.05),
67 ("Random Noise", AttackMethod::RandomNoise, 0.2),
68 ];
69
70 for (name, method, epsilon) in attack_methods {
71 println!("\nGenerating {name} adversarial examples:");
72
73 let config = AdversarialConfig {
74 epsilon,
75 attack_method: method,
76 target_class: None, // Untargeted attack
77 iterations: 10,
78 step_size: 0.01,
79 random_state: Some(42),
80 };
81
82 let adversarialdataset = make_adversarial_examples(&basedataset, config)?;
83
84 // Analyze perturbation strength
85 let perturbation_norm = calculate_perturbation_norm(&basedataset, &adversarialdataset);
86
87 println!(
88 " ✅ Generated {} adversarial examples",
89 adversarialdataset.n_samples()
90 );
91 println!(" 📊 Perturbation strength: {perturbation_norm:.4}");
92 println!(" 🎯 Attack budget (ε): {epsilon:.2}");
93 println!(
94 " 📈 Expected robustness impact: {:.1}%",
95 (1.0 - perturbation_norm) * 100.0
96 );
97 }
98
99 // Targeted attack example
100 println!("\nTargeted adversarial attack:");
101 let targeted_config = AdversarialConfig {
102 epsilon: 0.1,
103 attack_method: AttackMethod::FGSM,
104 target_class: Some(2), // Target class 2
105 iterations: 5,
106 random_state: Some(42),
107 ..Default::default()
108 };
109
110 let targeted_adversarial = make_adversarial_examples(&basedataset, targeted_config)?;
111
112 if let Some(target) = &targeted_adversarial.target {
113 let target_class_count = target.iter().filter(|&&x| x == 2.0).count();
114 println!(
115 " 🎯 Targeted to class 2: {}/{} samples",
116 target_class_count,
117 target.len()
118 );
119 }
120
121 println!();
122 Ok(())
123 }
124
125 #[allow(dead_code)]
126 fn demonstrate_anomaly_detection() -> Result<(), Box<dyn std::error::Error>> {
127 println!("🔍 ANOMALY DETECTION DATASETS");
128 println!("{}", "-".repeat(35));
129
130 let anomaly_scenarios = vec![
131 ("Point Anomalies", AnomalyType::Point, 0.05, 3.0),
132 ("Contextual Anomalies", AnomalyType::Contextual, 0.08, 2.0),
133 ("Mixed Anomalies", AnomalyType::Mixed, 0.10, 2.5),
134 ];
135
136 for (name, anomaly_type, fraction, severity) in anomaly_scenarios {
137 println!("\nGenerating {name} dataset:");
138
139 let config = AnomalyConfig {
140 anomaly_fraction: fraction,
141 anomaly_type: anomaly_type.clone(),
142 severity,
143 mixed_anomalies: false,
144 clustering_factor: 1.0,
145 random_state: Some(42),
146 };
147
148 let dataset = make_anomaly_dataset(2000, 15, config)?;
149
150 // Analyze the generated dataset
151 if let Some(target) = &dataset.target {
152 let anomaly_count = target.iter().filter(|&&x| x == 1.0).count();
153 let normal_count = target.len() - anomaly_count;
154
155 println!(" 📊 Dataset composition:");
156 println!(
157 " Normal samples: {} ({:.1}%)",
158 normal_count,
159 (normal_count as f64 / target.len() as f64) * 100.0
160 );
161 println!(
162 " Anomalous samples: {} ({:.1}%)",
163 anomaly_count,
164 (anomaly_count as f64 / target.len() as f64) * 100.0
165 );
166
167 // Calculate separation metrics
168 let separation = calculate_anomaly_separation(&dataset);
169 println!(" 🎯 Anomaly characteristics:");
170 println!(
171 " Expected detection difficulty: {}",
172 if separation > 2.0 {
173 "Easy"
174 } else if separation > 1.0 {
175 "Medium"
176 } else {
177 "Hard"
178 }
179 );
180 println!(" Separation score: {separation:.2}");
181 println!(
182 " Recommended algorithms: {}",
183 get_recommended_anomaly_algorithms(&anomaly_type)
184 );
185 }
186 }
187
188 // Real-world scenario simulation
189 println!("\nReal-world anomaly detection scenario:");
190 let realistic_config = AnomalyConfig {
191 anomaly_fraction: 0.02, // 2% anomalies (realistic)
192 anomaly_type: AnomalyType::Mixed,
193 severity: 1.5, // Subtle anomalies
194 mixed_anomalies: true,
195 clustering_factor: 0.8,
196 random_state: Some(42),
197 };
198
199 let realisticdataset = make_anomaly_dataset(10000, 50, realistic_config)?;
200
201 if let Some(target) = &realisticdataset.target {
202 let anomaly_count = target.iter().filter(|&&x| x == 1.0).count();
203        println!(
204            " 🌍 Realistic scenario: {} anomalies in {} samples ({:.2}%)",
205            anomaly_count,
206            realisticdataset.n_samples(),
207            (anomaly_count as f64 / realisticdataset.n_samples() as f64) * 100.0
208        );
209 println!(" 💡 Challenge: Low anomaly rate mimics production environments");
210 }
211
212 println!();
213 Ok(())
214 }
215
216 #[allow(dead_code)]
217 fn demonstrate_multitask_learning() -> Result<(), Box<dyn std::error::Error>> {
218 println!("🎯 MULTI-TASK LEARNING DATASETS");
219 println!("{}", "-".repeat(35));
220
221 // Basic multi-task scenario
222 println!("Multi-task scenario: Healthcare prediction");
223 let config = MultiTaskConfig {
224 n_tasks: 4,
225 task_types: vec![
226 TaskType::Classification(3), // Disease classification
227 TaskType::Regression, // Risk score prediction
228 TaskType::Classification(2), // Treatment response
229 TaskType::Ordinal(5), // Severity rating
230 ],
231 shared_features: 20, // Common patient features
232 task_specific_features: 10, // Task-specific biomarkers
233 task_correlation: 0.7, // High correlation between tasks
234 task_noise: vec![0.05, 0.1, 0.08, 0.12],
235 random_state: Some(42),
236 };
237
238 let multitaskdataset = make_multitask_dataset(1500, config)?;
239
240 println!(" 📊 Multi-task dataset structure:");
241 println!(" Number of tasks: {}", multitaskdataset.tasks.len());
242 println!(" Shared features: {}", multitaskdataset.shared_features);
243 println!(
244 " Task correlation: {:.1}",
245 multitaskdataset.task_correlation
246 );
247
248 for (i, task) in multitaskdataset.tasks.iter().enumerate() {
249 println!(
250 " Task {}: {} samples, {} features ({})",
251 i + 1,
252 task.n_samples(),
253 task.n_features(),
254 task.metadata
255 .get("task_type")
256 .unwrap_or(&"unknown".to_string())
257 );
258
259 // Analyze task characteristics
260 if let Some(target) = &task.target {
261 match task
262 .metadata
263 .get("task_type")
264 .map(|s| s.as_str())
265 .unwrap_or("unknown")
266 {
267 "classification" => {
268 let n_classes = analyze_classification_target(target);
269 println!(" Classes: {n_classes}");
270 }
271 "regression" => {
272 let (mean, std) = analyze_regression_target(target);
273 println!(" Target range: {mean:.2} ± {std:.2}");
274 }
275 "ordinal_regression" => {
276 let levels = analyze_ordinal_target(target);
277 println!(" Ordinal levels: {levels}");
278 }
279 _ => {}
280 }
281 }
282 }
283
284 // Transfer learning scenario
285 println!("\nTransfer learning analysis:");
286 analyze_task_relationships(&multitaskdataset);
287
288 println!();
289 Ok(())
290 }
291
292 #[allow(dead_code)]
293 fn demonstrate_domain_adaptation() -> Result<(), Box<dyn std::error::Error>> {
294 println!("🌐 DOMAIN ADAPTATION DATASETS");
295 println!("{}", "-".repeat(35));
296
297 println!("Domain adaptation scenario: Cross-domain sentiment analysis");
298
299 let config = DomainAdaptationConfig {
300 n_source_domains: 3,
301 domain_shifts: vec![], // Will use default shifts
302 label_shift: true,
303 feature_shift: true,
304 concept_drift: false,
305 random_state: Some(42),
306 };
307
308 let domaindataset = make_domain_adaptation_dataset(800, 25, 3, config)?;
309
310 println!(" 📊 Domain adaptation structure:");
311 println!(" Total domains: {}", domaindataset.domains.len());
312 println!(" Source domains: {}", domaindataset.n_source_domains);
313
314 for (domainname, dataset) in &domaindataset.domains {
315 println!(
316 " {}: {} samples, {} features",
317 domainname,
318 dataset.n_samples(),
319 dataset.n_features()
320 );
321
322 // Analyze domain characteristics
323 if let Some(target) = &dataset.target {
324 let class_distribution = analyze_class_distribution(target);
325 println!(" Class distribution: {class_distribution:?}");
326 }
327
328 // Calculate domain statistics
329 let feature_stats = calculate_domain_statistics(&dataset.data);
330 println!(
331 " Feature mean: {:.3}, std: {:.3}",
332 feature_stats.0, feature_stats.1
333 );
334 }
335
336 // Domain shift analysis
337 println!("\n 🔄 Domain shift analysis:");
338 analyze_domain_shifts(&domaindataset);
339
340 println!();
341 Ok(())
342 }
343
344 #[allow(dead_code)]
345 fn demonstrate_few_shot_learning() -> Result<(), Box<dyn std::error::Error>> {
346 println!("🎯 FEW-SHOT LEARNING DATASETS");
347 println!("{}", "-".repeat(35));
348
349 let few_shot_scenarios = vec![
350 ("5-way 1-shot", 5, 1, 15),
351 ("5-way 5-shot", 5, 5, 10),
352 ("10-way 3-shot", 10, 3, 12),
353 ];
354
355 for (name, n_way, k_shot, n_query) in few_shot_scenarios {
356 println!("\nGenerating {name} dataset:");
357
358 let dataset = make_few_shot_dataset(n_way, k_shot, n_query, 5, 20)?;
359
360 println!(" 📊 Few-shot configuration:");
361 println!(" Ways (classes): {}", dataset.n_way);
362 println!(" Shots per class: {}", dataset.k_shot);
363 println!(" Query samples per class: {}", dataset.n_query);
364 println!(" Episodes: {}", dataset.episodes.len());
365
366 // Analyze episode characteristics
367 for (i, episode) in dataset.episodes.iter().enumerate().take(2) {
368 println!(" Episode {}:", i + 1);
369 println!(
370 " Support set: {} samples",
371 episode.support_set.n_samples()
372 );
373 println!(" Query set: {} samples", episode.query_set.n_samples());
374
375 // Calculate class balance in support set
376 if let Some(support_target) = &episode.support_set.target {
377 let balance = calculate_class_balance(support_target, n_way);
378 println!(" Support balance: {balance:.2}");
379 }
380 }
381
382 println!(" 💡 Use case: {}", get_few_shot_use_case(n_way, k_shot));
383 }
384
385 println!();
386 Ok(())
387 }
388
389 #[allow(dead_code)]
390 fn demonstrate_continual_learning() -> Result<(), Box<dyn std::error::Error>> {
391 println!("📚 CONTINUAL LEARNING DATASETS");
392 println!("{}", "-".repeat(35));
393
394 let drift_strengths = vec![
395 ("Mild drift", 0.2),
396 ("Moderate drift", 0.5),
397 ("Severe drift", 1.0),
398 ];
399
400 for (name, drift_strength) in drift_strengths {
401 println!("\nGenerating {name} scenario:");
402
403 let dataset = make_continual_learning_dataset(5, 500, 15, 4, drift_strength)?;
404
405 println!(" 📊 Continual learning structure:");
406 println!(" Number of tasks: {}", dataset.tasks.len());
407 println!(
408 " Concept drift strength: {:.1}",
409 dataset.concept_drift_strength
410 );
411
412 // Analyze concept drift between tasks
413 analyze_concept_drift(&dataset);
414
415 // Recommend continual learning strategies
416 println!(
417 " 💡 Recommended strategies: {}",
418 get_continual_learning_strategies(drift_strength)
419 );
420 }
421
422 // Catastrophic forgetting simulation
423 println!("\nCatastrophic forgetting analysis:");
424 simulate_catastrophic_forgetting()?;
425
426 println!();
427 Ok(())
428 }
429
430 #[allow(dead_code)]
431 fn demonstrate_advanced_applications() -> Result<(), Box<dyn std::error::Error>> {
432 println!("🚀 ADVANCED APPLICATIONS");
433 println!("{}", "-".repeat(25));
434
435 // Meta-learning scenario
436 println!("Meta-learning scenario:");
437 demonstrate_meta_learning_setup()?;
438
439 // Robust machine learning
440 println!("\nRobust ML scenario:");
441 demonstrate_robust_ml_setup()?;
442
443 // Federated learning simulation
444 println!("\nFederated learning scenario:");
445 demonstrate_federated_learning_setup()?;
446
447 Ok(())
448 }
449
450 // Helper functions for analysis
451
452 #[allow(dead_code)]
453 fn calculate_perturbation_norm(
454 original: &scirs2_datasets::Dataset,
455 adversarial: &scirs2_datasets::Dataset,
456 ) -> f64 {
457 let diff = &adversarial.data - &original.data;
458 let norm = diff.iter().map(|&x| x * x).sum::<f64>().sqrt();
459 norm / (original.n_samples() * original.n_features()) as f64
460 }
461
462 #[allow(dead_code)]
463 fn calculate_anomaly_separation(dataset: &scirs2_datasets::Dataset) -> f64 {
464 // Simplified separation metric
465 if let Some(target) = &dataset.target {
466 let normal_indices: Vec<usize> = target
467 .iter()
468 .enumerate()
469 .filter_map(|(i, &label)| if label == 0.0 { Some(i) } else { None })
470 .collect();
471 let anomaly_indices: Vec<usize> = target
472 .iter()
473 .enumerate()
474 .filter_map(|(i, &label)| if label == 1.0 { Some(i) } else { None })
475 .collect();
476
477 if normal_indices.is_empty() || anomaly_indices.is_empty() {
478 return 0.0;
479 }
480
481 // Calculate average distances
482 let normal_center = calculate_centroid(&dataset.data, &normal_indices);
483 let anomaly_center = calculate_centroid(&dataset.data, &anomaly_indices);
484
485 let distance = (&normal_center - &anomaly_center)
486 .iter()
487 .map(|&x| x * x)
488 .sum::<f64>()
489 .sqrt();
490 distance / dataset.n_features() as f64
491 } else {
492 0.0
493 }
494 }
495
496 #[allow(dead_code)]
497 fn calculate_centroid(
498 data: &scirs2_core::ndarray::Array2<f64>,
499 indices: &[usize],
500 ) -> scirs2_core::ndarray::Array1<f64> {
501 let mut centroid = scirs2_core::ndarray::Array1::zeros(data.ncols());
502 for &idx in indices {
503 centroid = centroid + data.row(idx);
504 }
505 centroid / indices.len() as f64
506 }
507
508 #[allow(dead_code)]
509 fn get_recommended_anomaly_algorithms(_anomalytype: &AnomalyType) -> &'static str {
510 match _anomalytype {
511 AnomalyType::Point => "Isolation Forest, Local Outlier Factor, One-Class SVM",
512 AnomalyType::Contextual => "LSTM Autoencoders, Hidden Markov Models",
513 AnomalyType::Collective => "Graph-based methods, Sequential pattern mining",
514 AnomalyType::Mixed => "Ensemble methods, Deep anomaly detection",
515 AnomalyType::Adversarial => "Robust statistical methods, Adversarial training",
516 }
517 }
518
519 #[allow(dead_code)]
520 fn analyze_classification_target(target: &scirs2_core::ndarray::Array1<f64>) -> usize {
521 let mut classes = std::collections::HashSet::new();
522 for &label in target.iter() {
523 classes.insert(label as i32);
524 }
525 classes.len()
526 }
527
528 #[allow(dead_code)]
529 fn analyze_regression_target(target: &scirs2_core::ndarray::Array1<f64>) -> (f64, f64) {
530 let mean = target.mean().unwrap_or(0.0);
531 let std = target.std(0.0);
532 (mean, std)
533 }
534
535 #[allow(dead_code)]
536 fn analyze_ordinal_target(target: &scirs2_core::ndarray::Array1<f64>) -> usize {
537 let max_level = target.iter().fold(0.0f64, |a, &b| a.max(b)) as usize;
538 max_level + 1
539 }
540
541 #[allow(dead_code)]
542 fn analyze_task_relationships(multitaskdataset: &MultiTaskDataset) {
543 println!(" 🔗 Task relationship analysis:");
544 println!(
545 " Shared feature ratio: {:.1}%",
546 (multitaskdataset.shared_features as f64 / multitaskdataset.tasks[0].n_features() as f64)
547 * 100.0
548 );
549 println!(
550 " Task correlation: {:.2}",
551 multitaskdataset.task_correlation
552 );
553
554 if multitaskdataset.task_correlation > 0.7 {
555 println!(" 💡 High correlation suggests strong transfer learning potential");
556 } else if multitaskdataset.task_correlation > 0.3 {
557 println!(" 💡 Moderate correlation indicates selective transfer benefits");
558 } else {
559 println!(" 💡 Low correlation requires careful negative transfer mitigation");
560 }
561 }
562
563 #[allow(dead_code)]
564 fn analyze_class_distribution(target: &scirs2_core::ndarray::Array1<f64>) -> HashMap<i32, usize> {
565 let mut distribution = HashMap::new();
566 for &label in target.iter() {
567 *distribution.entry(label as i32).or_insert(0) += 1;
568 }
569 distribution
570 }
571
572 #[allow(dead_code)]
573 fn calculate_domain_statistics(data: &scirs2_core::ndarray::Array2<f64>) -> (f64, f64) {
574 let mean = data.mean().unwrap_or(0.0);
575 let std = data.std(0.0);
576 (mean, std)
577 }
578
579 #[allow(dead_code)]
580 fn analyze_domain_shifts(domaindataset: &DomainAdaptationDataset) {
581 if domaindataset.domains.len() >= 2 {
582 let source_stats = calculate_domain_statistics(&domaindataset.domains[0].1.data);
583 let target_stats =
584 calculate_domain_statistics(&domaindataset.domains.last().unwrap().1.data);
585
586 let mean_shift = (target_stats.0 - source_stats.0).abs();
587 let std_shift = (target_stats.1 - source_stats.1).abs();
588
589 println!(" Mean shift magnitude: {mean_shift:.3}");
590 println!(" Std shift magnitude: {std_shift:.3}");
591
592 if mean_shift > 0.5 || std_shift > 0.3 {
593 println!(" 💡 Significant domain shift detected - adaptation needed");
594 } else {
595 println!(" 💡 Mild domain shift - simple adaptation may suffice");
596 }
597 }
598 }
599
600 #[allow(dead_code)]
601 fn calculate_class_balance(target: &scirs2_core::ndarray::Array1<f64>, nclasses: usize) -> f64 {
602 let mut class_counts = vec![0; nclasses];
603 for &label in target.iter() {
604 let class_idx = label as usize;
605 if class_idx < nclasses {
606 class_counts[class_idx] += 1;
607 }
608 }
609
610 let total = target.len() as f64;
611 let expected_per_class = total / nclasses as f64;
612
613 let balance_score = class_counts
614 .iter()
615 .map(|&count| (count as f64 - expected_per_class).abs())
616 .sum::<f64>()
617 / (nclasses as f64 * expected_per_class);
618
619 1.0 - balance_score.min(1.0) // Higher score = better balance
620 }
621
622 #[allow(dead_code)]
623 fn get_few_shot_use_case(_n_way: usize, kshot: usize) -> &'static str {
624 match (_n_way, kshot) {
625 (5, 1) => "Image classification with minimal examples",
626 (5, 5) => "Balanced few-shot learning benchmark",
627 (10, _) => "Multi-class few-shot classification",
628 (_, 1) => "One-shot learning scenario",
629 _ => "General few-shot learning",
630 }
631 }
632
633 #[allow(dead_code)]
634 fn analyze_concept_drift(dataset: &scirs2_datasets::ContinualLearningDataset) {
635 println!(" Task progression analysis:");
636
637 for i in 1..dataset.tasks.len() {
638 let prev_stats = calculate_domain_statistics(&dataset.tasks[i - 1].data);
639 let curr_stats = calculate_domain_statistics(&dataset.tasks[i].data);
640
641 let drift_magnitude =
642 ((curr_stats.0 - prev_stats.0).powi(2) + (curr_stats.1 - prev_stats.1).powi(2)).sqrt();
643
644 println!(
645 " Task {} → {}: drift = {:.3}",
646 i,
647 i + 1,
648 drift_magnitude
649 );
650 }
651 }
652
653 #[allow(dead_code)]
654 fn get_continual_learning_strategies(_driftstrength: f64) -> &'static str {
655 if _driftstrength < 0.3 {
656 "Fine-tuning, Elastic Weight Consolidation"
657 } else if _driftstrength < 0.7 {
658 "Progressive Neural Networks, Learning without Forgetting"
659 } else {
660 "Memory replay, Meta-learning approaches, Dynamic architectures"
661 }
662 }
663
664 #[allow(dead_code)]
665 fn simulate_catastrophic_forgetting() -> Result<(), Box<dyn std::error::Error>> {
666    let _dataset = make_continual_learning_dataset(3, 200, 10, 3, 0.8)?;
667
668 println!(" Simulating catastrophic forgetting:");
669 println!(" 📉 Task 1 performance after Task 2: ~60% (typical drop)");
670 println!(" 📉 Task 1 performance after Task 3: ~40% (severe forgetting)");
671 println!(" 💡 Recommendation: Use rehearsal or regularization techniques");
672
673 Ok(())
674 }
675
676 #[allow(dead_code)]
677 fn demonstrate_meta_learning_setup() -> Result<(), Box<dyn std::error::Error>> {
678 let few_shotdata = make_few_shot_dataset(5, 3, 10, 20, 15)?;
679
680 println!(" 🧠 Meta-learning (MAML) setup:");
681 println!(
682 " Meta-training episodes: {}",
683 few_shotdata.episodes.len()
684 );
685 println!(
686 " Support/Query split per episode: {}/{} samples per class",
687 few_shotdata.k_shot, few_shotdata.n_query
688 );
689 println!(" 💡 Goal: Learn to learn quickly from few examples");
690
691 Ok(())
692 }
693
694 #[allow(dead_code)]
695 fn demonstrate_robust_ml_setup() -> Result<(), Box<dyn std::error::Error>> {
696 let basedataset = make_classification(500, 15, 3, 2, 10, Some(42))?;
697
698 // Generate multiple adversarial versions
699 let attacks = vec![
700 ("FGSM", AttackMethod::FGSM, 0.1),
701 ("PGD", AttackMethod::PGD, 0.05),
702 ];
703
704 println!(" 🛡️ Robust ML training setup:");
705 println!(" Clean samples: {}", basedataset.n_samples());
706
707 for (name, method, epsilon) in attacks {
708 let config = AdversarialConfig {
709 attack_method: method,
710 epsilon,
711 ..Default::default()
712 };
713
714 let advdataset = make_adversarial_examples(&basedataset, config)?;
715 println!(
716 " {} adversarial samples: {}",
717 name,
718 advdataset.n_samples()
719 );
720 }
721
722 println!(" 💡 Goal: Train models robust to adversarial perturbations");
723
724 Ok(())
725 }
examples/data_generators.rs (lines 14-21)
7 fn main() -> Result<(), Box<dyn std::error::Error>> {
8 println!("Creating synthetic datasets...\n");
9
10 // Generate classification dataset
11 let n_samples = 100;
12 let n_features = 5;
13
14 let classificationdata = make_classification(
15 n_samples,
16 n_features,
17 3, // 3 classes
18 2, // 2 clusters per class
19 3, // 3 informative features
20 Some(42), // random seed
21 )?;
22
23 // Train-test split
24 let (train, test) = train_test_split(&classificationdata, 0.2, Some(42))?;
25
26 println!("Classification dataset:");
27 println!(" Total samples: {}", classificationdata.n_samples());
28 println!(" Features: {}", classificationdata.n_features());
29 println!(" Training samples: {}", train.n_samples());
30 println!(" Test samples: {}", test.n_samples());
31
32 // Generate regression dataset
33 let regressiondata = make_regression(
34 n_samples,
35 n_features,
36 3, // 3 informative features
37 0.5, // noise level
38 Some(42),
39 )?;
40
41 println!("\nRegression dataset:");
42 println!(" Samples: {}", regressiondata.n_samples());
43 println!(" Features: {}", regressiondata.n_features());
44
45 // Normalize the data (in-place)
46 let mut data_copy = regressiondata.data.clone();
47 normalize(&mut data_copy);
48 println!(" Data normalized successfully");
49
50 // Generate clustering data (blobs)
51 let clusteringdata = make_blobs(
52 n_samples,
53 2, // 2 features for easy visualization
54 4, // 4 clusters
55 0.8, // cluster standard deviation
56 Some(42),
57 )?;
58
59 println!("\nClustering dataset (blobs):");
60 println!(" Samples: {}", clusteringdata.n_samples());
61 println!(" Features: {}", clusteringdata.n_features());
62
63 // Find the number of clusters by finding the max value of target
64 let num_clusters = clusteringdata.target.as_ref().map_or(0, |t| {
65 let mut max_val = -1.0;
66 for &val in t.iter() {
67 if val > max_val {
68 max_val = val;
69 }
70 }
71 (max_val as usize) + 1
72 });
73
74 println!(" Clusters: {num_clusters}");
75
76 // Generate time series data
77 let time_series = make_time_series(
78 100, // 100 time steps
79 3, // 3 features/variables
80 true, // with trend
81 true, // with seasonality
82 0.2, // noise level
83 Some(42),
84 )?;
85
86 println!("\nTime series dataset:");
87 println!(" Time steps: {}", time_series.n_samples());
88 println!(" Features: {}", time_series.n_features());
89
90 Ok(())
91 }
examples/scikit_learn_benchmark.rs (line 361)
346 fn run_sklearn_generation_comparison() {
347 println!("\n 🔬 Data Generation Comparison:");
348
349 let configs = vec![
350 (1000, 10, "classification"),
351 (5000, 20, "classification"),
352 (1000, 10, "regression"),
353 (5000, 20, "regression"),
354 ];
355
356 for (n_samples, n_features, gen_type) in configs {
357 #[allow(clippy::type_complexity)]
358        let (python_code, scirs2_fn): (String, Box<dyn Fn() -> Result<Dataset, Box<dyn std::error::Error>>>) = match gen_type {
359 "classification" => (
360                format!("from sklearn.datasets import make_classification; make_classification(n_samples={n_samples}, n_features={n_features}, random_state=42)"),
361 Box::new(move || make_classification(n_samples, n_features, 3, 2, 4, Some(42)).map_err(|e| Box::new(e) as Box<dyn std::error::Error>))
362 ),
363 "regression" => (
364                format!("from sklearn.datasets import make_regression; make_regression(n_samples={n_samples}, n_features={n_features}, random_state=42)"),
365 Box::new(move || make_regression(n_samples, n_features, 3, 0.1, Some(42)).map_err(|e| Box::new(e) as Box<dyn std::error::Error>))
366 ),
367 _ => continue,
368 };
369
370 // Time Python execution
371 let python_result = Command::new("python3")
372 .arg("-c")
373 .arg(format!(
374                "import time; start=time.time(); {python_code}; print(f'{{time.time()-start:.4f}}')"
375 ))
376 .output();
377
378 match python_result {
379 Ok(output) if output.status.success() => {
380 let python_time = String::from_utf8_lossy(&output.stdout)
381 .trim()
382 .parse::<f64>()
383 .unwrap_or(0.0);
384
385 // Time SciRS2 execution
386 let scirs2_start = Instant::now();
387 let _scirs2_result = scirs2_fn();
388 let scirs2_time = scirs2_start.elapsed().as_secs_f64();
389
390 let speedup = python_time / scirs2_time;
391 let status = if speedup > 1.2 {
392 "🚀 FASTER"
393 } else if speedup > 0.8 {
394 "≈ SIMILAR"
395 } else {
396 "🐌 SLOWER"
397 };
398
399 println!(
400 " {} {}x{}: SciRS2 {:.2}ms vs sklearn {:.2}ms ({:.1}x {})",
401 gen_type,
402 n_samples,
403 n_features,
404 scirs2_time * 1000.0,
405 python_time * 1000.0,
406 speedup,
407 status
408 );
409 }
410 _ => {
411 println!(
412 " {gen_type} {n_samples}x{n_features}: Failed to benchmark Python version"
413 );
414 }
415 }
416 }
417 }
examples/datasets_streaming_demo.rs (line 147)
132 fn demonstrate_memory_efficient_processing() -> Result<(), Box<dyn std::error::Error>> {
133 println!("💾 MEMORY-EFFICIENT PROCESSING");
134 println!("{}", "-".repeat(40));
135
136 // Compare memory usage: streaming vs. in-memory
137 let datasetsize = 50_000;
138 let n_features = 50;
139
140 println!("Comparing memory usage for {datasetsize} samples with {n_features} features");
141
142 // In-memory approach (for comparison)
143 println!("\n1. In-memory approach:");
144 let start_mem = get_memory_usage();
145 let start_time = Instant::now();
146
147 let in_memorydataset = make_classification(datasetsize, n_features, 5, 2, 25, Some(42))?;
148 let (train, test) = train_test_split(&in_memorydataset, 0.2, Some(42))?;
149
150 let in_memory_time = start_time.elapsed();
151 let in_memory_mem = get_memory_usage() - start_mem;
152
153 println!(" Time: {:.2}s", in_memory_time.as_secs_f64());
154 println!(" Memory usage: ~{in_memory_mem:.1} MB");
155 println!(" Train samples: {}", train.n_samples());
156 println!(" Test samples: {}", test.n_samples());
157
158 // Streaming approach
159 println!("\n2. Streaming approach:");
160 let stream_start_time = Instant::now();
161 let stream_start_mem = get_memory_usage();
162
163 let config = StreamConfig {
164 chunk_size: 5_000, // Smaller chunks for memory efficiency
165 buffer_size: 2, // Smaller buffer
166 num_workers: 2,
167 memory_limit_mb: Some(50),
168 ..Default::default()
169 };
170
171 let mut stream = stream_classification(datasetsize, n_features, 5, config)?;
172
173 let mut total_processed = 0;
174 let mut train_samples = 0;
175 let mut test_samples = 0;
176
177 while let Some(chunk) = stream.next_chunk()? {
178 total_processed += chunk.n_samples();
179
180 // Simulate train/test split on chunk level
181 let chunk_trainsize = (chunk.n_samples() as f64 * 0.8) as usize;
182 train_samples += chunk_trainsize;
183 test_samples += chunk.n_samples() - chunk_trainsize;
184
185 // Process chunk (simulate some computation)
186 let _mean = chunk.data.mean_axis(scirs2_core::ndarray::Axis(0));
187 let _std = chunk.data.std_axis(scirs2_core::ndarray::Axis(0), 0.0);
188
189 if chunk.is_last {
190 break;
191 }
192 }
193
194 let stream_time = stream_start_time.elapsed();
195 let stream_mem = get_memory_usage() - stream_start_mem;
196
197 println!(" Time: {:.2}s", stream_time.as_secs_f64());
198 println!(" Memory usage: ~{stream_mem:.1} MB");
199 println!(" Train samples: {train_samples}");
200 println!(" Test samples: {test_samples}");
201 println!(" Total processed: {total_processed}");
202
203 // Comparison
204 println!("\n3. Comparison:");
205 println!(
206 " Memory savings: {:.1}x less memory",
207 in_memory_mem / stream_mem.max(1.0)
208 );
209 println!(
210 " Time overhead: {:.1}x",
211 stream_time.as_secs_f64() / in_memory_time.as_secs_f64()
212 );
213 println!(" Streaming is beneficial for large datasets that don't fit in memory");
214
215 println!();
216 Ok(())
217 }