real_world_datasets/
real_world_datasets.rs

1//! Real-world datasets demonstration
2//!
3//! This example demonstrates how to load and work with real-world datasets
4//! from various domains including finance, healthcare, and machine learning research.
5//!
6//! Usage:
7//!   cargo run --example real_world_datasets --release
8
9use scirs2_datasets::{
10    list_real_world_datasets, load_adult, load_california_housing, load_heart_disease,
11    load_red_wine_quality, load_titanic,
12    utils::{k_fold_split, train_test_split},
13    BenchmarkRunner, MLPipeline, RealWorldConfig,
14};
15use statrs::statistics::Statistics;
16use std::collections::HashMap;
17
18#[allow(dead_code)]
19fn main() -> Result<(), Box<dyn std::error::Error>> {
20    println!("🌍 Real-World Datasets Demonstration");
21    println!("====================================\n");
22
23    // List all available real-world datasets
24    demonstrate_dataset_catalog();
25
26    // Load and explore different types of datasets
27    demonstrate_classification_datasets()?;
28    demonstrate_regression_datasets()?;
29    demonstrate_healthcare_datasets()?;
30
31    // Advanced dataset operations
32    demonstrate_advanced_operations()?;
33
34    // Performance comparison
35    demonstrate_performance_comparison()?;
36
37    println!("\n🎉 Real-world datasets demonstration completed!");
38    Ok(())
39}
40
41#[allow(dead_code)]
42fn demonstrate_dataset_catalog() {
43    println!("📋 AVAILABLE REAL-WORLD DATASETS");
44    println!("{}", "-".repeat(40));
45
46    let datasets = list_real_world_datasets();
47
48    // Group datasets by domain
49    let mut classification = Vec::new();
50    let mut regression = Vec::new();
51    let mut time_series = Vec::new();
52    let mut healthcare = Vec::new();
53    let mut financial = Vec::new();
54
55    for dataset in &datasets {
56        match dataset.as_str() {
57            "adult" | "bank_marketing" | "credit_approval" | "german_credit" | "mushroom"
58            | "spam" | "titanic" => classification.push(dataset),
59            "auto_mpg" | "california_housing" | "concrete_strength" | "energy_efficiency"
60            | "red_wine_quality" | "white_wine_quality" => regression.push(dataset),
61            "air_passengers" | "bitcoin_prices" | "electricity_load" | "stock_prices" => {
62                time_series.push(dataset)
63            }
64            "diabetes_readmission" | "heart_disease" => healthcare.push(dataset),
65            "credit_card_fraud" | "loan_default" => financial.push(dataset),
66            _ => {}
67        }
68    }
69
70    println!("Classification Datasets ({}):", classification.len());
71    for dataset in classification {
72        println!("  • {dataset}");
73    }
74
75    println!("\nRegression Datasets ({}):", regression.len());
76    for dataset in regression {
77        println!("  • {dataset}");
78    }
79
80    println!("\nTime Series Datasets ({}):", time_series.len());
81    for dataset in time_series {
82        println!("  • {dataset}");
83    }
84
85    println!("\nHealthcare Datasets ({}):", healthcare.len());
86    for dataset in healthcare {
87        println!("  • {dataset}");
88    }
89
90    println!("\nFinancial Datasets ({}):", financial.len());
91    for dataset in financial {
92        println!("  • {dataset}");
93    }
94
95    println!(
96        "\nTotal: {} real-world datasets available\n",
97        datasets.len()
98    );
99}
100
101#[allow(dead_code)]
102fn demonstrate_classification_datasets() -> Result<(), Box<dyn std::error::Error>> {
103    println!("🎯 CLASSIFICATION DATASETS");
104    println!("{}", "-".repeat(40));
105
106    // Titanic dataset
107    println!("Loading Titanic dataset...");
108    let titanic = load_titanic()?;
109
110    println!("Titanic Dataset:");
111    println!(
112        "  Description: {}",
113        titanic
114            .metadata
115            .get("description")
116            .unwrap_or(&"Unknown".to_string())
117    );
118    println!("  Samples: {}", titanic.n_samples());
119    println!("  Features: {}", titanic.n_features());
120
121    if let Some(featurenames) = titanic.featurenames() {
122        println!("  Features: {featurenames:?}");
123    }
124
125    if let Some(targetnames) = titanic.targetnames() {
126        println!("  Classes: {targetnames:?}");
127    }
128
129    // Analyze class distribution
130    if let Some(target) = &titanic.target {
131        let mut class_counts = HashMap::new();
132        for &class in target.iter() {
133            *class_counts.entry(class as i32).or_insert(0) += 1;
134        }
135        println!("  Class distribution: {class_counts:?}");
136
137        // Calculate survival rate
138        let survived = class_counts.get(&1).unwrap_or(&0);
139        let total = titanic.n_samples();
140        println!(
141            "  Survival rate: {:.1}%",
142            (*survived as f64 / total as f64) * 100.0
143        );
144    }
145
146    // Demonstrate train/test split
147    let (train, test) = train_test_split(&titanic, 0.2, Some(42))?;
148    println!(
149        "  Train/test split: {} train, {} test",
150        train.n_samples(),
151        test.n_samples()
152    );
153
154    // Adult (Census Income) dataset
155    println!("\nLoading Adult (Census Income) dataset...");
156    match load_adult() {
157        Ok(adult) => {
158            println!("Adult Dataset:");
159            println!(
160                "  Description: {}",
161                adult
162                    .metadata
163                    .get("description")
164                    .unwrap_or(&"Unknown".to_string())
165            );
166            println!("  Samples: {}", adult.n_samples());
167            println!("  Features: {}", adult.n_features());
168            println!("  Task: Predict income >$50K based on census data");
169        }
170        Err(e) => {
171            println!("  Note: Adult dataset requires download: {e}");
172            println!("  This is expected for the demonstration");
173        }
174    }
175
176    println!();
177    Ok(())
178}
179
180#[allow(dead_code)]
181fn demonstrate_regression_datasets() -> Result<(), Box<dyn std::error::Error>> {
182    println!("📈 REGRESSION DATASETS");
183    println!("{}", "-".repeat(40));
184
185    // California Housing dataset
186    println!("Loading California Housing dataset...");
187    let housing = load_california_housing()?;
188
189    println!("California Housing Dataset:");
190    println!(
191        "  Description: {}",
192        housing
193            .metadata
194            .get("description")
195            .unwrap_or(&"Unknown".to_string())
196    );
197    println!("  Samples: {}", housing.n_samples());
198    println!("  Features: {}", housing.n_features());
199
200    if let Some(featurenames) = housing.featurenames() {
201        println!("  Features: {featurenames:?}");
202    }
203
204    // Analyze target distribution
205    if let Some(target) = &housing.target {
206        let mean = target.mean().unwrap_or(0.0);
207        let std = target.std(0.0);
208        let min = target.iter().fold(f64::INFINITY, |a, &b| a.min(b));
209        let max = target.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
210
211        println!("  Target (house value) statistics:");
212        println!("    Mean: {mean:.2} (hundreds of thousands)");
213        println!("    Std:  {std:.2}");
214        println!("    Range: [{min:.2}, {max:.2}]");
215    }
216
217    // Red Wine Quality dataset
218    println!("\nLoading Red Wine Quality dataset...");
219    let wine = load_red_wine_quality()?;
220
221    println!("Red Wine Quality Dataset:");
222    println!(
223        "  Description: {}",
224        wine.metadata
225            .get("description")
226            .unwrap_or(&"Unknown".to_string())
227    );
228    println!("  Samples: {}", wine.n_samples());
229    println!("  Features: {}", wine.n_features());
230
231    if let Some(target) = &wine.target {
232        let mean_quality = target.mean().unwrap_or(0.0);
233        println!("  Average wine quality: {mean_quality:.1}/10");
234
235        // Quality distribution
236        let mut quality_counts = HashMap::new();
237        for &quality in target.iter() {
238            let q = quality.round() as i32;
239            *quality_counts.entry(q).or_insert(0) += 1;
240        }
241        println!("  Quality distribution: {quality_counts:?}");
242    }
243
244    println!();
245    Ok(())
246}
247
248#[allow(dead_code)]
249fn demonstrate_healthcare_datasets() -> Result<(), Box<dyn std::error::Error>> {
250    println!("🏥 HEALTHCARE DATASETS");
251    println!("{}", "-".repeat(40));
252
253    // Heart Disease dataset
254    println!("Loading Heart Disease dataset...");
255    let heart = load_heart_disease()?;
256
257    println!("Heart Disease Dataset:");
258    println!(
259        "  Description: {}",
260        heart
261            .metadata
262            .get("description")
263            .unwrap_or(&"Unknown".to_string())
264    );
265    println!("  Samples: {}", heart.n_samples());
266    println!("  Features: {}", heart.n_features());
267
268    if let Some(featurenames) = heart.featurenames() {
269        println!("  Clinical features: {:?}", &featurenames[..5]); // Show first 5
270        println!("  ... and {} more features", featurenames.len() - 5);
271    }
272
273    // Analyze risk factors
274    if let Some(target) = &heart.target {
275        let mut disease_counts = HashMap::new();
276        for &disease in target.iter() {
277            *disease_counts.entry(disease as i32).or_insert(0) += 1;
278        }
279
280        let with_disease = disease_counts.get(&1).unwrap_or(&0);
281        let total = heart.n_samples();
282        println!(
283            "  Disease prevalence: {:.1}% ({}/{})",
284            (*with_disease as f64 / total as f64) * 100.0,
285            with_disease,
286            total
287        );
288    }
289
290    // Demonstrate feature analysis
291    println!("  Sample clinical parameter ranges:");
292    let age_col = heart.data.column(0);
293    let age_mean = age_col.mean();
294    let age_std = age_col.std(0.0);
295    println!("    Age: {age_mean:.1} ± {age_std:.1} years");
296
297    println!();
298    Ok(())
299}
300
301#[allow(dead_code)]
302fn demonstrate_advanced_operations() -> Result<(), Box<dyn std::error::Error>> {
303    println!("🔧 ADVANCED DATASET OPERATIONS");
304    println!("{}", "-".repeat(40));
305
306    let housing = load_california_housing()?;
307
308    // Data preprocessing pipeline
309    println!("Preprocessing pipeline for California Housing:");
310
311    // 1. Train/test split
312    let (mut train, test) = train_test_split(&housing, 0.2, Some(42))?;
313    println!(
314        "  1. Split: {} train, {} test",
315        train.n_samples(),
316        test.n_samples()
317    );
318
319    // 2. Feature scaling
320    let mut pipeline = MLPipeline::default();
321    train = pipeline.prepare_dataset(&train)?;
322    println!("  2. Standardized features");
323
324    // 3. Cross-validation setup
325    let cv_folds = k_fold_split(train.n_samples(), 5, true, Some(42))?;
326    println!("  3. Created {} CV folds", cv_folds.len());
327
328    // Feature correlation analysis (simplified)
329    println!("  4. Feature analysis:");
330    println!("     • {} numerical features", train.n_features());
331    println!("     • Ready for machine learning models");
332
333    // Custom dataset configuration
334    println!("\nCustom dataset loading configuration:");
335    let config = RealWorldConfig {
336        use_cache: true,
337        download_if_missing: false, // Don't download in demo
338        return_preprocessed: true,
339        subset: Some("small".to_string()),
340        random_state: Some(42),
341        ..Default::default()
342    };
343
344    println!("  • Caching: {}", config.use_cache);
345    println!("  • Download missing: {}", config.download_if_missing);
346    println!("  • Preprocessed: {}", config.return_preprocessed);
347    println!("  • Subset: {:?}", config.subset);
348
349    println!();
350    Ok(())
351}
352
353#[allow(dead_code)]
354fn demonstrate_performance_comparison() -> Result<(), Box<dyn std::error::Error>> {
355    println!("⚡ PERFORMANCE COMPARISON");
356    println!("{}", "-".repeat(40));
357
358    let runner = BenchmarkRunner::new().with_iterations(3).with_warmup(1);
359
360    // Benchmark real-world dataset loading
361    println!("Benchmarking real-world dataset operations...");
362
363    // Titanic loading benchmark
364    let titanic_params = HashMap::from([("dataset".to_string(), "titanic".to_string())]);
365    let titanic_result =
366        runner.run_benchmark("load_titanic", titanic_params, || match load_titanic() {
367            Ok(dataset) => Ok((dataset.n_samples(), dataset.n_features())),
368            Err(e) => Err(format!("Failed to load Titanic: {e}")),
369        });
370
371    // California Housing loading benchmark
372    let housing_params = HashMap::from([("dataset".to_string(), "california_housing".to_string())]);
373    let housing_result = runner.run_benchmark("load_california_housing", housing_params, || {
374        match load_california_housing() {
375            Ok(dataset) => Ok((dataset.n_samples(), dataset.n_features())),
376            Err(e) => Err(format!("Failed to load California Housing: {e}")),
377        }
378    });
379
380    // Heart Disease loading benchmark
381    let heart_params = HashMap::from([("dataset".to_string(), "heart_disease".to_string())]);
382    let heart_result =
383        runner.run_benchmark(
384            "load_heart_disease",
385            heart_params,
386            || match load_heart_disease() {
387                Ok(dataset) => Ok((dataset.n_samples(), dataset.n_features())),
388                Err(e) => Err(format!("Failed to load Heart Disease: {e}")),
389            },
390        );
391
392    // Display results
393    println!("\nReal-world dataset loading performance:");
394
395    let results = vec![
396        ("Titanic", &titanic_result),
397        ("California Housing", &housing_result),
398        ("Heart Disease", &heart_result),
399    ];
400
401    for (name, result) in results {
402        if result.success {
403            println!(
404                "  {}: {} ({} samples, {} features, {:.1} samples/s)",
405                name,
406                result.formatted_duration(),
407                result.samples,
408                result.features,
409                result.throughput
410            );
411        } else {
412            println!(
413                "  {}: Failed - {}",
414                name,
415                result
416                    .error
417                    .as_ref()
418                    .unwrap_or(&"Unknown error".to_string())
419            );
420        }
421    }
422
423    // Memory usage estimation
424    let total_samples = titanic_result.samples + housing_result.samples + heart_result.samples;
425    let total_features = titanic_result.features + housing_result.features + heart_result.features;
426    let estimated_memory_mb = (total_samples * total_features * 8) as f64 / (1024.0 * 1024.0);
427
428    println!("\nMemory usage estimate:");
429    println!("  Total samples: {total_samples}");
430    println!("  Total features: {total_features}");
431    println!("  Estimated memory: {estimated_memory_mb:.1} MB");
432
433    // Performance recommendations
434    println!("\nPerformance recommendations:");
435    if estimated_memory_mb > 100.0 {
436        println!("  • Consider using streaming for large datasets");
437        println!("  • Enable caching for frequently accessed datasets");
438    }
439    println!("  • Use train/test splitting to reduce memory usage");
440    println!("  • Apply feature selection to reduce dimensionality");
441
442    println!();
443    Ok(())
444}
445
/// Formats a count compactly with one decimal place.
///
/// * `n >= 1_000_000` is shown in millions with an `M` suffix.
/// * `n >= 1_000` is shown in thousands with a `K` suffix.
/// * Smaller values are printed unchanged.
///
/// Values just below one million that would round to `"1000.0K"` are promoted
/// to `"1.0M"`, so the thousands form never shows four integer digits.
// Not called by the demo flow itself; kept as a helper for `show_dataset_info`.
#[allow(dead_code)]
fn format_number(n: usize) -> String {
    if n >= 1_000_000 {
        return format!("{:.1}M", n as f64 / 1_000_000.0);
    }
    if n >= 1_000 {
        let thousands = format!("{:.1}K", n as f64 / 1_000.0);
        // Rounding to one decimal can carry 999.95+ up to 1000.0; switch units then.
        return if thousands == "1000.0K" {
            "1.0M".to_string()
        } else {
            thousands
        };
    }
    n.to_string()
}
457
458/// Demonstrate dataset information display
459#[allow(dead_code)]
460fn show_dataset_info(name: &str, dataset: &scirs2_datasets::utils::Dataset) {
461    println!("{name}:");
462    println!("  Samples: {}", format_number(dataset.n_samples()));
463    println!("  Features: {}", dataset.n_features());
464    println!(
465        "  Task: {}",
466        dataset
467            .metadata
468            .get("task_type")
469            .unwrap_or(&"Unknown".to_string())
470    );
471
472    if let Some(source) = dataset.metadata.get("source") {
473        println!("  Source: {source}");
474    }
475
476    if dataset.target.is_some() {
477        println!("  Supervised: Yes");
478    } else {
479        println!("  Supervised: No");
480    }
481}
real_world_datasets/real_world_datasets.rs

real_world_datasets/
real_world_datasets.rs