use scirs2_datasets::{
    list_real_world_datasets, load_adult, load_california_housing, load_heart_disease,
    load_red_wine_quality, load_titanic,
    utils::{k_fold_split, train_test_split},
    BenchmarkRunner, MLPipeline, RealWorldConfig,
};
use statrs::statistics::Statistics;
use std::collections::HashMap;

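/// Walks through the real-world dataset catalog, the classification, regression,
/// and healthcare loaders, advanced dataset operations, and a loading benchmark.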
#[allow(dead_code)]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("Real-World Datasets Demonstration");
    println!("====================================\n");

    demonstrate_dataset_catalog();

    demonstrate_classification_datasets()?;
    demonstrate_regression_datasets()?;
    demonstrate_healthcare_datasets()?;

    demonstrate_advanced_operations()?;

    demonstrate_performance_comparison()?;

    println!("\nReal-world datasets demonstration completed!");
    Ok(())
}

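/// Lists the available real-world datasets and groups them by task type
/// (classification, regression, time series, healthcare, financial).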
#[allow(dead_code)]
fn demonstrate_dataset_catalog() {
    println!("AVAILABLE REAL-WORLD DATASETS");
    println!("{}", "-".repeat(40));

    let datasets = list_real_world_datasets();

    let mut classification = Vec::new();
    let mut regression = Vec::new();
    let mut time_series = Vec::new();
    let mut healthcare = Vec::new();
    let mut financial = Vec::new();

    for dataset in &datasets {
        match dataset.as_str() {
            "adult" | "bank_marketing" | "credit_approval" | "german_credit" | "mushroom"
            | "spam" | "titanic" => classification.push(dataset),
            "auto_mpg" | "california_housing" | "concrete_strength" | "energy_efficiency"
            | "red_wine_quality" | "white_wine_quality" => regression.push(dataset),
            "air_passengers" | "bitcoin_prices" | "electricity_load" | "stock_prices" => {
                time_series.push(dataset)
            }
            "diabetes_readmission" | "heart_disease" => healthcare.push(dataset),
            "credit_card_fraud" | "loan_default" => financial.push(dataset),
            _ => {}
        }
    }

    println!("Classification Datasets ({}):", classification.len());
    for dataset in classification {
        println!(" • {dataset}");
    }

    println!("\nRegression Datasets ({}):", regression.len());
    for dataset in regression {
        println!(" • {dataset}");
    }

    println!("\nTime Series Datasets ({}):", time_series.len());
    for dataset in time_series {
        println!(" • {dataset}");
    }

    println!("\nHealthcare Datasets ({}):", healthcare.len());
    for dataset in healthcare {
        println!(" • {dataset}");
    }

    println!("\nFinancial Datasets ({}):", financial.len());
    for dataset in financial {
        println!(" • {dataset}");
    }

    println!(
        "\nTotal: {} real-world datasets available\n",
        datasets.len()
    );
}

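/// Loads the Titanic dataset, summarizes its class balance and survival rate,
/// performs a train/test split, and attempts to load the Adult (census income)
/// dataset, which may require a download.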
#[allow(dead_code)]
fn demonstrate_classification_datasets() -> Result<(), Box<dyn std::error::Error>> {
    println!("CLASSIFICATION DATASETS");
    println!("{}", "-".repeat(40));

    println!("Loading Titanic dataset...");
    let titanic = load_titanic()?;

    println!("Titanic Dataset:");
    println!(
        " Description: {}",
        titanic
            .metadata
            .get("description")
            .unwrap_or(&"Unknown".to_string())
    );
    println!(" Samples: {}", titanic.n_samples());
    println!(" Features: {}", titanic.n_features());

    if let Some(featurenames) = titanic.featurenames() {
        println!(" Features: {featurenames:?}");
    }

    if let Some(targetnames) = titanic.targetnames() {
        println!(" Classes: {targetnames:?}");
    }

    if let Some(target) = &titanic.target {
        let mut class_counts = HashMap::new();
        for &class in target.iter() {
            *class_counts.entry(class as i32).or_insert(0) += 1;
        }
        println!(" Class distribution: {class_counts:?}");

        let survived = class_counts.get(&1).unwrap_or(&0);
        let total = titanic.n_samples();
        println!(
            " Survival rate: {:.1}%",
            (*survived as f64 / total as f64) * 100.0
        );
    }

    let (train, test) = train_test_split(&titanic, 0.2, Some(42))?;
    println!(
        " Train/test split: {} train, {} test",
        train.n_samples(),
        test.n_samples()
    );

    println!("\nLoading Adult (Census Income) dataset...");
    match load_adult() {
        Ok(adult) => {
            println!("Adult Dataset:");
            println!(
                " Description: {}",
                adult
                    .metadata
                    .get("description")
                    .unwrap_or(&"Unknown".to_string())
            );
            println!(" Samples: {}", adult.n_samples());
            println!(" Features: {}", adult.n_features());
            println!(" Task: Predict income >$50K based on census data");
        }
        Err(e) => {
            println!(" Note: Adult dataset requires download: {e}");
            println!(" This is expected for the demonstration");
        }
    }

    println!();
    Ok(())
}

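/// Loads the California Housing and Red Wine Quality datasets and prints summary
/// statistics for their regression targets.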
#[allow(dead_code)]
fn demonstrate_regression_datasets() -> Result<(), Box<dyn std::error::Error>> {
    println!("REGRESSION DATASETS");
    println!("{}", "-".repeat(40));

    println!("Loading California Housing dataset...");
    let housing = load_california_housing()?;

    println!("California Housing Dataset:");
    println!(
        " Description: {}",
        housing
            .metadata
            .get("description")
            .unwrap_or(&"Unknown".to_string())
    );
    println!(" Samples: {}", housing.n_samples());
    println!(" Features: {}", housing.n_features());

    if let Some(featurenames) = housing.featurenames() {
        println!(" Features: {featurenames:?}");
    }

    if let Some(target) = &housing.target {
        let mean = target.mean().unwrap_or(0.0);
        let std = target.std(0.0);
        let min = target.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let max = target.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

        println!(" Target (house value) statistics:");
        println!(" Mean: {mean:.2} (hundreds of thousands)");
        println!(" Std: {std:.2}");
        println!(" Range: [{min:.2}, {max:.2}]");
    }

    println!("\nLoading Red Wine Quality dataset...");
    let wine = load_red_wine_quality()?;

    println!("Red Wine Quality Dataset:");
    println!(
        " Description: {}",
        wine.metadata
            .get("description")
            .unwrap_or(&"Unknown".to_string())
    );
    println!(" Samples: {}", wine.n_samples());
    println!(" Features: {}", wine.n_features());

    if let Some(target) = &wine.target {
        let mean_quality = target.mean().unwrap_or(0.0);
        println!(" Average wine quality: {mean_quality:.1}/10");

        let mut quality_counts = HashMap::new();
        for &quality in target.iter() {
            let q = quality.round() as i32;
            *quality_counts.entry(q).or_insert(0) += 1;
        }
        println!(" Quality distribution: {quality_counts:?}");
    }

    println!();
    Ok(())
}

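/// Loads the Heart Disease dataset, reports disease prevalence, and prints basic
/// statistics for one clinical feature (age).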
#[allow(dead_code)]
fn demonstrate_healthcare_datasets() -> Result<(), Box<dyn std::error::Error>> {
    println!("HEALTHCARE DATASETS");
    println!("{}", "-".repeat(40));

    println!("Loading Heart Disease dataset...");
    let heart = load_heart_disease()?;

    println!("Heart Disease Dataset:");
    println!(
        " Description: {}",
        heart
            .metadata
            .get("description")
            .unwrap_or(&"Unknown".to_string())
    );
    println!(" Samples: {}", heart.n_samples());
    println!(" Features: {}", heart.n_features());

    if let Some(featurenames) = heart.featurenames() {
        println!(" Clinical features: {:?}", &featurenames[..5]);
        println!(" ... and {} more features", featurenames.len() - 5);
    }

    if let Some(target) = &heart.target {
        let mut disease_counts = HashMap::new();
        for &disease in target.iter() {
            *disease_counts.entry(disease as i32).or_insert(0) += 1;
        }

        let with_disease = disease_counts.get(&1).unwrap_or(&0);
        let total = heart.n_samples();
        println!(
            " Disease prevalence: {:.1}% ({}/{})",
            (*with_disease as f64 / total as f64) * 100.0,
            with_disease,
            total
        );
    }

    println!(" Sample clinical parameter ranges:");
    let age_col = heart.data.column(0);
    // `mean()` on a column view returns an Option, so fall back to 0.0 before
    // formatting, matching the other target statistics above.
    let age_mean = age_col.mean().unwrap_or(0.0);
    let age_std = age_col.std(0.0);
    println!(" Age: {age_mean:.1} ± {age_std:.1} years");

    println!();
    Ok(())
}

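/// Demonstrates a small preprocessing pipeline (train/test split, standardization,
/// k-fold CV) on California Housing and a custom `RealWorldConfig`.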
#[allow(dead_code)]
fn demonstrate_advanced_operations() -> Result<(), Box<dyn std::error::Error>> {
    println!("ADVANCED DATASET OPERATIONS");
    println!("{}", "-".repeat(40));

    let housing = load_california_housing()?;

    println!("Preprocessing pipeline for California Housing:");

    let (mut train, test) = train_test_split(&housing, 0.2, Some(42))?;
    println!(
        " 1. Split: {} train, {} test",
        train.n_samples(),
        test.n_samples()
    );

    let mut pipeline = MLPipeline::default();
    train = pipeline.prepare_dataset(&train)?;
    println!(" 2. Standardized features");

    let cv_folds = k_fold_split(train.n_samples(), 5, true, Some(42))?;
    println!(" 3. Created {} CV folds", cv_folds.len());

    println!(" 4. Feature analysis:");
    println!(" • {} numerical features", train.n_features());
    println!(" • Ready for machine learning models");

    println!("\nCustom dataset loading configuration:");
    let config = RealWorldConfig {
        use_cache: true,
        download_if_missing: false,
        return_preprocessed: true,
        subset: Some("small".to_string()),
        random_state: Some(42),
        ..Default::default()
    };

    println!(" • Caching: {}", config.use_cache);
    println!(" • Download missing: {}", config.download_if_missing);
    println!(" • Preprocessed: {}", config.return_preprocessed);
    println!(" • Subset: {:?}", config.subset);

    println!();
    Ok(())
}

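/// Benchmarks loading three datasets, estimates their combined memory footprint,
/// and prints performance recommendations.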
#[allow(dead_code)]
fn demonstrate_performance_comparison() -> Result<(), Box<dyn std::error::Error>> {
    println!("PERFORMANCE COMPARISON");
    println!("{}", "-".repeat(40));

    let runner = BenchmarkRunner::new().with_iterations(3).with_warmup(1);

    println!("Benchmarking real-world dataset operations...");

    let titanic_params = HashMap::from([("dataset".to_string(), "titanic".to_string())]);
    let titanic_result =
        runner.run_benchmark("load_titanic", titanic_params, || match load_titanic() {
            Ok(dataset) => Ok((dataset.n_samples(), dataset.n_features())),
            Err(e) => Err(format!("Failed to load Titanic: {e}")),
        });

    let housing_params = HashMap::from([("dataset".to_string(), "california_housing".to_string())]);
    let housing_result = runner.run_benchmark("load_california_housing", housing_params, || {
        match load_california_housing() {
            Ok(dataset) => Ok((dataset.n_samples(), dataset.n_features())),
            Err(e) => Err(format!("Failed to load California Housing: {e}")),
        }
    });

    let heart_params = HashMap::from([("dataset".to_string(), "heart_disease".to_string())]);
    let heart_result = runner.run_benchmark(
        "load_heart_disease",
        heart_params,
        || match load_heart_disease() {
            Ok(dataset) => Ok((dataset.n_samples(), dataset.n_features())),
            Err(e) => Err(format!("Failed to load Heart Disease: {e}")),
        },
    );

    println!("\nReal-world dataset loading performance:");

    let results = vec![
        ("Titanic", &titanic_result),
        ("California Housing", &housing_result),
        ("Heart Disease", &heart_result),
    ];

    for (name, result) in results {
        if result.success {
            println!(
                " {}: {} ({} samples, {} features, {:.1} samples/s)",
                name,
                result.formatted_duration(),
                result.samples,
                result.features,
                result.throughput
            );
        } else {
            println!(
                " {}: Failed - {}",
                name,
                result
                    .error
                    .as_ref()
                    .unwrap_or(&"Unknown error".to_string())
            );
        }
    }

    // Rough memory estimate assuming dense f64 storage (8 bytes per value),
    // summed per dataset so feature counts are not mixed across datasets.
    let total_samples = titanic_result.samples + housing_result.samples + heart_result.samples;
    let total_features = titanic_result.features + housing_result.features + heart_result.features;
    let estimated_bytes = (titanic_result.samples * titanic_result.features
        + housing_result.samples * housing_result.features
        + heart_result.samples * heart_result.features)
        * 8;
    let estimated_memory_mb = estimated_bytes as f64 / (1024.0 * 1024.0);

    println!("\nMemory usage estimate:");
    println!(" Total samples: {total_samples}");
    println!(" Total features: {total_features}");
    println!(" Estimated memory: {estimated_memory_mb:.1} MB");

    println!("\nPerformance recommendations:");
    if estimated_memory_mb > 100.0 {
        println!(" • Consider using streaming for large datasets");
        println!(" • Enable caching for frequently accessed datasets");
    }
    println!(" • Use train/test splitting to reduce memory usage");
    println!(" • Apply feature selection to reduce dimensionality");

    println!();
    Ok(())
}

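/// Formats a count with a K/M suffix for compact display.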
#[allow(dead_code)]
fn format_number(n: usize) -> String {
    if n >= 1_000_000 {
        format!("{:.1}M", n as f64 / 1_000_000.0)
    } else if n >= 1_000 {
        format!("{:.1}K", n as f64 / 1_000.0)
    } else {
        n.to_string()
    }
}

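/// Prints a compact summary (size, task type, source, supervision) for a loaded
/// dataset.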
#[allow(dead_code)]
fn show_dataset_info(name: &str, dataset: &scirs2_datasets::utils::Dataset) {
    println!("{name}:");
    println!(" Samples: {}", format_number(dataset.n_samples()));
    println!(" Features: {}", dataset.n_features());
    println!(
        " Task: {}",
        dataset
            .metadata
            .get("task_type")
            .unwrap_or(&"Unknown".to_string())
    );

    if let Some(source) = dataset.metadata.get("source") {
        println!(" Source: {source}");
    }

    if dataset.target.is_some() {
        println!(" Supervised: Yes");
    } else {
        println!(" Supervised: No");
    }
}
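
/// Minimal usage sketch for the two helpers above (not called from `main`).
/// Assumes `load_titanic` returns the `scirs2_datasets::utils::Dataset` type that
/// `show_dataset_info` expects, as it is used elsewhere in this example.
#[allow(dead_code)]
fn demo_helpers() -> Result<(), Box<dyn std::error::Error>> {
    let titanic = load_titanic()?;
    show_dataset_info("Titanic", &titanic);
    println!(" Samples (formatted): {}", format_number(titanic.n_samples()));
    Ok(())
}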