1use std::collections::HashMap;
10use std::io::{self, Write};
11
12use scirs2_core::ndarray::Array2;
13use serde::{Deserialize, Serialize};
14
15use crate::error::{DatasetsError, Result};
16use crate::utils::Dataset;
17
/// Settings that control how a `DatasetExplorer` analyses and renders a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExploreConfig {
    /// Renderer selected by `DatasetExplorer::display_summary`.
    pub output_format: OutputFormat,
    /// NOTE(review): presumably the number of decimal places used when printing
    /// values — confirm against the display code.
    pub precision: usize,
    /// When `true`, feature statistics also include the pairwise correlation matrix.
    pub show_detailed_stats: bool,
    /// Largest number of distinct values that is still listed explicitly for a
    /// categorical or binary feature; above this only the count is kept.
    pub max_unique_values: usize,
    /// Must be `true` for `DatasetExplorer::interactive_explore` to run.
    pub interactive: bool,
}
32
33impl Default for ExploreConfig {
34 fn default() -> Self {
35 Self {
36 output_format: OutputFormat::Table,
37 precision: 3,
38 show_detailed_stats: true,
39 max_unique_values: 20,
40 interactive: false,
41 }
42 }
43}
44
/// Output renderings understood by `DatasetExplorer::display_summary`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum OutputFormat {
    /// Plain-text sections printed to stdout.
    Table,
    /// Pretty-printed JSON of the whole summary.
    Json,
    /// One CSV row per feature.
    Csv,
    /// Markdown overview plus a per-feature table.
    Markdown,
}
57
/// Complete result of `DatasetExplorer::summarize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSummary {
    /// Size, names and description of the dataset.
    pub info: DatasetInfo,
    /// Per-feature statistics and, optionally, feature correlations.
    pub statistics: FeatureStatistics,
    /// Counts and patterns of missing (NaN) entries.
    pub missingdata: MissingDataAnalysis,
    /// Analysis of the target column; `None` when the dataset has no target.
    pub targetanalysis: Option<TargetAnalysis>,
    /// Quality score with detected issues and recommendations.
    pub quality_assessment: QualityAssessment,
}
72
/// Basic facts about a dataset, as gathered by `collect_basic_info`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetInfo {
    /// Number of samples reported by the dataset.
    pub n_samples: usize,
    /// Number of features reported by the dataset.
    pub n_features: usize,
    /// Copy of the dataset's feature names, if it has any.
    pub featurenames: Option<Vec<String>>,
    /// Copy of the dataset's target names, if it has any.
    pub targetnames: Option<Vec<String>>,
    /// Copy of the dataset's description, if it has one.
    pub description: Option<String>,
    /// Estimated size in bytes: one `f64` per data cell plus one per target entry.
    pub memory_usage: usize,
}
89
/// Statistics for every feature column of a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStatistics {
    /// One entry per data column, in column order.
    pub features: Vec<FeatureStats>,
    /// Feature-by-feature correlation matrix; only computed when
    /// `ExploreConfig::show_detailed_stats` is set.
    pub correlations: Option<Array2<f64>>,
}
98
/// Descriptive statistics of a single column (a feature or the target).
///
/// NaN entries count as missing; all numeric statistics are computed over the
/// remaining values and are `None` when no such value exists.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStats {
    /// Column name (`feature_{i}` when the dataset supplies none, `target` for the target).
    pub name: String,
    /// Column index passed in by the caller (0 for the target).
    pub index: usize,
    /// Data type inferred from the non-missing values.
    pub data_type: InferredDataType,
    /// Total number of entries, including missing ones.
    pub count: usize,
    /// Arithmetic mean.
    pub mean: Option<f64>,
    /// Population standard deviation (divides by the number of valid values).
    pub std: Option<f64>,
    /// Smallest value.
    pub min: Option<f64>,
    /// Largest value.
    pub max: Option<f64>,
    /// 50th percentile, linearly interpolated.
    pub median: Option<f64>,
    /// 25th percentile, linearly interpolated.
    pub q25: Option<f64>,
    /// 75th percentile, linearly interpolated.
    pub q75: Option<f64>,
    /// Number of distinct values; only set for categorical and binary columns.
    pub unique_count: Option<usize>,
    /// Distinct values formatted without decimals; only set when their number
    /// does not exceed `ExploreConfig::max_unique_values`.
    pub unique_values: Option<Vec<String>>,
    /// Number of NaN entries.
    pub missing_count: usize,
}
131
/// Column type guessed by `infer_data_type` from the non-missing values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum InferredDataType {
    /// At least one non-integer value, or more than 20 distinct integer values.
    Numerical,
    /// Integer-valued with 3 to 20 distinct values.
    Categorical,
    /// Integer-valued with exactly 2 distinct values.
    Binary,
    /// No valid values, or a single distinct integer value.
    Unknown,
}
144
/// Overview of missing (NaN) entries in the feature matrix.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingDataAnalysis {
    /// Number of NaN cells across all features.
    pub total_missing: usize,
    /// `total_missing` as a percentage of all cells (samples x features).
    pub missing_percentage: f64,
    /// Per feature: (name, NaN count, NaN count as a percentage of the samples).
    pub feature_missing: Vec<(String, usize, f64)>,
    /// The most frequent row-wise missingness patterns (at most 10).
    pub missing_patterns: Vec<MissingPattern>,
}
157
/// A row-wise combination of missing and present features and how often it occurs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingPattern {
    /// One flag per feature; `true` where the value is NaN.
    pub pattern: Vec<bool>,
    /// Number of rows showing this pattern.
    pub count: usize,
    /// `count` as a percentage of all samples.
    pub percentage: f64,
}
168
/// Analysis of the dataset's target column.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TargetAnalysis {
    /// Descriptive statistics of the target values.
    pub target_stats: FeatureStats,
    /// Occurrences per class label (value formatted without decimals); only set
    /// when the target is inferred to be categorical or binary.
    pub class_distribution: Option<HashMap<String, usize>>,
    /// (feature name, correlation with the target), strongest absolute correlation first.
    pub correlations_with_features: Vec<(String, f64)>,
}
179
/// Outcome of the data-quality checks run by `assess_quality`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityAssessment {
    /// Starts at 100, is reduced per detected issue and never drops below 0.
    pub quality_score: f64,
    /// Issues that were detected.
    pub issues: Vec<QualityIssue>,
    /// Suggested actions derived from `issues`.
    pub recommendations: Vec<String>,
}
190
/// A single data-quality problem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    /// Category of the problem.
    pub issue_type: IssueType,
    /// How serious the problem is.
    pub severity: Severity,
    /// Human-readable summary shown in the quality report.
    pub description: String,
    /// Names of the affected features (for correlation issues: `"a - b"` pairs).
    pub affected_features: Vec<String>,
}
203
/// Categories of data-quality issues.
///
/// NOTE(review): the quality checks visible in this file only ever produce
/// `MissingData`, `LowVariance` and `HighCorrelation`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueType {
    /// A noticeable share of the values is missing.
    MissingData,
    /// Outlying values.
    Outliers,
    /// Duplicated entries.
    Duplicates,
    /// A feature with (almost) no variation.
    LowVariance,
    /// Two features that are strongly correlated.
    HighCorrelation,
    /// Unevenly represented target classes.
    ImbalancedClasses,
    /// A strongly skewed value distribution.
    SkewedDistribution,
}
222
/// Severity attached to a `QualityIssue`, from least to most serious.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Severity {
    /// Minor issue.
    Low,
    /// Moderate issue.
    Medium,
    /// Serious issue.
    High,
    /// Most serious level.
    Critical,
}
235
/// Computes and displays summaries of a `Dataset` according to an `ExploreConfig`.
pub struct DatasetExplorer {
    /// Settings applied by every analysis and display method.
    config: ExploreConfig,
}
240
241impl DatasetExplorer {
/// Creates an explorer that uses the given configuration.
pub fn new(config: ExploreConfig) -> Self {
    Self { config }
}
246
247 pub fn default_config() -> Self {
249 Self::new(ExploreConfig::default())
250 }
251
252 pub fn summarize(&self, dataset: &Dataset) -> Result<DatasetSummary> {
254 let info = self.collect_basic_info(dataset);
255 let statistics = self.compute_feature_statistics(dataset)?;
256 let missingdata = self.analyze_missingdata(dataset);
257 let targetanalysis = self.analyze_target(dataset)?;
258 let quality_assessment = self.assess_quality(dataset, &statistics, &missingdata)?;
259
260 Ok(DatasetSummary {
261 info,
262 statistics,
263 missingdata,
264 targetanalysis,
265 quality_assessment,
266 })
267 }
268
269 pub fn display_summary(&self, summary: &DatasetSummary) -> Result<()> {
271 match self.config.output_format {
272 OutputFormat::Table => self.display_table(summary),
273 OutputFormat::Json => self.display_json(summary),
274 OutputFormat::Csv => self.display_csv(summary),
275 OutputFormat::Markdown => self.display_markdown(summary),
276 }
277 }
278
279 pub fn interactive_explore(&self, dataset: &Dataset) -> Result<()> {
281 if !self.config.interactive {
282 return Err(DatasetsError::InvalidFormat(
283 "Interactive mode not enabled".to_string(),
284 ));
285 }
286
287 println!("š Interactive Dataset Explorer");
288 println!("==============================");
289
290 let summary = self.summarize(dataset)?;
291 self.display_basic_info(&summary.info);
292
293 loop {
294 println!("\nCommands:");
295 println!(" 1. Summary statistics");
296 println!(" 2. Feature details");
297 println!(" 3. Missing data analysis");
298 println!(" 4. Target analysis");
299 println!(" 5. Quality assessment");
300 println!(" 6. Export summary");
301 println!(" q. Quit");
302
303 print!("\nEnter command: ");
304 io::stdout().flush().unwrap();
305
306 let mut input = String::new();
307 io::stdin().read_line(&mut input).unwrap();
308 let input = input.trim();
309
310 match input {
311 "1" => self.display_statistics(&summary.statistics)?,
312 "2" => self.interactive_feature_details(dataset, &summary.statistics)?,
313 "3" => self.display_missingdata(&summary.missingdata)?,
314 "4" => {
315 if let Some(ref targetanalysis) = summary.targetanalysis {
316 self.display_targetanalysis(targetanalysis)?;
317 } else {
318 println!("No target variable found in dataset.");
319 }
320 }
321 "5" => self.display_quality_assessment(&summary.quality_assessment)?,
322 "6" => self.export_summary(&summary)?,
323 "q" | "quit" | "exit" => break,
324 _ => println!("Invalid command. Please try again."),
325 }
326 }
327
328 Ok(())
329 }
330
331 fn collect_basic_info(&self, dataset: &Dataset) -> DatasetInfo {
334 let n_samples = dataset.n_samples();
335 let n_features = dataset.n_features();
336
337 let data_size = n_samples * n_features * std::mem::size_of::<f64>();
339 let target_size = dataset
340 .target
341 .as_ref()
342 .map(|t| t.len() * std::mem::size_of::<f64>())
343 .unwrap_or(0);
344 let memory_usage = data_size + target_size;
345
346 DatasetInfo {
347 n_samples,
348 n_features,
349 featurenames: dataset.featurenames.clone(),
350 targetnames: dataset.targetnames.clone(),
351 description: dataset.description.clone(),
352 memory_usage,
353 }
354 }
355
356 fn compute_feature_statistics(&self, dataset: &Dataset) -> Result<FeatureStatistics> {
357 let mut features = Vec::new();
358
359 for (i, column) in dataset.data.columns().into_iter().enumerate() {
360 let name = dataset
361 .featurenames
362 .as_ref()
363 .and_then(|names| names.get(i))
364 .cloned()
365 .unwrap_or_else(|| format!("feature_{i}"));
366
367 let stats = self.compute_single_feature_stats(&name, i, &column)?;
368 features.push(stats);
369 }
370
371 let correlations = if self.config.show_detailed_stats {
373 Some(self.compute_correlation_matrix(dataset)?)
374 } else {
375 None
376 };
377
378 Ok(FeatureStatistics {
379 features,
380 correlations,
381 })
382 }
383
384 fn compute_single_feature_stats(
385 &self,
386 name: &str,
387 index: usize,
388 column: &scirs2_core::ndarray::ArrayView1<f64>,
389 ) -> Result<FeatureStats> {
390 let values: Vec<f64> = column.iter().copied().collect();
391 let count = values.len();
392 let missing_count = values.iter().filter(|&&x| x.is_nan()).count();
393 let valid_values: Vec<f64> = values.iter().copied().filter(|x| !x.is_nan()).collect();
394
395 let (mean, std, min, max, median, q25, q75) = if !valid_values.is_empty() {
396 let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
397
398 let variance = valid_values.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
399 / valid_values.len() as f64;
400 let std = variance.sqrt();
401
402 let mut sorted_values = valid_values.clone();
403 sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
404
405 let min = sorted_values.first().copied();
406 let max = sorted_values.last().copied();
407
408 let median = Self::percentile(&sorted_values, 0.5);
409 let q25 = Self::percentile(&sorted_values, 0.25);
410 let q75 = Self::percentile(&sorted_values, 0.75);
411
412 (Some(mean), Some(std), min, max, median, q25, q75)
413 } else {
414 (None, None, None, None, None, None, None)
415 };
416
417 let data_type = self.infer_data_type(&valid_values);
419
420 let (unique_count, unique_values) = if matches!(
422 data_type,
423 InferredDataType::Categorical | InferredDataType::Binary
424 ) {
425 let mut unique: std::collections::HashSet<String> = std::collections::HashSet::new();
426 for &value in &valid_values {
427 unique.insert(format!("{value:.0}"));
428 }
429
430 let unique_count = unique.len();
431 let unique_values = if unique_count <= self.config.max_unique_values {
432 let mut values: Vec<String> = unique.into_iter().collect();
433 values.sort();
434 Some(values)
435 } else {
436 None
437 };
438
439 (Some(unique_count), unique_values)
440 } else {
441 (None, None)
442 };
443
444 Ok(FeatureStats {
445 name: name.to_string(),
446 index,
447 data_type,
448 count,
449 mean,
450 std,
451 min,
452 max,
453 median,
454 q25,
455 q75,
456 unique_count,
457 unique_values,
458 missing_count,
459 })
460 }
461
/// Returns the `p`-quantile of an ascending-sorted slice using linear
/// interpolation between the two nearest ranks.
///
/// `p` is a fraction: `0.0` is the minimum, `0.5` the median and `1.0` the
/// maximum. Values outside `[0, 1]` are clamped to that range and NaN is
/// treated as `0.0`, so the function never indexes out of bounds.
///
/// Returns `None` for an empty slice.
fn percentile(sorted_values: &[f64], p: f64) -> Option<f64> {
    let last = sorted_values.len().checked_sub(1)?;

    let fraction = if p.is_nan() { 0.0 } else { p.clamp(0.0, 1.0) };
    let position = fraction * last as f64;
    let lower = position.floor() as usize;
    // `min` guards against rounding pushing the upper rank past the end.
    let upper = (position.ceil() as usize).min(last);

    if lower == upper {
        Some(sorted_values[lower])
    } else {
        let weight = position - lower as f64;
        Some(sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight)
    }
}
478
479 fn infer_data_type(&self, values: &[f64]) -> InferredDataType {
480 if values.is_empty() {
481 return InferredDataType::Unknown;
482 }
483
484 let all_integers = values.iter().all(|&x| x.fract() == 0.0);
486
487 if all_integers {
488 let unique_values: std::collections::HashSet<i64> =
489 values.iter().map(|&x| x as i64).collect();
490
491 match unique_values.len() {
492 1 => InferredDataType::Unknown, 2 => InferredDataType::Binary,
494 3..=20 => InferredDataType::Categorical,
495 _ => InferredDataType::Numerical,
496 }
497 } else {
498 InferredDataType::Numerical
499 }
500 }
501
502 fn compute_correlation_matrix(&self, dataset: &Dataset) -> Result<Array2<f64>> {
503 let n_features = dataset.n_features();
504 let mut correlations = Array2::zeros((n_features, n_features));
505
506 for i in 0..n_features {
507 for j in 0..n_features {
508 if i == j {
509 correlations[[i, j]] = 1.0;
510 } else {
511 let col_i = dataset.data.column(i);
512 let col_j = dataset.data.column(j);
513
514 let corr = self.compute_correlation(&col_i, &col_j);
515 correlations[[i, j]] = corr;
516 }
517 }
518 }
519
520 Ok(correlations)
521 }
522
523 fn compute_correlation(
524 &self,
525 x: &scirs2_core::ndarray::ArrayView1<f64>,
526 y: &scirs2_core::ndarray::ArrayView1<f64>,
527 ) -> f64 {
528 let x_vals: Vec<f64> = x.iter().copied().filter(|v| !v.is_nan()).collect();
529 let y_vals: Vec<f64> = y.iter().copied().filter(|v| !v.is_nan()).collect();
530
531 if x_vals.len() != y_vals.len() || x_vals.len() < 2 {
532 return 0.0;
533 }
534
535 let mean_x = x_vals.iter().sum::<f64>() / x_vals.len() as f64;
536 let mean_y = y_vals.iter().sum::<f64>() / y_vals.len() as f64;
537
538 let mut numerator = 0.0;
539 let mut sum_sq_x = 0.0;
540 let mut sum_sq_y = 0.0;
541
542 for (x_val, y_val) in x_vals.iter().zip(y_vals.iter()) {
543 let dx = x_val - mean_x;
544 let dy = y_val - mean_y;
545
546 numerator += dx * dy;
547 sum_sq_x += dx * dx;
548 sum_sq_y += dy * dy;
549 }
550
551 let denominator = (sum_sq_x * sum_sq_y).sqrt();
552
553 if denominator == 0.0 {
554 0.0
555 } else {
556 numerator / denominator
557 }
558 }
559
560 fn analyze_missingdata(&self, dataset: &Dataset) -> MissingDataAnalysis {
561 let n_samples = dataset.n_samples();
562 let n_features = dataset.n_features();
563 let total_values = n_samples * n_features;
564
565 let mut total_missing = 0;
566 let mut feature_missing = Vec::new();
567
568 for (i, column) in dataset.data.columns().into_iter().enumerate() {
569 let missing_count = column.iter().filter(|&&x| x.is_nan()).count();
570 total_missing += missing_count;
571
572 let featurename = dataset
573 .featurenames
574 .as_ref()
575 .and_then(|names| names.get(i))
576 .cloned()
577 .unwrap_or_else(|| format!("feature_{i}"));
578
579 let missing_percentage = missing_count as f64 / n_samples as f64 * 100.0;
580 feature_missing.push((featurename, missing_count, missing_percentage));
581 }
582
583 let missing_percentage = total_missing as f64 / total_values as f64 * 100.0;
584
585 let missing_patterns = self.analyze_missing_patterns(dataset);
587
588 MissingDataAnalysis {
589 total_missing,
590 missing_percentage,
591 feature_missing,
592 missing_patterns,
593 }
594 }
595
596 fn analyze_missing_patterns(&self, dataset: &Dataset) -> Vec<MissingPattern> {
597 let mut pattern_counts: HashMap<Vec<bool>, usize> = HashMap::new();
598
599 for row in dataset.data.rows() {
600 let pattern: Vec<bool> = row.iter().map(|&x| x.is_nan()).collect();
601 *pattern_counts.entry(pattern).or_insert(0) += 1;
602 }
603
604 let total_samples = dataset.n_samples() as f64;
605 let mut patterns: Vec<MissingPattern> = pattern_counts
606 .into_iter()
607 .map(|(pattern, count)| MissingPattern {
608 pattern,
609 count,
610 percentage: count as f64 / total_samples * 100.0,
611 })
612 .collect();
613
614 patterns.sort_by(|a, b| b.count.cmp(&a.count));
616
617 patterns.truncate(10);
619
620 patterns
621 }
622
623 fn analyze_target(&self, dataset: &Dataset) -> Result<Option<TargetAnalysis>> {
624 let target = match &dataset.target {
625 Some(target) => target,
626 None => return Ok(None),
627 };
628
629 let target_column = target.view();
630 let target_stats = self.compute_single_feature_stats("target", 0, &target_column)?;
631
632 let class_distribution = if matches!(
634 target_stats.data_type,
635 InferredDataType::Categorical | InferredDataType::Binary
636 ) {
637 let mut distribution = HashMap::new();
638 for &value in target.iter() {
639 if !value.is_nan() {
640 let classname = format!("{value:.0}");
641 *distribution.entry(classname).or_insert(0) += 1;
642 }
643 }
644 Some(distribution)
645 } else {
646 None
647 };
648
649 let mut correlations_with_features = Vec::new();
651 for (i, column) in dataset.data.columns().into_iter().enumerate() {
652 let featurename = dataset
653 .featurenames
654 .as_ref()
655 .and_then(|names| names.get(i))
656 .cloned()
657 .unwrap_or_else(|| format!("feature_{i}"));
658
659 let correlation = self.compute_correlation(&column, &target_column);
660 correlations_with_features.push((featurename, correlation));
661 }
662
663 correlations_with_features.sort_by(|a, b| b.1.abs().partial_cmp(&a.1.abs()).unwrap());
665
666 Ok(Some(TargetAnalysis {
667 target_stats,
668 class_distribution,
669 correlations_with_features,
670 }))
671 }
672
673 fn assess_quality(
674 &self,
675 _dataset: &Dataset,
676 statistics: &FeatureStatistics,
677 missingdata: &MissingDataAnalysis,
678 ) -> Result<QualityAssessment> {
679 let mut issues = Vec::new();
680 let mut quality_score = 100.0;
681
682 if missingdata.missing_percentage > 5.0 {
684 let severity = if missingdata.missing_percentage > 20.0 {
685 Severity::High
686 } else if missingdata.missing_percentage > 10.0 {
687 Severity::Medium
688 } else {
689 Severity::Low
690 };
691
692 issues.push(QualityIssue {
693 issue_type: IssueType::MissingData,
694 severity,
695 description: format!("{:.1}% of _data is missing", missingdata.missing_percentage),
696 affected_features: missingdata
697 .feature_missing
698 .iter()
699 .filter(|(_, _, pct)| *pct > 5.0)
700 .map(|(name, _, _)| name.clone())
701 .collect(),
702 });
703
704 quality_score -= missingdata.missing_percentage.min(30.0);
705 }
706
707 let low_variance_features: Vec<String> = statistics
709 .features
710 .iter()
711 .filter(|f| f.std.is_some_and(|std| std < 1e-6))
712 .map(|f| f.name.clone())
713 .collect();
714
715 if !low_variance_features.is_empty() {
716 issues.push(QualityIssue {
717 issue_type: IssueType::LowVariance,
718 severity: Severity::Medium,
719 description: format!(
720 "{} features have very low variance",
721 low_variance_features.len()
722 ),
723 affected_features: low_variance_features,
724 });
725
726 quality_score -= 10.0;
727 }
728
729 if let Some(ref correlations) = statistics.correlations {
731 let mut high_corr_pairs = Vec::new();
732 for i in 0..correlations.nrows() {
733 for j in (i + 1)..correlations.ncols() {
734 if correlations[[i, j]].abs() > 0.9 {
735 let name_i = statistics.features[i].name.clone();
736 let name_j = statistics.features[j].name.clone();
737 high_corr_pairs.push(format!("{name_i} - {name_j}"));
738 }
739 }
740 }
741
742 if !high_corr_pairs.is_empty() {
743 issues.push(QualityIssue {
744 issue_type: IssueType::HighCorrelation,
745 severity: Severity::Medium,
746 description: format!(
747 "{} highly correlated feature pairs found",
748 high_corr_pairs.len()
749 ),
750 affected_features: high_corr_pairs,
751 });
752
753 quality_score -= 5.0;
754 }
755 }
756
757 let recommendations = self.generate_recommendations(&issues);
758
759 Ok(QualityAssessment {
760 quality_score: quality_score.max(0.0),
761 issues,
762 recommendations,
763 })
764 }
765
766 fn generate_recommendations(&self, issues: &[QualityIssue]) -> Vec<String> {
767 let mut recommendations = Vec::new();
768
769 for issue in issues {
770 match issue.issue_type {
771 IssueType::MissingData => {
772 recommendations.push("Consider imputation strategies for missing data or remove features with excessive missing values".to_string());
773 }
774 IssueType::LowVariance => {
775 recommendations.push(
776 "Remove low variance features as they provide little information"
777 .to_string(),
778 );
779 }
780 IssueType::HighCorrelation => {
781 recommendations.push("Consider removing redundant highly correlated features or use dimensionality reduction".to_string());
782 }
783 _ => {}
784 }
785 }
786
787 if recommendations.is_empty() {
788 recommendations.push("Dataset appears to be of good quality".to_string());
789 }
790
791 recommendations
792 }
793
794 fn display_table(&self, summary: &DatasetSummary) -> Result<()> {
797 self.display_basic_info(&summary.info);
798 self.display_statistics(&summary.statistics)?;
799 self.display_missingdata(&summary.missingdata)?;
800
801 if let Some(ref targetanalysis) = summary.targetanalysis {
802 self.display_targetanalysis(targetanalysis)?;
803 }
804
805 self.display_quality_assessment(&summary.quality_assessment)?;
806
807 Ok(())
808 }
809
810 fn display_basic_info(&self, info: &DatasetInfo) {
811 println!("š Dataset Overview");
812 println!("==================");
813 println!("Samples: {}", info.n_samples);
814 println!("Features: {}", info.n_features);
815 println!(
816 "Memory usage: {:.2} MB",
817 info.memory_usage as f64 / 1_048_576.0
818 );
819
820 if let Some(ref description) = info.description {
821 println!("Description: {description}");
822 }
823
824 println!();
825 }
826
827 fn display_statistics(&self, statistics: &FeatureStatistics) -> Result<()> {
828 println!("š Feature Statistics");
829 println!("====================");
830
831 println!(
833 "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
834 "Feature", "Type", "Mean", "Std", "Min", "Max", "Missing"
835 );
836 let separator = "-".repeat(80);
837 println!("{separator}");
838
839 for feature in &statistics.features {
840 let type_str = match feature.data_type {
841 InferredDataType::Numerical => "num",
842 InferredDataType::Categorical => "cat",
843 InferredDataType::Binary => "bin",
844 InferredDataType::Unknown => "unk",
845 };
846
847 println!(
848 "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
849 feature.name.chars().take(15).collect::<String>(),
850 type_str,
851 feature
852 .mean
853 .map(|x| format!("{x:.3}"))
854 .unwrap_or_else(|| "-".to_string()),
855 feature
856 .std
857 .map(|x| format!("{x:.3}"))
858 .unwrap_or_else(|| "-".to_string()),
859 feature
860 .min
861 .map(|x| format!("{x:.3}"))
862 .unwrap_or_else(|| "-".to_string()),
863 feature
864 .max
865 .map(|x| format!("{x:.3}"))
866 .unwrap_or_else(|| "-".to_string()),
867 feature.missing_count
868 );
869 }
870
871 println!();
872 Ok(())
873 }
874
875 fn display_missingdata(&self, missingdata: &MissingDataAnalysis) -> Result<()> {
876 println!("ā Missing Data Analysis");
877 println!("========================");
878 println!(
879 "Total missing: {} ({:.2}%)",
880 missingdata.total_missing, missingdata.missing_percentage
881 );
882
883 if !missingdata.feature_missing.is_empty() {
884 println!("\nMissing by feature:");
885 for (feature, count, percentage) in &missingdata.feature_missing {
886 if *count > 0 {
887 println!(" {feature}: {count} ({percentage:.1}%)");
888 }
889 }
890 }
891
892 println!();
893 Ok(())
894 }
895
896 fn display_targetanalysis(&self, targetanalysis: &TargetAnalysis) -> Result<()> {
897 println!("šÆ Target Analysis");
898 println!("==================");
899
900 let target = &targetanalysis.target_stats;
901 println!("Target type: {:?}", target.data_type);
902
903 if let Some(ref distribution) = targetanalysis.class_distribution {
904 println!("\nClass distribution:");
905 for (class, count) in distribution {
906 println!(" {class}: {count}");
907 }
908 }
909
910 println!("\nTop correlations with features:");
911 for (feature, correlation) in targetanalysis.correlations_with_features.iter().take(5) {
912 println!(" {feature}: {correlation:.3}");
913 }
914
915 println!();
916 Ok(())
917 }
918
919 fn display_quality_assessment(&self, quality: &QualityAssessment) -> Result<()> {
920 println!("ā
Quality Assessment");
921 println!("=====================");
922 println!("Quality score: {:.1}/100", quality.quality_score);
923
924 if !quality.issues.is_empty() {
925 println!("\nIssues found:");
926 for issue in &quality.issues {
927 let severity_icon = match issue.severity {
928 Severity::Low => "ā ļø",
929 Severity::Medium => "š”",
930 Severity::High => "š ",
931 Severity::Critical => "š“",
932 };
933 println!(" {} {}", severity_icon, issue.description);
934 }
935 }
936
937 println!("\nRecommendations:");
938 for recommendation in &quality.recommendations {
939 println!(" ⢠{recommendation}");
940 }
941
942 println!();
943 Ok(())
944 }
945
946 fn display_json(&self, summary: &DatasetSummary) -> Result<()> {
947 let json = serde_json::to_string_pretty(summary)
948 .map_err(|e| DatasetsError::SerdeError(e.to_string()))?;
949 println!("{json}");
950 Ok(())
951 }
952
953 fn display_csv(&self, summary: &DatasetSummary) -> Result<()> {
954 println!("feature,type,count,mean,std,min,max,missing");
956 for feature in &summary.statistics.features {
957 println!(
958 "{},{:?},{},{},{},{},{},{}",
959 feature.name,
960 feature.data_type,
961 feature.count,
962 feature
963 .mean
964 .map(|x| x.to_string())
965 .unwrap_or_else(|| "".to_string()),
966 feature
967 .std
968 .map(|x| x.to_string())
969 .unwrap_or_else(|| "".to_string()),
970 feature
971 .min
972 .map(|x| x.to_string())
973 .unwrap_or_else(|| "".to_string()),
974 feature
975 .max
976 .map(|x| x.to_string())
977 .unwrap_or_else(|| "".to_string()),
978 feature.missing_count
979 );
980 }
981 Ok(())
982 }
983
984 fn display_markdown(&self, summary: &DatasetSummary) -> Result<()> {
985 println!("# Dataset Summary\n");
986
987 println!("## Overview\n");
988 println!("- **Samples**: {}", summary.info.n_samples);
989 println!("- **Features**: {}", summary.info.n_features);
990 println!(
991 "- **Memory usage**: {:.2} MB\n",
992 summary.info.memory_usage as f64 / 1_048_576.0
993 );
994
995 println!("## Feature Statistics\n");
996 println!("| Feature | Type | Mean | Std | Min | Max | Missing |");
997 println!("|---------|------|------|-----|-----|-----|---------|");
998
999 for feature in &summary.statistics.features {
1000 println!(
1001 "| {} | {:?} | {} | {} | {} | {} | {} |",
1002 feature.name,
1003 feature.data_type,
1004 feature
1005 .mean
1006 .map(|x| format!("{x:.3}"))
1007 .unwrap_or_else(|| "-".to_string()),
1008 feature
1009 .std
1010 .map(|x| format!("{x:.3}"))
1011 .unwrap_or_else(|| "-".to_string()),
1012 feature
1013 .min
1014 .map(|x| format!("{x:.3}"))
1015 .unwrap_or_else(|| "-".to_string()),
1016 feature
1017 .max
1018 .map(|x| format!("{x:.3}"))
1019 .unwrap_or_else(|| "-".to_string()),
1020 feature.missing_count
1021 );
1022 }
1023
1024 println!(
1025 "\n## Quality Score: {:.1}/100\n",
1026 summary.quality_assessment.quality_score
1027 );
1028
1029 Ok(())
1030 }
1031
1032 fn interactive_feature_details(
1033 &self,
1034 dataset: &Dataset,
1035 statistics: &FeatureStatistics,
1036 ) -> Result<()> {
1037 println!("\nFeature Details");
1038 println!("===============");
1039
1040 for (i, feature) in statistics.features.iter().enumerate() {
1041 println!("{}. {}", i + 1, feature.name);
1042 }
1043
1044 print!("\nEnter feature number (or 'back'): ");
1045 io::stdout().flush().unwrap();
1046
1047 let mut input = String::new();
1048 io::stdin().read_line(&mut input).unwrap();
1049 let input = input.trim();
1050
1051 if input == "back" {
1052 return Ok(());
1053 }
1054
1055 if let Ok(index) = input.parse::<usize>() {
1056 if index > 0 && index <= statistics.features.len() {
1057 let feature = &statistics.features[index - 1];
1058 self.display_feature_detail(feature, dataset)?;
1059 } else {
1060 println!("Invalid feature number.");
1061 }
1062 } else {
1063 println!("Invalid input.");
1064 }
1065
1066 Ok(())
1067 }
1068
1069 fn display_feature_detail(&self, feature: &FeatureStats, _dataset: &Dataset) -> Result<()> {
1070 println!("\nš Feature: {}", feature.name);
1071 println!("==================");
1072 println!("Type: {:?}", feature.data_type);
1073 println!("Count: {}", feature.count);
1074 println!(
1075 "Missing: {} ({:.1}%)",
1076 feature.missing_count,
1077 feature.missing_count as f64 / feature.count as f64 * 100.0
1078 );
1079
1080 if let Some(mean) = feature.mean {
1081 println!("Mean: {mean:.6}");
1082 }
1083 if let Some(std) = feature.std {
1084 println!("Std: {std:.6}");
1085 }
1086 if let Some(min) = feature.min {
1087 println!("Min: {min:.6}");
1088 }
1089 if let Some(max) = feature.max {
1090 println!("Max: {max:.6}");
1091 }
1092 if let Some(median) = feature.median {
1093 println!("Median: {median:.6}");
1094 }
1095 if let Some(q25) = feature.q25 {
1096 println!("Q25: {q25:.6}");
1097 }
1098 if let Some(q75) = feature.q75 {
1099 println!("Q75: {q75:.6}");
1100 }
1101
1102 if let Some(ref unique_values) = feature.unique_values {
1103 println!("Unique values: {unique_values:?}");
1104 } else if let Some(unique_count) = feature.unique_count {
1105 println!("Unique count: {unique_count}");
1106 }
1107
1108 Ok(())
1109 }
1110
1111 fn export_summary(&self, summary: &DatasetSummary) -> Result<()> {
1112 print!("Export format (json/csv/markdown): ");
1113 io::stdout().flush().unwrap();
1114
1115 let mut input = String::new();
1116 io::stdin().read_line(&mut input).unwrap();
1117 let format = input.trim();
1118
1119 let filename = format!("dataset_summary.{format}");
1120
1121 let content = match format {
1122 "json" => serde_json::to_string_pretty(summary)
1123 .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
1124 "csv" => {
1125 let mut csv_content = String::from("feature,type,count,mean,std,min,max,missing\n");
1126 for feature in &summary.statistics.features {
1127 csv_content.push_str(&format!(
1128 "{},{:?},{},{},{},{},{},{}\n",
1129 feature.name,
1130 feature.data_type,
1131 feature.count,
1132 feature
1133 .mean
1134 .map(|x| x.to_string())
1135 .unwrap_or_else(|| "".to_string()),
1136 feature
1137 .std
1138 .map(|x| x.to_string())
1139 .unwrap_or_else(|| "".to_string()),
1140 feature
1141 .min
1142 .map(|x| x.to_string())
1143 .unwrap_or_else(|| "".to_string()),
1144 feature
1145 .max
1146 .map(|x| x.to_string())
1147 .unwrap_or_else(|| "".to_string()),
1148 feature.missing_count
1149 ));
1150 }
1151 csv_content
1152 }
1153 "markdown" => {
1154 format!(
1156 "# Dataset Summary\n\nQuality Score: {:.1}/100\n",
1157 summary.quality_assessment.quality_score
1158 )
1159 }
1160 _ => {
1161 return Err(DatasetsError::InvalidFormat(
1162 "Unsupported export format".to_string(),
1163 ))
1164 }
1165 };
1166
1167 std::fs::write(&filename, content).map_err(DatasetsError::IoError)?;
1168
1169 println!("Summary exported to: {filename}");
1170 Ok(())
1171 }
1172}
1173
1174pub mod convenience {
1176 use super::*;
1177
1178 pub fn quick_summary(dataset: &Dataset) -> Result<DatasetSummary> {
1180 let explorer = DatasetExplorer::default_config();
1181 explorer.summarize(dataset)
1182 }
1183
1184 pub fn info(dataset: &Dataset) -> Result<()> {
1186 let explorer = DatasetExplorer::default_config();
1187 let summary = explorer.summarize(dataset)?;
1188 explorer.display_basic_info(&summary.info);
1189 Ok(())
1190 }
1191
1192 pub fn explore(dataset: &Dataset) -> Result<()> {
1194 let config = ExploreConfig {
1195 interactive: true,
1196 ..Default::default()
1197 };
1198
1199 let explorer = DatasetExplorer::new(config);
1200 explorer.interactive_explore(dataset)
1201 }
1202
1203 pub fn export_summary(dataset: &Dataset, format: OutputFormat, filename: &str) -> Result<()> {
1205 let config = ExploreConfig {
1206 output_format: format,
1207 ..Default::default()
1208 };
1209 let output_format = config.output_format;
1210
1211 let explorer = DatasetExplorer::new(config);
1212 let summary = explorer.summarize(dataset)?;
1213
1214 let content = match output_format {
1215 OutputFormat::Json => serde_json::to_string_pretty(&summary)
1216 .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
1217 _ => {
1218 return Err(DatasetsError::InvalidFormat(
1219 "Only JSON export is currently supported in convenience function".to_string(),
1220 ));
1221 }
1222 };
1223
1224 std::fs::write(filename, content).map_err(DatasetsError::IoError)?;
1225
1226 Ok(())
1227 }
1228}
1229
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generators::make_classification;

    /// The default explorer carries the default configuration values.
    #[test]
    fn testdataset_explorer_creation() {
        let DatasetExplorer { config } = DatasetExplorer::default_config();
        assert_eq!(config.precision, 3);
        assert!(config.show_detailed_stats);
    }

    /// The summary reports the dimensions the dataset was generated with.
    #[test]
    fn test_basic_summary() {
        let generated = make_classification(100, 5, 2, 1, 1, Some(42)).unwrap();
        let report = convenience::quick_summary(&generated).unwrap();

        assert_eq!(report.info.n_samples, 100);
        assert_eq!(report.info.n_features, 5);
        assert_eq!(report.statistics.features.len(), 5);
    }

    /// Every generated feature gets mean, std, min and max.
    #[test]
    fn test_feature_statistics() {
        let generated = make_classification(50, 3, 2, 1, 1, Some(42)).unwrap();
        let stats = DatasetExplorer::default_config()
            .compute_feature_statistics(&generated)
            .unwrap();

        assert_eq!(stats.features.len(), 3);

        for column in stats.features.iter() {
            assert!(column.mean.is_some());
            assert!(column.std.is_some());
            assert!(column.min.is_some());
            assert!(column.max.is_some());
        }
    }

    /// A clean synthetic dataset scores above 80.
    #[test]
    fn test_quality_assessment() {
        let generated = make_classification(100, 4, 2, 1, 1, Some(42)).unwrap();
        let report = DatasetExplorer::default_config()
            .summarize(&generated)
            .unwrap();

        assert!(report.quality_assessment.quality_score > 80.0);
    }

    /// Fractional, two-valued and few-valued inputs map to the expected types.
    #[test]
    fn test_data_type_inference() {
        let explorer = DatasetExplorer::default_config();

        let fractional = [1.1, 2.3, 3.7, 4.2];
        let inferred = explorer.infer_data_type(&fractional);
        assert!(matches!(inferred, InferredDataType::Numerical));

        let two_valued = [0.0, 1.0, 0.0, 1.0];
        let inferred = explorer.infer_data_type(&two_valued);
        assert!(matches!(inferred, InferredDataType::Binary));

        let few_valued = [1.0, 2.0, 3.0, 1.0, 2.0];
        let inferred = explorer.infer_data_type(&few_valued);
        assert!(matches!(inferred, InferredDataType::Categorical));
    }
}