use std::collections::HashMap;
use std::io::{self, Write};

use scirs2_core::ndarray::Array2;
use serde::{Deserialize, Serialize};

use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;

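/// Configuration for how a dataset summary is computed and displayed.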
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExploreConfig {
    /// Output format used when displaying or exporting the summary.
    pub output_format: OutputFormat,
    /// Number of decimal places for displayed numeric values.
    pub precision: usize,
    /// Whether to compute detailed statistics such as the correlation matrix.
    pub show_detailed_stats: bool,
    /// Maximum number of unique values to list for categorical features.
    pub max_unique_values: usize,
    /// Whether interactive exploration is enabled.
    pub interactive: bool,
}

impl Default for ExploreConfig {
    fn default() -> Self {
        Self {
            output_format: OutputFormat::Table,
            precision: 3,
            show_detailed_stats: true,
            max_unique_values: 20,
            interactive: false,
        }
    }
}

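/// Supported output formats for dataset summaries.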
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum OutputFormat {
    Table,
    Json,
    Csv,
    Markdown,
}

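/// Complete summary of a dataset: basic info, per-feature statistics,
/// missing-data analysis, optional target analysis, and a quality assessment.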
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSummary {
    pub info: DatasetInfo,
    pub statistics: FeatureStatistics,
    pub missingdata: MissingDataAnalysis,
    pub targetanalysis: Option<TargetAnalysis>,
    pub quality_assessment: QualityAssessment,
}

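/// Basic information about a dataset, including its shape and a rough
/// estimate of its in-memory size.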
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetInfo {
    pub n_samples: usize,
    pub n_features: usize,
    pub featurenames: Option<Vec<String>>,
    pub targetnames: Option<Vec<String>>,
    pub description: Option<String>,
    pub memory_usage: usize,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStatistics {
    pub features: Vec<FeatureStats>,
    pub correlations: Option<Array2<f64>>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStats {
    pub name: String,
    pub index: usize,
    pub data_type: InferredDataType,
    pub count: usize,
    pub mean: Option<f64>,
    pub std: Option<f64>,
    pub min: Option<f64>,
    pub max: Option<f64>,
    pub median: Option<f64>,
    pub q25: Option<f64>,
    pub q75: Option<f64>,
    pub unique_count: Option<usize>,
    pub unique_values: Option<Vec<String>>,
    pub missing_count: usize,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum InferredDataType {
    Numerical,
    Categorical,
    Binary,
    Unknown,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingDataAnalysis {
    pub total_missing: usize,
    pub missing_percentage: f64,
    pub feature_missing: Vec<(String, usize, f64)>,
    pub missing_patterns: Vec<MissingPattern>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingPattern {
    pub pattern: Vec<bool>,
    pub count: usize,
    pub percentage: f64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TargetAnalysis {
    pub target_stats: FeatureStats,
    pub class_distribution: Option<HashMap<String, usize>>,
    pub correlations_with_features: Vec<(String, f64)>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityAssessment {
    pub quality_score: f64,
    pub issues: Vec<QualityIssue>,
    pub recommendations: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    pub issue_type: IssueType,
    pub severity: Severity,
    pub description: String,
    pub affected_features: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueType {
    MissingData,
    Outliers,
    Duplicates,
    LowVariance,
    HighCorrelation,
    ImbalancedClasses,
    SkewedDistribution,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Severity {
    Low,
    Medium,
    High,
    Critical,
}

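/// Explorer that computes and displays dataset summaries according to an
/// [`ExploreConfig`].
///
/// A minimal usage sketch (illustrative only; it assumes a `Dataset` value is
/// already available, e.g. from one of this crate's generators):
///
/// ```ignore
/// let explorer = DatasetExplorer::default_config();
/// let summary = explorer.summarize(&dataset)?;
/// explorer.display_summary(&summary)?;
/// ```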
pub struct DatasetExplorer {
    config: ExploreConfig,
}

impl DatasetExplorer {
    pub fn new(config: ExploreConfig) -> Self {
        Self { config }
    }

    pub fn default_config() -> Self {
        Self::new(ExploreConfig::default())
    }

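    /// Computes a full [`DatasetSummary`] for `dataset`: basic info, feature
    /// statistics, missing-data analysis, optional target analysis, and a
    /// heuristic quality assessment.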
    pub fn summarize(&self, dataset: &Dataset) -> Result<DatasetSummary> {
        let info = self.collect_basic_info(dataset);
        let statistics = self.compute_feature_statistics(dataset)?;
        let missingdata = self.analyze_missingdata(dataset);
        let targetanalysis = self.analyze_target(dataset)?;
        let quality_assessment = self.assess_quality(dataset, &statistics, &missingdata)?;

        Ok(DatasetSummary {
            info,
            statistics,
            missingdata,
            targetanalysis,
            quality_assessment,
        })
    }

    pub fn display_summary(&self, summary: &DatasetSummary) -> Result<()> {
        match self.config.output_format {
            OutputFormat::Table => self.display_table(summary),
            OutputFormat::Json => self.display_json(summary),
            OutputFormat::Csv => self.display_csv(summary),
            OutputFormat::Markdown => self.display_markdown(summary),
        }
    }

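    /// Runs a menu-driven exploration loop on stdin/stdout. Returns an error
    /// unless `interactive` is enabled in the configuration.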
    pub fn interactive_explore(&self, dataset: &Dataset) -> Result<()> {
        if !self.config.interactive {
            return Err(DatasetsError::InvalidFormat(
                "Interactive mode not enabled".to_string(),
            ));
        }

        println!("🔍 Interactive Dataset Explorer");
        println!("==============================");

        let summary = self.summarize(dataset)?;
        self.display_basic_info(&summary.info);

        loop {
            println!("\nCommands:");
            println!(" 1. Summary statistics");
            println!(" 2. Feature details");
            println!(" 3. Missing data analysis");
            println!(" 4. Target analysis");
            println!(" 5. Quality assessment");
            println!(" 6. Export summary");
            println!(" q. Quit");

            print!("\nEnter command: ");
            io::stdout().flush().expect("Operation failed");

            let mut input = String::new();
            io::stdin().read_line(&mut input).expect("Operation failed");
            let input = input.trim();

            match input {
                "1" => self.display_statistics(&summary.statistics)?,
                "2" => self.interactive_feature_details(dataset, &summary.statistics)?,
                "3" => self.display_missingdata(&summary.missingdata)?,
                "4" => {
                    if let Some(ref targetanalysis) = summary.targetanalysis {
                        self.display_targetanalysis(targetanalysis)?;
                    } else {
                        println!("No target variable found in dataset.");
                    }
                }
                "5" => self.display_quality_assessment(&summary.quality_assessment)?,
                "6" => self.export_summary(&summary)?,
                "q" | "quit" | "exit" => break,
                _ => println!("Invalid command. Please try again."),
            }
        }

        Ok(())
    }

    fn collect_basic_info(&self, dataset: &Dataset) -> DatasetInfo {
        let n_samples = dataset.n_samples();
        let n_features = dataset.n_features();

        let data_size = n_samples * n_features * std::mem::size_of::<f64>();
        let target_size = dataset
            .target
            .as_ref()
            .map(|t| t.len() * std::mem::size_of::<f64>())
            .unwrap_or(0);
        let memory_usage = data_size + target_size;

        DatasetInfo {
            n_samples,
            n_features,
            featurenames: dataset.featurenames.clone(),
            targetnames: dataset.targetnames.clone(),
            description: dataset.description.clone(),
            memory_usage,
        }
    }

    fn compute_feature_statistics(&self, dataset: &Dataset) -> Result<FeatureStatistics> {
        let mut features = Vec::new();

        for (i, column) in dataset.data.columns().into_iter().enumerate() {
            let name = dataset
                .featurenames
                .as_ref()
                .and_then(|names| names.get(i))
                .cloned()
                .unwrap_or_else(|| format!("feature_{i}"));

            let stats = self.compute_single_feature_stats(&name, i, &column)?;
            features.push(stats);
        }

        let correlations = if self.config.show_detailed_stats {
            Some(self.compute_correlation_matrix(dataset)?)
        } else {
            None
        };

        Ok(FeatureStatistics {
            features,
            correlations,
        })
    }

    fn compute_single_feature_stats(
        &self,
        name: &str,
        index: usize,
        column: &scirs2_core::ndarray::ArrayView1<f64>,
    ) -> Result<FeatureStats> {
        let values: Vec<f64> = column.iter().copied().collect();
        let count = values.len();
        let missing_count = values.iter().filter(|&&x| x.is_nan()).count();
        let valid_values: Vec<f64> = values.iter().copied().filter(|x| !x.is_nan()).collect();

        let (mean, std, min, max, median, q25, q75) = if !valid_values.is_empty() {
            let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;

            let variance = valid_values.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
                / valid_values.len() as f64;
            let std = variance.sqrt();

            let mut sorted_values = valid_values.clone();
            sorted_values.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));

            let min = sorted_values.first().copied();
            let max = sorted_values.last().copied();

            let median = Self::percentile(&sorted_values, 0.5);
            let q25 = Self::percentile(&sorted_values, 0.25);
            let q75 = Self::percentile(&sorted_values, 0.75);

            (Some(mean), Some(std), min, max, median, q25, q75)
        } else {
            (None, None, None, None, None, None, None)
        };

        let data_type = self.infer_data_type(&valid_values);

        let (unique_count, unique_values) = if matches!(
            data_type,
            InferredDataType::Categorical | InferredDataType::Binary
        ) {
            let mut unique: std::collections::HashSet<String> = std::collections::HashSet::new();
            for &value in &valid_values {
                unique.insert(format!("{value:.0}"));
            }

            let unique_count = unique.len();
            let unique_values = if unique_count <= self.config.max_unique_values {
                let mut values: Vec<String> = unique.into_iter().collect();
                values.sort();
                Some(values)
            } else {
                None
            };

            (Some(unique_count), unique_values)
        } else {
            (None, None)
        };

        Ok(FeatureStats {
            name: name.to_string(),
            index,
            data_type,
            count,
            mean,
            std,
            min,
            max,
            median,
            q25,
            q75,
            unique_count,
            unique_values,
            missing_count,
        })
    }

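    /// Returns the `p`-quantile of `sorted_values` (which must already be
    /// sorted ascending), using linear interpolation between the two nearest
    /// ranks. Returns `None` for an empty slice.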
    fn percentile(sorted_values: &[f64], p: f64) -> Option<f64> {
        if sorted_values.is_empty() {
            return None;
        }

        let index = p * (sorted_values.len() - 1) as f64;
        let lower = index.floor() as usize;
        let upper = index.ceil() as usize;

        if lower == upper {
            Some(sorted_values[lower])
        } else {
            let weight = index - lower as f64;
            Some(sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight)
        }
    }

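    /// Infers a coarse data type from the non-missing values of a feature.
    /// Columns where every value is an integer are classified by unique count
    /// (1 is treated as unknown/constant, 2 as binary, 3-20 as categorical,
    /// more as numerical); any column with fractional values is numerical.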
    fn infer_data_type(&self, values: &[f64]) -> InferredDataType {
        if values.is_empty() {
            return InferredDataType::Unknown;
        }

        let all_integers = values.iter().all(|&x| x.fract() == 0.0);

        if all_integers {
            let unique_values: std::collections::HashSet<i64> =
                values.iter().map(|&x| x as i64).collect();

            match unique_values.len() {
                1 => InferredDataType::Unknown,
                2 => InferredDataType::Binary,
                3..=20 => InferredDataType::Categorical,
                _ => InferredDataType::Numerical,
            }
        } else {
            InferredDataType::Numerical
        }
    }

    fn compute_correlation_matrix(&self, dataset: &Dataset) -> Result<Array2<f64>> {
        let n_features = dataset.n_features();
        let mut correlations = Array2::zeros((n_features, n_features));

        for i in 0..n_features {
            for j in 0..n_features {
                if i == j {
                    correlations[[i, j]] = 1.0;
                } else {
                    let col_i = dataset.data.column(i);
                    let col_j = dataset.data.column(j);

                    let corr = self.compute_correlation(&col_i, &col_j);
                    correlations[[i, j]] = corr;
                }
            }
        }

        Ok(correlations)
    }

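    /// Computes the Pearson correlation coefficient between two columns,
    /// returning 0.0 when it cannot be computed (fewer than two complete
    /// pairs, or zero variance in either column).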
    fn compute_correlation(
        &self,
        x: &scirs2_core::ndarray::ArrayView1<f64>,
        y: &scirs2_core::ndarray::ArrayView1<f64>,
    ) -> f64 {
        // Keep only the sample pairs where both values are present, so the two
        // columns stay aligned when either contains NaNs.
        let pairs: Vec<(f64, f64)> = x
            .iter()
            .zip(y.iter())
            .filter(|(a, b)| !a.is_nan() && !b.is_nan())
            .map(|(&a, &b)| (a, b))
            .collect();

        if pairs.len() < 2 {
            return 0.0;
        }

        let n = pairs.len() as f64;
        let mean_x = pairs.iter().map(|(a, _)| a).sum::<f64>() / n;
        let mean_y = pairs.iter().map(|(_, b)| b).sum::<f64>() / n;

        let mut numerator = 0.0;
        let mut sum_sq_x = 0.0;
        let mut sum_sq_y = 0.0;

        for (x_val, y_val) in &pairs {
            let dx = x_val - mean_x;
            let dy = y_val - mean_y;

            numerator += dx * dy;
            sum_sq_x += dx * dx;
            sum_sq_y += dy * dy;
        }

        let denominator = (sum_sq_x * sum_sq_y).sqrt();

        if denominator == 0.0 {
            0.0
        } else {
            numerator / denominator
        }
    }

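    /// Counts missing (NaN) values per feature and across the whole data
    /// matrix, and collects the most common row-wise missingness patterns.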
    fn analyze_missingdata(&self, dataset: &Dataset) -> MissingDataAnalysis {
        let n_samples = dataset.n_samples();
        let n_features = dataset.n_features();
        let total_values = n_samples * n_features;

        let mut total_missing = 0;
        let mut feature_missing = Vec::new();

        for (i, column) in dataset.data.columns().into_iter().enumerate() {
            let missing_count = column.iter().filter(|&&x| x.is_nan()).count();
            total_missing += missing_count;

            let featurename = dataset
                .featurenames
                .as_ref()
                .and_then(|names| names.get(i))
                .cloned()
                .unwrap_or_else(|| format!("feature_{i}"));

            let missing_percentage = missing_count as f64 / n_samples as f64 * 100.0;
            feature_missing.push((featurename, missing_count, missing_percentage));
        }

        let missing_percentage = total_missing as f64 / total_values as f64 * 100.0;

        let missing_patterns = self.analyze_missing_patterns(dataset);

        MissingDataAnalysis {
            total_missing,
            missing_percentage,
            feature_missing,
            missing_patterns,
        }
    }

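    /// Groups rows by their NaN mask and returns the ten most frequent
    /// missingness patterns, each with its count and percentage of rows.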
    fn analyze_missing_patterns(&self, dataset: &Dataset) -> Vec<MissingPattern> {
        let mut pattern_counts: HashMap<Vec<bool>, usize> = HashMap::new();

        for row in dataset.data.rows() {
            let pattern: Vec<bool> = row.iter().map(|&x| x.is_nan()).collect();
            *pattern_counts.entry(pattern).or_insert(0) += 1;
        }

        let total_samples = dataset.n_samples() as f64;
        let mut patterns: Vec<MissingPattern> = pattern_counts
            .into_iter()
            .map(|(pattern, count)| MissingPattern {
                pattern,
                count,
                percentage: count as f64 / total_samples * 100.0,
            })
            .collect();

        patterns.sort_by(|a, b| b.count.cmp(&a.count));

        patterns.truncate(10);

        patterns
    }

    fn analyze_target(&self, dataset: &Dataset) -> Result<Option<TargetAnalysis>> {
        let target = match &dataset.target {
            Some(target) => target,
            None => return Ok(None),
        };

        let target_column = target.view();
        let target_stats = self.compute_single_feature_stats("target", 0, &target_column)?;

        let class_distribution = if matches!(
            target_stats.data_type,
            InferredDataType::Categorical | InferredDataType::Binary
        ) {
            let mut distribution = HashMap::new();
            for &value in target.iter() {
                if !value.is_nan() {
                    let classname = format!("{value:.0}");
                    *distribution.entry(classname).or_insert(0) += 1;
                }
            }
            Some(distribution)
        } else {
            None
        };

        let mut correlations_with_features = Vec::new();
        for (i, column) in dataset.data.columns().into_iter().enumerate() {
            let featurename = dataset
                .featurenames
                .as_ref()
                .and_then(|names| names.get(i))
                .cloned()
                .unwrap_or_else(|| format!("feature_{i}"));

            let correlation = self.compute_correlation(&column, &target_column);
            correlations_with_features.push((featurename, correlation));
        }

        correlations_with_features
            .sort_by(|a, b| b.1.abs().partial_cmp(&a.1.abs()).expect("Operation failed"));

        Ok(Some(TargetAnalysis {
            target_stats,
            class_distribution,
            correlations_with_features,
        }))
    }

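    /// Derives a heuristic quality score (0-100) and a list of issues from the
    /// computed statistics: missing data above 5%, near-zero-variance features
    /// (std below 1e-6), and feature pairs with |correlation| > 0.9 each
    /// reduce the score.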
    fn assess_quality(
        &self,
        _dataset: &Dataset,
        statistics: &FeatureStatistics,
        missingdata: &MissingDataAnalysis,
    ) -> Result<QualityAssessment> {
        let mut issues = Vec::new();
        let mut quality_score = 100.0;

        if missingdata.missing_percentage > 5.0 {
            let severity = if missingdata.missing_percentage > 20.0 {
                Severity::High
            } else if missingdata.missing_percentage > 10.0 {
                Severity::Medium
            } else {
                Severity::Low
            };

            issues.push(QualityIssue {
                issue_type: IssueType::MissingData,
                severity,
                description: format!("{:.1}% of data is missing", missingdata.missing_percentage),
                affected_features: missingdata
                    .feature_missing
                    .iter()
                    .filter(|(_, _, pct)| *pct > 5.0)
                    .map(|(name, _, _)| name.clone())
                    .collect(),
            });

            quality_score -= missingdata.missing_percentage.min(30.0);
        }

        let low_variance_features: Vec<String> = statistics
            .features
            .iter()
            .filter(|f| f.std.is_some_and(|std| std < 1e-6))
            .map(|f| f.name.clone())
            .collect();

        if !low_variance_features.is_empty() {
            issues.push(QualityIssue {
                issue_type: IssueType::LowVariance,
                severity: Severity::Medium,
                description: format!(
                    "{} features have very low variance",
                    low_variance_features.len()
                ),
                affected_features: low_variance_features,
            });

            quality_score -= 10.0;
        }

        if let Some(ref correlations) = statistics.correlations {
            let mut high_corr_pairs = Vec::new();
            for i in 0..correlations.nrows() {
                for j in (i + 1)..correlations.ncols() {
                    if correlations[[i, j]].abs() > 0.9 {
                        let name_i = statistics.features[i].name.clone();
                        let name_j = statistics.features[j].name.clone();
                        high_corr_pairs.push(format!("{name_i} - {name_j}"));
                    }
                }
            }

            if !high_corr_pairs.is_empty() {
                issues.push(QualityIssue {
                    issue_type: IssueType::HighCorrelation,
                    severity: Severity::Medium,
                    description: format!(
                        "{} highly correlated feature pairs found",
                        high_corr_pairs.len()
                    ),
                    affected_features: high_corr_pairs,
                });

                quality_score -= 5.0;
            }
        }

        let recommendations = self.generate_recommendations(&issues);

        Ok(QualityAssessment {
            quality_score: quality_score.max(0.0),
            issues,
            recommendations,
        })
    }

    fn generate_recommendations(&self, issues: &[QualityIssue]) -> Vec<String> {
        let mut recommendations = Vec::new();

        for issue in issues {
            match issue.issue_type {
                IssueType::MissingData => {
                    recommendations.push("Consider imputation strategies for missing data or remove features with excessive missing values".to_string());
                }
                IssueType::LowVariance => {
                    recommendations.push(
                        "Remove low variance features as they provide little information"
                            .to_string(),
                    );
                }
                IssueType::HighCorrelation => {
                    recommendations.push("Consider removing redundant highly correlated features or use dimensionality reduction".to_string());
                }
                _ => {}
            }
        }

        if recommendations.is_empty() {
            recommendations.push("Dataset appears to be of good quality".to_string());
        }

        recommendations
    }

    fn display_table(&self, summary: &DatasetSummary) -> Result<()> {
        self.display_basic_info(&summary.info);
        self.display_statistics(&summary.statistics)?;
        self.display_missingdata(&summary.missingdata)?;

        if let Some(ref targetanalysis) = summary.targetanalysis {
            self.display_targetanalysis(targetanalysis)?;
        }

        self.display_quality_assessment(&summary.quality_assessment)?;

        Ok(())
    }

    fn display_basic_info(&self, info: &DatasetInfo) {
        println!("📊 Dataset Overview");
        println!("==================");
        println!("Samples: {}", info.n_samples);
        println!("Features: {}", info.n_features);
        println!(
            "Memory usage: {:.2} MB",
            info.memory_usage as f64 / 1_048_576.0
        );

        if let Some(ref description) = info.description {
            println!("Description: {description}");
        }

        println!();
    }

    fn display_statistics(&self, statistics: &FeatureStatistics) -> Result<()> {
        println!("📈 Feature Statistics");
        println!("====================");

        println!(
            "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
            "Feature", "Type", "Mean", "Std", "Min", "Max", "Missing"
        );
        let separator = "-".repeat(80);
        println!("{separator}");

        for feature in &statistics.features {
            let type_str = match feature.data_type {
                InferredDataType::Numerical => "num",
                InferredDataType::Categorical => "cat",
                InferredDataType::Binary => "bin",
                InferredDataType::Unknown => "unk",
            };

            println!(
                "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
                feature.name.chars().take(15).collect::<String>(),
                type_str,
                feature
                    .mean
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature
                    .std
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature
                    .min
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature
                    .max
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature.missing_count
            );
        }

        println!();
        Ok(())
    }

    fn display_missingdata(&self, missingdata: &MissingDataAnalysis) -> Result<()> {
        println!("❓ Missing Data Analysis");
        println!("========================");
        println!(
            "Total missing: {} ({:.2}%)",
            missingdata.total_missing, missingdata.missing_percentage
        );

        if !missingdata.feature_missing.is_empty() {
            println!("\nMissing by feature:");
            for (feature, count, percentage) in &missingdata.feature_missing {
                if *count > 0 {
                    println!(" {feature}: {count} ({percentage:.1}%)");
                }
            }
        }

        println!();
        Ok(())
    }

    fn display_targetanalysis(&self, targetanalysis: &TargetAnalysis) -> Result<()> {
        println!("🎯 Target Analysis");
        println!("==================");

        let target = &targetanalysis.target_stats;
        println!("Target type: {:?}", target.data_type);

        if let Some(ref distribution) = targetanalysis.class_distribution {
            println!("\nClass distribution:");
            for (class, count) in distribution {
                println!(" {class}: {count}");
            }
        }

        println!("\nTop correlations with features:");
        for (feature, correlation) in targetanalysis.correlations_with_features.iter().take(5) {
            println!(" {feature}: {correlation:.3}");
        }

        println!();
        Ok(())
    }

    fn display_quality_assessment(&self, quality: &QualityAssessment) -> Result<()> {
        println!("✅ Quality Assessment");
        println!("=====================");
        println!("Quality score: {:.1}/100", quality.quality_score);

        if !quality.issues.is_empty() {
            println!("\nIssues found:");
            for issue in &quality.issues {
                let severity_icon = match issue.severity {
                    Severity::Low => "⚠️",
                    Severity::Medium => "🟡",
                    Severity::High => "🟠",
                    Severity::Critical => "🔴",
                };
                println!(" {} {}", severity_icon, issue.description);
            }
        }

        println!("\nRecommendations:");
        for recommendation in &quality.recommendations {
            println!(" • {recommendation}");
        }

        println!();
        Ok(())
    }

    fn display_json(&self, summary: &DatasetSummary) -> Result<()> {
        let json = serde_json::to_string_pretty(summary)
            .map_err(|e| DatasetsError::SerdeError(e.to_string()))?;
        println!("{json}");
        Ok(())
    }

    fn display_csv(&self, summary: &DatasetSummary) -> Result<()> {
        println!("feature,type,count,mean,std,min,max,missing");
        for feature in &summary.statistics.features {
            println!(
                "{},{:?},{},{},{},{},{},{}",
                feature.name,
                feature.data_type,
                feature.count,
                feature
                    .mean
                    .map(|x| x.to_string())
                    .unwrap_or_else(|| "".to_string()),
                feature
                    .std
                    .map(|x| x.to_string())
                    .unwrap_or_else(|| "".to_string()),
                feature
                    .min
                    .map(|x| x.to_string())
                    .unwrap_or_else(|| "".to_string()),
                feature
                    .max
                    .map(|x| x.to_string())
                    .unwrap_or_else(|| "".to_string()),
                feature.missing_count
            );
        }
        Ok(())
    }

    fn display_markdown(&self, summary: &DatasetSummary) -> Result<()> {
        println!("# Dataset Summary\n");

        println!("## Overview\n");
        println!("- **Samples**: {}", summary.info.n_samples);
        println!("- **Features**: {}", summary.info.n_features);
        println!(
            "- **Memory usage**: {:.2} MB\n",
            summary.info.memory_usage as f64 / 1_048_576.0
        );

        println!("## Feature Statistics\n");
        println!("| Feature | Type | Mean | Std | Min | Max | Missing |");
        println!("|---------|------|------|-----|-----|-----|---------|");

        for feature in &summary.statistics.features {
            println!(
                "| {} | {:?} | {} | {} | {} | {} | {} |",
                feature.name,
                feature.data_type,
                feature
                    .mean
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature
                    .std
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature
                    .min
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature
                    .max
                    .map(|x| format!("{x:.3}"))
                    .unwrap_or_else(|| "-".to_string()),
                feature.missing_count
            );
        }

        println!(
            "\n## Quality Score: {:.1}/100\n",
            summary.quality_assessment.quality_score
        );

        Ok(())
    }

    fn interactive_feature_details(
        &self,
        dataset: &Dataset,
        statistics: &FeatureStatistics,
    ) -> Result<()> {
        println!("\nFeature Details");
        println!("===============");

        for (i, feature) in statistics.features.iter().enumerate() {
            println!("{}. {}", i + 1, feature.name);
        }

        print!("\nEnter feature number (or 'back'): ");
        io::stdout().flush().expect("Operation failed");

        let mut input = String::new();
        io::stdin().read_line(&mut input).expect("Operation failed");
        let input = input.trim();

        if input == "back" {
            return Ok(());
        }

        if let Ok(index) = input.parse::<usize>() {
            if index > 0 && index <= statistics.features.len() {
                let feature = &statistics.features[index - 1];
                self.display_feature_detail(feature, dataset)?;
            } else {
                println!("Invalid feature number.");
            }
        } else {
            println!("Invalid input.");
        }

        Ok(())
    }

    fn display_feature_detail(&self, feature: &FeatureStats, _dataset: &Dataset) -> Result<()> {
        println!("\n🔍 Feature: {}", feature.name);
        println!("==================");
        println!("Type: {:?}", feature.data_type);
        println!("Count: {}", feature.count);
        println!(
            "Missing: {} ({:.1}%)",
            feature.missing_count,
            feature.missing_count as f64 / feature.count as f64 * 100.0
        );

        if let Some(mean) = feature.mean {
            println!("Mean: {mean:.6}");
        }
        if let Some(std) = feature.std {
            println!("Std: {std:.6}");
        }
        if let Some(min) = feature.min {
            println!("Min: {min:.6}");
        }
        if let Some(max) = feature.max {
            println!("Max: {max:.6}");
        }
        if let Some(median) = feature.median {
            println!("Median: {median:.6}");
        }
        if let Some(q25) = feature.q25 {
            println!("Q25: {q25:.6}");
        }
        if let Some(q75) = feature.q75 {
            println!("Q75: {q75:.6}");
        }

        if let Some(ref unique_values) = feature.unique_values {
            println!("Unique values: {unique_values:?}");
        } else if let Some(unique_count) = feature.unique_count {
            println!("Unique count: {unique_count}");
        }

        Ok(())
    }

    fn export_summary(&self, summary: &DatasetSummary) -> Result<()> {
        print!("Export format (json/csv/markdown): ");
        io::stdout().flush().expect("Operation failed");

        let mut input = String::new();
        io::stdin().read_line(&mut input).expect("Operation failed");
        let format = input.trim();

        let filename = format!("dataset_summary.{format}");

        let content = match format {
            "json" => serde_json::to_string_pretty(summary)
                .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
            "csv" => {
                let mut csv_content = String::from("feature,type,count,mean,std,min,max,missing\n");
                for feature in &summary.statistics.features {
                    csv_content.push_str(&format!(
                        "{},{:?},{},{},{},{},{},{}\n",
                        feature.name,
                        feature.data_type,
                        feature.count,
                        feature
                            .mean
                            .map(|x| x.to_string())
                            .unwrap_or_else(|| "".to_string()),
                        feature
                            .std
                            .map(|x| x.to_string())
                            .unwrap_or_else(|| "".to_string()),
                        feature
                            .min
                            .map(|x| x.to_string())
                            .unwrap_or_else(|| "".to_string()),
                        feature
                            .max
                            .map(|x| x.to_string())
                            .unwrap_or_else(|| "".to_string()),
                        feature.missing_count
                    ));
                }
                csv_content
            }
            "markdown" => {
                format!(
                    "# Dataset Summary\n\nQuality Score: {:.1}/100\n",
                    summary.quality_assessment.quality_score
                )
            }
            _ => {
                return Err(DatasetsError::InvalidFormat(
                    "Unsupported export format".to_string(),
                ))
            }
        };

        std::fs::write(&filename, content).map_err(DatasetsError::IoError)?;

        println!("Summary exported to: {filename}");
        Ok(())
    }
}

pub mod convenience {
    use super::*;

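    /// Computes a [`DatasetSummary`] using the default [`ExploreConfig`].
    ///
    /// A minimal usage sketch (illustrative; it assumes a `Dataset` is
    /// available, e.g. from `crate::generators::make_classification`):
    ///
    /// ```ignore
    /// let summary = convenience::quick_summary(&dataset)?;
    /// println!("quality score: {:.1}", summary.quality_assessment.quality_score);
    /// ```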
    pub fn quick_summary(dataset: &Dataset) -> Result<DatasetSummary> {
        let explorer = DatasetExplorer::default_config();
        explorer.summarize(dataset)
    }

    pub fn info(dataset: &Dataset) -> Result<()> {
        let explorer = DatasetExplorer::default_config();
        let summary = explorer.summarize(dataset)?;
        explorer.display_basic_info(&summary.info);
        Ok(())
    }

    pub fn explore(dataset: &Dataset) -> Result<()> {
        let config = ExploreConfig {
            interactive: true,
            ..Default::default()
        };

        let explorer = DatasetExplorer::new(config);
        explorer.interactive_explore(dataset)
    }

    pub fn export_summary(dataset: &Dataset, format: OutputFormat, filename: &str) -> Result<()> {
        let config = ExploreConfig {
            output_format: format,
            ..Default::default()
        };
        let output_format = config.output_format;

        let explorer = DatasetExplorer::new(config);
        let summary = explorer.summarize(dataset)?;

        let content = match output_format {
            OutputFormat::Json => serde_json::to_string_pretty(&summary)
                .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
            _ => {
                return Err(DatasetsError::InvalidFormat(
                    "Only JSON export is currently supported in convenience function".to_string(),
                ));
            }
        };

        std::fs::write(filename, content).map_err(DatasetsError::IoError)?;

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::generators::make_classification;

    #[test]
    fn testdataset_explorer_creation() {
        let explorer = DatasetExplorer::default_config();
        assert_eq!(explorer.config.precision, 3);
        assert!(explorer.config.show_detailed_stats);
    }

    #[test]
    fn test_basic_summary() {
        let dataset = make_classification(100, 5, 2, 1, 1, Some(42)).expect("Operation failed");
        let summary = convenience::quick_summary(&dataset).expect("Operation failed");

        assert_eq!(summary.info.n_samples, 100);
        assert_eq!(summary.info.n_features, 5);
        assert_eq!(summary.statistics.features.len(), 5);
    }

    #[test]
    fn test_feature_statistics() {
        let dataset = make_classification(50, 3, 2, 1, 1, Some(42)).expect("Operation failed");
        let explorer = DatasetExplorer::default_config();
        let statistics = explorer
            .compute_feature_statistics(&dataset)
            .expect("Operation failed");

        assert_eq!(statistics.features.len(), 3);

        for feature in &statistics.features {
            assert!(feature.mean.is_some());
            assert!(feature.std.is_some());
            assert!(feature.min.is_some());
            assert!(feature.max.is_some());
        }
    }

    #[test]
    fn test_quality_assessment() {
        let dataset = make_classification(100, 4, 2, 1, 1, Some(42)).expect("Operation failed");
        let explorer = DatasetExplorer::default_config();
        let summary = explorer.summarize(&dataset).expect("Operation failed");

        assert!(summary.quality_assessment.quality_score > 80.0);
    }

    #[test]
    fn test_data_type_inference() {
        let explorer = DatasetExplorer::default_config();

        let numerical_data = vec![1.1, 2.3, 3.7, 4.2];
        assert!(matches!(
            explorer.infer_data_type(&numerical_data),
            InferredDataType::Numerical
        ));

        let binary_data = vec![0.0, 1.0, 0.0, 1.0];
        assert!(matches!(
            explorer.infer_data_type(&binary_data),
            InferredDataType::Binary
        ));

        let categorical_data = vec![1.0, 2.0, 3.0, 1.0, 2.0];
        assert!(matches!(
            explorer.infer_data_type(&categorical_data),
            InferredDataType::Categorical
        ));
    }
}