scirs2_datasets/
explore.rs

1//! Interactive dataset exploration and analysis tools
2//!
3//! This module provides CLI tools and utilities for exploring datasets interactively:
4//! - Dataset summary and statistics
5//! - Data visualization and plotting
6//! - Interactive data filtering and querying
7//! - Export functionality for exploration results
8
9use std::collections::HashMap;
10use std::io::{self, Write};
11
12use scirs2_core::ndarray::Array2;
13use serde::{Deserialize, Serialize};
14
15use crate::error::{DatasetsError, Result};
16use crate::utils::Dataset;
17
/// Configuration for dataset exploration
///
/// Holds the settings a [`DatasetExplorer`] is constructed with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExploreConfig {
    /// Output format for results
    pub output_format: OutputFormat,
    /// Number of decimal places for numerical output
    pub precision: usize,
    /// Whether to show detailed statistics; when `true` the feature
    /// correlation matrix is computed as part of the feature statistics
    pub show_detailed_stats: bool,
    /// Maximum number of unique values to show for categorical data
    pub max_unique_values: usize,
    /// Enable interactive mode; the interactive session refuses to start
    /// unless this is `true`
    pub interactive: bool,
}
32
33impl Default for ExploreConfig {
34    fn default() -> Self {
35        Self {
36            output_format: OutputFormat::Table,
37            precision: 3,
38            show_detailed_stats: true,
39            max_unique_values: 20,
40            interactive: false,
41        }
42    }
43}
44
/// Output format for exploration results
///
/// Selects which renderer is used when a summary is displayed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum OutputFormat {
    /// Plain text table format
    Table,
    /// JSON format
    Json,
    /// CSV format
    Csv,
    /// Markdown format
    Markdown,
}
57
/// Dataset exploration summary
///
/// Aggregates the result of every analysis pass run over a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSummary {
    /// Basic dataset information
    pub info: DatasetInfo,
    /// Statistical summary of features
    pub statistics: FeatureStatistics,
    /// Missing data analysis
    pub missingdata: MissingDataAnalysis,
    /// Target variable analysis (`None` when the dataset has no target)
    pub targetanalysis: Option<TargetAnalysis>,
    /// Data quality assessment
    pub quality_assessment: QualityAssessment,
}
72
/// Basic dataset information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetInfo {
    /// Number of samples
    pub n_samples: usize,
    /// Number of features
    pub n_features: usize,
    /// Feature names
    pub featurenames: Option<Vec<String>>,
    /// Target names
    pub targetnames: Option<Vec<String>>,
    /// Dataset description
    pub description: Option<String>,
    /// Estimated memory usage in bytes; the explorer counts every feature
    /// value and every target value as one `f64`
    pub memory_usage: usize,
}
89
/// Statistical summary of features
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStatistics {
    /// Per-feature statistics, one entry per column in column order
    pub features: Vec<FeatureStats>,
    /// Feature-by-feature correlation matrix; the explorer fills this only
    /// when `ExploreConfig::show_detailed_stats` is enabled
    pub correlations: Option<Array2<f64>>,
}
98
/// Statistics for a single feature
///
/// When produced by [`DatasetExplorer`], NaN entries are treated as missing
/// and are excluded from every numerical statistic below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStats {
    /// Feature name
    pub name: String,
    /// Feature index
    pub index: usize,
    /// Data type inference
    pub data_type: InferredDataType,
    /// Total number of values in the feature, including missing ones
    pub count: usize,
    /// Mean of the valid values (`None` when there are no valid values)
    pub mean: Option<f64>,
    /// Standard deviation of the valid values; the explorer computes the
    /// population form (divides by the number of valid values)
    pub std: Option<f64>,
    /// Minimum valid value
    pub min: Option<f64>,
    /// Maximum valid value
    pub max: Option<f64>,
    /// Median of the valid values
    pub median: Option<f64>,
    /// 25th percentile
    pub q25: Option<f64>,
    /// 75th percentile
    pub q75: Option<f64>,
    /// Number of distinct values; filled only for features inferred as
    /// categorical or binary
    pub unique_count: Option<usize>,
    /// List of unique values; filled only when their number does not exceed
    /// `ExploreConfig::max_unique_values`
    pub unique_values: Option<Vec<String>>,
    /// Missing data count (number of NaN entries)
    pub missing_count: usize,
}
131
/// Inferred data type for a feature
///
/// The variant notes below describe how the explorer's own inference assigns
/// each variant from the non-missing `f64` values of a column.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum InferredDataType {
    /// Continuous numerical data: at least one value has a fractional part,
    /// or there are more than 20 distinct integer values
    Numerical,
    /// Categorical data: 3 to 20 distinct integer values
    Categorical,
    /// Binary data: exactly two distinct integer values
    Binary,
    /// Unknown data type: no valid values, or a single constant value
    Unknown,
}
144
/// Missing data analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingDataAnalysis {
    /// Total missing values across all features
    pub total_missing: usize,
    /// Missing percentage relative to all feature values (0-100)
    pub missing_percentage: f64,
    /// Per-feature missing data as `(feature name, missing count,
    /// missing percentage of that feature's samples)`
    pub feature_missing: Vec<(String, usize, f64)>,
    /// Missing data patterns, most frequent first (the explorer keeps at
    /// most the ten most frequent)
    pub missing_patterns: Vec<MissingPattern>,
}
157
/// Missing data pattern
///
/// Describes one distinct combination of missing features observed in rows.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingPattern {
    /// Pattern description: one flag per feature, `true` where that feature
    /// is missing in the row
    pub pattern: Vec<bool>,
    /// Number of samples with this pattern
    pub count: usize,
    /// Percentage of samples with this pattern (0-100)
    pub percentage: f64,
}
168
/// Target variable analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TargetAnalysis {
    /// Target variable statistics
    pub target_stats: FeatureStats,
    /// Class distribution (for classification): sample count per class,
    /// keyed by the class value formatted without decimal places
    pub class_distribution: Option<HashMap<String, usize>>,
    /// Target-feature correlations as `(feature name, correlation)`,
    /// ordered by decreasing absolute correlation
    pub correlations_with_features: Vec<(String, f64)>,
}
179
/// Data quality assessment
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityAssessment {
    /// Overall quality score (0-100); starts at 100 and is reduced for each
    /// detected issue, never dropping below 0
    pub quality_score: f64,
    /// Identified issues
    pub issues: Vec<QualityIssue>,
    /// Recommendations derived from the identified issues
    pub recommendations: Vec<String>,
}
190
/// Data quality issue
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    /// Issue type
    pub issue_type: IssueType,
    /// Severity level
    pub severity: Severity,
    /// Human-readable description of the issue
    pub description: String,
    /// Affected features; for correlation issues each entry names a pair
    /// of features rather than a single one
    pub affected_features: Vec<String>,
}
203
/// Type of data quality issue
///
/// Categorises a [`QualityIssue`]; recommendations are chosen per variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueType {
    /// Missing data values
    MissingData,
    /// Statistical outliers
    Outliers,
    /// Duplicate rows
    Duplicates,
    /// Low variance features
    LowVariance,
    /// Highly correlated features
    HighCorrelation,
    /// Imbalanced class distribution
    ImbalancedClasses,
    /// Skewed data distribution
    SkewedDistribution,
}
222
/// Severity level of an issue
///
/// Variants are listed from least to most severe.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Severity {
    /// Low severity issue
    Low,
    /// Medium severity issue
    Medium,
    /// High severity issue
    High,
    /// Critical severity issue
    Critical,
}
235
/// Dataset explorer
///
/// Computes and displays dataset summaries according to its
/// [`ExploreConfig`].
pub struct DatasetExplorer {
    /// Settings that drive analysis detail and output formatting
    config: ExploreConfig,
}
240
241impl DatasetExplorer {
242    /// Create a new dataset explorer
243    pub fn new(config: ExploreConfig) -> Self {
244        Self { config }
245    }
246
247    /// Create with default configuration
248    pub fn default_config() -> Self {
249        Self::new(ExploreConfig::default())
250    }
251
252    /// Generate a comprehensive dataset summary
253    pub fn summarize(&self, dataset: &Dataset) -> Result<DatasetSummary> {
254        let info = self.collect_basic_info(dataset);
255        let statistics = self.compute_feature_statistics(dataset)?;
256        let missingdata = self.analyze_missingdata(dataset);
257        let targetanalysis = self.analyze_target(dataset)?;
258        let quality_assessment = self.assess_quality(dataset, &statistics, &missingdata)?;
259
260        Ok(DatasetSummary {
261            info,
262            statistics,
263            missingdata,
264            targetanalysis,
265            quality_assessment,
266        })
267    }
268
269    /// Display dataset summary in the configured format
270    pub fn display_summary(&self, summary: &DatasetSummary) -> Result<()> {
271        match self.config.output_format {
272            OutputFormat::Table => self.display_table(summary),
273            OutputFormat::Json => self.display_json(summary),
274            OutputFormat::Csv => self.display_csv(summary),
275            OutputFormat::Markdown => self.display_markdown(summary),
276        }
277    }
278
279    /// Start interactive exploration session
280    pub fn interactive_explore(&self, dataset: &Dataset) -> Result<()> {
281        if !self.config.interactive {
282            return Err(DatasetsError::InvalidFormat(
283                "Interactive mode not enabled".to_string(),
284            ));
285        }
286
287        println!("🔍 Interactive Dataset Explorer");
288        println!("==============================");
289
290        let summary = self.summarize(dataset)?;
291        self.display_basic_info(&summary.info);
292
293        loop {
294            println!("\nCommands:");
295            println!("  1. Summary statistics");
296            println!("  2. Feature details");
297            println!("  3. Missing data analysis");
298            println!("  4. Target analysis");
299            println!("  5. Quality assessment");
300            println!("  6. Export summary");
301            println!("  q. Quit");
302
303            print!("\nEnter command: ");
304            io::stdout().flush().unwrap();
305
306            let mut input = String::new();
307            io::stdin().read_line(&mut input).unwrap();
308            let input = input.trim();
309
310            match input {
311                "1" => self.display_statistics(&summary.statistics)?,
312                "2" => self.interactive_feature_details(dataset, &summary.statistics)?,
313                "3" => self.display_missingdata(&summary.missingdata)?,
314                "4" => {
315                    if let Some(ref targetanalysis) = summary.targetanalysis {
316                        self.display_targetanalysis(targetanalysis)?;
317                    } else {
318                        println!("No target variable found in dataset.");
319                    }
320                }
321                "5" => self.display_quality_assessment(&summary.quality_assessment)?,
322                "6" => self.export_summary(&summary)?,
323                "q" | "quit" | "exit" => break,
324                _ => println!("Invalid command. Please try again."),
325            }
326        }
327
328        Ok(())
329    }
330
331    // Implementation methods
332
333    fn collect_basic_info(&self, dataset: &Dataset) -> DatasetInfo {
334        let n_samples = dataset.n_samples();
335        let n_features = dataset.n_features();
336
337        // Estimate memory usage
338        let data_size = n_samples * n_features * std::mem::size_of::<f64>();
339        let target_size = dataset
340            .target
341            .as_ref()
342            .map(|t| t.len() * std::mem::size_of::<f64>())
343            .unwrap_or(0);
344        let memory_usage = data_size + target_size;
345
346        DatasetInfo {
347            n_samples,
348            n_features,
349            featurenames: dataset.featurenames.clone(),
350            targetnames: dataset.targetnames.clone(),
351            description: dataset.description.clone(),
352            memory_usage,
353        }
354    }
355
356    fn compute_feature_statistics(&self, dataset: &Dataset) -> Result<FeatureStatistics> {
357        let mut features = Vec::new();
358
359        for (i, column) in dataset.data.columns().into_iter().enumerate() {
360            let name = dataset
361                .featurenames
362                .as_ref()
363                .and_then(|names| names.get(i))
364                .cloned()
365                .unwrap_or_else(|| format!("feature_{i}"));
366
367            let stats = self.compute_single_feature_stats(&name, i, &column)?;
368            features.push(stats);
369        }
370
371        // Compute correlation matrix for numerical features
372        let correlations = if self.config.show_detailed_stats {
373            Some(self.compute_correlation_matrix(dataset)?)
374        } else {
375            None
376        };
377
378        Ok(FeatureStatistics {
379            features,
380            correlations,
381        })
382    }
383
384    fn compute_single_feature_stats(
385        &self,
386        name: &str,
387        index: usize,
388        column: &scirs2_core::ndarray::ArrayView1<f64>,
389    ) -> Result<FeatureStats> {
390        let values: Vec<f64> = column.iter().copied().collect();
391        let count = values.len();
392        let missing_count = values.iter().filter(|&&x| x.is_nan()).count();
393        let valid_values: Vec<f64> = values.iter().copied().filter(|x| !x.is_nan()).collect();
394
395        let (mean, std, min, max, median, q25, q75) = if !valid_values.is_empty() {
396            let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
397
398            let variance = valid_values.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
399                / valid_values.len() as f64;
400            let std = variance.sqrt();
401
402            let mut sorted_values = valid_values.clone();
403            sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
404
405            let min = sorted_values.first().copied();
406            let max = sorted_values.last().copied();
407
408            let median = Self::percentile(&sorted_values, 0.5);
409            let q25 = Self::percentile(&sorted_values, 0.25);
410            let q75 = Self::percentile(&sorted_values, 0.75);
411
412            (Some(mean), Some(std), min, max, median, q25, q75)
413        } else {
414            (None, None, None, None, None, None, None)
415        };
416
417        // Infer data type
418        let data_type = self.infer_data_type(&valid_values);
419
420        // For categorical-like data, compute unique values
421        let (unique_count, unique_values) = if matches!(
422            data_type,
423            InferredDataType::Categorical | InferredDataType::Binary
424        ) {
425            let mut unique: std::collections::HashSet<String> = std::collections::HashSet::new();
426            for &value in &valid_values {
427                unique.insert(format!("{value:.0}"));
428            }
429
430            let unique_count = unique.len();
431            let unique_values = if unique_count <= self.config.max_unique_values {
432                let mut values: Vec<String> = unique.into_iter().collect();
433                values.sort();
434                Some(values)
435            } else {
436                None
437            };
438
439            (Some(unique_count), unique_values)
440        } else {
441            (None, None)
442        };
443
444        Ok(FeatureStats {
445            name: name.to_string(),
446            index,
447            data_type,
448            count,
449            mean,
450            std,
451            min,
452            max,
453            median,
454            q25,
455            q75,
456            unique_count,
457            unique_values,
458            missing_count,
459        })
460    }
461
462    fn percentile(sorted_values: &[f64], p: f64) -> Option<f64> {
463        if sorted_values.is_empty() {
464            return None;
465        }
466
467        let index = p * (sorted_values.len() - 1) as f64;
468        let lower = index.floor() as usize;
469        let upper = index.ceil() as usize;
470
471        if lower == upper {
472            Some(sorted_values[lower])
473        } else {
474            let weight = index - lower as f64;
475            Some(sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight)
476        }
477    }
478
479    fn infer_data_type(&self, values: &[f64]) -> InferredDataType {
480        if values.is_empty() {
481            return InferredDataType::Unknown;
482        }
483
484        // Check if all values are integers
485        let all_integers = values.iter().all(|&x| x.fract() == 0.0);
486
487        if all_integers {
488            let unique_values: std::collections::HashSet<i64> =
489                values.iter().map(|&x| x as i64).collect();
490
491            match unique_values.len() {
492                1 => InferredDataType::Unknown, // Constant
493                2 => InferredDataType::Binary,
494                3..=20 => InferredDataType::Categorical,
495                _ => InferredDataType::Numerical,
496            }
497        } else {
498            InferredDataType::Numerical
499        }
500    }
501
502    fn compute_correlation_matrix(&self, dataset: &Dataset) -> Result<Array2<f64>> {
503        let n_features = dataset.n_features();
504        let mut correlations = Array2::zeros((n_features, n_features));
505
506        for i in 0..n_features {
507            for j in 0..n_features {
508                if i == j {
509                    correlations[[i, j]] = 1.0;
510                } else {
511                    let col_i = dataset.data.column(i);
512                    let col_j = dataset.data.column(j);
513
514                    let corr = self.compute_correlation(&col_i, &col_j);
515                    correlations[[i, j]] = corr;
516                }
517            }
518        }
519
520        Ok(correlations)
521    }
522
523    fn compute_correlation(
524        &self,
525        x: &scirs2_core::ndarray::ArrayView1<f64>,
526        y: &scirs2_core::ndarray::ArrayView1<f64>,
527    ) -> f64 {
528        let x_vals: Vec<f64> = x.iter().copied().filter(|v| !v.is_nan()).collect();
529        let y_vals: Vec<f64> = y.iter().copied().filter(|v| !v.is_nan()).collect();
530
531        if x_vals.len() != y_vals.len() || x_vals.len() < 2 {
532            return 0.0;
533        }
534
535        let mean_x = x_vals.iter().sum::<f64>() / x_vals.len() as f64;
536        let mean_y = y_vals.iter().sum::<f64>() / y_vals.len() as f64;
537
538        let mut numerator = 0.0;
539        let mut sum_sq_x = 0.0;
540        let mut sum_sq_y = 0.0;
541
542        for (x_val, y_val) in x_vals.iter().zip(y_vals.iter()) {
543            let dx = x_val - mean_x;
544            let dy = y_val - mean_y;
545
546            numerator += dx * dy;
547            sum_sq_x += dx * dx;
548            sum_sq_y += dy * dy;
549        }
550
551        let denominator = (sum_sq_x * sum_sq_y).sqrt();
552
553        if denominator == 0.0 {
554            0.0
555        } else {
556            numerator / denominator
557        }
558    }
559
560    fn analyze_missingdata(&self, dataset: &Dataset) -> MissingDataAnalysis {
561        let n_samples = dataset.n_samples();
562        let n_features = dataset.n_features();
563        let total_values = n_samples * n_features;
564
565        let mut total_missing = 0;
566        let mut feature_missing = Vec::new();
567
568        for (i, column) in dataset.data.columns().into_iter().enumerate() {
569            let missing_count = column.iter().filter(|&&x| x.is_nan()).count();
570            total_missing += missing_count;
571
572            let featurename = dataset
573                .featurenames
574                .as_ref()
575                .and_then(|names| names.get(i))
576                .cloned()
577                .unwrap_or_else(|| format!("feature_{i}"));
578
579            let missing_percentage = missing_count as f64 / n_samples as f64 * 100.0;
580            feature_missing.push((featurename, missing_count, missing_percentage));
581        }
582
583        let missing_percentage = total_missing as f64 / total_values as f64 * 100.0;
584
585        // Analyze missing patterns (simplified)
586        let missing_patterns = self.analyze_missing_patterns(dataset);
587
588        MissingDataAnalysis {
589            total_missing,
590            missing_percentage,
591            feature_missing,
592            missing_patterns,
593        }
594    }
595
596    fn analyze_missing_patterns(&self, dataset: &Dataset) -> Vec<MissingPattern> {
597        let mut pattern_counts: HashMap<Vec<bool>, usize> = HashMap::new();
598
599        for row in dataset.data.rows() {
600            let pattern: Vec<bool> = row.iter().map(|&x| x.is_nan()).collect();
601            *pattern_counts.entry(pattern).or_insert(0) += 1;
602        }
603
604        let total_samples = dataset.n_samples() as f64;
605        let mut patterns: Vec<MissingPattern> = pattern_counts
606            .into_iter()
607            .map(|(pattern, count)| MissingPattern {
608                pattern,
609                count,
610                percentage: count as f64 / total_samples * 100.0,
611            })
612            .collect();
613
614        // Sort by frequency
615        patterns.sort_by(|a, b| b.count.cmp(&a.count));
616
617        // Keep only top 10 patterns
618        patterns.truncate(10);
619
620        patterns
621    }
622
623    fn analyze_target(&self, dataset: &Dataset) -> Result<Option<TargetAnalysis>> {
624        let target = match &dataset.target {
625            Some(target) => target,
626            None => return Ok(None),
627        };
628
629        let target_column = target.view();
630        let target_stats = self.compute_single_feature_stats("target", 0, &target_column)?;
631
632        // Compute class distribution for classification
633        let class_distribution = if matches!(
634            target_stats.data_type,
635            InferredDataType::Categorical | InferredDataType::Binary
636        ) {
637            let mut distribution = HashMap::new();
638            for &value in target.iter() {
639                if !value.is_nan() {
640                    let classname = format!("{value:.0}");
641                    *distribution.entry(classname).or_insert(0) += 1;
642                }
643            }
644            Some(distribution)
645        } else {
646            None
647        };
648
649        // Compute correlations with features
650        let mut correlations_with_features = Vec::new();
651        for (i, column) in dataset.data.columns().into_iter().enumerate() {
652            let featurename = dataset
653                .featurenames
654                .as_ref()
655                .and_then(|names| names.get(i))
656                .cloned()
657                .unwrap_or_else(|| format!("feature_{i}"));
658
659            let correlation = self.compute_correlation(&column, &target_column);
660            correlations_with_features.push((featurename, correlation));
661        }
662
663        // Sort by absolute correlation
664        correlations_with_features.sort_by(|a, b| b.1.abs().partial_cmp(&a.1.abs()).unwrap());
665
666        Ok(Some(TargetAnalysis {
667            target_stats,
668            class_distribution,
669            correlations_with_features,
670        }))
671    }
672
673    fn assess_quality(
674        &self,
675        _dataset: &Dataset,
676        statistics: &FeatureStatistics,
677        missingdata: &MissingDataAnalysis,
678    ) -> Result<QualityAssessment> {
679        let mut issues = Vec::new();
680        let mut quality_score = 100.0;
681
682        // Check missing _data
683        if missingdata.missing_percentage > 5.0 {
684            let severity = if missingdata.missing_percentage > 20.0 {
685                Severity::High
686            } else if missingdata.missing_percentage > 10.0 {
687                Severity::Medium
688            } else {
689                Severity::Low
690            };
691
692            issues.push(QualityIssue {
693                issue_type: IssueType::MissingData,
694                severity,
695                description: format!("{:.1}% of _data is missing", missingdata.missing_percentage),
696                affected_features: missingdata
697                    .feature_missing
698                    .iter()
699                    .filter(|(_, _, pct)| *pct > 5.0)
700                    .map(|(name, _, _)| name.clone())
701                    .collect(),
702            });
703
704            quality_score -= missingdata.missing_percentage.min(30.0);
705        }
706
707        // Check for low variance features
708        let low_variance_features: Vec<String> = statistics
709            .features
710            .iter()
711            .filter(|f| f.std.is_some_and(|std| std < 1e-6))
712            .map(|f| f.name.clone())
713            .collect();
714
715        if !low_variance_features.is_empty() {
716            issues.push(QualityIssue {
717                issue_type: IssueType::LowVariance,
718                severity: Severity::Medium,
719                description: format!(
720                    "{} features have very low variance",
721                    low_variance_features.len()
722                ),
723                affected_features: low_variance_features,
724            });
725
726            quality_score -= 10.0;
727        }
728
729        // Check for highly correlated features
730        if let Some(ref correlations) = statistics.correlations {
731            let mut high_corr_pairs = Vec::new();
732            for i in 0..correlations.nrows() {
733                for j in (i + 1)..correlations.ncols() {
734                    if correlations[[i, j]].abs() > 0.9 {
735                        let name_i = statistics.features[i].name.clone();
736                        let name_j = statistics.features[j].name.clone();
737                        high_corr_pairs.push(format!("{name_i} - {name_j}"));
738                    }
739                }
740            }
741
742            if !high_corr_pairs.is_empty() {
743                issues.push(QualityIssue {
744                    issue_type: IssueType::HighCorrelation,
745                    severity: Severity::Medium,
746                    description: format!(
747                        "{} highly correlated feature pairs found",
748                        high_corr_pairs.len()
749                    ),
750                    affected_features: high_corr_pairs,
751                });
752
753                quality_score -= 5.0;
754            }
755        }
756
757        let recommendations = self.generate_recommendations(&issues);
758
759        Ok(QualityAssessment {
760            quality_score: quality_score.max(0.0),
761            issues,
762            recommendations,
763        })
764    }
765
766    fn generate_recommendations(&self, issues: &[QualityIssue]) -> Vec<String> {
767        let mut recommendations = Vec::new();
768
769        for issue in issues {
770            match issue.issue_type {
771                IssueType::MissingData => {
772                    recommendations.push("Consider imputation strategies for missing data or remove features with excessive missing values".to_string());
773                }
774                IssueType::LowVariance => {
775                    recommendations.push(
776                        "Remove low variance features as they provide little information"
777                            .to_string(),
778                    );
779                }
780                IssueType::HighCorrelation => {
781                    recommendations.push("Consider removing redundant highly correlated features or use dimensionality reduction".to_string());
782                }
783                _ => {}
784            }
785        }
786
787        if recommendations.is_empty() {
788            recommendations.push("Dataset appears to be of good quality".to_string());
789        }
790
791        recommendations
792    }
793
794    // Display methods
795
796    fn display_table(&self, summary: &DatasetSummary) -> Result<()> {
797        self.display_basic_info(&summary.info);
798        self.display_statistics(&summary.statistics)?;
799        self.display_missingdata(&summary.missingdata)?;
800
801        if let Some(ref targetanalysis) = summary.targetanalysis {
802            self.display_targetanalysis(targetanalysis)?;
803        }
804
805        self.display_quality_assessment(&summary.quality_assessment)?;
806
807        Ok(())
808    }
809
810    fn display_basic_info(&self, info: &DatasetInfo) {
811        println!("📊 Dataset Overview");
812        println!("==================");
813        println!("Samples: {}", info.n_samples);
814        println!("Features: {}", info.n_features);
815        println!(
816            "Memory usage: {:.2} MB",
817            info.memory_usage as f64 / 1_048_576.0
818        );
819
820        if let Some(ref description) = info.description {
821            println!("Description: {description}");
822        }
823
824        println!();
825    }
826
827    fn display_statistics(&self, statistics: &FeatureStatistics) -> Result<()> {
828        println!("📈 Feature Statistics");
829        println!("====================");
830
831        // Display table header
832        println!(
833            "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
834            "Feature", "Type", "Mean", "Std", "Min", "Max", "Missing"
835        );
836        let separator = "-".repeat(80);
837        println!("{separator}");
838
839        for feature in &statistics.features {
840            let type_str = match feature.data_type {
841                InferredDataType::Numerical => "num",
842                InferredDataType::Categorical => "cat",
843                InferredDataType::Binary => "bin",
844                InferredDataType::Unknown => "unk",
845            };
846
847            println!(
848                "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
849                feature.name.chars().take(15).collect::<String>(),
850                type_str,
851                feature
852                    .mean
853                    .map(|x| format!("{x:.3}"))
854                    .unwrap_or_else(|| "-".to_string()),
855                feature
856                    .std
857                    .map(|x| format!("{x:.3}"))
858                    .unwrap_or_else(|| "-".to_string()),
859                feature
860                    .min
861                    .map(|x| format!("{x:.3}"))
862                    .unwrap_or_else(|| "-".to_string()),
863                feature
864                    .max
865                    .map(|x| format!("{x:.3}"))
866                    .unwrap_or_else(|| "-".to_string()),
867                feature.missing_count
868            );
869        }
870
871        println!();
872        Ok(())
873    }
874
875    fn display_missingdata(&self, missingdata: &MissingDataAnalysis) -> Result<()> {
876        println!("❌ Missing Data Analysis");
877        println!("========================");
878        println!(
879            "Total missing: {} ({:.2}%)",
880            missingdata.total_missing, missingdata.missing_percentage
881        );
882
883        if !missingdata.feature_missing.is_empty() {
884            println!("\nMissing by feature:");
885            for (feature, count, percentage) in &missingdata.feature_missing {
886                if *count > 0 {
887                    println!("  {feature}: {count} ({percentage:.1}%)");
888                }
889            }
890        }
891
892        println!();
893        Ok(())
894    }
895
896    fn display_targetanalysis(&self, targetanalysis: &TargetAnalysis) -> Result<()> {
897        println!("🎯 Target Analysis");
898        println!("==================");
899
900        let target = &targetanalysis.target_stats;
901        println!("Target type: {:?}", target.data_type);
902
903        if let Some(ref distribution) = targetanalysis.class_distribution {
904            println!("\nClass distribution:");
905            for (class, count) in distribution {
906                println!("  {class}: {count}");
907            }
908        }
909
910        println!("\nTop correlations with features:");
911        for (feature, correlation) in targetanalysis.correlations_with_features.iter().take(5) {
912            println!("  {feature}: {correlation:.3}");
913        }
914
915        println!();
916        Ok(())
917    }
918
919    fn display_quality_assessment(&self, quality: &QualityAssessment) -> Result<()> {
920        println!("✅ Quality Assessment");
921        println!("=====================");
922        println!("Quality score: {:.1}/100", quality.quality_score);
923
924        if !quality.issues.is_empty() {
925            println!("\nIssues found:");
926            for issue in &quality.issues {
927                let severity_icon = match issue.severity {
928                    Severity::Low => "⚠️",
929                    Severity::Medium => "🟡",
930                    Severity::High => "🟠",
931                    Severity::Critical => "🔴",
932                };
933                println!("  {} {}", severity_icon, issue.description);
934            }
935        }
936
937        println!("\nRecommendations:");
938        for recommendation in &quality.recommendations {
939            println!("  • {recommendation}");
940        }
941
942        println!();
943        Ok(())
944    }
945
946    fn display_json(&self, summary: &DatasetSummary) -> Result<()> {
947        let json = serde_json::to_string_pretty(summary)
948            .map_err(|e| DatasetsError::SerdeError(e.to_string()))?;
949        println!("{json}");
950        Ok(())
951    }
952
953    fn display_csv(&self, summary: &DatasetSummary) -> Result<()> {
954        // CSV format for feature statistics
955        println!("feature,type,count,mean,std,min,max,missing");
956        for feature in &summary.statistics.features {
957            println!(
958                "{},{:?},{},{},{},{},{},{}",
959                feature.name,
960                feature.data_type,
961                feature.count,
962                feature
963                    .mean
964                    .map(|x| x.to_string())
965                    .unwrap_or_else(|| "".to_string()),
966                feature
967                    .std
968                    .map(|x| x.to_string())
969                    .unwrap_or_else(|| "".to_string()),
970                feature
971                    .min
972                    .map(|x| x.to_string())
973                    .unwrap_or_else(|| "".to_string()),
974                feature
975                    .max
976                    .map(|x| x.to_string())
977                    .unwrap_or_else(|| "".to_string()),
978                feature.missing_count
979            );
980        }
981        Ok(())
982    }
983
984    fn display_markdown(&self, summary: &DatasetSummary) -> Result<()> {
985        println!("# Dataset Summary\n");
986
987        println!("## Overview\n");
988        println!("- **Samples**: {}", summary.info.n_samples);
989        println!("- **Features**: {}", summary.info.n_features);
990        println!(
991            "- **Memory usage**: {:.2} MB\n",
992            summary.info.memory_usage as f64 / 1_048_576.0
993        );
994
995        println!("## Feature Statistics\n");
996        println!("| Feature | Type | Mean | Std | Min | Max | Missing |");
997        println!("|---------|------|------|-----|-----|-----|---------|");
998
999        for feature in &summary.statistics.features {
1000            println!(
1001                "| {} | {:?} | {} | {} | {} | {} | {} |",
1002                feature.name,
1003                feature.data_type,
1004                feature
1005                    .mean
1006                    .map(|x| format!("{x:.3}"))
1007                    .unwrap_or_else(|| "-".to_string()),
1008                feature
1009                    .std
1010                    .map(|x| format!("{x:.3}"))
1011                    .unwrap_or_else(|| "-".to_string()),
1012                feature
1013                    .min
1014                    .map(|x| format!("{x:.3}"))
1015                    .unwrap_or_else(|| "-".to_string()),
1016                feature
1017                    .max
1018                    .map(|x| format!("{x:.3}"))
1019                    .unwrap_or_else(|| "-".to_string()),
1020                feature.missing_count
1021            );
1022        }
1023
1024        println!(
1025            "\n## Quality Score: {:.1}/100\n",
1026            summary.quality_assessment.quality_score
1027        );
1028
1029        Ok(())
1030    }
1031
1032    fn interactive_feature_details(
1033        &self,
1034        dataset: &Dataset,
1035        statistics: &FeatureStatistics,
1036    ) -> Result<()> {
1037        println!("\nFeature Details");
1038        println!("===============");
1039
1040        for (i, feature) in statistics.features.iter().enumerate() {
1041            println!("{}. {}", i + 1, feature.name);
1042        }
1043
1044        print!("\nEnter feature number (or 'back'): ");
1045        io::stdout().flush().unwrap();
1046
1047        let mut input = String::new();
1048        io::stdin().read_line(&mut input).unwrap();
1049        let input = input.trim();
1050
1051        if input == "back" {
1052            return Ok(());
1053        }
1054
1055        if let Ok(index) = input.parse::<usize>() {
1056            if index > 0 && index <= statistics.features.len() {
1057                let feature = &statistics.features[index - 1];
1058                self.display_feature_detail(feature, dataset)?;
1059            } else {
1060                println!("Invalid feature number.");
1061            }
1062        } else {
1063            println!("Invalid input.");
1064        }
1065
1066        Ok(())
1067    }
1068
1069    fn display_feature_detail(&self, feature: &FeatureStats, _dataset: &Dataset) -> Result<()> {
1070        println!("\n📊 Feature: {}", feature.name);
1071        println!("==================");
1072        println!("Type: {:?}", feature.data_type);
1073        println!("Count: {}", feature.count);
1074        println!(
1075            "Missing: {} ({:.1}%)",
1076            feature.missing_count,
1077            feature.missing_count as f64 / feature.count as f64 * 100.0
1078        );
1079
1080        if let Some(mean) = feature.mean {
1081            println!("Mean: {mean:.6}");
1082        }
1083        if let Some(std) = feature.std {
1084            println!("Std: {std:.6}");
1085        }
1086        if let Some(min) = feature.min {
1087            println!("Min: {min:.6}");
1088        }
1089        if let Some(max) = feature.max {
1090            println!("Max: {max:.6}");
1091        }
1092        if let Some(median) = feature.median {
1093            println!("Median: {median:.6}");
1094        }
1095        if let Some(q25) = feature.q25 {
1096            println!("Q25: {q25:.6}");
1097        }
1098        if let Some(q75) = feature.q75 {
1099            println!("Q75: {q75:.6}");
1100        }
1101
1102        if let Some(ref unique_values) = feature.unique_values {
1103            println!("Unique values: {unique_values:?}");
1104        } else if let Some(unique_count) = feature.unique_count {
1105            println!("Unique count: {unique_count}");
1106        }
1107
1108        Ok(())
1109    }
1110
1111    fn export_summary(&self, summary: &DatasetSummary) -> Result<()> {
1112        print!("Export format (json/csv/markdown): ");
1113        io::stdout().flush().unwrap();
1114
1115        let mut input = String::new();
1116        io::stdin().read_line(&mut input).unwrap();
1117        let format = input.trim();
1118
1119        let filename = format!("dataset_summary.{format}");
1120
1121        let content = match format {
1122            "json" => serde_json::to_string_pretty(summary)
1123                .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
1124            "csv" => {
1125                let mut csv_content = String::from("feature,type,count,mean,std,min,max,missing\n");
1126                for feature in &summary.statistics.features {
1127                    csv_content.push_str(&format!(
1128                        "{},{:?},{},{},{},{},{},{}\n",
1129                        feature.name,
1130                        feature.data_type,
1131                        feature.count,
1132                        feature
1133                            .mean
1134                            .map(|x| x.to_string())
1135                            .unwrap_or_else(|| "".to_string()),
1136                        feature
1137                            .std
1138                            .map(|x| x.to_string())
1139                            .unwrap_or_else(|| "".to_string()),
1140                        feature
1141                            .min
1142                            .map(|x| x.to_string())
1143                            .unwrap_or_else(|| "".to_string()),
1144                        feature
1145                            .max
1146                            .map(|x| x.to_string())
1147                            .unwrap_or_else(|| "".to_string()),
1148                        feature.missing_count
1149                    ));
1150                }
1151                csv_content
1152            }
1153            "markdown" => {
1154                // Generate markdown content
1155                format!(
1156                    "# Dataset Summary\n\nQuality Score: {:.1}/100\n",
1157                    summary.quality_assessment.quality_score
1158                )
1159            }
1160            _ => {
1161                return Err(DatasetsError::InvalidFormat(
1162                    "Unsupported export format".to_string(),
1163                ))
1164            }
1165        };
1166
1167        std::fs::write(&filename, content).map_err(DatasetsError::IoError)?;
1168
1169        println!("Summary exported to: {filename}");
1170        Ok(())
1171    }
1172}
1173
1174/// Convenience functions for dataset exploration
1175pub mod convenience {
1176    use super::*;
1177
1178    /// Quick dataset summary with default configuration
1179    pub fn quick_summary(dataset: &Dataset) -> Result<DatasetSummary> {
1180        let explorer = DatasetExplorer::default_config();
1181        explorer.summarize(dataset)
1182    }
1183
1184    /// Display basic dataset information
1185    pub fn info(dataset: &Dataset) -> Result<()> {
1186        let explorer = DatasetExplorer::default_config();
1187        let summary = explorer.summarize(dataset)?;
1188        explorer.display_basic_info(&summary.info);
1189        Ok(())
1190    }
1191
1192    /// Start interactive exploration
1193    pub fn explore(dataset: &Dataset) -> Result<()> {
1194        let config = ExploreConfig {
1195            interactive: true,
1196            ..Default::default()
1197        };
1198
1199        let explorer = DatasetExplorer::new(config);
1200        explorer.interactive_explore(dataset)
1201    }
1202
1203    /// Export dataset summary to file
1204    pub fn export_summary(dataset: &Dataset, format: OutputFormat, filename: &str) -> Result<()> {
1205        let config = ExploreConfig {
1206            output_format: format,
1207            ..Default::default()
1208        };
1209        let output_format = config.output_format;
1210
1211        let explorer = DatasetExplorer::new(config);
1212        let summary = explorer.summarize(dataset)?;
1213
1214        let content = match output_format {
1215            OutputFormat::Json => serde_json::to_string_pretty(&summary)
1216                .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
1217            _ => {
1218                return Err(DatasetsError::InvalidFormat(
1219                    "Only JSON export is currently supported in convenience function".to_string(),
1220                ));
1221            }
1222        };
1223
1224        std::fs::write(filename, content).map_err(DatasetsError::IoError)?;
1225
1226        Ok(())
1227    }
1228}
1229
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generators::make_classification;

    /// The default explorer carries the documented default configuration.
    #[test]
    fn testdataset_explorer_creation() {
        let explorer = DatasetExplorer::default_config();
        let config = &explorer.config;

        assert_eq!(config.precision, 3);
        assert!(config.show_detailed_stats);
    }

    /// The summary reports the generated dataset's sample and feature counts.
    #[test]
    fn test_basic_summary() {
        let dataset =
            make_classification(100, 5, 2, 1, 1, Some(42)).expect("dataset generation failed");
        let summary = convenience::quick_summary(&dataset).expect("summary failed");

        let info = &summary.info;
        assert_eq!(info.n_samples, 100);
        assert_eq!(info.n_features, 5);
        assert_eq!(summary.statistics.features.len(), 5);
    }

    /// Every generated feature gets mean, std, min and max.
    #[test]
    fn test_feature_statistics() {
        let dataset =
            make_classification(50, 3, 2, 1, 1, Some(42)).expect("dataset generation failed");
        let statistics = DatasetExplorer::default_config()
            .compute_feature_statistics(&dataset)
            .expect("statistics failed");

        assert_eq!(statistics.features.len(), 3);

        for stats in statistics.features.iter() {
            assert!(stats.mean.is_some(), "mean missing");
            assert!(stats.std.is_some(), "std missing");
            assert!(stats.min.is_some(), "min missing");
            assert!(stats.max.is_some(), "max missing");
        }
    }

    /// Clean synthetic data should score well on the quality assessment.
    #[test]
    fn test_quality_assessment() {
        let dataset =
            make_classification(100, 4, 2, 1, 1, Some(42)).expect("dataset generation failed");
        let summary = DatasetExplorer::default_config()
            .summarize(&dataset)
            .expect("summary failed");

        // Should have high quality score for synthetic data
        let score = summary.quality_assessment.quality_score;
        assert!(score > 80.0);
    }

    /// Type inference tells numerical, binary and categorical columns apart.
    #[test]
    fn test_data_type_inference() {
        let explorer = DatasetExplorer::default_config();

        // Test numerical data
        let numerical_data = vec![1.1, 2.3, 3.7, 4.2];
        let numerical = explorer.infer_data_type(&numerical_data);
        assert!(matches!(numerical, InferredDataType::Numerical));

        // Test binary data
        let binary_data = vec![0.0, 1.0, 0.0, 1.0];
        let binary = explorer.infer_data_type(&binary_data);
        assert!(matches!(binary, InferredDataType::Binary));

        // Test categorical data
        let categorical_data = vec![1.0, 2.0, 3.0, 1.0, 2.0];
        let categorical = explorer.infer_data_type(&categorical_data);
        assert!(matches!(categorical, InferredDataType::Categorical));
    }
}
scirs2_datasets/explore.rs

scirs2_datasets/
explore.rs