scirs2_datasets/
explore.rs

1//! Interactive dataset exploration and analysis tools
2//!
3//! This module provides CLI tools and utilities for exploring datasets interactively:
4//! - Dataset summary and statistics
5//! - Data visualization and plotting
6//! - Interactive data filtering and querying
7//! - Export functionality for exploration results
8
9use std::collections::HashMap;
10use std::io::{self, Write};
11
12use scirs2_core::ndarray::Array2;
13use serde::{Deserialize, Serialize};
14
15use crate::error::{DatasetsError, Result};
16use crate::utils::Dataset;
17
/// Configuration for dataset exploration.
///
/// Consumed by [`DatasetExplorer`] to decide how summaries are rendered and
/// whether the interactive session may be started.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExploreConfig {
    /// Output format used when a summary is displayed
    pub output_format: OutputFormat,
    /// Number of decimal places for numerical output
    pub precision: usize,
    /// Whether to show detailed statistics; when `false` the feature
    /// correlation matrix is not computed
    pub show_detailed_stats: bool,
    /// Maximum number of unique values to list for categorical data; features
    /// with more distinct values only report how many there are
    pub max_unique_values: usize,
    /// Enable interactive mode (the interactive session refuses to start
    /// unless this is `true`)
    pub interactive: bool,
}
32
33impl Default for ExploreConfig {
34    fn default() -> Self {
35        Self {
36            output_format: OutputFormat::Table,
37            precision: 3,
38            show_detailed_stats: true,
39            max_unique_values: 20,
40            interactive: false,
41        }
42    }
43}
44
/// Output format for exploration results.
///
/// Selects which renderer is used when a dataset summary is displayed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum OutputFormat {
    /// Plain text table format
    Table,
    /// JSON format
    Json,
    /// CSV format
    Csv,
    /// Markdown format
    Markdown,
}
57
/// Dataset exploration summary.
///
/// Aggregates every analysis produced for one dataset so it can be displayed
/// or serialized as a single value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSummary {
    /// Basic dataset information
    pub info: DatasetInfo,
    /// Statistical summary of features
    pub statistics: FeatureStatistics,
    /// Missing data analysis
    pub missingdata: MissingDataAnalysis,
    /// Target variable analysis (`None` when the dataset has no target)
    pub targetanalysis: Option<TargetAnalysis>,
    /// Data quality assessment
    pub quality_assessment: QualityAssessment,
}
72
/// Basic dataset information: shape, naming metadata and size estimate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetInfo {
    /// Number of samples
    pub n_samples: usize,
    /// Number of features
    pub n_features: usize,
    /// Feature names, if the dataset provides them
    pub featurenames: Option<Vec<String>>,
    /// Target names, if the dataset provides them
    pub targetnames: Option<Vec<String>>,
    /// Dataset description, if the dataset provides one
    pub description: Option<String>,
    /// Estimated memory usage in bytes (feature values plus target values,
    /// each counted as one `f64`)
    pub memory_usage: usize,
}
89
/// Statistical summary of features.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStatistics {
    /// Per-feature statistics, one entry per data column in column order
    pub features: Vec<FeatureStats>,
    /// Feature-by-feature correlation matrix; `None` when detailed
    /// statistics are disabled in the configuration
    pub correlations: Option<Array2<f64>>,
}
98
/// Statistics for a single feature.
///
/// Numerical fields are computed over the non-missing (non-NaN) values only
/// and are `None` when the feature has no valid value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureStats {
    /// Feature name
    pub name: String,
    /// Feature index
    pub index: usize,
    /// Data type inference
    pub data_type: InferredDataType,
    /// Total number of values in the feature, missing (NaN) entries included
    pub count: usize,
    /// Mean value (for numerical data)
    pub mean: Option<f64>,
    /// Standard deviation (population form: divides by the number of valid values)
    pub std: Option<f64>,
    /// Minimum value (for numerical data)
    pub min: Option<f64>,
    /// Maximum value (for numerical data)
    pub max: Option<f64>,
    /// Median value (for numerical data)
    pub median: Option<f64>,
    /// 25th percentile
    pub q25: Option<f64>,
    /// 75th percentile
    pub q75: Option<f64>,
    /// Number of distinct values (only for features inferred as categorical or binary)
    pub unique_count: Option<usize>,
    /// List of unique values (for categorical data with few values)
    pub unique_values: Option<Vec<String>>,
    /// Number of missing (NaN) entries
    pub missing_count: usize,
}
131
/// Inferred data type for a feature.
///
/// All feature values are stored as `f64`; the type is a heuristic guess
/// based on the values themselves.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum InferredDataType {
    /// Continuous numerical data
    Numerical,
    /// Categorical data (a small number of distinct integer-valued levels)
    Categorical,
    /// Binary data (exactly two distinct integer-valued levels)
    Binary,
    /// Unknown data type (e.g. no valid values, or a constant feature)
    Unknown,
}
144
/// Missing data analysis.
///
/// A value is considered missing when it is NaN.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingDataAnalysis {
    /// Total missing values across all features
    pub total_missing: usize,
    /// Missing values as a percentage of all feature values
    pub missing_percentage: f64,
    /// Per-feature missing data: `(feature name, missing count, missing percentage)`
    pub feature_missing: Vec<(String, usize, f64)>,
    /// Most frequent missing data patterns, most common first
    pub missing_patterns: Vec<MissingPattern>,
}
157
/// Missing data pattern: one combination of present/missing features
/// together with how often it occurs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissingPattern {
    /// One flag per feature, `true` where that feature is missing
    pub pattern: Vec<bool>,
    /// Number of samples with this pattern
    pub count: usize,
    /// Percentage of samples with this pattern
    pub percentage: f64,
}
168
/// Target variable analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TargetAnalysis {
    /// Target variable statistics
    pub target_stats: FeatureStats,
    /// Class distribution (class label -> sample count); only present when
    /// the target is inferred as categorical or binary
    pub class_distribution: Option<HashMap<String, usize>>,
    /// Target-feature correlations as `(feature name, correlation)`,
    /// ordered by decreasing absolute correlation
    pub correlations_with_features: Vec<(String, f64)>,
}
179
/// Data quality assessment.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityAssessment {
    /// Overall quality score (0-100, higher is better)
    pub quality_score: f64,
    /// Identified issues
    pub issues: Vec<QualityIssue>,
    /// Human-readable recommendations derived from the issues
    pub recommendations: Vec<String>,
}
190
/// Data quality issue: a single finding of the quality assessment.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    /// Issue type
    pub issue_type: IssueType,
    /// Severity level
    pub severity: Severity,
    /// Human-readable description
    pub description: String,
    /// Affected features (for correlation issues each entry names a pair,
    /// formatted as `"a - b"`)
    pub affected_features: Vec<String>,
}
203
/// Type of data quality issue.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueType {
    /// Missing data values
    MissingData,
    /// Statistical outliers
    Outliers,
    /// Duplicate rows
    Duplicates,
    /// Low variance (near-constant) features
    LowVariance,
    /// Highly correlated features
    HighCorrelation,
    /// Imbalanced class distribution
    ImbalancedClasses,
    /// Skewed data distribution
    SkewedDistribution,
}
222
/// Severity level of an issue, listed from least to most serious.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Severity {
    /// Low severity issue
    Low,
    /// Medium severity issue
    Medium,
    /// High severity issue
    High,
    /// Critical severity issue
    Critical,
}
235
/// Dataset explorer.
///
/// Computes summaries of a dataset and renders them according to its
/// [`ExploreConfig`].
pub struct DatasetExplorer {
    // Rendering and analysis options supplied at construction time.
    config: ExploreConfig,
}
240
241impl DatasetExplorer {
242    /// Create a new dataset explorer
243    pub fn new(config: ExploreConfig) -> Self {
244        Self { config }
245    }
246
247    /// Create with default configuration
248    pub fn default_config() -> Self {
249        Self::new(ExploreConfig::default())
250    }
251
252    /// Generate a comprehensive dataset summary
253    pub fn summarize(&self, dataset: &Dataset) -> Result<DatasetSummary> {
254        let info = self.collect_basic_info(dataset);
255        let statistics = self.compute_feature_statistics(dataset)?;
256        let missingdata = self.analyze_missingdata(dataset);
257        let targetanalysis = self.analyze_target(dataset)?;
258        let quality_assessment = self.assess_quality(dataset, &statistics, &missingdata)?;
259
260        Ok(DatasetSummary {
261            info,
262            statistics,
263            missingdata,
264            targetanalysis,
265            quality_assessment,
266        })
267    }
268
269    /// Display dataset summary in the configured format
270    pub fn display_summary(&self, summary: &DatasetSummary) -> Result<()> {
271        match self.config.output_format {
272            OutputFormat::Table => self.display_table(summary),
273            OutputFormat::Json => self.display_json(summary),
274            OutputFormat::Csv => self.display_csv(summary),
275            OutputFormat::Markdown => self.display_markdown(summary),
276        }
277    }
278
279    /// Start interactive exploration session
280    pub fn interactive_explore(&self, dataset: &Dataset) -> Result<()> {
281        if !self.config.interactive {
282            return Err(DatasetsError::InvalidFormat(
283                "Interactive mode not enabled".to_string(),
284            ));
285        }
286
287        println!("🔍 Interactive Dataset Explorer");
288        println!("==============================");
289
290        let summary = self.summarize(dataset)?;
291        self.display_basic_info(&summary.info);
292
293        loop {
294            println!("\nCommands:");
295            println!("  1. Summary statistics");
296            println!("  2. Feature details");
297            println!("  3. Missing data analysis");
298            println!("  4. Target analysis");
299            println!("  5. Quality assessment");
300            println!("  6. Export summary");
301            println!("  q. Quit");
302
303            print!("\nEnter command: ");
304            io::stdout().flush().expect("Operation failed");
305
306            let mut input = String::new();
307            io::stdin().read_line(&mut input).expect("Operation failed");
308            let input = input.trim();
309
310            match input {
311                "1" => self.display_statistics(&summary.statistics)?,
312                "2" => self.interactive_feature_details(dataset, &summary.statistics)?,
313                "3" => self.display_missingdata(&summary.missingdata)?,
314                "4" => {
315                    if let Some(ref targetanalysis) = summary.targetanalysis {
316                        self.display_targetanalysis(targetanalysis)?;
317                    } else {
318                        println!("No target variable found in dataset.");
319                    }
320                }
321                "5" => self.display_quality_assessment(&summary.quality_assessment)?,
322                "6" => self.export_summary(&summary)?,
323                "q" | "quit" | "exit" => break,
324                _ => println!("Invalid command. Please try again."),
325            }
326        }
327
328        Ok(())
329    }
330
331    // Implementation methods
332
333    fn collect_basic_info(&self, dataset: &Dataset) -> DatasetInfo {
334        let n_samples = dataset.n_samples();
335        let n_features = dataset.n_features();
336
337        // Estimate memory usage
338        let data_size = n_samples * n_features * std::mem::size_of::<f64>();
339        let target_size = dataset
340            .target
341            .as_ref()
342            .map(|t| t.len() * std::mem::size_of::<f64>())
343            .unwrap_or(0);
344        let memory_usage = data_size + target_size;
345
346        DatasetInfo {
347            n_samples,
348            n_features,
349            featurenames: dataset.featurenames.clone(),
350            targetnames: dataset.targetnames.clone(),
351            description: dataset.description.clone(),
352            memory_usage,
353        }
354    }
355
356    fn compute_feature_statistics(&self, dataset: &Dataset) -> Result<FeatureStatistics> {
357        let mut features = Vec::new();
358
359        for (i, column) in dataset.data.columns().into_iter().enumerate() {
360            let name = dataset
361                .featurenames
362                .as_ref()
363                .and_then(|names| names.get(i))
364                .cloned()
365                .unwrap_or_else(|| format!("feature_{i}"));
366
367            let stats = self.compute_single_feature_stats(&name, i, &column)?;
368            features.push(stats);
369        }
370
371        // Compute correlation matrix for numerical features
372        let correlations = if self.config.show_detailed_stats {
373            Some(self.compute_correlation_matrix(dataset)?)
374        } else {
375            None
376        };
377
378        Ok(FeatureStatistics {
379            features,
380            correlations,
381        })
382    }
383
384    fn compute_single_feature_stats(
385        &self,
386        name: &str,
387        index: usize,
388        column: &scirs2_core::ndarray::ArrayView1<f64>,
389    ) -> Result<FeatureStats> {
390        let values: Vec<f64> = column.iter().copied().collect();
391        let count = values.len();
392        let missing_count = values.iter().filter(|&&x| x.is_nan()).count();
393        let valid_values: Vec<f64> = values.iter().copied().filter(|x| !x.is_nan()).collect();
394
395        let (mean, std, min, max, median, q25, q75) = if !valid_values.is_empty() {
396            let mean = valid_values.iter().sum::<f64>() / valid_values.len() as f64;
397
398            let variance = valid_values.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
399                / valid_values.len() as f64;
400            let std = variance.sqrt();
401
402            let mut sorted_values = valid_values.clone();
403            sorted_values.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
404
405            let min = sorted_values.first().copied();
406            let max = sorted_values.last().copied();
407
408            let median = Self::percentile(&sorted_values, 0.5);
409            let q25 = Self::percentile(&sorted_values, 0.25);
410            let q75 = Self::percentile(&sorted_values, 0.75);
411
412            (Some(mean), Some(std), min, max, median, q25, q75)
413        } else {
414            (None, None, None, None, None, None, None)
415        };
416
417        // Infer data type
418        let data_type = self.infer_data_type(&valid_values);
419
420        // For categorical-like data, compute unique values
421        let (unique_count, unique_values) = if matches!(
422            data_type,
423            InferredDataType::Categorical | InferredDataType::Binary
424        ) {
425            let mut unique: std::collections::HashSet<String> = std::collections::HashSet::new();
426            for &value in &valid_values {
427                unique.insert(format!("{value:.0}"));
428            }
429
430            let unique_count = unique.len();
431            let unique_values = if unique_count <= self.config.max_unique_values {
432                let mut values: Vec<String> = unique.into_iter().collect();
433                values.sort();
434                Some(values)
435            } else {
436                None
437            };
438
439            (Some(unique_count), unique_values)
440        } else {
441            (None, None)
442        };
443
444        Ok(FeatureStats {
445            name: name.to_string(),
446            index,
447            data_type,
448            count,
449            mean,
450            std,
451            min,
452            max,
453            median,
454            q25,
455            q75,
456            unique_count,
457            unique_values,
458            missing_count,
459        })
460    }
461
462    fn percentile(sorted_values: &[f64], p: f64) -> Option<f64> {
463        if sorted_values.is_empty() {
464            return None;
465        }
466
467        let index = p * (sorted_values.len() - 1) as f64;
468        let lower = index.floor() as usize;
469        let upper = index.ceil() as usize;
470
471        if lower == upper {
472            Some(sorted_values[lower])
473        } else {
474            let weight = index - lower as f64;
475            Some(sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight)
476        }
477    }
478
479    fn infer_data_type(&self, values: &[f64]) -> InferredDataType {
480        if values.is_empty() {
481            return InferredDataType::Unknown;
482        }
483
484        // Check if all values are integers
485        let all_integers = values.iter().all(|&x| x.fract() == 0.0);
486
487        if all_integers {
488            let unique_values: std::collections::HashSet<i64> =
489                values.iter().map(|&x| x as i64).collect();
490
491            match unique_values.len() {
492                1 => InferredDataType::Unknown, // Constant
493                2 => InferredDataType::Binary,
494                3..=20 => InferredDataType::Categorical,
495                _ => InferredDataType::Numerical,
496            }
497        } else {
498            InferredDataType::Numerical
499        }
500    }
501
502    fn compute_correlation_matrix(&self, dataset: &Dataset) -> Result<Array2<f64>> {
503        let n_features = dataset.n_features();
504        let mut correlations = Array2::zeros((n_features, n_features));
505
506        for i in 0..n_features {
507            for j in 0..n_features {
508                if i == j {
509                    correlations[[i, j]] = 1.0;
510                } else {
511                    let col_i = dataset.data.column(i);
512                    let col_j = dataset.data.column(j);
513
514                    let corr = self.compute_correlation(&col_i, &col_j);
515                    correlations[[i, j]] = corr;
516                }
517            }
518        }
519
520        Ok(correlations)
521    }
522
523    fn compute_correlation(
524        &self,
525        x: &scirs2_core::ndarray::ArrayView1<f64>,
526        y: &scirs2_core::ndarray::ArrayView1<f64>,
527    ) -> f64 {
528        let x_vals: Vec<f64> = x.iter().copied().filter(|v| !v.is_nan()).collect();
529        let y_vals: Vec<f64> = y.iter().copied().filter(|v| !v.is_nan()).collect();
530
531        if x_vals.len() != y_vals.len() || x_vals.len() < 2 {
532            return 0.0;
533        }
534
535        let mean_x = x_vals.iter().sum::<f64>() / x_vals.len() as f64;
536        let mean_y = y_vals.iter().sum::<f64>() / y_vals.len() as f64;
537
538        let mut numerator = 0.0;
539        let mut sum_sq_x = 0.0;
540        let mut sum_sq_y = 0.0;
541
542        for (x_val, y_val) in x_vals.iter().zip(y_vals.iter()) {
543            let dx = x_val - mean_x;
544            let dy = y_val - mean_y;
545
546            numerator += dx * dy;
547            sum_sq_x += dx * dx;
548            sum_sq_y += dy * dy;
549        }
550
551        let denominator = (sum_sq_x * sum_sq_y).sqrt();
552
553        if denominator == 0.0 {
554            0.0
555        } else {
556            numerator / denominator
557        }
558    }
559
560    fn analyze_missingdata(&self, dataset: &Dataset) -> MissingDataAnalysis {
561        let n_samples = dataset.n_samples();
562        let n_features = dataset.n_features();
563        let total_values = n_samples * n_features;
564
565        let mut total_missing = 0;
566        let mut feature_missing = Vec::new();
567
568        for (i, column) in dataset.data.columns().into_iter().enumerate() {
569            let missing_count = column.iter().filter(|&&x| x.is_nan()).count();
570            total_missing += missing_count;
571
572            let featurename = dataset
573                .featurenames
574                .as_ref()
575                .and_then(|names| names.get(i))
576                .cloned()
577                .unwrap_or_else(|| format!("feature_{i}"));
578
579            let missing_percentage = missing_count as f64 / n_samples as f64 * 100.0;
580            feature_missing.push((featurename, missing_count, missing_percentage));
581        }
582
583        let missing_percentage = total_missing as f64 / total_values as f64 * 100.0;
584
585        // Analyze missing patterns (simplified)
586        let missing_patterns = self.analyze_missing_patterns(dataset);
587
588        MissingDataAnalysis {
589            total_missing,
590            missing_percentage,
591            feature_missing,
592            missing_patterns,
593        }
594    }
595
596    fn analyze_missing_patterns(&self, dataset: &Dataset) -> Vec<MissingPattern> {
597        let mut pattern_counts: HashMap<Vec<bool>, usize> = HashMap::new();
598
599        for row in dataset.data.rows() {
600            let pattern: Vec<bool> = row.iter().map(|&x| x.is_nan()).collect();
601            *pattern_counts.entry(pattern).or_insert(0) += 1;
602        }
603
604        let total_samples = dataset.n_samples() as f64;
605        let mut patterns: Vec<MissingPattern> = pattern_counts
606            .into_iter()
607            .map(|(pattern, count)| MissingPattern {
608                pattern,
609                count,
610                percentage: count as f64 / total_samples * 100.0,
611            })
612            .collect();
613
614        // Sort by frequency
615        patterns.sort_by(|a, b| b.count.cmp(&a.count));
616
617        // Keep only top 10 patterns
618        patterns.truncate(10);
619
620        patterns
621    }
622
623    fn analyze_target(&self, dataset: &Dataset) -> Result<Option<TargetAnalysis>> {
624        let target = match &dataset.target {
625            Some(target) => target,
626            None => return Ok(None),
627        };
628
629        let target_column = target.view();
630        let target_stats = self.compute_single_feature_stats("target", 0, &target_column)?;
631
632        // Compute class distribution for classification
633        let class_distribution = if matches!(
634            target_stats.data_type,
635            InferredDataType::Categorical | InferredDataType::Binary
636        ) {
637            let mut distribution = HashMap::new();
638            for &value in target.iter() {
639                if !value.is_nan() {
640                    let classname = format!("{value:.0}");
641                    *distribution.entry(classname).or_insert(0) += 1;
642                }
643            }
644            Some(distribution)
645        } else {
646            None
647        };
648
649        // Compute correlations with features
650        let mut correlations_with_features = Vec::new();
651        for (i, column) in dataset.data.columns().into_iter().enumerate() {
652            let featurename = dataset
653                .featurenames
654                .as_ref()
655                .and_then(|names| names.get(i))
656                .cloned()
657                .unwrap_or_else(|| format!("feature_{i}"));
658
659            let correlation = self.compute_correlation(&column, &target_column);
660            correlations_with_features.push((featurename, correlation));
661        }
662
663        // Sort by absolute correlation
664        correlations_with_features
665            .sort_by(|a, b| b.1.abs().partial_cmp(&a.1.abs()).expect("Operation failed"));
666
667        Ok(Some(TargetAnalysis {
668            target_stats,
669            class_distribution,
670            correlations_with_features,
671        }))
672    }
673
674    fn assess_quality(
675        &self,
676        _dataset: &Dataset,
677        statistics: &FeatureStatistics,
678        missingdata: &MissingDataAnalysis,
679    ) -> Result<QualityAssessment> {
680        let mut issues = Vec::new();
681        let mut quality_score = 100.0;
682
683        // Check missing _data
684        if missingdata.missing_percentage > 5.0 {
685            let severity = if missingdata.missing_percentage > 20.0 {
686                Severity::High
687            } else if missingdata.missing_percentage > 10.0 {
688                Severity::Medium
689            } else {
690                Severity::Low
691            };
692
693            issues.push(QualityIssue {
694                issue_type: IssueType::MissingData,
695                severity,
696                description: format!("{:.1}% of _data is missing", missingdata.missing_percentage),
697                affected_features: missingdata
698                    .feature_missing
699                    .iter()
700                    .filter(|(_, _, pct)| *pct > 5.0)
701                    .map(|(name, _, _)| name.clone())
702                    .collect(),
703            });
704
705            quality_score -= missingdata.missing_percentage.min(30.0);
706        }
707
708        // Check for low variance features
709        let low_variance_features: Vec<String> = statistics
710            .features
711            .iter()
712            .filter(|f| f.std.is_some_and(|std| std < 1e-6))
713            .map(|f| f.name.clone())
714            .collect();
715
716        if !low_variance_features.is_empty() {
717            issues.push(QualityIssue {
718                issue_type: IssueType::LowVariance,
719                severity: Severity::Medium,
720                description: format!(
721                    "{} features have very low variance",
722                    low_variance_features.len()
723                ),
724                affected_features: low_variance_features,
725            });
726
727            quality_score -= 10.0;
728        }
729
730        // Check for highly correlated features
731        if let Some(ref correlations) = statistics.correlations {
732            let mut high_corr_pairs = Vec::new();
733            for i in 0..correlations.nrows() {
734                for j in (i + 1)..correlations.ncols() {
735                    if correlations[[i, j]].abs() > 0.9 {
736                        let name_i = statistics.features[i].name.clone();
737                        let name_j = statistics.features[j].name.clone();
738                        high_corr_pairs.push(format!("{name_i} - {name_j}"));
739                    }
740                }
741            }
742
743            if !high_corr_pairs.is_empty() {
744                issues.push(QualityIssue {
745                    issue_type: IssueType::HighCorrelation,
746                    severity: Severity::Medium,
747                    description: format!(
748                        "{} highly correlated feature pairs found",
749                        high_corr_pairs.len()
750                    ),
751                    affected_features: high_corr_pairs,
752                });
753
754                quality_score -= 5.0;
755            }
756        }
757
758        let recommendations = self.generate_recommendations(&issues);
759
760        Ok(QualityAssessment {
761            quality_score: quality_score.max(0.0),
762            issues,
763            recommendations,
764        })
765    }
766
767    fn generate_recommendations(&self, issues: &[QualityIssue]) -> Vec<String> {
768        let mut recommendations = Vec::new();
769
770        for issue in issues {
771            match issue.issue_type {
772                IssueType::MissingData => {
773                    recommendations.push("Consider imputation strategies for missing data or remove features with excessive missing values".to_string());
774                }
775                IssueType::LowVariance => {
776                    recommendations.push(
777                        "Remove low variance features as they provide little information"
778                            .to_string(),
779                    );
780                }
781                IssueType::HighCorrelation => {
782                    recommendations.push("Consider removing redundant highly correlated features or use dimensionality reduction".to_string());
783                }
784                _ => {}
785            }
786        }
787
788        if recommendations.is_empty() {
789            recommendations.push("Dataset appears to be of good quality".to_string());
790        }
791
792        recommendations
793    }
794
795    // Display methods
796
797    fn display_table(&self, summary: &DatasetSummary) -> Result<()> {
798        self.display_basic_info(&summary.info);
799        self.display_statistics(&summary.statistics)?;
800        self.display_missingdata(&summary.missingdata)?;
801
802        if let Some(ref targetanalysis) = summary.targetanalysis {
803            self.display_targetanalysis(targetanalysis)?;
804        }
805
806        self.display_quality_assessment(&summary.quality_assessment)?;
807
808        Ok(())
809    }
810
811    fn display_basic_info(&self, info: &DatasetInfo) {
812        println!("📊 Dataset Overview");
813        println!("==================");
814        println!("Samples: {}", info.n_samples);
815        println!("Features: {}", info.n_features);
816        println!(
817            "Memory usage: {:.2} MB",
818            info.memory_usage as f64 / 1_048_576.0
819        );
820
821        if let Some(ref description) = info.description {
822            println!("Description: {description}");
823        }
824
825        println!();
826    }
827
828    fn display_statistics(&self, statistics: &FeatureStatistics) -> Result<()> {
829        println!("📈 Feature Statistics");
830        println!("====================");
831
832        // Display table header
833        println!(
834            "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
835            "Feature", "Type", "Mean", "Std", "Min", "Max", "Missing"
836        );
837        let separator = "-".repeat(80);
838        println!("{separator}");
839
840        for feature in &statistics.features {
841            let type_str = match feature.data_type {
842                InferredDataType::Numerical => "num",
843                InferredDataType::Categorical => "cat",
844                InferredDataType::Binary => "bin",
845                InferredDataType::Unknown => "unk",
846            };
847
848            println!(
849                "{:<15} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8}",
850                feature.name.chars().take(15).collect::<String>(),
851                type_str,
852                feature
853                    .mean
854                    .map(|x| format!("{x:.3}"))
855                    .unwrap_or_else(|| "-".to_string()),
856                feature
857                    .std
858                    .map(|x| format!("{x:.3}"))
859                    .unwrap_or_else(|| "-".to_string()),
860                feature
861                    .min
862                    .map(|x| format!("{x:.3}"))
863                    .unwrap_or_else(|| "-".to_string()),
864                feature
865                    .max
866                    .map(|x| format!("{x:.3}"))
867                    .unwrap_or_else(|| "-".to_string()),
868                feature.missing_count
869            );
870        }
871
872        println!();
873        Ok(())
874    }
875
876    fn display_missingdata(&self, missingdata: &MissingDataAnalysis) -> Result<()> {
877        println!("❌ Missing Data Analysis");
878        println!("========================");
879        println!(
880            "Total missing: {} ({:.2}%)",
881            missingdata.total_missing, missingdata.missing_percentage
882        );
883
884        if !missingdata.feature_missing.is_empty() {
885            println!("\nMissing by feature:");
886            for (feature, count, percentage) in &missingdata.feature_missing {
887                if *count > 0 {
888                    println!("  {feature}: {count} ({percentage:.1}%)");
889                }
890            }
891        }
892
893        println!();
894        Ok(())
895    }
896
897    fn display_targetanalysis(&self, targetanalysis: &TargetAnalysis) -> Result<()> {
898        println!("🎯 Target Analysis");
899        println!("==================");
900
901        let target = &targetanalysis.target_stats;
902        println!("Target type: {:?}", target.data_type);
903
904        if let Some(ref distribution) = targetanalysis.class_distribution {
905            println!("\nClass distribution:");
906            for (class, count) in distribution {
907                println!("  {class}: {count}");
908            }
909        }
910
911        println!("\nTop correlations with features:");
912        for (feature, correlation) in targetanalysis.correlations_with_features.iter().take(5) {
913            println!("  {feature}: {correlation:.3}");
914        }
915
916        println!();
917        Ok(())
918    }
919
920    fn display_quality_assessment(&self, quality: &QualityAssessment) -> Result<()> {
921        println!("✅ Quality Assessment");
922        println!("=====================");
923        println!("Quality score: {:.1}/100", quality.quality_score);
924
925        if !quality.issues.is_empty() {
926            println!("\nIssues found:");
927            for issue in &quality.issues {
928                let severity_icon = match issue.severity {
929                    Severity::Low => "⚠️",
930                    Severity::Medium => "🟡",
931                    Severity::High => "🟠",
932                    Severity::Critical => "🔴",
933                };
934                println!("  {} {}", severity_icon, issue.description);
935            }
936        }
937
938        println!("\nRecommendations:");
939        for recommendation in &quality.recommendations {
940            println!("  • {recommendation}");
941        }
942
943        println!();
944        Ok(())
945    }
946
947    fn display_json(&self, summary: &DatasetSummary) -> Result<()> {
948        let json = serde_json::to_string_pretty(summary)
949            .map_err(|e| DatasetsError::SerdeError(e.to_string()))?;
950        println!("{json}");
951        Ok(())
952    }
953
954    fn display_csv(&self, summary: &DatasetSummary) -> Result<()> {
955        // CSV format for feature statistics
956        println!("feature,type,count,mean,std,min,max,missing");
957        for feature in &summary.statistics.features {
958            println!(
959                "{},{:?},{},{},{},{},{},{}",
960                feature.name,
961                feature.data_type,
962                feature.count,
963                feature
964                    .mean
965                    .map(|x| x.to_string())
966                    .unwrap_or_else(|| "".to_string()),
967                feature
968                    .std
969                    .map(|x| x.to_string())
970                    .unwrap_or_else(|| "".to_string()),
971                feature
972                    .min
973                    .map(|x| x.to_string())
974                    .unwrap_or_else(|| "".to_string()),
975                feature
976                    .max
977                    .map(|x| x.to_string())
978                    .unwrap_or_else(|| "".to_string()),
979                feature.missing_count
980            );
981        }
982        Ok(())
983    }
984
985    fn display_markdown(&self, summary: &DatasetSummary) -> Result<()> {
986        println!("# Dataset Summary\n");
987
988        println!("## Overview\n");
989        println!("- **Samples**: {}", summary.info.n_samples);
990        println!("- **Features**: {}", summary.info.n_features);
991        println!(
992            "- **Memory usage**: {:.2} MB\n",
993            summary.info.memory_usage as f64 / 1_048_576.0
994        );
995
996        println!("## Feature Statistics\n");
997        println!("| Feature | Type | Mean | Std | Min | Max | Missing |");
998        println!("|---------|------|------|-----|-----|-----|---------|");
999
1000        for feature in &summary.statistics.features {
1001            println!(
1002                "| {} | {:?} | {} | {} | {} | {} | {} |",
1003                feature.name,
1004                feature.data_type,
1005                feature
1006                    .mean
1007                    .map(|x| format!("{x:.3}"))
1008                    .unwrap_or_else(|| "-".to_string()),
1009                feature
1010                    .std
1011                    .map(|x| format!("{x:.3}"))
1012                    .unwrap_or_else(|| "-".to_string()),
1013                feature
1014                    .min
1015                    .map(|x| format!("{x:.3}"))
1016                    .unwrap_or_else(|| "-".to_string()),
1017                feature
1018                    .max
1019                    .map(|x| format!("{x:.3}"))
1020                    .unwrap_or_else(|| "-".to_string()),
1021                feature.missing_count
1022            );
1023        }
1024
1025        println!(
1026            "\n## Quality Score: {:.1}/100\n",
1027            summary.quality_assessment.quality_score
1028        );
1029
1030        Ok(())
1031    }
1032
1033    fn interactive_feature_details(
1034        &self,
1035        dataset: &Dataset,
1036        statistics: &FeatureStatistics,
1037    ) -> Result<()> {
1038        println!("\nFeature Details");
1039        println!("===============");
1040
1041        for (i, feature) in statistics.features.iter().enumerate() {
1042            println!("{}. {}", i + 1, feature.name);
1043        }
1044
1045        print!("\nEnter feature number (or 'back'): ");
1046        io::stdout().flush().expect("Operation failed");
1047
1048        let mut input = String::new();
1049        io::stdin().read_line(&mut input).expect("Operation failed");
1050        let input = input.trim();
1051
1052        if input == "back" {
1053            return Ok(());
1054        }
1055
1056        if let Ok(index) = input.parse::<usize>() {
1057            if index > 0 && index <= statistics.features.len() {
1058                let feature = &statistics.features[index - 1];
1059                self.display_feature_detail(feature, dataset)?;
1060            } else {
1061                println!("Invalid feature number.");
1062            }
1063        } else {
1064            println!("Invalid input.");
1065        }
1066
1067        Ok(())
1068    }
1069
1070    fn display_feature_detail(&self, feature: &FeatureStats, _dataset: &Dataset) -> Result<()> {
1071        println!("\n📊 Feature: {}", feature.name);
1072        println!("==================");
1073        println!("Type: {:?}", feature.data_type);
1074        println!("Count: {}", feature.count);
1075        println!(
1076            "Missing: {} ({:.1}%)",
1077            feature.missing_count,
1078            feature.missing_count as f64 / feature.count as f64 * 100.0
1079        );
1080
1081        if let Some(mean) = feature.mean {
1082            println!("Mean: {mean:.6}");
1083        }
1084        if let Some(std) = feature.std {
1085            println!("Std: {std:.6}");
1086        }
1087        if let Some(min) = feature.min {
1088            println!("Min: {min:.6}");
1089        }
1090        if let Some(max) = feature.max {
1091            println!("Max: {max:.6}");
1092        }
1093        if let Some(median) = feature.median {
1094            println!("Median: {median:.6}");
1095        }
1096        if let Some(q25) = feature.q25 {
1097            println!("Q25: {q25:.6}");
1098        }
1099        if let Some(q75) = feature.q75 {
1100            println!("Q75: {q75:.6}");
1101        }
1102
1103        if let Some(ref unique_values) = feature.unique_values {
1104            println!("Unique values: {unique_values:?}");
1105        } else if let Some(unique_count) = feature.unique_count {
1106            println!("Unique count: {unique_count}");
1107        }
1108
1109        Ok(())
1110    }
1111
1112    fn export_summary(&self, summary: &DatasetSummary) -> Result<()> {
1113        print!("Export format (json/csv/markdown): ");
1114        io::stdout().flush().expect("Operation failed");
1115
1116        let mut input = String::new();
1117        io::stdin().read_line(&mut input).expect("Operation failed");
1118        let format = input.trim();
1119
1120        let filename = format!("dataset_summary.{format}");
1121
1122        let content = match format {
1123            "json" => serde_json::to_string_pretty(summary)
1124                .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
1125            "csv" => {
1126                let mut csv_content = String::from("feature,type,count,mean,std,min,max,missing\n");
1127                for feature in &summary.statistics.features {
1128                    csv_content.push_str(&format!(
1129                        "{},{:?},{},{},{},{},{},{}\n",
1130                        feature.name,
1131                        feature.data_type,
1132                        feature.count,
1133                        feature
1134                            .mean
1135                            .map(|x| x.to_string())
1136                            .unwrap_or_else(|| "".to_string()),
1137                        feature
1138                            .std
1139                            .map(|x| x.to_string())
1140                            .unwrap_or_else(|| "".to_string()),
1141                        feature
1142                            .min
1143                            .map(|x| x.to_string())
1144                            .unwrap_or_else(|| "".to_string()),
1145                        feature
1146                            .max
1147                            .map(|x| x.to_string())
1148                            .unwrap_or_else(|| "".to_string()),
1149                        feature.missing_count
1150                    ));
1151                }
1152                csv_content
1153            }
1154            "markdown" => {
1155                // Generate markdown content
1156                format!(
1157                    "# Dataset Summary\n\nQuality Score: {:.1}/100\n",
1158                    summary.quality_assessment.quality_score
1159                )
1160            }
1161            _ => {
1162                return Err(DatasetsError::InvalidFormat(
1163                    "Unsupported export format".to_string(),
1164                ))
1165            }
1166        };
1167
1168        std::fs::write(&filename, content).map_err(DatasetsError::IoError)?;
1169
1170        println!("Summary exported to: {filename}");
1171        Ok(())
1172    }
1173}
1174
1175/// Convenience functions for dataset exploration
1176pub mod convenience {
1177    use super::*;
1178
1179    /// Quick dataset summary with default configuration
1180    pub fn quick_summary(dataset: &Dataset) -> Result<DatasetSummary> {
1181        let explorer = DatasetExplorer::default_config();
1182        explorer.summarize(dataset)
1183    }
1184
1185    /// Display basic dataset information
1186    pub fn info(dataset: &Dataset) -> Result<()> {
1187        let explorer = DatasetExplorer::default_config();
1188        let summary = explorer.summarize(dataset)?;
1189        explorer.display_basic_info(&summary.info);
1190        Ok(())
1191    }
1192
1193    /// Start interactive exploration
1194    pub fn explore(dataset: &Dataset) -> Result<()> {
1195        let config = ExploreConfig {
1196            interactive: true,
1197            ..Default::default()
1198        };
1199
1200        let explorer = DatasetExplorer::new(config);
1201        explorer.interactive_explore(dataset)
1202    }
1203
1204    /// Export dataset summary to file
1205    pub fn export_summary(dataset: &Dataset, format: OutputFormat, filename: &str) -> Result<()> {
1206        let config = ExploreConfig {
1207            output_format: format,
1208            ..Default::default()
1209        };
1210        let output_format = config.output_format;
1211
1212        let explorer = DatasetExplorer::new(config);
1213        let summary = explorer.summarize(dataset)?;
1214
1215        let content = match output_format {
1216            OutputFormat::Json => serde_json::to_string_pretty(&summary)
1217                .map_err(|e| DatasetsError::SerdeError(e.to_string()))?,
1218            _ => {
1219                return Err(DatasetsError::InvalidFormat(
1220                    "Only JSON export is currently supported in convenience function".to_string(),
1221                ));
1222            }
1223        };
1224
1225        std::fs::write(filename, content).map_err(DatasetsError::IoError)?;
1226
1227        Ok(())
1228    }
1229}
1230
#[cfg(test)]
mod tests {
    use super::*;
    use crate::generators::make_classification;

    /// The default explorer uses precision 3 and detailed statistics.
    #[test]
    fn testdataset_explorer_creation() {
        let explorer = DatasetExplorer::default_config();
        let config = &explorer.config;

        assert_eq!(config.precision, 3);
        assert!(config.show_detailed_stats);
    }

    /// The quick summary reports the generated dataset's dimensions.
    #[test]
    fn test_basic_summary() {
        let dataset = make_classification(100, 5, 2, 1, 1, Some(42)).expect("Operation failed");
        let summary = convenience::quick_summary(&dataset).expect("Operation failed");

        let info = &summary.info;
        assert_eq!(info.n_samples, 100);
        assert_eq!(info.n_features, 5);
        assert_eq!(summary.statistics.features.len(), 5);
    }

    /// Every feature of a generated dataset gets mean, std, min and max.
    #[test]
    fn test_feature_statistics() {
        let dataset = make_classification(50, 3, 2, 1, 1, Some(42)).expect("Operation failed");
        let statistics = DatasetExplorer::default_config()
            .compute_feature_statistics(&dataset)
            .expect("Operation failed");

        assert_eq!(statistics.features.len(), 3);

        for stats in &statistics.features {
            let populated = [
                stats.mean.is_some(),
                stats.std.is_some(),
                stats.min.is_some(),
                stats.max.is_some(),
            ];
            assert!(populated.iter().all(|&present| present));
        }
    }

    /// Synthetic data should score above 80 on the quality scale.
    #[test]
    fn test_quality_assessment() {
        let dataset = make_classification(100, 4, 2, 1, 1, Some(42)).expect("Operation failed");
        let explorer = DatasetExplorer::default_config();
        let score = explorer
            .summarize(&dataset)
            .expect("Operation failed")
            .quality_assessment
            .quality_score;

        // Should have high quality score for synthetic data
        assert!(score > 80.0);
    }

    /// Type inference distinguishes numerical, binary and categorical columns.
    #[test]
    fn test_data_type_inference() {
        let explorer = DatasetExplorer::default_config();

        // Non-integer values
        let fractional = vec![1.1, 2.3, 3.7, 4.2];
        let inferred = explorer.infer_data_type(&fractional);
        assert!(matches!(inferred, InferredDataType::Numerical));

        // Only 0.0 and 1.0
        let zero_one = vec![0.0, 1.0, 0.0, 1.0];
        let inferred = explorer.infer_data_type(&zero_one);
        assert!(matches!(inferred, InferredDataType::Binary));

        // A few repeated whole numbers
        let repeated_levels = vec![1.0, 2.0, 3.0, 1.0, 2.0];
        let inferred = explorer.infer_data_type(&repeated_levels);
        assert!(matches!(inferred, InferredDataType::Categorical));
    }
}
scirs2_datasets/explore.rs

scirs2_datasets/
explore.rs