sklears_impute/
visualization.rs

1//! Missing data visualization utilities
2//!
3//! This module provides visualization tools for understanding missing data patterns,
4//! including heatmaps, pattern plots, and correlation visualizations.
5
6use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
7use sklears_core::{
8    error::{Result as SklResult, SklearsError},
9    types::Float,
10};
11use std::collections::HashMap;
12
13/// Missing data pattern visualization data
14#[derive(Debug, Clone)]
15pub struct MissingPatternPlot {
16    /// Matrix indicating missingness (1 = missing, 0 = observed)
17    pub missing_matrix: Array2<u8>,
18    /// Pattern counts for each unique pattern
19    pub pattern_counts: HashMap<Vec<u8>, usize>,
20    /// Feature names or indices
21    pub feature_names: Vec<String>,
22    /// Sample indices
23    pub sample_indices: Vec<usize>,
24}
25
26/// Missing data correlation heatmap data
27#[derive(Debug, Clone)]
28pub struct MissingCorrelationHeatmap {
29    /// Correlation matrix between missingness indicators
30    pub correlation_matrix: Array2<f64>,
31    /// Feature names or indices
32    pub feature_names: Vec<String>,
33    /// P-values for correlations (if computed)
34    pub p_values: Option<Array2<f64>>,
35}
36
37/// Missing data completeness matrix visualization data
38#[derive(Debug, Clone)]
39pub struct CompletenessMatrix {
40    /// Matrix showing joint observation rates between features
41    pub completeness_matrix: Array2<f64>,
42    /// Feature names or indices
43    pub feature_names: Vec<String>,
44}
45
46/// Missing data distribution plot data
47#[derive(Debug, Clone)]
48pub struct MissingDistributionPlot {
49    /// Count of missing values per feature
50    pub missing_counts: Array1<usize>,
51    /// Percentage of missing values per feature
52    pub missing_percentages: Array1<f64>,
53    /// Feature names or indices
54    pub feature_names: Vec<String>,
55}
56
57/// Generate missing data pattern visualization data
58///
59/// Creates a visualization matrix showing the pattern of missing data across samples and features.
60///
61/// # Parameters
62///
63/// * `X` - Input data matrix
64/// * `missing_values` - Value representing missing data (NaN by default)
65/// * `feature_names` - Optional feature names for labeling
66///
67/// # Returns
68///
69/// A `MissingPatternPlot` struct containing the visualization data
70///
71/// # Examples
72///
73/// ```
74/// use sklears_impute::visualization::create_missing_pattern_plot;
75/// use scirs2_core::ndarray::array;
76///
77/// let X = array![[1.0, 2.0, 3.0], [f64::NAN, 3.0, 4.0], [7.0, f64::NAN, 6.0]];
78/// let plot_data = create_missing_pattern_plot(&X.view(), f64::NAN, None).unwrap();
79/// ```
80#[allow(non_snake_case)]
81pub fn create_missing_pattern_plot(
82    X: &ArrayView2<'_, Float>,
83    missing_values: f64,
84    feature_names: Option<Vec<String>>,
85) -> SklResult<MissingPatternPlot> {
86    let X = X.mapv(|x| x);
87    let (n_samples, n_features) = X.dim();
88
89    // Generate feature names if not provided
90    let feature_names = feature_names
91        .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
92
93    if feature_names.len() != n_features {
94        return Err(SklearsError::InvalidInput(format!(
95            "Number of feature names {} does not match number of features {}",
96            feature_names.len(),
97            n_features
98        )));
99    }
100
101    // Create missing matrix
102    let mut missing_matrix = Array2::zeros((n_samples, n_features));
103    let mut pattern_counts: HashMap<Vec<u8>, usize> = HashMap::new();
104
105    for i in 0..n_samples {
106        let mut pattern = Vec::new();
107        for j in 0..n_features {
108            let is_missing = if missing_values.is_nan() {
109                X[[i, j]].is_nan()
110            } else {
111                (X[[i, j]] - missing_values).abs() < f64::EPSILON
112            };
113
114            let missing_indicator = if is_missing { 1u8 } else { 0u8 };
115            missing_matrix[[i, j]] = missing_indicator;
116            pattern.push(missing_indicator);
117        }
118
119        *pattern_counts.entry(pattern).or_insert(0) += 1;
120    }
121
122    let sample_indices: Vec<usize> = (0..n_samples).collect();
123
124    Ok(MissingPatternPlot {
125        missing_matrix,
126        pattern_counts,
127        feature_names,
128        sample_indices,
129    })
130}
131
132/// Generate missing data correlation heatmap data
133///
134/// Creates correlation matrix between missingness indicators of different features.
135///
136/// # Parameters
137///
138/// * `X` - Input data matrix
139/// * `missing_values` - Value representing missing data (NaN by default)
140/// * `feature_names` - Optional feature names for labeling
141/// * `compute_p_values` - Whether to compute p-values for correlations
142///
143/// # Returns
144///
145/// A `MissingCorrelationHeatmap` struct containing the correlation data
146#[allow(non_snake_case)]
147pub fn create_missing_correlation_heatmap(
148    X: &ArrayView2<'_, Float>,
149    missing_values: f64,
150    feature_names: Option<Vec<String>>,
151    compute_p_values: bool,
152) -> SklResult<MissingCorrelationHeatmap> {
153    let X = X.mapv(|x| x);
154    let (n_samples, n_features) = X.dim();
155
156    // Generate feature names if not provided
157    let feature_names = feature_names
158        .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
159
160    if feature_names.len() != n_features {
161        return Err(SklearsError::InvalidInput(format!(
162            "Number of feature names {} does not match number of features {}",
163            feature_names.len(),
164            n_features
165        )));
166    }
167
168    // Create missingness indicators
169    let mut missing_indicators = Array2::zeros((n_samples, n_features));
170    for i in 0..n_samples {
171        for j in 0..n_features {
172            let is_missing = if missing_values.is_nan() {
173                X[[i, j]].is_nan()
174            } else {
175                (X[[i, j]] - missing_values).abs() < f64::EPSILON
176            };
177            missing_indicators[[i, j]] = if is_missing { 1.0 } else { 0.0 };
178        }
179    }
180
181    // Compute correlation matrix
182    let mut correlation_matrix = Array2::zeros((n_features, n_features));
183    let mut p_values = if compute_p_values {
184        Some(Array2::zeros((n_features, n_features)))
185    } else {
186        None
187    };
188
189    for i in 0..n_features {
190        for j in 0..n_features {
191            if i == j {
192                correlation_matrix[[i, j]] = 1.0;
193                if let Some(ref mut p_vals) = p_values {
194                    p_vals[[i, j]] = 0.0;
195                }
196            } else {
197                let col_i = missing_indicators.column(i);
198                let col_j = missing_indicators.column(j);
199
200                let correlation = compute_correlation(&col_i.to_owned(), &col_j.to_owned());
201                correlation_matrix[[i, j]] = correlation;
202
203                if let Some(ref mut p_vals) = p_values {
204                    let p_value = compute_correlation_p_value(
205                        &col_i.to_owned(),
206                        &col_j.to_owned(),
207                        correlation,
208                    );
209                    p_vals[[i, j]] = p_value;
210                }
211            }
212        }
213    }
214
215    Ok(MissingCorrelationHeatmap {
216        correlation_matrix,
217        feature_names,
218        p_values,
219    })
220}
221
222/// Generate completeness matrix visualization data
223///
224/// Creates a matrix showing the joint observation rates between all pairs of features.
225///
226/// # Parameters
227///
228/// * `X` - Input data matrix
229/// * `missing_values` - Value representing missing data (NaN by default)
230/// * `feature_names` - Optional feature names for labeling
231///
232/// # Returns
233///
234/// A `CompletenessMatrix` struct containing the completeness data
235#[allow(non_snake_case)]
236pub fn create_completeness_matrix(
237    X: &ArrayView2<'_, Float>,
238    missing_values: f64,
239    feature_names: Option<Vec<String>>,
240) -> SklResult<CompletenessMatrix> {
241    let X = X.mapv(|x| x);
242    let (n_samples, n_features) = X.dim();
243
244    // Generate feature names if not provided
245    let feature_names = feature_names
246        .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
247
248    if feature_names.len() != n_features {
249        return Err(SklearsError::InvalidInput(format!(
250            "Number of feature names {} does not match number of features {}",
251            feature_names.len(),
252            n_features
253        )));
254    }
255
256    let mut completeness_matrix = Array2::zeros((n_features, n_features));
257
258    for i in 0..n_features {
259        for j in 0..n_features {
260            let mut joint_observed = 0;
261
262            for sample_idx in 0..n_samples {
263                let i_observed = if missing_values.is_nan() {
264                    !X[[sample_idx, i]].is_nan()
265                } else {
266                    (X[[sample_idx, i]] - missing_values).abs() >= f64::EPSILON
267                };
268
269                let j_observed = if missing_values.is_nan() {
270                    !X[[sample_idx, j]].is_nan()
271                } else {
272                    (X[[sample_idx, j]] - missing_values).abs() >= f64::EPSILON
273                };
274
275                if i_observed && j_observed {
276                    joint_observed += 1;
277                }
278            }
279
280            completeness_matrix[[i, j]] = joint_observed as f64 / n_samples as f64;
281        }
282    }
283
284    Ok(CompletenessMatrix {
285        completeness_matrix,
286        feature_names,
287    })
288}
289
290/// Generate missing data distribution plot data
291///
292/// Creates data for plotting the distribution of missing values across features.
293///
294/// # Parameters
295///
296/// * `X` - Input data matrix
297/// * `missing_values` - Value representing missing data (NaN by default)
298/// * `feature_names` - Optional feature names for labeling
299///
300/// # Returns
301///
302/// A `MissingDistributionPlot` struct containing the distribution data
303#[allow(non_snake_case)]
304pub fn create_missing_distribution_plot(
305    X: &ArrayView2<'_, Float>,
306    missing_values: f64,
307    feature_names: Option<Vec<String>>,
308) -> SklResult<MissingDistributionPlot> {
309    let X = X.mapv(|x| x);
310    let (n_samples, n_features) = X.dim();
311
312    // Generate feature names if not provided
313    let feature_names = feature_names
314        .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
315
316    if feature_names.len() != n_features {
317        return Err(SklearsError::InvalidInput(format!(
318            "Number of feature names {} does not match number of features {}",
319            feature_names.len(),
320            n_features
321        )));
322    }
323
324    let mut missing_counts = Array1::zeros(n_features);
325    let mut missing_percentages = Array1::zeros(n_features);
326
327    for j in 0..n_features {
328        let mut count = 0;
329        for i in 0..n_samples {
330            let is_missing = if missing_values.is_nan() {
331                X[[i, j]].is_nan()
332            } else {
333                (X[[i, j]] - missing_values).abs() < f64::EPSILON
334            };
335
336            if is_missing {
337                count += 1;
338            }
339        }
340
341        missing_counts[j] = count;
342        missing_percentages[j] = (count as f64 / n_samples as f64) * 100.0;
343    }
344
345    Ok(MissingDistributionPlot {
346        missing_counts,
347        missing_percentages,
348        feature_names,
349    })
350}
351
352/// Export missing pattern data to CSV format
353///
354/// Exports the missing pattern matrix to a CSV-like string format for external visualization.
355///
356/// # Parameters
357///
358/// * `plot_data` - Missing pattern plot data
359/// * `include_headers` - Whether to include feature names as headers
360///
361/// # Returns
362///
363/// A string in CSV format representing the missing pattern matrix
364pub fn export_missing_pattern_csv(plot_data: &MissingPatternPlot, include_headers: bool) -> String {
365    let mut csv_lines = Vec::new();
366
367    if include_headers {
368        let header = format!("Sample,{}", plot_data.feature_names.join(","));
369        csv_lines.push(header);
370    }
371
372    for (i, sample_idx) in plot_data.sample_indices.iter().enumerate() {
373        let mut row = vec![sample_idx.to_string()];
374        for j in 0..plot_data.missing_matrix.ncols() {
375            row.push(plot_data.missing_matrix[[i, j]].to_string());
376        }
377        csv_lines.push(row.join(","));
378    }
379
380    csv_lines.join("\n")
381}
382
383/// Export correlation heatmap data to CSV format
384///
385/// Exports the correlation matrix to a CSV-like string format for external visualization.
386///
387/// # Parameters
388///
389/// * `heatmap_data` - Missing correlation heatmap data
390/// * `include_headers` - Whether to include feature names as headers
391///
392/// # Returns
393///
394/// A string in CSV format representing the correlation matrix
395pub fn export_correlation_csv(
396    heatmap_data: &MissingCorrelationHeatmap,
397    include_headers: bool,
398) -> String {
399    let mut csv_lines = Vec::new();
400
401    if include_headers {
402        let header = format!("Feature,{}", heatmap_data.feature_names.join(","));
403        csv_lines.push(header);
404    }
405
406    for (i, feature_name) in heatmap_data.feature_names.iter().enumerate() {
407        let mut row = vec![feature_name.clone()];
408        for j in 0..heatmap_data.correlation_matrix.ncols() {
409            row.push(format!("{:.4}", heatmap_data.correlation_matrix[[i, j]]));
410        }
411        csv_lines.push(row.join(","));
412    }
413
414    csv_lines.join("\n")
415}
416
417/// Generate summary statistics for missing data patterns
418///
419/// Provides comprehensive summary statistics about missing data patterns in the dataset.
420///
421/// # Parameters
422///
423/// * `X` - Input data matrix
424/// * `missing_values` - Value representing missing data (NaN by default)
425///
426/// # Returns
427///
428/// A string containing formatted summary statistics
429#[allow(non_snake_case)]
430pub fn generate_missing_summary_stats(
431    X: &ArrayView2<'_, Float>,
432    missing_values: f64,
433) -> SklResult<String> {
434    let X = X.mapv(|x| x);
435    let (n_samples, n_features) = X.dim();
436
437    let mut total_missing = 0;
438    let mut feature_missing_counts = vec![0; n_features];
439    let mut sample_missing_counts = vec![0; n_samples];
440
441    for i in 0..n_samples {
442        for j in 0..n_features {
443            let is_missing = if missing_values.is_nan() {
444                X[[i, j]].is_nan()
445            } else {
446                (X[[i, j]] - missing_values).abs() < f64::EPSILON
447            };
448
449            if is_missing {
450                total_missing += 1;
451                feature_missing_counts[j] += 1;
452                sample_missing_counts[i] += 1;
453            }
454        }
455    }
456
457    let total_cells = n_samples * n_features;
458    let overall_missing_rate = (total_missing as f64 / total_cells as f64) * 100.0;
459
460    let features_with_missing = feature_missing_counts
461        .iter()
462        .filter(|&&count| count > 0)
463        .count();
464    let samples_with_missing = sample_missing_counts
465        .iter()
466        .filter(|&&count| count > 0)
467        .count();
468
469    let max_feature_missing = *feature_missing_counts.iter().max().unwrap_or(&0);
470    let max_sample_missing = *sample_missing_counts.iter().max().unwrap_or(&0);
471
472    let completely_missing_features = feature_missing_counts
473        .iter()
474        .filter(|&&count| count == n_samples)
475        .count();
476    let completely_observed_samples = sample_missing_counts
477        .iter()
478        .filter(|&&count| count == 0)
479        .count();
480
481    let summary = format!(
482        "Missing Data Summary Statistics\n\
483         ===============================\n\
484         Dataset dimensions: {} samples × {} features\n\
485         Total cells: {}\n\
486         Missing cells: {} ({:.2}%)\n\
487         Observed cells: {} ({:.2}%)\n\
488         \n\
489         Feature-wise statistics:\n\
490         - Features with missing values: {} / {} ({:.1}%)\n\
491         - Completely missing features: {}\n\
492         - Maximum missing values in a feature: {} / {} ({:.1}%)\n\
493         \n\
494         Sample-wise statistics:\n\
495         - Samples with missing values: {} / {} ({:.1}%)\n\
496         - Completely observed samples: {} ({:.1}%)\n\
497         - Maximum missing values in a sample: {} / {} ({:.1}%)\n",
498        n_samples,
499        n_features,
500        total_cells,
501        total_missing,
502        overall_missing_rate,
503        total_cells - total_missing,
504        100.0 - overall_missing_rate,
505        features_with_missing,
506        n_features,
507        (features_with_missing as f64 / n_features as f64) * 100.0,
508        completely_missing_features,
509        max_feature_missing,
510        n_samples,
511        (max_feature_missing as f64 / n_samples as f64) * 100.0,
512        samples_with_missing,
513        n_samples,
514        (samples_with_missing as f64 / n_samples as f64) * 100.0,
515        completely_observed_samples,
516        (completely_observed_samples as f64 / n_samples as f64) * 100.0,
517        max_sample_missing,
518        n_features,
519        (max_sample_missing as f64 / n_features as f64) * 100.0
520    );
521
522    Ok(summary)
523}
524
525// Helper functions
526
527fn compute_correlation(x: &Array1<f64>, y: &Array1<f64>) -> f64 {
528    let n = x.len() as f64;
529    if n == 0.0 {
530        return 0.0;
531    }
532
533    let mean_x = x.sum() / n;
534    let mean_y = y.sum() / n;
535
536    let mut numerator = 0.0;
537    let mut var_x = 0.0;
538    let mut var_y = 0.0;
539
540    for i in 0..x.len() {
541        let dx = x[i] - mean_x;
542        let dy = y[i] - mean_y;
543
544        numerator += dx * dy;
545        var_x += dx * dx;
546        var_y += dy * dy;
547    }
548
549    let denominator = (var_x * var_y).sqrt();
550    if denominator == 0.0 {
551        0.0
552    } else {
553        numerator / denominator
554    }
555}
556
557fn compute_correlation_p_value(x: &Array1<f64>, _y: &Array1<f64>, correlation: f64) -> f64 {
558    let n = x.len() as f64;
559    if n <= 2.0 {
560        return 1.0;
561    }
562
563    // Approximate p-value using t-distribution
564    // t = r * sqrt((n-2)/(1-r^2))
565    let r_squared = correlation * correlation;
566    if r_squared >= 1.0 {
567        return 0.0;
568    }
569
570    let t_stat = correlation * ((n - 2.0) / (1.0 - r_squared)).sqrt();
571
572    // Simplified p-value approximation (two-tailed test)
573    // For a more accurate calculation, you would use the t-distribution CDF
574    let _df = n - 2.0;
575
576    // Very crude approximation - in practice, you'd want to use a proper statistical library
577    if t_stat.abs() > 2.0 {
578        0.05
579    } else if t_stat.abs() > 1.0 {
580        0.1
581    } else {
582        0.5
583    }
584}