1use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
7use sklears_core::{
8 error::{Result as SklResult, SklearsError},
9 types::Float,
10};
11use std::collections::HashMap;
12
13#[derive(Debug, Clone)]
15pub struct MissingPatternPlot {
16 pub missing_matrix: Array2<u8>,
18 pub pattern_counts: HashMap<Vec<u8>, usize>,
20 pub feature_names: Vec<String>,
22 pub sample_indices: Vec<usize>,
24}
25
26#[derive(Debug, Clone)]
28pub struct MissingCorrelationHeatmap {
29 pub correlation_matrix: Array2<f64>,
31 pub feature_names: Vec<String>,
33 pub p_values: Option<Array2<f64>>,
35}
36
37#[derive(Debug, Clone)]
39pub struct CompletenessMatrix {
40 pub completeness_matrix: Array2<f64>,
42 pub feature_names: Vec<String>,
44}
45
46#[derive(Debug, Clone)]
48pub struct MissingDistributionPlot {
49 pub missing_counts: Array1<usize>,
51 pub missing_percentages: Array1<f64>,
53 pub feature_names: Vec<String>,
55}
56
57#[allow(non_snake_case)]
81pub fn create_missing_pattern_plot(
82 X: &ArrayView2<'_, Float>,
83 missing_values: f64,
84 feature_names: Option<Vec<String>>,
85) -> SklResult<MissingPatternPlot> {
86 let X = X.mapv(|x| x);
87 let (n_samples, n_features) = X.dim();
88
89 let feature_names = feature_names
91 .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
92
93 if feature_names.len() != n_features {
94 return Err(SklearsError::InvalidInput(format!(
95 "Number of feature names {} does not match number of features {}",
96 feature_names.len(),
97 n_features
98 )));
99 }
100
101 let mut missing_matrix = Array2::zeros((n_samples, n_features));
103 let mut pattern_counts: HashMap<Vec<u8>, usize> = HashMap::new();
104
105 for i in 0..n_samples {
106 let mut pattern = Vec::new();
107 for j in 0..n_features {
108 let is_missing = if missing_values.is_nan() {
109 X[[i, j]].is_nan()
110 } else {
111 (X[[i, j]] - missing_values).abs() < f64::EPSILON
112 };
113
114 let missing_indicator = if is_missing { 1u8 } else { 0u8 };
115 missing_matrix[[i, j]] = missing_indicator;
116 pattern.push(missing_indicator);
117 }
118
119 *pattern_counts.entry(pattern).or_insert(0) += 1;
120 }
121
122 let sample_indices: Vec<usize> = (0..n_samples).collect();
123
124 Ok(MissingPatternPlot {
125 missing_matrix,
126 pattern_counts,
127 feature_names,
128 sample_indices,
129 })
130}
131
132#[allow(non_snake_case)]
147pub fn create_missing_correlation_heatmap(
148 X: &ArrayView2<'_, Float>,
149 missing_values: f64,
150 feature_names: Option<Vec<String>>,
151 compute_p_values: bool,
152) -> SklResult<MissingCorrelationHeatmap> {
153 let X = X.mapv(|x| x);
154 let (n_samples, n_features) = X.dim();
155
156 let feature_names = feature_names
158 .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
159
160 if feature_names.len() != n_features {
161 return Err(SklearsError::InvalidInput(format!(
162 "Number of feature names {} does not match number of features {}",
163 feature_names.len(),
164 n_features
165 )));
166 }
167
168 let mut missing_indicators = Array2::zeros((n_samples, n_features));
170 for i in 0..n_samples {
171 for j in 0..n_features {
172 let is_missing = if missing_values.is_nan() {
173 X[[i, j]].is_nan()
174 } else {
175 (X[[i, j]] - missing_values).abs() < f64::EPSILON
176 };
177 missing_indicators[[i, j]] = if is_missing { 1.0 } else { 0.0 };
178 }
179 }
180
181 let mut correlation_matrix = Array2::zeros((n_features, n_features));
183 let mut p_values = if compute_p_values {
184 Some(Array2::zeros((n_features, n_features)))
185 } else {
186 None
187 };
188
189 for i in 0..n_features {
190 for j in 0..n_features {
191 if i == j {
192 correlation_matrix[[i, j]] = 1.0;
193 if let Some(ref mut p_vals) = p_values {
194 p_vals[[i, j]] = 0.0;
195 }
196 } else {
197 let col_i = missing_indicators.column(i);
198 let col_j = missing_indicators.column(j);
199
200 let correlation = compute_correlation(&col_i.to_owned(), &col_j.to_owned());
201 correlation_matrix[[i, j]] = correlation;
202
203 if let Some(ref mut p_vals) = p_values {
204 let p_value = compute_correlation_p_value(
205 &col_i.to_owned(),
206 &col_j.to_owned(),
207 correlation,
208 );
209 p_vals[[i, j]] = p_value;
210 }
211 }
212 }
213 }
214
215 Ok(MissingCorrelationHeatmap {
216 correlation_matrix,
217 feature_names,
218 p_values,
219 })
220}
221
222#[allow(non_snake_case)]
236pub fn create_completeness_matrix(
237 X: &ArrayView2<'_, Float>,
238 missing_values: f64,
239 feature_names: Option<Vec<String>>,
240) -> SklResult<CompletenessMatrix> {
241 let X = X.mapv(|x| x);
242 let (n_samples, n_features) = X.dim();
243
244 let feature_names = feature_names
246 .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
247
248 if feature_names.len() != n_features {
249 return Err(SklearsError::InvalidInput(format!(
250 "Number of feature names {} does not match number of features {}",
251 feature_names.len(),
252 n_features
253 )));
254 }
255
256 let mut completeness_matrix = Array2::zeros((n_features, n_features));
257
258 for i in 0..n_features {
259 for j in 0..n_features {
260 let mut joint_observed = 0;
261
262 for sample_idx in 0..n_samples {
263 let i_observed = if missing_values.is_nan() {
264 !X[[sample_idx, i]].is_nan()
265 } else {
266 (X[[sample_idx, i]] - missing_values).abs() >= f64::EPSILON
267 };
268
269 let j_observed = if missing_values.is_nan() {
270 !X[[sample_idx, j]].is_nan()
271 } else {
272 (X[[sample_idx, j]] - missing_values).abs() >= f64::EPSILON
273 };
274
275 if i_observed && j_observed {
276 joint_observed += 1;
277 }
278 }
279
280 completeness_matrix[[i, j]] = joint_observed as f64 / n_samples as f64;
281 }
282 }
283
284 Ok(CompletenessMatrix {
285 completeness_matrix,
286 feature_names,
287 })
288}
289
290#[allow(non_snake_case)]
304pub fn create_missing_distribution_plot(
305 X: &ArrayView2<'_, Float>,
306 missing_values: f64,
307 feature_names: Option<Vec<String>>,
308) -> SklResult<MissingDistributionPlot> {
309 let X = X.mapv(|x| x);
310 let (n_samples, n_features) = X.dim();
311
312 let feature_names = feature_names
314 .unwrap_or_else(|| (0..n_features).map(|i| format!("Feature_{}", i)).collect());
315
316 if feature_names.len() != n_features {
317 return Err(SklearsError::InvalidInput(format!(
318 "Number of feature names {} does not match number of features {}",
319 feature_names.len(),
320 n_features
321 )));
322 }
323
324 let mut missing_counts = Array1::zeros(n_features);
325 let mut missing_percentages = Array1::zeros(n_features);
326
327 for j in 0..n_features {
328 let mut count = 0;
329 for i in 0..n_samples {
330 let is_missing = if missing_values.is_nan() {
331 X[[i, j]].is_nan()
332 } else {
333 (X[[i, j]] - missing_values).abs() < f64::EPSILON
334 };
335
336 if is_missing {
337 count += 1;
338 }
339 }
340
341 missing_counts[j] = count;
342 missing_percentages[j] = (count as f64 / n_samples as f64) * 100.0;
343 }
344
345 Ok(MissingDistributionPlot {
346 missing_counts,
347 missing_percentages,
348 feature_names,
349 })
350}
351
352pub fn export_missing_pattern_csv(plot_data: &MissingPatternPlot, include_headers: bool) -> String {
365 let mut csv_lines = Vec::new();
366
367 if include_headers {
368 let header = format!("Sample,{}", plot_data.feature_names.join(","));
369 csv_lines.push(header);
370 }
371
372 for (i, sample_idx) in plot_data.sample_indices.iter().enumerate() {
373 let mut row = vec![sample_idx.to_string()];
374 for j in 0..plot_data.missing_matrix.ncols() {
375 row.push(plot_data.missing_matrix[[i, j]].to_string());
376 }
377 csv_lines.push(row.join(","));
378 }
379
380 csv_lines.join("\n")
381}
382
383pub fn export_correlation_csv(
396 heatmap_data: &MissingCorrelationHeatmap,
397 include_headers: bool,
398) -> String {
399 let mut csv_lines = Vec::new();
400
401 if include_headers {
402 let header = format!("Feature,{}", heatmap_data.feature_names.join(","));
403 csv_lines.push(header);
404 }
405
406 for (i, feature_name) in heatmap_data.feature_names.iter().enumerate() {
407 let mut row = vec![feature_name.clone()];
408 for j in 0..heatmap_data.correlation_matrix.ncols() {
409 row.push(format!("{:.4}", heatmap_data.correlation_matrix[[i, j]]));
410 }
411 csv_lines.push(row.join(","));
412 }
413
414 csv_lines.join("\n")
415}
416
417#[allow(non_snake_case)]
430pub fn generate_missing_summary_stats(
431 X: &ArrayView2<'_, Float>,
432 missing_values: f64,
433) -> SklResult<String> {
434 let X = X.mapv(|x| x);
435 let (n_samples, n_features) = X.dim();
436
437 let mut total_missing = 0;
438 let mut feature_missing_counts = vec![0; n_features];
439 let mut sample_missing_counts = vec![0; n_samples];
440
441 for i in 0..n_samples {
442 for j in 0..n_features {
443 let is_missing = if missing_values.is_nan() {
444 X[[i, j]].is_nan()
445 } else {
446 (X[[i, j]] - missing_values).abs() < f64::EPSILON
447 };
448
449 if is_missing {
450 total_missing += 1;
451 feature_missing_counts[j] += 1;
452 sample_missing_counts[i] += 1;
453 }
454 }
455 }
456
457 let total_cells = n_samples * n_features;
458 let overall_missing_rate = (total_missing as f64 / total_cells as f64) * 100.0;
459
460 let features_with_missing = feature_missing_counts
461 .iter()
462 .filter(|&&count| count > 0)
463 .count();
464 let samples_with_missing = sample_missing_counts
465 .iter()
466 .filter(|&&count| count > 0)
467 .count();
468
469 let max_feature_missing = *feature_missing_counts.iter().max().unwrap_or(&0);
470 let max_sample_missing = *sample_missing_counts.iter().max().unwrap_or(&0);
471
472 let completely_missing_features = feature_missing_counts
473 .iter()
474 .filter(|&&count| count == n_samples)
475 .count();
476 let completely_observed_samples = sample_missing_counts
477 .iter()
478 .filter(|&&count| count == 0)
479 .count();
480
481 let summary = format!(
482 "Missing Data Summary Statistics\n\
483 ===============================\n\
484 Dataset dimensions: {} samples × {} features\n\
485 Total cells: {}\n\
486 Missing cells: {} ({:.2}%)\n\
487 Observed cells: {} ({:.2}%)\n\
488 \n\
489 Feature-wise statistics:\n\
490 - Features with missing values: {} / {} ({:.1}%)\n\
491 - Completely missing features: {}\n\
492 - Maximum missing values in a feature: {} / {} ({:.1}%)\n\
493 \n\
494 Sample-wise statistics:\n\
495 - Samples with missing values: {} / {} ({:.1}%)\n\
496 - Completely observed samples: {} ({:.1}%)\n\
497 - Maximum missing values in a sample: {} / {} ({:.1}%)\n",
498 n_samples,
499 n_features,
500 total_cells,
501 total_missing,
502 overall_missing_rate,
503 total_cells - total_missing,
504 100.0 - overall_missing_rate,
505 features_with_missing,
506 n_features,
507 (features_with_missing as f64 / n_features as f64) * 100.0,
508 completely_missing_features,
509 max_feature_missing,
510 n_samples,
511 (max_feature_missing as f64 / n_samples as f64) * 100.0,
512 samples_with_missing,
513 n_samples,
514 (samples_with_missing as f64 / n_samples as f64) * 100.0,
515 completely_observed_samples,
516 (completely_observed_samples as f64 / n_samples as f64) * 100.0,
517 max_sample_missing,
518 n_features,
519 (max_sample_missing as f64 / n_features as f64) * 100.0
520 );
521
522 Ok(summary)
523}
524
525fn compute_correlation(x: &Array1<f64>, y: &Array1<f64>) -> f64 {
528 let n = x.len() as f64;
529 if n == 0.0 {
530 return 0.0;
531 }
532
533 let mean_x = x.sum() / n;
534 let mean_y = y.sum() / n;
535
536 let mut numerator = 0.0;
537 let mut var_x = 0.0;
538 let mut var_y = 0.0;
539
540 for i in 0..x.len() {
541 let dx = x[i] - mean_x;
542 let dy = y[i] - mean_y;
543
544 numerator += dx * dy;
545 var_x += dx * dx;
546 var_y += dy * dy;
547 }
548
549 let denominator = (var_x * var_y).sqrt();
550 if denominator == 0.0 {
551 0.0
552 } else {
553 numerator / denominator
554 }
555}
556
557fn compute_correlation_p_value(x: &Array1<f64>, _y: &Array1<f64>, correlation: f64) -> f64 {
558 let n = x.len() as f64;
559 if n <= 2.0 {
560 return 1.0;
561 }
562
563 let r_squared = correlation * correlation;
566 if r_squared >= 1.0 {
567 return 0.0;
568 }
569
570 let t_stat = correlation * ((n - 2.0) / (1.0 - r_squared)).sqrt();
571
572 let _df = n - 2.0;
575
576 if t_stat.abs() > 2.0 {
578 0.05
579 } else if t_stat.abs() > 1.0 {
580 0.1
581 } else {
582 0.5
583 }
584}