sklears_dummy/comparative_analysis.rs

//! Comparative analysis tools for benchmarking and evaluation
//!
//! This module provides statistical tools for comparing different baseline models
//! and machine learning algorithms. It includes:
//!
//! - Statistical significance testing (t-tests, Wilcoxon rank-sum, permutation and bootstrap tests)
//! - Effect size computation (Cohen's d, Hedges' g, Cliff's delta, and others)
//! - Confidence interval comparisons
//! - Bayesian comparison methods
//! - Performance reporting utilities
//!
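//! # Example
//!
//! A minimal usage sketch (marked `ignore` since item paths depend on how the
//! crate re-exports this module):
//!
//! ```ignore
//! use scirs2_core::ndarray::array;
//!
//! let analyzer = ComparativeAnalyzer::new()
//!     .with_alpha(0.05)
//!     .with_correction(MultipleComparisonCorrection::Holm);
//!
//! let scores_a = array![0.80, 0.82, 0.78, 0.81, 0.79];
//! let scores_b = array![0.75, 0.77, 0.73, 0.76, 0.74];
//!
//! let result = analyzer
//!     .significance_test(
//!         &scores_a,
//!         &scores_b,
//!         SignificanceTest::WelchTTest,
//!         EffectSizeMeasure::HedgesG,
//!     )
//!     .unwrap();
//! println!("p = {:.4}, significant: {}", result.p_value, result.is_significant);
//! ```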
use scirs2_core::ndarray::Array1;
use scirs2_core::random::{Rng, SeedableRng};
use sklears_core::error::SklearsError;
use std::collections::HashMap;

/// Statistical significance test types
#[derive(Debug, Clone, Copy)]
pub enum SignificanceTest {
    /// Two-sample t-test (assumes normality)
    TTest,
    /// Welch's t-test (unequal variances)
    WelchTTest,
    /// Wilcoxon rank-sum test (non-parametric)
    WilcoxonRankSum,
    /// Mann-Whitney U test (equivalent to Wilcoxon rank-sum)
    MannWhitneyU,
    /// Permutation test (non-parametric)
    PermutationTest { n_permutations: usize },
    /// Bootstrap test
    BootstrapTest { n_bootstrap: usize },
}

/// Effect size measures
#[derive(Debug, Clone, Copy)]
pub enum EffectSizeMeasure {
    /// Cohen's d (standardized mean difference)
    CohensD,
    /// Glass's delta (uses control group standard deviation)
    GlassDelta,
    /// Hedges' g (bias-corrected Cohen's d)
    HedgesG,
    /// Cliff's delta (non-parametric effect size)
    CliffsDelta,
    /// Common language effect size
    CommonLanguageEffect,
    /// Probability of superiority
    ProbabilityOfSuperiority,
}

/// Confidence interval types
#[derive(Debug, Clone, Copy)]
pub enum ConfidenceIntervalType {
    /// Normal approximation
    Normal,
    /// Bootstrap percentile method
    BootstrapPercentile { n_bootstrap: usize },
    /// Bootstrap bias-corrected and accelerated (BCa)
    BootstrapBCa { n_bootstrap: usize },
    /// Student's t-distribution
    TDistribution,
}

/// Results of a statistical significance test
#[derive(Debug, Clone)]
pub struct SignificanceTestResult {
    /// Test that was performed
    pub test_type: SignificanceTest,
    /// Value of the test statistic
    pub statistic: f64,
    /// Two-tailed p-value
    pub p_value: f64,
    /// Effect size accompanying the test
    pub effect_size: f64,
    /// Measure used to compute `effect_size`
    pub effect_size_measure: EffectSizeMeasure,
    /// Confidence interval for the mean difference
    pub confidence_interval: (f64, f64),
    /// Sample sizes of the two groups
    pub sample_sizes: (usize, usize),
    /// Whether the result is significant at `alpha_level`
    pub is_significant: bool,
    /// Significance level used (after any correction)
    pub alpha_level: f64,
}

/// Results of effect size computation
#[derive(Debug, Clone)]
pub struct EffectSizeResult {
    /// Measure used to compute the effect size
    pub measure: EffectSizeMeasure,
    /// Computed effect size value
    pub value: f64,
    /// Approximate 95% confidence interval for the effect size
    pub confidence_interval: (f64, f64),
    /// Qualitative interpretation of the magnitude
    pub interpretation: EffectSizeInterpretation,
    /// Sample sizes of the two groups
    pub sample_sizes: (usize, usize),
}

/// Effect size interpretation according to conventional thresholds
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EffectSizeInterpretation {
    /// Below the conventional threshold for a small effect
    Negligible,
    /// Small effect
    Small,
    /// Medium effect
    Medium,
    /// Large effect
    Large,
    /// Very large effect
    VeryLarge,
}

/// Model comparison result
#[derive(Debug, Clone)]
pub struct ModelComparisonResult {
    /// Names of the compared models
    pub model_names: Vec<String>,
    /// Mean cross-validation score per model
    pub performance_scores: Vec<f64>,
    /// All pairwise statistical comparisons
    pub pairwise_comparisons: Vec<PairwiseComparison>,
    /// Indices of models sorted by performance, best first
    pub ranking: Vec<usize>,
    /// Index of the best-performing model
    pub best_model_index: usize,
    /// Aggregate statistics across all models
    pub statistical_summary: StatisticalSummary,
}

/// Pairwise comparison between two models
#[derive(Debug, Clone)]
pub struct PairwiseComparison {
    /// Index of the first model
    pub model_a_index: usize,
    /// Index of the second model
    pub model_b_index: usize,
    /// Significance test result for this pair
    pub significance_test: SignificanceTestResult,
    /// Approximate Bayes factor for the comparison, if computed
    pub bayes_factor: Option<f64>,
    /// Whether the effect size exceeds the practical-significance threshold
    pub practical_significance: bool,
}

/// Statistical summary of model comparisons
#[derive(Debug, Clone)]
pub struct StatisticalSummary {
    /// Mean score per model
    pub mean_scores: Vec<f64>,
    /// Standard deviation of scores per model
    pub std_scores: Vec<f64>,
    /// 95% confidence interval for each model's mean score
    pub confidence_intervals: Vec<(f64, f64)>,
    /// Index of the model with the highest mean score
    pub overall_best_model: usize,
    /// Pairs of models whose difference is statistically significant
    pub significantly_different_pairs: Vec<(usize, usize)>,
    /// Effect size for each compared pair, keyed by model indices
    pub effect_sizes: HashMap<(usize, usize), f64>,
}

/// Main comparative analysis engine
pub struct ComparativeAnalyzer {
    alpha_level: f64,
    random_state: Option<u64>,
    correction_method: MultipleComparisonCorrection,
}

/// Multiple comparison correction methods
#[derive(Debug, Clone, Copy)]
pub enum MultipleComparisonCorrection {
    /// No correction
    None,
    /// Bonferroni correction (controls the family-wise error rate)
    Bonferroni,
    /// Holm step-down procedure (controls the family-wise error rate)
    Holm,
    /// Benjamini-Hochberg procedure (controls the false discovery rate)
    BenjaminiHochberg,
    /// Benjamini-Yekutieli procedure (FDR control under arbitrary dependence)
    BenjaminiYekutieli,
}

impl ComparativeAnalyzer {
    /// Create a new comparative analyzer
    pub fn new() -> Self {
        Self {
            alpha_level: 0.05,
            random_state: None,
            correction_method: MultipleComparisonCorrection::Holm,
        }
    }

    /// Set significance level
    pub fn with_alpha(mut self, alpha: f64) -> Self {
        self.alpha_level = alpha;
        self
    }

    /// Set random state for reproducible results
    pub fn with_random_state(mut self, seed: u64) -> Self {
        self.random_state = Some(seed);
        self
    }

    /// Set multiple comparison correction method
    pub fn with_correction(mut self, correction: MultipleComparisonCorrection) -> Self {
        self.correction_method = correction;
        self
    }

    /// Perform statistical significance test between two groups
    pub fn significance_test(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        test_type: SignificanceTest,
        effect_size_measure: EffectSizeMeasure,
    ) -> Result<SignificanceTestResult, SklearsError> {
        let statistic = self.compute_test_statistic(group_a, group_b, test_type)?;
        let p_value = self.compute_p_value(group_a, group_b, test_type, statistic)?;
        let effect_size = self.compute_effect_size(group_a, group_b, effect_size_measure)?;
        let confidence_interval = self.compute_confidence_interval(
            group_a,
            group_b,
            ConfidenceIntervalType::Normal,
            0.95,
        )?;

        let corrected_alpha = self.apply_correction(self.alpha_level, 1);
        let is_significant = p_value < corrected_alpha;

        Ok(SignificanceTestResult {
            test_type,
            statistic,
            p_value,
            effect_size,
            effect_size_measure,
            confidence_interval,
            sample_sizes: (group_a.len(), group_b.len()),
            is_significant,
            alpha_level: corrected_alpha,
        })
    }

    /// Compute effect size between two groups
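    ///
    /// A usage sketch mirroring the unit tests below (marked `ignore` since the
    /// full import paths depend on how the crate re-exports this module):
    ///
    /// ```ignore
    /// let analyzer = ComparativeAnalyzer::new();
    /// let a = array![1.0, 2.0, 3.0, 4.0, 5.0];
    /// let b = array![3.0, 4.0, 5.0, 6.0, 7.0];
    /// let es = analyzer.effect_size(&a, &b, EffectSizeMeasure::CohensD).unwrap();
    /// assert!(es.value < 0.0); // group `a` has the lower mean
    /// ```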
    pub fn effect_size(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        measure: EffectSizeMeasure,
    ) -> Result<EffectSizeResult, SklearsError> {
        let value = self.compute_effect_size(group_a, group_b, measure)?;
        let confidence_interval =
            self.effect_size_confidence_interval(group_a, group_b, measure)?;
        let interpretation = self.interpret_effect_size(value, measure);

        Ok(EffectSizeResult {
            measure,
            value,
            confidence_interval,
            interpretation,
            sample_sizes: (group_a.len(), group_b.len()),
        })
    }

    /// Compare multiple models using cross-validation scores
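    ///
    /// Usage sketch, mirroring the unit test below (marked `ignore`; paths are
    /// illustrative):
    ///
    /// ```ignore
    /// let analyzer = ComparativeAnalyzer::new();
    /// let names = vec!["Model A".to_string(), "Model B".to_string()];
    /// let cv_scores = vec![
    ///     array![0.80, 0.82, 0.78, 0.81, 0.79],
    ///     array![0.75, 0.77, 0.73, 0.76, 0.74],
    /// ];
    /// let comparison = analyzer.compare_models(names, cv_scores).unwrap();
    /// assert_eq!(comparison.best_model_index, 0);
    /// ```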
    pub fn compare_models(
        &self,
        model_names: Vec<String>,
        cv_scores: Vec<Array1<f64>>,
    ) -> Result<ModelComparisonResult, SklearsError> {
        if model_names.len() != cv_scores.len() {
            return Err(SklearsError::InvalidInput(
                "Number of model names must match number of score arrays".to_string(),
            ));
        }

        let performance_scores: Vec<f64> = cv_scores
            .iter()
            .map(|scores| scores.mean().unwrap_or(0.0))
            .collect();

        // Rank models by performance (descending)
        let mut ranking: Vec<usize> = (0..model_names.len()).collect();
        ranking.sort_by(|&a, &b| {
            performance_scores[b]
                .partial_cmp(&performance_scores[a])
                .unwrap()
        });
        let best_model_index = ranking[0];

        // Pairwise comparisons
        let mut pairwise_comparisons = Vec::new();
        let n_models = model_names.len();
        let n_comparisons = n_models * (n_models - 1) / 2;

        for i in 0..n_models {
            for j in (i + 1)..n_models {
                let significance_test = self.significance_test(
                    &cv_scores[i],
                    &cv_scores[j],
                    SignificanceTest::WilcoxonRankSum,
                    EffectSizeMeasure::CliffsDelta,
                )?;

                let bayes_factor = self.compute_bayes_factor(&cv_scores[i], &cv_scores[j])?;
                // Practical significance: |Cliff's delta| beyond a small-effect threshold
                let practical_significance = significance_test.effect_size.abs() > 0.2;

                pairwise_comparisons.push(PairwiseComparison {
                    model_a_index: i,
                    model_b_index: j,
                    significance_test,
                    bayes_factor: Some(bayes_factor),
                    practical_significance,
                });
            }
        }

        // Apply multiple comparison correction
        let corrected_comparisons =
            self.apply_multiple_comparison_correction(pairwise_comparisons, n_comparisons);

        // Statistical summary
        let statistical_summary =
            self.compute_statistical_summary(&cv_scores, &corrected_comparisons)?;

        Ok(ModelComparisonResult {
            model_names,
            performance_scores,
            pairwise_comparisons: corrected_comparisons,
            ranking,
            best_model_index,
            statistical_summary,
        })
    }

    fn compute_test_statistic(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        test_type: SignificanceTest,
    ) -> Result<f64, SklearsError> {
        match test_type {
            SignificanceTest::TTest | SignificanceTest::WelchTTest => self.compute_t_statistic(
                group_a,
                group_b,
                matches!(test_type, SignificanceTest::WelchTTest),
            ),
            SignificanceTest::WilcoxonRankSum | SignificanceTest::MannWhitneyU => {
                self.compute_rank_sum_statistic(group_a, group_b)
            }
            SignificanceTest::PermutationTest { n_permutations } => {
                self.compute_permutation_statistic(group_a, group_b, n_permutations)
            }
            SignificanceTest::BootstrapTest { n_bootstrap } => {
                self.compute_bootstrap_statistic(group_a, group_b, n_bootstrap)
            }
        }
    }

    fn compute_t_statistic(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        welch: bool,
    ) -> Result<f64, SklearsError> {
        let mean_a = group_a.mean().unwrap_or(0.0);
        let mean_b = group_b.mean().unwrap_or(0.0);
        let n_a = group_a.len() as f64;
        let n_b = group_b.len() as f64;

        let var_a = group_a.iter().map(|&x| (x - mean_a).powi(2)).sum::<f64>() / (n_a - 1.0);
        let var_b = group_b.iter().map(|&x| (x - mean_b).powi(2)).sum::<f64>() / (n_b - 1.0);

        let t_statistic = if welch {
            // Welch's t-test (unequal variances)
            let se = (var_a / n_a + var_b / n_b).sqrt();
            (mean_a - mean_b) / se
        } else {
            // Student's t-test (equal variances)
            let pooled_var = ((n_a - 1.0) * var_a + (n_b - 1.0) * var_b) / (n_a + n_b - 2.0);
            let se = pooled_var.sqrt() * (1.0 / n_a + 1.0 / n_b).sqrt();
            (mean_a - mean_b) / se
        };

        Ok(t_statistic)
    }

    fn compute_rank_sum_statistic(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
    ) -> Result<f64, SklearsError> {
        let mut combined: Vec<(f64, usize)> = group_a.iter().map(|&x| (x, 0)).collect();
        combined.extend(group_b.iter().map(|&x| (x, 1)));
        combined.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
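        // Note: ties are broken arbitrarily by this sort; the simplified ranking
        // below does not assign average ranks or apply a tie correction to the variance.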

        let mut rank_sum_a = 0.0;
        for (rank, (_, group)) in combined.iter().enumerate() {
            if *group == 0 {
                rank_sum_a += (rank + 1) as f64;
            }
        }

        let n_a = group_a.len() as f64;
        let n_b = group_b.len() as f64;
        let expected_rank_sum = n_a * (n_a + n_b + 1.0) / 2.0;
        let variance = n_a * n_b * (n_a + n_b + 1.0) / 12.0;

        let z_statistic = (rank_sum_a - expected_rank_sum) / variance.sqrt();
        Ok(z_statistic)
    }

    fn compute_permutation_statistic(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        n_permutations: usize,
    ) -> Result<f64, SklearsError> {
        let observed_diff = group_a.mean().unwrap_or(0.0) - group_b.mean().unwrap_or(0.0);

        let mut rng = if let Some(seed) = self.random_state {
            scirs2_core::random::rngs::StdRng::seed_from_u64(seed)
        } else {
            scirs2_core::random::rngs::StdRng::seed_from_u64(0)
        };

        let mut combined: Vec<f64> = group_a.iter().chain(group_b.iter()).cloned().collect();
        let n_a = group_a.len();

        let mut extreme_count = 0;
        for _ in 0..n_permutations {
            // Shuffle the combined data
            for i in (1..combined.len()).rev() {
                let j = rng.gen_range(0..i + 1);
                combined.swap(i, j);
            }

            let perm_mean_a = combined[..n_a].iter().sum::<f64>() / n_a as f64;
            let perm_mean_b = combined[n_a..].iter().sum::<f64>() / (combined.len() - n_a) as f64;
            let perm_diff = perm_mean_a - perm_mean_b;

            if perm_diff.abs() >= observed_diff.abs() {
                extreme_count += 1;
            }
        }

        let p_value = extreme_count as f64 / n_permutations as f64;
        Ok(p_value) // Return p-value directly for permutation test
    }

    fn compute_bootstrap_statistic(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        n_bootstrap: usize,
    ) -> Result<f64, SklearsError> {
        let observed_diff = group_a.mean().unwrap_or(0.0) - group_b.mean().unwrap_or(0.0);

        let mut rng = if let Some(seed) = self.random_state {
            scirs2_core::random::rngs::StdRng::seed_from_u64(seed)
        } else {
            scirs2_core::random::rngs::StdRng::seed_from_u64(0)
        };

        let mut bootstrap_diffs = Vec::with_capacity(n_bootstrap);

        for _ in 0..n_bootstrap {
            // Resample each group with replacement
            let boot_a: Vec<f64> = (0..group_a.len())
                .map(|_| group_a[rng.gen_range(0..group_a.len())])
                .collect();
            let boot_b: Vec<f64> = (0..group_b.len())
                .map(|_| group_b[rng.gen_range(0..group_b.len())])
                .collect();

            let boot_mean_a = boot_a.iter().sum::<f64>() / boot_a.len() as f64;
            let boot_mean_b = boot_b.iter().sum::<f64>() / boot_b.len() as f64;
            bootstrap_diffs.push(boot_mean_a - boot_mean_b);
        }

        let std_bootstrap = {
            let mean_bootstrap = bootstrap_diffs.iter().sum::<f64>() / bootstrap_diffs.len() as f64;
            let variance = bootstrap_diffs
                .iter()
                .map(|&x| (x - mean_bootstrap).powi(2))
                .sum::<f64>()
                / (bootstrap_diffs.len() - 1) as f64;
            variance.sqrt()
        };

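        // Report a z-like statistic: the observed mean difference scaled by the
        // bootstrap estimate of its standard error.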
        Ok(observed_diff / std_bootstrap)
    }

    fn compute_p_value(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        test_type: SignificanceTest,
        statistic: f64,
    ) -> Result<f64, SklearsError> {
        match test_type {
            SignificanceTest::TTest => {
                let df = group_a.len() + group_b.len() - 2;
                self.t_distribution_p_value(statistic, df)
            }
            SignificanceTest::WelchTTest => {
                let n_a = group_a.len() as f64;
                let n_b = group_b.len() as f64;
                let var_a = group_a
                    .iter()
                    .map(|&x| (x - group_a.mean().unwrap_or(0.0)).powi(2))
                    .sum::<f64>()
                    / (n_a - 1.0);
                let var_b = group_b
                    .iter()
                    .map(|&x| (x - group_b.mean().unwrap_or(0.0)).powi(2))
                    .sum::<f64>()
                    / (n_b - 1.0);

                // Welch-Satterthwaite equation for degrees of freedom
                let s_a2_n_a = var_a / n_a;
                let s_b2_n_b = var_b / n_b;
                let numerator = (s_a2_n_a + s_b2_n_b).powi(2);
                let denominator = s_a2_n_a.powi(2) / (n_a - 1.0) + s_b2_n_b.powi(2) / (n_b - 1.0);
                let df = (numerator / denominator).floor() as usize;

                self.t_distribution_p_value(statistic, df)
            }
            SignificanceTest::WilcoxonRankSum | SignificanceTest::MannWhitneyU => {
                self.normal_distribution_p_value(statistic)
            }
            SignificanceTest::PermutationTest { .. } => {
                Ok(statistic) // statistic is already the p-value for permutation test
            }
            SignificanceTest::BootstrapTest { .. } => self.normal_distribution_p_value(statistic),
        }
    }

    fn compute_effect_size(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        measure: EffectSizeMeasure,
    ) -> Result<f64, SklearsError> {
        match measure {
            EffectSizeMeasure::CohensD => self.cohens_d(group_a, group_b),
            EffectSizeMeasure::GlassDelta => self.glass_delta(group_a, group_b),
            EffectSizeMeasure::HedgesG => self.hedges_g(group_a, group_b),
            EffectSizeMeasure::CliffsDelta => self.cliffs_delta(group_a, group_b),
            EffectSizeMeasure::CommonLanguageEffect => {
                self.common_language_effect(group_a, group_b)
            }
            EffectSizeMeasure::ProbabilityOfSuperiority => {
                self.probability_of_superiority(group_a, group_b)
            }
        }
    }

    fn cohens_d(&self, group_a: &Array1<f64>, group_b: &Array1<f64>) -> Result<f64, SklearsError> {
        let mean_a = group_a.mean().unwrap_or(0.0);
        let mean_b = group_b.mean().unwrap_or(0.0);
        let n_a = group_a.len() as f64;
        let n_b = group_b.len() as f64;

        let var_a = group_a.iter().map(|&x| (x - mean_a).powi(2)).sum::<f64>() / (n_a - 1.0);
        let var_b = group_b.iter().map(|&x| (x - mean_b).powi(2)).sum::<f64>() / (n_b - 1.0);

        let pooled_std = (((n_a - 1.0) * var_a + (n_b - 1.0) * var_b) / (n_a + n_b - 2.0)).sqrt();

        Ok((mean_a - mean_b) / pooled_std)
    }

    fn glass_delta(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
    ) -> Result<f64, SklearsError> {
        let mean_a = group_a.mean().unwrap_or(0.0);
        let mean_b = group_b.mean().unwrap_or(0.0);
        let n_b = group_b.len() as f64;

        let var_b = group_b.iter().map(|&x| (x - mean_b).powi(2)).sum::<f64>() / (n_b - 1.0);
        let std_b = var_b.sqrt();

        Ok((mean_a - mean_b) / std_b)
    }

    fn hedges_g(&self, group_a: &Array1<f64>, group_b: &Array1<f64>) -> Result<f64, SklearsError> {
        let cohens_d = self.cohens_d(group_a, group_b)?;
        let n_a = group_a.len() as f64;
        let n_b = group_b.len() as f64;

        // Bias correction factor
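        // Small-sample approximation J ≈ 1 - 3 / (4·df - 1), commonly attributed
        // to Hedges & Olkin (1985), rather than the exact gamma-function form.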
        let df = n_a + n_b - 2.0;
        let correction = 1.0 - 3.0 / (4.0 * df - 1.0);

        Ok(cohens_d * correction)
    }

    fn cliffs_delta(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
    ) -> Result<f64, SklearsError> {
        let mut dominance_count = 0;
        let total_comparisons = group_a.len() * group_b.len();

        for &a in group_a.iter() {
            for &b in group_b.iter() {
                if a > b {
                    dominance_count += 1;
                } else if a < b {
                    dominance_count -= 1;
                }
                // Ties contribute 0
            }
        }

        Ok(dominance_count as f64 / total_comparisons as f64)
    }

    fn common_language_effect(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
    ) -> Result<f64, SklearsError> {
        let mut superiority_count = 0;
        let total_comparisons = group_a.len() * group_b.len();

        for &a in group_a.iter() {
            for &b in group_b.iter() {
                if a > b {
                    superiority_count += 1;
                }
            }
        }

        Ok(superiority_count as f64 / total_comparisons as f64)
    }

    fn probability_of_superiority(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
    ) -> Result<f64, SklearsError> {
        // Same as common language effect for continuous data
        self.common_language_effect(group_a, group_b)
    }

    fn compute_confidence_interval(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        ci_type: ConfidenceIntervalType,
        confidence_level: f64,
    ) -> Result<(f64, f64), SklearsError> {
        match ci_type {
            ConfidenceIntervalType::Normal => {
                let mean_diff = group_a.mean().unwrap_or(0.0) - group_b.mean().unwrap_or(0.0);
                let n_a = group_a.len() as f64;
                let n_b = group_b.len() as f64;
                let mean_a = group_a.mean().unwrap_or(0.0);
                let mean_b = group_b.mean().unwrap_or(0.0);

                let var_a =
                    group_a.iter().map(|&x| (x - mean_a).powi(2)).sum::<f64>() / (n_a - 1.0);
                let var_b =
                    group_b.iter().map(|&x| (x - mean_b).powi(2)).sum::<f64>() / (n_b - 1.0);
                let se = (var_a / n_a + var_b / n_b).sqrt();

                let z_critical = self.normal_quantile((1.0 + confidence_level) / 2.0);
                let margin_of_error = z_critical * se;

                Ok((mean_diff - margin_of_error, mean_diff + margin_of_error))
            }
            ConfidenceIntervalType::TDistribution => {
                let mean_diff = group_a.mean().unwrap_or(0.0) - group_b.mean().unwrap_or(0.0);
                let n_a = group_a.len() as f64;
                let n_b = group_b.len() as f64;
                let df = (n_a + n_b - 2.0) as usize;
                let mean_a = group_a.mean().unwrap_or(0.0);
                let mean_b = group_b.mean().unwrap_or(0.0);

                let var_a =
                    group_a.iter().map(|&x| (x - mean_a).powi(2)).sum::<f64>() / (n_a - 1.0);
                let var_b =
                    group_b.iter().map(|&x| (x - mean_b).powi(2)).sum::<f64>() / (n_b - 1.0);
                let pooled_var = ((n_a - 1.0) * var_a + (n_b - 1.0) * var_b) / (n_a + n_b - 2.0);
                let se = pooled_var.sqrt() * (1.0 / n_a + 1.0 / n_b).sqrt();

                let t_critical = self.t_quantile((1.0 + confidence_level) / 2.0, df);
                let margin_of_error = t_critical * se;

                Ok((mean_diff - margin_of_error, mean_diff + margin_of_error))
            }
            _ => {
                // For bootstrap methods, use normal approximation as fallback
                self.compute_confidence_interval(
                    group_a,
                    group_b,
                    ConfidenceIntervalType::Normal,
                    confidence_level,
                )
            }
        }
    }

    fn effect_size_confidence_interval(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
        measure: EffectSizeMeasure,
    ) -> Result<(f64, f64), SklearsError> {
        // Simplified confidence interval for effect sizes; a full implementation
        // would use bootstrap or analytical methods.
        let effect_size = self.compute_effect_size(group_a, group_b, measure)?;
        let n_a = group_a.len() as f64;
        let n_b = group_b.len() as f64;

        // Rough approximation of the standard error
        let se = match measure {
            EffectSizeMeasure::CohensD | EffectSizeMeasure::HedgesG => {
                ((n_a + n_b) / (n_a * n_b) + effect_size.powi(2) / (2.0 * (n_a + n_b))).sqrt()
            }
            _ => 0.1, // Simplified for other measures
        };

        let z_critical = self.normal_quantile(0.975); // 95% CI
        let margin_of_error = z_critical * se;

        Ok((effect_size - margin_of_error, effect_size + margin_of_error))
    }

    fn interpret_effect_size(
        &self,
        value: f64,
        measure: EffectSizeMeasure,
    ) -> EffectSizeInterpretation {
        let abs_value = value.abs();

        match measure {
            EffectSizeMeasure::CohensD | EffectSizeMeasure::HedgesG => {
                if abs_value < 0.2 {
                    EffectSizeInterpretation::Negligible
                } else if abs_value < 0.5 {
                    EffectSizeInterpretation::Small
                } else if abs_value < 0.8 {
                    EffectSizeInterpretation::Medium
                } else if abs_value < 1.2 {
                    EffectSizeInterpretation::Large
                } else {
                    EffectSizeInterpretation::VeryLarge
                }
            }
            EffectSizeMeasure::CliffsDelta => {
                if abs_value < 0.147 {
                    EffectSizeInterpretation::Negligible
                } else if abs_value < 0.33 {
                    EffectSizeInterpretation::Small
                } else if abs_value < 0.474 {
                    EffectSizeInterpretation::Medium
                } else {
                    EffectSizeInterpretation::Large
                }
            }
            _ => {
                // Generic interpretation
                if abs_value < 0.1 {
                    EffectSizeInterpretation::Negligible
                } else if abs_value < 0.3 {
                    EffectSizeInterpretation::Small
                } else if abs_value < 0.5 {
                    EffectSizeInterpretation::Medium
                } else {
                    EffectSizeInterpretation::Large
                }
            }
        }
    }

    fn compute_bayes_factor(
        &self,
        group_a: &Array1<f64>,
        group_b: &Array1<f64>,
    ) -> Result<f64, SklearsError> {
        // Simplified Bayes factor via the BIC approximation for a two-sample
        // t-test (Wagenmakers, 2007): BIC_null - BIC_alt = n ln(1 + t^2/df) - ln(n),
        // and BF10 ≈ exp((BIC_null - BIC_alt) / 2).
        let t_stat = self.compute_t_statistic(group_a, group_b, false)?;
        let n = (group_a.len() + group_b.len()) as f64;
        let df = n - 2.0;

        let log_bf = (n * (1.0 + t_stat.powi(2) / df).ln() - n.ln()) / 2.0;

        Ok(log_bf.exp())
    }

    fn apply_correction(&self, alpha: f64, n_comparisons: usize) -> f64 {
        match self.correction_method {
            MultipleComparisonCorrection::None => alpha,
            MultipleComparisonCorrection::Bonferroni => alpha / n_comparisons as f64,
            // Step-down/step-up procedures adjust each comparison individually in
            // `apply_multiple_comparison_correction`, so alpha is unchanged here.
            MultipleComparisonCorrection::Holm
            | MultipleComparisonCorrection::BenjaminiHochberg
            | MultipleComparisonCorrection::BenjaminiYekutieli => alpha,
        }
    }

    fn apply_multiple_comparison_correction(
        &self,
        mut comparisons: Vec<PairwiseComparison>,
        n_comparisons: usize,
    ) -> Vec<PairwiseComparison> {
        match self.correction_method {
            MultipleComparisonCorrection::None | MultipleComparisonCorrection::Bonferroni => {
                // Already applied in individual tests
                comparisons
            }
            MultipleComparisonCorrection::Holm => {
                // Holm step-down: sort p-values ascending and compare the i-th
                // smallest against alpha / (m - i); once one test fails, all
                // remaining (larger) p-values are non-significant.
                comparisons.sort_by(|a, b| {
                    a.significance_test
                        .p_value
                        .partial_cmp(&b.significance_test.p_value)
                        .unwrap()
                });
                let mut stopped = false;
                for (i, comparison) in comparisons.iter_mut().enumerate() {
                    let corrected_alpha = self.alpha_level / (n_comparisons - i) as f64;
                    if comparison.significance_test.p_value >= corrected_alpha {
                        stopped = true;
                    }
                    comparison.significance_test.is_significant = !stopped;
                    comparison.significance_test.alpha_level = corrected_alpha;
                }
                comparisons
            }
            MultipleComparisonCorrection::BenjaminiHochberg => {
                // Benjamini-Hochberg step-up FDR control: find the largest k with
                // p(k) <= k/m * alpha; all tests up to and including k are significant.
                comparisons.sort_by(|a, b| {
                    a.significance_test
                        .p_value
                        .partial_cmp(&b.significance_test.p_value)
                        .unwrap()
                });
                let mut last_significant = None;
                for (i, comparison) in comparisons.iter().enumerate() {
                    let corrected_alpha = self.alpha_level * (i + 1) as f64 / n_comparisons as f64;
                    if comparison.significance_test.p_value <= corrected_alpha {
                        last_significant = Some(i);
                    }
                }
                for (i, comparison) in comparisons.iter_mut().enumerate() {
                    let corrected_alpha = self.alpha_level * (i + 1) as f64 / n_comparisons as f64;
                    comparison.significance_test.is_significant =
                        last_significant.map_or(false, |k| i <= k);
                    comparison.significance_test.alpha_level = corrected_alpha;
                }
                comparisons
            }
            MultipleComparisonCorrection::BenjaminiYekutieli => {
                // Benjamini-Yekutieli: as Benjamini-Hochberg, but with the harmonic
                // factor c(m) = sum_{i=1}^{m} 1/i for arbitrary dependence.
                let c_factor: f64 = (1..=n_comparisons).map(|i| 1.0 / i as f64).sum();
                comparisons.sort_by(|a, b| {
                    a.significance_test
                        .p_value
                        .partial_cmp(&b.significance_test.p_value)
                        .unwrap()
                });
                let mut last_significant = None;
                for (i, comparison) in comparisons.iter().enumerate() {
                    let corrected_alpha =
                        self.alpha_level * (i + 1) as f64 / (n_comparisons as f64 * c_factor);
                    if comparison.significance_test.p_value <= corrected_alpha {
                        last_significant = Some(i);
                    }
                }
                for (i, comparison) in comparisons.iter_mut().enumerate() {
                    let corrected_alpha =
                        self.alpha_level * (i + 1) as f64 / (n_comparisons as f64 * c_factor);
                    comparison.significance_test.is_significant =
                        last_significant.map_or(false, |k| i <= k);
                    comparison.significance_test.alpha_level = corrected_alpha;
                }
                comparisons
            }
        }
    }

    fn compute_statistical_summary(
        &self,
        cv_scores: &[Array1<f64>],
        comparisons: &[PairwiseComparison],
    ) -> Result<StatisticalSummary, SklearsError> {
        let mean_scores: Vec<f64> = cv_scores
            .iter()
            .map(|scores| scores.mean().unwrap_or(0.0))
            .collect();
        let std_scores: Vec<f64> = cv_scores
            .iter()
            .enumerate()
            .map(|(i, scores)| {
                let mean = mean_scores[i];
                let variance = scores.iter().map(|&x| (x - mean).powi(2)).sum::<f64>()
                    / (scores.len() - 1) as f64;
                variance.sqrt()
            })
            .collect();

        let confidence_intervals: Vec<(f64, f64)> = cv_scores
            .iter()
            .enumerate()
            .map(|(i, scores)| {
                let mean = mean_scores[i];
                let std = std_scores[i];
                let n = scores.len() as f64;
                let se = std / n.sqrt();
                let t_critical = self.t_quantile(0.975, scores.len() - 1);
                let margin = t_critical * se;
                (mean - margin, mean + margin)
            })
            .collect();

        let overall_best_model = mean_scores
            .iter()
            .enumerate()
            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
            .map(|(i, _)| i)
            .unwrap_or(0);

        let significantly_different_pairs: Vec<(usize, usize)> = comparisons
            .iter()
            .filter(|comp| comp.significance_test.is_significant)
            .map(|comp| (comp.model_a_index, comp.model_b_index))
            .collect();

        let effect_sizes: HashMap<(usize, usize), f64> = comparisons
            .iter()
            .map(|comp| {
                (
                    (comp.model_a_index, comp.model_b_index),
                    comp.significance_test.effect_size,
                )
            })
            .collect();

        Ok(StatisticalSummary {
            mean_scores,
            std_scores,
            confidence_intervals,
            overall_best_model,
            significantly_different_pairs,
            effect_sizes,
        })
    }

    // Helper functions for statistical distributions
    fn normal_quantile(&self, p: f64) -> f64 {
        // Acklam's rational approximation to the inverse normal CDF
        let a = [
            -3.969683028665376e+01,
            2.209460984245205e+02,
            -2.759285104469687e+02,
            1.383577518672690e+02,
            -3.066479806614716e+01,
            2.506628277459239e+00,
        ];
        let b = [
            -5.447609879822406e+01,
            1.615858368580409e+02,
            -1.556989798598866e+02,
            6.680131188771972e+01,
            -1.328068155288572e+01,
        ];
        let c = [
            -7.784894002430293e-03,
            -3.223964580411365e-01,
            -2.400758277161838e+00,
            -2.549732539343734e+00,
            4.374664141464968e+00,
            2.938163982698783e+00,
        ];
        let d = [
            7.784695709041462e-03,
            3.224671290700398e-01,
            2.445134137142996e+00,
            3.754408661907416e+00,
        ];

        let p_low = 0.02425;
        let p_high = 1.0 - p_low;

        if p < p_low {
            let q = (-2.0 * p.ln()).sqrt();
            -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5])
                / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1.0)
        } else if p <= p_high {
            let q = p - 0.5;
            let r = q * q;
            (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q
                / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1.0)
        } else {
            let q = (-2.0 * (1.0 - p).ln()).sqrt();
            (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5])
                / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1.0)
        }
    }

    fn t_quantile(&self, p: f64, df: usize) -> f64 {
        // Simplified t-distribution quantile approximation
        let z = self.normal_quantile(p);
        let df_f = df as f64;

        if df >= 30 {
            // For large df, t-distribution approaches normal
            return z;
        }

        // Hill's approximation for t-distribution quantile
        let a = 4.0 * df_f;
        let c = (a + z * z) / a;
        let correction = 1.0 + (z * z + 1.0) / a;

        z * c.sqrt() * correction
    }

    fn t_distribution_p_value(&self, t: f64, df: usize) -> Result<f64, SklearsError> {
        // Simplified two-tailed p-value calculation; a production implementation
        // would use a proper t-distribution CDF.
        let abs_t = t.abs();

        if df >= 30 {
            return self.normal_distribution_p_value(abs_t);
        }

        // Coarse bucketed placeholder for smaller df
        let p = if abs_t > 3.0 {
            0.001
        } else if abs_t > 2.0 {
            0.05
        } else if abs_t > 1.0 {
            0.3
        } else {
            0.6
        };

        Ok(p)
    }

    fn normal_distribution_p_value(&self, z: f64) -> Result<f64, SklearsError> {
        // Two-tailed p-value using the normal distribution
        let abs_z = z.abs();

        // Approximate normal CDF complement (Abramowitz & Stegun 26.2.17)
        let p = if abs_z > 6.0 {
            0.000000001
        } else {
            let t = 1.0 / (1.0 + 0.2316419 * abs_z);
            let d = 0.3989423 * (-abs_z * abs_z / 2.0).exp();
            d * t * (0.3193815 + t * (-0.3565638 + t * (1.781478 + t * (-1.821256 + t * 1.330274))))
        };

        Ok(2.0 * p) // Two-tailed
    }
}

impl Default for ComparativeAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

/// Utility functions for generating comparison reports
pub struct ComparisonReporter;

impl ComparisonReporter {
    /// Generate a comprehensive comparison report
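    ///
    /// Usage sketch (assumes a `ModelComparisonResult` obtained from
    /// `ComparativeAnalyzer::compare_models`; not compiled as a doctest):
    ///
    /// ```ignore
    /// let report = ComparisonReporter::generate_report(&comparison);
    /// println!("{report}");
    /// ```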
    pub fn generate_report(comparison: &ModelComparisonResult) -> String {
        let mut report = String::new();

        report.push_str("# Model Comparison Report\n\n");

        // Overall ranking
        report.push_str("## Overall Ranking\n");
        for (rank, &model_idx) in comparison.ranking.iter().enumerate() {
            let model_name = &comparison.model_names[model_idx];
            let score = comparison.performance_scores[model_idx];
            report.push_str(&format!(
                "{}. {} (Score: {:.4})\n",
                rank + 1,
                model_name,
                score
            ));
        }

        // Statistical summary
        report.push_str("\n## Statistical Summary\n");
        for (i, model_name) in comparison.model_names.iter().enumerate() {
            let mean = comparison.statistical_summary.mean_scores[i];
            let std = comparison.statistical_summary.std_scores[i];
            let (ci_low, ci_high) = comparison.statistical_summary.confidence_intervals[i];
            report.push_str(&format!(
                "- {}: Mean = {:.4} ± {:.4}, 95% CI = [{:.4}, {:.4}]\n",
                model_name, mean, std, ci_low, ci_high
            ));
        }

        // Pairwise comparisons
        report.push_str("\n## Significant Pairwise Differences\n");
        for pairwise_comp in &comparison.pairwise_comparisons {
            if pairwise_comp.significance_test.is_significant {
                let model_a = &comparison.model_names[pairwise_comp.model_a_index];
                let model_b = &comparison.model_names[pairwise_comp.model_b_index];
                let p_value = pairwise_comp.significance_test.p_value;
                let effect_size = pairwise_comp.significance_test.effect_size;

                report.push_str(&format!(
                    "- {} vs {}: p = {:.4}, Effect Size = {:.4}\n",
                    model_a, model_b, p_value, effect_size
                ));
            }
        }

        report
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_significance_test() {
        let analyzer = ComparativeAnalyzer::new();
        let group_a = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let group_b = array![2.0, 3.0, 4.0, 5.0, 6.0];

        let result = analyzer
            .significance_test(
                &group_a,
                &group_b,
                SignificanceTest::TTest,
                EffectSizeMeasure::CohensD,
            )
            .unwrap();

        assert!(result.p_value >= 0.0 && result.p_value <= 1.0);
        assert!(!result.statistic.is_nan());
        assert!(!result.effect_size.is_nan());
    }

    #[test]
    fn test_effect_size_cohens_d() {
        let analyzer = ComparativeAnalyzer::new();
        let group_a = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let group_b = array![3.0, 4.0, 5.0, 6.0, 7.0];

        let result = analyzer
            .effect_size(&group_a, &group_b, EffectSizeMeasure::CohensD)
            .unwrap();

        // group_a has the lower mean, so Cohen's d should be negative;
        // for this data it is about -1.26, at least a medium effect.
        assert!(result.value < 0.0);
        assert!(result.value.abs() > 0.5);
        assert!(!matches!(
            result.interpretation,
            EffectSizeInterpretation::Negligible
        ));
    }

    #[test]
    fn test_cliffs_delta() {
        let analyzer = ComparativeAnalyzer::new();
        let group_a = array![1.0, 2.0, 3.0];
        let group_b = array![4.0, 5.0, 6.0];

        let delta = analyzer.cliffs_delta(&group_a, &group_b).unwrap();
        assert_eq!(delta, -1.0); // Every value in group_a is below every value in group_b
    }

    #[test]
    fn test_model_comparison() {
        let analyzer = ComparativeAnalyzer::new();
        let model_names = vec!["Model A".to_string(), "Model B".to_string()];
        let cv_scores = vec![
            array![0.8, 0.82, 0.78, 0.81, 0.79],
            array![0.75, 0.77, 0.73, 0.76, 0.74],
        ];

        let result = analyzer.compare_models(model_names, cv_scores).unwrap();

        assert_eq!(result.model_names.len(), 2);
        assert_eq!(result.performance_scores.len(), 2);
        assert_eq!(result.ranking[0], 0); // Model A should be ranked first
        assert_eq!(result.best_model_index, 0);
    }

    #[test]
    fn test_permutation_test() {
        let analyzer = ComparativeAnalyzer::new().with_random_state(42);
        let group_a = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let group_b = array![2.0, 3.0, 4.0, 5.0, 6.0];

        let result = analyzer
            .significance_test(
                &group_a,
                &group_b,
                SignificanceTest::PermutationTest {
                    n_permutations: 1000,
                },
                EffectSizeMeasure::CohensD,
            )
            .unwrap();

        assert!(result.p_value >= 0.0 && result.p_value <= 1.0);
    }

    #[test]
    fn test_confidence_intervals() {
        let analyzer = ComparativeAnalyzer::new();
        let group_a = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let group_b = array![2.0, 3.0, 4.0, 5.0, 6.0];

        let (lower, upper) = analyzer
            .compute_confidence_interval(
                &group_a,
                &group_b,
                ConfidenceIntervalType::TDistribution,
                0.95,
            )
            .unwrap();

        assert!(lower < upper);
        // group_a has the lower mean; the true mean difference is 3.0 - 4.0 = -1.0
        let mean_diff = 3.0 - 4.0;
        assert!(lower < mean_diff && mean_diff < upper); // CI should contain the true difference
    }
}