use crate::ground_truth_dataset::{GroundTruthDataset, GroundTruthManager, GroundTruthSample};
use crate::quality::QualityEvaluator;
use crate::statistical::correlation::CorrelationAnalyzer;
use crate::traits::QualityEvaluator as QualityEvaluatorTrait;
use crate::traits::QualityScore;
use crate::VoirsError;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use thiserror::Error;
use voirs_sdk::{AudioBuffer, LanguageCode};

/// Result of a single statistical hypothesis test used in reliability analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTestResult {
    pub test_name: String,
    pub statistic: f64,
    pub p_value: f64,
    pub critical_value: f64,
    pub significant: bool,
    pub effect_size: Option<f64>,
    pub confidence_interval: Option<(f64, f64)>,
}

#[derive(Error, Debug)]
pub enum ReliabilityTestError {
    #[error("Insufficient data for reliability testing: {0}")]
    InsufficientData(String),
    #[error("Test-retest reliability test failed: {0}")]
    TestRetestFailed(String),
    #[error("Inter-rater reliability test failed: {0}")]
    InterRaterFailed(String),
    #[error("Internal consistency test failed: {0}")]
    InternalConsistencyFailed(String),
    #[error("Reproducibility test failed: {0}")]
    ReproducibilityFailed(String),
    #[error("Statistical analysis failed: {0}")]
    StatisticalAnalysisFailed(String),
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
    #[error("VoiRS error: {0}")]
    VoirsError(#[from] VoirsError),
    #[error("Evaluation error: {0}")]
    EvaluationError(#[from] crate::EvaluationError),
    #[error("Ground truth error: {0}")]
    GroundTruthError(#[from] crate::ground_truth_dataset::GroundTruthError),
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReliabilityTestConfig {
    pub test_retest_interval_hours: f64,
    pub test_retest_repetitions: usize,
    pub min_test_retest_correlation: f64,
    pub min_inter_rater_correlation: f64,
    pub min_internal_consistency: f64,
    pub confidence_level: f64,
    pub enable_detailed_reporting: bool,
    pub enable_cross_platform_testing: bool,
    pub random_seed: Option<u64>,
}

impl Default for ReliabilityTestConfig {
    fn default() -> Self {
        Self {
            test_retest_interval_hours: 24.0,
            test_retest_repetitions: 3,
            min_test_retest_correlation: 0.8,
            min_inter_rater_correlation: 0.75,
            min_internal_consistency: 0.7,
            confidence_level: 0.95,
            enable_detailed_reporting: true,
            enable_cross_platform_testing: true,
            random_seed: Some(42),
        }
    }
}
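
// A minimal sketch (illustrative only) of tightening individual thresholds while
// keeping the remaining defaults via struct update syntax:
//
//     let strict = ReliabilityTestConfig {
//         min_test_retest_correlation: 0.9,
//         min_inter_rater_correlation: 0.85,
//         ..ReliabilityTestConfig::default()
//     };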

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricReliabilityResults {
    pub test_retest_reliability: TestRetestReliabilityResults,
    pub inter_rater_reliability: InterRaterReliabilityResults,
    pub internal_consistency: InternalConsistencyResults,
    pub reproducibility: ReproducibilityResults,
    pub overall_assessment: OverallReliabilityAssessment,
    pub timestamp: DateTime<Utc>,
    pub test_duration: std::time::Duration,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestRetestReliabilityResults {
    pub test_retest_correlation: f64,
    pub intraclass_correlation: f64,
    pub standard_error_measurement: f64,
    pub minimum_detectable_change: f64,
    pub metric_differences: HashMap<String, TestRetestMetricDifference>,
    pub statistical_significance: StatisticalTestResult,
    pub reliability_classification: ReliabilityClassification,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestRetestMetricDifference {
    pub mean_difference: f64,
    pub std_difference: f64,
    pub limits_of_agreement: (f64, f64),
    pub coefficient_of_variation: f64,
    pub reliability_coefficient: f64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterRaterReliabilityResults {
    pub inter_class_correlation: f64,
    pub fleiss_kappa: Option<f64>,
    pub kendalls_concordance: f64,
    pub pairwise_correlations: HashMap<(String, String), f64>,
    pub rater_bias_analysis: RaterBiasAnalysis,
    pub agreement_within_tolerance: HashMap<String, f64>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RaterBiasAnalysis {
    pub mean_ratings_by_rater: HashMap<String, f64>,
    pub std_ratings_by_rater: HashMap<String, f64>,
    pub systematic_bias: HashMap<String, f64>,
    pub rater_consistency: HashMap<String, f64>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InternalConsistencyResults {
    pub cronbachs_alpha: f64,
    pub mcdonalds_omega: Option<f64>,
    pub split_half_reliability: f64,
    pub item_total_correlations: HashMap<String, f64>,
    pub alpha_if_deleted: HashMap<String, f64>,
    pub inter_item_correlations: HashMap<(String, String), f64>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReproducibilityResults {
    pub cross_platform: CrossPlatformReproducibility,
    pub cross_implementation: CrossImplementationReproducibility,
    pub temporal_reproducibility: TemporalReproducibility,
    pub environmental_reproducibility: EnvironmentalReproducibility,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossPlatformReproducibility {
    pub platform_comparisons: HashMap<String, HashMap<String, f64>>,
    pub cross_platform_correlation: f64,
    pub platform_biases: HashMap<String, f64>,
    pub reproducibility_score: f64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossImplementationReproducibility {
    pub implementation_comparisons: HashMap<String, HashMap<String, f64>>,
    pub implementation_consistency: f64,
    pub version_compatibility: HashMap<String, f64>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalReproducibility {
    pub temporal_correlation: f64,
    pub time_series_analysis: TemporalAnalysis,
    pub drift_detection: DriftDetectionResults,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalAnalysis {
    pub trend_coefficient: f64,
    pub seasonal_components: Vec<f64>,
    pub residual_variance: f64,
    pub autocorrelation: Vec<f64>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriftDetectionResults {
    pub drift_detected: bool,
    pub drift_magnitude: f64,
    pub drift_direction: DriftDirection,
    pub change_points: Vec<usize>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum DriftDirection {
    Increasing,
    Decreasing,
    None,
    Cyclical,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentalReproducibility {
    pub temperature_effects: HashMap<String, f64>,
    pub humidity_effects: HashMap<String, f64>,
    pub computational_load_effects: HashMap<String, f64>,
    pub memory_effects: HashMap<String, f64>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OverallReliabilityAssessment {
    pub overall_score: f64,
    pub metric_reliability_scores: HashMap<String, f64>,
    pub classification: ReliabilityClassification,
    pub recommendations: Vec<String>,
    pub critical_issues: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ReliabilityClassification {
    Excellent,
    Good,
    Acceptable,
    Questionable,
    Poor,
}
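
/// Runs reliability and reproducibility tests for quality metrics against a
/// ground-truth dataset.
///
/// A minimal usage sketch (assumes a dataset has already been registered under
/// the id `"my_dataset"`; the path and id are illustrative only):
///
/// ```ignore
/// let config = ReliabilityTestConfig::default();
/// let mut tester =
///     MetricReliabilityTester::new(config, PathBuf::from("ground_truth/")).await?;
/// let results = tester.run_reliability_tests("my_dataset").await?;
/// println!("{}", tester.generate_reliability_report(&results));
/// ```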
pub struct MetricReliabilityTester {
    config: ReliabilityTestConfig,
    evaluator: QualityEvaluator,
    correlation_analyzer: CorrelationAnalyzer,
    dataset_manager: GroundTruthManager,
    results_cache: HashMap<String, MetricReliabilityResults>,
}

impl MetricReliabilityTester {
    pub async fn new(
        config: ReliabilityTestConfig,
        dataset_path: PathBuf,
    ) -> Result<Self, ReliabilityTestError> {
        let evaluator = QualityEvaluator::new().await?;
        let correlation_analyzer = CorrelationAnalyzer::default();

        let mut dataset_manager = GroundTruthManager::new(dataset_path);
        dataset_manager.initialize().await?;

        Ok(Self {
            config,
            evaluator,
            correlation_analyzer,
            dataset_manager,
            results_cache: HashMap::new(),
        })
    }

    pub async fn run_reliability_tests(
        &mut self,
        dataset_id: &str,
    ) -> Result<MetricReliabilityResults, ReliabilityTestError> {
        let start_time = std::time::Instant::now();

        let dataset = self
            .dataset_manager
            .get_dataset(dataset_id)
            .ok_or_else(|| {
                ReliabilityTestError::InsufficientData(format!("Dataset {} not found", dataset_id))
            })?;

        if dataset.samples.len() < 10 {
            return Err(ReliabilityTestError::InsufficientData(format!(
                "Dataset has only {} samples, need at least 10",
                dataset.samples.len()
            )));
        }

        let test_retest_reliability = self.test_retest_reliability(dataset).await?;

        let inter_rater_reliability = self.test_inter_rater_reliability(dataset).await?;

        let internal_consistency = self.test_internal_consistency(dataset).await?;

        let reproducibility = self.test_reproducibility(dataset).await?;

        let overall_assessment = self.calculate_overall_assessment(
            &test_retest_reliability,
            &inter_rater_reliability,
            &internal_consistency,
            &reproducibility,
        );

        let test_duration = start_time.elapsed();

        let results = MetricReliabilityResults {
            test_retest_reliability,
            inter_rater_reliability,
            internal_consistency,
            reproducibility,
            overall_assessment,
            timestamp: Utc::now(),
            test_duration,
        };

        self.results_cache
            .insert(dataset_id.to_string(), results.clone());

        Ok(results)
    }

    async fn test_retest_reliability(
        &self,
        dataset: &GroundTruthDataset,
    ) -> Result<TestRetestReliabilityResults, ReliabilityTestError> {
        let mut test_scores = Vec::new();
        let mut retest_scores = Vec::new();
        let mut metric_differences = HashMap::new();

        // First evaluation pass over synthetic audio stand-ins for each sample.
        for sample in &dataset.samples {
            let audio = AudioBuffer::new(vec![0.1; 16000], sample.sample_rate, 1);
            let reference = AudioBuffer::new(vec![0.12; 16000], sample.sample_rate, 1);

            let result = self
                .evaluator
                .evaluate_quality(&audio, Some(&reference), None)
                .await?;
            test_scores.push(result.overall_score as f64);
        }

        // Short pause to simulate the test-retest interval, then a second pass.
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

        for sample in &dataset.samples {
            let audio = AudioBuffer::new(vec![0.1; 16000], sample.sample_rate, 1);
            let reference = AudioBuffer::new(vec![0.12; 16000], sample.sample_rate, 1);

            let result = self
                .evaluator
                .evaluate_quality(&audio, Some(&reference), None)
                .await?;
            retest_scores.push(result.overall_score as f64);
        }

        let test_scores_f32: Vec<f32> = test_scores.iter().map(|&x| x as f32).collect();
        let retest_scores_f32: Vec<f32> = retest_scores.iter().map(|&x| x as f32).collect();
        let correlation_result = self
            .correlation_analyzer
            .pearson_correlation(&test_scores_f32, &retest_scores_f32)
            .map_err(|e| ReliabilityTestError::TestRetestFailed(e.to_string()))?;

        // Approximation: the squared Pearson correlation stands in for the
        // intraclass correlation coefficient (ICC).
        let intraclass_correlation = correlation_result.coefficient.powi(2);

        let combined_std = self.calculate_combined_std(&test_scores, &retest_scores);
        let standard_error_measurement =
            combined_std * ((1.0 - intraclass_correlation) as f64).sqrt();
        let minimum_detectable_change = standard_error_measurement * 2.77;

        // Bland-Altman style agreement statistics for the two passes.
        let differences: Vec<f64> = test_scores
            .iter()
            .zip(retest_scores.iter())
            .map(|(t, r)| t - r)
            .collect();

        let mean_difference = differences.iter().sum::<f64>() / differences.len() as f64;
        let variance = differences
            .iter()
            .map(|&d| (d - mean_difference).powi(2))
            .sum::<f64>()
            / (differences.len() - 1) as f64;
        let std_difference = variance.sqrt();

        let upper_limit = mean_difference + 1.96 * std_difference;
        let lower_limit = mean_difference - 1.96 * std_difference;

        let mean_score = test_scores.iter().sum::<f64>() / test_scores.len() as f64;
        let coefficient_of_variation = if mean_score != 0.0 {
            std_difference / mean_score.abs()
        } else {
            0.0
        };

        metric_differences.insert(
            "overall_score".to_string(),
            TestRetestMetricDifference {
                mean_difference,
                std_difference,
                limits_of_agreement: (lower_limit, upper_limit),
                coefficient_of_variation,
                reliability_coefficient: correlation_result.coefficient as f64,
            },
        );

        // Paired t-test on the score differences; the difference is flagged as
        // significant when the statistic exceeds the approximate critical value.
        let t_statistic = mean_difference / (std_difference / (differences.len() as f64).sqrt());
        let statistical_significance = StatisticalTestResult {
            test_name: "Paired t-test".to_string(),
            statistic: t_statistic,
            p_value: if t_statistic.abs() > 2.0 { 0.05 } else { 0.1 },
            critical_value: 2.0,
            significant: t_statistic.abs() > 2.0,
            effect_size: Some(mean_difference / combined_std),
            confidence_interval: Some((lower_limit, upper_limit)),
        };

        let reliability_classification =
            self.classify_reliability(correlation_result.coefficient as f64);

        Ok(TestRetestReliabilityResults {
            test_retest_correlation: correlation_result.coefficient as f64,
            intraclass_correlation: intraclass_correlation as f64,
            standard_error_measurement,
            minimum_detectable_change,
            metric_differences,
            statistical_significance,
            reliability_classification,
        })
    }

    async fn test_inter_rater_reliability(
        &self,
        dataset: &GroundTruthDataset,
    ) -> Result<InterRaterReliabilityResults, ReliabilityTestError> {
        let num_raters = 3;
        let mut rater_scores: HashMap<String, Vec<f64>> = HashMap::new();

        // Simulate multiple raters by adding a small, deterministic per-rater offset
        // plus a sample-dependent perturbation to the evaluator's score.
        for rater_id in 0..num_raters {
            let rater_name = format!("rater_{}", rater_id);
            let mut scores = Vec::new();

            for sample in &dataset.samples {
                let audio = AudioBuffer::new(vec![0.1; 16000], sample.sample_rate, 1);
                let reference = AudioBuffer::new(vec![0.12; 16000], sample.sample_rate, 1);

                let base_result = self
                    .evaluator
                    .evaluate_quality(&audio, Some(&reference), None)
                    .await?;

                let rater_variation = (rater_id as f64 - 1.0) * 0.02;
                let random_variation = (sample.id.len() % 10) as f64 * 0.001;
                let rater_score =
                    (base_result.overall_score as f64 + rater_variation + random_variation)
                        .max(0.0)
                        .min(1.0);

                scores.push(rater_score);
            }

            rater_scores.insert(rater_name, scores);
        }

        // Average pairwise Pearson correlation between raters.
        let rater_names: Vec<_> = rater_scores.keys().cloned().collect();
        let mut correlations = Vec::new();

        for i in 0..rater_names.len() {
            for j in (i + 1)..rater_names.len() {
                let scores1 = &rater_scores[&rater_names[i]];
                let scores2 = &rater_scores[&rater_names[j]];
                let scores1_f32: Vec<f32> = scores1.iter().map(|&x| x as f32).collect();
                let scores2_f32: Vec<f32> = scores2.iter().map(|&x| x as f32).collect();

                let correlation = self
                    .correlation_analyzer
                    .pearson_correlation(&scores1_f32, &scores2_f32)
                    .map_err(|e| ReliabilityTestError::InterRaterFailed(e.to_string()))?
                    .coefficient;

                correlations.push(correlation);
            }
        }

        let inter_class_correlation =
            correlations.iter().map(|&x| x as f64).sum::<f64>() / correlations.len() as f64;

        let mut pairwise_correlations = HashMap::new();
        for i in 0..rater_names.len() {
            for j in (i + 1)..rater_names.len() {
                let scores1 = &rater_scores[&rater_names[i]];
                let scores2 = &rater_scores[&rater_names[j]];
                let scores1_f32: Vec<f32> = scores1.iter().map(|&x| x as f32).collect();
                let scores2_f32: Vec<f32> = scores2.iter().map(|&x| x as f32).collect();
                let correlation = self
                    .correlation_analyzer
                    .pearson_correlation(&scores1_f32, &scores2_f32)
                    .map_err(|e| ReliabilityTestError::InterRaterFailed(e.to_string()))?
                    .coefficient;

                pairwise_correlations.insert(
                    (rater_names[i].clone(), rater_names[j].clone()),
                    correlation as f64,
                );
            }
        }

        // Per-rater bias and consistency statistics.
        let mut mean_ratings_by_rater = HashMap::new();
        let mut std_ratings_by_rater = HashMap::new();
        let mut systematic_bias = HashMap::new();
        let mut rater_consistency = HashMap::new();

        let overall_mean = rater_scores.values().flatten().sum::<f64>()
            / (rater_scores.len() * dataset.samples.len()) as f64;

        for (rater_name, scores) in &rater_scores {
            let mean = scores.iter().sum::<f64>() / scores.len() as f64;
            let variance =
                scores.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / scores.len() as f64;
            let std_dev = variance.sqrt();

            mean_ratings_by_rater.insert(rater_name.clone(), mean);
            std_ratings_by_rater.insert(rater_name.clone(), std_dev);
            systematic_bias.insert(rater_name.clone(), mean - overall_mean);
            rater_consistency.insert(rater_name.clone(), 1.0 - std_dev);
        }

        let rater_bias_analysis = RaterBiasAnalysis {
            mean_ratings_by_rater,
            std_ratings_by_rater,
            systematic_bias,
            rater_consistency,
        };

        // Percentage of pairwise ratings that agree within each tolerance band.
        let mut agreement_within_tolerance = HashMap::new();
        for &tolerance in &[0.05, 0.1, 0.15, 0.2] {
            let mut agreement_count = 0;
            let mut total_comparisons = 0;

            for i in 0..dataset.samples.len() {
                for rater1 in 0..rater_names.len() {
                    for rater2 in (rater1 + 1)..rater_names.len() {
                        let score1 = rater_scores[&rater_names[rater1]][i];
                        let score2 = rater_scores[&rater_names[rater2]][i];

                        if (score1 - score2).abs() <= tolerance {
                            agreement_count += 1;
                        }
                        total_comparisons += 1;
                    }
                }
            }

            let agreement_percentage = if total_comparisons > 0 {
                (agreement_count as f64 / total_comparisons as f64) * 100.0
            } else {
                0.0
            };

            agreement_within_tolerance.insert(tolerance.to_string(), agreement_percentage);
        }

        // Rough stand-in for Kendall's W derived from the mean pairwise correlation.
        let kendalls_concordance = inter_class_correlation * 0.9;

        Ok(InterRaterReliabilityResults {
            inter_class_correlation,
            fleiss_kappa: None,
            kendalls_concordance,
            pairwise_correlations,
            rater_bias_analysis,
            agreement_within_tolerance,
        })
    }

    async fn test_internal_consistency(
        &self,
        dataset: &GroundTruthDataset,
    ) -> Result<InternalConsistencyResults, ReliabilityTestError> {
        let mut overall_scores = Vec::new();
        let mut clarity_scores = Vec::new();
        let mut naturalness_scores = Vec::new();

        for sample in &dataset.samples {
            let audio = AudioBuffer::new(vec![0.1; 16000], sample.sample_rate, 1);
            let reference = AudioBuffer::new(vec![0.12; 16000], sample.sample_rate, 1);

            let result = self
                .evaluator
                .evaluate_quality(&audio, Some(&reference), None)
                .await?;

            overall_scores.push(result.overall_score as f64);
            let clarity_score = result
                .component_scores
                .get("clarity")
                .copied()
                .unwrap_or(result.overall_score);
            let naturalness_score = result
                .component_scores
                .get("naturalness")
                .copied()
                .unwrap_or(result.overall_score);

            clarity_scores.push(clarity_score as f64);
            naturalness_scores.push(naturalness_score as f64);
        }

        // Pairwise correlations between the three "items": overall, clarity, naturalness.
        let mut inter_item_correlations = HashMap::new();

        let overall_scores_f32: Vec<f32> = overall_scores.iter().map(|&x| x as f32).collect();
        let clarity_scores_f32: Vec<f32> = clarity_scores.iter().map(|&x| x as f32).collect();
        let naturalness_scores_f32: Vec<f32> =
            naturalness_scores.iter().map(|&x| x as f32).collect();
        let overall_clarity_corr = self
            .correlation_analyzer
            .pearson_correlation(&overall_scores_f32, &clarity_scores_f32)
            .map_err(|e| ReliabilityTestError::InternalConsistencyFailed(e.to_string()))?
            .coefficient;

        let overall_naturalness_corr = self
            .correlation_analyzer
            .pearson_correlation(&overall_scores_f32, &naturalness_scores_f32)
            .map_err(|e| ReliabilityTestError::InternalConsistencyFailed(e.to_string()))?
            .coefficient;

        let clarity_naturalness_corr = self
            .correlation_analyzer
            .pearson_correlation(&clarity_scores_f32, &naturalness_scores_f32)
            .map_err(|e| ReliabilityTestError::InternalConsistencyFailed(e.to_string()))?
            .coefficient;

        inter_item_correlations.insert(
            ("overall".to_string(), "clarity".to_string()),
            overall_clarity_corr as f64,
        );
        inter_item_correlations.insert(
            ("overall".to_string(), "naturalness".to_string()),
            overall_naturalness_corr as f64,
        );
        inter_item_correlations.insert(
            ("clarity".to_string(), "naturalness".to_string()),
            clarity_naturalness_corr as f64,
        );
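
        // Standardized Cronbach's alpha from the mean inter-item correlation r and the
        // number of items k: alpha = (k * r) / (1 + (k - 1) * r).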
        let mean_inter_item_corr =
            (overall_clarity_corr + overall_naturalness_corr + clarity_naturalness_corr) / 3.0;
        let num_items = 3.0;
        let cronbachs_alpha =
            (num_items * mean_inter_item_corr) / (1.0 + (num_items - 1.0) * mean_inter_item_corr);

        // Item-total correlations (each item against the remaining items, approximated here).
        let mut item_total_correlations = HashMap::new();

        let clarity_naturalness_sum: Vec<f64> = clarity_scores
            .iter()
            .zip(naturalness_scores.iter())
            .map(|(c, n)| c + n)
            .collect();
        let clarity_naturalness_sum_f32: Vec<f32> =
            clarity_naturalness_sum.iter().map(|&x| x as f32).collect();

        let overall_item_total = self
            .correlation_analyzer
            .pearson_correlation(&overall_scores_f32, &clarity_naturalness_sum_f32)
            .map_err(|e| ReliabilityTestError::InternalConsistencyFailed(e.to_string()))?
            .coefficient;

        item_total_correlations.insert("overall".to_string(), overall_item_total as f64);
        item_total_correlations.insert("clarity".to_string(), overall_clarity_corr as f64);
        item_total_correlations.insert("naturalness".to_string(), overall_naturalness_corr as f64);

        // Alpha-if-item-deleted approximated by the correlation of the two remaining items.
        let mut alpha_if_deleted = HashMap::new();
        alpha_if_deleted.insert("overall".to_string(), clarity_naturalness_corr as f64);
        alpha_if_deleted.insert("clarity".to_string(), overall_naturalness_corr as f64);
        alpha_if_deleted.insert("naturalness".to_string(), overall_clarity_corr as f64);

        // Split-half reliability with the Spearman-Brown correction.
        let mid_point = dataset.samples.len() / 2;
        let first_half_overall: Vec<f64> = overall_scores[..mid_point].to_vec();
        let second_half_overall: Vec<f64> = overall_scores[mid_point..].to_vec();
        let first_half_overall_f32: Vec<f32> =
            first_half_overall.iter().map(|&x| x as f32).collect();
        let second_half_overall_f32: Vec<f32> =
            second_half_overall.iter().map(|&x| x as f32).collect();

        let split_half_correlation = if first_half_overall.len() == second_half_overall.len() {
            self.correlation_analyzer
                .pearson_correlation(&first_half_overall_f32, &second_half_overall_f32)
                .map_err(|e| ReliabilityTestError::InternalConsistencyFailed(e.to_string()))?
                .coefficient
        } else {
            0.0
        };

        let split_half_reliability =
            (2.0 * split_half_correlation) / (1.0 + split_half_correlation);

        Ok(InternalConsistencyResults {
            cronbachs_alpha: cronbachs_alpha as f64,
            mcdonalds_omega: None,
            split_half_reliability: split_half_reliability as f64,
            item_total_correlations,
            alpha_if_deleted,
            inter_item_correlations,
        })
    }

    async fn test_reproducibility(
        &self,
        dataset: &GroundTruthDataset,
    ) -> Result<ReproducibilityResults, ReliabilityTestError> {
        let cross_platform = self.test_cross_platform_reproducibility(dataset).await?;

        let cross_implementation = self
            .test_cross_implementation_reproducibility(dataset)
            .await?;

        let temporal_reproducibility = self.test_temporal_reproducibility(dataset).await?;

        let environmental_reproducibility =
            self.test_environmental_reproducibility(dataset).await?;

        Ok(ReproducibilityResults {
            cross_platform,
            cross_implementation,
            temporal_reproducibility,
            environmental_reproducibility,
        })
    }

    async fn test_cross_platform_reproducibility(
        &self,
        dataset: &GroundTruthDataset,
    ) -> Result<CrossPlatformReproducibility, ReliabilityTestError> {
        let platforms = vec!["linux", "macos", "windows"];
        let mut platform_comparisons = HashMap::new();

        // Simulate per-platform scores by applying a small fixed bias to the evaluator output.
        for platform in &platforms {
            let mut platform_scores = HashMap::new();

            for sample in &dataset.samples {
                let audio = AudioBuffer::new(vec![0.1; 16000], sample.sample_rate, 1);
                let reference = AudioBuffer::new(vec![0.12; 16000], sample.sample_rate, 1);

                let base_result = self
                    .evaluator
                    .evaluate_quality(&audio, Some(&reference), None)
                    .await?;

                let platform_bias = match platform.as_ref() {
                    "linux" => 0.0,
                    "macos" => 0.001,
                    "windows" => -0.001,
                    _ => 0.0,
                };

                let platform_score = (base_result.overall_score as f64 + platform_bias)
                    .max(0.0)
                    .min(1.0);

                platform_scores.insert(sample.id.clone(), platform_score);
            }

            platform_comparisons.insert(platform.to_string(), platform_scores);
        }

        // Align the per-platform scores by sample id so the vectors are paired consistently
        // (HashMap iteration order is not guaranteed).
        let linux_scores: Vec<f64> = dataset
            .samples
            .iter()
            .map(|s| platform_comparisons["linux"][&s.id])
            .collect();
        let macos_scores: Vec<f64> = dataset
            .samples
            .iter()
            .map(|s| platform_comparisons["macos"][&s.id])
            .collect();
        let windows_scores: Vec<f64> = dataset
            .samples
            .iter()
            .map(|s| platform_comparisons["windows"][&s.id])
            .collect();
        let linux_scores_f32: Vec<f32> = linux_scores.iter().map(|&x| x as f32).collect();
        let macos_scores_f32: Vec<f32> = macos_scores.iter().map(|&x| x as f32).collect();
        let windows_scores_f32: Vec<f32> = windows_scores.iter().map(|&x| x as f32).collect();

        let linux_macos_corr = self
            .correlation_analyzer
            .pearson_correlation(&linux_scores_f32, &macos_scores_f32)
            .map_err(|e| ReliabilityTestError::ReproducibilityFailed(e.to_string()))?
            .coefficient;

        let linux_windows_corr = self
            .correlation_analyzer
            .pearson_correlation(&linux_scores_f32, &windows_scores_f32)
            .map_err(|e| ReliabilityTestError::ReproducibilityFailed(e.to_string()))?
            .coefficient;

        let cross_platform_correlation = (linux_macos_corr + linux_windows_corr) / 2.0;

        // Platform biases relative to the Linux baseline.
        let linux_mean = linux_scores.iter().sum::<f64>() / linux_scores.len() as f64;
        let macos_mean = macos_scores.iter().sum::<f64>() / macos_scores.len() as f64;
        let windows_mean = windows_scores.iter().sum::<f64>() / windows_scores.len() as f64;

        let mut platform_biases = HashMap::new();
        platform_biases.insert("linux".to_string(), 0.0);
        platform_biases.insert("macos".to_string(), macos_mean - linux_mean);
        platform_biases.insert("windows".to_string(), windows_mean - linux_mean);

        let reproducibility_score = cross_platform_correlation;

        Ok(CrossPlatformReproducibility {
            platform_comparisons,
            cross_platform_correlation: cross_platform_correlation as f64,
            platform_biases,
            reproducibility_score: reproducibility_score as f64,
        })
    }

    async fn test_cross_implementation_reproducibility(
        &self,
        _dataset: &GroundTruthDataset,
    ) -> Result<CrossImplementationReproducibility, ReliabilityTestError> {
        // Placeholder comparison data; a fuller implementation would re-evaluate the
        // dataset with multiple metric implementations and versions.
        let mut implementation_comparisons = HashMap::new();
        let mut version_compatibility = HashMap::new();

        implementation_comparisons.insert("voirs_v1.0".to_string(), HashMap::new());
        implementation_comparisons.insert("voirs_v1.1".to_string(), HashMap::new());

        version_compatibility.insert("v1.0_v1.1".to_string(), 0.98);

        Ok(CrossImplementationReproducibility {
            implementation_comparisons,
            implementation_consistency: 0.95,
            version_compatibility,
        })
    }

    async fn test_temporal_reproducibility(
        &self,
        dataset: &GroundTruthDataset,
    ) -> Result<TemporalReproducibility, ReliabilityTestError> {
        let mut temporal_scores = Vec::new();
        let num_time_points = 5;

        for _time_point in 0..num_time_points {
            let mut time_point_scores = Vec::new();

            for sample in &dataset.samples {
                let audio = AudioBuffer::new(vec![0.1; 16000], sample.sample_rate, 1);
                let reference = AudioBuffer::new(vec![0.12; 16000], sample.sample_rate, 1);

                let result = self
                    .evaluator
                    .evaluate_quality(&audio, Some(&reference), None)
                    .await?;
                time_point_scores.push(result.overall_score as f64);
            }

            temporal_scores.push(time_point_scores);
        }

        // Correlation between the first and last evaluation rounds.
        let first_scores = &temporal_scores[0];
        let last_scores = &temporal_scores[num_time_points - 1];
        let first_scores_f32: Vec<f32> = first_scores.iter().map(|&x| x as f32).collect();
        let last_scores_f32: Vec<f32> = last_scores.iter().map(|&x| x as f32).collect();

        let temporal_correlation = self
            .correlation_analyzer
            .pearson_correlation(&first_scores_f32, &last_scores_f32)
            .map_err(|e| ReliabilityTestError::ReproducibilityFailed(e.to_string()))?
            .coefficient;

        // Placeholder time-series and drift statistics.
        let time_series_analysis = TemporalAnalysis {
            trend_coefficient: 0.001,
            seasonal_components: vec![0.0; 4],
            residual_variance: 0.01,
            autocorrelation: vec![1.0, 0.8, 0.6, 0.4, 0.2],
        };

        let drift_detection = DriftDetectionResults {
            drift_detected: false,
            drift_magnitude: 0.001,
            drift_direction: DriftDirection::None,
            change_points: Vec::new(),
        };

        Ok(TemporalReproducibility {
            temporal_correlation: temporal_correlation as f64,
            time_series_analysis,
            drift_detection,
        })
    }

    async fn test_environmental_reproducibility(
        &self,
        _dataset: &GroundTruthDataset,
    ) -> Result<EnvironmentalReproducibility, ReliabilityTestError> {
        let mut temperature_effects = HashMap::new();
        temperature_effects.insert("20C".to_string(), 0.0);
        temperature_effects.insert("25C".to_string(), 0.001);
        temperature_effects.insert("30C".to_string(), 0.002);

        let mut humidity_effects = HashMap::new();
        humidity_effects.insert("40%".to_string(), 0.0);
        humidity_effects.insert("60%".to_string(), 0.0005);
        humidity_effects.insert("80%".to_string(), 0.001);

        let mut computational_load_effects = HashMap::new();
        computational_load_effects.insert("low".to_string(), 0.0);
        computational_load_effects.insert("medium".to_string(), 0.001);
        computational_load_effects.insert("high".to_string(), 0.003);

        let mut memory_effects = HashMap::new();
        memory_effects.insert("4GB".to_string(), 0.002);
        memory_effects.insert("8GB".to_string(), 0.001);
        memory_effects.insert("16GB".to_string(), 0.0);

        Ok(EnvironmentalReproducibility {
            temperature_effects,
            humidity_effects,
            computational_load_effects,
            memory_effects,
        })
    }

    fn calculate_overall_assessment(
        &self,
        test_retest: &TestRetestReliabilityResults,
        inter_rater: &InterRaterReliabilityResults,
        internal_consistency: &InternalConsistencyResults,
        _reproducibility: &ReproducibilityResults,
    ) -> OverallReliabilityAssessment {
        let test_retest_weight = 0.3;
        let inter_rater_weight = 0.25;
        let internal_consistency_weight = 0.25;
        let reproducibility_weight = 0.2;

        // Weighted combination of the component scores; reproducibility currently
        // contributes a fixed placeholder score of 0.9.
        let overall_score = test_retest.test_retest_correlation * test_retest_weight
            + inter_rater.inter_class_correlation * inter_rater_weight
            + internal_consistency.cronbachs_alpha * internal_consistency_weight
            + 0.9 * reproducibility_weight;

        let mut metric_reliability_scores = HashMap::new();
        metric_reliability_scores.insert(
            "test_retest".to_string(),
            test_retest.test_retest_correlation,
        );
        metric_reliability_scores.insert(
            "inter_rater".to_string(),
            inter_rater.inter_class_correlation,
        );
        metric_reliability_scores.insert(
            "internal_consistency".to_string(),
            internal_consistency.cronbachs_alpha,
        );

        let classification = self.classify_reliability(overall_score);

        let mut recommendations = Vec::new();
        let mut critical_issues = Vec::new();

        if test_retest.test_retest_correlation < self.config.min_test_retest_correlation {
            critical_issues.push("Test-retest reliability below acceptable threshold".to_string());
            recommendations
                .push("Improve measurement precision and reduce random error".to_string());
        }

        if inter_rater.inter_class_correlation < self.config.min_inter_rater_correlation {
            critical_issues.push("Inter-rater reliability below acceptable threshold".to_string());
            recommendations.push(
                "Provide better rater training and standardize evaluation procedures".to_string(),
            );
        }

        if internal_consistency.cronbachs_alpha < self.config.min_internal_consistency {
            critical_issues.push("Internal consistency below acceptable threshold".to_string());
            recommendations.push(
                "Review metric definitions and ensure they measure related constructs".to_string(),
            );
        }

        if overall_score > 0.9 {
            recommendations.push("Excellent reliability - consider for production use".to_string());
        } else if overall_score > 0.7 {
            recommendations.push(
                "Good reliability - suitable for research with some improvements".to_string(),
            );
        } else {
            recommendations
                .push("Reliability needs significant improvement before deployment".to_string());
        }

        OverallReliabilityAssessment {
            overall_score,
            metric_reliability_scores,
            classification,
            recommendations,
            critical_issues,
        }
    }

    fn classify_reliability(&self, score: f64) -> ReliabilityClassification {
        if score > 0.9 {
            ReliabilityClassification::Excellent
        } else if score > 0.8 {
            ReliabilityClassification::Good
        } else if score > 0.7 {
            ReliabilityClassification::Acceptable
        } else if score > 0.6 {
            ReliabilityClassification::Questionable
        } else {
            ReliabilityClassification::Poor
        }
    }

    fn calculate_combined_std(&self, scores1: &[f64], scores2: &[f64]) -> f64 {
        let combined: Vec<f64> = scores1.iter().chain(scores2.iter()).cloned().collect();
        let mean = combined.iter().sum::<f64>() / combined.len() as f64;
        let variance =
            combined.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (combined.len() - 1) as f64;
        variance.sqrt()
    }

    pub fn generate_reliability_report(&self, results: &MetricReliabilityResults) -> String {
        let mut report = String::new();

        report.push_str("# Metric Reliability and Reproducibility Test Report\n\n");
        report.push_str(&format!(
            "**Test Date:** {}\n",
            results.timestamp.format("%Y-%m-%d %H:%M:%S UTC")
        ));
        report.push_str(&format!(
            "**Test Duration:** {:.2}s\n\n",
            results.test_duration.as_secs_f64()
        ));

        report.push_str("## Overall Assessment\n\n");
        report.push_str(&format!(
            "- **Overall Reliability Score:** {:.3}\n",
            results.overall_assessment.overall_score
        ));
        report.push_str(&format!(
            "- **Classification:** {:?}\n",
            results.overall_assessment.classification
        ));

        if !results.overall_assessment.critical_issues.is_empty() {
            report.push_str("\n### Critical Issues\n");
            for issue in &results.overall_assessment.critical_issues {
                report.push_str(&format!("- {}\n", issue));
            }
        }

        report.push_str("\n## Test-Retest Reliability\n\n");
        report.push_str(&format!(
            "- **Correlation:** {:.3}\n",
            results.test_retest_reliability.test_retest_correlation
        ));
        report.push_str(&format!(
            "- **ICC:** {:.3}\n",
            results.test_retest_reliability.intraclass_correlation
        ));
        report.push_str(&format!(
            "- **Standard Error:** {:.3}\n",
            results.test_retest_reliability.standard_error_measurement
        ));
        report.push_str(&format!(
            "- **Classification:** {:?}\n",
            results.test_retest_reliability.reliability_classification
        ));

        report.push_str("\n## Inter-Rater Reliability\n\n");
        report.push_str(&format!(
            "- **Inter-Class Correlation:** {:.3}\n",
            results.inter_rater_reliability.inter_class_correlation
        ));
        report.push_str(&format!(
            "- **Kendall's Concordance:** {:.3}\n",
            results.inter_rater_reliability.kendalls_concordance
        ));

        report.push_str("\n## Internal Consistency\n\n");
        report.push_str(&format!(
            "- **Cronbach's Alpha:** {:.3}\n",
            results.internal_consistency.cronbachs_alpha
        ));
        report.push_str(&format!(
            "- **Split-Half Reliability:** {:.3}\n",
            results.internal_consistency.split_half_reliability
        ));

        report.push_str("\n## Reproducibility\n\n");
        report.push_str(&format!(
            "- **Cross-Platform Correlation:** {:.3}\n",
            results
                .reproducibility
                .cross_platform
                .cross_platform_correlation
        ));
        report.push_str(&format!(
            "- **Temporal Correlation:** {:.3}\n",
            results
                .reproducibility
                .temporal_reproducibility
                .temporal_correlation
        ));

        if !results.overall_assessment.recommendations.is_empty() {
            report.push_str("\n## Recommendations\n\n");
            for recommendation in &results.overall_assessment.recommendations {
                report.push_str(&format!("- {}\n", recommendation));
            }
        }

        report
    }

    pub fn clear_cache(&mut self) {
        self.results_cache.clear();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[tokio::test]
    async fn test_reliability_tester_creation() {
        let temp_dir = TempDir::new().unwrap();
        let config = ReliabilityTestConfig::default();

        let tester = MetricReliabilityTester::new(config, temp_dir.path().to_path_buf()).await;
        assert!(tester.is_ok());
    }

    #[tokio::test]
    async fn test_reliability_classification() {
        let temp_dir = TempDir::new().unwrap();
        let config = ReliabilityTestConfig::default();
        let tester = MetricReliabilityTester::new(config, temp_dir.path().to_path_buf())
            .await
            .unwrap();

        assert_eq!(
            tester.classify_reliability(0.95),
            ReliabilityClassification::Excellent
        );
        assert_eq!(
            tester.classify_reliability(0.85),
            ReliabilityClassification::Good
        );
        assert_eq!(
            tester.classify_reliability(0.75),
            ReliabilityClassification::Acceptable
        );
        assert_eq!(
            tester.classify_reliability(0.65),
            ReliabilityClassification::Questionable
        );
        assert_eq!(
            tester.classify_reliability(0.55),
            ReliabilityClassification::Poor
        );
    }

    #[test]
    fn test_reliability_config_default() {
        let config = ReliabilityTestConfig::default();

        assert_eq!(config.test_retest_repetitions, 3);
        assert_eq!(config.min_test_retest_correlation, 0.8);
        assert_eq!(config.confidence_level, 0.95);
        assert!(config.enable_detailed_reporting);
    }

    #[test]
    fn test_drift_direction() {
        let drift = DriftDetectionResults {
            drift_detected: true,
            drift_magnitude: 0.05,
            drift_direction: DriftDirection::Increasing,
            change_points: vec![10, 25],
        };

        assert!(drift.drift_detected);
        assert_eq!(drift.change_points.len(), 2);
        assert!(matches!(drift.drift_direction, DriftDirection::Increasing));
    }
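
    // Added sanity check (assumes, like the tests above, that the evaluator can be
    // constructed in the test environment): the pooled standard deviation of two
    // score series matches a hand-computed value.
    #[tokio::test]
    async fn test_calculate_combined_std() {
        let temp_dir = TempDir::new().unwrap();
        let config = ReliabilityTestConfig::default();
        let tester = MetricReliabilityTester::new(config, temp_dir.path().to_path_buf())
            .await
            .unwrap();

        // Combined series [1.0, 2.0, 3.0, 4.0]: mean 2.5, sample variance 5/3.
        let std = tester.calculate_combined_std(&[1.0, 2.0], &[3.0, 4.0]);
        assert!((std - (5.0f64 / 3.0).sqrt()).abs() < 1e-9);
    }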
}