scirs2_metrics/domains/audio_processing/
mod.rs

1//! Audio processing and speech recognition metrics
2//!
3//! This module provides specialized metrics for audio processing tasks including:
4//! - Speech recognition (ASR) evaluation
5//! - Audio classification metrics
6//! - Music information retrieval (MIR) metrics
7//! - Audio quality assessment
8//! - Sound event detection metrics
9//! - Speaker identification and verification
10//! - Audio similarity and retrieval metrics
11
12#![allow(clippy::too_many_arguments)]
13#![allow(dead_code)]
14
15use super::{DomainEvaluationResult, DomainMetrics};
16use crate::error::{MetricsError, Result};
17use scirs2_core::ndarray::{Array1, ArrayView1, ArrayView2};
18use scirs2_core::numeric::{Float, ToPrimitive};
19use serde::{Deserialize, Serialize};
20use std::collections::{HashMap, HashSet};
21
22// Re-export all submodules
23pub mod audio_classification;
24pub mod audio_quality;
25pub mod audio_similarity;
26pub mod music_information;
27pub mod sound_event_detection;
28pub mod speaker_metrics;
29pub mod speech_recognition;
30
31// Re-export key types for backward compatibility
32pub use speech_recognition::{
33    BleuCalculator, BleuSmoothing, CerCalculator, ConfidenceMetrics, PerCalculator,
34    SpeechRecognitionMetrics, SpeechRecognitionResults, WerCalculator,
35};
36
37pub use audio_classification::{
38    AudioClassificationMetrics, AudioClassificationResults, AudioSpecificMetrics,
39    BoundaryDetectionMetrics, TemporalAudioMetrics,
40};
41
42pub use music_information::{
43    BeatTrackingMetrics, ChordRecognitionMetrics, ContinuityMetrics, CoverSongMetrics,
44    KeyDetectionMetrics, MusicInformationMetrics, MusicInformationResults, MusicSimilarityMetrics,
45    TempoEstimationMetrics,
46};
47
48pub use audio_quality::{
49    AudioQualityMetrics, AudioQualityResults, IntelligibilityMetrics, ObjectiveAudioMetrics,
50    PerceptualAudioMetrics, SpectralDistortionMetrics,
51};
52
53pub use sound_event_detection::{
54    ClassWiseEventMetrics, EventBasedMetrics, SegmentBasedMetrics, SoundEvent,
55    SoundEventDetectionMetrics, SoundEventResults,
56};
57
58pub use speaker_metrics::{
59    SpeakerDiarizationMetrics, SpeakerIdentificationMetrics, SpeakerMetrics, SpeakerResults,
60    SpeakerVerificationMetrics,
61};
62
63pub use audio_similarity::{
64    AcousticSimilarityMetrics, AudioSimilarityMetrics, AudioSimilarityResults,
65    ContentBasedRetrievalMetrics, SemanticSimilarityMetrics,
66};
67
/// Comprehensive audio processing metrics suite
///
/// Facade that owns one calculator per audio sub-domain; the `evaluate_*`
/// methods on the corresponding `impl` block delegate to these fields.
#[derive(Debug)]
pub struct AudioProcessingMetrics {
    /// Speech recognition (ASR) metrics — WER/CER/PER, BLEU, confidence
    pub speech_recognition: SpeechRecognitionMetrics,
    /// Audio classification metrics (clip-level labels, optional frame predictions)
    pub audio_classification: AudioClassificationMetrics,
    /// Music information retrieval metrics (beats, chords, key, tempo)
    pub music_metrics: MusicInformationMetrics,
    /// Audio quality metrics (reference vs. degraded signal comparison)
    pub quality_metrics: AudioQualityMetrics,
    /// Sound event detection metrics
    pub event_detection: SoundEventDetectionMetrics,
    /// Speaker identification / verification / diarization metrics
    pub speaker_metrics: SpeakerMetrics,
    /// Audio similarity and retrieval metrics
    pub similarity_metrics: AudioSimilarityMetrics,
}
86
/// Audio evaluation results
///
/// Aggregated per-domain results; each field is `None` when that domain
/// was not evaluated in the run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioEvaluationResults {
    /// Speech recognition results
    pub speech_recognition: Option<SpeechRecognitionResults>,
    /// Audio classification results
    pub audio_classification: Option<AudioClassificationResults>,
    /// Music information retrieval results
    pub music_information: Option<MusicInformationResults>,
    /// Audio quality results
    pub quality_assessment: Option<AudioQualityResults>,
    /// Sound event detection results
    pub event_detection: Option<SoundEventResults>,
    /// Speaker recognition results
    pub speaker_recognition: Option<SpeakerResults>,
    /// Audio similarity results
    pub similarity: Option<AudioSimilarityResults>,
}
105
/// Comprehensive audio evaluation report
///
/// Built from a set of [`AudioEvaluationResults`]; note the `summary`
/// starts out with placeholder values until `generate_summary` is called.
#[derive(Debug)]
pub struct AudioEvaluationReport {
    /// Executive summary (derived from `detailed_results` by `generate_summary`)
    pub summary: AudioSummary,
    /// Detailed results by domain
    pub detailed_results: AudioEvaluationResults,
    /// Performance insights (populated via `add_insight`)
    pub insights: Vec<AudioInsight>,
    /// Recommendations (populated via `add_recommendation`)
    pub recommendations: Vec<AudioRecommendation>,
}
118
/// Audio evaluation summary
#[derive(Debug)]
pub struct AudioSummary {
    /// Overall performance score (mean of available per-domain scores; nominally 0–1)
    pub overall_score: f64,
    /// Best performing domain (by per-domain score)
    pub best_domain: String,
    /// Worst performing domain (by per-domain score)
    pub worst_domain: String,
    /// Key strengths (free-form text)
    pub strengths: Vec<String>,
    /// Areas for improvement (free-form text)
    pub improvements: Vec<String>,
}
133
/// Audio performance insight
#[derive(Debug)]
pub struct AudioInsight {
    /// Insight category
    pub category: AudioInsightCategory,
    /// Short insight title
    pub title: String,
    /// Longer insight description
    pub description: String,
    /// Supporting metrics, keyed by metric name
    pub metrics: HashMap<String, f64>,
}
146
/// Audio insight categories
#[derive(Debug)]
pub enum AudioInsightCategory {
    /// Task performance (accuracy / error-rate) observations
    Performance,
    /// Output or signal quality observations
    Quality,
    /// Robustness observations
    Robustness,
    /// Efficiency observations
    Efficiency,
    /// User-experience observations
    UserExperience,
}
156
/// Audio improvement recommendation
#[derive(Debug)]
pub struct AudioRecommendation {
    /// Recommendation priority
    pub priority: RecommendationPriority,
    /// Short recommendation title
    pub title: String,
    /// Longer recommendation description
    pub description: String,
    /// Expected impact — scale not defined here; presumably a 0–1 score,
    /// confirm with whatever code populates this field
    pub expected_impact: f64,
    /// Implementation effort
    pub implementation_effort: ImplementationEffort,
}
171
/// Recommendation priority levels, ordered from most to least urgent
#[derive(Debug)]
pub enum RecommendationPriority {
    /// Must be addressed immediately
    Critical,
    /// High urgency
    High,
    /// Moderate urgency
    Medium,
    /// Low urgency / nice-to-have
    Low,
}
180
/// Implementation effort levels, ordered from least to most costly
#[derive(Debug)]
pub enum ImplementationEffort {
    /// Small, quick change
    Low,
    /// Moderate effort
    Medium,
    /// Significant effort
    High,
    /// Major undertaking
    VeryHigh,
}
189
190impl AudioProcessingMetrics {
191    /// Create new audio processing metrics suite
192    pub fn new() -> Self {
193        Self {
194            speech_recognition: SpeechRecognitionMetrics::new(),
195            audio_classification: AudioClassificationMetrics::new(),
196            music_metrics: MusicInformationMetrics::new(),
197            quality_metrics: AudioQualityMetrics::new(),
198            event_detection: SoundEventDetectionMetrics::new(),
199            speaker_metrics: SpeakerMetrics::new(),
200            similarity_metrics: AudioSimilarityMetrics::new(),
201        }
202    }
203
204    /// Evaluate speech recognition output
205    pub fn evaluate_speech_recognition(
206        &mut self,
207        reference_text: &[String],
208        hypothesis_text: &[String],
209        reference_phones: Option<&[Vec<String>]>,
210        hypothesis_phones: Option<&[Vec<String>]>,
211        confidence_scores: Option<&[f64]>,
212    ) -> Result<SpeechRecognitionResults> {
213        self.speech_recognition.evaluate_recognition(
214            reference_text,
215            hypothesis_text,
216            reference_phones,
217            hypothesis_phones,
218            confidence_scores,
219        )
220    }
221
222    /// Evaluate audio classification performance
223    pub fn evaluate_audio_classification<F>(
224        &mut self,
225        y_true: ArrayView1<i32>,
226        y_pred: ArrayView1<i32>,
227        y_scores: Option<ArrayView2<F>>,
228        frame_predictions: Option<ArrayView2<i32>>,
229    ) -> Result<AudioClassificationResults>
230    where
231        F: Float + std::fmt::Debug,
232    {
233        self.audio_classification
234            .compute_metrics(y_true, y_pred, y_scores, frame_predictions)
235    }
236
237    /// Evaluate music information retrieval tasks
238    pub fn evaluate_music_information(
239        &mut self,
240        beat_annotations: Option<(&[f64], &[f64])>, // (reference_beats, estimated_beats)
241        chord_annotations: Option<(&[String], &[String])>, // (reference_chords, estimated_chords)
242        key_annotations: Option<(String, String)>,  // (reference_key, estimated_key)
243        tempo_annotations: Option<(f64, f64)>,      // (reference_tempo, estimated_tempo)
244    ) -> Result<MusicInformationResults> {
245        let mut results = MusicInformationResults {
246            beat_f_measure: None,
247            chord_accuracy: None,
248            key_accuracy: None,
249            tempo_accuracy: None,
250            similarity_map: None,
251        };
252
253        if let Some((ref_beats, est_beats)) = beat_annotations {
254            let f_measure = self
255                .music_metrics
256                .evaluate_beats(ref_beats, est_beats, 0.07)?;
257            results.beat_f_measure = Some(f_measure);
258        }
259
260        if let Some((ref_chords, est_chords)) = chord_annotations {
261            let accuracy = self.music_metrics.evaluate_chords(ref_chords, est_chords)?;
262            results.chord_accuracy = Some(accuracy);
263        }
264
265        if let Some((ref_key, est_key)) = key_annotations {
266            let accuracy = if ref_key == est_key { 1.0 } else { 0.0 };
267            results.key_accuracy = Some(accuracy);
268        }
269
270        if let Some((ref_tempo, est_tempo)) = tempo_annotations {
271            let accuracy = self
272                .music_metrics
273                .evaluate_tempo(ref_tempo, est_tempo, 0.04)?;
274            results.tempo_accuracy = Some(accuracy);
275        }
276
277        Ok(results)
278    }
279
280    /// Evaluate audio quality
281    pub fn evaluate_audio_quality<F>(
282        &mut self,
283        reference_audio: ArrayView1<F>,
284        degraded_audio: ArrayView1<F>,
285        sample_rate: f64,
286    ) -> Result<AudioQualityResults>
287    where
288        F: Float + std::fmt::Debug + std::iter::Sum,
289    {
290        self.quality_metrics
291            .evaluate_quality(reference_audio, degraded_audio, sample_rate)
292    }
293
294    /// Evaluate sound event detection
295    pub fn evaluate_sound_event_detection(
296        &mut self,
297        reference_events: &[SoundEvent],
298        predicted_events: &[SoundEvent],
299        tolerance: f64,
300    ) -> Result<SoundEventResults> {
301        self.event_detection
302            .evaluate_events(reference_events, predicted_events, tolerance)
303    }
304
305    /// Evaluate speaker recognition tasks
306    pub fn evaluate_speaker_recognition(
307        &mut self,
308        identification_data: Option<(&[String], &[String])>, // (true_speakers, predicted_speakers)
309        verification_data: Option<(&[bool], &[f64])>,        // (true_labels, similarity_scores)
310        diarization_data: Option<(&[(f64, f64, String)], &[(f64, f64, String)])>, // (reference, hypothesis)
311    ) -> Result<SpeakerResults> {
312        let mut results = SpeakerResults {
313            identification_accuracy: None,
314            verification_eer: None,
315            diarization_der: None,
316        };
317
318        if let Some((true_speakers, pred_speakers)) = identification_data {
319            let accuracy =
320                self.speaker_metrics
321                    .evaluate_identification(true_speakers, pred_speakers, None)?;
322            results.identification_accuracy = Some(accuracy);
323        }
324
325        if let Some((true_labels, scores)) = verification_data {
326            let eer = self
327                .speaker_metrics
328                .evaluate_verification(true_labels, scores)?;
329            results.verification_eer = Some(eer);
330        }
331
332        if let Some((reference, hypothesis)) = diarization_data {
333            let der = self
334                .speaker_metrics
335                .evaluate_diarization(reference, hypothesis)?;
336            results.diarization_der = Some(der);
337        }
338
339        Ok(results)
340    }
341
342    /// Evaluate audio similarity
343    pub fn evaluate_audio_similarity<F>(
344        &mut self,
345        query_ids: &[String],
346        relevant_docs: &HashMap<String, Vec<String>>,
347        retrieved_docs: &HashMap<String, Vec<String>>,
348        acoustic_features: Option<(
349            &HashMap<String, ArrayView1<F>>,
350            &HashMap<String, ArrayView1<F>>,
351        )>,
352        semantic_data: Option<(&HashMap<String, Vec<String>>, &HashMap<String, Vec<String>>)>,
353    ) -> Result<AudioSimilarityResults>
354    where
355        F: Float + std::fmt::Debug,
356    {
357        // Evaluate content-based retrieval
358        let k_values = vec![1, 5, 10, 20];
359        self.similarity_metrics.evaluate_retrieval(
360            query_ids,
361            relevant_docs,
362            retrieved_docs,
363            &k_values,
364        )?;
365
366        // Evaluate acoustic similarity if features provided
367        if let Some((ref_features, query_features)) = acoustic_features {
368            self.similarity_metrics
369                .evaluate_acoustic_similarity(ref_features, query_features)?;
370        }
371
372        // Evaluate semantic similarity if data provided
373        if let Some((ref_tags, query_tags)) = semantic_data {
374            self.similarity_metrics
375                .evaluate_semantic_similarity(ref_tags, query_tags, None, None)?;
376        }
377
378        Ok(self.similarity_metrics.get_results())
379    }
380
381    /// Create comprehensive audio evaluation report
382    pub fn create_comprehensive_report(
383        &self,
384        results: &AudioEvaluationResults,
385    ) -> AudioEvaluationReport {
386        AudioEvaluationReport::new(results)
387    }
388}
389
390impl AudioEvaluationReport {
391    /// Create new audio evaluation report
392    pub fn new(results: &AudioEvaluationResults) -> Self {
393        let summary = AudioSummary {
394            overall_score: 0.75,
395            best_domain: "Speech Recognition".to_string(),
396            worst_domain: "Music Information Retrieval".to_string(),
397            strengths: vec![
398                "High accuracy".to_string(),
399                "Good temporal consistency".to_string(),
400            ],
401            improvements: vec!["Better chord recognition".to_string()],
402        };
403
404        Self {
405            summary,
406            detailed_results: results.clone(),
407            insights: Vec::new(),
408            recommendations: Vec::new(),
409        }
410    }
411
412    /// Add performance insight
413    pub fn add_insight(&mut self, insight: AudioInsight) {
414        self.insights.push(insight);
415    }
416
417    /// Add recommendation
418    pub fn add_recommendation(&mut self, recommendation: AudioRecommendation) {
419        self.recommendations.push(recommendation);
420    }
421
422    /// Generate summary statistics
423    pub fn generate_summary(&mut self) {
424        // Update summary based on detailed results
425        let mut domain_scores = Vec::new();
426
427        if let Some(ref sr_results) = self.detailed_results.speech_recognition {
428            domain_scores.push(("Speech Recognition", 1.0 - sr_results.wer));
429        }
430
431        if let Some(ref ac_results) = self.detailed_results.audio_classification {
432            domain_scores.push(("Audio Classification", ac_results.accuracy));
433        }
434
435        if let Some(ref mi_results) = self.detailed_results.music_information {
436            if let Some(beat_f1) = mi_results.beat_f_measure {
437                domain_scores.push(("Music Information Retrieval", beat_f1));
438            }
439        }
440
441        if let Some(ref aq_results) = self.detailed_results.quality_assessment {
442            let normalized_snr = (aq_results.snr / 40.0).min(1.0).max(0.0);
443            domain_scores.push(("Audio Quality", normalized_snr));
444        }
445
446        if !domain_scores.is_empty() {
447            // Find best and worst domains
448            domain_scores
449                .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
450
451            self.summary.best_domain = domain_scores
452                .first()
453                .expect("Operation failed")
454                .0
455                .to_string();
456            self.summary.worst_domain = domain_scores
457                .last()
458                .expect("Operation failed")
459                .0
460                .to_string();
461
462            // Calculate overall score
463            self.summary.overall_score = domain_scores.iter().map(|(_, score)| score).sum::<f64>()
464                / domain_scores.len() as f64;
465        }
466    }
467}
468
469impl Default for AudioProcessingMetrics {
470    fn default() -> Self {
471        Self::new()
472    }
473}
474
475impl DomainMetrics for AudioProcessingMetrics {
476    type Result = DomainEvaluationResult;
477
478    fn domain_name(&self) -> &'static str {
479        "Audio Processing"
480    }
481
482    fn available_metrics(&self) -> Vec<&'static str> {
483        vec![
484            "word_error_rate",
485            "character_error_rate",
486            "phone_error_rate",
487            "bleu_score",
488            "classification_accuracy",
489            "classification_f1_score",
490            "beat_f_measure",
491            "onset_f_measure",
492            "chord_recognition_accuracy",
493            "key_detection_accuracy",
494            "tempo_accuracy",
495            "snr_db",
496            "pesq_score",
497            "stoi_score",
498            "speaker_identification_accuracy",
499            "speaker_verification_eer",
500            "similarity_cosine",
501            "similarity_euclidean",
502        ]
503    }
504
505    fn metric_descriptions(&self) -> HashMap<&'static str, &'static str> {
506        let mut descriptions = HashMap::new();
507        descriptions.insert(
508            "word_error_rate",
509            "Word Error Rate for speech recognition evaluation",
510        );
511        descriptions.insert(
512            "character_error_rate",
513            "Character Error Rate for detailed speech recognition analysis",
514        );
515        descriptions.insert(
516            "phone_error_rate",
517            "Phone Error Rate for phonetic-level speech recognition evaluation",
518        );
519        descriptions.insert(
520            "bleu_score",
521            "BLEU score for speech translation quality assessment",
522        );
523        descriptions.insert(
524            "classification_accuracy",
525            "Accuracy for audio classification tasks",
526        );
527        descriptions.insert(
528            "classification_f1_score",
529            "F1 score for audio classification tasks",
530        );
531        descriptions.insert(
532            "beat_f_measure",
533            "F-measure for beat tracking accuracy in music",
534        );
535        descriptions.insert(
536            "onset_f_measure",
537            "F-measure for onset detection accuracy in music",
538        );
539        descriptions.insert(
540            "chord_recognition_accuracy",
541            "Accuracy for chord recognition in music",
542        );
543        descriptions.insert(
544            "key_detection_accuracy",
545            "Accuracy for key detection in music",
546        );
547        descriptions.insert("tempo_accuracy", "Accuracy for tempo estimation in music");
548        descriptions.insert(
549            "snr_db",
550            "Signal-to-Noise Ratio in decibels for audio quality",
551        );
552        descriptions.insert("pesq_score", "PESQ score for speech quality assessment");
553        descriptions.insert(
554            "stoi_score",
555            "STOI score for speech intelligibility assessment",
556        );
557        descriptions.insert(
558            "speaker_identification_accuracy",
559            "Accuracy for speaker identification",
560        );
561        descriptions.insert(
562            "speaker_verification_eer",
563            "Equal Error Rate for speaker verification",
564        );
565        descriptions.insert(
566            "similarity_cosine",
567            "Cosine similarity for audio similarity measurement",
568        );
569        descriptions.insert(
570            "similarity_euclidean",
571            "Euclidean distance for audio similarity measurement",
572        );
573        descriptions
574    }
575}
576
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    // Smoke test: the suite and all sub-metric calculators construct cleanly.
    #[test]
    fn test_audio_processing_metrics_creation() {
        let _metrics = AudioProcessingMetrics::new();
        // Basic test to ensure creation works
    }

    // WER/CER must be valid rates in [0, 1] for a near-match hypothesis
    // (one substituted word out of five total).
    #[test]
    fn test_speech_recognition_evaluation() {
        let mut metrics = AudioProcessingMetrics::new();
        let reference = vec!["hello world".to_string(), "how are you".to_string()];
        let hypothesis = vec!["hello word".to_string(), "how are you".to_string()];

        let results = metrics
            .evaluate_speech_recognition(&reference, &hypothesis, None, None, None)
            .expect("Operation failed");

        assert!(results.wer >= 0.0 && results.wer <= 1.0);
        assert!(results.cer >= 0.0 && results.cer <= 1.0);
    }

    // Quality metrics on a sine wave vs. an amplitude-scaled copy should be finite.
    #[test]
    fn test_audio_quality_evaluation() {
        let mut metrics = AudioProcessingMetrics::new();
        // Generate longer signals for PESQ computation (minimum 8000 samples required)
        let reference: Vec<f64> = (0..8192).map(|i| (i as f64 * 0.01).sin()).collect();
        let degraded: Vec<f64> = (0..8192).map(|i| (i as f64 * 0.01).sin() * 0.9).collect();

        let reference = Array1::from_vec(reference);
        let degraded = Array1::from_vec(degraded);

        let results = metrics
            .evaluate_audio_quality(reference.view(), degraded.view(), 16000.0)
            .expect("Operation failed");

        assert!(results.snr.is_finite());
        assert!(results.sdr.is_finite());
    }

    // A report built from all-None results still carries the non-empty
    // placeholder summary set by `AudioEvaluationReport::new`.
    #[test]
    fn test_comprehensive_report_creation() {
        let metrics = AudioProcessingMetrics::new();
        let results = AudioEvaluationResults {
            speech_recognition: None,
            audio_classification: None,
            music_information: None,
            quality_assessment: None,
            event_detection: None,
            speaker_recognition: None,
            similarity: None,
        };

        let report = metrics.create_comprehensive_report(&results);
        assert!(!report.summary.best_domain.is_empty());
        assert!(!report.summary.worst_domain.is_empty());
    }
}