#![allow(clippy::too_many_arguments)]
#![allow(dead_code)]
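//! Audio processing domain metrics.
//!
//! This module groups evaluation metrics for speech recognition, audio
//! classification, music information retrieval, audio quality assessment,
//! sound event detection, speaker recognition, and audio similarity, and
//! re-exports the main types from the per-domain submodules declared below.
//!
//! A minimal usage sketch (illustrative only; error handling elided):
//!
//! ```ignore
//! let mut metrics = AudioProcessingMetrics::new();
//! let reference = vec!["hello world".to_string()];
//! let hypothesis = vec!["hello word".to_string()];
//! let results = metrics
//!     .evaluate_speech_recognition(&reference, &hypothesis, None, None, None)
//!     .unwrap();
//! println!("WER: {:.3}", results.wer);
//! ```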

use super::{DomainEvaluationResult, DomainMetrics};
use crate::error::{MetricsError, Result};
use scirs2_core::ndarray::{Array1, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, ToPrimitive};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

pub mod audio_classification;
pub mod audio_quality;
pub mod audio_similarity;
pub mod music_information;
pub mod sound_event_detection;
pub mod speaker_metrics;
pub mod speech_recognition;

pub use speech_recognition::{
    BleuCalculator, BleuSmoothing, CerCalculator, ConfidenceMetrics, PerCalculator,
    SpeechRecognitionMetrics, SpeechRecognitionResults, WerCalculator,
};

pub use audio_classification::{
    AudioClassificationMetrics, AudioClassificationResults, AudioSpecificMetrics,
    BoundaryDetectionMetrics, TemporalAudioMetrics,
};

pub use music_information::{
    BeatTrackingMetrics, ChordRecognitionMetrics, ContinuityMetrics, CoverSongMetrics,
    KeyDetectionMetrics, MusicInformationMetrics, MusicInformationResults, MusicSimilarityMetrics,
    TempoEstimationMetrics,
};

pub use audio_quality::{
    AudioQualityMetrics, AudioQualityResults, IntelligibilityMetrics, ObjectiveAudioMetrics,
    PerceptualAudioMetrics, SpectralDistortionMetrics,
};

pub use sound_event_detection::{
    ClassWiseEventMetrics, EventBasedMetrics, SegmentBasedMetrics, SoundEvent,
    SoundEventDetectionMetrics, SoundEventResults,
};

pub use speaker_metrics::{
    SpeakerDiarizationMetrics, SpeakerIdentificationMetrics, SpeakerMetrics, SpeakerResults,
    SpeakerVerificationMetrics,
};

pub use audio_similarity::{
    AcousticSimilarityMetrics, AudioSimilarityMetrics, AudioSimilarityResults,
    ContentBasedRetrievalMetrics, SemanticSimilarityMetrics,
};

/// Aggregated metric calculators for the audio processing domain, grouped
/// behind a single entry point.
#[derive(Debug)]
pub struct AudioProcessingMetrics {
    /// Speech recognition metrics (WER, CER, PER, BLEU, confidence).
    pub speech_recognition: SpeechRecognitionMetrics,
    /// Audio classification metrics.
    pub audio_classification: AudioClassificationMetrics,
    /// Music information retrieval metrics (beats, chords, key, tempo).
    pub music_metrics: MusicInformationMetrics,
    /// Audio quality assessment metrics.
    pub quality_metrics: AudioQualityMetrics,
    /// Sound event detection metrics.
    pub event_detection: SoundEventDetectionMetrics,
    /// Speaker identification, verification, and diarization metrics.
    pub speaker_metrics: SpeakerMetrics,
    /// Audio similarity and retrieval metrics.
    pub similarity_metrics: AudioSimilarityMetrics,
}

/// Results of an audio evaluation; each field is `None` when the
/// corresponding domain was not evaluated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioEvaluationResults {
    pub speech_recognition: Option<SpeechRecognitionResults>,
    pub audio_classification: Option<AudioClassificationResults>,
    pub music_information: Option<MusicInformationResults>,
    pub quality_assessment: Option<AudioQualityResults>,
    pub event_detection: Option<SoundEventResults>,
    pub speaker_recognition: Option<SpeakerResults>,
    pub similarity: Option<AudioSimilarityResults>,
}

/// A comprehensive evaluation report combining a summary, detailed results,
/// insights, and recommendations.
#[derive(Debug)]
pub struct AudioEvaluationReport {
    pub summary: AudioSummary,
    pub detailed_results: AudioEvaluationResults,
    pub insights: Vec<AudioInsight>,
    pub recommendations: Vec<AudioRecommendation>,
}

/// High-level summary of an audio evaluation.
#[derive(Debug)]
pub struct AudioSummary {
    pub overall_score: f64,
    pub best_domain: String,
    pub worst_domain: String,
    pub strengths: Vec<String>,
    pub improvements: Vec<String>,
}

/// A single insight derived from the evaluation results.
#[derive(Debug)]
pub struct AudioInsight {
    pub category: AudioInsightCategory,
    pub title: String,
    pub description: String,
    pub metrics: HashMap<String, f64>,
}

/// Category of an insight.
#[derive(Debug)]
pub enum AudioInsightCategory {
    Performance,
    Quality,
    Robustness,
    Efficiency,
    UserExperience,
}

/// An actionable recommendation produced from the evaluation.
#[derive(Debug)]
pub struct AudioRecommendation {
    pub priority: RecommendationPriority,
    pub title: String,
    pub description: String,
    pub expected_impact: f64,
    pub implementation_effort: ImplementationEffort,
}

/// Priority level of a recommendation.
#[derive(Debug)]
pub enum RecommendationPriority {
    Critical,
    High,
    Medium,
    Low,
}

/// Estimated effort required to implement a recommendation.
#[derive(Debug)]
pub enum ImplementationEffort {
    Low,
    Medium,
    High,
    VeryHigh,
}

impl AudioProcessingMetrics {
    /// Creates a new collection of audio processing metric calculators.
    pub fn new() -> Self {
        Self {
            speech_recognition: SpeechRecognitionMetrics::new(),
            audio_classification: AudioClassificationMetrics::new(),
            music_metrics: MusicInformationMetrics::new(),
            quality_metrics: AudioQualityMetrics::new(),
            event_detection: SoundEventDetectionMetrics::new(),
            speaker_metrics: SpeakerMetrics::new(),
            similarity_metrics: AudioSimilarityMetrics::new(),
        }
    }

    /// Evaluates speech recognition output against reference transcripts,
    /// optionally at the phone level and with per-utterance confidence scores.
    pub fn evaluate_speech_recognition(
        &mut self,
        reference_text: &[String],
        hypothesis_text: &[String],
        reference_phones: Option<&[Vec<String>]>,
        hypothesis_phones: Option<&[Vec<String>]>,
        confidence_scores: Option<&[f64]>,
    ) -> Result<SpeechRecognitionResults> {
        self.speech_recognition.evaluate_recognition(
            reference_text,
            hypothesis_text,
            reference_phones,
            hypothesis_phones,
            confidence_scores,
        )
    }

    /// Evaluates audio classification predictions, optionally with class
    /// scores and frame-level predictions.
    pub fn evaluate_audio_classification<F>(
        &mut self,
        y_true: ArrayView1<i32>,
        y_pred: ArrayView1<i32>,
        y_scores: Option<ArrayView2<F>>,
        frame_predictions: Option<ArrayView2<i32>>,
    ) -> Result<AudioClassificationResults>
    where
        F: Float + std::fmt::Debug,
    {
        self.audio_classification
            .compute_metrics(y_true, y_pred, y_scores, frame_predictions)
    }

    /// Evaluates music information retrieval annotations: beat times
    /// (tolerance 0.07), chord labels, key labels, and tempo (tolerance 0.04).
    pub fn evaluate_music_information(
        &mut self,
        beat_annotations: Option<(&[f64], &[f64])>,
        chord_annotations: Option<(&[String], &[String])>,
        key_annotations: Option<(String, String)>,
        tempo_annotations: Option<(f64, f64)>,
    ) -> Result<MusicInformationResults> {
        let mut results = MusicInformationResults {
            beat_f_measure: None,
            chord_accuracy: None,
            key_accuracy: None,
            tempo_accuracy: None,
            similarity_map: None,
        };

        if let Some((ref_beats, est_beats)) = beat_annotations {
            let f_measure = self
                .music_metrics
                .evaluate_beats(ref_beats, est_beats, 0.07)?;
            results.beat_f_measure = Some(f_measure);
        }

        if let Some((ref_chords, est_chords)) = chord_annotations {
            let accuracy = self.music_metrics.evaluate_chords(ref_chords, est_chords)?;
            results.chord_accuracy = Some(accuracy);
        }

        if let Some((ref_key, est_key)) = key_annotations {
            let accuracy = if ref_key == est_key { 1.0 } else { 0.0 };
            results.key_accuracy = Some(accuracy);
        }

        if let Some((ref_tempo, est_tempo)) = tempo_annotations {
            let accuracy = self
                .music_metrics
                .evaluate_tempo(ref_tempo, est_tempo, 0.04)?;
            results.tempo_accuracy = Some(accuracy);
        }

        Ok(results)
    }

    /// Evaluates the quality of a degraded signal against a reference signal.
    pub fn evaluate_audio_quality<F>(
        &mut self,
        reference_audio: ArrayView1<F>,
        degraded_audio: ArrayView1<F>,
        sample_rate: f64,
    ) -> Result<AudioQualityResults>
    where
        F: Float + std::fmt::Debug + std::iter::Sum,
    {
        self.quality_metrics
            .evaluate_quality(reference_audio, degraded_audio, sample_rate)
    }

    /// Evaluates sound event detection against reference events using the
    /// given matching tolerance.
    pub fn evaluate_sound_event_detection(
        &mut self,
        reference_events: &[SoundEvent],
        predicted_events: &[SoundEvent],
        tolerance: f64,
    ) -> Result<SoundEventResults> {
        self.event_detection
            .evaluate_events(reference_events, predicted_events, tolerance)
    }

    /// Evaluates speaker recognition tasks: identification accuracy,
    /// verification equal error rate, and diarization error rate.
    pub fn evaluate_speaker_recognition(
        &mut self,
        identification_data: Option<(&[String], &[String])>,
        verification_data: Option<(&[bool], &[f64])>,
        diarization_data: Option<(&[(f64, f64, String)], &[(f64, f64, String)])>,
    ) -> Result<SpeakerResults> {
        let mut results = SpeakerResults {
            identification_accuracy: None,
            verification_eer: None,
            diarization_der: None,
        };

        if let Some((true_speakers, pred_speakers)) = identification_data {
            let accuracy =
                self.speaker_metrics
                    .evaluate_identification(true_speakers, pred_speakers, None)?;
            results.identification_accuracy = Some(accuracy);
        }

        if let Some((true_labels, scores)) = verification_data {
            let eer = self
                .speaker_metrics
                .evaluate_verification(true_labels, scores)?;
            results.verification_eer = Some(eer);
        }

        if let Some((reference, hypothesis)) = diarization_data {
            let der = self
                .speaker_metrics
                .evaluate_diarization(reference, hypothesis)?;
            results.diarization_der = Some(der);
        }

        Ok(results)
    }

    /// Evaluates audio similarity and retrieval at k = 1, 5, 10, and 20,
    /// optionally followed by acoustic and semantic similarity evaluation.
    pub fn evaluate_audio_similarity<F>(
        &mut self,
        query_ids: &[String],
        relevant_docs: &HashMap<String, Vec<String>>,
        retrieved_docs: &HashMap<String, Vec<String>>,
        acoustic_features: Option<(
            &HashMap<String, ArrayView1<F>>,
            &HashMap<String, ArrayView1<F>>,
        )>,
        semantic_data: Option<(&HashMap<String, Vec<String>>, &HashMap<String, Vec<String>>)>,
    ) -> Result<AudioSimilarityResults>
    where
        F: Float + std::fmt::Debug,
    {
        let k_values = vec![1, 5, 10, 20];
        self.similarity_metrics.evaluate_retrieval(
            query_ids,
            relevant_docs,
            retrieved_docs,
            &k_values,
        )?;

        if let Some((ref_features, query_features)) = acoustic_features {
            self.similarity_metrics
                .evaluate_acoustic_similarity(ref_features, query_features)?;
        }

        if let Some((ref_tags, query_tags)) = semantic_data {
            self.similarity_metrics
                .evaluate_semantic_similarity(ref_tags, query_tags, None, None)?;
        }

        Ok(self.similarity_metrics.get_results())
    }

    /// Builds a comprehensive report from a set of evaluation results.
    pub fn create_comprehensive_report(
        &self,
        results: &AudioEvaluationResults,
    ) -> AudioEvaluationReport {
        AudioEvaluationReport::new(results)
    }
}

impl AudioEvaluationReport {
    /// Creates a report from evaluation results with placeholder summary
    /// values; call `generate_summary` to derive the summary from the
    /// detailed results.
    pub fn new(results: &AudioEvaluationResults) -> Self {
        let summary = AudioSummary {
            overall_score: 0.75,
            best_domain: "Speech Recognition".to_string(),
            worst_domain: "Music Information Retrieval".to_string(),
            strengths: vec![
                "High accuracy".to_string(),
                "Good temporal consistency".to_string(),
            ],
            improvements: vec!["Better chord recognition".to_string()],
        };

        Self {
            summary,
            detailed_results: results.clone(),
            insights: Vec::new(),
            recommendations: Vec::new(),
        }
    }

    /// Adds an insight to the report.
    pub fn add_insight(&mut self, insight: AudioInsight) {
        self.insights.push(insight);
    }

    /// Adds a recommendation to the report.
    pub fn add_recommendation(&mut self, recommendation: AudioRecommendation) {
        self.recommendations.push(recommendation);
    }

    /// Recomputes the summary (overall score, best and worst domain) from the
    /// detailed results; when no domain results are present, the existing
    /// summary is left unchanged.
    pub fn generate_summary(&mut self) {
        let mut domain_scores = Vec::new();

        if let Some(ref sr_results) = self.detailed_results.speech_recognition {
            domain_scores.push(("Speech Recognition", 1.0 - sr_results.wer));
        }

        if let Some(ref ac_results) = self.detailed_results.audio_classification {
            domain_scores.push(("Audio Classification", ac_results.accuracy));
        }

        if let Some(ref mi_results) = self.detailed_results.music_information {
            if let Some(beat_f1) = mi_results.beat_f_measure {
                domain_scores.push(("Music Information Retrieval", beat_f1));
            }
        }

        if let Some(ref aq_results) = self.detailed_results.quality_assessment {
            let normalized_snr = (aq_results.snr / 40.0).clamp(0.0, 1.0);
            domain_scores.push(("Audio Quality", normalized_snr));
        }

        if !domain_scores.is_empty() {
            domain_scores
                .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

            self.summary.best_domain = domain_scores
                .first()
                .expect("domain_scores is non-empty")
                .0
                .to_string();
            self.summary.worst_domain = domain_scores
                .last()
                .expect("domain_scores is non-empty")
                .0
                .to_string();

            self.summary.overall_score = domain_scores.iter().map(|(_, score)| score).sum::<f64>()
                / domain_scores.len() as f64;
        }
    }
}

impl Default for AudioProcessingMetrics {
    fn default() -> Self {
        Self::new()
    }
}

impl DomainMetrics for AudioProcessingMetrics {
    type Result = DomainEvaluationResult;

    fn domain_name(&self) -> &'static str {
        "Audio Processing"
    }

    fn available_metrics(&self) -> Vec<&'static str> {
        vec![
            "word_error_rate",
            "character_error_rate",
            "phone_error_rate",
            "bleu_score",
            "classification_accuracy",
            "classification_f1_score",
            "beat_f_measure",
            "onset_f_measure",
            "chord_recognition_accuracy",
            "key_detection_accuracy",
            "tempo_accuracy",
            "snr_db",
            "pesq_score",
            "stoi_score",
            "speaker_identification_accuracy",
            "speaker_verification_eer",
            "similarity_cosine",
            "similarity_euclidean",
        ]
    }

    fn metric_descriptions(&self) -> HashMap<&'static str, &'static str> {
        let mut descriptions = HashMap::new();
        descriptions.insert(
            "word_error_rate",
            "Word Error Rate for speech recognition evaluation",
        );
        descriptions.insert(
            "character_error_rate",
            "Character Error Rate for detailed speech recognition analysis",
        );
        descriptions.insert(
            "phone_error_rate",
            "Phone Error Rate for phonetic-level speech recognition evaluation",
        );
        descriptions.insert(
            "bleu_score",
            "BLEU score for speech translation quality assessment",
        );
        descriptions.insert(
            "classification_accuracy",
            "Accuracy for audio classification tasks",
        );
        descriptions.insert(
            "classification_f1_score",
            "F1 score for audio classification tasks",
        );
        descriptions.insert(
            "beat_f_measure",
            "F-measure for beat tracking accuracy in music",
        );
        descriptions.insert(
            "onset_f_measure",
            "F-measure for onset detection accuracy in music",
        );
        descriptions.insert(
            "chord_recognition_accuracy",
            "Accuracy for chord recognition in music",
        );
        descriptions.insert(
            "key_detection_accuracy",
            "Accuracy for key detection in music",
        );
        descriptions.insert("tempo_accuracy", "Accuracy for tempo estimation in music");
        descriptions.insert(
            "snr_db",
            "Signal-to-Noise Ratio in decibels for audio quality",
        );
        descriptions.insert("pesq_score", "PESQ score for speech quality assessment");
        descriptions.insert(
            "stoi_score",
            "STOI score for speech intelligibility assessment",
        );
        descriptions.insert(
            "speaker_identification_accuracy",
            "Accuracy for speaker identification",
        );
        descriptions.insert(
            "speaker_verification_eer",
            "Equal Error Rate for speaker verification",
        );
        descriptions.insert(
            "similarity_cosine",
            "Cosine similarity for audio similarity measurement",
        );
        descriptions.insert(
            "similarity_euclidean",
            "Euclidean distance for audio similarity measurement",
        );
        descriptions
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array1;

    #[test]
    fn test_audio_processing_metrics_creation() {
        let _metrics = AudioProcessingMetrics::new();
    }

    #[test]
    fn test_speech_recognition_evaluation() {
        let mut metrics = AudioProcessingMetrics::new();
        let reference = vec!["hello world".to_string(), "how are you".to_string()];
        let hypothesis = vec!["hello word".to_string(), "how are you".to_string()];

        let results = metrics
            .evaluate_speech_recognition(&reference, &hypothesis, None, None, None)
            .expect("speech recognition evaluation should succeed");

        assert!(results.wer >= 0.0 && results.wer <= 1.0);
        assert!(results.cer >= 0.0 && results.cer <= 1.0);
    }

    #[test]
    fn test_audio_quality_evaluation() {
        let mut metrics = AudioProcessingMetrics::new();
        let reference: Vec<f64> = (0..8192).map(|i| (i as f64 * 0.01).sin()).collect();
        let degraded: Vec<f64> = (0..8192).map(|i| (i as f64 * 0.01).sin() * 0.9).collect();

        let reference = Array1::from_vec(reference);
        let degraded = Array1::from_vec(degraded);

        let results = metrics
            .evaluate_audio_quality(reference.view(), degraded.view(), 16000.0)
            .expect("audio quality evaluation should succeed");

        assert!(results.snr.is_finite());
        assert!(results.sdr.is_finite());
    }
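
    // Illustrative addition: key detection is scored by exact label match in
    // `evaluate_music_information`, so identical reference and estimated keys
    // should yield an accuracy of 1.0 while the other fields stay `None`.
    #[test]
    fn test_music_key_evaluation() {
        let mut metrics = AudioProcessingMetrics::new();

        let results = metrics
            .evaluate_music_information(
                None,
                None,
                Some(("C major".to_string(), "C major".to_string())),
                None,
            )
            .expect("music information evaluation should succeed");

        assert_eq!(results.key_accuracy, Some(1.0));
        assert!(results.beat_f_measure.is_none());
        assert!(results.tempo_accuracy.is_none());
    }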

    #[test]
    fn test_comprehensive_report_creation() {
        let metrics = AudioProcessingMetrics::new();
        let results = AudioEvaluationResults {
            speech_recognition: None,
            audio_classification: None,
            music_information: None,
            quality_assessment: None,
            event_detection: None,
            speaker_recognition: None,
            similarity: None,
        };

        let report = metrics.create_comprehensive_report(&results);
        assert!(!report.summary.best_domain.is_empty());
        assert!(!report.summary.worst_domain.is_empty());
    }
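
    // Illustrative addition: with no domain results present, `generate_summary`
    // leaves the placeholder summary from `AudioEvaluationReport::new` untouched.
    #[test]
    fn test_generate_summary_with_empty_results() {
        let results = AudioEvaluationResults {
            speech_recognition: None,
            audio_classification: None,
            music_information: None,
            quality_assessment: None,
            event_detection: None,
            speaker_recognition: None,
            similarity: None,
        };

        let mut report = AudioEvaluationReport::new(&results);
        report.generate_summary();

        assert!((report.summary.overall_score - 0.75).abs() < f64::EPSILON);
        assert_eq!(report.summary.best_domain, "Speech Recognition");
    }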
}