// voirs_recognizer/analysis/mod.rs
//! Audio analysis implementations
//!
//! This module provides comprehensive audio analysis capabilities including:
//! - Quality metrics (SNR, THD, spectral features)
//! - Prosody analysis (pitch, rhythm, stress)
//! - Speaker characteristics (gender, age, voice quality)
//! - Emotional analysis

use crate::traits::{
    AnalysisCapability, AudioAnalysis, AudioAnalysisConfig, AudioAnalyzer, AudioAnalyzerMetadata,
    AudioMetric, AudioStream, Emotion, EmotionalAnalysis, ProsodyAnalysis, RecognitionResult,
    SpeakerCharacteristics,
};
use crate::RecognitionError;
use std::sync::Arc;

// Analysis implementations
pub mod prosody;
pub mod quality;
pub mod speaker;
pub mod vad;

pub use prosody::*;
pub use quality::*;
pub use speaker::*;
pub use vad::*;
27
/// Audio analyzer backend enumeration
///
/// Each variant selects which analysis dimensions the constructed analyzer
/// enables; `AudioAnalyzerImpl::with_backend` maps a variant to an
/// `AudioAnalysisConfig`.
#[derive(Debug, Clone, PartialEq)]
pub enum AudioAnalyzerBackend {
    /// Comprehensive analyzer with all features individually toggleable
    Comprehensive {
        /// Enable quality metrics
        quality_metrics: bool,
        /// Enable prosody analysis
        prosody_analysis: bool,
        /// Enable speaker analysis
        speaker_analysis: bool,
        /// Enable emotional analysis
        emotional_analysis: bool,
    },
    /// Quality-focused analyzer
    QualityFocused {
        /// Metrics to compute
        metrics: Vec<AudioMetric>,
    },
    /// Prosody-focused analyzer
    ProsodyFocused {
        /// Prosody features to analyze
        /// (NOTE(review): `with_backend` currently only sets the prosody toggle;
        /// this list is not propagated into the config — confirm intent)
        features: Vec<ProsodyFeature>,
    },
    /// Speaker-focused analyzer
    SpeakerFocused {
        /// Speaker features to analyze
        /// (NOTE(review): `with_backend` currently only sets the speaker toggle;
        /// this list is not propagated into the config — confirm intent)
        features: Vec<SpeakerFeature>,
    },
}

/// Prosody analysis features
///
/// Feature selectors used by the `ProsodyFocused` backend variant.
#[derive(Debug, Clone, PartialEq)]
pub enum ProsodyFeature {
    /// Pitch analysis and F0 tracking
    Pitch,
    /// Rhythm and timing analysis
    Rhythm,
    /// Stress pattern detection
    Stress,
    /// Intonation contour analysis
    Intonation,
    /// Speaking rate and tempo analysis
    SpeakingRate,
    /// Pause detection and analysis
    Pauses,
}

/// Speaker analysis features
///
/// Feature selectors used by the `SpeakerFocused` backend variant.
#[derive(Debug, Clone, PartialEq)]
pub enum SpeakerFeature {
    /// Gender classification
    Gender,
    /// Age estimation
    Age,
    /// Voice quality assessment
    VoiceQuality,
    /// Accent detection and classification
    Accent,
    /// Formant frequency analysis
    Formants,
    /// Fundamental frequency range analysis
    F0Range,
}

/// Main audio analyzer implementation
///
/// Aggregates the quality, prosody, and speaker analyzers behind a single
/// `AudioAnalyzer` facade. Sub-analyzers are shared via `Arc`, so cloning
/// the analyzer is cheap.
pub struct AudioAnalyzerImpl {
    /// Quality analyzer (SNR, THD, spectral metrics, ...)
    quality_analyzer: Arc<QualityAnalyzer>,
    /// Prosody analyzer (pitch, rhythm, stress)
    prosody_analyzer: Arc<ProsodyAnalyzer>,
    /// Speaker analyzer; also provides the emotion analysis used by `analyze`
    speaker_analyzer: Arc<SpeakerAnalyzer>,
    /// Default configuration used when `analyze` is called with `None`
    config: AudioAnalysisConfig,
    /// Metrics this analyzer can compute
    supported_metrics: Vec<AudioMetric>,
    /// Static metadata (name, version, capabilities, processing speed)
    metadata: AudioAnalyzerMetadata,
}

109impl AudioAnalyzerImpl {
110    /// Create a new comprehensive audio analyzer
111    ///
112    /// # Errors
113    ///
114    /// Returns a `RecognitionError` if any of the component analyzers fail to initialize.
115    pub async fn new(config: AudioAnalysisConfig) -> Result<Self, RecognitionError> {
116        let quality_analyzer = Arc::new(QualityAnalyzer::new().await?);
117        let prosody_analyzer = Arc::new(ProsodyAnalyzer::new().await?);
118        let speaker_analyzer = Arc::new(SpeakerAnalyzer::new().await?);
119
120        let supported_metrics = vec![
121            AudioMetric::SNR,
122            AudioMetric::THD,
123            AudioMetric::SpectralCentroid,
124            AudioMetric::SpectralRolloff,
125            AudioMetric::ZeroCrossingRate,
126            AudioMetric::MelFrequencyCepstralCoefficients,
127            AudioMetric::ChromaFeatures,
128            AudioMetric::SpectralContrast,
129            AudioMetric::TonnetzFeatures,
130            AudioMetric::RootMeanSquare,
131        ];
132
133        let metadata = AudioAnalyzerMetadata {
134            name: "Comprehensive Audio Analyzer".to_string(),
135            version: "1.0.0".to_string(),
136            description:
137                "Multi-dimensional audio analysis with quality, prosody, and speaker features"
138                    .to_string(),
139            supported_metrics: supported_metrics.clone(),
140            capabilities: vec![
141                AnalysisCapability::QualityMetrics,
142                AnalysisCapability::ProsodyAnalysis,
143                AnalysisCapability::SpeakerCharacteristics,
144                AnalysisCapability::EmotionalAnalysis,
145                AnalysisCapability::RealtimeAnalysis,
146                AnalysisCapability::BatchProcessing,
147                AnalysisCapability::StreamingAnalysis,
148            ],
149            processing_speed: 2.0, // 2x real-time
150        };
151
152        Ok(Self {
153            quality_analyzer,
154            prosody_analyzer,
155            speaker_analyzer,
156            config,
157            supported_metrics,
158            metadata,
159        })
160    }
161
162    /// Create with specific backend configuration
163    ///
164    /// # Errors
165    ///
166    /// Returns a `RecognitionError` if the backend configuration is invalid or initialization fails.
167    pub async fn with_backend(backend: AudioAnalyzerBackend) -> Result<Self, RecognitionError> {
168        let config = match backend {
169            AudioAnalyzerBackend::Comprehensive {
170                quality_metrics,
171                prosody_analysis,
172                speaker_analysis,
173                emotional_analysis,
174            } => AudioAnalysisConfig {
175                quality_metrics,
176                prosody_analysis,
177                speaker_analysis,
178                emotional_analysis,
179                ..Default::default()
180            },
181            AudioAnalyzerBackend::QualityFocused { metrics } => AudioAnalysisConfig {
182                quality_metrics: true,
183                prosody_analysis: false,
184                speaker_analysis: false,
185                emotional_analysis: false,
186                quality_metrics_list: metrics,
187                ..Default::default()
188            },
189            AudioAnalyzerBackend::ProsodyFocused { .. } => AudioAnalysisConfig {
190                quality_metrics: false,
191                prosody_analysis: true,
192                speaker_analysis: false,
193                emotional_analysis: false,
194                ..Default::default()
195            },
196            AudioAnalyzerBackend::SpeakerFocused { .. } => AudioAnalysisConfig {
197                quality_metrics: false,
198                prosody_analysis: false,
199                speaker_analysis: true,
200                emotional_analysis: false,
201                ..Default::default()
202            },
203        };
204
205        Self::new(config).await
206    }
207}
208
209#[async_trait::async_trait]
210impl AudioAnalyzer for AudioAnalyzerImpl {
211    async fn analyze(
212        &self,
213        audio: &voirs_sdk::AudioBuffer,
214        config: Option<&AudioAnalysisConfig>,
215    ) -> RecognitionResult<AudioAnalysis> {
216        let config = config.unwrap_or(&self.config);
217        let start_time = std::time::Instant::now();
218
219        // Initialize result structure
220        let mut quality_metrics = std::collections::HashMap::new();
221
222        // Quality analysis
223        if config.quality_metrics {
224            let quality_results = self
225                .quality_analyzer
226                .analyze_quality(audio, &config.quality_metrics_list)
227                .await?;
228            quality_metrics.extend(quality_results);
229        }
230
231        // Prosody analysis
232        let prosody = if config.prosody_analysis {
233            self.prosody_analyzer.analyze_prosody(audio).await?
234        } else {
235            ProsodyAnalysis::default()
236        };
237
238        // Speaker analysis
239        let speaker_characteristics = if config.speaker_analysis {
240            self.speaker_analyzer.analyze_speaker(audio).await?
241        } else {
242            SpeakerCharacteristics::default()
243        };
244
245        // Emotional analysis
246        let emotional_analysis = if config.emotional_analysis {
247            self.speaker_analyzer.analyze_emotion(audio).await?
248        } else {
249            EmotionalAnalysis::default()
250        };
251
252        let processing_duration = start_time.elapsed();
253
254        Ok(AudioAnalysis {
255            quality_metrics,
256            prosody,
257            speaker_characteristics,
258            emotional_analysis,
259            processing_duration: Some(processing_duration),
260        })
261    }
262
263    async fn analyze_streaming(
264        &self,
265        mut audio_stream: AudioStream,
266        config: Option<&AudioAnalysisConfig>,
267    ) -> RecognitionResult<
268        std::pin::Pin<
269            Box<dyn tokio_stream::Stream<Item = RecognitionResult<AudioAnalysis>> + Send>,
270        >,
271    > {
272        let config = config.cloned().unwrap_or_else(|| self.config.clone());
273        let analyzer = self.clone();
274
275        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();
276
277        tokio::spawn(async move {
278            use futures::StreamExt;
279
280            while let Some(audio_chunk) = audio_stream.next().await {
281                let analysis_result = analyzer.analyze(&audio_chunk, Some(&config)).await;
282
283                if sender.send(analysis_result).is_err() {
284                    break;
285                }
286            }
287        });
288
289        Ok(Box::pin(
290            tokio_stream::wrappers::UnboundedReceiverStream::new(receiver),
291        ))
292    }
293
294    fn supported_metrics(&self) -> Vec<AudioMetric> {
295        self.supported_metrics.clone()
296    }
297
298    fn metadata(&self) -> AudioAnalyzerMetadata {
299        self.metadata.clone()
300    }
301
302    fn supports_capability(&self, capability: AnalysisCapability) -> bool {
303        self.metadata.capabilities.contains(&capability)
304    }
305}
306
307impl Clone for AudioAnalyzerImpl {
308    fn clone(&self) -> Self {
309        Self {
310            quality_analyzer: self.quality_analyzer.clone(),
311            prosody_analyzer: self.prosody_analyzer.clone(),
312            speaker_analyzer: self.speaker_analyzer.clone(),
313            config: self.config.clone(),
314            supported_metrics: self.supported_metrics.clone(),
315            metadata: self.metadata.clone(),
316        }
317    }
318}
319
320// Default implementations for data types
321
322impl Default for EmotionalAnalysis {
323    fn default() -> Self {
324        Self {
325            primary_emotion: Emotion::Neutral,
326            emotion_scores: std::collections::HashMap::new(),
327            intensity: 0.0,
328            valence: 0.0,
329            arousal: 0.0,
330        }
331    }
332}
333
334/// Factory function to create audio analyzers
335///
336/// # Errors
337///
338/// Returns a `RecognitionError` if the analyzer backend fails to initialize.
339pub async fn create_audio_analyzer(
340    backend: AudioAnalyzerBackend,
341) -> RecognitionResult<Arc<dyn AudioAnalyzer>> {
342    let analyzer = AudioAnalyzerImpl::with_backend(backend).await?;
343    Ok(Arc::new(analyzer))
344}
345
346/// Get recommended analyzer configuration for a specific use case
347#[must_use]
348pub fn recommended_config_for_use_case(use_case: AnalysisUseCase) -> AudioAnalysisConfig {
349    match use_case {
350        AnalysisUseCase::QualityAssessment => AudioAnalysisConfig {
351            quality_metrics: true,
352            prosody_analysis: false,
353            speaker_analysis: false,
354            emotional_analysis: false,
355            quality_metrics_list: vec![
356                AudioMetric::SNR,
357                AudioMetric::THD,
358                AudioMetric::SpectralCentroid,
359                AudioMetric::RootMeanSquare,
360            ],
361            ..Default::default()
362        },
363        AnalysisUseCase::SpeechEvaluation => AudioAnalysisConfig {
364            quality_metrics: true,
365            prosody_analysis: true,
366            speaker_analysis: false,
367            emotional_analysis: false,
368            ..Default::default()
369        },
370        AnalysisUseCase::SpeakerIdentification => AudioAnalysisConfig {
371            quality_metrics: false,
372            prosody_analysis: false,
373            speaker_analysis: true,
374            emotional_analysis: false,
375            ..Default::default()
376        },
377        AnalysisUseCase::EmotionRecognition => AudioAnalysisConfig {
378            quality_metrics: false,
379            prosody_analysis: true,
380            speaker_analysis: false,
381            emotional_analysis: true,
382            ..Default::default()
383        },
384        AnalysisUseCase::Comprehensive => AudioAnalysisConfig::default(),
385    }
386}
387
/// Analysis use cases
///
/// High-level presets consumed by `recommended_config_for_use_case`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AnalysisUseCase {
    /// Audio quality assessment and metrics
    QualityAssessment,
    /// Speech evaluation and analysis
    SpeechEvaluation,
    /// Speaker identification and characteristics
    SpeakerIdentification,
    /// Emotion recognition and sentiment analysis
    EmotionRecognition,
    /// Comprehensive analysis with all features
    Comprehensive,
}

#[cfg(test)]
mod tests {
    use super::*;
    use voirs_sdk::AudioBuffer;

    /// A default-config analyzer should advertise metrics and core capabilities.
    #[tokio::test]
    async fn test_audio_analyzer_creation() {
        let analyzer = AudioAnalyzerImpl::new(AudioAnalysisConfig::default())
            .await
            .unwrap();

        assert!(!analyzer.supported_metrics().is_empty());
        assert!(analyzer.supports_capability(AnalysisCapability::QualityMetrics));
        assert!(analyzer.supports_capability(AnalysisCapability::ProsodyAnalysis));
    }

    /// With every toggle enabled, all analysis sections should be populated.
    #[tokio::test]
    async fn test_comprehensive_analysis() {
        let analyzer = AudioAnalyzerImpl::with_backend(AudioAnalyzerBackend::Comprehensive {
            quality_metrics: true,
            prosody_analysis: true,
            speaker_analysis: true,
            emotional_analysis: true,
        })
        .await
        .unwrap();

        // One second of constant-amplitude mono audio at 16 kHz.
        let audio = AudioBuffer::new(vec![0.1; 16000], 16000, 1);
        let report = analyzer.analyze(&audio, None).await.unwrap();

        // Should have some quality metrics.
        assert!(!report.quality_metrics.is_empty());

        // Should have prosody analysis.
        assert!(report.prosody.pitch.mean_f0 >= 0.0);

        // Should have processing duration.
        assert!(report.processing_duration.is_some());
    }

    /// Quality-focused backend computes metrics but leaves prosody at default.
    #[tokio::test]
    async fn test_quality_focused_analysis() {
        let analyzer = AudioAnalyzerImpl::with_backend(AudioAnalyzerBackend::QualityFocused {
            metrics: vec![AudioMetric::SNR, AudioMetric::THD],
        })
        .await
        .unwrap();

        let audio = AudioBuffer::new(vec![0.1; 16000], 16000, 1);
        let report = analyzer.analyze(&audio, None).await.unwrap();

        // Should focus on quality metrics.
        assert!(!report.quality_metrics.is_empty());

        // Prosody should be default/empty since not requested.
        assert_eq!(report.prosody.pitch.mean_f0, 0.0);
    }

    /// Each use-case preset enables exactly the expected analysis toggles.
    #[tokio::test]
    async fn test_use_case_configs() {
        let quality = recommended_config_for_use_case(AnalysisUseCase::QualityAssessment);
        assert!(quality.quality_metrics);
        assert!(!quality.emotional_analysis);

        let emotion = recommended_config_for_use_case(AnalysisUseCase::EmotionRecognition);
        assert!(emotion.emotional_analysis);
        assert!(emotion.prosody_analysis);

        let all = recommended_config_for_use_case(AnalysisUseCase::Comprehensive);
        assert!(all.quality_metrics);
        assert!(all.prosody_analysis);
        assert!(all.speaker_analysis);
        assert!(all.emotional_analysis);
    }
}