//! Audio analysis implementations for quality, prosody, speaker, and emotional
//! analysis, exposed through the [`AudioAnalyzer`] trait.

use crate::traits::{
    AnalysisCapability, AudioAnalysis, AudioAnalysisConfig, AudioAnalyzer, AudioAnalyzerMetadata,
    AudioMetric, AudioStream, Emotion, EmotionalAnalysis, ProsodyAnalysis, RecognitionResult,
    SpeakerCharacteristics,
};
use crate::RecognitionError;
use std::sync::Arc;

pub mod prosody;
pub mod quality;
pub mod speaker;
pub mod vad;

pub use prosody::*;
pub use quality::*;
pub use speaker::*;
pub use vad::*;

/// Backend configuration for [`AudioAnalyzerImpl`], selecting which analysis
/// stages are enabled.
#[derive(Debug, Clone, PartialEq)]
pub enum AudioAnalyzerBackend {
    /// Run all analysis stages, each individually toggleable.
    Comprehensive {
        /// Compute audio quality metrics.
        quality_metrics: bool,
        /// Run prosody analysis.
        prosody_analysis: bool,
        /// Run speaker characteristic analysis.
        speaker_analysis: bool,
        /// Run emotional analysis.
        emotional_analysis: bool,
    },
    /// Compute only the listed quality metrics.
    QualityFocused {
        /// Metrics to compute.
        metrics: Vec<AudioMetric>,
    },
    /// Run only prosody analysis.
    ProsodyFocused {
        /// Prosody features of interest.
        features: Vec<ProsodyFeature>,
    },
    /// Run only speaker analysis.
    SpeakerFocused {
        /// Speaker features of interest.
        features: Vec<SpeakerFeature>,
    },
}

/// Prosodic features that can be extracted from speech audio.
#[derive(Debug, Clone, PartialEq)]
pub enum ProsodyFeature {
    /// Fundamental frequency (F0) contour.
    Pitch,
    /// Temporal rhythm patterns.
    Rhythm,
    /// Stress placement.
    Stress,
    /// Intonation contour.
    Intonation,
    /// Speaking rate.
    SpeakingRate,
    /// Pause placement and duration.
    Pauses,
}

/// Speaker characteristics that can be estimated from speech audio.
#[derive(Debug, Clone, PartialEq)]
pub enum SpeakerFeature {
    /// Perceived gender.
    Gender,
    /// Estimated age range.
    Age,
    /// Voice quality attributes.
    VoiceQuality,
    /// Accent or dialect cues.
    Accent,
    /// Formant frequencies.
    Formants,
    /// Fundamental frequency range.
    F0Range,
}

/// Multi-dimensional audio analyzer combining quality, prosody, speaker, and
/// emotional analysis.
pub struct AudioAnalyzerImpl {
    /// Quality metric analyzer.
    quality_analyzer: Arc<QualityAnalyzer>,
    /// Prosody analyzer.
    prosody_analyzer: Arc<ProsodyAnalyzer>,
    /// Speaker and emotion analyzer.
    speaker_analyzer: Arc<SpeakerAnalyzer>,
    /// Default analysis configuration.
    config: AudioAnalysisConfig,
    /// Metrics this analyzer can compute.
    supported_metrics: Vec<AudioMetric>,
    /// Static analyzer metadata.
    metadata: AudioAnalyzerMetadata,
}

impl AudioAnalyzerImpl {
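    /// Create a new analyzer with the given default configuration.
    ///
    /// A minimal usage sketch; the `voirs_recognizer` crate and module paths
    /// below are assumptions, so adjust them to the actual package layout:
    ///
    /// ```ignore
    /// use voirs_recognizer::analysis::AudioAnalyzerImpl;
    /// use voirs_recognizer::traits::{AudioAnalysisConfig, AudioAnalyzer};
    ///
    /// # async fn demo() -> Result<(), Box<dyn std::error::Error>> {
    /// let analyzer = AudioAnalyzerImpl::new(AudioAnalysisConfig::default()).await?;
    /// assert!(!analyzer.supported_metrics().is_empty());
    /// # Ok(())
    /// # }
    /// ```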
    pub async fn new(config: AudioAnalysisConfig) -> Result<Self, RecognitionError> {
        let quality_analyzer = Arc::new(QualityAnalyzer::new().await?);
        let prosody_analyzer = Arc::new(ProsodyAnalyzer::new().await?);
        let speaker_analyzer = Arc::new(SpeakerAnalyzer::new().await?);

        let supported_metrics = vec![
            AudioMetric::SNR,
            AudioMetric::THD,
            AudioMetric::SpectralCentroid,
            AudioMetric::SpectralRolloff,
            AudioMetric::ZeroCrossingRate,
            AudioMetric::MelFrequencyCepstralCoefficients,
            AudioMetric::ChromaFeatures,
            AudioMetric::SpectralContrast,
            AudioMetric::TonnetzFeatures,
            AudioMetric::RootMeanSquare,
        ];

        let metadata = AudioAnalyzerMetadata {
            name: "Comprehensive Audio Analyzer".to_string(),
            version: "1.0.0".to_string(),
            description:
                "Multi-dimensional audio analysis with quality, prosody, and speaker features"
                    .to_string(),
            supported_metrics: supported_metrics.clone(),
            capabilities: vec![
                AnalysisCapability::QualityMetrics,
                AnalysisCapability::ProsodyAnalysis,
                AnalysisCapability::SpeakerCharacteristics,
                AnalysisCapability::EmotionalAnalysis,
                AnalysisCapability::RealtimeAnalysis,
                AnalysisCapability::BatchProcessing,
                AnalysisCapability::StreamingAnalysis,
            ],
            processing_speed: 2.0,
        };

        Ok(Self {
            quality_analyzer,
            prosody_analyzer,
            speaker_analyzer,
            config,
            supported_metrics,
            metadata,
        })
    }

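    /// Create an analyzer whose default configuration is derived from the
    /// given backend selection.
    ///
    /// A minimal usage sketch; crate and module paths are assumptions:
    ///
    /// ```ignore
    /// use voirs_recognizer::analysis::{AudioAnalyzerBackend, AudioAnalyzerImpl};
    /// use voirs_recognizer::traits::AudioMetric;
    ///
    /// # async fn demo() -> Result<(), Box<dyn std::error::Error>> {
    /// let analyzer = AudioAnalyzerImpl::with_backend(AudioAnalyzerBackend::QualityFocused {
    ///     metrics: vec![AudioMetric::SNR, AudioMetric::THD],
    /// })
    /// .await?;
    /// # Ok(())
    /// # }
    /// ```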
    pub async fn with_backend(backend: AudioAnalyzerBackend) -> Result<Self, RecognitionError> {
        let config = match backend {
            AudioAnalyzerBackend::Comprehensive {
                quality_metrics,
                prosody_analysis,
                speaker_analysis,
                emotional_analysis,
            } => AudioAnalysisConfig {
                quality_metrics,
                prosody_analysis,
                speaker_analysis,
                emotional_analysis,
                ..Default::default()
            },
            AudioAnalyzerBackend::QualityFocused { metrics } => AudioAnalysisConfig {
                quality_metrics: true,
                prosody_analysis: false,
                speaker_analysis: false,
                emotional_analysis: false,
                quality_metrics_list: metrics,
                ..Default::default()
            },
            // Note: the requested feature lists are currently ignored; prosody
            // and speaker analysis run with their default feature sets.
            AudioAnalyzerBackend::ProsodyFocused { .. } => AudioAnalysisConfig {
                quality_metrics: false,
                prosody_analysis: true,
                speaker_analysis: false,
                emotional_analysis: false,
                ..Default::default()
            },
            AudioAnalyzerBackend::SpeakerFocused { .. } => AudioAnalysisConfig {
                quality_metrics: false,
                prosody_analysis: false,
                speaker_analysis: true,
                emotional_analysis: false,
                ..Default::default()
            },
        };

        Self::new(config).await
    }
}

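/// A sketch of consuming the streaming API; the `audio_stream` argument and
/// the crate paths are assumptions for illustration:
///
/// ```ignore
/// use tokio_stream::StreamExt;
/// use voirs_recognizer::traits::AudioAnalyzer;
///
/// # async fn demo(
/// #     analyzer: voirs_recognizer::analysis::AudioAnalyzerImpl,
/// #     audio_stream: voirs_recognizer::traits::AudioStream,
/// # ) -> Result<(), Box<dyn std::error::Error>> {
/// let mut analyses = analyzer.analyze_streaming(audio_stream, None).await?;
/// while let Some(analysis) = analyses.next().await {
///     let analysis = analysis?;
///     println!("chunk analyzed in {:?}", analysis.processing_duration);
/// }
/// # Ok(())
/// # }
/// ```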
#[async_trait::async_trait]
impl AudioAnalyzer for AudioAnalyzerImpl {
    async fn analyze(
        &self,
        audio: &voirs_sdk::AudioBuffer,
        config: Option<&AudioAnalysisConfig>,
    ) -> RecognitionResult<AudioAnalysis> {
        let config = config.unwrap_or(&self.config);
        let start_time = std::time::Instant::now();

        // Each stage runs only if enabled; disabled stages fall back to
        // default (empty/neutral) results.
        let mut quality_metrics = std::collections::HashMap::new();

        if config.quality_metrics {
            let quality_results = self
                .quality_analyzer
                .analyze_quality(audio, &config.quality_metrics_list)
                .await?;
            quality_metrics.extend(quality_results);
        }

        let prosody = if config.prosody_analysis {
            self.prosody_analyzer.analyze_prosody(audio).await?
        } else {
            ProsodyAnalysis::default()
        };

        let speaker_characteristics = if config.speaker_analysis {
            self.speaker_analyzer.analyze_speaker(audio).await?
        } else {
            SpeakerCharacteristics::default()
        };

        let emotional_analysis = if config.emotional_analysis {
            self.speaker_analyzer.analyze_emotion(audio).await?
        } else {
            EmotionalAnalysis::default()
        };

        let processing_duration = start_time.elapsed();

        Ok(AudioAnalysis {
            quality_metrics,
            prosody,
            speaker_characteristics,
            emotional_analysis,
            processing_duration: Some(processing_duration),
        })
    }

    async fn analyze_streaming(
        &self,
        mut audio_stream: AudioStream,
        config: Option<&AudioAnalysisConfig>,
    ) -> RecognitionResult<
        std::pin::Pin<
            Box<dyn tokio_stream::Stream<Item = RecognitionResult<AudioAnalysis>> + Send>,
        >,
    > {
        let config = config.cloned().unwrap_or_else(|| self.config.clone());
        let analyzer = self.clone();

        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();

        // Analyze each incoming chunk on a background task and forward the
        // results; stop when the receiving side is dropped.
        tokio::spawn(async move {
            use futures::StreamExt;

            while let Some(audio_chunk) = audio_stream.next().await {
                let analysis_result = analyzer.analyze(&audio_chunk, Some(&config)).await;

                if sender.send(analysis_result).is_err() {
                    break;
                }
            }
        });

        Ok(Box::pin(
            tokio_stream::wrappers::UnboundedReceiverStream::new(receiver),
        ))
    }

    fn supported_metrics(&self) -> Vec<AudioMetric> {
        self.supported_metrics.clone()
    }

    fn metadata(&self) -> AudioAnalyzerMetadata {
        self.metadata.clone()
    }

    fn supports_capability(&self, capability: AnalysisCapability) -> bool {
        self.metadata.capabilities.contains(&capability)
    }
}

// `Clone` is implemented manually; the sub-analyzers are shared through
// `Arc`, so cloning an analyzer is cheap.
impl Clone for AudioAnalyzerImpl {
    fn clone(&self) -> Self {
        Self {
            quality_analyzer: self.quality_analyzer.clone(),
            prosody_analyzer: self.prosody_analyzer.clone(),
            speaker_analyzer: self.speaker_analyzer.clone(),
            config: self.config.clone(),
            supported_metrics: self.supported_metrics.clone(),
            metadata: self.metadata.clone(),
        }
    }
}

/// Neutral emotional analysis, used when emotional analysis is disabled.
impl Default for EmotionalAnalysis {
    fn default() -> Self {
        Self {
            primary_emotion: Emotion::Neutral,
            emotion_scores: std::collections::HashMap::new(),
            intensity: 0.0,
            valence: 0.0,
            arousal: 0.0,
        }
    }
}

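/// Create a type-erased [`AudioAnalyzer`] for the given backend.
///
/// A minimal usage sketch; crate and module paths are assumptions:
///
/// ```ignore
/// use voirs_recognizer::analysis::{
///     create_audio_analyzer, AudioAnalyzerBackend, ProsodyFeature,
/// };
///
/// # async fn demo() -> Result<(), Box<dyn std::error::Error>> {
/// let analyzer = create_audio_analyzer(AudioAnalyzerBackend::ProsodyFocused {
///     features: vec![ProsodyFeature::Pitch, ProsodyFeature::SpeakingRate],
/// })
/// .await?;
/// # Ok(())
/// # }
/// ```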
pub async fn create_audio_analyzer(
    backend: AudioAnalyzerBackend,
) -> RecognitionResult<Arc<dyn AudioAnalyzer>> {
    let analyzer = AudioAnalyzerImpl::with_backend(backend).await?;
    Ok(Arc::new(analyzer))
}

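/// Return a recommended [`AudioAnalysisConfig`] for a common use case.
///
/// A minimal usage sketch:
///
/// ```ignore
/// let config = recommended_config_for_use_case(AnalysisUseCase::SpeechEvaluation);
/// assert!(config.quality_metrics && config.prosody_analysis);
/// ```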
#[must_use]
pub fn recommended_config_for_use_case(use_case: AnalysisUseCase) -> AudioAnalysisConfig {
    match use_case {
        AnalysisUseCase::QualityAssessment => AudioAnalysisConfig {
            quality_metrics: true,
            prosody_analysis: false,
            speaker_analysis: false,
            emotional_analysis: false,
            quality_metrics_list: vec![
                AudioMetric::SNR,
                AudioMetric::THD,
                AudioMetric::SpectralCentroid,
                AudioMetric::RootMeanSquare,
            ],
            ..Default::default()
        },
        AnalysisUseCase::SpeechEvaluation => AudioAnalysisConfig {
            quality_metrics: true,
            prosody_analysis: true,
            speaker_analysis: false,
            emotional_analysis: false,
            ..Default::default()
        },
        AnalysisUseCase::SpeakerIdentification => AudioAnalysisConfig {
            quality_metrics: false,
            prosody_analysis: false,
            speaker_analysis: true,
            emotional_analysis: false,
            ..Default::default()
        },
        AnalysisUseCase::EmotionRecognition => AudioAnalysisConfig {
            quality_metrics: false,
            prosody_analysis: true,
            speaker_analysis: false,
            emotional_analysis: true,
            ..Default::default()
        },
        AnalysisUseCase::Comprehensive => AudioAnalysisConfig::default(),
    }
}

/// Common analysis scenarios with recommended configurations.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AnalysisUseCase {
    /// Assess signal quality (SNR, THD, spectral metrics).
    QualityAssessment,
    /// Evaluate speech quality and prosody.
    SpeechEvaluation,
    /// Identify or characterize the speaker.
    SpeakerIdentification,
    /// Recognize emotional state from speech.
    EmotionRecognition,
    /// Run every analysis stage.
    Comprehensive,
}

#[cfg(test)]
mod tests {
    use super::*;
    use voirs_sdk::AudioBuffer;

    #[tokio::test]
    async fn test_audio_analyzer_creation() {
        let config = AudioAnalysisConfig::default();
        let analyzer = AudioAnalyzerImpl::new(config).await.unwrap();

        assert!(!analyzer.supported_metrics().is_empty());
        assert!(analyzer.supports_capability(AnalysisCapability::QualityMetrics));
        assert!(analyzer.supports_capability(AnalysisCapability::ProsodyAnalysis));
    }

    #[tokio::test]
    async fn test_comprehensive_analysis() {
        let backend = AudioAnalyzerBackend::Comprehensive {
            quality_metrics: true,
            prosody_analysis: true,
            speaker_analysis: true,
            emotional_analysis: true,
        };

        let analyzer = AudioAnalyzerImpl::with_backend(backend).await.unwrap();
        let audio = AudioBuffer::new(vec![0.1; 16000], 16000, 1);

        let result = analyzer.analyze(&audio, None).await.unwrap();

        // All stages were enabled, so each part of the result is populated.
        assert!(!result.quality_metrics.is_empty());
        assert!(result.prosody.pitch.mean_f0 >= 0.0);
        assert!(result.processing_duration.is_some());
    }

    #[tokio::test]
    async fn test_quality_focused_analysis() {
        let backend = AudioAnalyzerBackend::QualityFocused {
            metrics: vec![AudioMetric::SNR, AudioMetric::THD],
        };

        let analyzer = AudioAnalyzerImpl::with_backend(backend).await.unwrap();
        let audio = AudioBuffer::new(vec![0.1; 16000], 16000, 1);

        let result = analyzer.analyze(&audio, None).await.unwrap();

        // Quality metrics are computed, while prosody stays at its default.
        assert!(!result.quality_metrics.is_empty());
        assert_eq!(result.prosody.pitch.mean_f0, 0.0);
    }

    #[tokio::test]
    async fn test_use_case_configs() {
        let quality_config = recommended_config_for_use_case(AnalysisUseCase::QualityAssessment);
        assert!(quality_config.quality_metrics);
        assert!(!quality_config.emotional_analysis);

        let emotion_config = recommended_config_for_use_case(AnalysisUseCase::EmotionRecognition);
        assert!(emotion_config.emotional_analysis);
        assert!(emotion_config.prosody_analysis);

        let comprehensive_config = recommended_config_for_use_case(AnalysisUseCase::Comprehensive);
        assert!(comprehensive_config.quality_metrics);
        assert!(comprehensive_config.prosody_analysis);
        assert!(comprehensive_config.speaker_analysis);
        assert!(comprehensive_config.emotional_analysis);
    }
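
    // Sketch of the convenience constructor used through a trait object; the
    // capability metadata is static, so this assertion holds for any backend.
    #[tokio::test]
    async fn test_create_audio_analyzer_trait_object() {
        let backend = AudioAnalyzerBackend::SpeakerFocused {
            features: vec![SpeakerFeature::Gender],
        };

        let analyzer = create_audio_analyzer(backend).await.unwrap();
        assert!(analyzer.supports_capability(AnalysisCapability::SpeakerCharacteristics));
    }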
}