use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::pin::Pin;
use std::time::Duration;
use tokio_stream::Stream;
use voirs_sdk::{AudioBuffer, LanguageCode, Phoneme, VoirsError};

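/// Result type returned by all recognition operations in this crate.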
pub type RecognitionResult<T> = Result<T, VoirsError>;

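/// Boxed stream of audio buffers used as input for streaming recognition.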
pub type AudioStream = Pin<Box<dyn Stream<Item = AudioBuffer> + Send>>;

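/// Boxed stream of incremental transcript chunks produced by streaming recognition.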
pub type TranscriptStream = Pin<Box<dyn Stream<Item = RecognitionResult<TranscriptChunk>> + Send>>;

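/// Complete transcription of an audio buffer, including word timestamps,
/// sentence boundaries, and confidence information.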
#[derive(Debug, Clone, PartialEq)]
pub struct Transcript {
    pub text: String,
    pub language: LanguageCode,
    pub confidence: f32,
    pub word_timestamps: Vec<WordTimestamp>,
    pub sentence_boundaries: Vec<SentenceBoundary>,
    pub processing_duration: Option<Duration>,
}

#[derive(Debug, Clone, PartialEq)]
pub struct WordTimestamp {
    pub word: String,
    pub start_time: f32,
    pub end_time: f32,
    pub confidence: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub struct SentenceBoundary {
    pub start_time: f32,
    pub end_time: f32,
    pub text: String,
    pub confidence: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub struct TranscriptChunk {
    pub text: String,
    pub is_final: bool,
    pub start_time: f32,
    pub end_time: f32,
    pub confidence: f32,
}

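/// Result of aligning phonemes against audio, with per-phoneme and per-word timing.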
#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeAlignment {
    pub phonemes: Vec<AlignedPhoneme>,
    pub total_duration: f32,
    pub alignment_confidence: f32,
    pub word_alignments: Vec<WordAlignment>,
}

#[derive(Debug, Clone, PartialEq)]
pub struct AlignedPhoneme {
    pub phoneme: Phoneme,
    pub start_time: f32,
    pub end_time: f32,
    pub confidence: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub struct WordAlignment {
    pub word: String,
    pub start_time: f32,
    pub end_time: f32,
    pub phonemes: Vec<AlignedPhoneme>,
    pub confidence: f32,
}

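/// Aggregate audio analysis result covering quality metrics, prosody,
/// speaker characteristics, and emotional content.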
#[derive(Debug, Clone, PartialEq)]
pub struct AudioAnalysis {
    pub quality_metrics: HashMap<String, f32>,
    pub prosody: ProsodyAnalysis,
    pub speaker_characteristics: SpeakerCharacteristics,
    pub emotional_analysis: EmotionalAnalysis,
    pub processing_duration: Option<Duration>,
}

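/// Prosodic features extracted from audio: pitch, rhythm, stress, intonation, and energy.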
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ProsodyAnalysis {
    pub pitch: PitchAnalysis,
    pub rhythm: RhythmAnalysis,
    pub stress: StressAnalysis,
    pub intonation: IntonationAnalysis,
    pub energy: EnergyAnalysis,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct PitchAnalysis {
    pub mean_f0: f32,
    pub f0_std: f32,
    pub f0_range: f32,
    pub pitch_contour: Vec<f32>,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct RhythmAnalysis {
    pub speaking_rate: f32,
    pub pause_statistics: PauseStatistics,
    pub regularity_score: f32,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct PauseStatistics {
    pub total_pause_duration: f32,
    pub average_pause_duration: f32,
    pub pause_count: usize,
    pub pause_positions: Vec<f32>,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct StressAnalysis {
    pub stress_pattern: Vec<f32>,
    pub primary_stress: Vec<f32>,
    pub secondary_stress: Vec<f32>,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct IntonationAnalysis {
    pub pattern_type: IntonationPattern,
    pub boundary_tones: Vec<BoundaryTone>,
    pub pitch_accents: Vec<PitchAccent>,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct EnergyAnalysis {
    pub mean_energy: f32,
    pub energy_std: f32,
    pub energy_range: f32,
    pub energy_contour: Vec<f32>,
}

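/// High-level intonation pattern detected for an utterance.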
#[derive(Debug, Clone, PartialEq, Default)]
pub enum IntonationPattern {
    #[default]
    Declarative,
    Interrogative,
    Exclamative,
    Imperative,
    Mixed,
}

#[derive(Debug, Clone, PartialEq)]
pub struct BoundaryTone {
    pub time: f32,
    pub tone_type: ToneType,
    pub confidence: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub struct PitchAccent {
    pub time: f32,
    pub accent_type: AccentType,
    pub confidence: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub enum ToneType {
    Rising,
    Falling,
    Level,
    RisingFalling,
    FallingRising,
}

#[derive(Debug, Clone, PartialEq)]
pub enum AccentType {
    Primary,
    Secondary,
    Tertiary,
}

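/// Characteristics of the speaker inferred from the audio signal.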
#[derive(Debug, Clone, PartialEq, Default)]
pub struct SpeakerCharacteristics {
    pub gender: Option<Gender>,
    pub age_range: Option<AgeRange>,
    pub voice_characteristics: VoiceCharacteristics,
    pub accent: Option<AccentInfo>,
}

#[derive(Debug, Clone, PartialEq)]
pub enum Gender {
    Male,
    Female,
    Other,
}

#[derive(Debug, Clone, PartialEq)]
pub enum AgeRange {
    Child,
    Teen,
    Adult,
    Senior,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct VoiceCharacteristics {
    pub f0_range: (f32, f32),
    pub formants: Vec<f32>,
    pub voice_quality: VoiceQuality,
}

#[derive(Debug, Clone, PartialEq, Default)]
pub struct VoiceQuality {
    pub jitter: f32,
    pub shimmer: f32,
    pub hnr: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub struct AccentInfo {
    pub accent_type: String,
    pub confidence: f32,
    pub regional_indicators: Vec<String>,
}

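/// Emotional content detected in the audio, with per-emotion scores and valence/arousal values.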
#[derive(Debug, Clone, PartialEq)]
pub struct EmotionalAnalysis {
    pub primary_emotion: Emotion,
    pub emotion_scores: HashMap<Emotion, f32>,
    pub intensity: f32,
    pub valence: f32,
    pub arousal: f32,
}

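/// Emotion categories used in emotional analysis.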
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Emotion {
    Joy,
    Sadness,
    Anger,
    Fear,
    Surprise,
    Disgust,
    Neutral,
    Contempt,
    Pride,
    Shame,
}

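/// Configuration for automatic speech recognition (ASR).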
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ASRConfig {
    pub language: Option<LanguageCode>,
    pub word_timestamps: bool,
    pub sentence_segmentation: bool,
    pub confidence_threshold: f32,
    pub max_duration: Option<f32>,
    pub language_detection: bool,
    pub custom_vocabulary: Option<Vec<String>>,
    pub model_variant: Option<String>,
    pub whisper_model_size: Option<String>,
    pub preferred_models: Vec<String>,
    pub enable_voice_activity_detection: bool,
    pub chunk_duration_ms: u32,
}

impl Default for ASRConfig {
    fn default() -> Self {
        Self {
            language: None,
            word_timestamps: true,
            sentence_segmentation: true,
            confidence_threshold: 0.5,
            max_duration: Some(60.0),
            language_detection: true,
            custom_vocabulary: None,
            model_variant: None,
            whisper_model_size: Some("base".to_string()),
            preferred_models: vec!["whisper".to_string()],
            enable_voice_activity_detection: true,
            chunk_duration_ms: 30000,
        }
    }
}

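/// Configuration for phoneme recognition and alignment.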
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PhonemeRecognitionConfig {
    pub language: LanguageCode,
    pub word_alignment: bool,
    pub alignment_method: AlignmentMethod,
    pub confidence_threshold: f32,
    pub pronunciation_dict: Option<HashMap<String, Vec<String>>>,
}

impl Default for PhonemeRecognitionConfig {
    fn default() -> Self {
        Self {
            language: LanguageCode::EnUs,
            word_alignment: true,
            alignment_method: AlignmentMethod::Forced,
            confidence_threshold: 0.3,
            pronunciation_dict: None,
        }
    }
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum AlignmentMethod {
    Forced,
    Automatic,
    Hybrid,
}

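/// Configuration for audio analysis.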
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AudioAnalysisConfig {
    pub quality_metrics: bool,
    pub prosody_analysis: bool,
    pub speaker_analysis: bool,
    pub emotional_analysis: bool,
    pub quality_metrics_list: Vec<AudioMetric>,
    pub frame_size: usize,
    pub hop_size: usize,
}

impl Default for AudioAnalysisConfig {
    fn default() -> Self {
        Self {
            quality_metrics: true,
            prosody_analysis: true,
            speaker_analysis: true,
            emotional_analysis: true,
            quality_metrics_list: vec![
                AudioMetric::SNR,
                AudioMetric::THD,
                AudioMetric::SpectralCentroid,
                AudioMetric::SpectralRolloff,
            ],
            frame_size: 1024,
            hop_size: 512,
        }
    }
}

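/// Audio quality and spectral metrics that an analyzer can compute.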
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum AudioMetric {
    SNR,
    THD,
    SpectralCentroid,
    SpectralRolloff,
    ZeroCrossingRate,
    MelFrequencyCepstralCoefficients,
    ChromaFeatures,
    SpectralContrast,
    TonnetzFeatures,
    RootMeanSquare,
}

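/// Metadata describing an ASR model implementation, including benchmarks and supported features.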
#[derive(Debug, Clone, PartialEq)]
pub struct ASRMetadata {
    pub name: String,
    pub version: String,
    pub description: String,
    pub supported_languages: Vec<LanguageCode>,
    pub architecture: String,
    pub model_size_mb: f32,
    pub inference_speed: f32,
    pub wer_benchmarks: HashMap<LanguageCode, f32>,
    pub supported_features: Vec<ASRFeature>,
}

#[derive(Debug, Clone, PartialEq)]
pub enum ASRFeature {
    WordTimestamps,
    SentenceSegmentation,
    LanguageDetection,
    NoiseRobustness,
    StreamingInference,
    CustomVocabulary,
    SpeakerDiarization,
    EmotionRecognition,
}

#[derive(Debug, Clone, PartialEq)]
pub struct PhonemeRecognizerMetadata {
    pub name: String,
    pub version: String,
    pub description: String,
    pub supported_languages: Vec<LanguageCode>,
    pub alignment_methods: Vec<AlignmentMethod>,
    pub alignment_accuracy: f32,
    pub supported_features: Vec<PhonemeRecognitionFeature>,
}

#[derive(Debug, Clone, PartialEq)]
pub enum PhonemeRecognitionFeature {
    WordAlignment,
    CustomPronunciation,
    MultiLanguage,
    RealTimeAlignment,
    ConfidenceScoring,
    PronunciationAssessment,
}

#[derive(Debug, Clone, PartialEq)]
pub struct AudioAnalyzerMetadata {
    pub name: String,
    pub version: String,
    pub description: String,
    pub supported_metrics: Vec<AudioMetric>,
    pub capabilities: Vec<AnalysisCapability>,
    pub processing_speed: f32,
}

#[derive(Debug, Clone, PartialEq)]
pub enum AnalysisCapability {
    QualityMetrics,
    ProsodyAnalysis,
    SpeakerCharacteristics,
    EmotionalAnalysis,
    RealtimeAnalysis,
    BatchProcessing,
    StreamingAnalysis,
}

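/// Trait implemented by automatic speech recognition models.
///
/// Provides batch and streaming transcription plus metadata and capability
/// queries. `detect_language` has a default implementation that returns an
/// error for models without language detection support.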
#[async_trait]
pub trait ASRModel: Send + Sync {
    async fn transcribe(
        &self,
        audio: &AudioBuffer,
        config: Option<&ASRConfig>,
    ) -> RecognitionResult<Transcript>;

    async fn transcribe_streaming(
        &self,
        audio_stream: AudioStream,
        config: Option<&ASRConfig>,
    ) -> RecognitionResult<TranscriptStream>;

    fn supported_languages(&self) -> Vec<LanguageCode>;

    fn metadata(&self) -> ASRMetadata;

    fn supports_feature(&self, feature: ASRFeature) -> bool;

    async fn detect_language(&self, _audio: &AudioBuffer) -> RecognitionResult<LanguageCode> {
        Err(VoirsError::ModelError {
            model_type: voirs_sdk::error::ModelType::ASR,
            message: "Language detection not implemented for this model".to_string(),
            source: None,
        })
    }
}

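/// Trait for phoneme recognition and alignment of audio against expected phonemes or text.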
#[async_trait]
pub trait PhonemeRecognizer: Send + Sync {
    async fn recognize_phonemes(
        &self,
        audio: &AudioBuffer,
        config: Option<&PhonemeRecognitionConfig>,
    ) -> RecognitionResult<Vec<Phoneme>>;

    async fn align_phonemes(
        &self,
        audio: &AudioBuffer,
        expected: &[Phoneme],
        config: Option<&PhonemeRecognitionConfig>,
    ) -> RecognitionResult<PhonemeAlignment>;

    async fn align_text(
        &self,
        audio: &AudioBuffer,
        text: &str,
        config: Option<&PhonemeRecognitionConfig>,
    ) -> RecognitionResult<PhonemeAlignment>;

    fn metadata(&self) -> PhonemeRecognizerMetadata;

    fn supports_feature(&self, feature: PhonemeRecognitionFeature) -> bool;
}

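/// Trait for analyzers that extract quality, prosody, speaker, and emotion information from audio.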
#[async_trait]
pub trait AudioAnalyzer: Send + Sync {
    async fn analyze(
        &self,
        audio: &AudioBuffer,
        config: Option<&AudioAnalysisConfig>,
    ) -> RecognitionResult<AudioAnalysis>;

    async fn analyze_streaming(
        &self,
        audio_stream: AudioStream,
        config: Option<&AudioAnalysisConfig>,
    ) -> RecognitionResult<Pin<Box<dyn Stream<Item = RecognitionResult<AudioAnalysis>> + Send>>>;

    fn supported_metrics(&self) -> Vec<AudioMetric>;

    fn metadata(&self) -> AudioAnalyzerMetadata;

    fn supports_capability(&self, capability: AnalysisCapability) -> bool;
}

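/// Common interface for components whose configuration can be updated at runtime.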
pub trait Configurable {
    type Config;

    fn configure(&mut self, config: &Self::Config) -> RecognitionResult<()>;

    fn get_config(&self) -> &Self::Config;
}

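/// Interface for components that can process multiple inputs in a single batch.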
#[async_trait]
pub trait BatchProcessing<Input, Output> {
    async fn process_batch(&self, inputs: &[Input]) -> RecognitionResult<Vec<Output>>;

    fn optimal_batch_size(&self) -> usize;
}

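/// Interface for loading, unloading, and inspecting model resources.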
pub trait ResourceManager {
    fn load_resources(&mut self) -> RecognitionResult<()>;

    fn unload_resources(&mut self) -> RecognitionResult<()>;

    fn is_loaded(&self) -> bool;

    fn resource_usage(&self) -> ResourceUsage;
}

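/// Snapshot of resource consumption reported by a `ResourceManager`.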
#[derive(Debug, Clone)]
pub struct ResourceUsage {
    pub memory_mb: f32,
    pub cpu_percent: f32,
    pub gpu_percent: Option<f32>,
    pub gpu_memory_mb: Option<f32>,
}