1#![allow(clippy::cast_precision_loss)] #![allow(clippy::cast_possible_truncation)] #![allow(clippy::cast_sign_loss)] #![allow(clippy::missing_errors_doc)] #![allow(clippy::missing_panics_doc)] #![allow(clippy::unused_self)] #![allow(clippy::must_use_candidate)] #![allow(clippy::doc_markdown)] #![allow(clippy::unnecessary_wraps)] #![allow(clippy::float_cmp)] #![allow(clippy::match_same_arms)] #![allow(clippy::module_name_repetitions)] #![allow(clippy::struct_excessive_bools)] #![allow(clippy::too_many_lines)] #![allow(clippy::needless_pass_by_value)] #![allow(clippy::similar_names)] #![allow(clippy::unused_async)] #![allow(clippy::needless_range_loop)] #![allow(clippy::uninlined_format_args)] #![allow(clippy::manual_clamp)] #![allow(clippy::return_self_not_must_use)] #![allow(clippy::cast_possible_wrap)] #![allow(clippy::cast_lossless)] #![allow(clippy::wildcard_imports)] #![allow(clippy::format_push_string)] #![allow(clippy::redundant_closure_for_method_calls)] use async_trait::async_trait;
62use futures::Stream;
63use serde::{Deserialize, Serialize};
64use std::collections::HashMap;
65use thiserror::Error;
66
/// Crate-wide result alias specialized to [`VocoderError`].
pub type Result<T> = std::result::Result<T, VocoderError>;
69
/// Error type for all vocoder operations in this crate.
///
/// Message-carrying variants wrap a human-readable description; the
/// `#[from]` variants convert automatically from their source errors
/// via the `?` operator.
#[derive(Error, Debug)]
pub enum VocoderError {
    /// Failure during mel-to-waveform synthesis itself.
    #[error("Vocoding failed: {0}")]
    VocodingError(String),

    /// Failure while loading or locating a vocoder model.
    #[error("Model loading failed: {0}")]
    ModelError(String),

    /// Caller-supplied input was rejected.
    #[error("Invalid input: {0}")]
    InputError(String),

    /// A mel spectrogram failed validation.
    #[error("Invalid mel spectrogram: {0}")]
    InvalidMelSpectrogram(String),

    /// Configuration problem (long-form variant).
    // NOTE(review): overlaps with `ConfigError` below — candidates for
    // consolidation in a future breaking release; confirm which callers
    // match on each before merging.
    #[error("Configuration error: {0}")]
    ConfigurationError(String),

    /// Configuration problem (short-form variant; used by `VocoderManager`
    /// when no default vocoder is set).
    #[error("Config error: {0}")]
    ConfigError(String),

    /// Error raised by the streaming pipeline.
    #[error("Streaming error: {0}")]
    StreamingError(String),

    /// Generic runtime failure.
    #[error("Runtime error: {0}")]
    RuntimeError(String),

    /// Error during audio processing.
    #[error("Processing error: {0}")]
    ProcessingError(String),

    /// Catch-all for errors that fit no other variant.
    #[error("Other error: {0}")]
    Other(String),

    /// Error propagated from the Candle tensor backend (feature-gated).
    #[cfg(feature = "candle")]
    #[error("Candle error: {0}")]
    CandleError(#[from] candle_core::Error),

    /// I/O error (file access, model reads, etc.).
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// FFT error propagated from the `scirs2_fft` crate.
    #[error("FFT error: {0}")]
    FFTError(#[from] scirs2_fft::FFTError),
}
113
/// Supported language/locale identifiers (BCP 47-style codes; see
/// [`LanguageCode::as_str`] for the canonical string form).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum LanguageCode {
    /// English (United States) — "en-US".
    EnUs,
    /// English (United Kingdom) — "en-GB".
    EnGb,
    /// Japanese — "ja".
    Ja,
    /// Chinese (Simplified, China) — "zh-CN".
    ZhCn,
    /// Korean — "ko".
    Ko,
    /// German — "de".
    De,
    /// French — "fr".
    Fr,
    /// Spanish — "es".
    Es,
}
134
135impl LanguageCode {
136 pub fn as_str(&self) -> &'static str {
138 match self {
139 LanguageCode::EnUs => "en-US",
140 LanguageCode::EnGb => "en-GB",
141 LanguageCode::Ja => "ja",
142 LanguageCode::ZhCn => "zh-CN",
143 LanguageCode::Ko => "ko",
144 LanguageCode::De => "de",
145 LanguageCode::Fr => "fr",
146 LanguageCode::Es => "es",
147 }
148 }
149}
150
/// Mel spectrogram container used as vocoder input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MelSpectrogram {
    /// Spectrogram values indexed as `data[mel_band][frame]`.
    pub data: Vec<Vec<f32>>,
    /// Number of mel frequency bands (rows of `data`).
    pub n_mels: usize,
    /// Number of time frames (columns of `data`).
    pub n_frames: usize,
    /// Sample rate in Hz of the audio this spectrogram describes.
    pub sample_rate: u32,
    /// Hop length in samples between consecutive frames.
    pub hop_length: u32,
}
165
166impl MelSpectrogram {
167 pub fn new(data: Vec<Vec<f32>>, sample_rate: u32, hop_length: u32) -> Self {
169 let n_mels = data.len();
170 let n_frames = data.first().map_or(0, |row| row.len());
171
172 Self {
173 data,
174 n_mels,
175 n_frames,
176 sample_rate,
177 hop_length,
178 }
179 }
180
181 pub fn duration(&self) -> f32 {
183 (self.n_frames as u32 * self.hop_length) as f32 / self.sample_rate as f32
184 }
185}
186
/// Per-request synthesis controls passed to [`Vocoder::vocode`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SynthesisConfig {
    /// Playback speed factor (1.0 = normal speed).
    pub speed: f32,
    /// Pitch shift amount (0.0 = unshifted; units not defined here —
    /// presumably semitones, TODO confirm against backend usage).
    pub pitch_shift: f32,
    /// Energy/volume scale (1.0 = unchanged).
    pub energy: f32,
    /// Optional speaker identity for multi-speaker models.
    pub speaker_id: Option<u32>,
    /// Optional RNG seed for reproducible synthesis.
    pub seed: Option<u64>,
}
201
202impl Default for SynthesisConfig {
203 fn default() -> Self {
204 Self {
205 speed: 1.0,
206 pitch_shift: 0.0,
207 energy: 1.0,
208 speaker_id: None,
209 seed: None,
210 }
211 }
212}
213
/// In-memory PCM audio: interleaved `f32` samples plus format metadata.
#[derive(Debug, Clone)]
pub struct AudioBuffer {
    samples: Vec<f32>,
    sample_rate: u32,
    channels: u32,
}

impl AudioBuffer {
    /// Wraps raw interleaved samples with their format description.
    pub fn new(samples: Vec<f32>, sample_rate: u32, channels: u32) -> Self {
        Self {
            samples,
            sample_rate,
            channels,
        }
    }

    /// All-zero buffer covering `duration` seconds.
    pub fn silence(duration: f32, sample_rate: u32, channels: u32) -> Self {
        let num_samples = (duration * sample_rate as f32 * channels as f32) as usize;
        Self::new(vec![0.0; num_samples], sample_rate, channels)
    }

    /// Mono buffer from raw samples; `sample_rate` is truncated to `u32`.
    pub fn from_samples(samples: Vec<f32>, sample_rate: f32) -> Self {
        Self::new(samples, sample_rate as u32, 1)
    }

    /// Mono sine wave of the given frequency, duration, and amplitude.
    pub fn sine_wave(frequency: f32, duration: f32, sample_rate: u32, amplitude: f32) -> Self {
        let num_samples = (duration * sample_rate as f32) as usize;
        let samples: Vec<f32> = (0..num_samples)
            .map(|i| {
                let t = i as f32 / sample_rate as f32;
                amplitude * (2.0 * std::f32::consts::PI * frequency * t).sin()
            })
            .collect();

        Self::new(samples, sample_rate, 1)
    }

    /// Duration in seconds: `len / (sample_rate * channels)`.
    pub fn duration(&self) -> f32 {
        self.samples.len() as f32 / (self.sample_rate * self.channels) as f32
    }

    /// Sample rate in Hz.
    pub fn sample_rate(&self) -> u32 {
        self.sample_rate
    }

    /// Number of interleaved channels.
    pub fn channels(&self) -> u32 {
        self.channels
    }

    /// True when the buffer holds no samples.
    pub fn is_empty(&self) -> bool {
        self.samples.is_empty()
    }

    /// Read-only view of the interleaved samples.
    pub fn samples(&self) -> &[f32] {
        self.samples.as_slice()
    }

    /// Mutable view of the interleaved samples.
    pub fn samples_mut(&mut self) -> &mut [f32] {
        self.samples.as_mut_slice()
    }

    /// Total sample count across all channels.
    pub fn len(&self) -> usize {
        self.samples.len()
    }

    /// Largest absolute sample value (0.0 for an empty buffer).
    pub fn peak_amplitude(&self) -> f32 {
        self.samples
            .iter()
            .fold(0.0_f32, |peak, &s| peak.max(s.abs()))
    }

    /// Scales all samples so the peak equals `target_peak`.
    /// A silent buffer is left untouched (avoids division by zero).
    pub fn normalize_to_peak(&mut self, target_peak: f32) {
        let current_peak = self.peak_amplitude();
        if current_peak > 0.0 {
            let gain = target_peak / current_peak;
            self.samples.iter_mut().for_each(|s| *s *= gain);
        }
    }
}
311
/// Optional capabilities a [`Vocoder`] implementation may advertise
/// via [`Vocoder::supports`].
// NOTE(review): several variants look like long/short aliases
// (`EmotionConditioning`/`Emotion`, `SingingVoice`/`Singing`,
// `SpatialAudio`/`Spatial`) — confirm caller usage before consolidating.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum VocoderFeature {
    /// Baseline mel-to-audio synthesis.
    Base,
    /// Incremental synthesis from a stream of mel chunks.
    StreamingInference,
    /// Synthesis of multiple spectrograms in one call.
    BatchProcessing,
    /// GPU-accelerated inference.
    GpuAcceleration,
    /// High-quality synthesis mode.
    HighQuality,
    /// Real-time capable processing.
    RealtimeProcessing,
    /// Speed-optimized inference.
    FastInference,
    /// Emotion-conditioned synthesis.
    EmotionConditioning,
    /// Emotion support (short alias; see NOTE above).
    Emotion,
    /// Voice conversion support.
    VoiceConversion,
    /// Age transformation of the voice.
    AgeTransformation,
    /// Gender transformation of the voice.
    GenderTransformation,
    /// Voice morphing support.
    VoiceMorphing,
    /// Singing voice synthesis.
    SingingVoice,
    /// Singing support (short alias; see NOTE above).
    Singing,
    /// Spatial audio rendering.
    SpatialAudio,
    /// Spatial support (short alias; see NOTE above).
    Spatial,
}
350
/// Descriptive information reported by a [`Vocoder`] implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocoderMetadata {
    /// Human-readable vocoder name.
    pub name: String,
    /// Implementation version string.
    pub version: String,
    /// Model architecture label (e.g. "HiFi-GAN", "Sine Wave").
    pub architecture: String,
    /// Output sample rate in Hz.
    pub sample_rate: u32,
    /// Expected number of mel channels in the input.
    pub mel_channels: u32,
    /// Typical processing latency in milliseconds.
    pub latency_ms: f32,
    /// Subjective quality score — scale not defined here; TODO confirm
    /// the intended range against the backend implementations.
    pub quality_score: f32,
}
369
/// Core async interface implemented by every mel-to-waveform backend.
///
/// Implementations must be `Send + Sync` so they can be shared across
/// async tasks (e.g. boxed inside `VocoderManager`).
#[async_trait]
pub trait Vocoder: Send + Sync {
    /// Converts one mel spectrogram into audio.
    ///
    /// `config` optionally adjusts synthesis (speed/pitch/energy/speaker);
    /// `None` means implementation defaults.
    ///
    /// # Errors
    /// Returns a [`VocoderError`] when synthesis fails.
    async fn vocode(
        &self,
        mel: &MelSpectrogram,
        config: Option<&SynthesisConfig>,
    ) -> Result<AudioBuffer>;

    /// Converts a stream of mel spectrograms into a stream of audio
    /// buffers; the same `config` applies to every chunk.
    ///
    /// # Errors
    /// Returns a [`VocoderError`] when the stream cannot be set up;
    /// per-chunk failures surface as `Err` items in the output stream.
    async fn vocode_stream(
        &self,
        mel_stream: Box<dyn Stream<Item = MelSpectrogram> + Send + Unpin>,
        config: Option<&SynthesisConfig>,
    ) -> Result<Box<dyn Stream<Item = Result<AudioBuffer>> + Send + Unpin>>;

    /// Vocodes a batch of spectrograms; when `configs` is provided,
    /// `configs[i]` applies to `mels[i]`.
    ///
    /// # Errors
    /// Returns a [`VocoderError`] when synthesis fails.
    async fn vocode_batch(
        &self,
        mels: &[MelSpectrogram],
        configs: Option<&[SynthesisConfig]>,
    ) -> Result<Vec<AudioBuffer>>;

    /// Static description of this vocoder (name, rates, quality).
    fn metadata(&self) -> VocoderMetadata;

    /// Whether this implementation advertises the given capability.
    fn supports(&self, feature: VocoderFeature) -> bool;
}
465
466pub mod adaptive_quality;
467pub mod audio;
468pub mod backends;
469pub mod broadcast_quality;
470pub mod cache;
471pub mod codecs;
472pub mod comprehensive_quality_metrics;
473pub mod conditioning;
474pub mod config;
475pub mod containers;
476pub mod conversion;
477pub mod drivers;
478pub mod effects;
479pub mod hifigan;
480pub mod loss;
481pub mod metrics;
482pub mod ml;
483pub mod models;
484pub mod optimization_paths;
485pub mod parallel;
486pub mod performance;
487pub mod post_processing;
488pub mod profiling;
489pub mod simd;
490pub mod streaming;
491pub mod utils;
492pub mod waveglow;
493
/// Convenience re-exports of the most commonly used crate items.
///
/// Glob-importing this module (`use …::prelude::*;`) brings the core
/// traits, types, and audio-processing helpers into scope in one line.
pub mod prelude {
    pub use crate::effects::AudioQualityMetrics;
    pub use crate::utils::audio_processing::{
        apply_adaptive_noise_gate, apply_formant_enhancement, apply_intelligent_agc,
        apply_psychoacoustic_masking, apply_stereo_widening, calculate_audio_quality_metrics,
        calculate_spectral_statistics, crossfade_audio, CrossfadeType, SpectralStatistics,
    };
    pub use crate::{
        adaptive_quality::{
            AdaptationStats, AdaptiveConfig, AdaptiveQualityController, PrecisionMode,
            QualityAdjustment, QualityTarget,
        },
        conditioning::{
            ConditioningConfigBuilder, EnhancementConfig, ProsodyConfig, SpeakerConfig,
            VocoderConditioner, VocoderConditioningConfig, VoiceCharacteristics,
        },
        conversion::{VoiceConversionConfig, VoiceConverter, VoiceMorpher},
        hifigan::{EmotionConfig, EmotionVocodingParams},
        performance::{
            PerformanceAlert, PerformanceMetrics, PerformanceMonitor, PerformanceStatistics,
            PerformanceThresholds,
        },
        AudioBuffer, DummyVocoder, HiFiGanVocoder, LanguageCode, MelSpectrogram, Result,
        SynthesisConfig, Vocoder, VocoderError, VocoderFeature, VocoderManager, VocoderMetadata,
    };
    // Re-exported so downstream crates can implement `Vocoder` without
    // depending on `async-trait` directly.
    pub use async_trait::async_trait;
}
522
523pub use hifigan::HiFiGanVocoder;
525pub use models::hifigan::{HiFiGanConfig, HiFiGanVariant, HiFiGanVariants};
526pub use streaming::{StreamHandle, StreamingPipeline, StreamingStats, StreamingVocoder};
527
/// Registry that owns multiple named [`Vocoder`] backends and routes
/// trait calls to a designated default.
pub struct VocoderManager {
    // Registered vocoders keyed by caller-supplied name.
    vocoders: HashMap<String, Box<dyn Vocoder>>,
    // Name of the vocoder used for delegated calls; set automatically
    // to the first vocoder added (see `add_vocoder`).
    default_vocoder: Option<String>,
}
535
536impl VocoderManager {
537 pub fn new() -> Self {
539 Self {
540 vocoders: HashMap::new(),
541 default_vocoder: None,
542 }
543 }
544
545 pub fn add_vocoder(&mut self, name: String, vocoder: Box<dyn Vocoder>) {
547 self.vocoders.insert(name.clone(), vocoder);
548
549 if self.default_vocoder.is_none() {
551 self.default_vocoder = Some(name);
552 }
553 }
554
555 pub fn set_default_vocoder(&mut self, name: String) {
557 if self.vocoders.contains_key(&name) {
558 self.default_vocoder = Some(name);
559 }
560 }
561
562 pub fn get_vocoder(&self, name: &str) -> Result<&dyn Vocoder> {
564 self.vocoders
565 .get(name)
566 .map(|v| v.as_ref())
567 .ok_or_else(|| VocoderError::ModelError(format!("Vocoder '{name}' not found")))
568 }
569
570 pub fn get_default_vocoder(&self) -> Result<&dyn Vocoder> {
572 let name = self
573 .default_vocoder
574 .as_ref()
575 .ok_or_else(|| VocoderError::ConfigError("No default vocoder set".to_string()))?;
576 self.get_vocoder(name)
577 }
578
579 pub fn list_vocoders(&self) -> Vec<&str> {
581 self.vocoders.keys().map(|s| s.as_str()).collect()
582 }
583}
584
impl Default for VocoderManager {
    /// Equivalent to [`VocoderManager::new`]: an empty registry with
    /// no default vocoder.
    fn default() -> Self {
        Self::new()
    }
}
590
591#[async_trait]
592impl Vocoder for VocoderManager {
593 async fn vocode(
594 &self,
595 mel: &MelSpectrogram,
596 config: Option<&SynthesisConfig>,
597 ) -> Result<AudioBuffer> {
598 let vocoder = self.get_default_vocoder()?;
599 vocoder.vocode(mel, config).await
600 }
601
602 async fn vocode_stream(
603 &self,
604 mel_stream: Box<dyn Stream<Item = MelSpectrogram> + Send + Unpin>,
605 config: Option<&SynthesisConfig>,
606 ) -> Result<Box<dyn Stream<Item = Result<AudioBuffer>> + Send + Unpin>> {
607 let vocoder = self.get_default_vocoder()?;
608 vocoder.vocode_stream(mel_stream, config).await
609 }
610
611 async fn vocode_batch(
612 &self,
613 mels: &[MelSpectrogram],
614 configs: Option<&[SynthesisConfig]>,
615 ) -> Result<Vec<AudioBuffer>> {
616 let vocoder = self.get_default_vocoder()?;
617 vocoder.vocode_batch(mels, configs).await
618 }
619
620 fn metadata(&self) -> VocoderMetadata {
621 if let Ok(vocoder) = self.get_default_vocoder() {
622 vocoder.metadata()
623 } else {
624 VocoderMetadata {
625 name: "Vocoder Manager".to_string(),
626 version: env!("CARGO_PKG_VERSION").to_string(),
627 architecture: "Manager".to_string(),
628 sample_rate: 22050,
629 mel_channels: 80,
630 latency_ms: 0.0,
631 quality_score: 0.0,
632 }
633 }
634 }
635
636 fn supports(&self, feature: VocoderFeature) -> bool {
637 if let Ok(vocoder) = self.get_default_vocoder() {
638 vocoder.supports(feature)
639 } else {
640 false
641 }
642 }
643}
644
/// Placeholder vocoder for tests and pipeline wiring checks; the
/// `Vocoder` impl synthesizes a fixed sine tone instead of running a
/// neural model.
pub struct DummyVocoder {
    // Output sample rate in Hz.
    sample_rate: u32,
    // Mel channel count reported in metadata.
    mel_channels: u32,
}

impl DummyVocoder {
    /// Creates a dummy vocoder with the standard 22.05 kHz / 80-mel setup.
    pub fn new() -> Self {
        Self::with_config(22050, 80)
    }

    /// Creates a dummy vocoder with an explicit sample rate and mel count.
    pub fn with_config(sample_rate: u32, mel_channels: u32) -> Self {
        Self {
            sample_rate,
            mel_channels,
        }
    }
}
670
impl Default for DummyVocoder {
    /// Equivalent to [`DummyVocoder::new`]: 22.05 kHz, 80 mel channels.
    fn default() -> Self {
        Self::new()
    }
}
676
677#[async_trait]
678impl Vocoder for DummyVocoder {
679 async fn vocode(
680 &self,
681 mel: &MelSpectrogram,
682 _config: Option<&SynthesisConfig>,
683 ) -> Result<AudioBuffer> {
684 let duration = mel.duration();
686 let frequency = 440.0; let audio = AudioBuffer::sine_wave(frequency, duration, self.sample_rate, 0.5);
688
689 tracing::debug!(
690 "DummyVocoder: Generated {:.2}s audio from {}x{} mel",
691 duration,
692 mel.n_mels,
693 mel.n_frames
694 );
695 Ok(audio)
696 }
697
698 async fn vocode_stream(
699 &self,
700 mel_stream: Box<dyn Stream<Item = MelSpectrogram> + Send + Unpin>,
701 config: Option<&SynthesisConfig>,
702 ) -> Result<Box<dyn Stream<Item = Result<AudioBuffer>> + Send + Unpin>> {
703 use futures::stream;
704 use futures::StreamExt;
705
706 let mels: Vec<MelSpectrogram> = mel_stream.collect().await;
708 let configs = config.map(|c| vec![c.clone(); mels.len()]);
709
710 let results = self.vocode_batch(&mels, configs.as_deref()).await?;
711 let stream = stream::iter(results.into_iter().map(Ok));
712
713 Ok(Box::new(stream))
714 }
715
716 async fn vocode_batch(
717 &self,
718 mels: &[MelSpectrogram],
719 configs: Option<&[SynthesisConfig]>,
720 ) -> Result<Vec<AudioBuffer>> {
721 let mut results = Vec::new();
722 for (i, mel) in mels.iter().enumerate() {
723 let config = configs.and_then(|c| c.get(i));
724 results.push(self.vocode(mel, config).await?);
725 }
726 Ok(results)
727 }
728
729 fn metadata(&self) -> VocoderMetadata {
730 VocoderMetadata {
731 name: "Dummy Vocoder".to_string(),
732 version: "0.1.0".to_string(),
733 architecture: "Sine Wave".to_string(),
734 sample_rate: self.sample_rate,
735 mel_channels: self.mel_channels,
736 latency_ms: 10.0,
737 quality_score: 2.0, }
739 }
740
741 fn supports(&self, feature: VocoderFeature) -> bool {
742 matches!(feature, VocoderFeature::BatchProcessing)
743 }
744}
745
#[cfg(test)]
mod tests {
    use super::*;

    /// Registers a DummyVocoder with the manager and checks delegation
    /// of `vocode` plus name listing.
    #[tokio::test]
    async fn test_vocoder_manager() {
        let mut manager = VocoderManager::new();

        // The first vocoder added becomes the default automatically.
        manager.add_vocoder("dummy".to_string(), Box::new(DummyVocoder::new()));

        // 80 mel bands x 100 frames; hop 256 @ 22.05 kHz ≈ 1.16 s of audio.
        let mel_data = vec![vec![0.5; 100]; 80];
        let mel = MelSpectrogram::new(mel_data, 22050, 256);

        let audio = manager.vocode(&mel, None).await.unwrap();
        assert!(audio.duration() > 0.0);
        assert_eq!(audio.sample_rate(), 22050);

        let vocoders = manager.list_vocoders();
        assert!(vocoders.contains(&"dummy"));
    }

    /// Exercises DummyVocoder directly: single vocode, metadata,
    /// batch processing, and feature flags.
    #[tokio::test]
    async fn test_dummy_vocoder() {
        let vocoder = DummyVocoder::new();

        // 80 mel bands x 50 frames.
        let mel_data = vec![vec![0.5; 50]; 80];
        let mel = MelSpectrogram::new(mel_data, 22050, 256);

        let audio = vocoder.vocode(&mel, None).await.unwrap();

        assert!(audio.duration() > 0.0);
        assert_eq!(audio.sample_rate(), 22050);
        assert!(!audio.is_empty());

        let metadata = vocoder.metadata();
        assert_eq!(metadata.name, "Dummy Vocoder");
        assert_eq!(metadata.sample_rate, 22050);
        assert_eq!(metadata.mel_channels, 80);

        // Batch keeps a 1:1 input/output correspondence.
        let mels = vec![mel.clone(), mel.clone()];
        let results = vocoder.vocode_batch(&mels, None).await.unwrap();
        assert_eq!(results.len(), 2);

        for audio in results {
            assert!(audio.duration() > 0.0);
            assert_eq!(audio.sample_rate(), 22050);
        }

        // DummyVocoder advertises batch support only.
        assert!(vocoder.supports(VocoderFeature::BatchProcessing));
        assert!(!vocoder.supports(VocoderFeature::StreamingInference));
    }
}