// voirs_sdk/types.rs

//! Core types for VoiRS speech synthesis.

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;

7/// Result type alias for VoiRS operations
8pub type VoirsResult<T> = std::result::Result<T, crate::VoirsError>;
9
10/// Language code identifier
11#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
12pub enum LanguageCode {
13    /// English (US)
14    EnUs,
15    /// English (UK)
16    EnGb,
17    /// Japanese
18    JaJp,
19    /// Spanish (Spain)
20    EsEs,
21    /// Spanish (Mexico)
22    EsMx,
23    /// French (France)
24    FrFr,
25    /// German (Germany)
26    DeDe,
27    /// Chinese (Simplified)
28    ZhCn,
29    /// Portuguese (Brazil)
30    PtBr,
31    /// Russian
32    RuRu,
33    /// Italian
34    ItIt,
35    /// Korean
36    KoKr,
37    /// Dutch
38    NlNl,
39    /// Swedish
40    SvSe,
41    /// Norwegian
42    NoNo,
43    /// Danish
44    DaDk,
45
46    // Additional short language codes for compatibility
47    /// German (short code)
48    De,
49    /// French (short code)
50    Fr,
51    /// Spanish (short code)
52    Es,
53    /// Italian (short code)
54    It,
55    /// Portuguese (short code)
56    Pt,
57    /// Japanese (short code)
58    Ja,
59    /// Korean (short code)
60    Ko,
61    /// Russian (short code)
62    Ru,
63    /// Arabic
64    Ar,
65    /// Hindi
66    Hi,
67    /// Thai
68    Th,
69    /// Vietnamese
70    Vi,
71    /// Indonesian
72    Id,
73    /// Malay
74    Ms,
75    /// Dutch (short code)
76    Nl,
77    /// Swedish (short code)
78    Sv,
79    /// Norwegian (short code)
80    No,
81    /// Danish (short code)
82    Da,
83    /// Polish
84    Pl,
85    /// Czech
86    Cs,
87    /// Slovak
88    Sk,
89    /// Hungarian
90    Hu,
91    /// Romanian
92    Ro,
93    /// Bulgarian
94    Bg,
95    /// Croatian
96    Hr,
97    /// Serbian
98    Sr,
99    /// Slovenian
100    Sl,
101    /// Estonian
102    Et,
103    /// Latvian
104    Lv,
105    /// Lithuanian
106    Lt,
107    /// Finnish
108    Fi,
109    /// Greek
110    El,
111    /// Turkish
112    Tr,
113    /// Hebrew
114    He,
115    /// Persian/Farsi
116    Fa,
117    /// Urdu
118    Ur,
119    /// Bengali
120    Bn,
121    /// Tamil
122    Ta,
123    /// Telugu
124    Te,
125    /// Malayalam
126    Ml,
127    /// Kannada
128    Kn,
129    /// Gujarati
130    Gu,
131    /// Marathi
132    Mr,
133    /// Punjabi
134    Pa,
135    /// Odia
136    Or,
137    /// Assamese
138    As,
139}
140
141impl LanguageCode {
142    /// Get the language code as a string
143    pub fn as_str(&self) -> &'static str {
144        match self {
145            Self::EnUs => "en-US",
146            Self::EnGb => "en-GB",
147            Self::JaJp => "ja-JP",
148            Self::EsEs => "es-ES",
149            Self::EsMx => "es-MX",
150            Self::FrFr => "fr-FR",
151            Self::DeDe => "de-DE",
152            Self::ZhCn => "zh-CN",
153            Self::PtBr => "pt-BR",
154            Self::RuRu => "ru-RU",
155            Self::ItIt => "it-IT",
156            Self::KoKr => "ko-KR",
157            Self::NlNl => "nl-NL",
158            Self::SvSe => "sv-SE",
159            Self::NoNo => "no-NO",
160            Self::DaDk => "da-DK",
161
162            // Short language codes
163            Self::De => "de",
164            Self::Fr => "fr",
165            Self::Es => "es",
166            Self::It => "it",
167            Self::Pt => "pt",
168            Self::Ja => "ja",
169            Self::Ko => "ko",
170            Self::Ru => "ru",
171            Self::Ar => "ar",
172            Self::Hi => "hi",
173            Self::Th => "th",
174            Self::Vi => "vi",
175            Self::Id => "id",
176            Self::Ms => "ms",
177            Self::Nl => "nl",
178            Self::Sv => "sv",
179            Self::No => "no",
180            Self::Da => "da",
181            Self::Pl => "pl",
182            Self::Cs => "cs",
183            Self::Sk => "sk",
184            Self::Hu => "hu",
185            Self::Ro => "ro",
186            Self::Bg => "bg",
187            Self::Hr => "hr",
188            Self::Sr => "sr",
189            Self::Sl => "sl",
190            Self::Et => "et",
191            Self::Lv => "lv",
192            Self::Lt => "lt",
193            Self::Fi => "fi",
194            Self::El => "el",
195            Self::Tr => "tr",
196            Self::He => "he",
197            Self::Fa => "fa",
198            Self::Ur => "ur",
199            Self::Bn => "bn",
200            Self::Ta => "ta",
201            Self::Te => "te",
202            Self::Ml => "ml",
203            Self::Kn => "kn",
204            Self::Gu => "gu",
205            Self::Mr => "mr",
206            Self::Pa => "pa",
207            Self::Or => "or",
208            Self::As => "as",
209        }
210    }
211
212    /// Parse language code from string
213    pub fn parse(s: &str) -> Option<Self> {
214        match s {
215            "en-US" => Some(Self::EnUs),
216            "en-GB" => Some(Self::EnGb),
217            "ja-JP" => Some(Self::JaJp),
218            "es-ES" => Some(Self::EsEs),
219            "es-MX" => Some(Self::EsMx),
220            "fr-FR" => Some(Self::FrFr),
221            "de-DE" => Some(Self::DeDe),
222            "zh-CN" => Some(Self::ZhCn),
223            "pt-BR" => Some(Self::PtBr),
224            "ru-RU" => Some(Self::RuRu),
225            "it-IT" => Some(Self::ItIt),
226            "ko-KR" => Some(Self::KoKr),
227            "nl-NL" => Some(Self::NlNl),
228            "sv-SE" => Some(Self::SvSe),
229            "no-NO" => Some(Self::NoNo),
230            "da-DK" => Some(Self::DaDk),
231
232            // Short language codes
233            "de" => Some(Self::De),
234            "fr" => Some(Self::Fr),
235            "es" => Some(Self::Es),
236            "it" => Some(Self::It),
237            "pt" => Some(Self::Pt),
238            "ja" => Some(Self::Ja),
239            "ko" => Some(Self::Ko),
240            "ru" => Some(Self::Ru),
241            "ar" => Some(Self::Ar),
242            "hi" => Some(Self::Hi),
243            "th" => Some(Self::Th),
244            "vi" => Some(Self::Vi),
245            "id" => Some(Self::Id),
246            "ms" => Some(Self::Ms),
247            "nl" => Some(Self::Nl),
248            "sv" => Some(Self::Sv),
249            "no" => Some(Self::No),
250            "da" => Some(Self::Da),
251            "pl" => Some(Self::Pl),
252            "cs" => Some(Self::Cs),
253            "sk" => Some(Self::Sk),
254            "hu" => Some(Self::Hu),
255            "ro" => Some(Self::Ro),
256            "bg" => Some(Self::Bg),
257            "hr" => Some(Self::Hr),
258            "sr" => Some(Self::Sr),
259            "sl" => Some(Self::Sl),
260            "et" => Some(Self::Et),
261            "lv" => Some(Self::Lv),
262            "lt" => Some(Self::Lt),
263            "fi" => Some(Self::Fi),
264            "el" => Some(Self::El),
265            "tr" => Some(Self::Tr),
266            "he" => Some(Self::He),
267            "fa" => Some(Self::Fa),
268            "ur" => Some(Self::Ur),
269            "bn" => Some(Self::Bn),
270            "ta" => Some(Self::Ta),
271            "te" => Some(Self::Te),
272            "ml" => Some(Self::Ml),
273            "kn" => Some(Self::Kn),
274            "gu" => Some(Self::Gu),
275            "mr" => Some(Self::Mr),
276            "pa" => Some(Self::Pa),
277            "or" => Some(Self::Or),
278            "as" => Some(Self::As),
279            _ => None,
280        }
281    }
282}
283
284impl std::fmt::Display for LanguageCode {
285    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
286        write!(f, "{}", self.as_str())
287    }
288}
289
290/// Phoneme representation with IPA symbol and metadata
291#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
292pub struct Phoneme {
293    /// Primary symbol representation
294    pub symbol: String,
295
296    /// IPA symbol (e.g., "æ", "t̪", "d͡ʒ")
297    pub ipa_symbol: String,
298
299    /// Stress level (0=none, 1=primary, 2=secondary)
300    pub stress: u8,
301
302    /// Position within syllable
303    pub syllable_position: SyllablePosition,
304
305    /// Predicted duration in milliseconds
306    pub duration_ms: Option<f32>,
307
308    /// Confidence score (0.0-1.0)
309    pub confidence: f32,
310}
311
312impl Phoneme {
313    /// Create a new phoneme with symbol
314    pub fn new(symbol: impl Into<String>) -> Self {
315        let symbol_str = symbol.into();
316        Self {
317            symbol: symbol_str.clone(),
318            ipa_symbol: symbol_str, // Default to same as symbol
319            stress: 0,
320            syllable_position: SyllablePosition::Unknown,
321            duration_ms: None,
322            confidence: 1.0,
323        }
324    }
325
326    /// Create phoneme with stress
327    pub fn with_stress(mut self, stress: u8) -> Self {
328        self.stress = stress;
329        self
330    }
331
332    /// Create phoneme with duration
333    pub fn with_duration(mut self, duration_ms: f32) -> Self {
334        self.duration_ms = Some(duration_ms);
335        self
336    }
337}
338
339/// Position of phoneme within syllable
340#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
341pub enum SyllablePosition {
342    /// Position unknown
343    Unknown,
344    /// Syllable onset
345    Onset,
346    /// Syllable nucleus (vowel)
347    Nucleus,
348    /// Syllable coda
349    Coda,
350}
351
352/// Mel spectrogram representation
353#[derive(Debug, Clone, Serialize, Deserialize)]
354pub struct MelSpectrogram {
355    /// Mel filterbank features [n_mels, n_frames]
356    pub data: Vec<Vec<f32>>,
357
358    /// Sample rate in Hz
359    pub sample_rate: u32,
360
361    /// Hop length in samples
362    pub hop_length: u32,
363
364    /// Number of mel channels
365    pub n_mels: u32,
366
367    /// Number of time frames
368    pub n_frames: u32,
369}
370
371impl MelSpectrogram {
372    /// Create new mel spectrogram
373    pub fn new(data: Vec<Vec<f32>>, sample_rate: u32, hop_length: u32) -> Self {
374        let n_mels = data.len() as u32;
375        let n_frames = data.first().map(|row| row.len()).unwrap_or(0) as u32;
376
377        Self {
378            data,
379            sample_rate,
380            hop_length,
381            n_mels,
382            n_frames,
383        }
384    }
385
386    /// Get duration in seconds
387    pub fn duration(&self) -> f32 {
388        (self.n_frames * self.hop_length) as f32 / self.sample_rate as f32
389    }
390
391    /// Get mel values at specific frame
392    pub fn frame(&self, frame_idx: usize) -> Option<Vec<f32>> {
393        if frame_idx >= self.n_frames as usize {
394            return None;
395        }
396
397        Some(self.data.iter().map(|row| row[frame_idx]).collect())
398    }
399}
400
/// Audio sample representation: one value paired with its stream index.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct AudioSample {
    /// Sample value (typically -1.0 to 1.0; not enforced)
    pub value: f32,
    /// Sample index in audio stream
    pub index: usize,
}

410/// Voice configuration and metadata
411#[derive(Debug, Clone, Serialize, Deserialize)]
412pub struct VoiceConfig {
413    /// Voice identifier
414    pub id: String,
415
416    /// Human-readable name
417    pub name: String,
418
419    /// Language code
420    pub language: LanguageCode,
421
422    /// Voice characteristics
423    pub characteristics: VoiceCharacteristics,
424
425    /// Model paths and configuration
426    pub model_config: ModelConfig,
427
428    /// Additional metadata
429    pub metadata: HashMap<String, String>,
430}
431
432/// Voice characteristics
433#[derive(Debug, Clone, Serialize, Deserialize)]
434pub struct VoiceCharacteristics {
435    /// Gender (if applicable)
436    pub gender: Option<Gender>,
437
438    /// Age range
439    pub age: Option<AgeRange>,
440
441    /// Speaking style
442    pub style: SpeakingStyle,
443
444    /// Emotion capability
445    pub emotion_support: bool,
446
447    /// Quality level
448    pub quality: QualityLevel,
449}
450
451impl Default for VoiceCharacteristics {
452    fn default() -> Self {
453        Self {
454            gender: None,
455            age: None,
456            style: SpeakingStyle::Neutral,
457            emotion_support: false,
458            quality: QualityLevel::Medium,
459        }
460    }
461}
462
463/// Gender classification
464#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
465pub enum Gender {
466    Male,
467    Female,
468    NonBinary,
469}
470
471impl std::fmt::Display for Gender {
472    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
473        match self {
474            Gender::Male => write!(f, "Male"),
475            Gender::Female => write!(f, "Female"),
476            Gender::NonBinary => write!(f, "NonBinary"),
477        }
478    }
479}
480
481/// Age range classification
482#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
483pub enum AgeRange {
484    Child,      // 5-12
485    Teen,       // 13-19
486    YoungAdult, // 20-35
487    Adult,      // 36-60
488    Senior,     // 60+
489}
490
491/// Speaking style
492#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
493pub enum SpeakingStyle {
494    Neutral,
495    Conversational,
496    News,
497    Formal,
498    Casual,
499    Energetic,
500    Calm,
501    Dramatic,
502    Whisper,
503}
504
505impl std::fmt::Display for SpeakingStyle {
506    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
507        match self {
508            SpeakingStyle::Neutral => write!(f, "Neutral"),
509            SpeakingStyle::Conversational => write!(f, "Conversational"),
510            SpeakingStyle::News => write!(f, "News"),
511            SpeakingStyle::Formal => write!(f, "Formal"),
512            SpeakingStyle::Casual => write!(f, "Casual"),
513            SpeakingStyle::Energetic => write!(f, "Energetic"),
514            SpeakingStyle::Calm => write!(f, "Calm"),
515            SpeakingStyle::Dramatic => write!(f, "Dramatic"),
516            SpeakingStyle::Whisper => write!(f, "Whisper"),
517        }
518    }
519}
520
521/// Quality level for synthesis
522#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
523pub enum QualityLevel {
524    Low,
525    Medium,
526    High,
527    Ultra,
528}
529
530/// Model configuration
531#[derive(Debug, Clone, Serialize, Deserialize)]
532pub struct ModelConfig {
533    /// G2P model path
534    pub g2p_model: Option<String>,
535
536    /// Acoustic model path
537    pub acoustic_model: String,
538
539    /// Vocoder model path
540    pub vocoder_model: String,
541
542    /// Model format (candle, onnx, etc.)
543    pub format: ModelFormat,
544
545    /// Device requirements
546    pub device_requirements: DeviceRequirements,
547}
548
549impl Default for ModelConfig {
550    fn default() -> Self {
551        Self {
552            g2p_model: None,
553            acoustic_model: "default-acoustic.safetensors".to_string(),
554            vocoder_model: "default-vocoder.safetensors".to_string(),
555            format: ModelFormat::Candle,
556            device_requirements: DeviceRequirements::default(),
557        }
558    }
559}
560
561/// Model format
562#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
563pub enum ModelFormat {
564    Candle,
565    Onnx,
566    PyTorch,
567    TensorFlow,
568}
569
570/// Device requirements for models
571#[derive(Debug, Clone, Serialize, Deserialize)]
572pub struct DeviceRequirements {
573    /// Minimum memory in MB
574    pub min_memory_mb: u32,
575
576    /// GPU support
577    pub gpu_support: bool,
578
579    /// Supported compute capabilities
580    pub compute_capabilities: Vec<String>,
581}
582
583impl Default for DeviceRequirements {
584    fn default() -> Self {
585        Self {
586            min_memory_mb: 512,
587            gpu_support: false,
588            compute_capabilities: vec![],
589        }
590    }
591}
592
593/// Audio format specification
594#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
595pub enum AudioFormat {
596    Wav,
597    Flac,
598    Mp3,
599    Opus,
600    Ogg,
601}
602
603impl AudioFormat {
604    /// Get file extension for format
605    pub fn extension(&self) -> &'static str {
606        match self {
607            Self::Wav => "wav",
608            Self::Flac => "flac",
609            Self::Mp3 => "mp3",
610            Self::Opus => "opus",
611            Self::Ogg => "ogg",
612        }
613    }
614}
615
616impl std::fmt::Display for AudioFormat {
617    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
618        write!(f, "{}", self.extension())
619    }
620}
621
622/// Audio effect types
623#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
624pub enum AudioEffect {
625    /// Reverb effect
626    Reverb {
627        room_size: f32,
628        damping: f32,
629        wet_level: f32,
630    },
631    /// Delay effect
632    Delay {
633        delay_time: f32,
634        feedback: f32,
635        wet_level: f32,
636    },
637    /// Equalizer effect
638    Equalizer {
639        low_gain: f32,
640        mid_gain: f32,
641        high_gain: f32,
642    },
643    /// Compressor effect
644    Compressor {
645        threshold: f32,
646        ratio: f32,
647        attack: f32,
648        release: f32,
649    },
650}
651
652/// Synthesis configuration
653#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
654pub struct SynthesisConfig {
655    /// Speaking rate multiplier (0.5 - 2.0)
656    pub speaking_rate: f32,
657
658    /// Pitch shift in semitones (-12.0 - 12.0)
659    pub pitch_shift: f32,
660
661    /// Volume gain in dB (-20.0 - 20.0)
662    pub volume_gain: f32,
663
664    /// Enable audio enhancement
665    pub enable_enhancement: bool,
666
667    /// Output audio format
668    pub output_format: AudioFormat,
669
670    /// Sample rate
671    pub sample_rate: u32,
672
673    /// Quality level
674    pub quality: QualityLevel,
675
676    /// Language for synthesis
677    pub language: LanguageCode,
678
679    /// Audio effects to apply
680    pub effects: Vec<AudioEffect>,
681
682    /// Streaming chunk size in words
683    pub streaming_chunk_size: Option<usize>,
684
685    /// Random seed for reproducible generation
686    pub seed: Option<u64>,
687
688    /// Enable emotion processing
689    pub enable_emotion: bool,
690
691    /// Emotion type to apply
692    pub emotion_type: Option<String>,
693
694    /// Emotion intensity (0.0 - 1.0)
695    pub emotion_intensity: f32,
696
697    /// Emotion preset name
698    pub emotion_preset: Option<String>,
699
700    /// Enable automatic emotion detection from text
701    pub auto_emotion_detection: bool,
702
703    // Voice cloning configuration
704    /// Enable voice cloning
705    pub enable_cloning: bool,
706    /// Cloning method
707    pub cloning_method: Option<crate::builder::features::CloningMethod>,
708    /// Cloning quality level (0.0 - 1.0)
709    pub cloning_quality: f32,
710
711    // Voice conversion configuration
712    /// Enable voice conversion
713    pub enable_conversion: bool,
714    /// Conversion target
715    pub conversion_target: Option<crate::builder::features::ConversionTarget>,
716    /// Enable real-time conversion
717    pub realtime_conversion: bool,
718
719    // Singing synthesis configuration
720    /// Enable singing synthesis
721    pub enable_singing: bool,
722    /// Singing voice type
723    pub singing_voice_type: Option<crate::builder::features::SingingVoiceType>,
724    /// Singing technique configuration
725    pub singing_technique: Option<crate::builder::features::SingingTechnique>,
726    /// Musical key
727    pub musical_key: Option<crate::builder::features::MusicalKey>,
728    /// Tempo in BPM
729    pub tempo: Option<f32>,
730
731    // 3D spatial audio configuration
732    /// Enable 3D spatial audio
733    pub enable_spatial: bool,
734    /// Listener position
735    pub listener_position: Option<crate::builder::features::Position3D>,
736    /// Enable HRTF processing
737    pub hrtf_enabled: bool,
738    /// Room size
739    pub room_size: Option<crate::builder::features::RoomSize>,
740    /// Reverb level (0.0 - 1.0)
741    pub reverb_level: f32,
742}
743
744impl Default for SynthesisConfig {
745    fn default() -> Self {
746        Self {
747            speaking_rate: 1.0,
748            pitch_shift: 0.0,
749            volume_gain: 0.0,
750            enable_enhancement: true,
751            output_format: AudioFormat::Wav,
752            sample_rate: 22050,
753            quality: QualityLevel::High,
754            language: LanguageCode::EnUs,
755            effects: Vec::new(),
756            streaming_chunk_size: None,
757            seed: None,
758            enable_emotion: false,
759            emotion_type: None,
760            emotion_intensity: 0.7,
761            emotion_preset: None,
762            auto_emotion_detection: false,
763
764            // Voice cloning defaults
765            enable_cloning: false,
766            cloning_method: None,
767            cloning_quality: 0.85,
768
769            // Voice conversion defaults
770            enable_conversion: false,
771            conversion_target: None,
772            realtime_conversion: false,
773
774            // Singing synthesis defaults
775            enable_singing: false,
776            singing_voice_type: None,
777            singing_technique: None,
778            musical_key: None,
779            tempo: None,
780
781            // 3D spatial audio defaults
782            enable_spatial: false,
783            listener_position: None,
784            hrtf_enabled: false,
785            room_size: None,
786            reverb_level: 0.3,
787        }
788    }
789}
790
791impl crate::config::hierarchy::ConfigHierarchy for SynthesisConfig {
792    fn merge_with(&mut self, other: &Self) {
793        if (other.speaking_rate - 1.0).abs() > f32::EPSILON {
794            self.speaking_rate = other.speaking_rate;
795        }
796        if other.pitch_shift.abs() > f32::EPSILON {
797            self.pitch_shift = other.pitch_shift;
798        }
799        if other.volume_gain.abs() > f32::EPSILON {
800            self.volume_gain = other.volume_gain;
801        }
802        if !other.enable_enhancement {
803            self.enable_enhancement = other.enable_enhancement;
804        }
805        if other.output_format != AudioFormat::Wav {
806            self.output_format = other.output_format;
807        }
808        if other.sample_rate != 22050 {
809            self.sample_rate = other.sample_rate;
810        }
811        if other.quality != QualityLevel::High {
812            self.quality = other.quality;
813        }
814        if other.language != LanguageCode::EnUs {
815            self.language = other.language;
816        }
817        if other.streaming_chunk_size.is_some() {
818            self.streaming_chunk_size = other.streaming_chunk_size;
819        }
820
821        // Merge emotion settings
822        if other.enable_emotion {
823            self.enable_emotion = other.enable_emotion;
824        }
825        if other.emotion_type.is_some() {
826            self.emotion_type = other.emotion_type.clone();
827        }
828        if (other.emotion_intensity - 0.7).abs() > f32::EPSILON {
829            self.emotion_intensity = other.emotion_intensity;
830        }
831        if other.emotion_preset.is_some() {
832            self.emotion_preset = other.emotion_preset.clone();
833        }
834        if other.auto_emotion_detection {
835            self.auto_emotion_detection = other.auto_emotion_detection;
836        }
837
838        // Merge effects (append to existing)
839        self.effects.extend(other.effects.clone());
840    }
841
842    fn validate(&self) -> Result<(), crate::config::hierarchy::ConfigValidationError> {
843        if self.speaking_rate < 0.5 || self.speaking_rate > 2.0 {
844            return Err(crate::config::hierarchy::ConfigValidationError {
845                field: "speaking_rate".to_string(),
846                message: "Speaking rate must be between 0.5 and 2.0".to_string(),
847            });
848        }
849
850        if self.pitch_shift < -12.0 || self.pitch_shift > 12.0 {
851            return Err(crate::config::hierarchy::ConfigValidationError {
852                field: "pitch_shift".to_string(),
853                message: "Pitch shift must be between -12.0 and 12.0 semitones".to_string(),
854            });
855        }
856
857        if self.volume_gain < -20.0 || self.volume_gain > 20.0 {
858            return Err(crate::config::hierarchy::ConfigValidationError {
859                field: "volume_gain".to_string(),
860                message: "Volume gain must be between -20.0 and 20.0 dB".to_string(),
861            });
862        }
863
864        if self.sample_rate < 8000 || self.sample_rate > 96000 {
865            return Err(crate::config::hierarchy::ConfigValidationError {
866                field: "sample_rate".to_string(),
867                message: "Sample rate must be between 8000 and 96000 Hz".to_string(),
868            });
869        }
870
871        if self.emotion_intensity < 0.0 || self.emotion_intensity > 1.0 {
872            return Err(crate::config::hierarchy::ConfigValidationError {
873                field: "emotion_intensity".to_string(),
874                message: "Emotion intensity must be between 0.0 and 1.0".to_string(),
875            });
876        }
877
878        Ok(())
879    }
880}
881
882/// Default implementation for AudioFormat
883impl Default for AudioFormat {
884    fn default() -> Self {
885        AudioFormat::Wav
886    }
887}
888
889/// FromStr implementation for AudioFormat
890impl FromStr for AudioFormat {
891    type Err = String;
892
893    fn from_str(s: &str) -> Result<Self, Self::Err> {
894        match s.to_lowercase().as_str() {
895            "wav" => Ok(AudioFormat::Wav),
896            "flac" => Ok(AudioFormat::Flac),
897            "mp3" => Ok(AudioFormat::Mp3),
898            "opus" => Ok(AudioFormat::Opus),
899            "ogg" => Ok(AudioFormat::Ogg),
900            _ => Err(format!("Unknown audio format: {s}")),
901        }
902    }
903}
904
905/// Model features supported by VoiRS components
906#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
907pub enum ModelFeature {
908    /// Multi-speaker support
909    MultiSpeaker,
910    /// Emotion control support
911    EmotionControl,
912    /// Style control support
913    StyleControl,
914    /// Prosody control support
915    ProsodyControl,
916    /// Voice cloning capability
917    VoiceCloning,
918    /// Streaming support
919    StreamingSupport,
920    /// Batch processing support
921    BatchProcessing,
922    /// GPU acceleration support
923    GPUAcceleration,
924}
925
926/// System capability detection and negotiation
927#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
928pub struct SystemCapabilities {
929    /// Available features in the current runtime
930    pub available_features: Vec<AdvancedFeature>,
931    /// Hardware capabilities
932    pub hardware: HardwareCapabilities,
933    /// Resource constraints
934    pub resource_limits: ResourceLimits,
935    /// Model capabilities by voice
936    pub model_capabilities: HashMap<String, ModelCapabilities>,
937}
938
939/// Advanced voice features
940#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
941pub enum AdvancedFeature {
942    /// Emotion expression control
943    EmotionControl,
944    /// Voice cloning capability
945    VoiceCloning,
946    /// Real-time voice conversion
947    VoiceConversion,
948    /// Singing voice synthesis
949    SingingSynthesis,
950    /// 3D spatial audio processing
951    SpatialAudio,
952    /// Streaming synthesis
953    StreamingSynthesis,
954    /// GPU acceleration
955    GpuAcceleration,
956    /// WebAssembly compatibility
957    WasmSupport,
958    /// Cloud processing
959    CloudProcessing,
960    /// High-quality vocoding
961    HighQualityVocoding,
962    /// Real-time processing
963    RealtimeProcessing,
964}
965
966/// Hardware capability detection
967#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
968pub struct HardwareCapabilities {
969    /// Available GPU compute capability
970    pub gpu_available: bool,
971    /// GPU memory in MB
972    pub gpu_memory_mb: Option<u64>,
973    /// CPU core count
974    pub cpu_cores: u32,
975    /// System RAM in MB
976    pub system_memory_mb: u64,
977    /// Storage type (SSD/HDD)
978    pub fast_storage: bool,
979}
980
981/// Resource constraints for capability negotiation
982#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
983pub struct ResourceLimits {
984    /// Maximum memory usage in MB
985    pub max_memory_mb: u64,
986    /// Maximum CPU usage percentage (0-100)
987    pub max_cpu_percent: u8,
988    /// Maximum latency tolerance in milliseconds
989    pub max_latency_ms: u32,
990    /// Battery optimization (for mobile)
991    pub battery_optimization: bool,
992}
993
994/// Model-specific capabilities
995#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
996pub struct ModelCapabilities {
997    /// Supported advanced features
998    pub supported_features: Vec<AdvancedFeature>,
999    /// Required hardware features
1000    pub hardware_requirements: HardwareRequirements,
1001    /// Performance characteristics
1002    pub performance_profile: PerformanceProfile,
1003}
1004
1005/// Hardware requirements for a model
1006#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1007pub struct HardwareRequirements {
1008    /// Minimum memory in MB
1009    pub min_memory_mb: u64,
1010    /// Minimum GPU memory in MB (if GPU required)
1011    pub min_gpu_memory_mb: Option<u64>,
1012    /// Requires GPU acceleration
1013    pub requires_gpu: bool,
1014    /// Minimum CPU cores
1015    pub min_cpu_cores: u32,
1016}
1017
1018/// Performance characteristics of a model
1019#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1020pub struct PerformanceProfile {
1021    /// Initialization latency in milliseconds
1022    pub init_latency_ms: u32,
1023    /// Synthesis latency per second of audio
1024    pub synthesis_latency_ms_per_sec: u32,
1025    /// Memory usage during synthesis in MB
1026    pub synthesis_memory_mb: u64,
1027    /// Quality score (0.0-1.0)
1028    pub quality_score: u8, // Stored as u8 (0-100) for Eq trait
1029}
1030
1031/// Capability negotiation request
1032#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1033pub struct CapabilityRequest {
1034    /// Desired features
1035    pub desired_features: Vec<AdvancedFeature>,
1036    /// Priority of features (matched by index with desired_features)
1037    pub feature_priorities: Vec<FeaturePriority>,
1038    /// Resource constraints
1039    pub constraints: ResourceLimits,
1040    /// Fallback strategy
1041    pub fallback_strategy: FallbackStrategy,
1042}
1043
1044/// Priority level for features
1045#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
1046pub enum FeaturePriority {
1047    /// Feature is optional
1048    Optional,
1049    /// Feature is preferred but not required
1050    Preferred,
1051    /// Feature is required
1052    Required,
1053    /// Feature is critical - fail if not available
1054    Critical,
1055}
1056
1057/// Fallback strategy when features are unavailable
1058#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
1059pub enum FallbackStrategy {
1060    /// Fail immediately if any required feature is unavailable
1061    FailFast,
1062    /// Degrade gracefully by disabling unavailable features
1063    GracefulDegradation,
1064    /// Use alternative implementations
1065    UseAlternatives,
1066    /// Fall back to basic functionality only
1067    BasicFunctionality,
1068}
1069
1070/// Capability negotiation result
1071#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1072pub struct CapabilityNegotiation {
1073    /// Features that will be enabled
1074    pub enabled_features: Vec<AdvancedFeature>,
1075    /// Features that were requested but unavailable
1076    pub unavailable_features: Vec<AdvancedFeature>,
1077    /// Warnings about resource constraints
1078    pub warnings: Vec<String>,
1079    /// Selected models and configurations
1080    pub selected_models: HashMap<String, String>,
1081    /// Estimated resource usage
1082    pub estimated_usage: ResourceUsage,
1083}
1084
1085/// Estimated resource usage for a configuration
1086#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1087pub struct ResourceUsage {
1088    /// Memory usage in MB
1089    pub memory_mb: u64,
1090    /// Initialization time in milliseconds
1091    pub init_time_ms: u32,
1092    /// Processing latency in milliseconds
1093    pub processing_latency_ms: u32,
1094    /// CPU usage percentage (0-100)
1095    pub cpu_usage_percent: u8,
1096}
1097
1098impl Default for SystemCapabilities {
1099    fn default() -> Self {
1100        Self {
1101            available_features: vec![
1102                AdvancedFeature::EmotionControl,
1103                AdvancedFeature::StreamingSynthesis,
1104                AdvancedFeature::RealtimeProcessing,
1105            ],
1106            hardware: HardwareCapabilities::default(),
1107            resource_limits: ResourceLimits::default(),
1108            model_capabilities: HashMap::new(),
1109        }
1110    }
1111}
1112
1113impl Default for HardwareCapabilities {
1114    fn default() -> Self {
1115        Self {
1116            gpu_available: false,
1117            gpu_memory_mb: None,
1118            cpu_cores: num_cpus::get() as u32,
1119            system_memory_mb: 4096, // Conservative default
1120            fast_storage: true,
1121        }
1122    }
1123}
1124
1125impl Default for ResourceLimits {
1126    fn default() -> Self {
1127        Self {
1128            max_memory_mb: 2048,
1129            max_cpu_percent: 80,
1130            max_latency_ms: 500,
1131            battery_optimization: false,
1132        }
1133    }
1134}
1135
1136impl Default for CapabilityRequest {
1137    fn default() -> Self {
1138        Self {
1139            desired_features: vec![AdvancedFeature::StreamingSynthesis],
1140            feature_priorities: vec![FeaturePriority::Preferred],
1141            constraints: ResourceLimits::default(),
1142            fallback_strategy: FallbackStrategy::GracefulDegradation,
1143        }
1144    }
1145}
1146
1147/// Default implementation for QualityLevel
1148impl Default for QualityLevel {
1149    fn default() -> Self {
1150        QualityLevel::High
1151    }
1152}
1153
1154/// FromStr implementation for QualityLevel
1155impl FromStr for QualityLevel {
1156    type Err = String;
1157
1158    fn from_str(s: &str) -> Result<Self, Self::Err> {
1159        match s.to_lowercase().as_str() {
1160            "low" => Ok(QualityLevel::Low),
1161            "medium" => Ok(QualityLevel::Medium),
1162            "high" => Ok(QualityLevel::High),
1163            "ultra" => Ok(QualityLevel::Ultra),
1164            _ => Err(format!("Unknown quality level: {s}")),
1165        }
1166    }
1167}