use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::str::FromStr;
/// Crate-wide result alias: `std::result::Result<T, crate::VoirsError>`.
pub type VoirsResult<T> = std::result::Result<T, crate::VoirsError>;
/// Language identifiers understood by this module.
///
/// Each variant corresponds to one string tag: [`LanguageCode::as_str`] turns a
/// variant into its tag (for example `EnUs` -> `"en-US"`, `De` -> `"de"`) and
/// [`LanguageCode::parse`] performs the reverse, case-sensitive lookup.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum LanguageCode {
// Region-qualified tags ("xx-YY").
EnUs,
EnGb,
JaJp,
EsEs,
EsMx,
FrFr,
DeDe,
ZhCn,
PtBr,
RuRu,
ItIt,
KoKr,
NlNl,
SvSe,
NoNo,
DaDk,
// Language-only tags ("xx").
De,
Fr,
Es,
It,
Pt,
Ja,
Ko,
Ru,
Ar,
Hi,
Th,
Vi,
Id,
Ms,
Nl,
Sv,
No,
Da,
Pl,
Cs,
Sk,
Hu,
Ro,
Bg,
Hr,
Sr,
Sl,
Et,
Lv,
Lt,
Fi,
El,
Tr,
He,
Fa,
Ur,
Bn,
Ta,
Te,
Ml,
Kn,
Gu,
Mr,
Pa,
Or,
As,
}
impl LanguageCode {
    /// Returns the string tag for this language code.
    ///
    /// Region-qualified variants yield `"xx-YY"` tags (e.g. `"en-US"`), the
    /// remaining variants yield two-letter lowercase tags (e.g. `"de"`).
    pub fn as_str(&self) -> &'static str {
        match *self {
            // Region-qualified tags.
            Self::EnUs => "en-US",
            Self::EnGb => "en-GB",
            Self::JaJp => "ja-JP",
            Self::EsEs => "es-ES",
            Self::EsMx => "es-MX",
            Self::FrFr => "fr-FR",
            Self::DeDe => "de-DE",
            Self::ZhCn => "zh-CN",
            Self::PtBr => "pt-BR",
            Self::RuRu => "ru-RU",
            Self::ItIt => "it-IT",
            Self::KoKr => "ko-KR",
            Self::NlNl => "nl-NL",
            Self::SvSe => "sv-SE",
            Self::NoNo => "no-NO",
            Self::DaDk => "da-DK",
            // Language-only tags.
            Self::De => "de",
            Self::Fr => "fr",
            Self::Es => "es",
            Self::It => "it",
            Self::Pt => "pt",
            Self::Ja => "ja",
            Self::Ko => "ko",
            Self::Ru => "ru",
            Self::Ar => "ar",
            Self::Hi => "hi",
            Self::Th => "th",
            Self::Vi => "vi",
            Self::Id => "id",
            Self::Ms => "ms",
            Self::Nl => "nl",
            Self::Sv => "sv",
            Self::No => "no",
            Self::Da => "da",
            Self::Pl => "pl",
            Self::Cs => "cs",
            Self::Sk => "sk",
            Self::Hu => "hu",
            Self::Ro => "ro",
            Self::Bg => "bg",
            Self::Hr => "hr",
            Self::Sr => "sr",
            Self::Sl => "sl",
            Self::Et => "et",
            Self::Lv => "lv",
            Self::Lt => "lt",
            Self::Fi => "fi",
            Self::El => "el",
            Self::Tr => "tr",
            Self::He => "he",
            Self::Fa => "fa",
            Self::Ur => "ur",
            Self::Bn => "bn",
            Self::Ta => "ta",
            Self::Te => "te",
            Self::Ml => "ml",
            Self::Kn => "kn",
            Self::Gu => "gu",
            Self::Mr => "mr",
            Self::Pa => "pa",
            Self::Or => "or",
            Self::As => "as",
        }
    }

    /// Looks up the language code whose tag is exactly `s`.
    ///
    /// The comparison is case-sensitive and performs no trimming; any string
    /// that is not one of the tags produced by [`LanguageCode::as_str`] yields
    /// `None`.
    pub fn parse(s: &str) -> Option<Self> {
        let code = match s {
            // Region-qualified tags.
            "en-US" => Self::EnUs,
            "en-GB" => Self::EnGb,
            "ja-JP" => Self::JaJp,
            "es-ES" => Self::EsEs,
            "es-MX" => Self::EsMx,
            "fr-FR" => Self::FrFr,
            "de-DE" => Self::DeDe,
            "zh-CN" => Self::ZhCn,
            "pt-BR" => Self::PtBr,
            "ru-RU" => Self::RuRu,
            "it-IT" => Self::ItIt,
            "ko-KR" => Self::KoKr,
            "nl-NL" => Self::NlNl,
            "sv-SE" => Self::SvSe,
            "no-NO" => Self::NoNo,
            "da-DK" => Self::DaDk,
            // Language-only tags.
            "de" => Self::De,
            "fr" => Self::Fr,
            "es" => Self::Es,
            "it" => Self::It,
            "pt" => Self::Pt,
            "ja" => Self::Ja,
            "ko" => Self::Ko,
            "ru" => Self::Ru,
            "ar" => Self::Ar,
            "hi" => Self::Hi,
            "th" => Self::Th,
            "vi" => Self::Vi,
            "id" => Self::Id,
            "ms" => Self::Ms,
            "nl" => Self::Nl,
            "sv" => Self::Sv,
            "no" => Self::No,
            "da" => Self::Da,
            "pl" => Self::Pl,
            "cs" => Self::Cs,
            "sk" => Self::Sk,
            "hu" => Self::Hu,
            "ro" => Self::Ro,
            "bg" => Self::Bg,
            "hr" => Self::Hr,
            "sr" => Self::Sr,
            "sl" => Self::Sl,
            "et" => Self::Et,
            "lv" => Self::Lv,
            "lt" => Self::Lt,
            "fi" => Self::Fi,
            "el" => Self::El,
            "tr" => Self::Tr,
            "he" => Self::He,
            "fa" => Self::Fa,
            "ur" => Self::Ur,
            "bn" => Self::Bn,
            "ta" => Self::Ta,
            "te" => Self::Te,
            "ml" => Self::Ml,
            "kn" => Self::Kn,
            "gu" => Self::Gu,
            "mr" => Self::Mr,
            "pa" => Self::Pa,
            "or" => Self::Or,
            "as" => Self::As,
            _ => return None,
        };
        Some(code)
    }
}
impl std::fmt::Display for LanguageCode {
    /// Writes the tag returned by `as_str` (e.g. `en-US`) to the formatter.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// A single phoneme together with optional stress, position and timing data.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Phoneme {
/// Phoneme symbol as passed to [`Phoneme::new`].
pub symbol: String,
/// IPA form of the symbol; [`Phoneme::new`] initialises it to the same text as `symbol`.
pub ipa_symbol: String,
/// Stress marker; [`Phoneme::new`] starts at `0`.
/// NOTE(review): the meaning of individual levels is not defined in this file.
pub stress: u8,
/// Position of the phoneme within its syllable.
pub syllable_position: SyllablePosition,
/// Duration, presumably in milliseconds given the field name; `None` when unset.
pub duration_ms: Option<f32>,
/// Confidence score; [`Phoneme::new`] sets `1.0`.
/// NOTE(review): no range is enforced here - confirm the expected scale.
pub confidence: f32,
}
impl Phoneme {
pub fn new(symbol: impl Into<String>) -> Self {
let symbol_str = symbol.into();
Self {
symbol: symbol_str.clone(),
ipa_symbol: symbol_str, stress: 0,
syllable_position: SyllablePosition::Unknown,
duration_ms: None,
confidence: 1.0,
}
}
pub fn with_stress(mut self, stress: u8) -> Self {
self.stress = stress;
self
}
pub fn with_duration(mut self, duration_ms: f32) -> Self {
self.duration_ms = Some(duration_ms);
self
}
}
/// Position of a phoneme within a syllable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SyllablePosition {
/// Position not determined; this is the value [`Phoneme::new`] assigns.
Unknown,
/// Syllable onset.
Onset,
/// Syllable nucleus.
Nucleus,
/// Syllable coda.
Coda,
}
/// Mel spectrogram data plus the parameters needed to interpret it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MelSpectrogram {
/// Spectrogram values. `MelSpectrogram::new` treats the outer `Vec` as the
/// mel rows and each inner `Vec` as the frames of that row.
pub data: Vec<Vec<f32>>,
/// Sample rate; used as the divisor in `duration` (presumably Hz).
pub sample_rate: u32,
/// Hop length; multiplied by `n_frames` in `duration` (presumably samples per frame).
pub hop_length: u32,
/// Number of mel rows; `MelSpectrogram::new` sets it to `data.len()`.
pub n_mels: u32,
/// Number of frames; `MelSpectrogram::new` sets it to the length of the
/// first row, or `0` when `data` is empty.
pub n_frames: u32,
}
impl MelSpectrogram {
    /// Builds a spectrogram from row-major data (`data[mel][frame]`).
    ///
    /// `n_mels` is the number of rows and `n_frames` is the length of the
    /// first row (`0` for empty data). Lengths that do not fit in `u32` are
    /// clamped to `u32::MAX` instead of being silently truncated.
    pub fn new(data: Vec<Vec<f32>>, sample_rate: u32, hop_length: u32) -> Self {
        // Saturating usize -> u32 conversion; `as u32` alone would wrap.
        fn clamp_len(len: usize) -> u32 {
            len.min(u32::MAX as usize) as u32
        }

        let n_mels = clamp_len(data.len());
        let n_frames = data.first().map_or(0, |row| clamp_len(row.len()));
        Self {
            data,
            sample_rate,
            hop_length,
            n_mels,
            n_frames,
        }
    }

    /// Returns `n_frames * hop_length / sample_rate` as `f32`.
    ///
    /// The product is computed in 64 bits so large spectrograms cannot
    /// overflow `u32`. A `sample_rate` of `0` yields `0.0` rather than an
    /// infinite or NaN duration.
    pub fn duration(&self) -> f32 {
        if self.sample_rate == 0 {
            return 0.0;
        }
        let total_samples = u64::from(self.n_frames) * u64::from(self.hop_length);
        total_samples as f32 / self.sample_rate as f32
    }

    /// Returns the values of every mel row at `frame_idx`.
    ///
    /// Yields `None` when `frame_idx` is not below `n_frames`, or when any row
    /// is too short to contain that frame. The fields are public, so rows may
    /// be ragged or `n_frames` may be stale; this never panics.
    pub fn frame(&self, frame_idx: usize) -> Option<Vec<f32>> {
        if frame_idx >= self.n_frames as usize {
            return None;
        }
        // Collecting into `Option<Vec<_>>` stops at the first row that lacks the frame.
        self.data
            .iter()
            .map(|row| row.get(frame_idx).copied())
            .collect()
    }
}
/// A single audio sample value paired with an index.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct AudioSample {
/// Sample value.
pub value: f32,
/// Index of the sample (presumably its position in the containing buffer - confirm with callers).
pub index: usize,
}
/// Description of a voice: identity, language, characteristics and model settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceConfig {
/// Identifier of the voice.
pub id: String,
/// Name of the voice.
pub name: String,
/// Language of the voice.
pub language: LanguageCode,
/// Gender, age, style and quality attributes of the voice.
pub characteristics: VoiceCharacteristics,
/// Models and device requirements used by the voice.
pub model_config: ModelConfig,
/// Free-form string key/value metadata.
pub metadata: HashMap<String, String>,
}
/// Descriptive attributes of a voice.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceCharacteristics {
/// Gender of the voice, if specified.
pub gender: Option<Gender>,
/// Age range of the voice, if specified.
pub age: Option<AgeRange>,
/// Speaking style of the voice.
pub style: SpeakingStyle,
/// Whether the voice supports emotion.
pub emotion_support: bool,
/// Quality level of the voice.
pub quality: QualityLevel,
}
impl Default for VoiceCharacteristics {
fn default() -> Self {
Self {
gender: None,
age: None,
style: SpeakingStyle::Neutral,
emotion_support: false,
quality: QualityLevel::Medium,
}
}
}
/// Gender attribute of a voice. Its `Display` output is the variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Gender {
/// Displayed as `Male`.
Male,
/// Displayed as `Female`.
Female,
/// Displayed as `NonBinary`.
NonBinary,
}
impl std::fmt::Display for Gender {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Gender::Male => write!(f, "Male"),
Gender::Female => write!(f, "Female"),
Gender::NonBinary => write!(f, "NonBinary"),
}
}
}
/// Coarse age bracket of a voice, listed from youngest (`Child`) to oldest (`Senior`).
/// NOTE(review): exact age boundaries are not defined in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AgeRange {
Child, Teen, YoungAdult, Adult, Senior, }
/// Speaking style of a voice. Its `Display` output is the variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SpeakingStyle {
/// Style used by `VoiceCharacteristics::default()`.
Neutral,
Conversational,
News,
Formal,
Casual,
Energetic,
Calm,
Dramatic,
Whisper,
}
impl std::fmt::Display for SpeakingStyle {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SpeakingStyle::Neutral => write!(f, "Neutral"),
SpeakingStyle::Conversational => write!(f, "Conversational"),
SpeakingStyle::News => write!(f, "News"),
SpeakingStyle::Formal => write!(f, "Formal"),
SpeakingStyle::Casual => write!(f, "Casual"),
SpeakingStyle::Energetic => write!(f, "Energetic"),
SpeakingStyle::Calm => write!(f, "Calm"),
SpeakingStyle::Dramatic => write!(f, "Dramatic"),
SpeakingStyle::Whisper => write!(f, "Whisper"),
}
}
}
/// Quality tier, listed from lowest (`Low`) to highest (`Ultra`).
///
/// The `Default` impl in this file yields `High`, and the `FromStr` impl
/// accepts `low`, `medium`, `high` and `ultra` in any letter case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum QualityLevel {
Low,
Medium,
High,
Ultra,
}
/// Models used by a voice and the devices they need.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelConfig {
/// Optional G2P model (presumably grapheme-to-phoneme); `None` by default.
pub g2p_model: Option<String>,
/// Acoustic model; defaults to `default-acoustic.safetensors`.
pub acoustic_model: String,
/// Vocoder model; defaults to `default-vocoder.safetensors`.
pub vocoder_model: String,
/// Format of the model files; defaults to `ModelFormat::Candle`.
pub format: ModelFormat,
/// Memory/GPU requirements for running the models.
pub device_requirements: DeviceRequirements,
}
impl Default for ModelConfig {
fn default() -> Self {
Self {
g2p_model: None,
acoustic_model: "default-acoustic.safetensors".to_string(),
vocoder_model: "default-vocoder.safetensors".to_string(),
format: ModelFormat::Candle,
device_requirements: DeviceRequirements::default(),
}
}
}
/// On-disk format of a model.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ModelFormat {
/// Format used by `ModelConfig::default()`.
Candle,
Onnx,
PyTorch,
TensorFlow,
}
/// Device requirements attached to a [`ModelConfig`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceRequirements {
/// Minimum memory in MB (per the field name); defaults to `512`.
pub min_memory_mb: u32,
/// GPU support flag; defaults to `false`.
/// NOTE(review): unclear from this file whether this means "required" or "supported".
pub gpu_support: bool,
/// Compute capability strings; empty by default. Format is not defined in this file.
pub compute_capabilities: Vec<String>,
}
impl Default for DeviceRequirements {
fn default() -> Self {
Self {
min_memory_mb: 512,
gpu_support: false,
compute_capabilities: vec![],
}
}
}
/// Output audio container/codec.
///
/// `AudioFormat::extension` maps each variant to its lowercase file extension,
/// the `Default` impl in this file yields `Wav`, and the `FromStr` impl parses
/// the extension names case-insensitively.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AudioFormat {
/// Extension `wav`.
Wav,
/// Extension `flac`.
Flac,
/// Extension `mp3`.
Mp3,
/// Extension `opus`.
Opus,
/// Extension `ogg`.
Ogg,
}
impl AudioFormat {
pub fn extension(&self) -> &'static str {
match self {
Self::Wav => "wav",
Self::Flac => "flac",
Self::Mp3 => "mp3",
Self::Opus => "opus",
Self::Ogg => "ogg",
}
}
}
impl std::fmt::Display for AudioFormat {
    /// Writes the same text as `extension()`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.extension())
    }
}
/// An audio effect and its parameters, as stored in `SynthesisConfig::effects`.
///
/// NOTE(review): units and valid ranges of the parameters are not defined in
/// this file - confirm against the code that applies the effects.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum AudioEffect {
/// Reverb parameters.
Reverb {
room_size: f32,
damping: f32,
wet_level: f32,
},
/// Delay parameters.
Delay {
delay_time: f32,
feedback: f32,
wet_level: f32,
},
/// Three-band (low/mid/high) gain parameters.
Equalizer {
low_gain: f32,
mid_gain: f32,
high_gain: f32,
},
/// Compressor parameters.
Compressor {
threshold: f32,
ratio: f32,
attack: f32,
release: f32,
},
}
/// Settings for a synthesis request.
///
/// Defaults quoted below come from the `Default` impl in this file; ranges
/// come from the `validate` implementation in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SynthesisConfig {
/// Speaking rate; default `1.0`, validated to lie in `0.5..=2.0`.
pub speaking_rate: f32,
/// Pitch shift in semitones; default `0.0`, validated to lie in `-12.0..=12.0`.
pub pitch_shift: f32,
/// Volume gain in dB; default `0.0`, validated to lie in `-20.0..=20.0`.
pub volume_gain: f32,
/// Enhancement switch; default `true`.
pub enable_enhancement: bool,
/// Output audio format; default `AudioFormat::Wav`.
pub output_format: AudioFormat,
/// Sample rate in Hz; default `22050`, validated to lie in `8000..=96000`.
pub sample_rate: u32,
/// Quality level; default `QualityLevel::High`.
pub quality: QualityLevel,
/// Language; default `LanguageCode::EnUs`.
pub language: LanguageCode,
/// Audio effects; empty by default. `merge_with` appends the other config's effects.
pub effects: Vec<AudioEffect>,
/// Streaming chunk size; `None` by default. Unit is not defined in this file.
pub streaming_chunk_size: Option<usize>,
/// Seed; `None` by default (presumably for reproducible output - confirm).
pub seed: Option<u64>,
/// Emotion switch; default `false`.
pub enable_emotion: bool,
/// Emotion type name; `None` by default.
pub emotion_type: Option<String>,
/// Emotion intensity; default `0.7`, validated to lie in `0.0..=1.0`.
pub emotion_intensity: f32,
/// Emotion preset name; `None` by default.
pub emotion_preset: Option<String>,
/// Automatic emotion detection switch; default `false`.
pub auto_emotion_detection: bool,
/// Voice cloning switch; default `false`.
pub enable_cloning: bool,
/// Cloning method; `None` by default.
pub cloning_method: Option<crate::builder::features::CloningMethod>,
/// Cloning quality; default `0.85`. Not range-checked by `validate`.
pub cloning_quality: f32,
/// Voice conversion switch; default `false`.
pub enable_conversion: bool,
/// Conversion target; `None` by default.
pub conversion_target: Option<crate::builder::features::ConversionTarget>,
/// Real-time conversion switch; default `false`.
pub realtime_conversion: bool,
/// Singing switch; default `false`.
pub enable_singing: bool,
/// Singing voice type; `None` by default.
pub singing_voice_type: Option<crate::builder::features::SingingVoiceType>,
/// Singing technique; `None` by default.
pub singing_technique: Option<crate::builder::features::SingingTechnique>,
/// Musical key; `None` by default.
pub musical_key: Option<crate::builder::features::MusicalKey>,
/// Tempo; `None` by default. Unit is not defined in this file.
pub tempo: Option<f32>,
/// Spatial audio switch; default `false`.
pub enable_spatial: bool,
/// Listener position; `None` by default.
pub listener_position: Option<crate::builder::features::Position3D>,
/// HRTF switch; default `false`.
pub hrtf_enabled: bool,
/// Room size; `None` by default.
pub room_size: Option<crate::builder::features::RoomSize>,
/// Reverb level; default `0.3`. Not range-checked by `validate`.
pub reverb_level: f32,
}
impl Default for SynthesisConfig {
fn default() -> Self {
Self {
speaking_rate: 1.0,
pitch_shift: 0.0,
volume_gain: 0.0,
enable_enhancement: true,
output_format: AudioFormat::Wav,
sample_rate: 22050,
quality: QualityLevel::High,
language: LanguageCode::EnUs,
effects: Vec::new(),
streaming_chunk_size: None,
seed: None,
enable_emotion: false,
emotion_type: None,
emotion_intensity: 0.7,
emotion_preset: None,
auto_emotion_detection: false,
enable_cloning: false,
cloning_method: None,
cloning_quality: 0.85,
enable_conversion: false,
conversion_target: None,
realtime_conversion: false,
enable_singing: false,
singing_voice_type: None,
singing_technique: None,
musical_key: None,
tempo: None,
enable_spatial: false,
listener_position: None,
hrtf_enabled: false,
room_size: None,
reverb_level: 0.3,
}
}
}
impl crate::config::hierarchy::ConfigHierarchy for SynthesisConfig {
    /// Overlays `other` onto `self`.
    ///
    /// A field of `other` overrides the one in `self` only when it differs
    /// from its `SynthesisConfig::default()` value (for `Option` fields: when
    /// it is `Some`; for feature switches: when it is enabled). `effects` are
    /// appended rather than replaced.
    ///
    /// The seed and the cloning, conversion, singing and spatial settings are
    /// merged with the same rule; previously they were silently dropped.
    fn merge_with(&mut self, other: &Self) {
        // Core synthesis parameters.
        if (other.speaking_rate - 1.0).abs() > f32::EPSILON {
            self.speaking_rate = other.speaking_rate;
        }
        if other.pitch_shift.abs() > f32::EPSILON {
            self.pitch_shift = other.pitch_shift;
        }
        if other.volume_gain.abs() > f32::EPSILON {
            self.volume_gain = other.volume_gain;
        }
        if !other.enable_enhancement {
            self.enable_enhancement = other.enable_enhancement;
        }
        if other.output_format != AudioFormat::Wav {
            self.output_format = other.output_format;
        }
        if other.sample_rate != 22050 {
            self.sample_rate = other.sample_rate;
        }
        if other.quality != QualityLevel::High {
            self.quality = other.quality;
        }
        if other.language != LanguageCode::EnUs {
            self.language = other.language;
        }
        if other.streaming_chunk_size.is_some() {
            self.streaming_chunk_size = other.streaming_chunk_size;
        }
        if other.seed.is_some() {
            self.seed = other.seed;
        }

        // Emotion.
        if other.enable_emotion {
            self.enable_emotion = other.enable_emotion;
        }
        if other.emotion_type.is_some() {
            self.emotion_type = other.emotion_type.clone();
        }
        if (other.emotion_intensity - 0.7).abs() > f32::EPSILON {
            self.emotion_intensity = other.emotion_intensity;
        }
        if other.emotion_preset.is_some() {
            self.emotion_preset = other.emotion_preset.clone();
        }
        if other.auto_emotion_detection {
            self.auto_emotion_detection = other.auto_emotion_detection;
        }

        // Voice cloning.
        if other.enable_cloning {
            self.enable_cloning = other.enable_cloning;
        }
        if other.cloning_method.is_some() {
            self.cloning_method = other.cloning_method.clone();
        }
        if (other.cloning_quality - 0.85).abs() > f32::EPSILON {
            self.cloning_quality = other.cloning_quality;
        }

        // Voice conversion.
        if other.enable_conversion {
            self.enable_conversion = other.enable_conversion;
        }
        if other.conversion_target.is_some() {
            self.conversion_target = other.conversion_target.clone();
        }
        if other.realtime_conversion {
            self.realtime_conversion = other.realtime_conversion;
        }

        // Singing.
        if other.enable_singing {
            self.enable_singing = other.enable_singing;
        }
        if other.singing_voice_type.is_some() {
            self.singing_voice_type = other.singing_voice_type.clone();
        }
        if other.singing_technique.is_some() {
            self.singing_technique = other.singing_technique.clone();
        }
        if other.musical_key.is_some() {
            self.musical_key = other.musical_key.clone();
        }
        if other.tempo.is_some() {
            self.tempo = other.tempo;
        }

        // Spatial audio.
        if other.enable_spatial {
            self.enable_spatial = other.enable_spatial;
        }
        if other.listener_position.is_some() {
            self.listener_position = other.listener_position.clone();
        }
        if other.hrtf_enabled {
            self.hrtf_enabled = other.hrtf_enabled;
        }
        if other.room_size.is_some() {
            self.room_size = other.room_size.clone();
        }
        if (other.reverb_level - 0.3).abs() > f32::EPSILON {
            self.reverb_level = other.reverb_level;
        }

        // Effects accumulate across layers instead of overriding.
        self.effects.extend(other.effects.iter().cloned());
    }

    /// Checks that the numeric settings are within their supported ranges.
    ///
    /// Returns the first violation found, in this order: `speaking_rate`
    /// (0.5..=2.0), `pitch_shift` (-12.0..=12.0), `volume_gain`
    /// (-20.0..=20.0), `sample_rate` (8000..=96000) and `emotion_intensity`
    /// (0.0..=1.0).
    ///
    /// # Errors
    ///
    /// Returns a `ConfigValidationError` naming the offending field. NaN
    /// values are rejected: the range test uses `contains`, which is false for
    /// NaN, whereas the former `x < lo || x > hi` comparison let NaN through.
    fn validate(&self) -> Result<(), crate::config::hierarchy::ConfigValidationError> {
        // Builds the error value for a field that failed its range check.
        fn rejected(
            field: &str,
            message: &str,
        ) -> crate::config::hierarchy::ConfigValidationError {
            crate::config::hierarchy::ConfigValidationError {
                field: field.to_string(),
                message: message.to_string(),
            }
        }

        if !(0.5_f32..=2.0).contains(&self.speaking_rate) {
            return Err(rejected(
                "speaking_rate",
                "Speaking rate must be between 0.5 and 2.0",
            ));
        }
        if !(-12.0_f32..=12.0).contains(&self.pitch_shift) {
            return Err(rejected(
                "pitch_shift",
                "Pitch shift must be between -12.0 and 12.0 semitones",
            ));
        }
        if !(-20.0_f32..=20.0).contains(&self.volume_gain) {
            return Err(rejected(
                "volume_gain",
                "Volume gain must be between -20.0 and 20.0 dB",
            ));
        }
        if !(8000_u32..=96000).contains(&self.sample_rate) {
            return Err(rejected(
                "sample_rate",
                "Sample rate must be between 8000 and 96000 Hz",
            ));
        }
        if !(0.0_f32..=1.0).contains(&self.emotion_intensity) {
            return Err(rejected(
                "emotion_intensity",
                "Emotion intensity must be between 0.0 and 1.0",
            ));
        }
        Ok(())
    }
}
impl Default for AudioFormat {
fn default() -> Self {
AudioFormat::Wav
}
}
impl FromStr for AudioFormat {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"wav" => Ok(AudioFormat::Wav),
"flac" => Ok(AudioFormat::Flac),
"mp3" => Ok(AudioFormat::Mp3),
"opus" => Ok(AudioFormat::Opus),
"ogg" => Ok(AudioFormat::Ogg),
_ => Err(format!("Unknown audio format: {s}")),
}
}
}
/// Feature flags a model can advertise.
///
/// NOTE(review): this overlaps in naming with `AdvancedFeature`
/// (`EmotionControl`, `VoiceCloning`, GPU acceleration); how the two enums
/// relate is not shown in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ModelFeature {
MultiSpeaker,
EmotionControl,
StyleControl,
ProsodyControl,
VoiceCloning,
StreamingSupport,
BatchProcessing,
GPUAcceleration,
}
/// Snapshot of what the system can do: features, hardware, limits and per-model capabilities.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SystemCapabilities {
/// Features currently available.
pub available_features: Vec<AdvancedFeature>,
/// Detected or assumed hardware.
pub hardware: HardwareCapabilities,
/// Resource ceilings.
pub resource_limits: ResourceLimits,
/// Capabilities keyed by a string (presumably a model name or id - confirm); empty by default.
pub model_capabilities: HashMap<String, ModelCapabilities>,
}
/// Optional features that can be requested, negotiated and reported as available.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum AdvancedFeature {
EmotionControl,
VoiceCloning,
VoiceConversion,
SingingSynthesis,
SpatialAudio,
StreamingSynthesis,
GpuAcceleration,
WasmSupport,
CloudProcessing,
HighQualityVocoding,
RealtimeProcessing,
}
/// Hardware available to the system.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct HardwareCapabilities {
/// Whether a GPU is available; `false` by default.
pub gpu_available: bool,
/// GPU memory in MB (per the field name); `None` by default.
pub gpu_memory_mb: Option<u64>,
/// Number of CPU cores; the default takes it from `num_cpus::get()`.
pub cpu_cores: u32,
/// System memory in MB (per the field name); the default is a fixed `4096`, not a measured value.
pub system_memory_mb: u64,
/// Fast-storage flag; `true` by default.
pub fast_storage: bool,
}
/// Resource ceilings used as system limits and as request constraints.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResourceLimits {
/// Maximum memory in MB (per the field name); defaults to `2048`.
pub max_memory_mb: u64,
/// Maximum CPU usage in percent (per the field name); defaults to `80`.
pub max_cpu_percent: u8,
/// Maximum latency in milliseconds (per the field name); defaults to `500`.
pub max_latency_ms: u32,
/// Battery optimisation flag; defaults to `false`.
pub battery_optimization: bool,
}
/// What a single model supports, needs and costs.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ModelCapabilities {
/// Features the model supports.
pub supported_features: Vec<AdvancedFeature>,
/// Hardware the model needs.
pub hardware_requirements: HardwareRequirements,
/// Latency, memory and quality figures for the model.
pub performance_profile: PerformanceProfile,
}
/// Minimum hardware a model needs.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct HardwareRequirements {
/// Minimum system memory in MB (per the field name).
pub min_memory_mb: u64,
/// Minimum GPU memory in MB (per the field name), if any.
pub min_gpu_memory_mb: Option<u64>,
/// Whether a GPU is required.
pub requires_gpu: bool,
/// Minimum number of CPU cores.
pub min_cpu_cores: u32,
}
/// Performance figures for a model.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PerformanceProfile {
/// Initialisation latency in milliseconds (per the field name).
pub init_latency_ms: u32,
/// Synthesis latency in milliseconds per second (per the field name; presumably per second of audio - confirm).
pub synthesis_latency_ms_per_sec: u32,
/// Memory used during synthesis in MB (per the field name).
pub synthesis_memory_mb: u64,
/// Quality score. NOTE(review): the scale is not defined in this file.
pub quality_score: u8, }
/// A request for features under resource constraints.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CapabilityRequest {
/// Features the caller wants; the default asks for `StreamingSynthesis` only.
pub desired_features: Vec<AdvancedFeature>,
/// Priorities of the requested features; the default is a single `Preferred`.
/// NOTE(review): presumably parallel to `desired_features` (the default has one entry in each) - confirm.
pub feature_priorities: Vec<FeaturePriority>,
/// Resource limits the request must respect.
pub constraints: ResourceLimits,
/// Fallback strategy for the request; defaults to `GracefulDegradation`.
pub fallback_strategy: FallbackStrategy,
}
/// Importance of a requested feature.
///
/// The derived ordering follows declaration order:
/// `Optional < Preferred < Required < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum FeaturePriority {
Optional,
Preferred,
Required,
Critical,
}
/// Fallback strategy carried by a [`CapabilityRequest`].
///
/// NOTE(review): the behaviour behind each variant is implemented outside this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FallbackStrategy {
FailFast,
/// Strategy used by `CapabilityRequest::default()`.
GracefulDegradation,
UseAlternatives,
BasicFunctionality,
}
/// Result of a capability negotiation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CapabilityNegotiation {
/// Features that were enabled.
pub enabled_features: Vec<AdvancedFeature>,
/// Features that are unavailable.
pub unavailable_features: Vec<AdvancedFeature>,
/// Warning messages.
pub warnings: Vec<String>,
/// Selected models as string pairs (key and value meaning are not defined in this file - confirm).
pub selected_models: HashMap<String, String>,
/// Estimated resource usage.
pub estimated_usage: ResourceUsage,
}
/// Resource usage figures, as reported in `CapabilityNegotiation::estimated_usage`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResourceUsage {
/// Memory in MB (per the field name).
pub memory_mb: u64,
/// Initialisation time in milliseconds (per the field name).
pub init_time_ms: u32,
/// Processing latency in milliseconds (per the field name).
pub processing_latency_ms: u32,
/// CPU usage in percent (per the field name).
pub cpu_usage_percent: u8,
}
impl Default for SystemCapabilities {
fn default() -> Self {
Self {
available_features: vec![
AdvancedFeature::EmotionControl,
AdvancedFeature::StreamingSynthesis,
AdvancedFeature::RealtimeProcessing,
],
hardware: HardwareCapabilities::default(),
resource_limits: ResourceLimits::default(),
model_capabilities: HashMap::new(),
}
}
}
impl Default for HardwareCapabilities {
fn default() -> Self {
Self {
gpu_available: false,
gpu_memory_mb: None,
cpu_cores: num_cpus::get() as u32,
system_memory_mb: 4096, fast_storage: true,
}
}
}
impl Default for ResourceLimits {
fn default() -> Self {
Self {
max_memory_mb: 2048,
max_cpu_percent: 80,
max_latency_ms: 500,
battery_optimization: false,
}
}
}
impl Default for CapabilityRequest {
fn default() -> Self {
Self {
desired_features: vec![AdvancedFeature::StreamingSynthesis],
feature_priorities: vec![FeaturePriority::Preferred],
constraints: ResourceLimits::default(),
fallback_strategy: FallbackStrategy::GracefulDegradation,
}
}
}
impl Default for QualityLevel {
fn default() -> Self {
QualityLevel::High
}
}
impl FromStr for QualityLevel {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"low" => Ok(QualityLevel::Low),
"medium" => Ok(QualityLevel::Medium),
"high" => Ok(QualityLevel::High),
"ultra" => Ok(QualityLevel::Ultra),
_ => Err(format!("Unknown quality level: {s}")),
}
}
}