use serde::{Deserialize, Serialize};
#[cfg(feature = "schema")]
use schemars::JsonSchema;
/// How the voice data for a model is described, discriminated by a `format` key.
///
/// The enum uses serde's internally tagged representation: the serialized
/// object carries `"format": "<variant>"` next to that variant's own fields.
/// The wire names are `embedded`, `per_model`, `precomputed_codes` and
/// `cloning`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(JsonSchema))]
// `snake_case` produces exactly the four wire names listed above for the
// current variants; a newly added variant gets its snake_case name as well.
#[serde(tag = "format", rename_all = "snake_case")]
pub enum VoiceFormat {
    /// Tagged `"format": "embedded"`; carries a `file` and the `loader` to use.
    // NOTE(review): `file` is presumably a path to one file holding all
    // voices -- confirm against the code that consumes it.
    // NOTE(review): `loader` is required on input even though `VoiceLoader`
    // implements `Default`; add `#[serde(default)]` if omitting it should be
    // accepted.
    Embedded {
        file: String,
        loader: VoiceLoader,
    },
    /// Tagged `"format": "per_model"`; carries a `voice_dir` and a `pattern`.
    // NOTE(review): the tests in this file use a `{voice_id}` placeholder in
    // `pattern`; substitution is not performed in this file.
    PerModel {
        voice_dir: String,
        pattern: String,
    },
    /// Tagged `"format": "precomputed_codes"`; carries a directory and a
    /// pattern for codes, and a directory and a pattern for transcripts.
    PrecomputedCodes {
        codes_dir: String,
        codes_pattern: String,
        transcript_dir: String,
        transcript_pattern: String,
    },
    /// Tagged `"format": "cloning"`; carries an `encoder_model` and the
    /// `min_audio_seconds` / `max_audio_seconds` pair.
    // NOTE(review): nothing in this type checks that
    // `min_audio_seconds <= max_audio_seconds`.
    Cloning {
        encoder_model: String,
        min_audio_seconds: f32,
        max_audio_seconds: f32,
    },
}
/// Identifies which loader an embedded voice file should be read with.
///
/// Each variant has an explicit wire name; [`Default`] yields
/// `BinaryF32_256`.
// NOTE(review): the variant names suggest the on-disk encodings (raw f32,
// NumPy `.npz`, base64 inside JSON), but the parsing code is not in this
// file -- confirm there.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(JsonSchema))]
pub enum VoiceLoader {
    /// Serialized as `"binary_f32_256"`; this is the default variant.
    #[serde(rename = "binary_f32_256")]
    #[default]
    BinaryF32_256,
    /// Serialized as `"numpy_npz"`.
    #[serde(rename = "numpy_npz")]
    NumpyNpz,
    /// Serialized as `"json_base64"`.
    #[serde(rename = "json_base64")]
    JsonBase64,
}
/// One entry of a voice catalog.
///
/// `id` and `name` are mandatory. Every other field is optional: it
/// deserializes to `None` when the key is absent and is left out of the
/// serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(JsonSchema))]
pub struct VoiceInfo {
    /// Identifier of the voice (for example `"af_bella"` in this file's tests).
    pub id: String,
    /// Name of the voice (for example `"Bella"` in this file's tests).
    pub name: String,
    /// Optional gender label; a free-form string.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub gender: Option<String>,
    /// Optional language label; a free-form string such as `"en-US"`.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub language: Option<String>,
    /// Optional style label; a free-form string.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub style: Option<String>,
    /// Optional numeric index.
    // NOTE(review): presumably the position of this voice inside an embedded
    // voice file -- confirm against the loader.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub index: Option<usize>,
    /// Optional preview URL, stored as a plain string (not validated here).
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub preview_url: Option<String>,
}
/// Strategy selector stored in `VoiceConfig::selection_strategy`.
///
/// No serde renames are applied, so the variants serialize under their Rust
/// names: `"FixedIndex"` and `"TokenLength"`. [`Default`] yields
/// `FixedIndex`.
// NOTE(review): the other enums in this file use snake_case wire names; this
// one does not. Renaming would break existing configs, so keep it as is
// unless an alias is added at the same time.
// NOTE(review): what each strategy does is decided by code outside this file.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(JsonSchema))]
pub enum VoiceSelectionStrategy {
    /// The default strategy.
    #[default]
    FixedIndex,
    /// The alternative strategy.
    TokenLength,
}
/// Top-level voice configuration.
///
/// `format` is flattened, so the `"format"` tag and the fields of the chosen
/// variant sit at the same level as `default` and `catalog` in the serialized
/// object rather than under a nested `format` object.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(JsonSchema))]
pub struct VoiceConfig {
    /// Voice data layout; its keys are merged into this object.
    #[serde(flatten)]
    pub format: VoiceFormat,
    /// Identifier of the default voice.
    // NOTE(review): this type does not check that `default` names an entry of
    // `catalog`.
    pub default: String,
    /// Catalog entries; the key is required on input but the list may be empty.
    pub catalog: Vec<VoiceInfo>,
    /// Selection strategy. Falls back to the default when the key is absent
    /// and is omitted from the output while it is `FixedIndex`.
    #[serde(skip_serializing_if = "is_default_strategy", default)]
    pub selection_strategy: VoiceSelectionStrategy,
}
/// Serde `skip_serializing_if` predicate for `VoiceConfig::selection_strategy`.
///
/// Returns `true` when `strategy` equals the default strategy
/// (`FixedIndex`), so that the field is left out of the serialized output.
/// The by-reference parameter is the shape serde requires for such predicates.
fn is_default_strategy(strategy: &VoiceSelectionStrategy) -> bool {
    // Comparing against `default()` keeps the predicate tied to the variant
    // marked `#[default]`, which is `FixedIndex`.
    let fallback = VoiceSelectionStrategy::default();
    *strategy == fallback
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An `Embedded` config serializes with flattened keys and survives a
    /// round trip through pretty-printed JSON.
    #[test]
    fn test_voice_config_serialization() {
        let bella = VoiceInfo {
            id: String::from("af_bella"),
            name: String::from("Bella"),
            gender: Some(String::from("female")),
            language: Some(String::from("en-US")),
            style: None,
            index: Some(0),
            preview_url: None,
        };
        let adam = VoiceInfo {
            id: String::from("am_adam"),
            name: String::from("Adam"),
            gender: Some(String::from("male")),
            language: Some(String::from("en-US")),
            style: Some(String::from("neutral")),
            index: Some(1),
            preview_url: None,
        };
        let config = VoiceConfig {
            format: VoiceFormat::Embedded {
                file: String::from("voices.bin"),
                loader: VoiceLoader::BinaryF32_256,
            },
            default: String::from("af_bella"),
            catalog: vec![bella, adam],
            selection_strategy: VoiceSelectionStrategy::default(),
        };

        // The pretty printer emits `"key": value`, which these substring
        // checks rely on.
        let json = serde_json::to_string_pretty(&config).unwrap();
        assert!(json.contains("\"format\": \"embedded\""));
        assert!(json.contains("\"file\": \"voices.bin\""));
        assert!(json.contains("\"default\": \"af_bella\""));

        let round_tripped = serde_json::from_str::<VoiceConfig>(&json).unwrap();
        assert_eq!(round_tripped.default, "af_bella");
        assert_eq!(round_tripped.catalog.len(), 2);
    }

    /// A `per_model` document deserializes into `VoiceFormat::PerModel`.
    #[test]
    fn test_voice_format_per_model() {
        let json = r#"{
"format": "per_model",
"voice_dir": "voices/",
"pattern": "{voice_id}.onnx",
"default": "en-us-libritts",
"catalog": [
{"id": "en-us-libritts", "name": "LibriTTS", "language": "en-US"}
]
}"#;
        let config = serde_json::from_str::<VoiceConfig>(json).unwrap();

        if let VoiceFormat::PerModel { voice_dir, pattern } = &config.format {
            assert_eq!(voice_dir, "voices/");
            assert_eq!(pattern, "{voice_id}.onnx");
        } else {
            panic!("Expected PerModel format");
        }
    }

    /// A `precomputed_codes` document deserializes into
    /// `VoiceFormat::PrecomputedCodes` and survives a round trip.
    #[test]
    fn test_voice_format_precomputed_codes() {
        let json = r#"{
"format": "precomputed_codes",
"codes_dir": "voices/",
"codes_pattern": "{voice_id}.bin",
"transcript_dir": "voices/",
"transcript_pattern": "{voice_id}.txt",
"default": "jo",
"catalog": [
{"id": "jo", "name": "Jo", "gender": "female", "language": "en-US"},
{"id": "dave", "name": "Dave", "gender": "male", "language": "en-US"}
]
}"#;
        let config = serde_json::from_str::<VoiceConfig>(json).unwrap();

        if let VoiceFormat::PrecomputedCodes {
            codes_dir,
            codes_pattern,
            transcript_dir,
            transcript_pattern,
        } = &config.format
        {
            assert_eq!(codes_dir, "voices/");
            assert_eq!(codes_pattern, "{voice_id}.bin");
            assert_eq!(transcript_dir, "voices/");
            assert_eq!(transcript_pattern, "{voice_id}.txt");
        } else {
            panic!("Expected PrecomputedCodes format");
        }
        assert_eq!(config.default, "jo");
        assert_eq!(config.catalog.len(), 2);

        // Serialize and parse again to check the round trip.
        let compact = serde_json::to_string(&config).unwrap();
        let round_tripped = serde_json::from_str::<VoiceConfig>(&compact).unwrap();
        assert_eq!(round_tripped.default, "jo");
        assert_eq!(round_tripped.catalog.len(), 2);
    }

    /// A `cloning` document deserializes into `VoiceFormat::Cloning`, here
    /// with an empty catalog.
    #[test]
    fn test_voice_format_cloning() {
        let json = r#"{
"format": "cloning",
"encoder_model": "voice_encoder.onnx",
"min_audio_seconds": 3.0,
"max_audio_seconds": 30.0,
"default": "cloned",
"catalog": []
}"#;
        let config = serde_json::from_str::<VoiceConfig>(json).unwrap();

        if let VoiceFormat::Cloning {
            encoder_model,
            min_audio_seconds,
            max_audio_seconds,
        } = &config.format
        {
            assert_eq!(encoder_model, "voice_encoder.onnx");
            // 3.0 and 30.0 are exactly representable as f32, so exact
            // equality is safe here.
            assert_eq!(*min_audio_seconds, 3.0);
            assert_eq!(*max_audio_seconds, 30.0);
        } else {
            panic!("Expected Cloning format");
        }
    }
}