use super::{SpeechError, SpeechResult};
/// Configuration for text-to-speech synthesis.
///
/// The constraints noted on each field are the ones checked by
/// `TtsConfig::validate`; constructing the struct directly performs no checks.
#[derive(Debug, Clone)]
pub struct TtsConfig {
/// Audio sample rate; must be > 0. `frames_per_second` and
/// `max_output_samples` treat it as samples per second (presumably Hz).
pub sample_rate: u32,
/// Number of mel channels; must be > 0. Presumably the number of
/// mel-spectrogram bands — confirm against the synthesizer implementations.
pub n_mels: usize,
/// Hop between consecutive frames; must be > 0. `frames_per_second` divides
/// `sample_rate` by this value, so it is measured in samples.
pub hop_size: usize,
/// Analysis window size; must be > 0 and >= `hop_size`
/// (presumably in samples, like `hop_size`).
pub win_size: usize,
/// Speaking-rate factor; `validate` expects a value in `(0, 5]`.
pub speaking_rate: f32,
/// Pitch shift in semitones; `validate` expects a value in `[-24, 24]`.
pub pitch_shift: f32,
/// Energy scaling factor; `validate` expects a value in `(0, 3]`.
pub energy_scale: f32,
/// Maximum accepted length of the request text; must be > 0.
pub max_text_length: usize,
/// Upper bound on the output duration; must be > 0. `max_output_samples`
/// multiplies it by `sample_rate`, so it is presumably expressed in seconds.
pub max_output_duration: f32,
}
impl Default for TtsConfig {
    /// Returns the baseline configuration that the presets build upon:
    /// a 22050 sample rate with 80 mel channels, neutral prosody settings,
    /// and limits of 500 for text length and 30.0 for output duration.
    fn default() -> Self {
        Self {
            // Limits applied to requests and generated audio.
            max_text_length: 500,
            max_output_duration: 30.0,

            // Neutral prosody: no rate, pitch, or energy adjustment.
            speaking_rate: 1.0,
            pitch_shift: 0.0,
            energy_scale: 1.0,

            // Audio and spectral framing parameters.
            sample_rate: 22_050,
            n_mels: 80,
            hop_size: 256,
            win_size: 1_024,
        }
    }
}
impl TtsConfig {
    /// Preset with a 48000 sample rate, 128 mel channels, hop size 512 and
    /// window size 2048; every other field keeps its [`Default`] value.
    #[must_use]
    pub fn high_quality() -> Self {
        Self {
            sample_rate: 48000,
            n_mels: 128,
            hop_size: 512,
            win_size: 2048,
            ..Self::default()
        }
    }

    /// Preset with a 16000 sample rate, 80 mel channels, hop size 160 and
    /// window size 640; every other field keeps its [`Default`] value.
    #[must_use]
    pub fn fast() -> Self {
        Self {
            sample_rate: 16000,
            n_mels: 80,
            hop_size: 160,
            win_size: 640,
            ..Self::default()
        }
    }

    /// Checks every field against its allowed range.
    ///
    /// Fields are checked in declaration order and the first violation is
    /// reported. NaN is rejected for every floating-point field.
    ///
    /// # Errors
    ///
    /// Returns [`SpeechError::InvalidConfig`] with a message naming the
    /// offending field when:
    /// - `sample_rate`, `n_mels`, `hop_size` or `max_text_length` is zero,
    /// - `win_size` is zero or smaller than `hop_size`,
    /// - `speaking_rate` is not in `(0, 5]`,
    /// - `pitch_shift` is not in `[-24, 24]`,
    /// - `energy_scale` is not in `(0, 3]`,
    /// - `max_output_duration` is NaN or not greater than zero.
    pub fn validate(&self) -> SpeechResult<()> {
        /// Builds the error result for a violated constraint.
        fn invalid(message: &str) -> SpeechResult<()> {
            Err(SpeechError::InvalidConfig(message.to_string()))
        }

        /// True when `value` lies in `(0, upper]`. Expressed as a positive
        /// test so that NaN, which fails every comparison, is never accepted.
        fn within_positive_bound(value: f32, upper: f32) -> bool {
            value > 0.0 && value <= upper
        }

        if self.sample_rate == 0 {
            return invalid("sample_rate must be > 0");
        }
        if self.n_mels == 0 {
            return invalid("n_mels must be > 0");
        }
        if self.hop_size == 0 {
            return invalid("hop_size must be > 0");
        }
        if self.win_size == 0 || self.win_size < self.hop_size {
            return invalid("win_size must be > 0 and >= hop_size");
        }
        if !within_positive_bound(self.speaking_rate, 5.0) {
            return invalid("speaking_rate must be in (0, 5]");
        }
        // `RangeInclusive::contains` is false for NaN, so NaN is rejected here too.
        if !(-24.0..=24.0).contains(&self.pitch_shift) {
            return invalid("pitch_shift must be in [-24, 24] semitones");
        }
        if !within_positive_bound(self.energy_scale, 3.0) {
            return invalid("energy_scale must be in (0, 3]");
        }
        if self.max_text_length == 0 {
            return invalid("max_text_length must be > 0");
        }
        // NaN is neither <= 0 nor > 0, so it needs an explicit check.
        if self.max_output_duration.is_nan() || self.max_output_duration <= 0.0 {
            return invalid("max_output_duration must be > 0");
        }
        Ok(())
    }

    /// Number of frames per second: `sample_rate / hop_size`.
    ///
    /// The division is performed in `f32`, so a zero `hop_size` (which
    /// [`TtsConfig::validate`] rejects) yields a non-finite value rather
    /// than a panic.
    #[must_use]
    pub fn frames_per_second(&self) -> f32 {
        self.sample_rate as f32 / self.hop_size as f32
    }

    /// Maximum number of output samples: `max_output_duration * sample_rate`.
    ///
    /// The product is computed in `f32` and converted with `as`, which
    /// truncates toward zero, saturates at the `usize` bounds and maps NaN
    /// to zero.
    #[must_use]
    pub fn max_output_samples(&self) -> usize {
        (self.max_output_duration * self.sample_rate as f32) as usize
    }
}
/// A single text-to-speech request.
///
/// Only `text` is required; the optional fields are `None` unless set through
/// the `with_*` builder methods or assigned directly.
#[derive(Debug, Clone)]
pub struct SynthesisRequest {
/// Text to synthesize. `validate` rejects empty text and text longer than
/// the configured `max_text_length`.
pub text: String,
/// Optional speaker identifier (presumably one of the values returned by
/// `SpeechSynthesizer::available_speakers` — confirm with implementations).
pub speaker_id: Option<String>,
/// Optional speaking rate; when set, `validate` expects a value in `(0, 5]`.
/// Presumably overrides `TtsConfig::speaking_rate` — confirm with implementations.
pub speaking_rate: Option<f32>,
/// Optional pitch shift; when set, `validate` expects a value in `[-24, 24]`.
pub pitch_shift: Option<f32>,
/// Optional energy scale; when set, `validate` expects a value in `(0, 3]`.
pub energy_scale: Option<f32>,
/// Optional language tag; its format is not constrained in this file.
pub language: Option<String>,
}
impl SynthesisRequest {
    /// Creates a request for `text` with every optional setting left unset.
    #[must_use]
    pub fn new(text: String) -> Self {
        Self {
            text,
            speaker_id: None,
            speaking_rate: None,
            pitch_shift: None,
            energy_scale: None,
            language: None,
        }
    }

    /// Sets the speaker identifier and returns the updated request.
    #[must_use]
    pub fn with_speaker(mut self, speaker_id: String) -> Self {
        self.speaker_id = Some(speaker_id);
        self
    }

    /// Sets the speaking rate and returns the updated request.
    ///
    /// The value is not checked here; see [`SynthesisRequest::validate`].
    #[must_use]
    pub fn with_speaking_rate(mut self, rate: f32) -> Self {
        self.speaking_rate = Some(rate);
        self
    }

    /// Sets the pitch shift (in semitones) and returns the updated request.
    ///
    /// The value is not checked here; see [`SynthesisRequest::validate`].
    #[must_use]
    pub fn with_pitch_shift(mut self, semitones: f32) -> Self {
        self.pitch_shift = Some(semitones);
        self
    }

    /// Sets the energy scale and returns the updated request.
    ///
    /// The value is not checked here; see [`SynthesisRequest::validate`].
    #[must_use]
    pub fn with_energy_scale(mut self, scale: f32) -> Self {
        self.energy_scale = Some(scale);
        self
    }

    /// Sets the language and returns the updated request.
    #[must_use]
    pub fn with_language(mut self, language: String) -> Self {
        self.language = Some(language);
        self
    }

    /// Checks the request against `config`.
    ///
    /// Text length is measured in Unicode scalar values (`char`s), matching
    /// the wording of the error message, so multi-byte text is not penalised
    /// for its UTF-8 encoding. NaN is rejected for every numeric override.
    ///
    /// # Errors
    ///
    /// Returns [`SpeechError::InvalidConfig`] when:
    /// - `text` is empty or has more than `config.max_text_length` chars,
    /// - `speaking_rate` is set and not in `(0, 5]`,
    /// - `pitch_shift` is set and not in `[-24, 24]`,
    /// - `energy_scale` is set and not in `(0, 3]`.
    pub fn validate(&self, config: &TtsConfig) -> SpeechResult<()> {
        /// Builds the error result for a violated constraint.
        fn invalid(message: &str) -> SpeechResult<()> {
            Err(SpeechError::InvalidConfig(message.to_string()))
        }

        /// True when `value` lies in `(0, upper]`. Expressed as a positive
        /// test so that NaN, which fails every comparison, is never accepted.
        fn within_positive_bound(value: f32, upper: f32) -> bool {
            value > 0.0 && value <= upper
        }

        if self.text.is_empty() {
            return invalid("empty text");
        }

        // The UTF-8 byte length is an upper bound on the char count, so the
        // O(n) char count is only needed when the byte length is over the limit.
        if self.text.len() > config.max_text_length {
            let char_count = self.text.chars().count();
            if char_count > config.max_text_length {
                return Err(SpeechError::InvalidConfig(format!(
                    "text too long: {} chars, max {}",
                    char_count, config.max_text_length
                )));
            }
        }

        if let Some(rate) = self.speaking_rate {
            if !within_positive_bound(rate, 5.0) {
                return invalid("speaking_rate must be in (0, 5]");
            }
        }
        if let Some(shift) = self.pitch_shift {
            // `RangeInclusive::contains` is false for NaN, so NaN is rejected.
            if !(-24.0..=24.0).contains(&shift) {
                return invalid("pitch_shift must be in [-24, 24]");
            }
        }
        if let Some(scale) = self.energy_scale {
            if !within_positive_bound(scale, 3.0) {
                return invalid("energy_scale must be in (0, 3]");
            }
        }
        Ok(())
    }
}
/// Output of a synthesis call: the audio plus optional intermediate data.
#[derive(Debug, Clone)]
pub struct SynthesisResult {
/// Audio samples (presumably mono PCM — channel layout and amplitude range
/// are not established in this file).
pub audio: Vec<f32>,
/// Sample rate associated with `audio`.
pub sample_rate: u32,
/// Duration of `audio`. `SynthesisResult::new` sets it to
/// `audio.len() / sample_rate`, or `0.0` when `sample_rate` is zero.
pub duration: f32,
/// Optional mel spectrogram. The meaning of the outer and inner dimensions
/// is not established in this file — confirm with the producer.
pub mel_spectrogram: Option<Vec<Vec<f32>>>,
/// Optional per-token alignment entries.
pub alignment: Option<Vec<AlignmentInfo>>,
/// Optional phoneme sequence.
pub phonemes: Option<Vec<String>>,
}
impl SynthesisResult {
    /// Wraps `audio` in a result, deriving `duration` from the sample count.
    ///
    /// `duration` is `audio.len() / sample_rate` computed in `f32`; a zero
    /// `sample_rate` yields a duration of `0.0` instead of dividing by zero.
    /// The optional fields all start out as `None`.
    #[must_use]
    pub fn new(audio: Vec<f32>, sample_rate: u32) -> Self {
        let duration = match sample_rate {
            0 => 0.0,
            rate => audio.len() as f32 / rate as f32,
        };

        Self {
            mel_spectrogram: None,
            alignment: None,
            phonemes: None,
            duration,
            sample_rate,
            audio,
        }
    }

    /// Stores `mel` as the mel spectrogram, replacing any previous one.
    pub fn with_mel(&mut self, mel: Vec<Vec<f32>>) {
        self.mel_spectrogram = Some(mel);
    }

    /// Stores `alignment`, replacing any previous alignment data.
    pub fn with_alignment(&mut self, alignment: Vec<AlignmentInfo>) {
        self.alignment = Some(alignment);
    }

    /// Stores `phonemes`, replacing any previous phoneme sequence.
    pub fn with_phonemes(&mut self, phonemes: Vec<String>) {
        self.phonemes = Some(phonemes);
    }

    /// Number of samples held in `audio`.
    #[must_use]
    pub fn num_samples(&self) -> usize {
        self.audio.len()
    }

    /// Whether a mel spectrogram has been attached.
    #[must_use]
    pub fn has_mel(&self) -> bool {
        self.mel_spectrogram.is_some()
    }
}
/// Timing information for one token of the synthesized output.
#[derive(Debug, Clone)]
pub struct AlignmentInfo {
/// The aligned token.
pub token: String,
/// Start of the token (presumably seconds from the start of the audio —
/// the unit is not established in this file).
pub start: f32,
/// End of the token, in the same unit as `start`.
pub end: f32,
/// Alignment confidence. `AlignmentInfo::new` sets it to `1.0` and
/// `with_confidence` clamps it to `[0, 1]`.
pub confidence: f32,
}
impl AlignmentInfo {
    /// Creates an alignment entry for `token` spanning `start..end` with
    /// full confidence (`1.0`).
    #[must_use]
    pub fn new(token: String, start: f32, end: f32) -> Self {
        Self {
            confidence: 1.0,
            token,
            start,
            end,
        }
    }

    /// Returns the entry with its confidence replaced by `confidence`
    /// clamped to `[0, 1]`.
    #[must_use]
    pub fn with_confidence(self, confidence: f32) -> Self {
        Self {
            confidence: confidence.clamp(0.0, 1.0),
            ..self
        }
    }

    /// Length of the aligned span, `end - start`.
    ///
    /// No ordering of `start` and `end` is enforced, so the result is
    /// negative when `end` precedes `start`.
    #[must_use]
    pub fn duration(&self) -> f32 {
        let Self { start, end, .. } = self;
        end - start
    }
}
/// Interface implemented by text-to-speech synthesizers.
pub trait SpeechSynthesizer {
/// Synthesizes `request`, returning the produced [`SynthesisResult`] or an error.
fn synthesize(&self, request: &SynthesisRequest) -> SpeechResult<SynthesisResult>;
/// Returns the [`TtsConfig`] associated with this synthesizer.
fn config(&self) -> &TtsConfig;
/// Lists the speakers this synthesizer offers (presumably the identifiers
/// accepted in `SynthesisRequest::speaker_id` — confirm with implementations).
fn available_speakers(&self) -> Vec<String>;
/// Reports whether `language` is supported; the expected tag format is
/// defined by the implementation.
fn supports_language(&self, language: &str) -> bool;
}
/// Interface implemented by vocoders.
pub trait Vocoder {
/// Produces a sample buffer from `mel`, or an error.
///
/// Presumably `mel` is a mel spectrogram and the result is waveform audio;
/// the orientation of `mel` (frames x channels or the reverse) is not
/// established in this file — confirm with implementations.
fn vocalize(&self, mel: &[Vec<f32>]) -> SpeechResult<Vec<f32>>;
/// Sample rate reported by this vocoder.
fn sample_rate(&self) -> u32;
/// Number of mel channels reported by this vocoder.
fn n_mels(&self) -> usize;
}
// Private submodule whose public items are all re-exported from this module.
mod synthesizers;
pub use synthesizers::*;
// Test submodule, compiled only for test builds.
#[cfg(test)]
mod tests;