dracon-tts-runtime 94.7.0

use anyhow;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::sync::Arc;

/// Coarse gender category used to label a voice.
///
/// The serde derives are applied without any rename attribute, so the
/// serialized form uses the variant identifiers exactly as written here.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum Gender {
    /// Male voice.
    Male,
    /// Female voice.
    Female,
    /// Anything that is neither `Male` nor `Female`.
    Other,
}

/// Renders the gender as a lowercase English word: `male`, `female` or `other`.
impl fmt::Display for Gender {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the label first, then emit it verbatim; width/fill flags on
        // the formatter are not applied.
        let label = match *self {
            Gender::Male => "male",
            Gender::Female => "female",
            Gender::Other => "other",
        };
        f.write_str(label)
    }
}

/// Lenient, case-insensitive parsing of a gender label.
///
/// After lowercasing, `"male"`/`"m"` become [`Gender::Male`] and
/// `"female"`/`"f"` become [`Gender::Female`]. Every other input — including
/// the empty string and values with surrounding whitespace — yields
/// [`Gender::Other`]; the conversion never fails.
impl From<&str> for Gender {
    fn from(s: &str) -> Self {
        let normalized = s.to_lowercase();
        let label = normalized.as_str();

        if matches!(label, "male" | "m") {
            return Gender::Male;
        }
        if matches!(label, "female" | "f") {
            return Gender::Female;
        }
        Gender::Other
    }
}

/// Descriptive metadata for a single voice.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VoiceInfo {
    /// Identifier of the voice.
    pub id: String,
    /// Display name of the voice.
    pub name: String,
    /// Gender category of the voice.
    pub gender: Gender,
    /// Optional language label; may be absent from serialized input, in which
    /// case it deserializes to `None`. The expected format (e.g. a locale
    /// code) is not established here.
    #[serde(default)]
    pub language: Option<String>,
    /// Optional free-form description; may be absent from serialized input,
    /// in which case it deserializes to `None`.
    #[serde(default)]
    pub description: Option<String>,
}

impl VoiceInfo {
    /// Builds a voice entry from its identifier, display name and gender.
    ///
    /// `language` and `description` start out as `None`; use
    /// `with_language` / `with_description` to fill them in.
    pub fn new(id: impl Into<String>, name: impl Into<String>, gender: Gender) -> Self {
        let id = id.into();
        let name = name.into();
        Self {
            id,
            name,
            gender,
            language: None,
            description: None,
        }
    }

    /// Returns the same voice with `language` set, replacing any earlier value.
    pub fn with_language(mut self, language: impl Into<String>) -> Self {
        let language: String = language.into();
        self.language = Some(language);
        self
    }

    /// Returns the same voice with `description` set, replacing any earlier value.
    pub fn with_description(mut self, description: impl Into<String>) -> Self {
        let description: String = description.into();
        self.description = Some(description);
        self
    }
}

/// Result alias used by the fallible methods in this file; the error type is
/// `anyhow::Error`.
pub type TtsResult<T> = anyhow::Result<T>;

/// Minimal speech-output interface.
///
/// Implementors must be `Send + Sync`; every method takes `&self`, so any
/// mutable state has to live behind interior mutability.
pub trait TextToSpeech: Send + Sync {
    /// Speaks `text`.
    ///
    /// Whether this call blocks until playback has finished is up to the
    /// implementor and is not determined by this trait.
    ///
    /// # Errors
    /// Returns whatever error the implementor reports.
    fn speak(&self, text: &str) -> TtsResult<()>;

    /// Requests that any ongoing speech be stopped.
    ///
    /// # Errors
    /// Returns whatever error the implementor reports.
    fn stop(&self) -> TtsResult<()>;

    /// Reports whether the engine considers itself to be speaking right now.
    fn is_speaking(&self) -> bool;

    /// Static name identifying the engine implementation.
    fn name(&self) -> &'static str;

    /// Output sample rate; the default implementation returns `24000`
    /// (presumably Hz — confirm against implementors).
    fn sample_rate(&self) -> u32 {
        24000
    }
}

/// Voice catalogue and selection interface.
///
/// Implementors must be `Send + Sync`; every method takes `&self`.
pub trait VoiceProvider: Send + Sync {
    /// Returns the voices this provider offers, as owned values.
    fn list_voices(&self) -> Vec<VoiceInfo>;

    /// Selects the voice identified by `voice`.
    ///
    /// The meaning of the returned `bool` is defined by the implementor
    /// (presumably `true` when the voice was found and applied — confirm).
    ///
    /// # Errors
    /// Returns whatever error the implementor reports.
    fn set_voice(&self, voice: &str) -> TtsResult<bool>;

    /// Returns the currently selected voice.
    ///
    /// # Errors
    /// Returns whatever error the implementor reports.
    fn current_voice(&self) -> TtsResult<VoiceInfo>;

    /// Resolves a user-supplied voice name to one of the listed voices.
    ///
    /// Resolution order:
    /// 1. The first listed voice whose `id` or `name` equals `name`,
    ///    compared case-insensitively via `to_lowercase`.
    /// 2. Otherwise, if `name` parses (via `Gender::from`) to `Male` or
    ///    `Female`, the first listed voice of that gender.
    ///
    /// Returns `None` when neither step finds a voice.
    ///
    /// `list_voices` is called exactly once per resolution, so both steps
    /// work on the same snapshot of the catalogue and the list is not rebuilt
    /// for the gender fallback.
    fn resolve_voice(&self, name: &str) -> Option<VoiceInfo> {
        let wanted = name.to_lowercase();
        let voices = self.list_voices();

        // Step 1: exact (case-insensitive) id or display-name match.
        let exact = voices
            .iter()
            .position(|v| v.id.to_lowercase() == wanted || v.name.to_lowercase() == wanted);
        if let Some(index) = exact {
            // Move the match out of the list instead of cloning it.
            return voices.into_iter().nth(index);
        }

        // Step 2: treat the input as a gender label. `Other` is the parser's
        // catch-all for unrecognised input, so it never selects a voice.
        let gender = Gender::from(name);
        if gender == Gender::Other {
            return None;
        }
        voices.into_iter().find(|v| v.gender == gender)
    }
}

/// Marker trait for engines that implement both [`TextToSpeech`] and
/// [`VoiceProvider`].
///
/// No blanket implementation is visible in this file, so each engine type
/// presumably opts in with an explicit empty `impl` — confirm elsewhere in
/// the crate.
pub trait TtsEngineBase: TextToSpeech + VoiceProvider {}

/// Type-erased handle to a [`TextToSpeech`] engine with optional custom hooks
/// for non-blocking speech and for waiting on completion.
///
/// Cloning copies the `Arc` handles only; clones share the same engine and
/// hooks.
///
/// NOTE(review): the hook trait objects are bounded by `Send` but not `Sync`.
/// `Arc<T>` is only `Send`/`Sync` when `T: Send + Sync`, so this struct is
/// neither `Send` nor `Sync`. Tightening the bounds to `Send + Sync` would
/// fix that but narrows the public builder API — decide with callers.
#[derive(Clone)]
pub struct DynTtsEngine {
    // The wrapped engine, shared behind a trait object.
    inner: Arc<dyn TextToSpeech>,
    // Optional override invoked by `speak_nowait` instead of the default path.
    speak_nowait_fn: Option<Arc<dyn Fn(&str) + Send + 'static>>,
    // Optional override invoked by `wait_until_done` instead of polling.
    wait_until_done_fn: Option<Arc<dyn Fn() + Send + 'static>>,
}

impl DynTtsEngine {
    pub fn new<T: TextToSpeech + 'static>(engine: Arc<T>) -> Self {
        Self {
            inner: engine,
            speak_nowait_fn: None,
            wait_until_done_fn: None,
        }
    }

    pub fn with_speak_nowait<F>(mut self, f: F) -> Self
    where
        F: Fn(&str) + Send + 'static,
    {
        self.speak_nowait_fn = Some(Arc::new(f));
        self
    }

    pub fn with_wait_until_done<F>(mut self, f: F) -> Self
    where
        F: Fn() + Send + 'static,
    {
        self.wait_until_done_fn = Some(Arc::new(f));
        self
    }

    pub fn speak(&self, text: &str) -> TtsResult<()> {
        self.inner.speak(text)
    }

    pub fn stop(&self) -> TtsResult<()> {
        self.inner.stop()
    }

    pub fn is_speaking(&self) -> bool {
        self.inner.is_speaking()
    }

    pub fn name(&self) -> &'static str {
        self.inner.name()
    }

    pub fn sample_rate(&self) -> u32 {
        self.inner.sample_rate()
    }

    pub async fn speak_nowait(&self, text: &str) -> TtsResult<()> {
        if let Some(f) = &self.speak_nowait_fn {
            f(text);
            Ok(())
        } else {
            let text = text.to_string();
            let inner = self.inner.clone();
            tokio::task::spawn_blocking(move || inner.speak(&text))
                .await
                .map_err(|e| anyhow::anyhow!("join error: {}", e))?
        }
    }

    pub async fn wait_until_done(&self) {
        if let Some(f) = &self.wait_until_done_fn {
            f();
        } else {
            while self.inner.is_speaking() {
                tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
            }
        }
    }
}

/// Configuration values for a TTS engine.
///
/// No `#[serde(default)]` is applied, so every field must be present when
/// deserializing.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TtsConfig {
    /// Identifier of the engine to use.
    pub engine: String,
    /// Path to the model file.
    pub model_path: String,
    /// Path to the voices file.
    pub voices_path: String,
    /// Identifier of the voice selected by default.
    pub default_voice: String,
    /// Output sample rate (presumably Hz — confirm against the engine).
    pub sample_rate: u32,
    /// Output volume; the valid range and scale are not established here.
    pub volume: f32,
}

/// Default configuration: the `kitten` engine with its bundled model and
/// voices files, voice `expr-voice-3-m`, a sample rate of 24000 and a volume
/// of 1.0.
impl Default for TtsConfig {
    fn default() -> Self {
        let engine = String::from("kitten");
        let model_path = String::from("assets/models/kitten_tts_nano_v0_8.onnx");
        let voices_path = String::from("assets/models/voices_v0_8.npz");
        let default_voice = String::from("expr-voice-3-m");

        Self {
            engine,
            model_path,
            voices_path,
            default_voice,
            sample_rate: 24_000,
            volume: 1.0,
        }
    }
}

/// A request to synthesize speech for a piece of text.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SynthesisRequest {
    /// Text to synthesize.
    pub text: String,
    /// Voice to use; `None` leaves the choice to the consumer of the request.
    pub voice_id: Option<String>,
    /// Optional speed setting; units and valid range are not established here.
    pub speed: Option<f32>,
    /// Optional pitch setting; units and valid range are not established here.
    pub pitch: Option<f32>,
}

impl SynthesisRequest {
    /// Creates a request for `text` with no voice, speed or pitch override.
    pub fn new(text: impl Into<String>) -> Self {
        Self {
            text: text.into(),
            voice_id: None,
            speed: None,
            pitch: None,
        }
    }

    /// Returns the request with `voice_id` set, replacing any earlier value.
    pub fn with_voice(mut self, voice_id: impl Into<String>) -> Self {
        self.voice_id = Some(voice_id.into());
        self
    }

    /// Returns the request with `speed` set, replacing any earlier value.
    ///
    /// The value is stored as given; no range validation is performed.
    pub fn with_speed(mut self, speed: f32) -> Self {
        self.speed = Some(speed);
        self
    }

    /// Returns the request with `pitch` set, replacing any earlier value.
    ///
    /// Completes the builder so every optional field has a setter. The value
    /// is stored as given; no range validation is performed.
    pub fn with_pitch(mut self, pitch: f32) -> Self {
        self.pitch = Some(pitch);
        self
    }
}

/// Output of a synthesis run.
///
/// Unlike the request and config types in this file, this struct does not
/// derive the serde traits.
#[derive(Clone, Debug)]
pub struct SynthesisResult {
    /// Synthesized audio samples (channel layout and value range are not
    /// established here — presumably mono PCM in [-1.0, 1.0]; confirm).
    pub samples: Vec<f32>,
    /// Sample rate of `samples`.
    pub sample_rate: u32,
    /// Duration of the audio in milliseconds.
    pub duration_ms: u64,
    /// Voice that was used to produce the audio.
    pub voice_used: String,
}

/// Convenience re-exports of the public contract types and traits.
///
/// Every item is re-exported from `crate::contracts`, so a single glob import
/// of this module brings the whole public surface into scope.
pub mod prelude {
    pub use crate::contracts::{
        DynTtsEngine, Gender, SynthesisRequest, SynthesisResult, TextToSpeech, TtsConfig,
        TtsEngineBase, TtsResult, VoiceInfo, VoiceProvider,
    };
}