rustpbx 0.3.19

A SIP PBX implementation in Rust
Documentation
use std::path::{Path, PathBuf};

use crate::addons::ivr_editor::settings::IvrEditorSettings;

/// Common Edge TTS voices.
///
/// Each entry is a `(voice_id, label)` pair: the first element is a voice
/// identifier in the same form that the synthesis functions pass to
/// `edge-cli --voice`, and the second is an English description of the
/// language/locale and voice name (presumably for display in a UI; verify
/// against callers).
pub const VOICES: &[(&str, &str)] = &[
    ("en-US-AriaNeural", "English (US) - Aria"),
    ("en-US-GuyNeural", "English (US) - Guy"),
    ("en-US-JennyNeural", "English (US) - Jenny"),
    ("en-GB-SoniaNeural", "English (UK) - Sonia"),
    ("zh-CN-XiaoxiaoNeural", "Chinese (CN) - Xiaoxiao"),
    ("zh-CN-YunxiNeural", "Chinese (CN) - Yunxi"),
    ("ja-JP-NanamiNeural", "Japanese - Nanami"),
    ("ko-KR-SunHiNeural", "Korean - SunHi"),
    ("es-ES-ElviraNeural", "Spanish (ES) - Elvira"),
    ("fr-FR-DeniseNeural", "French - Denise"),
    ("de-DE-KatjaNeural", "German - Katja"),
];

/// Synthesise speech using the `edge-cli` CLI tool.
///
/// The audio is written to `storage/sounds/ivr/<name>.mp3`, where `<name>` is
/// `filename` after passing through `sanitize_filename`. The directory is
/// created if it does not exist. An empty `voice` selects `en-US-AriaNeural`.
///
/// Returns the path to the generated audio file relative to the working directory.
///
/// # Errors
///
/// Fails when `text` is empty or whitespace-only, when the output directory
/// cannot be created, when `edge-cli` cannot be started, or when it exits with
/// a non-zero status (its stderr is included in the error message).
pub async fn synthesize(text: &str, voice: &str, filename: &str) -> Result<PathBuf, anyhow::Error> {
    if text.trim().is_empty() {
        anyhow::bail!("text must not be empty");
    }

    // Ensure output directory exists
    let dir = Path::new("storage/sounds/ivr");
    tokio::fs::create_dir_all(dir).await?;

    let out_path = dir.join(format!("{}.mp3", sanitize_filename(filename)));

    let voice = if voice.is_empty() {
        "en-US-AriaNeural"
    } else {
        voice
    };

    let output = tokio::process::Command::new("edge-cli")
        .arg("speak")
        .arg("--text")
        .arg(text)
        .arg("--voice")
        .arg(voice)
        .arg("--write-media")
        // Pass the path as an OS string so the file the tool writes is always
        // the one returned to the caller; no lossy conversion, no fallback name.
        .arg(&out_path)
        .output()
        .await
        .map_err(|e| anyhow::anyhow!("failed to run edge-cli (is it installed?): {}", e))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("edge-cli failed: {}", stderr);
    }

    Ok(out_path)
}

/// Reduce a caller-supplied file name to a safe file stem.
///
/// Only the final path component is considered and its extension is dropped,
/// so directory parts such as `../` never reach the output. Every character
/// that is not alphanumeric (Unicode-aware), `-`, or `_` is replaced by `_`.
///
/// An empty result (which happens for an empty `name`) is replaced by
/// `"output"` so callers never build a file named just `.mp3`.
fn sanitize_filename(name: &str) -> String {
    // `file_stem` is `None` for inputs such as "" or ".."; fall back to the raw
    // input, which the character filter below still neutralises.
    let stem = Path::new(name)
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or(name);

    let cleaned: String = stem
        .chars()
        .map(|c| {
            if c.is_alphanumeric() || matches!(c, '-' | '_') {
                c
            } else {
                '_'
            }
        })
        .collect();

    if cleaned.is_empty() {
        // Avoid producing an extension-only (hidden) file name downstream.
        "output".to_owned()
    } else {
        cleaned
    }
}

/// Derive the cache key for a voice/text pair.
///
/// The key is the lowercase hex encoding of the SHA-256 digest of the bytes
/// `voice`, a single `|` separator, then `text`.
fn generate_cache_key(voice: &str, text: &str) -> String {
    use sha2::{Digest, Sha256};

    // Feed the pieces to the hasher one after another; the digest equals that
    // of the concatenated "voice|text" string.
    let mut hasher = Sha256::new();
    hasher.update(voice.as_bytes());
    hasher.update(b"|");
    hasher.update(text.as_bytes());
    hex::encode(hasher.finalize())
}

/// Build the cache file location for `text` spoken with `voice`.
///
/// Loads the IVR editor settings, then joins their `tts_cache_dir` with
/// `<cache key>.mp3`, where the cache key comes from `generate_cache_key`.
/// This only builds the path and does not touch the cache file or directory.
fn get_cache_path(voice: &str, text: &str) -> PathBuf {
    let settings = IvrEditorSettings::load();
    let cache_dir = Path::new(&settings.tts_cache_dir);
    let file_name = format!("{}.mp3", generate_cache_key(voice, text));
    cache_dir.join(file_name)
}

/// Result of `synthesize_with_cache`: path and whether it was cached.
#[derive(Debug)]
pub struct SynthesizeResult {
    /// Location of the audio file inside the TTS cache directory.
    pub path: PathBuf,
    /// `true` when the file already existed and was reused; `false` when it
    /// was generated by this call.
    pub cached: bool,
}

/// Synthesize speech with caching.
///
/// - The cache file is `{tts_cache_dir}/{key}.mp3`, where `key` is the hex
///   SHA-256 of `voice|text` (see `generate_cache_key`)
/// - If that file exists, returns its path directly with `cached = true`
/// - If not, calls `edge-cli` to generate the audio, then returns the path
///   with `cached = false`
///
/// An empty `voice` selects `en-US-AriaNeural`.
///
/// The returned path is `tts_cache_dir` joined with the file name, so it is
/// relative to the working directory when the configured directory is relative.
///
/// # Errors
///
/// Fails when `text` is empty or whitespace-only, when the cache directory
/// cannot be created, when `edge-cli` cannot be started, or when it exits with
/// a non-zero status (its stderr is included in the error message).
pub async fn synthesize_with_cache(
    text: &str,
    voice: &str,
) -> Result<SynthesizeResult, anyhow::Error> {
    if text.trim().is_empty() {
        anyhow::bail!("text must not be empty");
    }

    let voice = if voice.is_empty() {
        "en-US-AriaNeural"
    } else {
        voice
    };

    let cache_path = get_cache_path(voice, text);

    // Existence check via the async filesystem API so the executor thread is
    // not blocked; like `Path::exists`, any metadata error counts as "absent".
    if tokio::fs::metadata(&cache_path).await.is_ok() {
        return Ok(SynthesizeResult {
            path: cache_path,
            cached: true,
        });
    }

    // Create the directory the file will actually live in. Deriving it from
    // the path (rather than loading settings a second time) keeps the two in
    // agreement.
    if let Some(cache_dir) = cache_path.parent() {
        tokio::fs::create_dir_all(cache_dir).await?;
    }

    // Generate the audio file
    let output = tokio::process::Command::new("edge-cli")
        .arg("speak")
        .arg("--text")
        .arg(text)
        .arg("--voice")
        .arg(voice)
        .arg("--write-media")
        // Pass the path as an OS string: a non-UTF-8 cache directory must not
        // silently redirect the output to some other file.
        .arg(&cache_path)
        .output()
        .await
        .map_err(|e| anyhow::anyhow!("failed to run edge-cli (is it installed?): {}", e))?;

    if !output.status.success() {
        // Best-effort cleanup: a partial file left behind would be treated as
        // a valid cache hit on the next call. The removal result is ignored
        // because the file may never have been created.
        let _ = tokio::fs::remove_file(&cache_path).await;
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("edge-cli failed: {}", stderr);
    }

    Ok(SynthesizeResult {
        path: cache_path,
        cached: false,
    })
}