car-inference 0.14.0

Local model inference for CAR — Candle backend with Qwen3 models
Documentation
use serde::{Deserialize, Serialize};

/// Fallback value for [`SynthesizeRequest::format`].
///
/// Referenced by name from the field's `#[serde(default = "...")]`
/// attribute and called by the hand-written `Default` impl, so a missing
/// `format` key and `SynthesizeRequest::default()` agree on `"wav"`.
fn default_audio_format() -> String {
    String::from("wav")
}

/// Field names of the Qwen3-TTS-specific advanced controls on
/// [`SynthesizeRequest`]. Exposed for error messages and capability
/// reporting so backends can state *which* field they don't support
/// rather than producing a generic "unsupported" error.
///
/// The entries are the verbose Rust field names (not the terse upstream
/// aliases) and are listed in the same order the fields are declared on
/// [`SynthesizeRequest`], which is also the order in which
/// [`SynthesizeRequest::requested_advanced_controls`] reports them.
/// Keep this list in sync when adding or renaming an advanced control.
pub const QWEN3_TTS_CONTROL_FIELDS: &[&str] = &[
    "reference_audio_path",
    "reference_text",
    "voice_instruction",
];

/// A text-to-speech synthesis request.
///
/// `text`, `model`, `voice`, `language`, `speed`, `output_path`, and
/// `format` are the generic controls shared across TTS backends. Some of
/// them are best-effort: `speed`, for example, only applies when the
/// backend supports it.
///
/// `reference_audio_path`, `reference_text`, and `voice_instruction`
/// are Qwen3-TTS-specific advanced controls (upstream: <https://github.com/QwenLM/Qwen3-TTS>).
/// On a Qwen3-TTS model they drive zero-shot voice cloning and
/// natural-language voice design. On other backends (Kokoro,
/// ElevenLabs) they are not supported — see
/// [`strict_capabilities`](SynthesizeRequest::strict_capabilities)
/// for the rejection policy.
///
/// Each Qwen3 field accepts both the verbose Rust name and the terse
/// upstream Qwen key (`ref_audio` / `ref_text` / `instruct`) as an
/// inbound JSON alias, so payloads copied from Qwen's own docs
/// deserialize without rewriting. The aliases are deserialize-only:
/// serialized output always uses the verbose field names.
///
/// Only `text` is required on input; every other field has a serde
/// default. Unset Qwen3 controls and a `false` `strict_capabilities`
/// are omitted from serialized output, so the legacy JSON shape does
/// not grow new keys for callers that don't use them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SynthesizeRequest {
    /// Text to synthesize into speech.
    pub text: String,
    /// Optional model override.
    #[serde(default)]
    pub model: Option<String>,
    /// Optional voice identifier or preset name.
    #[serde(default)]
    pub voice: Option<String>,
    /// Optional language or language code, depending on backend.
    #[serde(default)]
    pub language: Option<String>,
    /// Playback speed multiplier when supported by the backend.
    #[serde(default)]
    pub speed: Option<f32>,
    /// Optional output path. If omitted, a temp file is created.
    #[serde(default)]
    pub output_path: Option<String>,
    /// Desired output format. Defaults to `"wav"` when the key is absent.
    #[serde(default = "default_audio_format")]
    pub format: String,
    /// Path to a short reference audio clip that the model clones the
    /// speaker characteristics from. Qwen3-TTS-Base zero-shot cloning
    /// control — pair with
    /// [`reference_text`](SynthesizeRequest::reference_text) for best
    /// quality. Ignored by backends without a voice-cloning mode unless
    /// [`strict_capabilities`](SynthesizeRequest::strict_capabilities)
    /// is set.
    ///
    /// Also accepted on input under the upstream key `ref_audio`.
    #[serde(default, alias = "ref_audio", skip_serializing_if = "Option::is_none")]
    pub reference_audio_path: Option<String>,
    /// Transcript of the reference-audio clip. Qwen3-TTS uses this to
    /// align prosody during cloning. Only meaningful when
    /// [`reference_audio_path`](SynthesizeRequest::reference_audio_path)
    /// is also set.
    ///
    /// Also accepted on input under the upstream key `ref_text`.
    #[serde(default, alias = "ref_text", skip_serializing_if = "Option::is_none")]
    pub reference_text: Option<String>,
    /// Free-form natural-language description of the target voice
    /// (Qwen3-TTS-VoiceDesign). Example: "a warm female voice with a
    /// slight Southern accent, speaking slowly and clearly". Ignored
    /// by backends without a voice-design mode unless
    /// [`strict_capabilities`](SynthesizeRequest::strict_capabilities)
    /// is set.
    ///
    /// Also accepted on input under the upstream key `instruct`.
    #[serde(default, alias = "instruct", skip_serializing_if = "Option::is_none")]
    pub voice_instruction: Option<String>,
    /// If true, the runtime returns an error when any of the advanced
    /// controls above is set but the chosen backend can't honor it.
    /// If false (default), the runtime logs a warning and proceeds
    /// with whatever the backend *does* support.
    ///
    /// Production callers that depend on cloning actually happening
    /// should set this; casual callers who may route across Kokoro,
    /// Qwen3-TTS, and ElevenLabs interchangeably should leave it off.
    // serde passes `&bool` to the skip predicate; `Not::not` on `&false`
    // yields `true`, so the key is only written when the flag is set.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub strict_capabilities: bool,
}

impl Default for SynthesizeRequest {
    /// Build an empty request: no text, no optional controls, strict
    /// capability checking off, and the output format set to the same
    /// fallback serde uses for a missing `format` key.
    ///
    /// Written by hand rather than derived because a derived impl would
    /// leave `format` as an empty string instead of `"wav"`.
    fn default() -> Self {
        let text = String::new();
        let format = default_audio_format();

        Self {
            // Generic controls.
            text,
            format,
            model: None,
            voice: None,
            language: None,
            speed: None,
            output_path: None,
            // Qwen3-TTS advanced controls.
            reference_audio_path: None,
            reference_text: None,
            voice_instruction: None,
            // Capability policy.
            strict_capabilities: false,
        }
    }
}

impl SynthesizeRequest {
    /// List the Qwen3-TTS-specific fields the caller populated.
    ///
    /// A field counts as requested when it is `Some`, regardless of the
    /// string's contents (an empty string still counts). Names are the
    /// verbose Rust field names and are returned in declaration order:
    /// `reference_audio_path`, `reference_text`, `voice_instruction`.
    /// The result is empty when none of them is set.
    ///
    /// Backends use this to decide whether they can honor the request
    /// and to name the exact unsupported field in error messages.
    pub fn requested_advanced_controls(&self) -> Vec<&'static str> {
        // One row per advanced control: (reported name, was it supplied?).
        let controls: [(&'static str, bool); 3] = [
            ("reference_audio_path", self.reference_audio_path.is_some()),
            ("reference_text", self.reference_text.is_some()),
            ("voice_instruction", self.voice_instruction.is_some()),
        ];

        controls
            .iter()
            .filter(|entry| entry.1)
            .map(|entry| entry.0)
            .collect()
    }
}

/// Text-to-speech result.
///
/// `audio_path` and `media_type` are always present. The two `*_used`
/// fields are optional: they default to `None` when absent on input and
/// are omitted from serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SynthesizeResult {
    /// Location of the synthesized audio.
    /// NOTE(review): presumably the file the backend wrote — either the
    /// request's `output_path` or a generated temp file — confirm against
    /// the backend implementations.
    pub audio_path: String,
    /// Media type of the produced audio.
    /// NOTE(review): presumably a MIME type such as `audio/wav`; the exact
    /// values are set by the backends, not in this file — confirm.
    pub media_type: String,
    /// Model that produced the audio, when the backend reports one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_used: Option<String>,
    /// Voice that was used, when the backend reports one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub voice_used: Option<String>,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Clients that predate the Qwen3 controls send only the generic keys.
    /// Every newer field carries `serde(default)`, so such payloads must
    /// parse and leave the advanced controls unset.
    #[test]
    fn legacy_request_payload_still_deserializes() {
        let parsed: SynthesizeRequest =
            serde_json::from_str(r#"{"text":"hello","voice":"Chelsie"}"#).unwrap();

        assert_eq!(parsed.text, "hello");
        assert_eq!(parsed.voice.as_deref(), Some("Chelsie"));

        let SynthesizeRequest {
            reference_audio_path,
            reference_text,
            voice_instruction,
            strict_capabilities,
            ..
        } = parsed;
        assert!(reference_audio_path.is_none());
        assert!(reference_text.is_none());
        assert!(voice_instruction.is_none());
        assert!(!strict_capabilities);
    }

    /// `skip_serializing_if` keeps the serialized shape identical to the
    /// legacy one when none of the newer fields are used.
    #[test]
    fn unset_qwen3_fields_are_omitted_from_output_json() {
        let mut req = SynthesizeRequest::default();
        req.text = "hi".into();

        let json = serde_json::to_string(&req).unwrap();

        for absent_key in [
            "reference_audio_path",
            "reference_text",
            "voice_instruction",
            "strict_capabilities",
        ] {
            assert!(!json.contains(absent_key));
        }
    }

    /// Populated advanced controls survive a serialize → deserialize trip.
    #[test]
    fn set_qwen3_fields_roundtrip() {
        let original = SynthesizeRequest {
            text: "hi".into(),
            model: Some("Qwen3-TTS-12Hz-1.7B-Base-5bit".into()),
            language: Some("en".into()),
            reference_audio_path: Some("/tmp/ref.wav".into()),
            reference_text: Some("the reference sentence".into()),
            voice_instruction: Some("a warm male voice, medium pace".into()),
            strict_capabilities: true,
            ..SynthesizeRequest::default()
        };

        let wire = serde_json::to_string(&original).unwrap();
        let SynthesizeRequest {
            reference_audio_path,
            reference_text,
            voice_instruction,
            strict_capabilities,
            ..
        } = serde_json::from_str(&wire).unwrap();

        assert_eq!(reference_audio_path.as_deref(), Some("/tmp/ref.wav"));
        assert_eq!(reference_text.as_deref(), Some("the reference sentence"));
        assert_eq!(
            voice_instruction.as_deref(),
            Some("a warm male voice, medium pace")
        );
        assert!(strict_capabilities);
    }

    /// Qwen3-TTS documentation uses terse key names; `serde(alias)` maps
    /// them onto the verbose Rust fields.
    #[test]
    fn upstream_qwen_terse_keys_are_accepted_as_aliases() {
        let json = r#"{
            "text": "hi",
            "ref_audio": "/tmp/ref.wav",
            "ref_text": "sentence",
            "instruct": "warm female voice"
        }"#;

        let parsed = serde_json::from_str::<SynthesizeRequest>(json).unwrap();

        assert_eq!(parsed.reference_audio_path.as_deref(), Some("/tmp/ref.wav"));
        assert_eq!(parsed.reference_text.as_deref(), Some("sentence"));
        assert_eq!(
            parsed.voice_instruction.as_deref(),
            Some("warm female voice")
        );
    }

    /// Only the controls that are actually `Some` are reported.
    #[test]
    fn requested_advanced_controls_reports_only_set_fields() {
        let reported = SynthesizeRequest {
            text: "hi".into(),
            reference_audio_path: Some("/tmp/ref.wav".into()),
            voice_instruction: Some("warm".into()),
            ..SynthesizeRequest::default()
        }
        .requested_advanced_controls();

        assert_eq!(reported.len(), 2);
        for expected in ["reference_audio_path", "voice_instruction"] {
            assert!(reported.contains(&expected));
        }
        assert!(!reported.contains(&"reference_text"));
    }

    /// A default request has no advanced controls set.
    #[test]
    fn empty_request_reports_no_advanced_controls() {
        let reported = SynthesizeRequest::default().requested_advanced_controls();
        assert!(reported.is_empty());
    }
}