// car-inference 0.15.0

use serde::{Deserialize, Serialize};

/// A speech-to-text transcription request.
///
/// Only `audio_path` has no serde default; every other field is marked
/// `#[serde(default)]`, so it may be omitted from the serialized form and
/// falls back to `None` (for the optional hints) or `false` (for
/// `timestamps`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscribeRequest {
    /// Path to an audio file on disk.
    pub audio_path: String,
    /// Optional model override. `None` when omitted.
    #[serde(default)]
    pub model: Option<String>,
    /// Optional spoken language hint. `None` when omitted.
    // NOTE(review): the expected format of the hint (e.g. ISO code vs. name)
    // is not visible in this file — confirm against the backends.
    #[serde(default)]
    pub language: Option<String>,
    /// Optional prompt/context hint to bias transcription. `None` when omitted.
    #[serde(default)]
    pub prompt: Option<String>,
    /// Return verbose output with per-word timing when the backend
    /// supports it. When false, the result's `words` field is empty
    /// and only `text` is populated. Defaults to `false` when omitted.
    #[serde(default)]
    pub timestamps: bool,
}

/// A timed word span — start/end in seconds from the beginning of the
/// audio clip, plus the decoded text for that span. Parakeet-TDT emits
/// one of these per word (tokens grouped by sentencepiece `▁` markers).
///
/// Field names match the convention used by Whisper, ElevenLabs,
/// Deepgram, AssemblyAI, and NeMo CTM output (`start`, `end` in seconds).
///
/// None of the fields carries a serde default, so all three are expected
/// in the serialized form.
// NOTE(review): nothing in this type enforces `start <= end`; presumably
// the producing backend guarantees it — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscribedWord {
    /// Start time of this word in seconds.
    pub start: f32,
    /// End time of this word in seconds.
    pub end: f32,
    /// The decoded word text (no leading space).
    pub text: String,
}

/// Speech-to-text result.
///
/// `text` is always serialized; the remaining fields are skipped on
/// serialization when they are `None`/empty and default to `None`/empty
/// when absent on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscribeResult {
    /// The transcribed text.
    pub text: String,
    /// Name of the model that produced this result, if reported.
    /// Omitted from the serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_used: Option<String>,
    /// Language associated with the transcript, if reported.
    /// Omitted from the serialized output when `None`.
    // NOTE(review): whether this is detected by the backend or echoed from
    // the request's language hint is not visible here — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Per-word timing spans. Populated when `TranscribeRequest::timestamps`
    /// is true and the backend supports timing (Parakeet-TDT does natively;
    /// ElevenLabs Scribe provides them when verbose output is enabled).
    /// Empty vec when timing wasn't requested OR the backend can't produce it.
    /// An empty vec is omitted from the serialized output.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub words: Vec<TranscribedWord>,
}

impl TranscribeResult {
    /// Build a text-only result (no timing). Used by backends that
    /// don't produce word-level spans or when the caller didn't ask.
    pub fn text_only(text: String, model_used: Option<String>, language: Option<String>) -> Self {
        Self {
            text,
            model_used,
            language,
            words: Vec::new(),
        }
    }
}