// typecast_rust/timestamps.rs
1//! Timestamp-aware TTS types and captioning helpers.
2//!
3//! This module exposes [`TTSRequestWithTimestamps`], [`TTSWithTimestampsResponse`],
4//! and alignment segment types. The response type provides [`TTSWithTimestampsResponse::to_srt`]
5//! and [`TTSWithTimestampsResponse::to_vtt`] for generating subtitle files from the
6//! word- or character-level alignment data returned by the API.
7
8use crate::errors::{Result, TypecastError};
9use base64::{engine::general_purpose::STANDARD as B64, Engine};
10use serde::{Deserialize, Serialize};
11use std::fs;
12use std::path::Path;
13
14// ---------------------------------------------------------------------------
15// Public types
16// ---------------------------------------------------------------------------
17
/// A word-level alignment segment returned by the with-timestamps endpoint.
///
/// Used by the captioning helpers on [`TTSWithTimestampsResponse`] to build
/// SRT/VTT cues. Timestamps are in seconds — presumably measured from the
/// start of the generated audio; confirm against the API reference.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
pub struct AlignmentSegmentWord {
    /// The word text.
    pub text: String,
    /// Start time in seconds.
    pub start: f64,
    /// End time in seconds.
    pub end: f64,
}
28
/// A character-level alignment segment returned by the with-timestamps endpoint.
///
/// Same shape as [`AlignmentSegmentWord`], but each segment covers a single
/// character rather than a whole word. Timestamps are in seconds.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
pub struct AlignmentSegmentCharacter {
    /// The character text.
    pub text: String,
    /// Start time in seconds.
    pub start: f64,
    /// End time in seconds.
    pub end: f64,
}
39
/// Request body for `POST /v1/text-to-speech/with-timestamps`.
///
/// Optional fields are skipped during serialization when `None`, so the wire
/// payload only carries what the caller explicitly set.
#[derive(Debug, Clone, Serialize)]
pub struct TTSRequestWithTimestamps {
    /// Voice ID (e.g. `tc_60e5426de8b95f1d3000d7b5`).
    pub voice_id: String,
    /// Text to synthesize (max 2000 characters).
    pub text: String,
    /// TTS model to use (e.g. `"ssfm-v30"`).
    pub model: crate::models::TTSModel,
    /// Language code (ISO 639-3). Auto-detected when omitted.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Emotion/style settings (accepts any serializable value that matches the
    /// API's `prompt` field — use [`crate::models::TTSPrompt`] or raw JSON).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<serde_json::Value>,
    /// Audio output settings (accepts any serializable value that matches the
    /// API's `output` field — use [`crate::models::Output`] serialized to JSON).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output: Option<serde_json::Value>,
    /// Random seed for reproducible results.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<u32>,
}
64
65impl TTSRequestWithTimestamps {
66    /// Create a new request with the required fields.
67    pub fn new(
68        voice_id: impl Into<String>,
69        text: impl Into<String>,
70        model: crate::models::TTSModel,
71    ) -> Self {
72        Self {
73            voice_id: voice_id.into(),
74            text: text.into(),
75            model,
76            language: None,
77            prompt: None,
78            output: None,
79            seed: None,
80        }
81    }
82
83    /// Set the language code (ISO 639-3).
84    pub fn language(mut self, language: impl Into<String>) -> Self {
85        self.language = Some(language.into());
86        self
87    }
88
89    /// Set the prompt field as a raw JSON value.
90    pub fn prompt(mut self, prompt: serde_json::Value) -> Self {
91        self.prompt = Some(prompt);
92        self
93    }
94
95    /// Set the output field as a raw JSON value.
96    pub fn output(mut self, output: serde_json::Value) -> Self {
97        self.output = Some(output);
98        self
99    }
100
101    /// Set the random seed.
102    pub fn seed(mut self, seed: u32) -> Self {
103        self.seed = Some(seed);
104        self
105    }
106}
107
/// Response from `POST /v1/text-to-speech/with-timestamps`.
///
/// The `audio` field contains Base64-encoded audio data. Use
/// [`audio_bytes`][TTSWithTimestampsResponse::audio_bytes] to decode it, or
/// [`save_audio`][TTSWithTimestampsResponse::save_audio] to write directly to a file.
///
/// Call [`to_srt`][TTSWithTimestampsResponse::to_srt] or
/// [`to_vtt`][TTSWithTimestampsResponse::to_vtt] to generate subtitle output
/// from the alignment data.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TTSWithTimestampsResponse {
    /// Base64-encoded audio bytes.
    pub audio: String,
    /// Audio container format (e.g. `"wav"` or `"mp3"`).
    pub audio_format: String,
    /// Total audio duration in seconds.
    pub audio_duration: f64,
    /// Word-level alignment segments (present when `granularity` is `"word"` or
    /// when both granularities are returned).
    pub words: Option<Vec<AlignmentSegmentWord>>,
    /// Character-level alignment segments (present when `granularity` is
    /// `"char"` or when both granularities are returned).
    pub characters: Option<Vec<AlignmentSegmentCharacter>>,
}
132
133impl TTSWithTimestampsResponse {
134    /// Decode the Base64-encoded audio field into raw bytes.
135    pub fn audio_bytes(&self) -> Result<Vec<u8>> {
136        B64.decode(&self.audio)
137            .map_err(|e| TypecastError::DecodeError(e.to_string()))
138    }
139
140    /// Decode the audio and write it to `path`.
141    pub fn save_audio<P: AsRef<Path>>(&self, path: P) -> Result<()> {
142        let bytes = self.audio_bytes()?;
143        fs::write(path, bytes).map_err(|e| TypecastError::IoError(e.to_string()))
144    }
145
146    /// Generate an SRT subtitle string from the alignment data.
147    ///
148    /// Word segments are preferred when there are at least two words; otherwise
149    /// character segments are used.
150    pub fn to_srt(&self) -> Result<String> {
151        format_captions(self, true)
152    }
153
154    /// Generate a WebVTT subtitle string from the alignment data.
155    ///
156    /// Word segments are preferred when there are at least two words; otherwise
157    /// character segments are used.
158    pub fn to_vtt(&self) -> Result<String> {
159        format_captions(self, false)
160    }
161}
162
163// ---------------------------------------------------------------------------
164// Internal captioning helpers
165// ---------------------------------------------------------------------------
166
// Maximum duration of a single caption cue, in seconds.
const MAX_CAPTION_SECONDS: f64 = 7.0;
// Maximum rendered length of a cue, counted in `char`s (not bytes).
const MAX_CAPTION_CHARS: usize = 42;
// Punctuation treated as sentence-ending: ASCII . ? ! plus the full-width
// CJK equivalents 。 ？ ！ (U+3002, U+FF1F, U+FF01).
const SENTENCE_TERMINATORS: &[&str] = &[".", "?", "!", "\u{3002}", "\u{ff1f}", "\u{ff01}"];

// A flat alignment unit (one word or one character), normalized from the
// public word/character segment types so the grouping code is granularity-
// agnostic.
struct Segment {
    text: String,
    start: f64,
    end: f64,
}

// A grouped caption cue: `text` is displayed from `start` to `end` seconds.
struct Cue {
    text: String,
    start: f64,
    end: f64,
}
182
183/// Choose which set of segments to use for captioning, and whether word-joining
184/// (space-separated) mode applies.
185///
186/// Priority: words (≥ 2) → characters (non-empty) → words (exactly 1) → error.
187fn pick_segments(
188    resp: &TTSWithTimestampsResponse,
189) -> Result<(Vec<Segment>, bool)> {
190    let word_segs = |words: &[crate::timestamps::AlignmentSegmentWord]| -> Vec<Segment> {
191        words
192            .iter()
193            .map(|w| Segment {
194                text: w.text.clone(),
195                start: w.start,
196                end: w.end,
197            })
198            .collect()
199    };
200    let char_segs = |chars: &[crate::timestamps::AlignmentSegmentCharacter]| -> Vec<Segment> {
201        chars
202            .iter()
203            .map(|c| Segment {
204                text: c.text.clone(),
205                start: c.start,
206                end: c.end,
207            })
208            .collect()
209    };
210
211    // Prefer words when there are at least 2 (single-word edge case falls through).
212    let multi_words = resp.words.as_deref().filter(|w| w.len() >= 2);
213    // Fall back to characters.
214    let chars = resp.characters.as_deref().filter(|c| !c.is_empty());
215    // Single-word fallback.
216    let single_word = resp.words.as_deref().filter(|w| w.len() == 1);
217
218    if let Some(words) = multi_words {
219        Ok((word_segs(words), true))
220    } else if let Some(c) = chars {
221        Ok((char_segs(c), false))
222    } else if let Some(words) = single_word {
223        Ok((word_segs(words), true))
224    } else {
225        Err(TypecastError::CaptioningError(
226            "no alignment segments to caption from".into(),
227        ))
228    }
229}
230
/// Merge cue fragments into display text: space-separated in word mode,
/// directly concatenated in character mode, with surrounding whitespace
/// trimmed from the final string.
fn join_parts(parts: &[String], word_mode: bool) -> String {
    let joined = if word_mode {
        parts.join(" ")
    } else {
        parts.concat()
    };
    joined.trim().to_owned()
}
237
238/// Return `true` if `text` ends with a sentence-terminating punctuation mark.
239fn ends_in_sentence(text: &str) -> bool {
240    let trimmed = text.trim_end();
241    SENTENCE_TERMINATORS.iter().any(|t| trimmed.ends_with(t))
242}
243
244/// Group flat segments into captioning cues obeying the max-duration and
245/// max-char-count constraints and breaking on sentence-terminating punctuation.
246///
247/// TODO(TASK-12430-followup): expose max_seconds / max_chars override to match Python/JS API surface. Default 7.0s / 42 chars (BBC/Netflix guideline).
248/// TODO(TASK-12430-followup): warn or error when alignment array contains majority-empty text segments — server contract should never produce these but defense-in-depth is desirable.
249fn group_into_cues(segs: &[Segment], word_mode: bool) -> Vec<Cue> {
250    let mut cues: Vec<Cue> = Vec::new();
251    let mut parts: Vec<String> = Vec::new();
252    // Invariant: cur_start and last_end are always set whenever parts is non-empty.
253    // Using 0.0 as default sentinels; they are only read when parts is non-empty.
254    let mut cur_start: f64 = 0.0;
255    let mut last_end: f64 = 0.0;
256
257    /// Appends a cue only when its text is non-empty.
258    fn emit(cues: &mut Vec<Cue>, text: String, start: f64, end: f64) {
259        if !text.is_empty() {
260            cues.push(Cue { text, start, end });
261        }
262    }
263
264    for seg in segs {
265        // If we already have content, check whether adding this segment would
266        // violate a hard limit.
267        if !parts.is_empty() {
268            let mut tentative = parts.clone();
269            tentative.push(seg.text.clone());
270            let would_be = join_parts(&tentative, word_mode);
271            let too_long_secs = (seg.end - cur_start) > MAX_CAPTION_SECONDS;
272            let too_long_chars = would_be.chars().count() > MAX_CAPTION_CHARS;
273            if too_long_secs || too_long_chars {
274                // Flush the current cue before starting a new one.
275                emit(&mut cues, join_parts(&parts, word_mode), cur_start, last_end);
276                parts.clear();
277            }
278        }
279
280        // Record the start of a new cue.
281        if parts.is_empty() {
282            cur_start = seg.start;
283        }
284        parts.push(seg.text.clone());
285        last_end = seg.end;
286
287        // Break on sentence-terminating punctuation.
288        if ends_in_sentence(&seg.text) {
289            emit(&mut cues, join_parts(&parts, word_mode), cur_start, seg.end);
290            parts.clear();
291        }
292    }
293
294    // Flush any remaining parts.
295    if !parts.is_empty() {
296        emit(&mut cues, join_parts(&parts, word_mode), cur_start, last_end);
297    }
298
299    cues
300}
301
/// Format `seconds` as `HH:MM:SS,mmm` (SRT comma separator).
///
/// The value is rounded to the nearest millisecond before being split into
/// hour/minute/second/millisecond components.
fn format_srt_time(seconds: f64) -> String {
    let ms_total = (seconds * 1000.0).round() as i64;
    let secs = ms_total / 1000;
    format!(
        "{:02}:{:02}:{:02},{:03}",
        secs / 3600,
        (secs / 60) % 60,
        secs % 60,
        ms_total % 1000
    )
}
313
314/// Format `seconds` as `HH:MM:SS.mmm` (VTT dot separator).
315fn format_vtt_time(seconds: f64) -> String {
316    format_srt_time(seconds).replace(',', ".")
317}
318
319/// Core captioning formatter.  When `srt` is `true` emits SRT; otherwise VTT.
320fn format_captions(resp: &TTSWithTimestampsResponse, srt: bool) -> Result<String> {
321    let (segs, word_mode) = pick_segments(resp)?;
322    let cues = group_into_cues(&segs, word_mode);
323    if cues.is_empty() {
324        return Err(TypecastError::CaptioningError(
325            "no alignment segments to caption from".into(),
326        ));
327    }
328
329    let mut out = String::new();
330    if !srt {
331        out.push_str("WEBVTT\n\n");
332    }
333    for (i, cue) in cues.iter().enumerate() {
334        if srt {
335            out.push_str(&format!("{}\n", i + 1));
336        }
337        let (s, e) = if srt {
338            (format_srt_time(cue.start), format_srt_time(cue.end))
339        } else {
340            (format_vtt_time(cue.start), format_vtt_time(cue.end))
341        };
342        out.push_str(&format!("{} --> {}\n", s, e));
343        out.push_str(&cue.text);
344        out.push_str("\n\n");
345    }
346    Ok(out)
347}