1use crate::errors::{Result, TypecastError};
9use base64::{engine::general_purpose::STANDARD as B64, Engine};
10use serde::{Deserialize, Serialize};
11use std::fs;
12use std::path::Path;
13
/// Timing information for one word-level alignment segment.
///
/// The caption builder in this file copies these fields into its internal
/// segments and treats `start`/`end` as seconds.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
pub struct AlignmentSegmentWord {
    /// Text of the segment (a word, going by the type name).
    pub text: String,
    /// Start time of the segment; handled as seconds by the caption code.
    pub start: f64,
    /// End time of the segment; handled as seconds by the caption code.
    pub end: f64,
}
28
/// Timing information for one character-level alignment segment.
///
/// The caption builder in this file copies these fields into its internal
/// segments and treats `start`/`end` as seconds.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
pub struct AlignmentSegmentCharacter {
    /// Text of the segment (a single character, going by the type name).
    pub text: String,
    /// Start time of the segment; handled as seconds by the caption code.
    pub start: f64,
    /// End time of the segment; handled as seconds by the caption code.
    pub end: f64,
}
39
/// Serializable request body for a text-to-speech call that also returns
/// alignment timestamps.
///
/// Every `Option` field is left out of the serialized output while it is
/// `None` (see the `skip_serializing_if` attributes).
#[derive(Debug, Clone, Serialize)]
pub struct TTSRequestWithTimestamps {
    /// Identifier of the voice to use.
    pub voice_id: String,
    /// Text to synthesize.
    pub text: String,
    /// Model selection; the type is declared in `crate::models`.
    pub model: crate::models::TTSModel,
    /// Optional language value; omitted from the payload when `None`.
    // NOTE(review): expected format (e.g. language code) is not visible here — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Optional free-form JSON prompt settings; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<serde_json::Value>,
    /// Optional free-form JSON output settings; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output: Option<serde_json::Value>,
    /// Optional seed; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<u32>,
}
64
65impl TTSRequestWithTimestamps {
66 pub fn new(
68 voice_id: impl Into<String>,
69 text: impl Into<String>,
70 model: crate::models::TTSModel,
71 ) -> Self {
72 Self {
73 voice_id: voice_id.into(),
74 text: text.into(),
75 model,
76 language: None,
77 prompt: None,
78 output: None,
79 seed: None,
80 }
81 }
82
83 pub fn language(mut self, language: impl Into<String>) -> Self {
85 self.language = Some(language.into());
86 self
87 }
88
89 pub fn prompt(mut self, prompt: serde_json::Value) -> Self {
91 self.prompt = Some(prompt);
92 self
93 }
94
95 pub fn output(mut self, output: serde_json::Value) -> Self {
97 self.output = Some(output);
98 self
99 }
100
101 pub fn seed(mut self, seed: u32) -> Self {
103 self.seed = Some(seed);
104 self
105 }
106}
107
/// Response of a text-to-speech call that includes alignment timestamps.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TTSWithTimestampsResponse {
    /// Audio payload as standard base64 text; `audio_bytes` decodes it.
    pub audio: String,
    /// Format label of the audio payload.
    // NOTE(review): the set of possible values is not visible here — confirm.
    pub audio_format: String,
    /// Duration of the audio.
    // NOTE(review): presumably seconds, like the alignment times — confirm.
    pub audio_duration: f64,
    /// Word-level alignment segments, when present.
    pub words: Option<Vec<AlignmentSegmentWord>>,
    /// Character-level alignment segments, when present.
    pub characters: Option<Vec<AlignmentSegmentCharacter>>,
}
132
133impl TTSWithTimestampsResponse {
134 pub fn audio_bytes(&self) -> Result<Vec<u8>> {
136 B64.decode(&self.audio)
137 .map_err(|e| TypecastError::DecodeError(e.to_string()))
138 }
139
140 pub fn save_audio<P: AsRef<Path>>(&self, path: P) -> Result<()> {
142 let bytes = self.audio_bytes()?;
143 fs::write(path, bytes).map_err(|e| TypecastError::IoError(e.to_string()))
144 }
145
146 pub fn to_srt(&self) -> Result<String> {
151 format_captions(self, true)
152 }
153
154 pub fn to_vtt(&self) -> Result<String> {
159 format_captions(self, false)
160 }
161}
162
/// Longest time span, in seconds, that a cue may cover before
/// `group_into_cues` closes it and starts a new one. A single segment that is
/// longer than this on its own still becomes one cue.
const MAX_CAPTION_SECONDS: f64 = 7.0;
/// Largest number of `char`s the joined cue text may reach before
/// `group_into_cues` closes the cue and starts a new one.
const MAX_CAPTION_CHARS: usize = 42;
/// Suffixes that end a sentence (and therefore a cue): ASCII `.`, `?`, `!`
/// plus U+3002 (ideographic full stop), U+FF1F (fullwidth question mark) and
/// U+FF01 (fullwidth exclamation mark).
const SENTENCE_TERMINATORS: &[&str] = &[".", "?", "!", "\u{3002}", "\u{ff1f}", "\u{ff01}"];
170
/// One alignment unit (word or character) in the common shape the caption
/// builder works on; produced by `pick_segments`.
struct Segment {
    /// Text of the unit, copied from the alignment entry.
    text: String,
    /// Start time, handled as seconds.
    start: f64,
    /// End time, handled as seconds.
    end: f64,
}
176
/// One caption cue: the joined text of a run of segments and the time span
/// it covers; produced by `group_into_cues`.
struct Cue {
    /// Caption text (already joined and trimmed).
    text: String,
    /// Start time of the first segment in the cue, in seconds.
    start: f64,
    /// End time of the last segment in the cue, in seconds.
    end: f64,
}
182
183fn pick_segments(
188 resp: &TTSWithTimestampsResponse,
189) -> Result<(Vec<Segment>, bool)> {
190 let word_segs = |words: &[crate::timestamps::AlignmentSegmentWord]| -> Vec<Segment> {
191 words
192 .iter()
193 .map(|w| Segment {
194 text: w.text.clone(),
195 start: w.start,
196 end: w.end,
197 })
198 .collect()
199 };
200 let char_segs = |chars: &[crate::timestamps::AlignmentSegmentCharacter]| -> Vec<Segment> {
201 chars
202 .iter()
203 .map(|c| Segment {
204 text: c.text.clone(),
205 start: c.start,
206 end: c.end,
207 })
208 .collect()
209 };
210
211 let multi_words = resp.words.as_deref().filter(|w| w.len() >= 2);
213 let chars = resp.characters.as_deref().filter(|c| !c.is_empty());
215 let single_word = resp.words.as_deref().filter(|w| w.len() == 1);
217
218 if let Some(words) = multi_words {
219 Ok((word_segs(words), true))
220 } else if let Some(c) = chars {
221 Ok((char_segs(c), false))
222 } else if let Some(words) = single_word {
223 Ok((word_segs(words), true))
224 } else {
225 Err(TypecastError::CaptioningError(
226 "no alignment segments to caption from".into(),
227 ))
228 }
229}
230
/// Glues segment texts into one caption string.
///
/// In word mode the parts are separated by a single space; otherwise they are
/// concatenated directly. Leading and trailing whitespace of the result is
/// removed.
fn join_parts(parts: &[String], word_mode: bool) -> String {
    let joined = if word_mode {
        parts.join(" ")
    } else {
        parts.join("")
    };
    String::from(joined.trim())
}
237
238fn ends_in_sentence(text: &str) -> bool {
240 let trimmed = text.trim_end();
241 SENTENCE_TERMINATORS.iter().any(|t| trimmed.ends_with(t))
242}
243
244fn group_into_cues(segs: &[Segment], word_mode: bool) -> Vec<Cue> {
250 let mut cues: Vec<Cue> = Vec::new();
251 let mut parts: Vec<String> = Vec::new();
252 let mut cur_start: f64 = 0.0;
255 let mut last_end: f64 = 0.0;
256
257 fn emit(cues: &mut Vec<Cue>, text: String, start: f64, end: f64) {
259 if !text.is_empty() {
260 cues.push(Cue { text, start, end });
261 }
262 }
263
264 for seg in segs {
265 if !parts.is_empty() {
268 let mut tentative = parts.clone();
269 tentative.push(seg.text.clone());
270 let would_be = join_parts(&tentative, word_mode);
271 let too_long_secs = (seg.end - cur_start) > MAX_CAPTION_SECONDS;
272 let too_long_chars = would_be.chars().count() > MAX_CAPTION_CHARS;
273 if too_long_secs || too_long_chars {
274 emit(&mut cues, join_parts(&parts, word_mode), cur_start, last_end);
276 parts.clear();
277 }
278 }
279
280 if parts.is_empty() {
282 cur_start = seg.start;
283 }
284 parts.push(seg.text.clone());
285 last_end = seg.end;
286
287 if ends_in_sentence(&seg.text) {
289 emit(&mut cues, join_parts(&parts, word_mode), cur_start, seg.end);
290 parts.clear();
291 }
292 }
293
294 if !parts.is_empty() {
296 emit(&mut cues, join_parts(&parts, word_mode), cur_start, last_end);
297 }
298
299 cues
300}
301
/// Formats a time in seconds as an SRT timestamp, `HH:MM:SS,mmm`.
///
/// The value is rounded to the nearest millisecond. Negative and NaN inputs
/// are clamped to zero so the result is always a well-formed timestamp
/// (signed arithmetic would otherwise yield text such as `00:00:00,-500`).
/// The hour field grows beyond two digits when needed.
fn format_srt_time(seconds: f64) -> String {
    // `f64::max` returns the non-NaN operand, so NaN also becomes 0.0 here.
    let clamped = seconds.max(0.0);
    // The float-to-integer `as` cast saturates, so huge values cannot wrap.
    let total_ms = (clamped * 1000.0).round() as u64;

    let ms = total_ms % 1000;
    let total_sec = total_ms / 1000;
    let ss = total_sec % 60;
    let total_min = total_sec / 60;
    let mm = total_min % 60;
    let hh = total_min / 60;
    format!("{:02}:{:02}:{:02},{:03}", hh, mm, ss, ms)
}
313
314fn format_vtt_time(seconds: f64) -> String {
316 format_srt_time(seconds).replace(',', ".")
317}
318
319fn format_captions(resp: &TTSWithTimestampsResponse, srt: bool) -> Result<String> {
321 let (segs, word_mode) = pick_segments(resp)?;
322 let cues = group_into_cues(&segs, word_mode);
323 if cues.is_empty() {
324 return Err(TypecastError::CaptioningError(
325 "no alignment segments to caption from".into(),
326 ));
327 }
328
329 let mut out = String::new();
330 if !srt {
331 out.push_str("WEBVTT\n\n");
332 }
333 for (i, cue) in cues.iter().enumerate() {
334 if srt {
335 out.push_str(&format!("{}\n", i + 1));
336 }
337 let (s, e) = if srt {
338 (format_srt_time(cue.start), format_srt_time(cue.end))
339 } else {
340 (format_vtt_time(cue.start), format_vtt_time(cue.end))
341 };
342 out.push_str(&format!("{} --> {}\n", s, e));
343 out.push_str(&cue.text);
344 out.push_str("\n\n");
345 }
346 Ok(out)
347}