Skip to main content

omni_dev/transcript/sources/youtube/
timedtext.rs

1//! Parser for YouTube's `json3` (a.k.a. `srv3`) timedtext format.
2//!
3//! The endpoint at `https://www.youtube.com/api/timedtext?...&fmt=json3`
4//! returns a document with an `events` array. Each event has a start time,
5//! duration, and a list of `seg` entries whose `utf8` payloads are
6//! concatenated to form the cue text. Events without `segs` are styling /
7//! window markers and are skipped here.
8
9use serde::Deserialize;
10
11use crate::transcript::cue::Cue;
12use crate::transcript::error::{Result, TranscriptError};
13
14/// Top-level json3 document.
15#[derive(Clone, Debug, Deserialize, Default)]
16struct Json3 {
17    #[serde(default)]
18    events: Vec<Event>,
19}
20
21/// A single event entry. Most fields are optional because YouTube emits
22/// styling-only events that have no timing or text.
23#[derive(Clone, Debug, Deserialize, Default)]
24#[serde(rename_all = "camelCase")]
25struct Event {
26    #[serde(default, rename = "tStartMs")]
27    t_start_ms: Option<u64>,
28    #[serde(default, rename = "dDurationMs")]
29    d_duration_ms: Option<u64>,
30    #[serde(default)]
31    segs: Option<Vec<Segment>>,
32}
33
34/// A single text segment within an event.
35#[derive(Clone, Debug, Deserialize, Default)]
36struct Segment {
37    #[serde(default)]
38    utf8: Option<String>,
39}
40
41/// GET a fully-prepared timedtext URL and return the response body.
42///
43/// `url` is consumed as-is. Callers normally obtain it from
44/// [`super::player_response::SelectedTrack::fetch_url`], which already
45/// carries the signed signature, `fmt=json3`, and any `tlang=` parameter.
46pub async fn fetch(http: &reqwest::Client, url: &str) -> Result<String> {
47    let response = http.get(url).send().await?.error_for_status()?;
48    Ok(response.text().await?)
49}
50
51/// Parse a json3 timedtext document into a list of cues, dropping events
52/// that carry no text (styling / window markers).
53pub fn parse(raw: &str) -> Result<Vec<Cue>> {
54    let doc: Json3 = serde_json::from_str(raw)
55        .map_err(|e| TranscriptError::ParseError(format!("timedtext json3: {e}")))?;
56
57    let mut cues = Vec::with_capacity(doc.events.len());
58    for event in doc.events {
59        let Some(segs) = event.segs else {
60            continue;
61        };
62        let text = segs.into_iter().filter_map(|s| s.utf8).collect::<String>();
63        if text.is_empty() {
64            continue;
65        }
66        let start_ms = event.t_start_ms.unwrap_or(0);
67        let end_ms = start_ms.saturating_add(event.d_duration_ms.unwrap_or(0));
68        cues.push(Cue::new(start_ms, end_ms, text));
69    }
70    Ok(cues)
71}
72
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    use wiremock::matchers::{method, path, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Sample json3 document shared by the parser and fetch tests.
    const FIXTURE_BASIC: &str = include_str!("fixtures/timedtext_basic.json");

    /// Collect the text of every cue, in order.
    fn texts_of(cues: &[Cue]) -> Vec<&str> {
        cues.iter().map(|cue| cue.text.as_str()).collect()
    }

    /// Build the timedtext URL the fetch tests request from a mock server.
    fn timedtext_url(server: &MockServer) -> String {
        format!("{}/api/timedtext?lang=en&fmt=json3", server.uri())
    }

    // ---- parse: fixture and trivial documents ----

    #[test]
    fn parse_basic_fixture() {
        let parsed = parse(FIXTURE_BASIC).unwrap();
        let expected = vec![
            Cue::new(0, 1500, "Hello, world."),
            Cue::new(2000, 3000, "This is a test."),
            Cue::new(4000, 6000, "Final cue\nwith newline."),
        ];
        assert_eq!(parsed, expected);
    }

    #[test]
    fn parse_empty_events_array() {
        let parsed = parse(r#"{"events": []}"#).unwrap();
        assert_eq!(parsed.len(), 0);
    }

    #[test]
    fn parse_missing_events_key_is_empty() {
        let parsed = parse("{}").unwrap();
        assert_eq!(parsed.len(), 0);
    }

    // ---- parse: events that must be dropped ----

    #[test]
    fn parse_skips_event_without_segs() {
        let doc = r#"{
            "events": [
                { "tStartMs": 0, "dDurationMs": 1000 },
                { "tStartMs": 1000, "dDurationMs": 1000, "segs": [{"utf8": "kept"}] }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(texts_of(&parsed), ["kept"]);
    }

    #[test]
    fn parse_skips_event_with_empty_text() {
        let doc = r#"{
            "events": [
                { "tStartMs": 0, "dDurationMs": 1000, "segs": [{}] },
                { "tStartMs": 1000, "dDurationMs": 1000, "segs": [{"utf8": ""}] },
                { "tStartMs": 2000, "dDurationMs": 1000, "segs": [{"utf8": "kept"}] }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(texts_of(&parsed), ["kept"]);
    }

    // ---- parse: text assembly and timing defaults ----

    #[test]
    fn parse_concatenates_multiple_segs() {
        let doc = r#"{
            "events": [
                {
                    "tStartMs": 0,
                    "dDurationMs": 500,
                    "segs": [
                        {"utf8": "a "},
                        {"utf8": "b "},
                        {"utf8": "c"}
                    ]
                }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(parsed, [Cue::new(0, 500, "a b c")]);
    }

    #[test]
    fn parse_uses_zero_when_start_missing() {
        let doc = r#"{
            "events": [
                { "dDurationMs": 1000, "segs": [{"utf8": "x"}] }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(parsed, [Cue::new(0, 1000, "x")]);
    }

    #[test]
    fn parse_uses_zero_when_duration_missing() {
        let doc = r#"{
            "events": [
                { "tStartMs": 1500, "segs": [{"utf8": "instant"}] }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(parsed, [Cue::new(1500, 1500, "instant")]);
    }

    #[test]
    fn parse_saturates_when_duration_overflows() {
        // Start close enough to `u64::MAX` that adding the duration overflows.
        let start = u64::MAX - 100;
        let dur = 1000;
        let doc = format!(
            r#"{{ "events": [ {{ "tStartMs": {start}, "dDurationMs": {dur}, "segs": [{{"utf8":"x"}}] }} ] }}"#,
        );
        let parsed = parse(&doc).unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].end_ms, u64::MAX);
    }

    // ---- parse: error reporting and tolerance ----

    #[test]
    fn parse_invalid_json_errors() {
        let failure = parse("{ not json").unwrap_err();
        assert!(matches!(failure, TranscriptError::ParseError(_)));
        let message = failure.to_string();
        assert!(message.contains("timedtext json3"));
    }

    #[test]
    fn parse_ignores_unknown_event_fields() {
        let doc = r#"{
            "events": [
                {
                    "tStartMs": 0,
                    "dDurationMs": 100,
                    "wWinId": 1,
                    "wpWinPosId": 2,
                    "segs": [{"utf8": "x", "tOffsetMs": 0, "acAsrConf": 256}]
                }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(parsed, [Cue::new(0, 100, "x")]);
    }

    #[test]
    fn parse_preserves_event_order() {
        let doc = r#"{
            "events": [
                { "tStartMs": 0,    "dDurationMs": 100, "segs": [{"utf8": "first"}] },
                { "tStartMs": 200,  "dDurationMs": 100, "segs": [{"utf8": "second"}] },
                { "tStartMs": 1000, "dDurationMs": 100, "segs": [{"utf8": "third"}] }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(texts_of(&parsed), ["first", "second", "third"]);
    }

    #[test]
    fn parse_handles_unicode_text() {
        let doc = r#"{
            "events": [
                { "tStartMs": 0, "dDurationMs": 100, "segs": [{"utf8": "こんにちは "}, {"utf8": "🌍"}] }
            ]
        }"#;
        let parsed = parse(doc).unwrap();
        assert_eq!(parsed, [Cue::new(0, 100, "こんにちは 🌍")]);
    }

    // ---- fetch ----

    #[tokio::test]
    async fn fetch_returns_body_for_2xx() {
        let server = MockServer::start().await;
        let reply = ResponseTemplate::new(200).set_body_string(FIXTURE_BASIC);
        Mock::given(method("GET"))
            .and(path("/api/timedtext"))
            .and(query_param("fmt", "json3"))
            .respond_with(reply)
            .expect(1)
            .mount(&server)
            .await;

        let client = reqwest::Client::builder().build().unwrap();
        let body = fetch(&client, &timedtext_url(&server)).await.unwrap();
        assert_eq!(body, FIXTURE_BASIC);
    }

    #[tokio::test]
    async fn fetch_surfaces_non_2xx_as_http_error() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/api/timedtext"))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let client = reqwest::Client::builder().build().unwrap();
        let failure = fetch(&client, &timedtext_url(&server))
            .await
            .unwrap_err();
        assert!(matches!(failure, TranscriptError::Http(_)));
    }
}