Skip to main content

lean_ctx/core/web/
youtube.rs

1//! YouTube transcript adapter (no API key required).
2//!
3//! Transcripts are obtained via YouTube's **InnerTube** `player` endpoint — the
4//! same internal API the official apps use. We POST an `ANDROID` client context
5//! and read the `captionTracks` it returns; those caption URLs are
6//! server-fetchable, unlike the session-bound ones embedded in the watch page
7//! (which now return empty bodies for non-browser callers). The chosen track is
8//! downloaded as JSON3 (with an srv3/XML fallback) and flattened to text.
9//!
10//! No Data API key is needed (that is only required for *search*/metadata, which
11//! is configured through the config-provider layer). All requests still flow
12//! through the SSRF-guarded [`super::fetch`].
13
14use serde::Deserialize;
15
16use super::{fetch, html_to_text, url_guard};
17
/// InnerTube `player` endpoint (same host the SSRF guard already permits).
const INNERTUBE_PLAYER: &str = "https://www.youtube.com/youtubei/v1/player";

/// Single source of truth for the Android app version we present.
///
/// Both the `clientVersion` field and the User-Agent string are built from this
/// literal so the two can never drift apart when the version is bumped.
macro_rules! android_client_version {
    () => {
        "20.10.38"
    };
}

/// Android client identity. InnerTube cross-checks the UA against `clientName`.
const ANDROID_CLIENT_VERSION: &str = android_client_version!();
/// User-Agent sent with the InnerTube request; embeds [`ANDROID_CLIENT_VERSION`].
const ANDROID_UA: &str = concat!(
    "com.google.android.youtube/",
    android_client_version!(),
    " (Linux; U; Android 14) gzip"
);
23
/// A flattened transcript ready for compression / distillation.
///
/// Derives `PartialEq`/`Eq` so callers (and tests) can compare transcripts
/// directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Transcript {
    /// Id of the video the transcript belongs to.
    pub video_id: String,
    /// Video title, when one is known.
    pub title: Option<String>,
    /// URL of the video the transcript was taken from.
    pub source_url: String,
    /// Caption text flattened into a single string.
    pub full_text: String,
}
32
33/// Extract a YouTube video id from common URL shapes, or `None`.
34pub fn video_id(url: &str) -> Option<String> {
35    let safe = url_guard::validate(url).ok()?;
36    let host = safe.host.to_ascii_lowercase();
37    let (path, query) = split_path_query(path_and_query(&safe));
38
39    if host == "youtu.be" || host.ends_with(".youtu.be") {
40        return clean_id(path.trim_start_matches('/'));
41    }
42    if host == "youtube.com" || host.ends_with(".youtube.com") {
43        if path.starts_with("/watch") {
44            if let Some(v) = query_param(query, "v") {
45                return clean_id(&v);
46            }
47        }
48        for prefix in ["/shorts/", "/embed/", "/v/", "/live/"] {
49            if let Some(rest) = path.strip_prefix(prefix) {
50                return clean_id(rest);
51            }
52        }
53    }
54    None
55}
56
57/// Download and flatten the transcript for `video_id`.
58pub fn fetch_transcript(video_id: &str, timeout_secs: u64) -> Result<Transcript, String> {
59    let player = innertube_player(video_id, timeout_secs)?;
60
61    let tracks = player.caption_tracks();
62    if tracks.is_empty() {
63        return Err(format!(
64            "no captions available for video {video_id}{}",
65            player.unavailable_reason()
66        ));
67    }
68
69    let track = select_caption_track(&tracks);
70    let url = json3_url(&track.base_url);
71    let data = fetch::fetch(&url, fetch::DEFAULT_MAX_BYTES, timeout_secs)?;
72    if data.status >= 400 {
73        return Err(format!(
74            "failed to download transcript (HTTP {})",
75            data.status
76        ));
77    }
78
79    let full_text = parse_timedtext(&data.body_text())?;
80    if full_text.trim().is_empty() {
81        return Err(format!("transcript for video {video_id} was empty"));
82    }
83
84    Ok(Transcript {
85        video_id: video_id.to_string(),
86        title: player.title(),
87        source_url: format!("https://www.youtube.com/watch?v={video_id}"),
88        full_text,
89    })
90}
91
92// ── InnerTube player ───────────────────────────────────────────────────────
93
94fn innertube_player(video_id: &str, timeout_secs: u64) -> Result<PlayerResponse, String> {
95    let body = serde_json::json!({
96        "context": {
97            "client": {
98                "clientName": "ANDROID",
99                "clientVersion": ANDROID_CLIENT_VERSION,
100                "androidSdkVersion": 34,
101                "hl": "en"
102            }
103        },
104        "videoId": video_id
105    })
106    .to_string();
107
108    let resp = fetch::post(
109        INNERTUBE_PLAYER,
110        "application/json",
111        ANDROID_UA,
112        &body,
113        fetch::DEFAULT_MAX_BYTES,
114        timeout_secs,
115    )?;
116    if resp.status >= 400 {
117        return Err(format!("InnerTube player returned HTTP {}", resp.status));
118    }
119
120    serde_json::from_str::<PlayerResponse>(&resp.body_text())
121        .map_err(|e| format!("could not parse InnerTube player response: {e}"))
122}
123
124#[derive(Deserialize)]
125struct PlayerResponse {
126    captions: Option<CaptionsBlock>,
127    #[serde(rename = "videoDetails")]
128    video_details: Option<VideoDetails>,
129    #[serde(rename = "playabilityStatus")]
130    playability: Option<Playability>,
131}
132
133impl PlayerResponse {
134    fn caption_tracks(&self) -> Vec<CaptionTrack> {
135        self.captions
136            .as_ref()
137            .and_then(|c| c.renderer.as_ref())
138            .map(|r| r.caption_tracks.clone())
139            .unwrap_or_default()
140    }
141
142    fn title(&self) -> Option<String> {
143        self.video_details
144            .as_ref()
145            .and_then(|v| v.title.clone())
146            .filter(|t| !t.is_empty())
147    }
148
149    /// A human-readable reason suffix when no captions are present.
150    fn unavailable_reason(&self) -> String {
151        match self.playability.as_ref() {
152            Some(p) if p.status.as_deref().is_some_and(|s| s != "OK") => {
153                let status = p.status.as_deref().unwrap_or("");
154                let reason = p.reason.as_deref().unwrap_or("");
155                format!(
156                    " ({status}{}{reason})",
157                    if reason.is_empty() { "" } else { ": " }
158                )
159            }
160            _ => " (captions disabled or none published)".to_string(),
161        }
162    }
163}
164
/// The `captions` section of a player response.
#[derive(Deserialize)]
struct CaptionsBlock {
    /// Tracklist renderer holding the caption tracks; optional, so a captions
    /// section without it still deserializes.
    #[serde(rename = "playerCaptionsTracklistRenderer")]
    renderer: Option<TracklistRenderer>,
}
170
171#[derive(Deserialize)]
172struct TracklistRenderer {
173    #[serde(rename = "captionTracks", default)]
174    caption_tracks: Vec<CaptionTrack>,
175}
176
/// The `videoDetails` section of a player response; only the title is read.
#[derive(Deserialize)]
struct VideoDetails {
    /// Video title, if the response carries one.
    title: Option<String>,
}
181
/// The `playabilityStatus` section of a player response.
#[derive(Deserialize)]
struct Playability {
    /// Status string; `unavailable_reason` treats any value other than `"OK"`
    /// as a failure worth reporting.
    status: Option<String>,
    /// Optional explanation that accompanies the status in error messages.
    reason: Option<String>,
}
187
188// ── Caption track selection ────────────────────────────────────────────────
189
190#[derive(Deserialize, Clone)]
191struct CaptionTrack {
192    #[serde(rename = "baseUrl")]
193    base_url: String,
194    #[serde(rename = "languageCode")]
195    language_code: Option<String>,
196    kind: Option<String>,
197}
198
199impl CaptionTrack {
200    fn is_english(&self) -> bool {
201        self.language_code
202            .as_deref()
203            .is_some_and(|c| c.starts_with("en"))
204    }
205
206    fn is_auto_generated(&self) -> bool {
207        self.kind.as_deref() == Some("asr")
208    }
209}
210
211/// Prefer a manual English track, then any English, then any manual, else first.
212/// The caller guarantees `tracks` is non-empty.
213fn select_caption_track(tracks: &[CaptionTrack]) -> &CaptionTrack {
214    tracks
215        .iter()
216        .find(|t| t.is_english() && !t.is_auto_generated())
217        .or_else(|| tracks.iter().find(|t| t.is_english()))
218        .or_else(|| tracks.iter().find(|t| !t.is_auto_generated()))
219        .unwrap_or(&tracks[0])
220}
221
/// Force the JSON3 caption format: drop any pre-set `fmt` parameter and
/// request `fmt=json3` instead.
///
/// The query string is located explicitly, so the rewrite is correct when:
/// - `fmt=` is the first parameter (directly after `?`),
/// - the URL has no query string at all (a `?` is added rather than a bare `&`),
/// - the URL carries a `#fragment` (it is dropped; fragments are never sent to
///   the server and would otherwise swallow the appended parameter).
///
/// All other parameters keep their original order.
fn json3_url(base_url: &str) -> String {
    let without_fragment = base_url
        .split_once('#')
        .map_or(base_url, |(head, _fragment)| head);
    let (base, query) = without_fragment
        .split_once('?')
        .unwrap_or((without_fragment, ""));

    let mut url = String::with_capacity(without_fragment.len() + "?fmt=json3".len());
    url.push_str(base);

    // The first emitted parameter is introduced by `?`, every later one by `&`.
    let mut separator = '?';
    let kept = query
        .split('&')
        .filter(|pair| !pair.is_empty() && *pair != "fmt" && !pair.starts_with("fmt="));
    for pair in kept {
        url.push(separator);
        url.push_str(pair);
        separator = '&';
    }

    url.push(separator);
    url.push_str("fmt=json3");
    url
}
231
232// ── Transcript parsing (JSON3 primary, srv3/XML fallback) ───────────────────
233
234fn parse_timedtext(body: &str) -> Result<String, String> {
235    let trimmed = body.trim_start();
236    if trimmed.starts_with('{') {
237        parse_json3(body)
238    } else if trimmed.starts_with('<') {
239        Ok(parse_srv3_xml(body))
240    } else {
241        Err("transcript response was neither JSON3 nor srv3/XML".to_string())
242    }
243}
244
/// Top-level shape of a JSON3 caption document.
#[derive(Deserialize)]
struct Json3 {
    /// Caption events; a missing `events` key deserializes to an empty list.
    #[serde(default)]
    events: Vec<Json3Event>,
}
250
/// One JSON3 event; `parse_json3` turns each event into one caption line.
#[derive(Deserialize)]
struct Json3Event {
    /// Text segments of the event; a missing `segs` key deserializes to an
    /// empty list.
    #[serde(default)]
    segs: Vec<Json3Seg>,
}
256
/// One text segment inside a JSON3 event.
#[derive(Deserialize)]
struct Json3Seg {
    /// Segment text; a missing `utf8` key deserializes to an empty string.
    #[serde(default)]
    utf8: String,
}
262
263fn parse_json3(body: &str) -> Result<String, String> {
264    let parsed: Json3 =
265        serde_json::from_str(body).map_err(|e| format!("could not parse transcript json: {e}"))?;
266
267    let mut out = String::new();
268    for event in parsed.events {
269        let line: String = event.segs.iter().map(|s| s.utf8.as_str()).collect();
270        let line = line.replace('\n', " ");
271        let line = line.trim();
272        if line.is_empty() {
273            continue;
274        }
275        if !out.is_empty() {
276            out.push(' ');
277        }
278        out.push_str(line);
279    }
280    Ok(out)
281}
282
283/// Flatten the srv3/XML timedtext format (`<p ...>text</p>`), entity-decoded.
284fn parse_srv3_xml(xml: &str) -> String {
285    let mut raw = String::with_capacity(xml.len() / 2);
286    let mut in_tag = false;
287    let mut tag = String::new();
288    for c in xml.chars() {
289        match c {
290            '<' => {
291                in_tag = true;
292                tag.clear();
293            }
294            '>' => {
295                in_tag = false;
296                // A closing paragraph marks a caption-line boundary.
297                if tag.starts_with("/p") {
298                    raw.push('\n');
299                }
300            }
301            _ if in_tag => tag.push(c),
302            _ => raw.push(c),
303        }
304    }
305
306    let decoded = html_to_text::decode_entities(&raw);
307    let mut out = String::new();
308    for line in decoded.split('\n') {
309        let line = line.trim();
310        if line.is_empty() {
311            continue;
312        }
313        if !out.is_empty() {
314            out.push(' ');
315        }
316        out.push_str(line);
317    }
318    out
319}
320
321// ── Small URL helpers ──────────────────────────────────────────────────────
322
323fn path_and_query(safe: &url_guard::SafeUrl) -> &str {
324    let prefix = safe.scheme.len() + 3 + safe.authority.len();
325    safe.normalized.get(prefix..).unwrap_or("")
326}
327
/// Split a `path?query#fragment` string into `(path, query)`.
///
/// The fragment is discarded, an empty path is reported as `/`, and a missing
/// query is reported as `""`.
fn split_path_query(pq: &str) -> (&str, &str) {
    let without_fragment = match pq.find('#') {
        Some(at) => &pq[..at],
        None => pq,
    };
    let (path, query) = without_fragment
        .split_once('?')
        .unwrap_or((without_fragment, ""));
    let path = if path.is_empty() { "/" } else { path };
    (path, query)
}
335
/// Return the raw (not percent-decoded) value of the first `key=value` pair in
/// `query` whose key equals `key`. Pairs without an `=` are ignored.
fn query_param(query: &str, key: &str) -> Option<String> {
    for pair in query.split('&') {
        if let Some((name, value)) = pair.split_once('=') {
            if name == key {
                return Some(value.to_string());
            }
        }
    }
    None
}
346
/// Keep the leading run of id characters (ASCII letters, digits, `-`, `_`) of
/// `raw`; `None` when that run is empty.
fn clean_id(raw: &str) -> Option<String> {
    let is_id_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '-' | '_');
    // Byte offset of the first character that cannot be part of an id.
    let end = raw.find(|c: char| !is_id_char(c)).unwrap_or(raw.len());
    let id = &raw[..end];
    (!id.is_empty()).then(|| id.to_string())
}
358
// Unit tests for the pure helpers in this module: id extraction, caption
// track selection, URL rewriting and transcript parsing.
#[cfg(test)]
mod tests {
    use super::*;

    // `/watch?v=<id>`: the id ends at the next `&`, so trailing parameters
    // are not included.
    #[test]
    fn extracts_id_from_watch_url() {
        assert_eq!(
            video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42"),
            Some("dQw4w9WgXcQ".to_string())
        );
    }

    // `youtu.be/<id>`, `/shorts/<id>` and `/embed/<id>` carry the id in the path.
    #[test]
    fn extracts_id_from_short_and_shorts_and_embed() {
        assert_eq!(
            video_id("https://youtu.be/dQw4w9WgXcQ?si=abc"),
            Some("dQw4w9WgXcQ".to_string())
        );
        assert_eq!(
            video_id("https://www.youtube.com/shorts/abc123DEF45"),
            Some("abc123DEF45".to_string())
        );
        assert_eq!(
            video_id("https://www.youtube.com/embed/xyz789ABCde"),
            Some("xyz789ABCde".to_string())
        );
    }

    // A YouTube-shaped path on a foreign host must not produce an id.
    #[test]
    fn non_youtube_url_returns_none() {
        assert_eq!(video_id("https://example.com/watch?v=abc"), None);
        assert_eq!(video_id("https://vimeo.com/12345"), None);
    }

    // The manual English track wins even though the auto-generated English
    // track is listed first.
    #[test]
    fn selects_manual_english_track_over_asr() {
        let tracks = vec![
            CaptionTrack {
                base_url: "https://t/asr".into(),
                language_code: Some("en".into()),
                kind: Some("asr".into()),
            },
            CaptionTrack {
                base_url: "https://t/manual".into(),
                language_code: Some("en".into()),
                kind: None,
            },
            CaptionTrack {
                base_url: "https://t/de".into(),
                language_code: Some("de".into()),
                kind: None,
            },
        ];
        assert_eq!(select_caption_track(&tracks).base_url, "https://t/manual");
    }

    // With no English and no manual track, selection falls back to the first entry.
    #[test]
    fn selects_any_when_no_english() {
        let tracks = vec![CaptionTrack {
            base_url: "https://t/fr".into(),
            language_code: Some("fr".into()),
            kind: Some("asr".into()),
        }];
        assert_eq!(select_caption_track(&tracks).base_url, "https://t/fr");
    }

    // An existing `fmt=` is replaced, and `fmt=json3` is appended when none was set.
    #[test]
    fn json3_url_forces_format() {
        assert_eq!(
            json3_url("https://yt/api/timedtext?v=x&ei=y&fmt=srv3&hl=en"),
            "https://yt/api/timedtext?v=x&ei=y&hl=en&fmt=json3"
        );
        assert_eq!(
            json3_url("https://yt/api/timedtext?v=x"),
            "https://yt/api/timedtext?v=x&fmt=json3"
        );
    }

    // Segments are concatenated per event, newlines become spaces, and the
    // newline-only third event is dropped.
    #[test]
    fn parses_json3_into_joined_text() {
        let body = r#"{"events":[
            {"tStartMs":0,"segs":[{"utf8":"Hello"},{"utf8":" world"}]},
            {"tStartMs":1000,"segs":[{"utf8":"second\n"},{"utf8":"line"}]},
            {"tStartMs":2000,"segs":[{"utf8":"\n"}]}
        ]}"#;
        assert_eq!(parse_json3(body).unwrap(), "Hello world second line");
    }

    // Markup (including the nested `<s>`) is stripped, `&#39;` is decoded, and
    // the two paragraphs are joined with a single space.
    #[test]
    fn parses_srv3_xml_into_joined_text() {
        let xml = r#"<?xml version="1.0" encoding="utf-8" ?><timedtext format="3">
<body>
<p t="0" d="1680">We&#39;re no strangers</p>
<p t="1680" d="2000">to <s>love</s></p>
</body></timedtext>"#;
        assert_eq!(parse_srv3_xml(xml), "We're no strangers to love");
    }

    // Input that starts with neither `{` nor `<` is rejected; an empty JSON3
    // event list parses to an empty string.
    #[test]
    fn parse_timedtext_dispatches_on_shape() {
        assert!(parse_timedtext("not json or xml").is_err());
        assert_eq!(parse_timedtext("{\"events\":[]}").unwrap(), "");
    }
}