Skip to main content

imp_core/tools/web/
youtube.rs

1use reqwest::Client;
2use serde_json::Value;
3use url::Url;
4
5use super::types::{ContentFormat, PageContent};
6
/// Base of the canonical watch-page URL; the video id is appended as the `v` query parameter.
const WATCH_BASE_URL: &str = "https://www.youtube.com/watch";
/// Marker searched for in the watch-page HTML to locate the embedded player response JSON.
const PLAYER_RESPONSE_VAR: &str = "ytInitialPlayerResponse";
/// Value sent in the `X-YouTube-Client-Name` header of the player API fallback request.
/// NOTE(review): presumably the numeric id YouTube assigns to the ANDROID_VR client — confirm.
const ANDROID_VR_CLIENT_NAME: &str = "28";
/// Client version reported both in the player API payload and in the
/// `X-YouTube-Client-Version` header of the fallback request.
const ANDROID_VR_CLIENT_VERSION: &str = "1.71.26";
/// User agent used for the player API fallback request and for transcript downloads that
/// follow it; the same string is also embedded in the request payload.
const ANDROID_VR_USER_AGENT: &str = "com.google.android.apps.youtube.vr.oculus/1.71.26 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip";
12
/// Newtype wrapper around a YouTube video id string.
#[derive(Debug, Clone, PartialEq)]
struct VideoId(String);

impl VideoId {
    /// Borrows the wrapped id as a string slice.
    fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
21
/// One caption track advertised by a player response.
#[derive(Debug, Clone, PartialEq)]
struct CaptionTrack {
    /// URL the transcript body is downloaded from (the track's `baseUrl`).
    base_url: String,
    /// Language code reported for the track (the track's `languageCode`).
    language_code: String,
    /// Display name of the track, or `"unknown"` when none could be read.
    name: String,
    /// `true` when the track's `kind` is `"asr"`.
    is_generated: bool,
}
29
/// A single piece of transcript text with its timing.
#[derive(Debug, Clone, PartialEq)]
struct TranscriptSegment {
    /// Start offset in milliseconds; parsers fall back to 0 when the source omits it.
    start_ms: u64,
    /// Duration in milliseconds, when the source provides one.
    duration_ms: Option<u64>,
    /// Whitespace-normalized caption text.
    text: String,
}
36
/// Descriptive video fields read from a player response; every field is optional because
/// any of them may be absent from the response.
#[derive(Debug, Clone, Default, PartialEq)]
struct VideoMetadata {
    /// Video title.
    title: Option<String>,
    /// Channel (author) name.
    author: Option<String>,
    /// Channel identifier.
    channel_id: Option<String>,
    /// Video length in seconds, kept as the string found in the response.
    duration_seconds: Option<String>,
    /// View count, kept as the string found in the response.
    view_count: Option<String>,
    /// Video description text.
    description: Option<String>,
    /// Publish date string taken from the microformat section.
    publish_date: Option<String>,
    /// Upload date string taken from the microformat section.
    upload_date: Option<String>,
}
48
/// Result of resolving captions from one player response.
struct CaptionSource {
    /// Every usable caption track found in the player response.
    tracks: Vec<CaptionTrack>,
    /// The track whose transcript was downloaded.
    selected_track: CaptionTrack,
    /// Transcript segments parsed from the selected track (may be empty).
    segments: Vec<TranscriptSegment>,
    /// Label of the client whose player response produced this source
    /// (`"web"` or `"android_vr"` in this file).
    source_client: &'static str,
}
55
56pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, YouTubeError> {
57    let parsed_url = Url::parse(url).map_err(|err| YouTubeError::InvalidUrl(err.to_string()))?;
58    let video_id = extract_video_id(&parsed_url).ok_or(YouTubeError::UnsupportedUrl)?;
59    let requested_url = url.to_string();
60    let watch_url = canonical_watch_url(video_id.as_str());
61
62    let response = client
63        .get(watch_url.as_str())
64        .header("User-Agent", super::read::USER_AGENT)
65        .header("Accept", super::read::ACCEPT_HEADER)
66        .header("Accept-Language", "en-US,en;q=0.9")
67        .send()
68        .await
69        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
70
71    let status_code = response.status().as_u16();
72    if !response.status().is_success() {
73        return Err(YouTubeError::HttpStatus(
74            status_code,
75            response
76                .status()
77                .canonical_reason()
78                .unwrap_or("Unknown")
79                .to_string(),
80        ));
81    }
82
83    let final_url = response.url().to_string();
84    let content_type = response
85        .headers()
86        .get("content-type")
87        .and_then(|value| value.to_str().ok())
88        .map(str::to_string);
89    let html = response
90        .text()
91        .await
92        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
93    let raw_body_bytes = html.len();
94
95    let initial_player_response = extract_initial_player_response(&html)?;
96    let metadata = extract_metadata(&initial_player_response);
97    let visitor_data = extract_visitor_data(&html);
98    let caption_source = resolve_caption_source(
99        client,
100        video_id.as_str(),
101        &initial_player_response,
102        visitor_data.as_deref(),
103    )
104    .await;
105
106    let title = metadata
107        .title
108        .clone()
109        .unwrap_or_else(|| format!("YouTube video {}", video_id.as_str()));
110    let (text, diagnostics) = match caption_source {
111        Ok(caption_source) if !caption_source.segments.is_empty() => (
112            format_video_context(
113                video_id.as_str(),
114                &watch_url,
115                &metadata,
116                &caption_source.selected_track,
117                &caption_source.segments,
118            ),
119            build_diagnostics(
120                &caption_source.tracks,
121                &caption_source.selected_track,
122                caption_source.segments.len(),
123                caption_source.source_client,
124            ),
125        ),
126        Ok(caption_source) => (
127            format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
128            build_metadata_only_diagnostics(Some(&format!(
129                "Caption tracks were found, but no transcript segments were extracted from {}.",
130                caption_source.source_client
131            ))),
132        ),
133        Err(err @ (YouTubeError::CaptionTracksMissing | YouTubeError::NoUsableCaptionTrack)) => (
134            format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
135            build_metadata_only_diagnostics(Some(&err.to_string())),
136        ),
137        Err(err @ (YouTubeError::TranscriptEmpty | YouTubeError::TranscriptParse(_))) => (
138            format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
139            build_metadata_only_diagnostics(Some(&err.to_string())),
140        ),
141        Err(err) => return Err(err),
142    };
143
144    let was_redirected = final_url != watch_url;
145
146    Ok(PageContent {
147        title: Some(title),
148        content_length: text.len(),
149        text,
150        url: final_url,
151        requested_url,
152        status_code,
153        content_type,
154        format_received: ContentFormat::Html,
155        was_redirected,
156        raw_body_bytes,
157        diagnostics,
158    })
159}
160
/// Resolves captions for a video, first from the watch page's own player response and, if
/// that fails, from a player response requested as the ANDROID_VR client.
///
/// The fallback needs `visitor_data`; without it the error from the first attempt is
/// returned. When the fallback is attempted, its result (success or error) is what the
/// caller receives.
async fn resolve_caption_source(
    client: &Client,
    video_id: &str,
    initial_player_response: &Value,
    visitor_data: Option<&str>,
) -> Result<CaptionSource, YouTubeError> {
    let web_result = fetch_caption_source_from_response(
        client,
        initial_player_response,
        super::read::USER_AGENT,
        "web",
    )
    .await;
    if web_result.is_ok() {
        return web_result;
    }
    // `web_result` is an `Err` here (the `Ok` case returned above), so `.err()` yields `Some`.
    let web_error = web_result.err();

    let Some(visitor_data) = visitor_data else {
        // NOTE(review): since `web_error` is `Some` at this point, the `VisitorDataMissing`
        // default looks unreachable; the web error is what gets returned.
        return Err(web_error.unwrap_or(YouTubeError::VisitorDataMissing));
    };

    let android_vr_response =
        fetch_android_vr_player_response(client, video_id, visitor_data).await?;
    fetch_caption_source_from_response(
        client,
        &android_vr_response,
        ANDROID_VR_USER_AGENT,
        "android_vr",
    )
    .await
}
193
194async fn fetch_caption_source_from_response(
195    client: &Client,
196    player_response: &Value,
197    user_agent: &str,
198    source_client: &'static str,
199) -> Result<CaptionSource, YouTubeError> {
200    let tracks = extract_caption_tracks(player_response)?;
201    let selected_track = select_caption_track(&tracks).ok_or(YouTubeError::NoUsableCaptionTrack)?;
202    let segments = fetch_transcript_segments(client, &selected_track, user_agent).await?;
203
204    Ok(CaptionSource {
205        tracks,
206        selected_track,
207        segments,
208        source_client,
209    })
210}
211
212async fn fetch_transcript_segments(
213    client: &Client,
214    track: &CaptionTrack,
215    user_agent: &str,
216) -> Result<Vec<TranscriptSegment>, YouTubeError> {
217    let transcript_url = caption_url_with_json3(&track.base_url)?;
218    let transcript_response = client
219        .get(transcript_url.as_str())
220        .header("User-Agent", user_agent)
221        .header(
222            "Accept",
223            "application/json,text/xml,text/plain;q=0.9,*/*;q=0.5",
224        )
225        .header("Accept-Language", "en-US,en;q=0.9")
226        .send()
227        .await
228        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
229
230    if !transcript_response.status().is_success() {
231        return Err(YouTubeError::TranscriptHttpStatus(
232            transcript_response.status().as_u16(),
233        ));
234    }
235
236    let transcript_text = transcript_response
237        .text()
238        .await
239        .map_err(|err| YouTubeError::TranscriptParse(err.to_string()))?;
240    if transcript_text.trim().is_empty() {
241        return Err(YouTubeError::TranscriptEmpty);
242    }
243
244    if transcript_text.trim_start().starts_with('<') {
245        return Ok(parse_xml_transcript(&transcript_text));
246    }
247
248    let transcript_json: Value = serde_json::from_str(&transcript_text)
249        .map_err(|err| YouTubeError::TranscriptParse(err.to_string()))?;
250    Ok(parse_json3_transcript(&transcript_json))
251}
252
/// Requests a player response for `video_id` from the `youtubei/v1/player` endpoint while
/// identifying as the ANDROID_VR client.
///
/// `visitor_data` is sent in the `X-Goog-Visitor-Id` header.
///
/// # Errors
/// - [`YouTubeError::Fetch`] when the request cannot be sent.
/// - [`YouTubeError::PlayerApiHttpStatus`] on a non-success status.
/// - [`YouTubeError::PlayerResponseParse`] when the body is not valid JSON.
/// - [`YouTubeError::PlayerApiLoginRequired`] when the response's playability status is
///   `LOGIN_REQUIRED`; the reason text from the response is included when present.
async fn fetch_android_vr_player_response(
    client: &Client,
    video_id: &str,
    visitor_data: &str,
) -> Result<Value, YouTubeError> {
    // Client context describing the device the request claims to come from; the version and
    // user agent reuse the constants that are also sent as headers below.
    let payload = serde_json::json!({
        "context": {
            "client": {
                "clientName": "ANDROID_VR",
                "clientVersion": ANDROID_VR_CLIENT_VERSION,
                "deviceMake": "Oculus",
                "deviceModel": "Quest 3",
                "androidSdkVersion": 32,
                "userAgent": ANDROID_VR_USER_AGENT,
                "osName": "Android",
                "osVersion": "12L",
                "hl": "en",
                "gl": "US"
            }
        },
        "videoId": video_id,
        "contentCheckOk": true,
        "racyCheckOk": true
    });

    let response = client
        .post("https://www.youtube.com/youtubei/v1/player")
        .header("Content-Type", "application/json")
        .header("User-Agent", ANDROID_VR_USER_AGENT)
        .header("X-YouTube-Client-Name", ANDROID_VR_CLIENT_NAME)
        .header("X-YouTube-Client-Version", ANDROID_VR_CLIENT_VERSION)
        .header("X-Goog-Visitor-Id", visitor_data)
        .header("Origin", "https://www.youtube.com")
        .json(&payload)
        .send()
        .await
        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;

    if !response.status().is_success() {
        return Err(YouTubeError::PlayerApiHttpStatus(
            response.status().as_u16(),
        ));
    }

    let player_response = response
        .json::<Value>()
        .await
        .map_err(|err| YouTubeError::PlayerResponseParse(err.to_string()))?;

    // A successful HTTP status can still carry a sign-in demand in the body.
    if player_response
        .pointer("/playabilityStatus/status")
        .and_then(Value::as_str)
        == Some("LOGIN_REQUIRED")
    {
        return Err(YouTubeError::PlayerApiLoginRequired(
            player_response
                .pointer("/playabilityStatus/reason")
                .and_then(Value::as_str)
                .unwrap_or("sign-in required")
                .to_string(),
        ));
    }

    Ok(player_response)
}
318
319pub fn is_youtube_url(url: &Url) -> bool {
320    url.host_str().is_some_and(|host| {
321        let host = host.to_ascii_lowercase();
322        host == "youtu.be"
323            || host.ends_with(".youtu.be")
324            || host == "youtube.com"
325            || host.ends_with(".youtube.com")
326    })
327}
328
329fn extract_video_id(url: &Url) -> Option<VideoId> {
330    let host = url.host_str()?.to_ascii_lowercase();
331    if host == "youtu.be" || host.ends_with(".youtu.be") {
332        return first_path_segment(url).and_then(VideoId::from_candidate);
333    }
334
335    if !(host == "youtube.com" || host.ends_with(".youtube.com")) {
336        return None;
337    }
338
339    match first_path_segment(url).as_deref() {
340        Some("watch") => url
341            .query_pairs()
342            .find_map(|(key, value)| (key == "v").then(|| value.into_owned()))
343            .and_then(VideoId::from_candidate),
344        Some("shorts" | "embed" | "live") => url
345            .path_segments()
346            .and_then(|mut segments| segments.nth(1).map(str::to_string))
347            .and_then(VideoId::from_candidate),
348        _ => None,
349    }
350}
351
352impl VideoId {
353    fn from_candidate(candidate: String) -> Option<Self> {
354        let id = candidate.trim();
355        let is_valid = id.len() == 11
356            && id
357                .bytes()
358                .all(|byte| byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'-');
359        is_valid.then(|| Self(id.to_string()))
360    }
361}
362
363fn first_path_segment(url: &Url) -> Option<String> {
364    url.path_segments()
365        .and_then(|mut segments| segments.next().map(str::to_string))
366}
367
368fn canonical_watch_url(video_id: &str) -> String {
369    format!("{WATCH_BASE_URL}?v={video_id}")
370}
371
372fn extract_initial_player_response(html: &str) -> Result<Value, YouTubeError> {
373    let marker_index = html
374        .find(PLAYER_RESPONSE_VAR)
375        .ok_or(YouTubeError::PlayerResponseMissing)?;
376    let after_marker = &html[marker_index + PLAYER_RESPONSE_VAR.len()..];
377    let brace_relative = after_marker
378        .find('{')
379        .ok_or(YouTubeError::PlayerResponseMissing)?;
380    let json_start = marker_index + PLAYER_RESPONSE_VAR.len() + brace_relative;
381    let json_end = find_balanced_json_end(html, json_start)?;
382    serde_json::from_str(&html[json_start..json_end])
383        .map_err(|err| YouTubeError::PlayerResponseParse(err.to_string()))
384}
385
386fn find_balanced_json_end(text: &str, start: usize) -> Result<usize, YouTubeError> {
387    let mut depth = 0u32;
388    let mut in_string = false;
389    let mut escaped = false;
390
391    for (offset, ch) in text[start..].char_indices() {
392        if escaped {
393            escaped = false;
394            continue;
395        }
396
397        if ch == '\\' {
398            escaped = in_string;
399            continue;
400        }
401
402        if ch == '"' {
403            in_string = !in_string;
404            continue;
405        }
406
407        if in_string {
408            continue;
409        }
410
411        match ch {
412            '{' => depth += 1,
413            '}' => {
414                depth = depth.saturating_sub(1);
415                if depth == 0 {
416                    return Ok(start + offset + ch.len_utf8());
417                }
418            }
419            _ => {}
420        }
421    }
422
423    Err(YouTubeError::PlayerResponseUnterminated)
424}
425
426fn extract_visitor_data(html: &str) -> Option<String> {
427    extract_quoted_json_field(html, "VISITOR_DATA")
428        .or_else(|| extract_quoted_json_field(html, "visitorData"))
429}
430
/// Returns the text between `"<key>":"` and the next `"` in `text`, using plain substring
/// search (no JSON parsing, so escaped quotes inside the value are not understood).
///
/// Returns `None` when the key marker or the closing quote is absent.
fn extract_quoted_json_field(text: &str, key: &str) -> Option<String> {
    let marker = format!("\"{key}\":\"");
    let (_, after_marker) = text.split_once(marker.as_str())?;
    let (value, _) = after_marker.split_once('"')?;
    Some(value.to_owned())
}
438
439fn extract_metadata(player_response: &Value) -> VideoMetadata {
440    let details = player_response.get("videoDetails").unwrap_or(&Value::Null);
441    let microformat = player_response
442        .pointer("/microformat/playerMicroformatRenderer")
443        .unwrap_or(&Value::Null);
444
445    VideoMetadata {
446        title: string_at(details, "title").or_else(|| text_runs_at(microformat, "title")),
447        author: string_at(details, "author").or_else(|| string_at(microformat, "ownerChannelName")),
448        channel_id: string_at(details, "channelId")
449            .or_else(|| string_at(microformat, "externalChannelId")),
450        duration_seconds: string_at(details, "lengthSeconds")
451            .or_else(|| string_at(microformat, "lengthSeconds")),
452        view_count: string_at(details, "viewCount").or_else(|| string_at(microformat, "viewCount")),
453        description: string_at(details, "shortDescription")
454            .or_else(|| text_runs_at(microformat, "description")),
455        publish_date: string_at(microformat, "publishDate"),
456        upload_date: string_at(microformat, "uploadDate"),
457    }
458}
459
460fn extract_caption_tracks(player_response: &Value) -> Result<Vec<CaptionTrack>, YouTubeError> {
461    let tracks = player_response
462        .pointer("/captions/playerCaptionsTracklistRenderer/captionTracks")
463        .and_then(Value::as_array)
464        .ok_or(YouTubeError::CaptionTracksMissing)?;
465
466    let parsed = tracks
467        .iter()
468        .filter_map(|track| {
469            Some(CaptionTrack {
470                base_url: string_at(track, "baseUrl")?,
471                language_code: string_at(track, "languageCode")?,
472                name: text_runs_at(track, "name").unwrap_or_else(|| "unknown".to_string()),
473                is_generated: string_at(track, "kind").as_deref() == Some("asr"),
474            })
475        })
476        .collect::<Vec<_>>();
477
478    if parsed.is_empty() {
479        return Err(YouTubeError::NoUsableCaptionTrack);
480    }
481
482    Ok(parsed)
483}
484
485fn select_caption_track(tracks: &[CaptionTrack]) -> Option<CaptionTrack> {
486    tracks
487        .iter()
488        .find(|track| is_english(&track.language_code) && !track.is_generated)
489        .or_else(|| tracks.iter().find(|track| is_english(&track.language_code)))
490        .or_else(|| tracks.first())
491        .cloned()
492}
493
/// Returns `true` when `language_code` is `en` or starts with `en-` (for example `en-US`
/// or `en-orig`), compared ASCII case-insensitively.
///
/// The check works on bytes directly, so no lowercase copy of the code is allocated, and
/// the former separate `en-orig` comparison is gone because the `en-` prefix already
/// covers it.
fn is_english(language_code: &str) -> bool {
    match language_code.as_bytes() {
        [first, second, rest @ ..] => {
            first.eq_ignore_ascii_case(&b'e')
                && second.eq_ignore_ascii_case(&b'n')
                && (rest.is_empty() || rest[0] == b'-')
        }
        _ => false,
    }
}
498
499fn caption_url_with_json3(base_url: &str) -> Result<String, YouTubeError> {
500    let mut url =
501        Url::parse(base_url).map_err(|err| YouTubeError::InvalidCaptionUrl(err.to_string()))?;
502    {
503        let has_fmt = url.query_pairs().any(|(key, _)| key == "fmt");
504        if !has_fmt {
505            url.query_pairs_mut().append_pair("fmt", "json3");
506        }
507    }
508    Ok(url.to_string())
509}
510
511fn parse_json3_transcript(value: &Value) -> Vec<TranscriptSegment> {
512    value
513        .get("events")
514        .or_else(|| value.get("aAppend"))
515        .and_then(Value::as_array)
516        .into_iter()
517        .flatten()
518        .filter_map(parse_json3_event)
519        .collect()
520}
521
522fn parse_json3_event(event: &Value) -> Option<TranscriptSegment> {
523    let text = event
524        .get("segs")?
525        .as_array()?
526        .iter()
527        .filter_map(|seg| seg.get("utf8").and_then(Value::as_str))
528        .collect::<String>();
529    let text = normalize_transcript_text(&text);
530    if text.is_empty() || is_noise_segment(&text) {
531        return None;
532    }
533
534    Some(TranscriptSegment {
535        start_ms: event.get("tStartMs").and_then(Value::as_u64).unwrap_or(0),
536        duration_ms: event.get("dDurationMs").and_then(Value::as_u64),
537        text,
538    })
539}
540
541fn parse_xml_transcript(text: &str) -> Vec<TranscriptSegment> {
542    text.split("<p ")
543        .skip(1)
544        .filter_map(parse_xml_paragraph)
545        .collect()
546}
547
548fn parse_xml_paragraph(fragment: &str) -> Option<TranscriptSegment> {
549    let tag_end = fragment.find('>')?;
550    let attrs = &fragment[..tag_end];
551    let body = &fragment[tag_end + 1..fragment.find("</p>")?];
552    let text = normalize_transcript_text(&strip_xml_tags(body));
553    if text.is_empty() || is_noise_segment(&text) {
554        return None;
555    }
556
557    Some(TranscriptSegment {
558        start_ms: extract_xml_time_ms(attrs, "t").unwrap_or(0),
559        duration_ms: extract_xml_time_ms(attrs, "d"),
560        text,
561    })
562}
563
/// Reads the attribute `key="<digits>"` from a tag's attribute text and parses the value as
/// an unsigned integer.
///
/// The attribute name must start at the beginning of `attrs` or directly after whitespace,
/// so a lookup for `t` does not match the tail of a longer name such as `wt="5"`. Returns
/// `None` when the attribute is absent, unterminated, or not a valid `u64`.
fn extract_xml_time_ms(attrs: &str, key: &str) -> Option<u64> {
    let marker = format!(r#"{key}=""#);

    let name_start = attrs
        .match_indices(marker.as_str())
        .map(|(index, _)| index)
        .find(|&index| {
            attrs[..index]
                .chars()
                .next_back()
                .map_or(true, char::is_whitespace)
        })?;

    let tail = &attrs[name_start + marker.len()..];
    let (raw_value, _) = tail.split_once('"')?;
    raw_value.parse().ok()
}
571
/// Removes markup from a caption body and decodes character references.
///
/// Everything between `<` and `>` is dropped. Outside tags, the five predefined XML entities
/// (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) and numeric character references (decimal
/// `&#39;` or hexadecimal `&#x27;`) are decoded. An `&` that does not start a recognized
/// reference is kept as a literal `&`, so text such as `AT&T` is preserved instead of being
/// swallowed up to the next `;`.
fn strip_xml_tags(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut in_tag = false;
    let mut rest = text;

    while let Some(ch) = rest.chars().next() {
        // Bytes of `rest` consumed by this step; grows when an entity is decoded.
        let mut consumed = ch.len_utf8();
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            '&' => match decode_xml_entity(&rest[consumed..]) {
                Some((entity_len, decoded)) => {
                    out.push(decoded);
                    consumed += entity_len;
                }
                None => out.push('&'),
            },
            _ => out.push(ch),
        }
        rest = &rest[consumed..];
    }

    out
}

/// Decodes the entity whose text (without the leading `&`) starts `tail`.
///
/// Returns the number of bytes the entity occupies in `tail`, including the terminating
/// `;`, together with the decoded character. Returns `None` when no `;` appears within a
/// short window or when the name is not a recognized reference.
fn decode_xml_entity(tail: &str) -> Option<(usize, char)> {
    // Longest supported name: `#x10FFFF` / `#1114111` (8 bytes). Bounding the search also
    // keeps text with many bare ampersands from being rescanned repeatedly.
    const MAX_ENTITY_NAME_LEN: usize = 8;

    let name_len = tail
        .bytes()
        .take(MAX_ENTITY_NAME_LEN + 1)
        .position(|byte| byte == b';')?;
    let name = &tail[..name_len];

    let decoded = match name {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        _ => {
            let digits = name.strip_prefix('#')?;
            let hex_digits = digits
                .strip_prefix('x')
                .or_else(|| digits.strip_prefix('X'));
            let code = match hex_digits {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => digits.parse::<u32>().ok()?,
            };
            char::from_u32(code)?
        }
    };

    Some((name_len + 1, decoded))
}
607
/// Collapses every run of whitespace in `text` into a single space and removes leading and
/// trailing whitespace.
fn normalize_transcript_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
611
/// Returns `true` when the trimmed text is exactly one of the non-speech markers
/// `[music]`, `[applause]`, `[laughter]` (ASCII case-insensitive), `♪` or `♫`.
fn is_noise_segment(text: &str) -> bool {
    const NOISE_MARKERS: [&str; 5] = ["[music]", "[applause]", "[laughter]", "♪", "♫"];

    let candidate = text.trim();
    NOISE_MARKERS
        .iter()
        .any(|marker| candidate.eq_ignore_ascii_case(marker))
}
619
620fn format_video_context(
621    video_id: &str,
622    canonical_url: &str,
623    metadata: &VideoMetadata,
624    track: &CaptionTrack,
625    segments: &[TranscriptSegment],
626) -> String {
627    let mut output = String::new();
628    output.push_str("# YouTube Video Context\n\n");
629    output.push_str("## Source\n");
630    output.push_str(&format!("- URL: {canonical_url}\n"));
631    output.push_str(&format!("- Video ID: {video_id}\n"));
632    push_optional(&mut output, "- Title", metadata.title.as_deref());
633    push_optional(&mut output, "- Channel", metadata.author.as_deref());
634    push_optional(&mut output, "- Channel ID", metadata.channel_id.as_deref());
635    push_optional(
636        &mut output,
637        "- Duration seconds",
638        metadata.duration_seconds.as_deref(),
639    );
640    push_optional(&mut output, "- Views", metadata.view_count.as_deref());
641    push_optional(&mut output, "- Published", metadata.publish_date.as_deref());
642    push_optional(&mut output, "- Uploaded", metadata.upload_date.as_deref());
643
644    output.push_str("\n## Transcript Track\n");
645    output.push_str(&format!("- Language: {}\n", track.language_code));
646    output.push_str(&format!("- Name: {}\n", track.name));
647    output.push_str(&format!("- Auto-generated: {}\n", track.is_generated));
648
649    if let Some(description) = metadata.description.as_deref() {
650        output.push_str("\n## Description\n");
651        output.push_str(description.trim());
652        output.push('\n');
653    }
654
655    output.push_str("\n## Transcript\n");
656    for segment in segments {
657        output.push_str(&format!(
658            "[{}] {}\n",
659            format_timestamp(segment.start_ms),
660            segment.text
661        ));
662    }
663
664    output.trim().to_string()
665}
666
667fn format_metadata_only_context(
668    video_id: &str,
669    canonical_url: &str,
670    metadata: &VideoMetadata,
671) -> String {
672    let mut output = String::new();
673    output.push_str("# YouTube Video Context\n\n");
674    output.push_str("## Source\n");
675    output.push_str(&format!("- URL: {canonical_url}\n"));
676    output.push_str(&format!("- Video ID: {video_id}\n"));
677    push_optional(&mut output, "- Title", metadata.title.as_deref());
678    push_optional(&mut output, "- Channel", metadata.author.as_deref());
679    push_optional(&mut output, "- Channel ID", metadata.channel_id.as_deref());
680    push_optional(
681        &mut output,
682        "- Duration seconds",
683        metadata.duration_seconds.as_deref(),
684    );
685    push_optional(&mut output, "- Views", metadata.view_count.as_deref());
686    push_optional(&mut output, "- Published", metadata.publish_date.as_deref());
687    push_optional(&mut output, "- Uploaded", metadata.upload_date.as_deref());
688
689    if let Some(description) = metadata.description.as_deref() {
690        output.push_str("\n## Description\n");
691        output.push_str(description.trim());
692        output.push('\n');
693    }
694
695    output.push_str("\n## Transcript\n");
696    output.push_str("Transcript unavailable. YouTube metadata was extracted, but no usable caption/transcript body was available for this video.\n");
697    output.trim().to_string()
698}
699
700fn build_diagnostics(
701    tracks: &[CaptionTrack],
702    selected_track: &CaptionTrack,
703    segment_count: usize,
704    source_client: &str,
705) -> Vec<String> {
706    vec![
707        "YouTube extraction used native HTTP transcript path; no video/audio was downloaded."
708            .to_string(),
709        format!(
710            "Selected caption track from {source_client}: {} ({}, auto-generated: {}).",
711            selected_track.name, selected_track.language_code, selected_track.is_generated
712        ),
713        format!(
714            "Found {} caption track(s), extracted {} transcript segment(s).",
715            tracks.len(),
716            segment_count
717        ),
718    ]
719}
720
/// Builds the diagnostic lines reported with a metadata-only result. A third line carrying
/// `reason` is added when a reason is given and is not blank.
fn build_metadata_only_diagnostics(reason: Option<&str>) -> Vec<String> {
    let reason_line = reason
        .filter(|reason| !reason.trim().is_empty())
        .map(|reason| format!("Transcript unavailable reason: {reason}"));

    [
        "YouTube extraction used native HTTP metadata path; no video/audio was downloaded."
            .to_string(),
        "Transcript unavailable; returning metadata-only YouTube context.".to_string(),
    ]
    .into_iter()
    .chain(reason_line)
    .collect()
}
732
/// Appends `"<label>: <value>\n"` to `output` when `value` is present and not blank; the
/// value is written with surrounding whitespace trimmed. Otherwise `output` is untouched.
fn push_optional(output: &mut String, label: &str, value: Option<&str>) {
    let Some(trimmed) = value.map(str::trim).filter(|value| !value.is_empty()) else {
        return;
    };
    output.push_str(&format!("{label}: {}\n", trimmed));
}
738
/// Formats a millisecond offset as `MM:SS`, or as `HH:MM:SS` once it reaches one hour.
/// Fractions of a second are discarded.
fn format_timestamp(ms: u64) -> String {
    let total_seconds = ms / 1000;
    let (hours, within_hour) = (total_seconds / 3600, total_seconds % 3600);
    let (minutes, seconds) = (within_hour / 60, within_hour % 60);

    match hours {
        0 => format!("{minutes:02}:{seconds:02}"),
        _ => format!("{hours:02}:{minutes:02}:{seconds:02}"),
    }
}
751
752fn string_at(value: &Value, key: &str) -> Option<String> {
753    value
754        .get(key)
755        .and_then(Value::as_str)
756        .filter(|value| !value.is_empty())
757        .map(str::to_string)
758}
759
760fn text_runs_at(value: &Value, key: &str) -> Option<String> {
761    let text_value = value.get(key)?;
762    if let Some(simple_text) = text_value.get("simpleText").and_then(Value::as_str) {
763        return Some(simple_text.to_string());
764    }
765    let runs = text_value.get("runs")?.as_array()?;
766    let text = runs
767        .iter()
768        .filter_map(|run| run.get("text").and_then(Value::as_str))
769        .collect::<String>();
770    (!text.is_empty()).then_some(text)
771}
772
/// Errors produced while fetching and extracting YouTube video context.
///
/// Implements [`std::fmt::Display`] for user-facing messages and [`std::error::Error`] so
/// the type works with `?` conversions into boxed or wrapped errors.
#[derive(Debug)]
pub enum YouTubeError {
    /// The input URL could not be parsed; carries the parser's message.
    InvalidUrl(String),
    /// The URL parsed but no video id could be extracted from it.
    UnsupportedUrl,
    /// A request could not be sent or its body could not be read; carries the cause.
    Fetch(String),
    /// The watch page returned a non-success status (code and reason phrase).
    HttpStatus(u16, String),
    /// No player response object was found in the watch-page HTML.
    PlayerResponseMissing,
    /// The player response object's braces never balanced.
    PlayerResponseUnterminated,
    /// The player response was not valid JSON; carries the parser's message.
    PlayerResponseParse(String),
    /// The player API returned a non-success status.
    PlayerApiHttpStatus(u16),
    /// The player API reported that sign-in is required; carries the reported reason.
    PlayerApiLoginRequired(String),
    /// No visitor data token was available.
    VisitorDataMissing,
    /// The player response contained no caption track list.
    CaptionTracksMissing,
    /// Caption tracks were listed but none was usable.
    NoUsableCaptionTrack,
    /// A caption track URL could not be parsed; carries the parser's message.
    InvalidCaptionUrl(String),
    /// The transcript request returned a non-success status.
    TranscriptHttpStatus(u16),
    /// The transcript body could not be read or parsed; carries the cause.
    TranscriptParse(String),
    /// The transcript body was empty.
    TranscriptEmpty,
}

impl std::fmt::Display for YouTubeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidUrl(msg) => write!(f, "Invalid YouTube URL: {msg}"),
            Self::UnsupportedUrl => write!(f, "Unsupported YouTube URL"),
            Self::Fetch(msg) => write!(f, "YouTube fetch failed: {msg}"),
            Self::HttpStatus(code, reason) => write!(f, "YouTube returned HTTP {code} {reason}"),
            Self::PlayerResponseMissing => write!(f, "YouTube player response not found"),
            Self::PlayerResponseUnterminated => {
                write!(f, "YouTube player response JSON was unterminated")
            }
            Self::PlayerResponseParse(msg) => {
                write!(f, "YouTube player response parse failed: {msg}")
            }
            Self::PlayerApiHttpStatus(code) => write!(f, "YouTube player API returned HTTP {code}"),
            Self::PlayerApiLoginRequired(reason) => {
                write!(f, "YouTube player API required login: {reason}")
            }
            Self::VisitorDataMissing => write!(f, "YouTube visitor data was unavailable"),
            Self::CaptionTracksMissing => write!(f, "YouTube captions are unavailable for this video"),
            Self::NoUsableCaptionTrack => write!(f, "No usable YouTube caption track found"),
            Self::InvalidCaptionUrl(msg) => write!(f, "Invalid YouTube caption URL: {msg}"),
            Self::TranscriptHttpStatus(code) => write!(f, "YouTube transcript returned HTTP {code}"),
            Self::TranscriptParse(msg) => write!(f, "YouTube transcript parse failed: {msg}"),
            Self::TranscriptEmpty => write!(f, "YouTube transcript was empty; caption track metadata was found but YouTube returned no caption body for this client"),
        }
    }
}

// Causes are stored as strings, so the default `source()` (none) is the accurate answer.
impl std::error::Error for YouTubeError {}
821
// Unit tests for the pure parsing/formatting helpers, plus one ignored network smoke test.
#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use serde_json::json;

    // Watch, short-link, shorts and embed URLs must all yield the same id.
    #[test]
    fn youtube_extracts_video_ids_from_common_urls() {
        let cases = [
            ("https://www.youtube.com/watch?v=McO_xcf4IYw", "McO_xcf4IYw"),
            ("https://youtu.be/McO_xcf4IYw?t=12", "McO_xcf4IYw"),
            ("https://www.youtube.com/shorts/McO_xcf4IYw", "McO_xcf4IYw"),
            ("https://www.youtube.com/embed/McO_xcf4IYw", "McO_xcf4IYw"),
        ];

        for (url, expected) in cases {
            let parsed = Url::parse(url).unwrap();
            assert_eq!(extract_video_id(&parsed).unwrap().as_str(), expected);
        }
    }

    // A `v` parameter that is not exactly 11 characters is rejected.
    #[test]
    fn youtube_rejects_invalid_video_ids() {
        let parsed =
            Url::parse("https://www.youtube.com/watch?v=not-valid-because-too-long").unwrap();
        assert!(extract_video_id(&parsed).is_none());
    }

    // Braces and escaped quotes inside JSON strings must not end the object early.
    #[test]
    fn youtube_extracts_balanced_player_response() {
        let html = r#"<script>var ytInitialPlayerResponse = {"videoDetails":{"title":"A } in string","shortDescription":"escaped \" brace }"},"captions":{}};</script>"#;
        let response = extract_initial_player_response(html).unwrap();
        assert_eq!(response["videoDetails"]["title"], "A } in string");
        assert_eq!(
            response["videoDetails"]["shortDescription"],
            "escaped \" brace }"
        );
    }

    // The visitor token is read from the `VISITOR_DATA` key of the page config.
    #[test]
    fn youtube_extracts_visitor_data() {
        let html = r#"ytcfg.set({"VISITOR_DATA":"visitor-token","other":true});"#;
        assert_eq!(extract_visitor_data(html).as_deref(), Some("visitor-token"));
    }

    // A manual English track wins even when an auto-generated English track comes first.
    #[test]
    fn youtube_selects_manual_english_before_auto_english() {
        let tracks = vec![
            CaptionTrack {
                base_url: "https://example.com/fr".into(),
                language_code: "fr".into(),
                name: "French".into(),
                is_generated: false,
            },
            CaptionTrack {
                base_url: "https://example.com/en-auto".into(),
                language_code: "en".into(),
                name: "English auto".into(),
                is_generated: true,
            },
            CaptionTrack {
                base_url: "https://example.com/en".into(),
                language_code: "en".into(),
                name: "English".into(),
                is_generated: false,
            },
        ];

        let selected = select_caption_track(&tracks).unwrap();
        assert_eq!(selected.base_url, "https://example.com/en");
    }

    // Whitespace-only and noise events are dropped; segment pieces are joined.
    #[test]
    fn youtube_parses_json3_transcript_segments() {
        let transcript = json!({
            "events": [
                {"tStartMs": 0, "dDurationMs": 1000, "segs": [{"utf8": "Hello "}, {"utf8": "world"}]},
                {"tStartMs": 1000, "segs": [{"utf8": "\n"}]},
                {"tStartMs": 2000, "segs": [{"utf8": "[Music]"}]},
                {"tStartMs": 3000, "dDurationMs": 500, "segs": [{"utf8": "next line"}]}
            ]
        });

        let segments = parse_json3_transcript(&transcript);
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].text, "Hello world");
        assert_eq!(segments[0].start_ms, 0);
        assert_eq!(segments[1].text, "next line");
    }

    // Inline tags are stripped, entities decoded, and noise paragraphs dropped.
    #[test]
    fn youtube_parses_xml_transcript_segments() {
        let transcript = r#"<?xml version="1.0" ?><timedtext><body><p t="1000" d="2000">Hello &amp; <s>world</s></p><p t="3000" d="1000">[Music]</p></body></timedtext>"#;
        let segments = parse_xml_transcript(transcript);
        assert_eq!(segments.len(), 1);
        assert_eq!(segments[0].start_ms, 1000);
        assert_eq!(segments[0].duration_ms, Some(2000));
        assert_eq!(segments[0].text, "Hello & world");
    }

    // A caption URL without `fmt` gains `fmt=json3`.
    #[test]
    fn youtube_adds_json3_format_to_caption_url() {
        let url =
            caption_url_with_json3("https://www.youtube.com/api/timedtext?v=abc&lang=en").unwrap();
        assert!(url.contains("fmt=json3"));
    }

    // The metadata-only report still shows metadata and states the transcript is missing.
    #[test]
    fn youtube_formats_metadata_only_context_with_clear_transcript_marker() {
        let metadata = VideoMetadata {
            title: Some("No captions example".into()),
            author: Some("Example Channel".into()),
            ..VideoMetadata::default()
        };
        let text = format_metadata_only_context(
            "McO_xcf4IYw",
            "https://www.youtube.com/watch?v=McO_xcf4IYw",
            &metadata,
        );
        assert!(text.contains("No captions example"));
        assert!(text.contains("Transcript unavailable"));
    }

    // The supplied reason must appear in the metadata-only diagnostics.
    #[test]
    fn youtube_metadata_only_diagnostics_include_reason() {
        let diagnostics = build_metadata_only_diagnostics(Some("captions disabled"));
        assert!(diagnostics
            .iter()
            .any(|line| line.contains("metadata-only")));
        assert!(diagnostics
            .iter()
            .any(|line| line.contains("captions disabled")));
    }

    // End-to-end check against the live site; ignored by default because it needs network.
    #[tokio::test]
    #[ignore = "network smoke test for YouTube extraction"]
    async fn youtube_reads_sample_video_over_http() {
        let client = reqwest::Client::builder()
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .unwrap();
        let page = fetch_and_extract(&client, "https://www.youtube.com/watch?v=McO_xcf4IYw")
            .await
            .unwrap();
        assert!(page.text.contains("# YouTube Video Context"));
        assert!(page.text.contains("- Video ID: McO_xcf4IYw"));
        assert!(page.text.contains("## Transcript"));
        assert!(page.content_length > 1000);
    }
}