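//! `imp_core/tools/web/youtube.rs`
//!
//! YouTube watch-page extraction: resolves a video ID from a URL, fetches the watch page, and
//! renders the video's metadata plus its caption transcript (when one is available) as text.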

1use reqwest::Client;
2use serde_json::Value;
3use url::Url;
4
5use super::types::{ContentFormat, ExtractionQuality, PageContent};
6
/// Base of the canonical watch URL; the video ID is appended as the `v` query parameter.
const WATCH_BASE_URL: &str = "https://www.youtube.com/watch";
/// Marker searched for in the watch-page HTML to locate the embedded player response JSON.
const PLAYER_RESPONSE_VAR: &str = "ytInitialPlayerResponse";
/// Value sent in the `X-YouTube-Client-Name` header of the player API request.
const ANDROID_VR_CLIENT_NAME: &str = "28";
/// Client version sent in the player API payload and the `X-YouTube-Client-Version` header.
const ANDROID_VR_CLIENT_VERSION: &str = "1.71.26";
/// User agent sent with the player API request and with caption downloads for its tracks.
const ANDROID_VR_USER_AGENT: &str = "com.google.android.apps.youtube.vr.oculus/1.71.26 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip";
12
/// Newtype wrapper around a YouTube video ID string.
#[derive(Debug, Clone, PartialEq)]
struct VideoId(String);

impl VideoId {
    /// Borrows the wrapped ID as a string slice.
    fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
21
/// One caption track parsed from a player response's `captionTracks` list.
#[derive(Debug, Clone, PartialEq)]
struct CaptionTrack {
    /// The track's `baseUrl`; the transcript download URL is derived from it.
    base_url: String,
    /// The track's `languageCode`.
    language_code: String,
    /// Display name taken from the track's `name` text, or `"unknown"` when absent.
    name: String,
    /// `true` when the track's `kind` is `"asr"`.
    is_generated: bool,
}
29
/// A single timed piece of transcript text.
#[derive(Debug, Clone, PartialEq)]
struct TranscriptSegment {
    /// Start offset in milliseconds (JSON `tStartMs` / XML `t`); `0` when the source omits it.
    start_ms: u64,
    /// Duration in milliseconds (JSON `dDurationMs` / XML `d`), when the source provides it.
    duration_ms: Option<u64>,
    /// Whitespace-normalized caption text.
    text: String,
}
36
/// Video metadata read from the player response's `videoDetails` object, with fallbacks to
/// `microformat.playerMicroformatRenderer`. Every field is optional; values are kept as the
/// strings found in the response.
#[derive(Debug, Clone, Default, PartialEq)]
struct VideoMetadata {
    /// Video title.
    title: Option<String>,
    /// Channel name (`author` / `ownerChannelName`).
    author: Option<String>,
    /// Channel ID (`channelId` / `externalChannelId`).
    channel_id: Option<String>,
    /// `lengthSeconds`, kept as the string from the response.
    duration_seconds: Option<String>,
    /// `viewCount`, kept as the string from the response.
    view_count: Option<String>,
    /// `shortDescription`, or the microformat `description` text.
    description: Option<String>,
    /// Microformat `publishDate`.
    publish_date: Option<String>,
    /// Microformat `uploadDate`.
    upload_date: Option<String>,
}
48
/// Result of resolving captions through one client: the tracks it advertised, the track that
/// was chosen, and the transcript segments downloaded for it.
struct CaptionSource {
    /// Every usable caption track found in the player response.
    tracks: Vec<CaptionTrack>,
    /// The track picked by `select_caption_track`.
    selected_track: CaptionTrack,
    /// Parsed transcript segments for `selected_track`; may be empty.
    segments: Vec<TranscriptSegment>,
    /// Label of the client whose player response supplied the tracks (`"web"` or `"android_vr"`).
    source_client: &'static str,
}
55
56pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, YouTubeError> {
57    let parsed_url = Url::parse(url).map_err(|err| YouTubeError::InvalidUrl(err.to_string()))?;
58    let video_id = extract_video_id(&parsed_url).ok_or(YouTubeError::UnsupportedUrl)?;
59    let requested_url = url.to_string();
60    let watch_url = canonical_watch_url(video_id.as_str());
61
62    let response = client
63        .get(watch_url.as_str())
64        .header("User-Agent", super::read::USER_AGENT)
65        .header("Accept", super::read::ACCEPT_HEADER)
66        .header("Accept-Language", "en-US,en;q=0.9")
67        .send()
68        .await
69        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
70
71    let status_code = response.status().as_u16();
72    if !response.status().is_success() {
73        return Err(YouTubeError::HttpStatus(
74            status_code,
75            response
76                .status()
77                .canonical_reason()
78                .unwrap_or("Unknown")
79                .to_string(),
80        ));
81    }
82
83    let final_url = response.url().to_string();
84    let content_type = response
85        .headers()
86        .get("content-type")
87        .and_then(|value| value.to_str().ok())
88        .map(str::to_string);
89    let html = response
90        .text()
91        .await
92        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
93    let raw_body_bytes = html.len();
94
95    let initial_player_response = extract_initial_player_response(&html)?;
96    let metadata = extract_metadata(&initial_player_response);
97    let visitor_data = extract_visitor_data(&html);
98    let caption_source = resolve_caption_source(
99        client,
100        video_id.as_str(),
101        &initial_player_response,
102        visitor_data.as_deref(),
103    )
104    .await;
105
106    let title = metadata
107        .title
108        .clone()
109        .unwrap_or_else(|| format!("YouTube video {}", video_id.as_str()));
110    let (text, diagnostics) = match caption_source {
111        Ok(caption_source) if !caption_source.segments.is_empty() => (
112            format_video_context(
113                video_id.as_str(),
114                &watch_url,
115                &metadata,
116                &caption_source.selected_track,
117                &caption_source.segments,
118            ),
119            build_diagnostics(
120                &caption_source.tracks,
121                &caption_source.selected_track,
122                caption_source.segments.len(),
123                caption_source.source_client,
124            ),
125        ),
126        Ok(caption_source) => (
127            format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
128            build_metadata_only_diagnostics(Some(&format!(
129                "Caption tracks were found, but no transcript segments were extracted from {}.",
130                caption_source.source_client
131            ))),
132        ),
133        Err(err @ (YouTubeError::CaptionTracksMissing | YouTubeError::NoUsableCaptionTrack)) => (
134            format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
135            build_metadata_only_diagnostics(Some(&err.to_string())),
136        ),
137        Err(err @ (YouTubeError::TranscriptEmpty | YouTubeError::TranscriptParse(_))) => (
138            format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
139            build_metadata_only_diagnostics(Some(&err.to_string())),
140        ),
141        Err(err) => return Err(err),
142    };
143
144    let was_redirected = final_url != watch_url;
145
146    Ok(PageContent {
147        title: Some(title),
148        content_length: text.len(),
149        text,
150        url: final_url,
151        requested_url,
152        status_code,
153        content_type,
154        format_received: ContentFormat::Html,
155        was_redirected,
156        raw_body_bytes,
157        diagnostics,
158        quality: ExtractionQuality::Good,
159        quality_reasons: Vec::new(),
160    })
161}
162
163async fn resolve_caption_source(
164    client: &Client,
165    video_id: &str,
166    initial_player_response: &Value,
167    visitor_data: Option<&str>,
168) -> Result<CaptionSource, YouTubeError> {
169    let web_result = fetch_caption_source_from_response(
170        client,
171        initial_player_response,
172        super::read::USER_AGENT,
173        "web",
174    )
175    .await;
176    if web_result.is_ok() {
177        return web_result;
178    }
179    let web_error = web_result.err();
180
181    let Some(visitor_data) = visitor_data else {
182        return Err(web_error.unwrap_or(YouTubeError::VisitorDataMissing));
183    };
184
185    let android_vr_response =
186        fetch_android_vr_player_response(client, video_id, visitor_data).await?;
187    fetch_caption_source_from_response(
188        client,
189        &android_vr_response,
190        ANDROID_VR_USER_AGENT,
191        "android_vr",
192    )
193    .await
194}
195
196async fn fetch_caption_source_from_response(
197    client: &Client,
198    player_response: &Value,
199    user_agent: &str,
200    source_client: &'static str,
201) -> Result<CaptionSource, YouTubeError> {
202    let tracks = extract_caption_tracks(player_response)?;
203    let selected_track = select_caption_track(&tracks).ok_or(YouTubeError::NoUsableCaptionTrack)?;
204    let segments = fetch_transcript_segments(client, &selected_track, user_agent).await?;
205
206    Ok(CaptionSource {
207        tracks,
208        selected_track,
209        segments,
210        source_client,
211    })
212}
213
214async fn fetch_transcript_segments(
215    client: &Client,
216    track: &CaptionTrack,
217    user_agent: &str,
218) -> Result<Vec<TranscriptSegment>, YouTubeError> {
219    let transcript_url = caption_url_with_json3(&track.base_url)?;
220    let transcript_response = client
221        .get(transcript_url.as_str())
222        .header("User-Agent", user_agent)
223        .header(
224            "Accept",
225            "application/json,text/xml,text/plain;q=0.9,*/*;q=0.5",
226        )
227        .header("Accept-Language", "en-US,en;q=0.9")
228        .send()
229        .await
230        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
231
232    if !transcript_response.status().is_success() {
233        return Err(YouTubeError::TranscriptHttpStatus(
234            transcript_response.status().as_u16(),
235        ));
236    }
237
238    let transcript_text = transcript_response
239        .text()
240        .await
241        .map_err(|err| YouTubeError::TranscriptParse(err.to_string()))?;
242    if transcript_text.trim().is_empty() {
243        return Err(YouTubeError::TranscriptEmpty);
244    }
245
246    if transcript_text.trim_start().starts_with('<') {
247        return Ok(parse_xml_transcript(&transcript_text));
248    }
249
250    let transcript_json: Value = serde_json::from_str(&transcript_text)
251        .map_err(|err| YouTubeError::TranscriptParse(err.to_string()))?;
252    Ok(parse_json3_transcript(&transcript_json))
253}
254
255async fn fetch_android_vr_player_response(
256    client: &Client,
257    video_id: &str,
258    visitor_data: &str,
259) -> Result<Value, YouTubeError> {
260    let payload = serde_json::json!({
261        "context": {
262            "client": {
263                "clientName": "ANDROID_VR",
264                "clientVersion": ANDROID_VR_CLIENT_VERSION,
265                "deviceMake": "Oculus",
266                "deviceModel": "Quest 3",
267                "androidSdkVersion": 32,
268                "userAgent": ANDROID_VR_USER_AGENT,
269                "osName": "Android",
270                "osVersion": "12L",
271                "hl": "en",
272                "gl": "US"
273            }
274        },
275        "videoId": video_id,
276        "contentCheckOk": true,
277        "racyCheckOk": true
278    });
279
280    let response = client
281        .post("https://www.youtube.com/youtubei/v1/player")
282        .header("Content-Type", "application/json")
283        .header("User-Agent", ANDROID_VR_USER_AGENT)
284        .header("X-YouTube-Client-Name", ANDROID_VR_CLIENT_NAME)
285        .header("X-YouTube-Client-Version", ANDROID_VR_CLIENT_VERSION)
286        .header("X-Goog-Visitor-Id", visitor_data)
287        .header("Origin", "https://www.youtube.com")
288        .json(&payload)
289        .send()
290        .await
291        .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
292
293    if !response.status().is_success() {
294        return Err(YouTubeError::PlayerApiHttpStatus(
295            response.status().as_u16(),
296        ));
297    }
298
299    let player_response = response
300        .json::<Value>()
301        .await
302        .map_err(|err| YouTubeError::PlayerResponseParse(err.to_string()))?;
303
304    if player_response
305        .pointer("/playabilityStatus/status")
306        .and_then(Value::as_str)
307        == Some("LOGIN_REQUIRED")
308    {
309        return Err(YouTubeError::PlayerApiLoginRequired(
310            player_response
311                .pointer("/playabilityStatus/reason")
312                .and_then(Value::as_str)
313                .unwrap_or("sign-in required")
314                .to_string(),
315        ));
316    }
317
318    Ok(player_response)
319}
320
321pub fn is_youtube_url(url: &Url) -> bool {
322    url.host_str().is_some_and(|host| {
323        let host = host.to_ascii_lowercase();
324        host == "youtu.be"
325            || host.ends_with(".youtu.be")
326            || host == "youtube.com"
327            || host.ends_with(".youtube.com")
328    })
329}
330
331fn extract_video_id(url: &Url) -> Option<VideoId> {
332    let host = url.host_str()?.to_ascii_lowercase();
333    if host == "youtu.be" || host.ends_with(".youtu.be") {
334        return first_path_segment(url).and_then(VideoId::from_candidate);
335    }
336
337    if !(host == "youtube.com" || host.ends_with(".youtube.com")) {
338        return None;
339    }
340
341    match first_path_segment(url).as_deref() {
342        Some("watch") => url
343            .query_pairs()
344            .find_map(|(key, value)| (key == "v").then(|| value.into_owned()))
345            .and_then(VideoId::from_candidate),
346        Some("shorts" | "embed" | "live") => url
347            .path_segments()
348            .and_then(|mut segments| segments.nth(1).map(str::to_string))
349            .and_then(VideoId::from_candidate),
350        _ => None,
351    }
352}
353
354impl VideoId {
355    fn from_candidate(candidate: String) -> Option<Self> {
356        let id = candidate.trim();
357        let is_valid = id.len() == 11
358            && id
359                .bytes()
360                .all(|byte| byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'-');
361        is_valid.then(|| Self(id.to_string()))
362    }
363}
364
365fn first_path_segment(url: &Url) -> Option<String> {
366    url.path_segments()
367        .and_then(|mut segments| segments.next().map(str::to_string))
368}
369
370fn canonical_watch_url(video_id: &str) -> String {
371    format!("{WATCH_BASE_URL}?v={video_id}")
372}
373
374fn extract_initial_player_response(html: &str) -> Result<Value, YouTubeError> {
375    let marker_index = html
376        .find(PLAYER_RESPONSE_VAR)
377        .ok_or(YouTubeError::PlayerResponseMissing)?;
378    let after_marker = &html[marker_index + PLAYER_RESPONSE_VAR.len()..];
379    let brace_relative = after_marker
380        .find('{')
381        .ok_or(YouTubeError::PlayerResponseMissing)?;
382    let json_start = marker_index + PLAYER_RESPONSE_VAR.len() + brace_relative;
383    let json_end = find_balanced_json_end(html, json_start)?;
384    serde_json::from_str(&html[json_start..json_end])
385        .map_err(|err| YouTubeError::PlayerResponseParse(err.to_string()))
386}
387
388fn find_balanced_json_end(text: &str, start: usize) -> Result<usize, YouTubeError> {
389    let mut depth = 0u32;
390    let mut in_string = false;
391    let mut escaped = false;
392
393    for (offset, ch) in text[start..].char_indices() {
394        if escaped {
395            escaped = false;
396            continue;
397        }
398
399        if ch == '\\' {
400            escaped = in_string;
401            continue;
402        }
403
404        if ch == '"' {
405            in_string = !in_string;
406            continue;
407        }
408
409        if in_string {
410            continue;
411        }
412
413        match ch {
414            '{' => depth += 1,
415            '}' => {
416                depth = depth.saturating_sub(1);
417                if depth == 0 {
418                    return Ok(start + offset + ch.len_utf8());
419                }
420            }
421            _ => {}
422        }
423    }
424
425    Err(YouTubeError::PlayerResponseUnterminated)
426}
427
428fn extract_visitor_data(html: &str) -> Option<String> {
429    extract_quoted_json_field(html, "VISITOR_DATA")
430        .or_else(|| extract_quoted_json_field(html, "visitorData"))
431}
432
/// Returns the raw contents of the first `"key":"value"` string field found in `text`.
///
/// The value ends at the first *unescaped* double quote, so a value containing `\"` is no
/// longer truncated at the escaped quote. Escape sequences are returned verbatim, not decoded.
/// Returns `None` when the key is absent or the value is unterminated.
fn extract_quoted_json_field(text: &str, key: &str) -> Option<String> {
    let marker = format!("\"{key}\":\"");
    let start = text.find(&marker)? + marker.len();
    let tail = &text[start..];

    // `\` and `"` are ASCII, so the byte position found is always a char boundary.
    let mut escaped = false;
    let end = tail.bytes().position(|byte| {
        if escaped {
            escaped = false;
            return false;
        }
        match byte {
            b'\\' => {
                escaped = true;
                false
            }
            b'"' => true,
            _ => false,
        }
    })?;

    Some(tail[..end].to_string())
}
440
441fn extract_metadata(player_response: &Value) -> VideoMetadata {
442    let details = player_response.get("videoDetails").unwrap_or(&Value::Null);
443    let microformat = player_response
444        .pointer("/microformat/playerMicroformatRenderer")
445        .unwrap_or(&Value::Null);
446
447    VideoMetadata {
448        title: string_at(details, "title").or_else(|| text_runs_at(microformat, "title")),
449        author: string_at(details, "author").or_else(|| string_at(microformat, "ownerChannelName")),
450        channel_id: string_at(details, "channelId")
451            .or_else(|| string_at(microformat, "externalChannelId")),
452        duration_seconds: string_at(details, "lengthSeconds")
453            .or_else(|| string_at(microformat, "lengthSeconds")),
454        view_count: string_at(details, "viewCount").or_else(|| string_at(microformat, "viewCount")),
455        description: string_at(details, "shortDescription")
456            .or_else(|| text_runs_at(microformat, "description")),
457        publish_date: string_at(microformat, "publishDate"),
458        upload_date: string_at(microformat, "uploadDate"),
459    }
460}
461
462fn extract_caption_tracks(player_response: &Value) -> Result<Vec<CaptionTrack>, YouTubeError> {
463    let tracks = player_response
464        .pointer("/captions/playerCaptionsTracklistRenderer/captionTracks")
465        .and_then(Value::as_array)
466        .ok_or(YouTubeError::CaptionTracksMissing)?;
467
468    let parsed = tracks
469        .iter()
470        .filter_map(|track| {
471            Some(CaptionTrack {
472                base_url: string_at(track, "baseUrl")?,
473                language_code: string_at(track, "languageCode")?,
474                name: text_runs_at(track, "name").unwrap_or_else(|| "unknown".to_string()),
475                is_generated: string_at(track, "kind").as_deref() == Some("asr"),
476            })
477        })
478        .collect::<Vec<_>>();
479
480    if parsed.is_empty() {
481        return Err(YouTubeError::NoUsableCaptionTrack);
482    }
483
484    Ok(parsed)
485}
486
487fn select_caption_track(tracks: &[CaptionTrack]) -> Option<CaptionTrack> {
488    tracks
489        .iter()
490        .find(|track| is_english(&track.language_code) && !track.is_generated)
491        .or_else(|| tracks.iter().find(|track| is_english(&track.language_code)))
492        .or_else(|| tracks.first())
493        .cloned()
494}
495
/// Returns `true` when `language_code` is `en` or starts with `en-` (for example `en-US` or
/// `en-orig`), compared ASCII case-insensitively.
///
/// The comparison works on the borrowed string, so no lowercase copy is allocated.
fn is_english(language_code: &str) -> bool {
    // `get` yields `None` for strings shorter than two bytes or when byte 2 is not a char
    // boundary; neither can be an English code.
    let (Some(prefix), Some(rest)) = (language_code.get(..2), language_code.get(2..)) else {
        return false;
    };
    prefix.eq_ignore_ascii_case("en") && (rest.is_empty() || rest.starts_with('-'))
}
500
501fn caption_url_with_json3(base_url: &str) -> Result<String, YouTubeError> {
502    let mut url =
503        Url::parse(base_url).map_err(|err| YouTubeError::InvalidCaptionUrl(err.to_string()))?;
504    {
505        let has_fmt = url.query_pairs().any(|(key, _)| key == "fmt");
506        if !has_fmt {
507            url.query_pairs_mut().append_pair("fmt", "json3");
508        }
509    }
510    Ok(url.to_string())
511}
512
513fn parse_json3_transcript(value: &Value) -> Vec<TranscriptSegment> {
514    value
515        .get("events")
516        .or_else(|| value.get("aAppend"))
517        .and_then(Value::as_array)
518        .into_iter()
519        .flatten()
520        .filter_map(parse_json3_event)
521        .collect()
522}
523
524fn parse_json3_event(event: &Value) -> Option<TranscriptSegment> {
525    let text = event
526        .get("segs")?
527        .as_array()?
528        .iter()
529        .filter_map(|seg| seg.get("utf8").and_then(Value::as_str))
530        .collect::<String>();
531    let text = normalize_transcript_text(&text);
532    if text.is_empty() || is_noise_segment(&text) {
533        return None;
534    }
535
536    Some(TranscriptSegment {
537        start_ms: event.get("tStartMs").and_then(Value::as_u64).unwrap_or(0),
538        duration_ms: event.get("dDurationMs").and_then(Value::as_u64),
539        text,
540    })
541}
542
543fn parse_xml_transcript(text: &str) -> Vec<TranscriptSegment> {
544    text.split("<p ")
545        .skip(1)
546        .filter_map(parse_xml_paragraph)
547        .collect()
548}
549
550fn parse_xml_paragraph(fragment: &str) -> Option<TranscriptSegment> {
551    let tag_end = fragment.find('>')?;
552    let attrs = &fragment[..tag_end];
553    let body = &fragment[tag_end + 1..fragment.find("</p>")?];
554    let text = normalize_transcript_text(&strip_xml_tags(body));
555    if text.is_empty() || is_noise_segment(&text) {
556        return None;
557    }
558
559    Some(TranscriptSegment {
560        start_ms: extract_xml_time_ms(attrs, "t").unwrap_or(0),
561        duration_ms: extract_xml_time_ms(attrs, "d"),
562        text,
563    })
564}
565
/// Reads the double-quoted attribute `key` from an attribute list and parses it as a `u64`.
///
/// The attribute list is split on whitespace and only an attribute whose name is exactly `key`
/// is considered, so looking up `d` no longer matches inside `id="7"`, and `t` no longer
/// matches inside `wt="9"`.
///
/// Returns `None` when the attribute is absent, unterminated, or not an unsigned integer.
fn extract_xml_time_ms(attrs: &str, key: &str) -> Option<u64> {
    let marker = format!(r#"{key}=""#);
    let after_marker = attrs
        .split_whitespace()
        .find_map(|attribute| attribute.strip_prefix(marker.as_str()))?;
    let (value, _) = after_marker.split_once('"')?;
    value.parse().ok()
}
573
/// Removes XML tags from `text` and decodes character entities in the remaining text.
///
/// - Everything between `<` and `>` is dropped.
/// - `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and numeric references (`&#39;`, `&#x27;`)
///   are decoded.
/// - Well-formed but unknown entities are dropped.
/// - An `&` that does not start a well-formed entity is kept literally, so a bare ampersand
///   (as in `AT&T`) no longer swallows the rest of the text.
fn strip_xml_tags(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut in_tag = false;
    let mut rest = text;

    while let Some(ch) = rest.chars().next() {
        rest = &rest[ch.len_utf8()..];
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            '&' if !in_tag => match split_xml_entity(rest) {
                Some((name, after_entity)) => {
                    if let Some(decoded) = decode_xml_entity(name) {
                        out.push(decoded);
                    }
                    rest = after_entity;
                }
                None => out.push('&'),
            },
            _ if !in_tag => out.push(ch),
            _ => {}
        }
    }

    out
}

/// Splits `rest` (the text right after an `&`) into an entity name and the text following the
/// terminating `;`. Returns `None` unless a short name made of ASCII letters, digits, or `#`
/// is terminated by `;`.
fn split_xml_entity(rest: &str) -> Option<(&str, &str)> {
    // Long enough for any named entity handled here and for `#x10FFFF`.
    const MAX_ENTITY_NAME_LEN: usize = 10;

    // Only a bounded window is scanned, so stray ampersands never trigger a full-text search.
    let semicolon = rest
        .bytes()
        .take(MAX_ENTITY_NAME_LEN + 1)
        .position(|byte| byte == b';')?;
    let name = &rest[..semicolon];
    let well_formed = !name.is_empty()
        && name
            .bytes()
            .all(|byte| byte.is_ascii_alphanumeric() || byte == b'#');

    well_formed.then(|| (name, &rest[semicolon + 1..]))
}

/// Decodes an entity name (without `&` and `;`) into the character it stands for. Returns
/// `None` for unknown names, invalid code points, and the NUL character.
fn decode_xml_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let digits = name.strip_prefix('#')?;
            let hex_digits = digits
                .strip_prefix('x')
                .or_else(|| digits.strip_prefix('X'));
            let code_point = match hex_digits {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => digits.parse::<u32>().ok()?,
            };
            char::from_u32(code_point).filter(|decoded| *decoded != '\0')
        }
    }
}
609
/// Collapses every run of whitespace in `text` into a single space and removes leading and
/// trailing whitespace.
fn normalize_transcript_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
613
/// Returns `true` when the trimmed text is nothing but a non-speech marker: `[music]`,
/// `[applause]`, `[laughter]` (ASCII case-insensitive), `♪`, or `♫`.
fn is_noise_segment(text: &str) -> bool {
    const NOISE_MARKERS: [&str; 5] = ["[music]", "[applause]", "[laughter]", "♪", "♫"];

    let candidate = text.trim();
    NOISE_MARKERS
        .iter()
        .any(|marker| candidate.eq_ignore_ascii_case(marker))
}
621
622fn format_video_context(
623    video_id: &str,
624    canonical_url: &str,
625    metadata: &VideoMetadata,
626    track: &CaptionTrack,
627    segments: &[TranscriptSegment],
628) -> String {
629    let mut output = String::new();
630    output.push_str("# YouTube Video Context\n\n");
631    output.push_str("## Source\n");
632    output.push_str(&format!("- URL: {canonical_url}\n"));
633    output.push_str(&format!("- Video ID: {video_id}\n"));
634    push_optional(&mut output, "- Title", metadata.title.as_deref());
635    push_optional(&mut output, "- Channel", metadata.author.as_deref());
636    push_optional(&mut output, "- Channel ID", metadata.channel_id.as_deref());
637    push_optional(
638        &mut output,
639        "- Duration seconds",
640        metadata.duration_seconds.as_deref(),
641    );
642    push_optional(&mut output, "- Views", metadata.view_count.as_deref());
643    push_optional(&mut output, "- Published", metadata.publish_date.as_deref());
644    push_optional(&mut output, "- Uploaded", metadata.upload_date.as_deref());
645
646    output.push_str("\n## Transcript Track\n");
647    output.push_str(&format!("- Language: {}\n", track.language_code));
648    output.push_str(&format!("- Name: {}\n", track.name));
649    output.push_str(&format!("- Auto-generated: {}\n", track.is_generated));
650
651    if let Some(description) = metadata.description.as_deref() {
652        output.push_str("\n## Description\n");
653        output.push_str(description.trim());
654        output.push('\n');
655    }
656
657    output.push_str("\n## Transcript\n");
658    for segment in segments {
659        output.push_str(&format!(
660            "[{}] {}\n",
661            format_timestamp(segment.start_ms),
662            segment.text
663        ));
664    }
665
666    output.trim().to_string()
667}
668
669fn format_metadata_only_context(
670    video_id: &str,
671    canonical_url: &str,
672    metadata: &VideoMetadata,
673) -> String {
674    let mut output = String::new();
675    output.push_str("# YouTube Video Context\n\n");
676    output.push_str("## Source\n");
677    output.push_str(&format!("- URL: {canonical_url}\n"));
678    output.push_str(&format!("- Video ID: {video_id}\n"));
679    push_optional(&mut output, "- Title", metadata.title.as_deref());
680    push_optional(&mut output, "- Channel", metadata.author.as_deref());
681    push_optional(&mut output, "- Channel ID", metadata.channel_id.as_deref());
682    push_optional(
683        &mut output,
684        "- Duration seconds",
685        metadata.duration_seconds.as_deref(),
686    );
687    push_optional(&mut output, "- Views", metadata.view_count.as_deref());
688    push_optional(&mut output, "- Published", metadata.publish_date.as_deref());
689    push_optional(&mut output, "- Uploaded", metadata.upload_date.as_deref());
690
691    if let Some(description) = metadata.description.as_deref() {
692        output.push_str("\n## Description\n");
693        output.push_str(description.trim());
694        output.push('\n');
695    }
696
697    output.push_str("\n## Transcript\n");
698    output.push_str("Transcript unavailable. YouTube metadata was extracted, but no usable caption/transcript body was available for this video.\n");
699    output.trim().to_string()
700}
701
702fn build_diagnostics(
703    tracks: &[CaptionTrack],
704    selected_track: &CaptionTrack,
705    segment_count: usize,
706    source_client: &str,
707) -> Vec<String> {
708    vec![
709        "YouTube extraction used native HTTP transcript path; no video/audio was downloaded."
710            .to_string(),
711        format!(
712            "Selected caption track from {source_client}: {} ({}, auto-generated: {}).",
713            selected_track.name, selected_track.language_code, selected_track.is_generated
714        ),
715        format!(
716            "Found {} caption track(s), extracted {} transcript segment(s).",
717            tracks.len(),
718            segment_count
719        ),
720    ]
721}
722
/// Builds the diagnostic lines reported when only metadata could be returned. A third line
/// carrying `reason` is added when a non-blank reason is supplied.
fn build_metadata_only_diagnostics(reason: Option<&str>) -> Vec<String> {
    let mut diagnostics = Vec::with_capacity(3);
    diagnostics.push(String::from(
        "YouTube extraction used native HTTP metadata path; no video/audio was downloaded.",
    ));
    diagnostics.push(String::from(
        "Transcript unavailable; returning metadata-only YouTube context.",
    ));

    match reason {
        Some(reason) if !reason.trim().is_empty() => {
            diagnostics.push(format!("Transcript unavailable reason: {reason}"));
        }
        _ => {}
    }

    diagnostics
}
734
/// Appends a `label: value` line to `output`, with the value trimmed. Nothing is appended when
/// `value` is `None` or blank.
fn push_optional(output: &mut String, label: &str, value: Option<&str>) {
    let Some(trimmed) = value.map(str::trim).filter(|trimmed| !trimmed.is_empty()) else {
        return;
    };

    output.push_str(label);
    output.push_str(": ");
    output.push_str(trimmed);
    output.push('\n');
}
740
/// Formats a millisecond offset as `MM:SS`, or as `HH:MM:SS` once it reaches one hour.
/// Sub-second precision is discarded.
fn format_timestamp(ms: u64) -> String {
    let total_seconds = ms / 1000;
    let (hours, within_hour) = (total_seconds / 3600, total_seconds % 3600);
    let (minutes, seconds) = (within_hour / 60, within_hour % 60);

    match hours {
        0 => format!("{minutes:02}:{seconds:02}"),
        _ => format!("{hours:02}:{minutes:02}:{seconds:02}"),
    }
}
753
754fn string_at(value: &Value, key: &str) -> Option<String> {
755    value
756        .get(key)
757        .and_then(Value::as_str)
758        .filter(|value| !value.is_empty())
759        .map(str::to_string)
760}
761
762fn text_runs_at(value: &Value, key: &str) -> Option<String> {
763    let text_value = value.get(key)?;
764    if let Some(simple_text) = text_value.get("simpleText").and_then(Value::as_str) {
765        return Some(simple_text.to_string());
766    }
767    let runs = text_value.get("runs")?.as_array()?;
768    let text = runs
769        .iter()
770        .filter_map(|run| run.get("text").and_then(Value::as_str))
771        .collect::<String>();
772    (!text.is_empty()).then_some(text)
773}
774
/// Errors produced while extracting YouTube video context.
///
/// Implements [`std::fmt::Display`] for user-facing messages and [`std::error::Error`] so the
/// type works with `Box<dyn Error>` and error-reporting helpers.
#[derive(Debug)]
pub enum YouTubeError {
    /// The input could not be parsed as a URL; carries the parser's message.
    InvalidUrl(String),
    /// The URL parsed, but no video ID could be derived from it.
    UnsupportedUrl,
    /// An HTTP request could not be sent or its body could not be read.
    Fetch(String),
    /// The watch page returned a non-success status (code and reason phrase).
    HttpStatus(u16, String),
    /// The player response marker or its opening brace was not found in the page.
    PlayerResponseMissing,
    /// The player response object was never closed.
    PlayerResponseUnterminated,
    /// The player response was not valid JSON.
    PlayerResponseParse(String),
    /// The player API returned a non-success status.
    PlayerApiHttpStatus(u16),
    /// The player API reported `LOGIN_REQUIRED`; carries the reported reason.
    PlayerApiLoginRequired(String),
    /// No visitor data token was available.
    VisitorDataMissing,
    /// The player response lists no caption tracks.
    CaptionTracksMissing,
    /// Caption tracks were listed, but none could be used.
    NoUsableCaptionTrack,
    /// A caption track URL could not be parsed.
    InvalidCaptionUrl(String),
    /// The transcript download returned a non-success status.
    TranscriptHttpStatus(u16),
    /// The transcript body could not be read or parsed.
    TranscriptParse(String),
    /// The transcript download succeeded but the body was blank.
    TranscriptEmpty,
}

impl std::fmt::Display for YouTubeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidUrl(msg) => write!(f, "Invalid YouTube URL: {msg}"),
            Self::UnsupportedUrl => write!(f, "Unsupported YouTube URL"),
            Self::Fetch(msg) => write!(f, "YouTube fetch failed: {msg}"),
            Self::HttpStatus(code, reason) => write!(f, "YouTube returned HTTP {code} {reason}"),
            Self::PlayerResponseMissing => write!(f, "YouTube player response not found"),
            Self::PlayerResponseUnterminated => {
                write!(f, "YouTube player response JSON was unterminated")
            }
            Self::PlayerResponseParse(msg) => {
                write!(f, "YouTube player response parse failed: {msg}")
            }
            Self::PlayerApiHttpStatus(code) => write!(f, "YouTube player API returned HTTP {code}"),
            Self::PlayerApiLoginRequired(reason) => {
                write!(f, "YouTube player API required login: {reason}")
            }
            Self::VisitorDataMissing => write!(f, "YouTube visitor data was unavailable"),
            Self::CaptionTracksMissing => {
                write!(f, "YouTube captions are unavailable for this video")
            }
            Self::NoUsableCaptionTrack => write!(f, "No usable YouTube caption track found"),
            Self::InvalidCaptionUrl(msg) => write!(f, "Invalid YouTube caption URL: {msg}"),
            Self::TranscriptHttpStatus(code) => {
                write!(f, "YouTube transcript returned HTTP {code}")
            }
            Self::TranscriptParse(msg) => write!(f, "YouTube transcript parse failed: {msg}"),
            Self::TranscriptEmpty => write!(
                f,
                "YouTube transcript was empty; caption track metadata was found but YouTube returned no caption body for this client"
            ),
        }
    }
}

// No variant wraps a source error value (failures are stored as strings), so the default
// `source()` returning `None` is the accurate implementation.
impl std::error::Error for YouTubeError {}
830
// Unit tests for the pure parsing/formatting helpers, plus one ignored network smoke test.
#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use serde_json::json;

    // Video IDs are recovered from watch, short-link, shorts, and embed URLs.
    #[test]
    fn youtube_extracts_video_ids_from_common_urls() {
        let cases = [
            ("https://www.youtube.com/watch?v=McO_xcf4IYw", "McO_xcf4IYw"),
            ("https://youtu.be/McO_xcf4IYw?t=12", "McO_xcf4IYw"),
            ("https://www.youtube.com/shorts/McO_xcf4IYw", "McO_xcf4IYw"),
            ("https://www.youtube.com/embed/McO_xcf4IYw", "McO_xcf4IYw"),
        ];

        for (url, expected) in cases {
            let parsed = Url::parse(url).unwrap();
            assert_eq!(extract_video_id(&parsed).unwrap().as_str(), expected);
        }
    }

    // A `v` parameter that is not exactly 11 characters long is rejected.
    #[test]
    fn youtube_rejects_invalid_video_ids() {
        let parsed =
            Url::parse("https://www.youtube.com/watch?v=not-valid-because-too-long").unwrap();
        assert!(extract_video_id(&parsed).is_none());
    }

    // Braces and escaped quotes inside JSON strings must not confuse the brace matcher.
    #[test]
    fn youtube_extracts_balanced_player_response() {
        let html = r#"<script>var ytInitialPlayerResponse = {"videoDetails":{"title":"A } in string","shortDescription":"escaped \" brace }"},"captions":{}};</script>"#;
        let response = extract_initial_player_response(html).unwrap();
        assert_eq!(response["videoDetails"]["title"], "A } in string");
        assert_eq!(
            response["videoDetails"]["shortDescription"],
            "escaped \" brace }"
        );
    }

    // The visitor data token is read from the `VISITOR_DATA` field.
    #[test]
    fn youtube_extracts_visitor_data() {
        let html = r#"ytcfg.set({"VISITOR_DATA":"visitor-token","other":true});"#;
        assert_eq!(extract_visitor_data(html).as_deref(), Some("visitor-token"));
    }

    // A manual English track wins over an earlier auto-generated English track.
    #[test]
    fn youtube_selects_manual_english_before_auto_english() {
        let tracks = vec![
            CaptionTrack {
                base_url: "https://example.com/fr".into(),
                language_code: "fr".into(),
                name: "French".into(),
                is_generated: false,
            },
            CaptionTrack {
                base_url: "https://example.com/en-auto".into(),
                language_code: "en".into(),
                name: "English auto".into(),
                is_generated: true,
            },
            CaptionTrack {
                base_url: "https://example.com/en".into(),
                language_code: "en".into(),
                name: "English".into(),
                is_generated: false,
            },
        ];

        let selected = select_caption_track(&tracks).unwrap();
        assert_eq!(selected.base_url, "https://example.com/en");
    }

    // Whitespace-only and `[Music]` events are dropped; segment pieces are concatenated.
    #[test]
    fn youtube_parses_json3_transcript_segments() {
        let transcript = json!({
            "events": [
                {"tStartMs": 0, "dDurationMs": 1000, "segs": [{"utf8": "Hello "}, {"utf8": "world"}]},
                {"tStartMs": 1000, "segs": [{"utf8": "\n"}]},
                {"tStartMs": 2000, "segs": [{"utf8": "[Music]"}]},
                {"tStartMs": 3000, "dDurationMs": 500, "segs": [{"utf8": "next line"}]}
            ]
        });

        let segments = parse_json3_transcript(&transcript);
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].text, "Hello world");
        assert_eq!(segments[0].start_ms, 0);
        assert_eq!(segments[1].text, "next line");
    }

    // Inline tags are stripped, entities decoded, and noise paragraphs dropped.
    #[test]
    fn youtube_parses_xml_transcript_segments() {
        let transcript = r#"<?xml version="1.0" ?><timedtext><body><p t="1000" d="2000">Hello &amp; <s>world</s></p><p t="3000" d="1000">[Music]</p></body></timedtext>"#;
        let segments = parse_xml_transcript(transcript);
        assert_eq!(segments.len(), 1);
        assert_eq!(segments[0].start_ms, 1000);
        assert_eq!(segments[0].duration_ms, Some(2000));
        assert_eq!(segments[0].text, "Hello & world");
    }

    // A caption URL without an `fmt` parameter gains `fmt=json3`.
    #[test]
    fn youtube_adds_json3_format_to_caption_url() {
        let url =
            caption_url_with_json3("https://www.youtube.com/api/timedtext?v=abc&lang=en").unwrap();
        assert!(url.contains("fmt=json3"));
    }

    // Metadata-only output still carries the title and an explicit transcript notice.
    #[test]
    fn youtube_formats_metadata_only_context_with_clear_transcript_marker() {
        let metadata = VideoMetadata {
            title: Some("No captions example".into()),
            author: Some("Example Channel".into()),
            ..VideoMetadata::default()
        };
        let text = format_metadata_only_context(
            "McO_xcf4IYw",
            "https://www.youtube.com/watch?v=McO_xcf4IYw",
            &metadata,
        );
        assert!(text.contains("No captions example"));
        assert!(text.contains("Transcript unavailable"));
    }

    // The supplied reason is included in the metadata-only diagnostics.
    #[test]
    fn youtube_metadata_only_diagnostics_include_reason() {
        let diagnostics = build_metadata_only_diagnostics(Some("captions disabled"));
        assert!(diagnostics
            .iter()
            .any(|line| line.contains("metadata-only")));
        assert!(diagnostics
            .iter()
            .any(|line| line.contains("captions disabled")));
    }

    // End-to-end smoke test against the live site; ignored by default because it needs network.
    #[tokio::test]
    #[ignore = "network smoke test for YouTube extraction"]
    async fn youtube_reads_sample_video_over_http() {
        let client = reqwest::Client::builder()
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .unwrap();
        let page = fetch_and_extract(&client, "https://www.youtube.com/watch?v=McO_xcf4IYw")
            .await
            .unwrap();
        assert!(page.text.contains("# YouTube Video Context"));
        assert!(page.text.contains("- Video ID: McO_xcf4IYw"));
        assert!(page.text.contains("## Transcript"));
        assert!(page.content_length > 1000);
    }
}