1use reqwest::Client;
2use serde_json::Value;
3use url::Url;
4
5use super::types::{ContentFormat, ExtractionQuality, PageContent};
6
/// Base URL of the watch page; the video id is appended as the `v` query parameter.
const WATCH_BASE_URL: &str = "https://www.youtube.com/watch";
/// Marker searched for in the watch-page HTML to locate the embedded player response JSON.
const PLAYER_RESPONSE_VAR: &str = "ytInitialPlayerResponse";
/// Value sent in the `X-YouTube-Client-Name` header of the fallback player API request.
const ANDROID_VR_CLIENT_NAME: &str = "28";
/// Client version sent in the fallback player API payload and in the
/// `X-YouTube-Client-Version` header.
const ANDROID_VR_CLIENT_VERSION: &str = "1.71.26";
/// User agent used for the fallback player API and transcript requests.
///
/// The literal embeds the same version string as `ANDROID_VR_CLIENT_VERSION`, so the two
/// constants have to be updated together.
const ANDROID_VR_USER_AGENT: &str = "com.google.android.apps.youtube.vr.oculus/1.71.26 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip";
12
/// Newtype around the textual identifier of a YouTube video.
#[derive(Debug, Clone, PartialEq)]
struct VideoId(String);

impl VideoId {
    /// Borrows the wrapped identifier as a string slice.
    fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
21
/// One caption track advertised by a player response.
#[derive(Debug, Clone, PartialEq)]
struct CaptionTrack {
    /// URL the transcript body is downloaded from (the track's `baseUrl`).
    base_url: String,
    /// Language code reported for the track (the track's `languageCode`).
    language_code: String,
    /// Display name of the track; `"unknown"` when the response carries none.
    name: String,
    /// `true` when the track's `kind` is `"asr"`.
    is_generated: bool,
}
29
/// A single timed line of transcript text.
#[derive(Debug, Clone, PartialEq)]
struct TranscriptSegment {
    /// Start offset in milliseconds; the parsers fall back to 0 when it is missing.
    start_ms: u64,
    /// Duration in milliseconds, when the source provides one.
    duration_ms: Option<u64>,
    /// Whitespace-normalized caption text.
    text: String,
}
36
/// Descriptive fields pulled from the player response (`videoDetails` and the player
/// microformat). Every field is optional because any of them may be absent.
#[derive(Debug, Clone, Default, PartialEq)]
struct VideoMetadata {
    /// Video title.
    title: Option<String>,
    /// Channel (author) name.
    author: Option<String>,
    /// Channel identifier.
    channel_id: Option<String>,
    /// Video length in seconds, kept as the string found in the response.
    duration_seconds: Option<String>,
    /// View count, kept as the string found in the response.
    view_count: Option<String>,
    /// Video description text.
    description: Option<String>,
    /// Microformat `publishDate` value.
    publish_date: Option<String>,
    /// Microformat `uploadDate` value.
    upload_date: Option<String>,
}
48
/// Result of resolving captions through one client: the advertised tracks, the track that
/// was chosen, and the transcript segments downloaded for it.
struct CaptionSource {
    /// All caption tracks that could be parsed from the player response.
    tracks: Vec<CaptionTrack>,
    /// The track whose transcript was fetched.
    selected_track: CaptionTrack,
    /// Transcript segments parsed from the selected track (may be empty).
    segments: Vec<TranscriptSegment>,
    /// Label of the client that produced the data (`"web"` or `"android_vr"`).
    source_client: &'static str,
}
55
56pub async fn fetch_and_extract(client: &Client, url: &str) -> Result<PageContent, YouTubeError> {
57 let parsed_url = Url::parse(url).map_err(|err| YouTubeError::InvalidUrl(err.to_string()))?;
58 let video_id = extract_video_id(&parsed_url).ok_or(YouTubeError::UnsupportedUrl)?;
59 let requested_url = url.to_string();
60 let watch_url = canonical_watch_url(video_id.as_str());
61
62 let response = client
63 .get(watch_url.as_str())
64 .header("User-Agent", super::read::USER_AGENT)
65 .header("Accept", super::read::ACCEPT_HEADER)
66 .header("Accept-Language", "en-US,en;q=0.9")
67 .send()
68 .await
69 .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
70
71 let status_code = response.status().as_u16();
72 if !response.status().is_success() {
73 return Err(YouTubeError::HttpStatus(
74 status_code,
75 response
76 .status()
77 .canonical_reason()
78 .unwrap_or("Unknown")
79 .to_string(),
80 ));
81 }
82
83 let final_url = response.url().to_string();
84 let content_type = response
85 .headers()
86 .get("content-type")
87 .and_then(|value| value.to_str().ok())
88 .map(str::to_string);
89 let html = response
90 .text()
91 .await
92 .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
93 let raw_body_bytes = html.len();
94
95 let initial_player_response = extract_initial_player_response(&html)?;
96 let metadata = extract_metadata(&initial_player_response);
97 let visitor_data = extract_visitor_data(&html);
98 let caption_source = resolve_caption_source(
99 client,
100 video_id.as_str(),
101 &initial_player_response,
102 visitor_data.as_deref(),
103 )
104 .await;
105
106 let title = metadata
107 .title
108 .clone()
109 .unwrap_or_else(|| format!("YouTube video {}", video_id.as_str()));
110 let (text, diagnostics) = match caption_source {
111 Ok(caption_source) if !caption_source.segments.is_empty() => (
112 format_video_context(
113 video_id.as_str(),
114 &watch_url,
115 &metadata,
116 &caption_source.selected_track,
117 &caption_source.segments,
118 ),
119 build_diagnostics(
120 &caption_source.tracks,
121 &caption_source.selected_track,
122 caption_source.segments.len(),
123 caption_source.source_client,
124 ),
125 ),
126 Ok(caption_source) => (
127 format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
128 build_metadata_only_diagnostics(Some(&format!(
129 "Caption tracks were found, but no transcript segments were extracted from {}.",
130 caption_source.source_client
131 ))),
132 ),
133 Err(err @ (YouTubeError::CaptionTracksMissing | YouTubeError::NoUsableCaptionTrack)) => (
134 format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
135 build_metadata_only_diagnostics(Some(&err.to_string())),
136 ),
137 Err(err @ (YouTubeError::TranscriptEmpty | YouTubeError::TranscriptParse(_))) => (
138 format_metadata_only_context(video_id.as_str(), &watch_url, &metadata),
139 build_metadata_only_diagnostics(Some(&err.to_string())),
140 ),
141 Err(err) => return Err(err),
142 };
143
144 let was_redirected = final_url != watch_url;
145
146 Ok(PageContent {
147 title: Some(title),
148 content_length: text.len(),
149 text,
150 url: final_url,
151 requested_url,
152 status_code,
153 content_type,
154 format_received: ContentFormat::Html,
155 was_redirected,
156 raw_body_bytes,
157 diagnostics,
158 quality: ExtractionQuality::Good,
159 quality_reasons: Vec::new(),
160 })
161}
162
163async fn resolve_caption_source(
164 client: &Client,
165 video_id: &str,
166 initial_player_response: &Value,
167 visitor_data: Option<&str>,
168) -> Result<CaptionSource, YouTubeError> {
169 let web_result = fetch_caption_source_from_response(
170 client,
171 initial_player_response,
172 super::read::USER_AGENT,
173 "web",
174 )
175 .await;
176 if web_result.is_ok() {
177 return web_result;
178 }
179 let web_error = web_result.err();
180
181 let Some(visitor_data) = visitor_data else {
182 return Err(web_error.unwrap_or(YouTubeError::VisitorDataMissing));
183 };
184
185 let android_vr_response =
186 fetch_android_vr_player_response(client, video_id, visitor_data).await?;
187 fetch_caption_source_from_response(
188 client,
189 &android_vr_response,
190 ANDROID_VR_USER_AGENT,
191 "android_vr",
192 )
193 .await
194}
195
196async fn fetch_caption_source_from_response(
197 client: &Client,
198 player_response: &Value,
199 user_agent: &str,
200 source_client: &'static str,
201) -> Result<CaptionSource, YouTubeError> {
202 let tracks = extract_caption_tracks(player_response)?;
203 let selected_track = select_caption_track(&tracks).ok_or(YouTubeError::NoUsableCaptionTrack)?;
204 let segments = fetch_transcript_segments(client, &selected_track, user_agent).await?;
205
206 Ok(CaptionSource {
207 tracks,
208 selected_track,
209 segments,
210 source_client,
211 })
212}
213
214async fn fetch_transcript_segments(
215 client: &Client,
216 track: &CaptionTrack,
217 user_agent: &str,
218) -> Result<Vec<TranscriptSegment>, YouTubeError> {
219 let transcript_url = caption_url_with_json3(&track.base_url)?;
220 let transcript_response = client
221 .get(transcript_url.as_str())
222 .header("User-Agent", user_agent)
223 .header(
224 "Accept",
225 "application/json,text/xml,text/plain;q=0.9,*/*;q=0.5",
226 )
227 .header("Accept-Language", "en-US,en;q=0.9")
228 .send()
229 .await
230 .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
231
232 if !transcript_response.status().is_success() {
233 return Err(YouTubeError::TranscriptHttpStatus(
234 transcript_response.status().as_u16(),
235 ));
236 }
237
238 let transcript_text = transcript_response
239 .text()
240 .await
241 .map_err(|err| YouTubeError::TranscriptParse(err.to_string()))?;
242 if transcript_text.trim().is_empty() {
243 return Err(YouTubeError::TranscriptEmpty);
244 }
245
246 if transcript_text.trim_start().starts_with('<') {
247 return Ok(parse_xml_transcript(&transcript_text));
248 }
249
250 let transcript_json: Value = serde_json::from_str(&transcript_text)
251 .map_err(|err| YouTubeError::TranscriptParse(err.to_string()))?;
252 Ok(parse_json3_transcript(&transcript_json))
253}
254
255async fn fetch_android_vr_player_response(
256 client: &Client,
257 video_id: &str,
258 visitor_data: &str,
259) -> Result<Value, YouTubeError> {
260 let payload = serde_json::json!({
261 "context": {
262 "client": {
263 "clientName": "ANDROID_VR",
264 "clientVersion": ANDROID_VR_CLIENT_VERSION,
265 "deviceMake": "Oculus",
266 "deviceModel": "Quest 3",
267 "androidSdkVersion": 32,
268 "userAgent": ANDROID_VR_USER_AGENT,
269 "osName": "Android",
270 "osVersion": "12L",
271 "hl": "en",
272 "gl": "US"
273 }
274 },
275 "videoId": video_id,
276 "contentCheckOk": true,
277 "racyCheckOk": true
278 });
279
280 let response = client
281 .post("https://www.youtube.com/youtubei/v1/player")
282 .header("Content-Type", "application/json")
283 .header("User-Agent", ANDROID_VR_USER_AGENT)
284 .header("X-YouTube-Client-Name", ANDROID_VR_CLIENT_NAME)
285 .header("X-YouTube-Client-Version", ANDROID_VR_CLIENT_VERSION)
286 .header("X-Goog-Visitor-Id", visitor_data)
287 .header("Origin", "https://www.youtube.com")
288 .json(&payload)
289 .send()
290 .await
291 .map_err(|err| YouTubeError::Fetch(err.to_string()))?;
292
293 if !response.status().is_success() {
294 return Err(YouTubeError::PlayerApiHttpStatus(
295 response.status().as_u16(),
296 ));
297 }
298
299 let player_response = response
300 .json::<Value>()
301 .await
302 .map_err(|err| YouTubeError::PlayerResponseParse(err.to_string()))?;
303
304 if player_response
305 .pointer("/playabilityStatus/status")
306 .and_then(Value::as_str)
307 == Some("LOGIN_REQUIRED")
308 {
309 return Err(YouTubeError::PlayerApiLoginRequired(
310 player_response
311 .pointer("/playabilityStatus/reason")
312 .and_then(Value::as_str)
313 .unwrap_or("sign-in required")
314 .to_string(),
315 ));
316 }
317
318 Ok(player_response)
319}
320
321pub fn is_youtube_url(url: &Url) -> bool {
322 url.host_str().is_some_and(|host| {
323 let host = host.to_ascii_lowercase();
324 host == "youtu.be"
325 || host.ends_with(".youtu.be")
326 || host == "youtube.com"
327 || host.ends_with(".youtube.com")
328 })
329}
330
331fn extract_video_id(url: &Url) -> Option<VideoId> {
332 let host = url.host_str()?.to_ascii_lowercase();
333 if host == "youtu.be" || host.ends_with(".youtu.be") {
334 return first_path_segment(url).and_then(VideoId::from_candidate);
335 }
336
337 if !(host == "youtube.com" || host.ends_with(".youtube.com")) {
338 return None;
339 }
340
341 match first_path_segment(url).as_deref() {
342 Some("watch") => url
343 .query_pairs()
344 .find_map(|(key, value)| (key == "v").then(|| value.into_owned()))
345 .and_then(VideoId::from_candidate),
346 Some("shorts" | "embed" | "live") => url
347 .path_segments()
348 .and_then(|mut segments| segments.nth(1).map(str::to_string))
349 .and_then(VideoId::from_candidate),
350 _ => None,
351 }
352}
353
354impl VideoId {
355 fn from_candidate(candidate: String) -> Option<Self> {
356 let id = candidate.trim();
357 let is_valid = id.len() == 11
358 && id
359 .bytes()
360 .all(|byte| byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'-');
361 is_valid.then(|| Self(id.to_string()))
362 }
363}
364
365fn first_path_segment(url: &Url) -> Option<String> {
366 url.path_segments()
367 .and_then(|mut segments| segments.next().map(str::to_string))
368}
369
370fn canonical_watch_url(video_id: &str) -> String {
371 format!("{WATCH_BASE_URL}?v={video_id}")
372}
373
374fn extract_initial_player_response(html: &str) -> Result<Value, YouTubeError> {
375 let marker_index = html
376 .find(PLAYER_RESPONSE_VAR)
377 .ok_or(YouTubeError::PlayerResponseMissing)?;
378 let after_marker = &html[marker_index + PLAYER_RESPONSE_VAR.len()..];
379 let brace_relative = after_marker
380 .find('{')
381 .ok_or(YouTubeError::PlayerResponseMissing)?;
382 let json_start = marker_index + PLAYER_RESPONSE_VAR.len() + brace_relative;
383 let json_end = find_balanced_json_end(html, json_start)?;
384 serde_json::from_str(&html[json_start..json_end])
385 .map_err(|err| YouTubeError::PlayerResponseParse(err.to_string()))
386}
387
388fn find_balanced_json_end(text: &str, start: usize) -> Result<usize, YouTubeError> {
389 let mut depth = 0u32;
390 let mut in_string = false;
391 let mut escaped = false;
392
393 for (offset, ch) in text[start..].char_indices() {
394 if escaped {
395 escaped = false;
396 continue;
397 }
398
399 if ch == '\\' {
400 escaped = in_string;
401 continue;
402 }
403
404 if ch == '"' {
405 in_string = !in_string;
406 continue;
407 }
408
409 if in_string {
410 continue;
411 }
412
413 match ch {
414 '{' => depth += 1,
415 '}' => {
416 depth = depth.saturating_sub(1);
417 if depth == 0 {
418 return Ok(start + offset + ch.len_utf8());
419 }
420 }
421 _ => {}
422 }
423 }
424
425 Err(YouTubeError::PlayerResponseUnterminated)
426}
427
428fn extract_visitor_data(html: &str) -> Option<String> {
429 extract_quoted_json_field(html, "VISITOR_DATA")
430 .or_else(|| extract_quoted_json_field(html, "visitorData"))
431}
432
/// Returns the text between `"<key>":"` and the next double quote in `text`.
///
/// Only the first occurrence of the key is considered. This is a plain textual scan: escape
/// sequences in the value are neither interpreted nor skipped.
fn extract_quoted_json_field(text: &str, key: &str) -> Option<String> {
    let marker = format!("\"{key}\":\"");
    let (_, after_marker) = text.split_once(marker.as_str())?;
    let (value, _) = after_marker.split_once('"')?;
    Some(value.to_owned())
}
440
441fn extract_metadata(player_response: &Value) -> VideoMetadata {
442 let details = player_response.get("videoDetails").unwrap_or(&Value::Null);
443 let microformat = player_response
444 .pointer("/microformat/playerMicroformatRenderer")
445 .unwrap_or(&Value::Null);
446
447 VideoMetadata {
448 title: string_at(details, "title").or_else(|| text_runs_at(microformat, "title")),
449 author: string_at(details, "author").or_else(|| string_at(microformat, "ownerChannelName")),
450 channel_id: string_at(details, "channelId")
451 .or_else(|| string_at(microformat, "externalChannelId")),
452 duration_seconds: string_at(details, "lengthSeconds")
453 .or_else(|| string_at(microformat, "lengthSeconds")),
454 view_count: string_at(details, "viewCount").or_else(|| string_at(microformat, "viewCount")),
455 description: string_at(details, "shortDescription")
456 .or_else(|| text_runs_at(microformat, "description")),
457 publish_date: string_at(microformat, "publishDate"),
458 upload_date: string_at(microformat, "uploadDate"),
459 }
460}
461
462fn extract_caption_tracks(player_response: &Value) -> Result<Vec<CaptionTrack>, YouTubeError> {
463 let tracks = player_response
464 .pointer("/captions/playerCaptionsTracklistRenderer/captionTracks")
465 .and_then(Value::as_array)
466 .ok_or(YouTubeError::CaptionTracksMissing)?;
467
468 let parsed = tracks
469 .iter()
470 .filter_map(|track| {
471 Some(CaptionTrack {
472 base_url: string_at(track, "baseUrl")?,
473 language_code: string_at(track, "languageCode")?,
474 name: text_runs_at(track, "name").unwrap_or_else(|| "unknown".to_string()),
475 is_generated: string_at(track, "kind").as_deref() == Some("asr"),
476 })
477 })
478 .collect::<Vec<_>>();
479
480 if parsed.is_empty() {
481 return Err(YouTubeError::NoUsableCaptionTrack);
482 }
483
484 Ok(parsed)
485}
486
487fn select_caption_track(tracks: &[CaptionTrack]) -> Option<CaptionTrack> {
488 tracks
489 .iter()
490 .find(|track| is_english(&track.language_code) && !track.is_generated)
491 .or_else(|| tracks.iter().find(|track| is_english(&track.language_code)))
492 .or_else(|| tracks.first())
493 .cloned()
494}
495
/// Reports whether a language code denotes English: either exactly `en` or `en-` followed by
/// anything (for example `en-US` or `en-orig`). ASCII case is ignored.
fn is_english(language_code: &str) -> bool {
    // The part before the first '-' is the primary language subtag.
    language_code
        .split('-')
        .next()
        .is_some_and(|primary| primary.eq_ignore_ascii_case("en"))
}
500
501fn caption_url_with_json3(base_url: &str) -> Result<String, YouTubeError> {
502 let mut url =
503 Url::parse(base_url).map_err(|err| YouTubeError::InvalidCaptionUrl(err.to_string()))?;
504 {
505 let has_fmt = url.query_pairs().any(|(key, _)| key == "fmt");
506 if !has_fmt {
507 url.query_pairs_mut().append_pair("fmt", "json3");
508 }
509 }
510 Ok(url.to_string())
511}
512
513fn parse_json3_transcript(value: &Value) -> Vec<TranscriptSegment> {
514 value
515 .get("events")
516 .or_else(|| value.get("aAppend"))
517 .and_then(Value::as_array)
518 .into_iter()
519 .flatten()
520 .filter_map(parse_json3_event)
521 .collect()
522}
523
524fn parse_json3_event(event: &Value) -> Option<TranscriptSegment> {
525 let text = event
526 .get("segs")?
527 .as_array()?
528 .iter()
529 .filter_map(|seg| seg.get("utf8").and_then(Value::as_str))
530 .collect::<String>();
531 let text = normalize_transcript_text(&text);
532 if text.is_empty() || is_noise_segment(&text) {
533 return None;
534 }
535
536 Some(TranscriptSegment {
537 start_ms: event.get("tStartMs").and_then(Value::as_u64).unwrap_or(0),
538 duration_ms: event.get("dDurationMs").and_then(Value::as_u64),
539 text,
540 })
541}
542
543fn parse_xml_transcript(text: &str) -> Vec<TranscriptSegment> {
544 text.split("<p ")
545 .skip(1)
546 .filter_map(parse_xml_paragraph)
547 .collect()
548}
549
550fn parse_xml_paragraph(fragment: &str) -> Option<TranscriptSegment> {
551 let tag_end = fragment.find('>')?;
552 let attrs = &fragment[..tag_end];
553 let body = &fragment[tag_end + 1..fragment.find("</p>")?];
554 let text = normalize_transcript_text(&strip_xml_tags(body));
555 if text.is_empty() || is_noise_segment(&text) {
556 return None;
557 }
558
559 Some(TranscriptSegment {
560 start_ms: extract_xml_time_ms(attrs, "t").unwrap_or(0),
561 duration_ms: extract_xml_time_ms(attrs, "d"),
562 text,
563 })
564}
565
/// Reads the unsigned integer value of attribute `key` from an XML attribute string such as
/// `t="1000" d="2000"`.
///
/// The attribute name must start the string or follow whitespace, so looking up `t` does not
/// match the tail of another attribute name such as `start`. Returns `None` when the
/// attribute is absent, its value is not terminated by a quote, or the value is not a valid
/// `u64`.
fn extract_xml_time_ms(attrs: &str, key: &str) -> Option<u64> {
    let marker = format!(r#"{key}=""#);

    let value_start = attrs
        .match_indices(marker.as_str())
        .find_map(|(index, _)| {
            let at_attribute_boundary = attrs[..index]
                .chars()
                .next_back()
                .map_or(true, char::is_whitespace);
            at_attribute_boundary.then_some(index + marker.len())
        })?;

    let tail = &attrs[value_start..];
    let value_end = tail.find('"')?;
    tail[..value_end].parse().ok()
}
573
/// Removes XML tags from `text` and decodes entity references in the remaining character
/// data.
///
/// * Everything between `<` and `>` is dropped.
/// * The named entities `amp`, `lt`, `gt`, `quot` and `apos` and numeric character references
///   (`&#39;`, `&#x27;`) are decoded.
/// * Well-formed references with an unknown name are dropped.
/// * An `&` that does not start a well-formed reference is kept literally, so a stray
///   ampersand can no longer swallow the text that follows it.
fn strip_xml_tags(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut in_tag = false;
    let mut rest = text;

    while let Some(ch) = rest.chars().next() {
        rest = &rest[ch.len_utf8()..];
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            '&' => match xml_entity_name(rest) {
                Some(name) => {
                    out.extend(decode_xml_entity(name));
                    // Skip the entity name and its terminating ';'.
                    rest = &rest[name.len() + 1..];
                }
                None => out.push('&'),
            },
            _ => out.push(ch),
        }
    }

    out
}

/// Returns the entity name at the start of `after_amp` (the text following an `&`) when it is
/// a short run of ASCII letters, digits or `#` terminated by `;`.
fn xml_entity_name(after_amp: &str) -> Option<&str> {
    // Long enough for every supported reference, e.g. `#x10FFFF`.
    const MAX_ENTITY_NAME_LEN: usize = 10;

    let name_len = after_amp
        .bytes()
        .take(MAX_ENTITY_NAME_LEN + 1)
        .position(|byte| byte == b';')?;
    let name = after_amp.get(..name_len)?;
    let well_formed = !name.is_empty()
        && name
            .bytes()
            .all(|byte| byte.is_ascii_alphanumeric() || byte == b'#');
    well_formed.then_some(name)
}

/// Decodes a predefined XML entity name or a numeric character reference (without the
/// surrounding `&` and `;`). Unknown or invalid references yield `None`.
fn decode_xml_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let digits = name.strip_prefix('#')?;
            let code_point = match digits.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => digits.parse::<u32>().ok()?,
            };
            // NUL is not a legal XML character; drop it instead of emitting it.
            char::from_u32(code_point).filter(|decoded| *decoded != '\0')
        }
    }
}
609
/// Collapses every run of whitespace in `text` into a single space and removes leading and
/// trailing whitespace.
fn normalize_transcript_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
613
/// Reports whether a segment consists solely of a non-speech marker (`[music]`,
/// `[applause]`, `[laughter]`, `♪` or `♫`), ignoring surrounding whitespace and ASCII case.
fn is_noise_segment(text: &str) -> bool {
    const NOISE_MARKERS: [&str; 5] = ["[music]", "[applause]", "[laughter]", "♪", "♫"];

    let candidate = text.trim();
    NOISE_MARKERS
        .iter()
        .any(|marker| candidate.eq_ignore_ascii_case(marker))
}
621
622fn format_video_context(
623 video_id: &str,
624 canonical_url: &str,
625 metadata: &VideoMetadata,
626 track: &CaptionTrack,
627 segments: &[TranscriptSegment],
628) -> String {
629 let mut output = String::new();
630 output.push_str("# YouTube Video Context\n\n");
631 output.push_str("## Source\n");
632 output.push_str(&format!("- URL: {canonical_url}\n"));
633 output.push_str(&format!("- Video ID: {video_id}\n"));
634 push_optional(&mut output, "- Title", metadata.title.as_deref());
635 push_optional(&mut output, "- Channel", metadata.author.as_deref());
636 push_optional(&mut output, "- Channel ID", metadata.channel_id.as_deref());
637 push_optional(
638 &mut output,
639 "- Duration seconds",
640 metadata.duration_seconds.as_deref(),
641 );
642 push_optional(&mut output, "- Views", metadata.view_count.as_deref());
643 push_optional(&mut output, "- Published", metadata.publish_date.as_deref());
644 push_optional(&mut output, "- Uploaded", metadata.upload_date.as_deref());
645
646 output.push_str("\n## Transcript Track\n");
647 output.push_str(&format!("- Language: {}\n", track.language_code));
648 output.push_str(&format!("- Name: {}\n", track.name));
649 output.push_str(&format!("- Auto-generated: {}\n", track.is_generated));
650
651 if let Some(description) = metadata.description.as_deref() {
652 output.push_str("\n## Description\n");
653 output.push_str(description.trim());
654 output.push('\n');
655 }
656
657 output.push_str("\n## Transcript\n");
658 for segment in segments {
659 output.push_str(&format!(
660 "[{}] {}\n",
661 format_timestamp(segment.start_ms),
662 segment.text
663 ));
664 }
665
666 output.trim().to_string()
667}
668
669fn format_metadata_only_context(
670 video_id: &str,
671 canonical_url: &str,
672 metadata: &VideoMetadata,
673) -> String {
674 let mut output = String::new();
675 output.push_str("# YouTube Video Context\n\n");
676 output.push_str("## Source\n");
677 output.push_str(&format!("- URL: {canonical_url}\n"));
678 output.push_str(&format!("- Video ID: {video_id}\n"));
679 push_optional(&mut output, "- Title", metadata.title.as_deref());
680 push_optional(&mut output, "- Channel", metadata.author.as_deref());
681 push_optional(&mut output, "- Channel ID", metadata.channel_id.as_deref());
682 push_optional(
683 &mut output,
684 "- Duration seconds",
685 metadata.duration_seconds.as_deref(),
686 );
687 push_optional(&mut output, "- Views", metadata.view_count.as_deref());
688 push_optional(&mut output, "- Published", metadata.publish_date.as_deref());
689 push_optional(&mut output, "- Uploaded", metadata.upload_date.as_deref());
690
691 if let Some(description) = metadata.description.as_deref() {
692 output.push_str("\n## Description\n");
693 output.push_str(description.trim());
694 output.push('\n');
695 }
696
697 output.push_str("\n## Transcript\n");
698 output.push_str("Transcript unavailable. YouTube metadata was extracted, but no usable caption/transcript body was available for this video.\n");
699 output.trim().to_string()
700}
701
702fn build_diagnostics(
703 tracks: &[CaptionTrack],
704 selected_track: &CaptionTrack,
705 segment_count: usize,
706 source_client: &str,
707) -> Vec<String> {
708 vec![
709 "YouTube extraction used native HTTP transcript path; no video/audio was downloaded."
710 .to_string(),
711 format!(
712 "Selected caption track from {source_client}: {} ({}, auto-generated: {}).",
713 selected_track.name, selected_track.language_code, selected_track.is_generated
714 ),
715 format!(
716 "Found {} caption track(s), extracted {} transcript segment(s).",
717 tracks.len(),
718 segment_count
719 ),
720 ]
721}
722
/// Builds the diagnostic lines for a metadata-only result.
///
/// Two fixed lines are always present; a third line quoting `reason` is appended when a
/// reason is given and is not blank.
fn build_metadata_only_diagnostics(reason: Option<&str>) -> Vec<String> {
    let reason_line = reason
        .filter(|text| !text.trim().is_empty())
        .map(|text| format!("Transcript unavailable reason: {text}"));

    let mut lines = vec![
        String::from(
            "YouTube extraction used native HTTP metadata path; no video/audio was downloaded.",
        ),
        String::from("Transcript unavailable; returning metadata-only YouTube context."),
    ];
    lines.extend(reason_line);
    lines
}
734
/// Appends `"<label>: <value>\n"` to `output`, with the value trimmed. Nothing is appended
/// when the value is absent or blank.
fn push_optional(output: &mut String, label: &str, value: Option<&str>) {
    let Some(trimmed) = value.map(str::trim).filter(|text| !text.is_empty()) else {
        return;
    };

    output.push_str(label);
    output.push_str(": ");
    output.push_str(trimmed);
    output.push('\n');
}
740
/// Formats a millisecond offset as `MM:SS`, or as `HH:MM:SS` once it reaches one hour.
/// Fractions of a second are discarded.
fn format_timestamp(ms: u64) -> String {
    let total_seconds = ms / 1000;
    let (hours, minutes, seconds) = (
        total_seconds / 3600,
        total_seconds / 60 % 60,
        total_seconds % 60,
    );

    match hours {
        0 => format!("{minutes:02}:{seconds:02}"),
        _ => format!("{hours:02}:{minutes:02}:{seconds:02}"),
    }
}
753
754fn string_at(value: &Value, key: &str) -> Option<String> {
755 value
756 .get(key)
757 .and_then(Value::as_str)
758 .filter(|value| !value.is_empty())
759 .map(str::to_string)
760}
761
762fn text_runs_at(value: &Value, key: &str) -> Option<String> {
763 let text_value = value.get(key)?;
764 if let Some(simple_text) = text_value.get("simpleText").and_then(Value::as_str) {
765 return Some(simple_text.to_string());
766 }
767 let runs = text_value.get("runs")?.as_array()?;
768 let text = runs
769 .iter()
770 .filter_map(|run| run.get("text").and_then(Value::as_str))
771 .collect::<String>();
772 (!text.is_empty()).then_some(text)
773}
774
/// Errors produced while fetching or extracting YouTube content.
///
/// Implements [`std::fmt::Display`] for user-facing messages and [`std::error::Error`] so it
/// can be boxed or propagated through generic error handling.
#[derive(Debug)]
pub enum YouTubeError {
    /// The input could not be parsed as a URL; carries the parser message.
    InvalidUrl(String),
    /// The URL is valid but no video id could be derived from it.
    UnsupportedUrl,
    /// A network request failed; carries the underlying message.
    Fetch(String),
    /// The watch page returned a non-success status (code and reason phrase).
    HttpStatus(u16, String),
    /// The player response marker or its JSON object was not found in the page.
    PlayerResponseMissing,
    /// The player response JSON object was never closed.
    PlayerResponseUnterminated,
    /// The player response was not valid JSON; carries the parser message.
    PlayerResponseParse(String),
    /// The player API returned a non-success status.
    PlayerApiHttpStatus(u16),
    /// The player API reported `LOGIN_REQUIRED`; carries the reported reason.
    PlayerApiLoginRequired(String),
    /// No visitor data was available.
    VisitorDataMissing,
    /// The player response lists no caption tracks.
    CaptionTracksMissing,
    /// Caption tracks were listed but none could be used.
    NoUsableCaptionTrack,
    /// A caption track URL could not be parsed; carries the parser message.
    InvalidCaptionUrl(String),
    /// The transcript request returned a non-success status.
    TranscriptHttpStatus(u16),
    /// The transcript body could not be read or parsed; carries the message.
    TranscriptParse(String),
    /// The transcript body was blank.
    TranscriptEmpty,
}

impl std::fmt::Display for YouTubeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidUrl(msg) => write!(f, "Invalid YouTube URL: {msg}"),
            Self::UnsupportedUrl => write!(f, "Unsupported YouTube URL"),
            Self::Fetch(msg) => write!(f, "YouTube fetch failed: {msg}"),
            Self::HttpStatus(code, reason) => write!(f, "YouTube returned HTTP {code} {reason}"),
            Self::PlayerResponseMissing => write!(f, "YouTube player response not found"),
            Self::PlayerResponseUnterminated => {
                write!(f, "YouTube player response JSON was unterminated")
            }
            Self::PlayerResponseParse(msg) => {
                write!(f, "YouTube player response parse failed: {msg}")
            }
            Self::PlayerApiHttpStatus(code) => write!(f, "YouTube player API returned HTTP {code}"),
            Self::PlayerApiLoginRequired(reason) => {
                write!(f, "YouTube player API required login: {reason}")
            }
            Self::VisitorDataMissing => write!(f, "YouTube visitor data was unavailable"),
            Self::CaptionTracksMissing => {
                write!(f, "YouTube captions are unavailable for this video")
            }
            Self::NoUsableCaptionTrack => write!(f, "No usable YouTube caption track found"),
            Self::InvalidCaptionUrl(msg) => write!(f, "Invalid YouTube caption URL: {msg}"),
            Self::TranscriptHttpStatus(code) => {
                write!(f, "YouTube transcript returned HTTP {code}")
            }
            Self::TranscriptParse(msg) => write!(f, "YouTube transcript parse failed: {msg}"),
            Self::TranscriptEmpty => write!(
                f,
                "YouTube transcript was empty; caption track metadata was found but YouTube returned no caption body for this client"
            ),
        }
    }
}

// The variants only carry message strings, so there is no underlying source to expose.
impl std::error::Error for YouTubeError {}
830
/// Unit tests for the pure parsing and formatting helpers, plus one ignored network smoke
/// test.
#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn youtube_extracts_video_ids_from_common_urls() {
        let cases = [
            ("https://www.youtube.com/watch?v=McO_xcf4IYw", "McO_xcf4IYw"),
            ("https://youtu.be/McO_xcf4IYw?t=12", "McO_xcf4IYw"),
            ("https://www.youtube.com/shorts/McO_xcf4IYw", "McO_xcf4IYw"),
            ("https://www.youtube.com/embed/McO_xcf4IYw", "McO_xcf4IYw"),
        ];

        for (url, expected) in cases {
            let parsed = Url::parse(url).unwrap();
            assert_eq!(extract_video_id(&parsed).unwrap().as_str(), expected);
        }
    }

    #[test]
    fn youtube_rejects_invalid_video_ids() {
        let parsed =
            Url::parse("https://www.youtube.com/watch?v=not-valid-because-too-long").unwrap();
        assert!(extract_video_id(&parsed).is_none());
    }

    #[test]
    fn youtube_extracts_balanced_player_response() {
        let html = r#"<script>var ytInitialPlayerResponse = {"videoDetails":{"title":"A } in string","shortDescription":"escaped \" brace }"},"captions":{}};</script>"#;
        let response = extract_initial_player_response(html).unwrap();
        assert_eq!(response["videoDetails"]["title"], "A } in string");
        assert_eq!(
            response["videoDetails"]["shortDescription"],
            "escaped \" brace }"
        );
    }

    #[test]
    fn youtube_extracts_visitor_data() {
        let html = r#"ytcfg.set({"VISITOR_DATA":"visitor-token","other":true});"#;
        assert_eq!(extract_visitor_data(html).as_deref(), Some("visitor-token"));
    }

    #[test]
    fn youtube_selects_manual_english_before_auto_english() {
        let tracks = vec![
            CaptionTrack {
                base_url: "https://example.com/fr".into(),
                language_code: "fr".into(),
                name: "French".into(),
                is_generated: false,
            },
            CaptionTrack {
                base_url: "https://example.com/en-auto".into(),
                language_code: "en".into(),
                name: "English auto".into(),
                is_generated: true,
            },
            CaptionTrack {
                base_url: "https://example.com/en".into(),
                language_code: "en".into(),
                name: "English".into(),
                is_generated: false,
            },
        ];

        let selected = select_caption_track(&tracks).unwrap();
        assert_eq!(selected.base_url, "https://example.com/en");
    }

    #[test]
    fn youtube_parses_json3_transcript_segments() {
        let transcript = json!({
            "events": [
                {"tStartMs": 0, "dDurationMs": 1000, "segs": [{"utf8": "Hello "}, {"utf8": "world"}]},
                {"tStartMs": 1000, "segs": [{"utf8": "\n"}]},
                {"tStartMs": 2000, "segs": [{"utf8": "[Music]"}]},
                {"tStartMs": 3000, "dDurationMs": 500, "segs": [{"utf8": "next line"}]}
            ]
        });

        let segments = parse_json3_transcript(&transcript);
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].text, "Hello world");
        assert_eq!(segments[0].start_ms, 0);
        assert_eq!(segments[1].text, "next line");
    }

    #[test]
    fn youtube_parses_xml_transcript_segments() {
        // The ampersand is written as `&amp;`, as well-formed XML requires, so the fixture
        // exercises entity decoding rather than malformed input.
        let transcript = r#"<?xml version="1.0" ?><timedtext><body><p t="1000" d="2000">Hello &amp; <s>world</s></p><p t="3000" d="1000">[Music]</p></body></timedtext>"#;
        let segments = parse_xml_transcript(transcript);
        assert_eq!(segments.len(), 1);
        assert_eq!(segments[0].start_ms, 1000);
        assert_eq!(segments[0].duration_ms, Some(2000));
        assert_eq!(segments[0].text, "Hello & world");
    }

    #[test]
    fn youtube_adds_json3_format_to_caption_url() {
        let url =
            caption_url_with_json3("https://www.youtube.com/api/timedtext?v=abc&lang=en").unwrap();
        assert!(url.contains("fmt=json3"));
    }

    #[test]
    fn youtube_formats_metadata_only_context_with_clear_transcript_marker() {
        let metadata = VideoMetadata {
            title: Some("No captions example".into()),
            author: Some("Example Channel".into()),
            ..VideoMetadata::default()
        };
        let text = format_metadata_only_context(
            "McO_xcf4IYw",
            "https://www.youtube.com/watch?v=McO_xcf4IYw",
            &metadata,
        );
        assert!(text.contains("No captions example"));
        assert!(text.contains("Transcript unavailable"));
    }

    #[test]
    fn youtube_metadata_only_diagnostics_include_reason() {
        let diagnostics = build_metadata_only_diagnostics(Some("captions disabled"));
        assert!(diagnostics
            .iter()
            .any(|line| line.contains("metadata-only")));
        assert!(diagnostics
            .iter()
            .any(|line| line.contains("captions disabled")));
    }

    #[tokio::test]
    #[ignore = "network smoke test for YouTube extraction"]
    async fn youtube_reads_sample_video_over_http() {
        let client = reqwest::Client::builder()
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .unwrap();
        let page = fetch_and_extract(&client, "https://www.youtube.com/watch?v=McO_xcf4IYw")
            .await
            .unwrap();
        assert!(page.text.contains("# YouTube Video Context"));
        assert!(page.text.contains("- Video ID: McO_xcf4IYw"));
        assert!(page.text.contains("## Transcript"));
        assert!(page.content_length > 1000);
    }
}