ydl/
extractor.rs

1use crate::error::{YdlError, YdlResult};
2use crate::types::{PlayerResponse, SubtitleTrack, SubtitleTrackType, VideoMetadata, YdlOptions};
3use crate::youtube_client::YouTubeSubtitleExtractor;
4use reqwest::Client;
5use std::collections::HashMap;
6use std::time::Duration;
7use tracing::{debug, info};
8
/// YouTube subtitle extractor for discovering and downloading subtitles
///
/// Combines a general-purpose HTTP client (used for page scraping and direct
/// subtitle downloads) with an InnerTube-based client that is tried first for
/// both discovery and download.
pub struct SubtitleExtractor {
    /// HTTP client used for watch-page, mobile-page, API and direct subtitle requests.
    client: Client,
    /// Options supplied at construction; consulted when filtering and selecting tracks.
    options: YdlOptions,
    /// InnerTube client, tried before any of the scraping fallbacks.
    youtube_client: YouTubeSubtitleExtractor,
}
15
16impl SubtitleExtractor {
17    /// Create a new subtitle extractor
18    pub fn new(options: YdlOptions) -> YdlResult<Self> {
19        let mut headers = reqwest::header::HeaderMap::new();
20
21        // Set a realistic User-Agent
22        let user_agent = options.user_agent.as_deref().unwrap_or(
23            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
24        );
25        headers.insert(
26            reqwest::header::USER_AGENT,
27            reqwest::header::HeaderValue::from_str(user_agent).map_err(|_| {
28                YdlError::Configuration {
29                    message: "Invalid user agent".to_string(),
30                }
31            })?,
32        );
33
34        // Set other headers to mimic a real browser
35        headers.insert(
36            reqwest::header::ACCEPT,
37            reqwest::header::HeaderValue::from_static(
38                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
39            ),
40        );
41        headers.insert(
42            reqwest::header::ACCEPT_LANGUAGE,
43            reqwest::header::HeaderValue::from_static("en-US,en;q=0.5"),
44        );
45        // Remove Accept-Encoding to get uncompressed response
46        // reqwest will handle compression automatically if we don't set this
47
48        let mut client_builder = Client::builder()
49            .default_headers(headers)
50            .timeout(Duration::from_secs(options.timeout_seconds))
51            .redirect(reqwest::redirect::Policy::limited(10));
52
53        // Add proxy if specified
54        if let Some(proxy_url) = &options.proxy {
55            let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| YdlError::Configuration {
56                message: format!("Invalid proxy URL: {}", e),
57            })?;
58            client_builder = client_builder.proxy(proxy);
59        }
60
61        let client = client_builder
62            .build()
63            .map_err(|e| YdlError::Configuration {
64                message: format!("Failed to create HTTP client: {}", e),
65            })?;
66
67        let youtube_client = YouTubeSubtitleExtractor::new()?;
68
69        Ok(Self {
70            client,
71            options,
72            youtube_client,
73        })
74    }
75
76    /// Discover available subtitle tracks for a video
77    pub async fn discover_tracks(&self, video_id: &str) -> YdlResult<Vec<SubtitleTrack>> {
78        info!("Discovering subtitle tracks for video: {}", video_id);
79
80        // Try different methods to find subtitles
81        let mut tracks = Vec::new();
82
83        // Method 1: Try InnerTube API first (most reliable)
84        if let Ok(innertube_tracks) = self.youtube_client.discover_tracks(video_id).await {
85            info!("Found {} tracks via InnerTube API", innertube_tracks.len());
86            tracks.extend(innertube_tracks);
87        }
88
89        // Method 2: Try to get from watch page as fallback
90        if tracks.is_empty()
91            && let Ok(page_tracks) = self.discover_from_watch_page(video_id).await
92        {
93            tracks.extend(page_tracks);
94        }
95
96        // Method 3: Try mobile endpoint if no tracks found
97        if tracks.is_empty()
98            && let Ok(mobile_tracks) = self.discover_from_mobile_page(video_id).await
99        {
100            tracks.extend(mobile_tracks);
101        }
102
103        // Method 4: Try direct API approach
104        if tracks.is_empty()
105            && let Ok(api_tracks) = self.discover_from_api(video_id).await
106        {
107            tracks.extend(api_tracks);
108        }
109
110        // Filter based on options
111        self.filter_tracks(tracks, video_id)
112    }
113
114    /// Get video metadata including available subtitles
115    pub async fn get_video_metadata(&self, video_id: &str) -> YdlResult<VideoMetadata> {
116        info!("Getting video metadata for: {}", video_id);
117
118        let url = format!("https://www.youtube.com/watch?v={}", video_id);
119        let response = self.client.get(&url).send().await?;
120
121        if !response.status().is_success() {
122            return Err(self.map_http_error(response.status(), video_id));
123        }
124
125        let html = response.text().await?;
126
127        // Extract basic video info and player response
128        let title = self.extract_video_title(&html)?;
129        let player_response = self.extract_player_response(&html)?;
130
131        let mut metadata = VideoMetadata::new(video_id.to_string(), title);
132
133        // Extract duration if available
134        if let Some(video_details) = &player_response.video_details
135            && let Some(length_str) = &video_details.length_seconds
136            && let Ok(length) = length_str.parse::<u64>()
137        {
138            metadata = metadata.with_duration(Duration::from_secs(length));
139        }
140
141        // Get available subtitles
142        let tracks = self.discover_tracks(video_id).await?;
143        metadata = metadata.with_subtitles(tracks);
144
145        Ok(metadata)
146    }
147
148    /// Download subtitle content from a track
149    pub async fn download_content(
150        &self,
151        track: &SubtitleTrack,
152        video_id: &str,
153    ) -> YdlResult<String> {
154        // If we have a URL from the track, try to use it
155        if let Some(base_url) = &track.url {
156            // First try with the InnerTube client (which handles authentication better)
157            info!("Downloading subtitle content via InnerTube client");
158            match self.youtube_client.download_content(base_url).await {
159                Ok(content) if !content.is_empty() => {
160                    debug!(
161                        "Downloaded {} bytes of subtitle content via InnerTube",
162                        content.len()
163                    );
164
165                    // Save to file for debugging
166                    #[cfg(debug_assertions)]
167                    {
168                        use std::fs;
169                        let _ = fs::write("/tmp/subtitle_content.xml", &content);
170                        debug!("Saved subtitle content to /tmp/subtitle_content.xml for debugging");
171                    }
172
173                    return Ok(content);
174                }
175                Err(e) => {
176                    debug!("InnerTube download failed: {}, trying direct download", e);
177                }
178                _ => {}
179            }
180
181            // Fallback to direct download
182            // Add format parameter - srv3 is YouTube's XML format that works well
183            let url = if base_url.contains("fmt=") {
184                base_url.clone()
185            } else {
186                let separator = if base_url.contains('?') { "&" } else { "?" };
187                format!("{}{separator}fmt=srv3", base_url)
188            };
189
190            info!("Trying direct download from: {}", url);
191            let response = self.client.get(&url).send().await?;
192
193            if response.status().is_success() {
194                let content = response.text().await?;
195                if !content.is_empty() {
196                    debug!("Downloaded {} bytes of subtitle content", content.len());
197                    return Ok(content);
198                }
199            }
200        }
201
202        // Fallback: construct a simple subtitle URL
203        // This works for many videos that have auto-generated subtitles
204        let fallback_url = format!(
205            "https://www.youtube.com/api/timedtext?v={}&lang={}&fmt=srv3",
206            video_id, track.language_code
207        );
208
209        info!("Trying fallback subtitle URL: {}", fallback_url);
210        let response = self.client.get(&fallback_url).send().await?;
211
212        if !response.status().is_success() {
213            return Err(YdlError::SubtitleDiscoveryError {
214                message: format!("HTTP {}: Failed to download subtitles", response.status()),
215            });
216        }
217
218        let content = response.text().await?;
219        debug!("Downloaded {} bytes of subtitle content", content.len());
220
221        if content.is_empty() {
222            return Err(YdlError::SubtitleParsing {
223                message: "Empty subtitle content received".to_string(),
224            });
225        }
226
227        debug!(
228            "Subtitle content preview (first 500 chars): {}",
229            &content.chars().take(500).collect::<String>()
230        );
231
232        Ok(content)
233    }
234
235    /// Discover subtitles from the main watch page
236    async fn discover_from_watch_page(&self, video_id: &str) -> YdlResult<Vec<SubtitleTrack>> {
237        debug!("Trying to discover subtitles from watch page");
238
239        let url = format!("https://www.youtube.com/watch?v={}", video_id);
240        let response = self.client.get(&url).send().await?;
241
242        if !response.status().is_success() {
243            return Err(self.map_http_error(response.status(), video_id));
244        }
245
246        let html = response.text().await?;
247
248        // Debug: save HTML to file for inspection
249        #[cfg(debug_assertions)]
250        {
251            use std::fs;
252            let _ = fs::write("/tmp/youtube_watch_page.html", &html);
253            debug!("Saved HTML to /tmp/youtube_watch_page.html for debugging");
254        }
255
256        let player_response = self.extract_player_response(&html)?;
257
258        // Extract tracks but construct simpler URLs that work
259        let mut tracks = Vec::new();
260        if let Some(captions) = &player_response.captions
261            && let Some(tracklist) = &captions.player_captions_tracklist_renderer
262            && let Some(caption_tracks) = &tracklist.caption_tracks
263        {
264            for track in caption_tracks {
265                // Instead of using the base_url from player response (which needs auth),
266                // construct a simple URL that often works for public videos
267                let simple_url = format!(
268                    "https://www.youtube.com/api/timedtext?v={}&lang={}",
269                    video_id, track.language_code
270                );
271
272                let language_name = track
273                    .name
274                    .as_ref()
275                    .and_then(|n| {
276                        n.simple_text.as_deref().or_else(|| {
277                            n.runs
278                                .as_ref()
279                                .and_then(|runs| runs.first().map(|r| r.text.as_str()))
280                        })
281                    })
282                    .unwrap_or(&track.language_code);
283
284                let track_type = if track.kind == Some("asr".to_string()) {
285                    SubtitleTrackType::AutoGenerated
286                } else {
287                    SubtitleTrackType::Manual
288                };
289
290                let subtitle_track = SubtitleTrack::new(
291                    track.language_code.clone(),
292                    language_name.to_string(),
293                    track_type,
294                )
295                .with_url(simple_url)
296                .with_translatable(track.is_translatable.unwrap_or(false));
297
298                tracks.push(subtitle_track);
299            }
300        }
301
302        if tracks.is_empty() {
303            // Fallback to the original method if our simple approach doesn't work
304            self.extract_tracks_from_player_response(&player_response, video_id)
305        } else {
306            Ok(tracks)
307        }
308    }
309
310    /// Discover subtitles from mobile endpoint
311    async fn discover_from_mobile_page(&self, video_id: &str) -> YdlResult<Vec<SubtitleTrack>> {
312        debug!("Trying to discover subtitles from mobile page");
313
314        let url = format!("https://m.youtube.com/watch?v={}", video_id);
315        let response = self.client.get(&url).send().await?;
316
317        if !response.status().is_success() {
318            return Err(self.map_http_error(response.status(), video_id));
319        }
320
321        let html = response.text().await?;
322        let player_response = self.extract_player_response(&html)?;
323
324        self.extract_tracks_from_player_response(&player_response, video_id)
325    }
326
327    /// Discover subtitles using direct API approach
328    async fn discover_from_api(&self, video_id: &str) -> YdlResult<Vec<SubtitleTrack>> {
329        debug!("Trying to discover subtitles from API");
330
331        // Try the get_video_info endpoint
332        let url = format!(
333            "https://www.youtube.com/get_video_info?video_id={}&el=detailpage&ps=default&eurl=&gl=US&hl=en",
334            video_id
335        );
336
337        let response = self.client.get(&url).send().await?;
338
339        if !response.status().is_success() {
340            return Err(YdlError::SubtitleDiscoveryError {
341                message: "Failed to fetch video info".to_string(),
342            });
343        }
344
345        let content = response.text().await?;
346
347        // Parse URL-encoded response
348        let params: HashMap<String, String> = url::form_urlencoded::parse(content.as_bytes())
349            .into_owned()
350            .collect();
351
352        if let Some(player_response_str) = params.get("player_response")
353            && let Ok(player_response) = serde_json::from_str::<PlayerResponse>(player_response_str)
354        {
355            return self.extract_tracks_from_player_response(&player_response, video_id);
356        }
357
358        Err(YdlError::SubtitleDiscoveryError {
359            message: "No player response found in API response".to_string(),
360        })
361    }
362
363    /// Extract player response JSON from HTML
364    fn extract_player_response(&self, html: &str) -> YdlResult<PlayerResponse> {
365        debug!(
366            "Attempting to extract player response from HTML (length: {})",
367            html.len()
368        );
369
370        // Look for ytInitialPlayerResponse (with or without var)
371        let patterns = [
372            "var ytInitialPlayerResponse = ",
373            "ytInitialPlayerResponse = ",
374        ];
375        for pattern in &patterns {
376            debug!("Searching for pattern: {}", pattern);
377            if let Some(start) = html.find(pattern) {
378                debug!("Found pattern at position {}", start);
379                let json_start = start + pattern.len();
380                // Look for the end of the JSON object - it should end with };
381                if let Some(json_end) = html[json_start..].find("};") {
382                    // Include the closing brace but not the semicolon
383                    let json_str = &html[json_start..json_start + json_end + 1];
384                    debug!("Found ytInitialPlayerResponse, attempting to parse");
385                    match serde_json::from_str::<PlayerResponse>(json_str) {
386                        Ok(player_response) => {
387                            debug!("Successfully parsed player response");
388                            if let Some(_captions) = &player_response.captions {
389                                debug!("Player response has captions field");
390                            } else {
391                                debug!("Player response has NO captions field");
392                            }
393                            return Ok(player_response);
394                        }
395                        Err(e) => {
396                            debug!("Failed to parse player response: {}", e);
397                        }
398                    }
399                }
400            }
401        }
402
403        // Alternative pattern
404        if let Some(start) = html.find("\"PLAYER_RESPONSE\":\"") {
405            let json_start = start + "\"PLAYER_RESPONSE\":\"".len();
406            if let Some(json_end) = html[json_start..].find("\",\"") {
407                let escaped_json = &html[json_start..json_start + json_end];
408                // Unescape the JSON
409                let unescaped = escaped_json.replace("\\\"", "\"").replace("\\\\", "\\");
410                if let Ok(player_response) = serde_json::from_str::<PlayerResponse>(&unescaped) {
411                    return Ok(player_response);
412                }
413            }
414        }
415
416        Err(YdlError::MetadataParsingError {
417            message: "Could not find player response in HTML".to_string(),
418        })
419    }
420
421    /// Extract video title from HTML
422    fn extract_video_title(&self, html: &str) -> YdlResult<String> {
423        // Try to find title in various places
424        if let Some(start) = html.find("<title>")
425            && let Some(end) = html[start..].find("</title>")
426        {
427            let title = &html[start + 7..start + end];
428            // Remove " - YouTube" suffix if present
429            let clean_title = title.replace(" - YouTube", "");
430            return Ok(clean_title);
431        }
432
433        // Fallback: try to find in JSON
434        if let Some(start) = html.find("\"title\":\"") {
435            let title_start = start + 9;
436            if let Some(title_end) = html[title_start..].find("\"") {
437                let title = &html[title_start..title_start + title_end];
438                return Ok(title.to_string());
439            }
440        }
441
442        Ok("Unknown Title".to_string())
443    }
444
445    /// Extract subtitle tracks from player response
446    fn extract_tracks_from_player_response(
447        &self,
448        player_response: &PlayerResponse,
449        video_id: &str,
450    ) -> YdlResult<Vec<SubtitleTrack>> {
451        let mut tracks = Vec::new();
452
453        debug!("Extracting tracks from player response");
454        if let Some(captions) = &player_response.captions {
455            debug!("Found captions in player response");
456            if let Some(tracklist) = &captions.player_captions_tracklist_renderer {
457                debug!("Found tracklist renderer");
458                if let Some(caption_tracks) = &tracklist.caption_tracks {
459                    debug!("Found {} caption tracks", caption_tracks.len());
460                    for track in caption_tracks {
461                        let language_name = track
462                            .name
463                            .as_ref()
464                            .and_then(|n| {
465                                n.simple_text.as_deref().or_else(|| {
466                                    n.runs
467                                        .as_ref()
468                                        .and_then(|runs| runs.first().map(|r| r.text.as_str()))
469                                })
470                            })
471                            .unwrap_or(&track.language_code)
472                            .to_string();
473
474                        // Determine track type based on kind or vss_id
475                        let track_type = if track.kind.as_deref() == Some("asr") {
476                            SubtitleTrackType::AutoGenerated
477                        } else {
478                            SubtitleTrackType::Manual
479                        };
480
481                        debug!(
482                            "Found subtitle track: lang={}, name={}, type={:?}, has_url={}",
483                            track.language_code,
484                            &language_name,
485                            &track_type,
486                            !track.base_url.is_empty()
487                        );
488
489                        let subtitle_track = SubtitleTrack::new(
490                            track.language_code.clone(),
491                            language_name,
492                            track_type,
493                        )
494                        .with_url(track.base_url.clone())
495                        .with_translatable(track.is_translatable.unwrap_or(false));
496
497                        tracks.push(subtitle_track);
498                    }
499                }
500            }
501        }
502
503        if tracks.is_empty() {
504            Err(YdlError::NoSubtitlesAvailable {
505                video_id: video_id.to_string(),
506            })
507        } else {
508            Ok(tracks)
509        }
510    }
511
512    /// Filter tracks based on options
513    fn filter_tracks(
514        &self,
515        tracks: Vec<SubtitleTrack>,
516        video_id: &str,
517    ) -> YdlResult<Vec<SubtitleTrack>> {
518        if tracks.is_empty() {
519            return Err(YdlError::NoSubtitlesAvailable {
520                video_id: video_id.to_string(),
521            });
522        }
523
524        let mut filtered = tracks;
525
526        // Filter by language preference
527        if let Some(preferred_lang) = &self.options.language {
528            let lang_matches: Vec<_> = filtered
529                .iter()
530                .filter(|track| track.language_code == *preferred_lang)
531                .cloned()
532                .collect();
533
534            if !lang_matches.is_empty() {
535                filtered = lang_matches;
536            }
537        }
538
539        // Filter by track type preferences
540        if !self.options.allow_auto_generated {
541            filtered.retain(|track| track.track_type != SubtitleTrackType::AutoGenerated);
542        }
543
544        // Prefer manual subtitles if requested
545        if self.options.prefer_manual {
546            let manual_tracks: Vec<_> = filtered
547                .iter()
548                .filter(|track| track.track_type == SubtitleTrackType::Manual)
549                .cloned()
550                .collect();
551
552            if !manual_tracks.is_empty() {
553                filtered = manual_tracks;
554            }
555        }
556
557        if filtered.is_empty() {
558            // Check if we filtered out everything due to preferences
559            if !self.options.allow_auto_generated {
560                return Err(YdlError::OnlyAutoGenerated {
561                    video_id: video_id.to_string(),
562                });
563            }
564            return Err(YdlError::NoSubtitlesAvailable {
565                video_id: video_id.to_string(),
566            });
567        }
568
569        Ok(filtered)
570    }
571
572    /// Map HTTP status codes to appropriate errors
573    fn map_http_error(&self, status: reqwest::StatusCode, video_id: &str) -> YdlError {
574        match status.as_u16() {
575            404 => YdlError::VideoNotFound {
576                video_id: video_id.to_string(),
577            },
578            403 => YdlError::VideoRestricted {
579                video_id: video_id.to_string(),
580            },
581            429 => YdlError::RateLimited { retry_after: 60 },
582            503 => YdlError::ServiceUnavailable,
583            _ => YdlError::SubtitleDiscoveryError {
584                message: format!("HTTP {} error", status),
585            },
586        }
587    }
588
589    /// Select the best subtitle track based on preferences
590    pub fn select_best_track<'a>(
591        &'a self,
592        tracks: &'a [SubtitleTrack],
593    ) -> Option<&'a SubtitleTrack> {
594        if tracks.is_empty() {
595            return None;
596        }
597
598        // If language is specified, prefer that, but also consider manual preference
599        if let Some(preferred_lang) = &self.options.language {
600            // First try to find a manual track in the preferred language
601            if self.options.prefer_manual
602                && let Some(track) = tracks.iter().find(|t| {
603                    t.language_code == *preferred_lang && t.track_type == SubtitleTrackType::Manual
604                })
605            {
606                return Some(track);
607            }
608
609            // Then try any track in the preferred language
610            if let Some(track) = tracks.iter().find(|t| t.language_code == *preferred_lang) {
611                return Some(track);
612            }
613        }
614
615        // Prefer manual over auto-generated (for any language)
616        if self.options.prefer_manual
617            && let Some(manual) = tracks
618                .iter()
619                .find(|t| t.track_type == SubtitleTrackType::Manual)
620        {
621            return Some(manual);
622        }
623
624        // Fall back to first available track
625        tracks.first()
626    }
627}
628
#[cfg(test)]
mod tests {
    use super::*;

    /// Options with a short timeout, shared by most tests.
    fn test_options() -> YdlOptions {
        YdlOptions::new().timeout(10)
    }

    /// Shorthand for a track without URL.
    fn track(code: &str, name: &str, kind: SubtitleTrackType) -> SubtitleTrack {
        SubtitleTrack::new(code.to_string(), name.to_string(), kind)
    }

    #[tokio::test]
    async fn test_extractor_creation() {
        assert!(SubtitleExtractor::new(test_options()).is_ok());
    }

    #[test]
    fn test_extract_video_title() {
        let extractor = SubtitleExtractor::new(test_options()).unwrap();

        let html = r"
        <html>
        <head>
            <title>Test Video - YouTube</title>
        </head>
        <body></body>
        </html>
        ";

        let title = extractor.extract_video_title(html);
        assert!(title.is_ok());
        assert_eq!(title.unwrap(), "Test Video");
    }

    #[test]
    fn test_filter_tracks() {
        let extractor = SubtitleExtractor::new(test_options()).unwrap();

        let tracks = vec![
            track("en", "English", SubtitleTrackType::Manual),
            track("en", "English (auto)", SubtitleTrackType::AutoGenerated),
            track("es", "Spanish", SubtitleTrackType::Manual),
        ];

        let filtered = extractor.filter_tracks(tracks, "test_video_id");
        assert!(filtered.is_ok());

        let result = filtered.unwrap();
        assert!(!result.is_empty());
        // Should prefer manual tracks by default
        let has_manual = result
            .iter()
            .any(|t| t.track_type == SubtitleTrackType::Manual);
        assert!(has_manual);
    }

    #[test]
    fn test_select_best_track() {
        let options = YdlOptions::new().language("en").prefer_manual(true);
        let extractor = SubtitleExtractor::new(options).unwrap();

        // The wanted track is deliberately last so that ordering alone
        // cannot make the test pass.
        let tracks = vec![
            track("es", "Spanish", SubtitleTrackType::Manual),
            track("en", "English (auto)", SubtitleTrackType::AutoGenerated),
            track("en", "English", SubtitleTrackType::Manual),
        ];

        let best = extractor.select_best_track(&tracks);
        assert!(best.is_some());

        let selected = best.unwrap();
        assert_eq!(selected.language_code, "en");
        assert_eq!(selected.track_type, SubtitleTrackType::Manual);
    }

    #[test]
    fn test_map_http_error() {
        let extractor = SubtitleExtractor::new(test_options()).unwrap();
        let video_id = "test123";

        let not_found = extractor.map_http_error(reqwest::StatusCode::NOT_FOUND, video_id);
        assert!(matches!(not_found, YdlError::VideoNotFound { .. }));

        let forbidden = extractor.map_http_error(reqwest::StatusCode::FORBIDDEN, video_id);
        assert!(matches!(forbidden, YdlError::VideoRestricted { .. }));

        let too_many = extractor.map_http_error(reqwest::StatusCode::TOO_MANY_REQUESTS, video_id);
        assert!(matches!(too_many, YdlError::RateLimited { .. }));
    }
}