yt_transcript_rs/
video_data_fetcher.rs

1use reqwest::Client;
2
3use crate::captions_extractor::CaptionsExtractor;
4use crate::errors::CouldNotRetrieveTranscript;
5use crate::js_var_parser::JsVarParser;
6use crate::microformat_extractor::MicroformatExtractor;
7use crate::models::{MicroformatData, StreamingData, VideoDetails, VideoInfos};
8use crate::playability_asserter::PlayabilityAsserter;
9use crate::streaming_data_extractor::StreamingDataExtractor;
10use crate::transcript_list::TranscriptList;
11use crate::video_details_extractor::VideoDetailsExtractor;
12use crate::youtube_page_fetcher::YoutubePageFetcher;
13
14/// # VideoDataFetcher
15///
16/// Core component responsible for fetching transcript data and video details from YouTube.
17///
18/// This struct handles the low-level communication with YouTube's web API to:
19/// - Fetch available transcripts for a video
20/// - Extract caption JSON data from YouTube pages
21/// - Retrieve detailed information about videos, including metadata
22///
23/// The VideoDataFetcher works by parsing YouTube's HTML and JavaScript variables
24/// to extract the necessary data, since YouTube doesn't provide a public API for transcripts.
25///
26/// ## Internal Architecture
27///
/// This component delegates to several helper types to process data:
29/// - `YoutubePageFetcher`: Handles HTTP requests to YouTube, including proxy support
30/// - `JsVarParser`: Extracts JavaScript variables from YouTube's HTML
31/// - `PlayabilityAsserter`: Verifies video availability and access permissions
32/// - `VideoDetailsExtractor`: Extracts detailed information from video data
pub struct VideoDataFetcher {
    /// The reqwest client handed to `new`.
    ///
    /// The field is public, so code outside this type can read or replace it. Note that
    /// `new` gives `page_fetcher` its own clone of the client, so replacing this field
    /// afterwards does not change the handle that `page_fetcher` was constructed with.
    pub client: Client,
    /// Page fetcher built in `new` from a clone of `client`. `fetch_player_response`
    /// uses it to download the video page HTML.
    page_fetcher: YoutubePageFetcher,
}
39
40impl VideoDataFetcher {
41    /// Creates a new VideoDataFetcher instance.
42    ///
43    /// # Parameters
44    ///
    /// * `client` - A configured reqwest HTTP client to use for requests. This is the only
    ///   argument: there is no separate proxy parameter, so any proxy settings are expected
    ///   to be configured on this client before it is passed in.
47    ///
48    /// # Returns
49    ///
50    /// A new VideoDataFetcher instance.
51    ///
52    /// # Example (internal usage)
53    ///
54    /// ```rust,no_run
55    /// # use reqwest::Client;
56    /// # use yt_transcript_rs::video_data_fetcher::VideoDataFetcher;
57    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
58    /// // Create a client
59    /// let client = Client::new();
60    /// // Create the fetcher
61    /// let fetcher = VideoDataFetcher::new(
62    ///     client
63    /// );
64    /// # Ok(())
65    /// # }
66    /// ```
67    pub fn new(client: Client) -> Self {
68        let page_fetcher = YoutubePageFetcher::new(client.clone());
69
70        Self {
71            client,
72            page_fetcher,
73        }
74    }
75
76    /// Fetches the list of available transcripts for a YouTube video.
77    ///
78    /// This method:
79    /// 1. Retrieves the video page HTML
80    /// 2. Extracts the captions JSON data
81    /// 3. Builds a TranscriptList from the extracted data
82    ///
83    /// # Parameters
84    ///
85    /// * `video_id` - The YouTube video ID (e.g., "dQw4w9WgXcQ")
86    ///
87    /// # Returns
88    ///
89    /// * `Result<TranscriptList, CouldNotRetrieveTranscript>` - A TranscriptList on success, or an error if retrieval fails
90    ///
91    /// # Errors
92    ///
93    /// This method can fail if:
94    /// - The video doesn't exist or is private
95    /// - The video has no available transcripts
96    /// - YouTube's HTML structure has changed and parsing fails
97    /// - Network errors occur during the request
98    ///
99    /// # Example (internal usage)
100    ///
101    /// ```rust,no_run
102    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
103    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
104    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
105    /// let video_id = "dQw4w9WgXcQ";
106    ///
107    /// // This internally calls VideoDataFetcher::fetch_transcript_list
108    /// let transcript_list = api.list_transcripts(video_id).await?;
109    /// # Ok(())
110    /// # }
111    /// ```
112    pub async fn fetch_transcript_list(
113        &self,
114        video_id: &str,
115    ) -> Result<TranscriptList, CouldNotRetrieveTranscript> {
116        // Get player response with playability check
117        let player_response = self.fetch_player_response(video_id, true).await?;
118
119        // Extract captions data and build transcript list
120        let video_captions = CaptionsExtractor::extract_captions_data(&player_response, video_id)?;
121
122        TranscriptList::build(video_id.to_string(), &video_captions)
123    }
124
125    /// Fetches detailed information about a YouTube video.
126    ///
127    /// This method retrieves comprehensive metadata about a video, including:
128    /// - Title, author, channel ID
129    /// - View count and video length
130    /// - Thumbnails in various resolutions
131    /// - Keywords and description
132    ///
133    /// # Parameters
134    ///
135    /// * `video_id` - The YouTube video ID
136    ///
137    /// # Returns
138    ///
139    /// * `Result<VideoDetails, CouldNotRetrieveTranscript>` - Video details on success, or an error
140    ///
141    /// # Errors
142    ///
143    /// Similar to transcript fetching, this can fail if:
144    /// - The video doesn't exist or is private
145    /// - YouTube's HTML structure has changed and parsing fails
146    /// - Network errors occur during the request
147    ///
148    /// # Example (internal usage)
149    ///
150    /// ```rust,no_run
151    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
152    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
153    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
154    /// let video_id = "dQw4w9WgXcQ";
155    ///
156    /// // This internally calls VideoDataFetcher::fetch_video_details
157    /// let details = api.fetch_video_details(video_id).await?;
158    ///
159    /// println!("Video title: {}", details.title);
160    /// println!("Author: {}", details.author);
161    /// # Ok(())
162    /// # }
163    /// ```
164    pub async fn fetch_video_details(
165        &self,
166        video_id: &str,
167    ) -> Result<VideoDetails, CouldNotRetrieveTranscript> {
168        // Get player response with playability check
169        let player_response = self.fetch_player_response(video_id, true).await?;
170
171        // Extract video details from player response
172        VideoDetailsExtractor::extract_video_details(&player_response, video_id)
173    }
174
175    /// Fetches microformat data for a YouTube video.
176    ///
177    /// This method retrieves additional metadata about a video, including:
178    /// - Available countries
179    /// - Category
180    /// - Embed information
181    /// - Information about whether the video is unlisted, family-safe, etc.
182    ///
183    /// # Parameters
184    ///
185    /// * `video_id` - The YouTube video ID
186    ///
187    /// # Returns
188    ///
189    /// * `Result<MicroformatData, CouldNotRetrieveTranscript>` - Microformat data on success, or an error
190    ///
191    /// # Errors
192    ///
193    /// This method can fail if:
194    /// - The video doesn't exist or is private
195    /// - YouTube's HTML structure has changed and parsing fails
196    /// - Network errors occur during the request
197    ///
198    /// # Example (internal usage)
199    ///
200    /// ```rust,no_run
201    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
202    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
203    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
204    /// let video_id = "dQw4w9WgXcQ";
205    ///
206    /// // This internally calls VideoDataFetcher::fetch_microformat
207    /// let microformat = api.fetch_microformat(video_id).await?;
208    ///
209    /// if let Some(category) = &microformat.category {
210    ///     println!("Video category: {}", category);
211    /// }
212    ///
213    /// if let Some(countries) = &microformat.available_countries {
214    ///     println!("Available in {} countries", countries.len());
215    /// }
216    /// # Ok(())
217    /// # }
218    /// ```
219    pub async fn fetch_microformat(
220        &self,
221        video_id: &str,
222    ) -> Result<MicroformatData, CouldNotRetrieveTranscript> {
223        // Get player response with playability check
224        let player_response = self.fetch_player_response(video_id, true).await?;
225
226        // Extract microformat data from player response
227        MicroformatExtractor::extract_microformat_data(&player_response, video_id)
228    }
229
230    /// Fetches streaming data for a YouTube video.
231    ///
232    /// This method retrieves information about available video and audio formats, including:
233    /// - URLs for different quality versions of the video
234    /// - Resolution, bitrate, and codec information
235    /// - Both combined formats (with audio and video) and separate adaptive formats
236    /// - Information about format expiration
237    ///
238    /// # Parameters
239    ///
240    /// * `video_id` - The YouTube video ID
241    ///
242    /// # Returns
243    ///
244    /// * `Result<StreamingData, CouldNotRetrieveTranscript>` - Streaming data on success, or an error
245    ///
246    /// # Errors
247    ///
248    /// This method can fail if:
249    /// - The video doesn't exist or is private
250    /// - The video has geo-restrictions that prevent access
251    /// - YouTube's HTML structure has changed and parsing fails
252    /// - Network errors occur during the request
253    ///
254    /// # Example (internal usage)
255    ///
256    /// ```rust,no_run
257    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
258    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
259    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
260    /// let video_id = "dQw4w9WgXcQ";
261    ///
262    /// // This internally calls VideoDataFetcher::fetch_streaming_data
263    /// let streaming = api.fetch_streaming_data(video_id).await?;
264    ///
265    /// // Print information about available formats
266    /// println!("Available formats: {}", streaming.formats.len());
267    /// println!("Adaptive formats: {}", streaming.adaptive_formats.len());
268    /// println!("Expires in: {} seconds", streaming.expires_in_seconds);
269    ///
270    /// // Find highest quality video format
271    /// if let Some(best_format) = streaming.adaptive_formats.iter()
272    ///     .filter(|f| f.width.is_some() && f.height.is_some())
273    ///     .max_by_key(|f| f.height.unwrap_or(0)) {
274    ///     println!("Highest quality: {}p", best_format.height.unwrap());
275    /// }
276    /// # Ok(())
277    /// # }
278    /// ```
279    pub async fn fetch_streaming_data(
280        &self,
281        video_id: &str,
282    ) -> Result<StreamingData, CouldNotRetrieveTranscript> {
283        // Get player response with playability check
284        let player_response = self.fetch_player_response(video_id, true).await?;
285
286        // Extract streaming data from player response
287        StreamingDataExtractor::extract_streaming_data(&player_response, video_id)
288    }
289
290    /// Fetches all available information about a YouTube video in a single request.
291    ///
292    /// This method retrieves the video page once and extracts all data, including:
293    /// - Video details (title, author, etc.)
294    /// - Microformat data (category, available countries, etc.)
295    /// - Streaming data (available formats, qualities, etc.)
296    /// - Transcript list (available caption languages)
297    ///
298    /// This is more efficient than calling the individual fetch methods separately
299    /// when multiple types of information are needed, as it avoids multiple HTTP requests.
300    ///
301    /// # Parameters
302    ///
303    /// * `video_id` - The YouTube video ID
304    ///
305    /// # Returns
306    ///
307    /// * `Result<VideoInfos, CouldNotRetrieveTranscript>` - Combined video information on success, or an error
308    ///
309    /// # Errors
310    ///
311    /// This method can fail if:
312    /// - The video doesn't exist or is private
313    /// - YouTube's HTML structure has changed and parsing fails
314    /// - Network errors occur during the request
315    ///
316    /// # Example (internal usage)
317    ///
318    /// ```rust,no_run
319    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
320    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
321    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
322    /// let video_id = "dQw4w9WgXcQ";
323    ///
324    /// // This internally calls VideoDataFetcher::fetch_video_infos
325    /// let infos = api.fetch_video_infos(video_id).await?;
326    ///
327    /// println!("Title: {}", infos.video_details.title);
328    /// println!("Category: {}", infos.microformat.category.unwrap_or_default());
329    /// println!("Available transcripts: {}", infos.transcript_list.transcripts().count());
330    /// # Ok(())
331    /// # }
332    /// ```
333    pub async fn fetch_video_infos(
334        &self,
335        video_id: &str,
336    ) -> Result<VideoInfos, CouldNotRetrieveTranscript> {
337        // Get player response with playability check (single network request)
338        let player_response = self.fetch_player_response(video_id, true).await?;
339
340        // Extract all data in parallel using the various extractors
341        let video_details =
342            VideoDetailsExtractor::extract_video_details(&player_response, video_id)?;
343        let microformat =
344            MicroformatExtractor::extract_microformat_data(&player_response, video_id)?;
345        let streaming_data =
346            StreamingDataExtractor::extract_streaming_data(&player_response, video_id)?;
347
348        // Extract captions data and build transcript list
349        let captions_data = CaptionsExtractor::extract_captions_data(&player_response, video_id)?;
350        let transcript_list = TranscriptList::build(video_id.to_string(), &captions_data)?;
351
352        // Combine all data into the VideoInfos struct
353        Ok(VideoInfos {
354            video_details,
355            microformat,
356            streaming_data,
357            transcript_list,
358        })
359    }
360
361    /// Extracts the ytInitialPlayerResponse JavaScript variable from YouTube's HTML.
362    ///
363    /// This variable contains detailed information about the video, including captions.
364    ///
365    /// # Parameters
366    ///
367    /// * `html` - The HTML content of the YouTube video page
368    /// * `video_id` - The YouTube video ID (used for error reporting)
369    ///
370    /// # Returns
371    ///
372    /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JavaScript object or an error
373    fn extract_yt_initial_player_response(
374        &self,
375        html: &str,
376        video_id: &str,
377    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
378        let js_var_parser = JsVarParser::new("ytInitialPlayerResponse");
379        let player_response = js_var_parser.parse(html, video_id)?;
380
381        Ok(player_response)
382    }
383
384    /// Helper method that fetches a video page and extracts the player response.
385    ///
386    /// This private method centralizes the common functionality used across multiple
387    /// data fetching methods, eliminating code duplication.
388    ///
389    /// # Parameters
390    ///
391    /// * `video_id` - The YouTube video ID
392    /// * `check_playability` - Whether to verify the video is playable
393    ///
394    /// # Returns
395    ///
396    /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The player response JSON or an error
397    async fn fetch_player_response(
398        &self,
399        video_id: &str,
400        check_playability: bool,
401    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
402        // Fetch the video page HTML only once
403        let html = self.page_fetcher.fetch_video_page(video_id).await?;
404
405        // Extract the player response
406        let player_response = self.extract_yt_initial_player_response(&html, video_id)?;
407
408        // Check playability status if requested
409        if check_playability {
410            PlayabilityAsserter::assert_playability(&player_response, video_id)?;
411        }
412
413        Ok(player_response)
414    }
415}