yt_transcript_rs/video_data_fetcher.rs
1use reqwest::Client;
2
3use crate::captions_extractor::CaptionsExtractor;
4use crate::errors::CouldNotRetrieveTranscript;
5use crate::js_var_parser::JsVarParser;
6use crate::microformat_extractor::MicroformatExtractor;
7use crate::models::{MicroformatData, StreamingData, VideoDetails, VideoInfos};
8use crate::playability_asserter::PlayabilityAsserter;
9use crate::streaming_data_extractor::StreamingDataExtractor;
10use crate::transcript_list::TranscriptList;
11use crate::video_details_extractor::VideoDetailsExtractor;
12use crate::youtube_page_fetcher::YoutubePageFetcher;
13
14/// # VideoDataFetcher
15///
16/// Core component responsible for fetching transcript data and video details from YouTube.
17///
18/// This struct handles the low-level communication with YouTube's web API to:
19/// - Fetch available transcripts for a video
20/// - Extract caption JSON data from YouTube pages
21/// - Retrieve detailed information about videos, including metadata
22///
23/// The VideoDataFetcher works by parsing YouTube's HTML and JavaScript variables
24/// to extract the necessary data, since YouTube doesn't provide a public API for transcripts.
25///
26/// ## Internal Architecture
27///
28/// This component uses several helper classes to process data:
29/// - `YoutubePageFetcher`: Handles HTTP requests to YouTube, including proxy support
30/// - `JsVarParser`: Extracts JavaScript variables from YouTube's HTML
31/// - `PlayabilityAsserter`: Verifies video availability and access permissions
32/// - `VideoDetailsExtractor`: Extracts detailed information from video data
33pub struct VideoDataFetcher {
34 /// HTTP client for making requests
35 pub client: Client,
36 /// Specialized fetcher for YouTube pages
37 page_fetcher: YoutubePageFetcher,
38}
39
40impl VideoDataFetcher {
41 /// Creates a new VideoDataFetcher instance.
42 ///
43 /// # Parameters
44 ///
45 /// * `client` - A configured reqwest HTTP client to use for requests
46 /// * `proxy_config` - Optional proxy configuration for routing requests through a proxy
47 ///
48 /// # Returns
49 ///
50 /// A new VideoDataFetcher instance.
51 ///
52 /// # Example (internal usage)
53 ///
54 /// ```rust,no_run
55 /// # use reqwest::Client;
56 /// # use yt_transcript_rs::video_data_fetcher::VideoDataFetcher;
57 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
58 /// // Create a client
59 /// let client = Client::new();
60 /// // Create the fetcher
61 /// let fetcher = VideoDataFetcher::new(
62 /// client
63 /// );
64 /// # Ok(())
65 /// # }
66 /// ```
67 pub fn new(client: Client) -> Self {
68 let page_fetcher = YoutubePageFetcher::new(client.clone());
69
70 Self {
71 client,
72 page_fetcher,
73 }
74 }
75
76 /// Fetches the list of available transcripts for a YouTube video.
77 ///
78 /// This method:
79 /// 1. Retrieves the video page HTML
80 /// 2. Extracts the captions JSON data
81 /// 3. Builds a TranscriptList from the extracted data
82 ///
83 /// # Parameters
84 ///
85 /// * `video_id` - The YouTube video ID (e.g., "dQw4w9WgXcQ")
86 ///
87 /// # Returns
88 ///
89 /// * `Result<TranscriptList, CouldNotRetrieveTranscript>` - A TranscriptList on success, or an error if retrieval fails
90 ///
91 /// # Errors
92 ///
93 /// This method can fail if:
94 /// - The video doesn't exist or is private
95 /// - The video has no available transcripts
96 /// - YouTube's HTML structure has changed and parsing fails
97 /// - Network errors occur during the request
98 ///
99 /// # Example (internal usage)
100 ///
101 /// ```rust,no_run
102 /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
103 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
104 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
105 /// let video_id = "dQw4w9WgXcQ";
106 ///
107 /// // This internally calls VideoDataFetcher::fetch_transcript_list
108 /// let transcript_list = api.list_transcripts(video_id).await?;
109 /// # Ok(())
110 /// # }
111 /// ```
112 pub async fn fetch_transcript_list(
113 &self,
114 video_id: &str,
115 ) -> Result<TranscriptList, CouldNotRetrieveTranscript> {
116 // Get player response with playability check
117 let player_response = self.fetch_player_response(video_id, true).await?;
118
119 // Extract captions data and build transcript list
120 let video_captions = CaptionsExtractor::extract_captions_data(&player_response, video_id)?;
121
122 TranscriptList::build(video_id.to_string(), &video_captions)
123 }
124
125 /// Fetches detailed information about a YouTube video.
126 ///
127 /// This method retrieves comprehensive metadata about a video, including:
128 /// - Title, author, channel ID
129 /// - View count and video length
130 /// - Thumbnails in various resolutions
131 /// - Keywords and description
132 ///
133 /// # Parameters
134 ///
135 /// * `video_id` - The YouTube video ID
136 ///
137 /// # Returns
138 ///
139 /// * `Result<VideoDetails, CouldNotRetrieveTranscript>` - Video details on success, or an error
140 ///
141 /// # Errors
142 ///
143 /// Similar to transcript fetching, this can fail if:
144 /// - The video doesn't exist or is private
145 /// - YouTube's HTML structure has changed and parsing fails
146 /// - Network errors occur during the request
147 ///
148 /// # Example (internal usage)
149 ///
150 /// ```rust,no_run
151 /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
152 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
153 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
154 /// let video_id = "dQw4w9WgXcQ";
155 ///
156 /// // This internally calls VideoDataFetcher::fetch_video_details
157 /// let details = api.fetch_video_details(video_id).await?;
158 ///
159 /// println!("Video title: {}", details.title);
160 /// println!("Author: {}", details.author);
161 /// # Ok(())
162 /// # }
163 /// ```
164 pub async fn fetch_video_details(
165 &self,
166 video_id: &str,
167 ) -> Result<VideoDetails, CouldNotRetrieveTranscript> {
168 // Get player response with playability check
169 let player_response = self.fetch_player_response(video_id, true).await?;
170
171 // Extract video details from player response
172 VideoDetailsExtractor::extract_video_details(&player_response, video_id)
173 }
174
175 /// Fetches microformat data for a YouTube video.
176 ///
177 /// This method retrieves additional metadata about a video, including:
178 /// - Available countries
179 /// - Category
180 /// - Embed information
181 /// - Information about whether the video is unlisted, family-safe, etc.
182 ///
183 /// # Parameters
184 ///
185 /// * `video_id` - The YouTube video ID
186 ///
187 /// # Returns
188 ///
189 /// * `Result<MicroformatData, CouldNotRetrieveTranscript>` - Microformat data on success, or an error
190 ///
191 /// # Errors
192 ///
193 /// This method can fail if:
194 /// - The video doesn't exist or is private
195 /// - YouTube's HTML structure has changed and parsing fails
196 /// - Network errors occur during the request
197 ///
198 /// # Example (internal usage)
199 ///
200 /// ```rust,no_run
201 /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
202 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
203 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
204 /// let video_id = "dQw4w9WgXcQ";
205 ///
206 /// // This internally calls VideoDataFetcher::fetch_microformat
207 /// let microformat = api.fetch_microformat(video_id).await?;
208 ///
209 /// if let Some(category) = µformat.category {
210 /// println!("Video category: {}", category);
211 /// }
212 ///
213 /// if let Some(countries) = µformat.available_countries {
214 /// println!("Available in {} countries", countries.len());
215 /// }
216 /// # Ok(())
217 /// # }
218 /// ```
219 pub async fn fetch_microformat(
220 &self,
221 video_id: &str,
222 ) -> Result<MicroformatData, CouldNotRetrieveTranscript> {
223 // Get player response with playability check
224 let player_response = self.fetch_player_response(video_id, true).await?;
225
226 // Extract microformat data from player response
227 MicroformatExtractor::extract_microformat_data(&player_response, video_id)
228 }
229
230 /// Fetches streaming data for a YouTube video.
231 ///
232 /// This method retrieves information about available video and audio formats, including:
233 /// - URLs for different quality versions of the video
234 /// - Resolution, bitrate, and codec information
235 /// - Both combined formats (with audio and video) and separate adaptive formats
236 /// - Information about format expiration
237 ///
238 /// # Parameters
239 ///
240 /// * `video_id` - The YouTube video ID
241 ///
242 /// # Returns
243 ///
244 /// * `Result<StreamingData, CouldNotRetrieveTranscript>` - Streaming data on success, or an error
245 ///
246 /// # Errors
247 ///
248 /// This method can fail if:
249 /// - The video doesn't exist or is private
250 /// - The video has geo-restrictions that prevent access
251 /// - YouTube's HTML structure has changed and parsing fails
252 /// - Network errors occur during the request
253 ///
254 /// # Example (internal usage)
255 ///
256 /// ```rust,no_run
257 /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
258 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
259 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
260 /// let video_id = "dQw4w9WgXcQ";
261 ///
262 /// // This internally calls VideoDataFetcher::fetch_streaming_data
263 /// let streaming = api.fetch_streaming_data(video_id).await?;
264 ///
265 /// // Print information about available formats
266 /// println!("Available formats: {}", streaming.formats.len());
267 /// println!("Adaptive formats: {}", streaming.adaptive_formats.len());
268 /// println!("Expires in: {} seconds", streaming.expires_in_seconds);
269 ///
270 /// // Find highest quality video format
271 /// if let Some(best_format) = streaming.adaptive_formats.iter()
272 /// .filter(|f| f.width.is_some() && f.height.is_some())
273 /// .max_by_key(|f| f.height.unwrap_or(0)) {
274 /// println!("Highest quality: {}p", best_format.height.unwrap());
275 /// }
276 /// # Ok(())
277 /// # }
278 /// ```
279 pub async fn fetch_streaming_data(
280 &self,
281 video_id: &str,
282 ) -> Result<StreamingData, CouldNotRetrieveTranscript> {
283 // Get player response with playability check
284 let player_response = self.fetch_player_response(video_id, true).await?;
285
286 // Extract streaming data from player response
287 StreamingDataExtractor::extract_streaming_data(&player_response, video_id)
288 }
289
290 /// Fetches all available information about a YouTube video in a single request.
291 ///
292 /// This method retrieves the video page once and extracts all data, including:
293 /// - Video details (title, author, etc.)
294 /// - Microformat data (category, available countries, etc.)
295 /// - Streaming data (available formats, qualities, etc.)
296 /// - Transcript list (available caption languages)
297 ///
298 /// This is more efficient than calling the individual fetch methods separately
299 /// when multiple types of information are needed, as it avoids multiple HTTP requests.
300 ///
301 /// # Parameters
302 ///
303 /// * `video_id` - The YouTube video ID
304 ///
305 /// # Returns
306 ///
307 /// * `Result<VideoInfos, CouldNotRetrieveTranscript>` - Combined video information on success, or an error
308 ///
309 /// # Errors
310 ///
311 /// This method can fail if:
312 /// - The video doesn't exist or is private
313 /// - YouTube's HTML structure has changed and parsing fails
314 /// - Network errors occur during the request
315 ///
316 /// # Example (internal usage)
317 ///
318 /// ```rust,no_run
319 /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
320 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
321 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
322 /// let video_id = "dQw4w9WgXcQ";
323 ///
324 /// // This internally calls VideoDataFetcher::fetch_video_infos
325 /// let infos = api.fetch_video_infos(video_id).await?;
326 ///
327 /// println!("Title: {}", infos.video_details.title);
328 /// println!("Category: {}", infos.microformat.category.unwrap_or_default());
329 /// println!("Available transcripts: {}", infos.transcript_list.transcripts().count());
330 /// # Ok(())
331 /// # }
332 /// ```
333 pub async fn fetch_video_infos(
334 &self,
335 video_id: &str,
336 ) -> Result<VideoInfos, CouldNotRetrieveTranscript> {
337 // Get player response with playability check (single network request)
338 let player_response = self.fetch_player_response(video_id, true).await?;
339
340 // Extract all data in parallel using the various extractors
341 let video_details =
342 VideoDetailsExtractor::extract_video_details(&player_response, video_id)?;
343 let microformat =
344 MicroformatExtractor::extract_microformat_data(&player_response, video_id)?;
345 let streaming_data =
346 StreamingDataExtractor::extract_streaming_data(&player_response, video_id)?;
347
348 // Extract captions data and build transcript list
349 let captions_data = CaptionsExtractor::extract_captions_data(&player_response, video_id)?;
350 let transcript_list = TranscriptList::build(video_id.to_string(), &captions_data)?;
351
352 // Combine all data into the VideoInfos struct
353 Ok(VideoInfos {
354 video_details,
355 microformat,
356 streaming_data,
357 transcript_list,
358 })
359 }
360
361 /// Extracts the ytInitialPlayerResponse JavaScript variable from YouTube's HTML.
362 ///
363 /// This variable contains detailed information about the video, including captions.
364 ///
365 /// # Parameters
366 ///
367 /// * `html` - The HTML content of the YouTube video page
368 /// * `video_id` - The YouTube video ID (used for error reporting)
369 ///
370 /// # Returns
371 ///
372 /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JavaScript object or an error
373 fn extract_yt_initial_player_response(
374 &self,
375 html: &str,
376 video_id: &str,
377 ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
378 let js_var_parser = JsVarParser::new("ytInitialPlayerResponse");
379 let player_response = js_var_parser.parse(html, video_id)?;
380
381 Ok(player_response)
382 }
383
384 /// Helper method that fetches a video page and extracts the player response.
385 ///
386 /// This private method centralizes the common functionality used across multiple
387 /// data fetching methods, eliminating code duplication.
388 ///
389 /// # Parameters
390 ///
391 /// * `video_id` - The YouTube video ID
392 /// * `check_playability` - Whether to verify the video is playable
393 ///
394 /// # Returns
395 ///
396 /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The player response JSON or an error
397 async fn fetch_player_response(
398 &self,
399 video_id: &str,
400 check_playability: bool,
401 ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
402 // Fetch the video page HTML only once
403 let html = self.page_fetcher.fetch_video_page(video_id).await?;
404
405 // Extract the player response
406 let player_response = self.extract_yt_initial_player_response(&html, video_id)?;
407
408 // Check playability status if requested
409 if check_playability {
410 PlayabilityAsserter::assert_playability(&player_response, video_id)?;
411 }
412
413 Ok(player_response)
414 }
415}