yt_transcript_rs/
api.rs

1use reqwest::Client;
2use std::path::Path;
3use std::sync::Arc;
4
5use crate::cookie_jar_loader::CookieJarLoader;
6#[cfg(feature = "ci")]
7use crate::errors::CouldNotRetrieveTranscriptReason;
8use crate::errors::{CookieError, CouldNotRetrieveTranscript};
9use crate::models::{MicroformatData, VideoDetails};
10use crate::proxies::ProxyConfig;
11#[cfg(not(feature = "ci"))]
12use crate::video_data_fetcher::VideoDataFetcher;
13use crate::{FetchedTranscript, TranscriptList};
14
15/// # YouTubeTranscriptApi
16///
17/// The main interface for retrieving YouTube video transcripts and metadata.
18///
19/// This API provides methods to:
20/// - Fetch transcripts from YouTube videos in various languages
21/// - List all available transcript languages for a video
22/// - Retrieve detailed video metadata
23///
24/// The API supports advanced features like:
25/// - Custom HTTP clients and proxies for handling geo-restrictions
26/// - Cookie management for accessing restricted content
27/// - Preserving text formatting in transcripts
28///
29/// ## Simple Usage Example
30///
31/// ```rust,no_run
32/// use yt_transcript_rs::api::YouTubeTranscriptApi;
33///
34/// #[tokio::main]
35/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
36///     // Create a new API instance with default settings
37///     let api = YouTubeTranscriptApi::new(None, None, None)?;
38///     
39///     // Fetch an English transcript
40///     let transcript = api.fetch_transcript(
41///         "dQw4w9WgXcQ",      // Video ID
42///         &["en"],            // Preferred languages
43///         false               // Don't preserve formatting
44///     ).await?;
45///     
46///     // Print each snippet of the transcript
47///     for snippet in transcript.parts() {
48///         println!("[{:.1}s]: {}", snippet.start, snippet.text);
49///     }
50///     
51///     Ok(())
52/// }
53/// ```
#[derive(Clone)]
pub struct YouTubeTranscriptApi {
    /// The internal data fetcher used to retrieve information from YouTube.
    ///
    /// Only present when the `ci` feature is disabled. It is held in an `Arc`,
    /// so cloning the API shares the same fetcher instead of duplicating it.
    #[cfg(not(feature = "ci"))]
    fetcher: Arc<VideoDataFetcher>,
    /// The HTTP client kept for `ci` builds, where it is handed to the mock
    /// transcript list rather than to a real fetcher.
    #[cfg(feature = "ci")]
    client: Client,
}
62
63impl YouTubeTranscriptApi {
64    /// Creates a new YouTube Transcript API instance.
65    ///
66    /// This method initializes an API instance with optional customizations for
67    /// cookies, proxies, and HTTP client settings.
68    ///
69    /// # Parameters
70    ///
71    /// * `cookie_path` - Optional path to a Netscape-format cookie file for authenticated requests
72    /// * `proxy_config` - Optional proxy configuration for routing requests through a proxy service
73    /// * `http_client` - Optional pre-configured HTTP client to use instead of the default one
74    ///
75    /// # Returns
76    ///
77    /// * `Result<Self, CookieError>` - A new API instance or a cookie-related error
78    ///
79    /// # Errors
80    ///
81    /// This function will return an error if:
82    /// - The cookie file exists but cannot be read or parsed
83    /// - The cookie file is not in the expected Netscape format
84    ///
85    /// # Examples
86    ///
87    /// ## Basic usage with default settings
88    ///
89    /// ```rust,no_run
90    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
91    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
92    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
93    /// # Ok(())
94    /// # }
95    /// ```
96    ///
97    /// ## Using a cookie file for authenticated access
98    ///
99    /// ```rust,no_run
100    /// # use std::path::Path;
101    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
102    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
103    /// let cookie_path = Path::new("path/to/cookies.txt");
104    /// let api = YouTubeTranscriptApi::new(Some(&cookie_path), None, None)?;
105    /// # Ok(())
106    /// # }
107    /// ```
108    ///
109    /// ## Using a proxy to bypass geographical restrictions
110    ///
111    /// ```rust,no_run
112    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
113    /// # use yt_transcript_rs::proxies::GenericProxyConfig;
114    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
115    /// // Create a proxy configuration
116    /// let proxy = GenericProxyConfig::new(
117    ///     Some("http://proxy.example.com:8080".to_string()),
118    ///     None
119    /// )?;
120    ///
121    /// let api = YouTubeTranscriptApi::new(
122    ///     None,
123    ///     Some(Box::new(proxy)),
124    ///     None
125    /// )?;
126    /// # Ok(())
127    /// # }
128    /// ```
129    pub fn new(
130        cookie_path: Option<&Path>,
131        proxy_config: Option<Box<dyn ProxyConfig + Send + Sync>>,
132        http_client: Option<Client>,
133    ) -> Result<Self, CookieError> {
134        let client = match http_client {
135            Some(client) => client,
136            None => {
137                let mut builder = Client::builder()
138                    .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
139                    .default_headers({
140                        let mut headers = reqwest::header::HeaderMap::new();
141                        headers.insert(
142                            reqwest::header::ACCEPT_LANGUAGE,
143                            reqwest::header::HeaderValue::from_static("en-US"),
144                        );
145                        headers
146                    });
147
148                // Add cookie jar if needed
149                if let Some(cookie_path) = cookie_path {
150                    let cookie_jar = CookieJarLoader::load_cookie_jar(cookie_path)?;
151                    let cookie_jar = Arc::new(cookie_jar);
152                    builder = builder.cookie_store(true).cookie_provider(cookie_jar);
153                }
154
155                // Add proxy configuration if needed
156                if let Some(proxy_config_ref) = &proxy_config {
157                    // Convert the proxy configuration to a map first to avoid borrowing issues
158                    let proxy_map = proxy_config_ref.to_requests_dict();
159
160                    let proxies = reqwest::Proxy::custom(move |url| {
161                        if url.scheme() == "http" {
162                            if let Some(http_proxy) = proxy_map.get("http") {
163                                return Some(http_proxy.clone());
164                            }
165                        } else if url.scheme() == "https" {
166                            if let Some(https_proxy) = proxy_map.get("https") {
167                                return Some(https_proxy.clone());
168                            }
169                        }
170
171                        None
172                    });
173
174                    builder = builder.proxy(proxies);
175
176                    // Disable keep-alive if needed
177                    if proxy_config_ref.prevent_keeping_connections_alive() {
178                        builder = builder.connection_verbose(true).tcp_keepalive(None);
179
180                        let mut headers = reqwest::header::HeaderMap::new();
181                        headers.insert(
182                            reqwest::header::CONNECTION,
183                            reqwest::header::HeaderValue::from_static("close"),
184                        );
185                        builder = builder.default_headers(headers);
186                    }
187                }
188
189                builder.build().unwrap()
190            }
191        };
192
193        #[cfg(not(feature = "ci"))]
194        let fetcher = Arc::new(VideoDataFetcher::new(client.clone(), proxy_config));
195
196        Ok(Self {
197            #[cfg(not(feature = "ci"))]
198            fetcher,
199            #[cfg(feature = "ci")]
200            client,
201        })
202    }
203
204    /// Fetches a transcript for a YouTube video in the specified languages.
205    ///
206    /// This method attempts to retrieve a transcript in the first available language
207    /// from the provided list of language preferences. If none of the specified languages
208    /// are available, an error is returned.
209    ///
210    /// # Parameters
211    ///
212    /// * `video_id` - The YouTube video ID (e.g., "dQw4w9WgXcQ" from https://www.youtube.com/watch?v=dQw4w9WgXcQ)
213    /// * `languages` - A list of language codes in order of preference (e.g., ["en", "es", "fr"])
214    /// * `preserve_formatting` - Whether to preserve HTML formatting in the transcript text
215    ///
216    /// # Returns
217    ///
218    /// * `Result<FetchedTranscript, CouldNotRetrieveTranscript>` - The transcript or an error
219    ///
220    /// # Errors
221    ///
222    /// This method will return an error if:
223    /// - The video does not exist or is private
224    /// - The video has no transcripts available
225    /// - None of the requested languages are available
226    /// - Network issues prevent fetching the transcript
227    ///
228    /// # Examples
229    ///
230    /// ## Basic usage - get English transcript
231    ///
232    /// ```rust,no_run
233    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
234    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
235    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
236    ///
237    /// // Fetch English transcript
238    /// let transcript = api.fetch_transcript(
239    ///     "dQw4w9WgXcQ",  // Video ID
240    ///     &["en"],        // Try English
241    ///     false           // Don't preserve formatting
242    /// ).await?;
243    ///
244    /// println!("Full transcript text: {}", transcript.text());
245    /// # Ok(())
246    /// # }
247    /// ```
248    ///
249    /// ## Multiple language preferences with formatting preserved
250    ///
251    /// ```rust,no_run
252    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
253    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
254    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
255    ///
256    /// // Try English first, then Spanish, then auto-generated English
257    /// let transcript = api.fetch_transcript(
258    ///     "dQw4w9WgXcQ",
259    ///     &["en", "es", "en-US"],
260    ///     true  // Preserve formatting like <b>bold</b> text
261    /// ).await?;
262    ///
263    /// // Print each segment with timing information
264    /// for snippet in transcript.parts() {
265    ///     println!("[{:.1}s-{:.1}s]: {}",
266    ///         snippet.start,
267    ///         snippet.start + snippet.duration,
268    ///         snippet.text);
269    /// }
270    /// # Ok(())
271    /// # }
272    /// ```
273    #[cfg(feature = "ci")]
274    pub async fn fetch_transcript(
275        &self,
276        video_id: &str,
277        languages: &[&str],
278        _preserve_formatting: bool,
279    ) -> Result<FetchedTranscript, CouldNotRetrieveTranscript> {
280        if video_id == crate::tests::test_utils::NON_EXISTENT_VIDEO_ID {
281            return Err(CouldNotRetrieveTranscript {
282                video_id: video_id.to_string(),
283                reason: Some(CouldNotRetrieveTranscriptReason::VideoUnavailable),
284            });
285        }
286
287        let transcript =
288            crate::tests::mocks::create_mock_fetched_transcript(video_id, languages[0]);
289        Ok(transcript)
290    }
291
292    #[cfg(not(feature = "ci"))]
293    pub async fn fetch_transcript(
294        &self,
295        video_id: &str,
296        languages: &[&str],
297        preserve_formatting: bool,
298    ) -> Result<FetchedTranscript, CouldNotRetrieveTranscript> {
299        let transcript_list = self.list_transcripts(video_id).await?;
300        let transcript = transcript_list.find_transcript(languages)?;
301        transcript.fetch(preserve_formatting).await
302    }
303
304    /// Lists all available transcripts for a YouTube video.
305    ///
306    /// This method retrieves information about all available transcripts for a video,
307    /// including both manual and automatically generated captions in all languages.
308    ///
309    /// # Parameters
310    ///
311    /// * `video_id` - The YouTube video ID (e.g., "dQw4w9WgXcQ")
312    ///
313    /// # Returns
314    ///
315    /// * `Result<TranscriptList, CouldNotRetrieveTranscript>` - A list of available transcripts or an error
316    ///
317    /// # Errors
318    ///
319    /// This method will return an error if:
320    /// - The video does not exist or is private
321    /// - The video has no transcripts available
322    /// - Network issues prevent fetching the transcript list
323    ///
324    /// # Examples
325    ///
326    /// ```rust,no_run
327    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
328    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
329    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
330    ///
331    /// // Get all available transcripts
332    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
333    ///
334    /// // Print information about each available transcript
335    /// for transcript in transcript_list.transcripts() {
336    ///     println!("Language: {} ({}) - {} generated",
337    ///         transcript.language(),
338    ///         transcript.language_code(),
339    ///         if transcript.is_generated() { "Auto" } else { "Manually" });
340    /// }
341    /// # Ok(())
342    /// # }
343    /// ```
344    #[cfg(feature = "ci")]
345    pub async fn list_transcripts(
346        &self,
347        video_id: &str,
348    ) -> Result<TranscriptList, CouldNotRetrieveTranscript> {
349        // For non-existent video ID, return an error
350        if video_id == crate::tests::test_utils::NON_EXISTENT_VIDEO_ID {
351            return Err(CouldNotRetrieveTranscript {
352                video_id: video_id.to_string(),
353                reason: Some(CouldNotRetrieveTranscriptReason::VideoUnavailable),
354            });
355        }
356
357        // Return mock transcript list
358        Ok(crate::tests::mocks::create_mock_transcript_list(
359            self.client.clone(),
360        ))
361    }
362
363    #[cfg(not(feature = "ci"))]
364    pub async fn list_transcripts(
365        &self,
366        video_id: &str,
367    ) -> Result<TranscriptList, CouldNotRetrieveTranscript> {
368        self.fetcher.fetch_transcript_list(video_id).await
369    }
370
371    /// Fetches detailed metadata about a YouTube video.
372    ///
373    /// This method retrieves comprehensive information about a video, including its
374    /// title, author, view count, description, thumbnails, and other metadata.
375    ///
376    /// # Parameters
377    ///
378    /// * `video_id` - The YouTube video ID (e.g., "dQw4w9WgXcQ")
379    ///
380    /// # Returns
381    ///
382    /// * `Result<VideoDetails, CouldNotRetrieveTranscript>` - Video details or an error
383    ///
384    /// # Errors
385    ///
386    /// This method will return an error if:
387    /// - The video does not exist or is private
388    /// - Network issues prevent fetching the video details
389    /// - The YouTube page structure has changed and details cannot be extracted
390    ///
391    /// # Examples
392    ///
393    /// ```rust,no_run
394    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
395    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
396    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
397    ///
398    /// // Fetch details about a video
399    /// let details = api.fetch_video_details("dQw4w9WgXcQ").await?;
400    ///
401    /// // Print basic information
402    /// println!("Title: {}", details.title);
403    /// println!("Channel: {}", details.author);
404    /// println!("Views: {}", details.view_count);
405    /// println!("Duration: {} seconds", details.length_seconds);
406    ///
407    /// // Print keywords if available
408    /// if let Some(keywords) = &details.keywords {
409    ///     println!("Keywords: {}", keywords.join(", "));
410    /// }
411    ///
412    /// // Get the highest quality thumbnail
413    /// if let Some(best_thumb) = details.thumbnails.iter()
414    ///     .max_by_key(|t| t.width * t.height) {
415    ///     println!("Best thumbnail: {} ({}x{})",
416    ///         best_thumb.url, best_thumb.width, best_thumb.height);
417    /// }
418    /// # Ok(())
419    /// # }
420    /// ```
421    #[cfg(feature = "ci")]
422    pub async fn fetch_video_details(
423        &self,
424        video_id: &str,
425    ) -> Result<VideoDetails, CouldNotRetrieveTranscript> {
426        // For non-existent video ID, return an error
427        if video_id == crate::tests::test_utils::NON_EXISTENT_VIDEO_ID {
428            return Err(CouldNotRetrieveTranscript {
429                video_id: video_id.to_string(),
430                reason: Some(CouldNotRetrieveTranscriptReason::VideoUnavailable),
431            });
432        }
433
434        // Return mock data
435        Ok(crate::tests::mocks::create_mock_video_details())
436    }
437
438    #[cfg(not(feature = "ci"))]
439    pub async fn fetch_video_details(
440        &self,
441        video_id: &str,
442    ) -> Result<VideoDetails, CouldNotRetrieveTranscript> {
443        self.fetcher.fetch_video_details(video_id).await
444    }
445
446    /// Fetches microformat data for a YouTube video.
447    ///
448    /// This method retrieves additional metadata about a video that's not included
449    /// in the main video details, such as available countries, category, and embed information.
450    ///
451    /// # Parameters
452    ///
453    /// * `video_id` - The YouTube video ID (e.g., "dQw4w9WgXcQ")
454    ///
455    /// # Returns
456    ///
457    /// * `Result<MicroformatData, CouldNotRetrieveTranscript>` - Microformat data or an error
458    ///
459    /// # Errors
460    ///
461    /// This method will return an error if:
462    /// - The video does not exist or is private
463    /// - Network issues prevent fetching the data
464    /// - The YouTube page structure has changed and data cannot be extracted
465    ///
466    /// # Examples
467    ///
468    /// ```rust,no_run
469    /// # use yt_transcript_rs::api::YouTubeTranscriptApi;
470    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
471    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
472    ///
473    /// // Fetch microformat data about a video
474    /// let microformat = api.fetch_microformat("dQw4w9WgXcQ").await?;
475    ///
476    /// // Check if the video is unlisted
477    /// if let Some(is_unlisted) = microformat.is_unlisted {
478    ///     println!("Video is unlisted: {}", is_unlisted);
479    /// }
480    ///
481    /// // Get video category
482    /// if let Some(category) = microformat.category {
483    ///     println!("Video category: {}", category);
484    /// }
485    ///
486    /// // Check availability by country
487    /// if let Some(countries) = microformat.available_countries {
488    ///     println!("Video available in {} countries", countries.len());
489    ///     if countries.contains(&"US".to_string()) {
490    ///         println!("Video is available in the United States");
491    ///     }
492    /// }
493    /// # Ok(())
494    /// # }
495    /// ```
496    #[cfg(feature = "ci")]
497    pub async fn fetch_microformat(
498        &self,
499        video_id: &str,
500    ) -> Result<MicroformatData, CouldNotRetrieveTranscript> {
501        // For non-existent video ID, return an error
502        if video_id == crate::tests::test_utils::NON_EXISTENT_VIDEO_ID {
503            return Err(CouldNotRetrieveTranscript {
504                video_id: video_id.to_string(),
505                reason: Some(CouldNotRetrieveTranscriptReason::VideoUnavailable),
506            });
507        }
508
509        // Return mock data
510        Ok(crate::tests::mocks::create_mock_microformat_data())
511    }
512
513    #[cfg(not(feature = "ci"))]
514    pub async fn fetch_microformat(
515        &self,
516        video_id: &str,
517    ) -> Result<MicroformatData, CouldNotRetrieveTranscript> {
518        self.fetcher.fetch_microformat(video_id).await
519    }
520}