yt_transcript_rs/
transcript.rs

1use reqwest::Client;
2use std::collections::HashMap;
3use std::fmt;
4
5use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
6use crate::fetched_transcript::FetchedTranscript;
7use crate::innertube_client::InnerTubeClient;
8use crate::models::TranslationLanguage;
9use crate::transcript_parser::TranscriptParser;
10
/// # Transcript
///
/// Represents a YouTube transcript that can be fetched or translated.
///
/// This struct contains the metadata and access URLs for a transcript but not
/// the actual transcript text content. It serves as a handle to retrieve the
/// full transcript text when needed.
///
/// A `Transcript` object can represent:
/// - A native transcript in its original language
/// - A translatable transcript that can be converted to other languages
/// - A manually created transcript (more accurate, created by humans)
/// - An automatically generated transcript (created by YouTube's speech recognition)
///
/// All fields are public and the type derives `Serialize`/`Deserialize`, so a
/// value may also be built or restored without going through [`Transcript::new`].
///
/// ## Usage Example
///
/// ```rust,no_run
/// # use yt_transcript_rs::YouTubeTranscriptApi;
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let api = YouTubeTranscriptApi::new(None, None, None)?;
/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
///
/// // Find an English transcript
/// let transcript = transcript_list.find_transcript(&["en"])?;
///
/// // Check if it can be translated
/// if transcript.is_translatable() {
///     // Translate to Spanish
///     let spanish = transcript.translate("es")?;
///
///     // Fetch the translated content
///     let client = reqwest::Client::new();
///     let fetched = spanish.fetch(&client, false).await?;
///     println!("Spanish transcript: {}", fetched.text());
/// }
///
/// // Or fetch the original transcript
/// let client = reqwest::Client::new();
/// let fetched = transcript.fetch(&client, false).await?;
/// println!("Original transcript: {}", fetched.text());
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Transcript {
    /// The YouTube video ID this transcript belongs to.
    ///
    /// `fetch` uses it to request the video's caption tracks, and it is
    /// attached to every error this type produces.
    pub video_id: String,

    /// URL to fetch the transcript content from YouTube.
    ///
    /// `translate` appends `&tlang=<code>` to this URL on the transcript it
    /// returns. `fetch` obtains a fresh track URL from InnerTube instead of
    /// requesting this stored URL as-is.
    pub url: String,

    /// Full human-readable language name (e.g., "English")
    pub language: String,

    /// Language code (e.g., "en", "en-US", "es").
    ///
    /// On a transcript returned by `translate` this is the *target* language code.
    pub language_code: String,

    /// Whether this transcript was automatically generated by YouTube
    pub is_generated: bool,

    /// List of languages this transcript can be translated to.
    ///
    /// `is_translatable` returns `true` exactly when this list is non-empty.
    pub translation_languages: Vec<TranslationLanguage>,

    /// Mapping of language codes to language names for available translations.
    ///
    /// `new` derives it from `translation_languages`; `translate` consults it
    /// to validate the requested code and to name the resulting transcript.
    pub translation_languages_map: HashMap<String, String>,
}
77
78impl Transcript {
79    /// Creates a new transcript instance.
80    ///
81    /// This constructor creates a transcript object that can be used to fetch
82    /// the actual transcript content or to generate translations.
83    ///
84    /// # Parameters
85    ///
86    /// * `video_id` - YouTube video ID
87    /// * `url` - URL to fetch the transcript content
88    /// * `language` - Human-readable language name (e.g., "English")
89    /// * `language_code` - Language code (e.g., "en", "en-US")
90    /// * `is_generated` - Whether this transcript was automatically generated
91    /// * `translation_languages` - List of languages this transcript can be translated to
92    ///
93    /// # Returns
94    ///
95    /// A new `Transcript` instance
96    ///
97    /// # Example (internal usage)
98    ///
99    /// ```rust,no_run
100    /// # use reqwest::Client;
101    /// # use yt_transcript_rs::transcript::Transcript;
102    /// # use yt_transcript_rs::models::TranslationLanguage;
103    /// # fn example() {
104    /// // Create a transcript for English
105    /// let transcript = Transcript::new(
106    ///     "dQw4w9WgXcQ".to_string(),
107    ///     "https://www.youtube.com/api/timedtext?...".to_string(),
108    ///     "English".to_string(),
109    ///     "en".to_string(),
110    ///     false, // Not automatically generated
111    ///     vec![
112    ///         TranslationLanguage {
113    ///             language: "Spanish".to_string(),
114    ///             language_code: "es".to_string()
115    ///         }
116    ///     ]
117    /// );
118    /// # }
119    /// ```
120    pub fn new(
121        video_id: String,
122        url: String,
123        language: String,
124        language_code: String,
125        is_generated: bool,
126        translation_languages: Vec<TranslationLanguage>,
127    ) -> Self {
128        let translation_languages_map = translation_languages
129            .iter()
130            .map(|lang| (lang.language_code.clone(), lang.language.clone()))
131            .collect();
132
133        Self {
134            video_id,
135            url,
136            language,
137            language_code,
138            is_generated,
139            translation_languages,
140            translation_languages_map,
141        }
142    }
143
144    /// Fetches the actual transcript content from YouTube.
145    ///
146    /// This method retrieves the transcript text and timing information from YouTube
147    /// using YouTube's internal InnerTube API, which provides reliable access to
148    /// transcript data even when YouTube updates their external API requirements.
149    ///
150    /// # Parameters
151    ///
152    /// * `client` - HTTP client for making requests to YouTube
153    /// * `preserve_formatting` - Whether to preserve HTML formatting in the transcript
154    ///   (e.g., bold, italic, etc.)
155    ///
156    /// # Returns
157    ///
158    /// * `Result<FetchedTranscript, CouldNotRetrieveTranscript>` - The fetched transcript or an error
159    ///
160    /// # Errors
161    ///
162    /// This method will return an error if:
163    /// - The network request to YouTube fails
164    /// - YouTube returns a non-OK status code
165    /// - The transcript data cannot be parsed
166    ///
167    /// # Example
168    ///
169    /// ```rust,no_run
170    /// # use reqwest::Client;
171    /// # use yt_transcript_rs::YouTubeTranscriptApi;
172    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
173    /// let client = Client::new();
174    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
175    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
176    /// let transcript = transcript_list.find_transcript(&["en"])?;
177    ///
178    /// // Fetch without preserving formatting
179    /// let plain_transcript = transcript.fetch(&client, false).await?;
180    ///
181    /// // Fetch and preserve HTML formatting like <b>bold</b> text
182    /// let formatted_transcript = transcript.fetch(&client, true).await?;
183    ///
184    /// // Access the full text
185    /// println!("Transcript: {}", plain_transcript.text());
186    ///
187    /// // Or iterate through individual segments
188    /// for segment in plain_transcript.parts() {
189    ///     println!("[{:.1}s]: {}", segment.start, segment.text);
190    /// }
191    /// # Ok(())
192    /// # }
193    /// ```
194    pub async fn fetch(
195        &self,
196        client: &Client,
197        preserve_formatting: bool,
198    ) -> Result<FetchedTranscript, CouldNotRetrieveTranscript> {
199        // Use InnerTube API directly - this is now the only reliable method
200        let innertube_client = InnerTubeClient::new(client.clone());
201
202        // Get fresh transcript URLs from InnerTube API
203        let data = innertube_client
204            .get_transcript_list(&self.video_id)
205            .await
206            .map_err(|e| CouldNotRetrieveTranscript {
207                video_id: self.video_id.clone(),
208                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
209                    format!("InnerTube API failed: {}", e),
210                )),
211            })?;
212
213        // Extract caption tracks from the InnerTube response
214        let captions = data
215            .get("captions")
216            .ok_or_else(|| CouldNotRetrieveTranscript {
217                video_id: self.video_id.clone(),
218                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
219                    "No captions found in InnerTube response".to_string(),
220                )),
221            })?;
222
223        let player_captions_renderer =
224            captions
225                .get("playerCaptionsTracklistRenderer")
226                .ok_or_else(|| CouldNotRetrieveTranscript {
227                    video_id: self.video_id.clone(),
228                    reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
229                        "No playerCaptionsTracklistRenderer found".to_string(),
230                    )),
231                })?;
232
233        let caption_tracks = player_captions_renderer
234            .get("captionTracks")
235            .and_then(|ct| ct.as_array())
236            .ok_or_else(|| CouldNotRetrieveTranscript {
237                video_id: self.video_id.clone(),
238                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
239                    "No caption tracks found in InnerTube response".to_string(),
240                )),
241            })?;
242
243        // Find the matching transcript URL for our language
244        let mut matching_url = None;
245        for track in caption_tracks {
246            if let Some(language_code) = track.get("languageCode").and_then(|lc| lc.as_str()) {
247                if language_code == self.language_code {
248                    if let Some(base_url) = track.get("baseUrl").and_then(|url| url.as_str()) {
249                        matching_url = Some(base_url.to_string());
250                        break;
251                    }
252                }
253            }
254        }
255
256        let transcript_url = matching_url.ok_or_else(|| CouldNotRetrieveTranscript {
257            video_id: self.video_id.clone(),
258            reason: Some(CouldNotRetrieveTranscriptReason::NoTranscriptFound {
259                requested_language_codes: vec![self.language_code.clone()],
260                transcript_data: crate::transcript_list::TranscriptList::new(
261                    self.video_id.clone(),
262                    HashMap::new(),
263                    HashMap::new(),
264                    vec![],
265                ),
266            }),
267        })?;
268
269        // Fetch transcript content using the fresh URL from InnerTube
270        let response =
271            client
272                .get(&transcript_url)
273                .send()
274                .await
275                .map_err(|e| CouldNotRetrieveTranscript {
276                    video_id: self.video_id.clone(),
277                    reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
278                        format!("Failed to fetch transcript: {}", e),
279                    )),
280                })?;
281
282        if response.status() != reqwest::StatusCode::OK {
283            return Err(CouldNotRetrieveTranscript {
284                video_id: self.video_id.clone(),
285                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
286                    format!("YouTube returned status code {}", response.status()),
287                )),
288            });
289        }
290
291        let text = response
292            .text()
293            .await
294            .map_err(|e| CouldNotRetrieveTranscript {
295                video_id: self.video_id.clone(),
296                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
297                    format!("Failed to read transcript response: {}", e),
298                )),
299            })?;
300
301        if text.is_empty() {
302            return Err(CouldNotRetrieveTranscript {
303                video_id: self.video_id.clone(),
304                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
305                    "YouTube returned empty transcript content. This may indicate additional restrictions or API changes.".to_string()
306                )),
307            });
308        }
309
310        let snippets = TranscriptParser::new(preserve_formatting)
311            .parse(&text)
312            .map_err(|e| CouldNotRetrieveTranscript {
313                video_id: self.video_id.clone(),
314                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
315                    format!("Failed to parse transcript XML: {}", e),
316                )),
317            })?;
318
319        Ok(FetchedTranscript {
320            snippets,
321            video_id: self.video_id.clone(),
322            language: self.language.clone(),
323            language_code: self.language_code.clone(),
324            is_generated: self.is_generated,
325        })
326    }
327
328    /// Checks if this transcript can be translated to other languages.
329    ///
330    /// This method determines whether YouTube offers translation capabilities
331    /// for this transcript. Not all transcripts are translatable.
332    ///
333    /// # Returns
334    ///
335    /// * `bool` - `true` if this transcript can be translated, `false` otherwise
336    ///
337    /// # Example
338    ///
339    /// ```rust,no_run
340    /// # use yt_transcript_rs::YouTubeTranscriptApi;
341    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
342    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
343    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
344    /// let transcript = transcript_list.find_transcript(&["en"])?;
345    ///
346    /// if transcript.is_translatable() {
347    ///     println!("This transcript can be translated to other languages");
348    ///     
349    ///     // Available translation languages
350    ///     for lang in &transcript.translation_languages {
351    ///         println!("- {} ({})", lang.language, lang.language_code);
352    ///     }
353    /// } else {
354    ///     println!("This transcript cannot be translated");
355    /// }
356    /// # Ok(())
357    /// # }
358    /// ```
359    pub fn is_translatable(&self) -> bool {
360        !self.translation_languages.is_empty()
361    }
362
363    /// Creates a translated version of this transcript in the specified language.
364    ///
365    /// This method creates a new `Transcript` instance representing the same content
366    /// but translated to the requested language. Note that this doesn't actually perform
367    /// the translation yet - the translation happens when you call `fetch()` on the
368    /// returned transcript.
369    ///
370    /// # Parameters
371    ///
372    /// * `language_code` - The target language code to translate to (e.g., "es", "fr", "de")
373    ///
374    /// # Returns
375    ///
376    /// * `Result<Self, CouldNotRetrieveTranscript>` - A new transcript object representing
377    ///   the translation, or an error
378    ///
379    /// # Errors
380    ///
381    /// This method will return an error if:
382    /// - The transcript is not translatable
383    /// - The requested language is not available for translation
384    ///
385    /// # Example
386    ///
387    /// ```rust,no_run
388    /// # use reqwest::Client;
389    /// # use yt_transcript_rs::YouTubeTranscriptApi;
390    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
391    /// let client = Client::new();
392    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
393    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
394    /// let transcript = transcript_list.find_transcript(&["en"])?;
395    ///
396    /// // Create Spanish translation
397    /// if transcript.is_translatable() {
398    ///     // Translate to Spanish
399    ///     let spanish_transcript = transcript.translate("es")?;
400    ///     
401    ///     // Fetch the translated content
402    ///     let spanish_content = spanish_transcript.fetch(&client, false).await?;
403    ///     println!("Spanish translation: {}", spanish_content.text());
404    /// }
405    /// # Ok(())
406    /// # }
407    /// ```
408    pub fn translate(&self, language_code: &str) -> Result<Self, CouldNotRetrieveTranscript> {
409        if !self.is_translatable() {
410            return Err(CouldNotRetrieveTranscript {
411                video_id: self.video_id.clone(),
412                reason: Some(CouldNotRetrieveTranscriptReason::TranslationUnavailable(
413                    "This transcript cannot be translated".to_string(),
414                )),
415            });
416        }
417
418        if !self.translation_languages_map.contains_key(language_code) {
419            let available_langs = self
420                .translation_languages
421                .iter()
422                .map(|l| format!("{} ({})", l.language, l.language_code))
423                .collect::<Vec<_>>()
424                .join(", ");
425
426            return Err(CouldNotRetrieveTranscript {
427                video_id: self.video_id.clone(),
428                reason: Some(
429                    CouldNotRetrieveTranscriptReason::TranslationLanguageUnavailable(format!(
430                        "Translation to '{}' is not available. Available languages: {}",
431                        language_code, available_langs
432                    )),
433                ),
434            });
435        }
436
437        let language = self
438            .translation_languages_map
439            .get(language_code)
440            .cloned()
441            .unwrap();
442
443        let translated_url = format!("{}&tlang={}", self.url, language_code);
444
445        Ok(Self {
446            video_id: self.video_id.clone(),
447            url: translated_url,
448            language,
449            language_code: language_code.to_string(),
450            is_generated: self.is_generated,
451            translation_languages: self.translation_languages.clone(),
452            translation_languages_map: self.translation_languages_map.clone(),
453        })
454    }
455
456    /// Translates this transcript and fetches the result in a single operation.
457    ///
458    /// This convenience method combines the `translate` and `fetch` operations.
459    ///
460    /// # Parameters
461    ///
462    /// * `client` - HTTP client for making requests to YouTube
463    /// * `language_code` - The target language code to translate to
464    /// * `preserve_formatting` - Whether to preserve HTML formatting
465    ///
466    /// # Returns
467    ///
468    /// * `Result<FetchedTranscript, CouldNotRetrieveTranscript>` - The fetched translated transcript or an error
469    ///
470    /// # Example
471    ///
472    /// ```rust,no_run
473    /// # use reqwest::Client;
474    /// # use yt_transcript_rs::YouTubeTranscriptApi;
475    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
476    /// let client = Client::new();
477    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
478    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
479    /// let transcript = transcript_list.find_transcript(&["en"])?;
480    ///
481    /// if transcript.is_translatable() {
482    ///     // Translate to Spanish and fetch in one step
483    ///     let spanish_content = transcript.translate_and_fetch(&client, "es", false).await?;
484    ///     println!("Spanish translation: {}", spanish_content.text());
485    /// }
486    /// # Ok(())
487    /// # }
488    /// ```
489    pub async fn translate_and_fetch(
490        &self,
491        client: &Client,
492        language_code: &str,
493        preserve_formatting: bool,
494    ) -> Result<FetchedTranscript, CouldNotRetrieveTranscript> {
495        let translated = self.translate(language_code)?;
496        translated.fetch(client, preserve_formatting).await
497    }
498
499    /// Returns the human-readable language name of this transcript.
500    ///
501    /// # Returns
502    ///
503    /// * `&str` - The language name (e.g., "English", "EspaƱol")
504    pub fn language(&self) -> &str {
505        &self.language
506    }
507
508    /// Returns the language code of this transcript.
509    ///
510    /// # Returns
511    ///
512    /// * `&str` - The language code (e.g., "en", "es", "fr-CA")
513    pub fn language_code(&self) -> &str {
514        &self.language_code
515    }
516
517    /// Checks if this transcript was automatically generated by YouTube.
518    ///
519    /// # Returns
520    ///
521    /// * `bool` - `true` if automatically generated, `false` if manually created
522    pub fn is_generated(&self) -> bool {
523        self.is_generated
524    }
525}
526
527impl fmt::Display for Transcript {
528    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
529        let translation_desc = if self.is_translatable() {
530            "[TRANSLATABLE]"
531        } else {
532            ""
533        };
534        write!(
535            f,
536            "{} ({}){}",
537            self.language_code, self.language, translation_desc
538        )
539    }
540}