yt_transcript_rs/transcript.rs
1use reqwest::Client;
2use std::collections::HashMap;
3use std::fmt;
4
5use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
6use crate::fetched_transcript::FetchedTranscript;
7use crate::innertube_client::InnerTubeClient;
8use crate::models::TranslationLanguage;
9use crate::transcript_parser::TranscriptParser;
10
11/// # Transcript
12///
13/// Represents a YouTube transcript that can be fetched or translated.
14///
15/// This struct contains the metadata and access URLs for a transcript but not
16/// the actual transcript text content. It serves as a handle to retrieve the
17/// full transcript text when needed.
18///
19/// A `Transcript` object can represent:
20/// - A native transcript in its original language
21/// - A translatable transcript that can be converted to other languages
22/// - A manually created transcript (more accurate, created by humans)
23/// - An automatically generated transcript (created by YouTube's speech recognition)
24///
25/// ## Usage Example
26///
27/// ```rust,no_run
28/// # use yt_transcript_rs::YouTubeTranscriptApi;
29/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
30/// let api = YouTubeTranscriptApi::new(None, None, None)?;
31/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
32///
33/// // Find an English transcript
34/// let transcript = transcript_list.find_transcript(&["en"])?;
35///
36/// // Check if it can be translated
37/// if transcript.is_translatable() {
38/// // Translate to Spanish
39/// let spanish = transcript.translate("es")?;
40///
41/// // Fetch the translated content
42/// let client = reqwest::Client::new();
43/// let fetched = spanish.fetch(&client, false).await?;
44/// println!("Spanish transcript: {}", fetched.text());
45/// }
46///
47/// // Or fetch the original transcript
48/// let client = reqwest::Client::new();
49/// let fetched = transcript.fetch(&client, false).await?;
50/// println!("Original transcript: {}", fetched.text());
51/// # Ok(())
52/// # }
53/// ```
54#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
55pub struct Transcript {
56 /// The YouTube video ID this transcript belongs to
57 pub video_id: String,
58
59 /// URL to fetch the transcript content from YouTube
60 pub url: String,
61
62 /// Full human-readable language name (e.g., "English")
63 pub language: String,
64
65 /// Language code (e.g., "en", "en-US", "es")
66 pub language_code: String,
67
68 /// Whether this transcript was automatically generated by YouTube
69 pub is_generated: bool,
70
71 /// List of languages this transcript can be translated to
72 pub translation_languages: Vec<TranslationLanguage>,
73
74 /// Mapping of language codes to language names for available translations
75 pub translation_languages_map: HashMap<String, String>,
76}
77
78impl Transcript {
79 /// Creates a new transcript instance.
80 ///
81 /// This constructor creates a transcript object that can be used to fetch
82 /// the actual transcript content or to generate translations.
83 ///
84 /// # Parameters
85 ///
86 /// * `video_id` - YouTube video ID
87 /// * `url` - URL to fetch the transcript content
88 /// * `language` - Human-readable language name (e.g., "English")
89 /// * `language_code` - Language code (e.g., "en", "en-US")
90 /// * `is_generated` - Whether this transcript was automatically generated
91 /// * `translation_languages` - List of languages this transcript can be translated to
92 ///
93 /// # Returns
94 ///
95 /// A new `Transcript` instance
96 ///
97 /// # Example (internal usage)
98 ///
99 /// ```rust,no_run
100 /// # use reqwest::Client;
101 /// # use yt_transcript_rs::transcript::Transcript;
102 /// # use yt_transcript_rs::models::TranslationLanguage;
103 /// # fn example() {
104 /// // Create a transcript for English
105 /// let transcript = Transcript::new(
106 /// "dQw4w9WgXcQ".to_string(),
107 /// "https://www.youtube.com/api/timedtext?...".to_string(),
108 /// "English".to_string(),
109 /// "en".to_string(),
110 /// false, // Not automatically generated
111 /// vec![
112 /// TranslationLanguage {
113 /// language: "Spanish".to_string(),
114 /// language_code: "es".to_string()
115 /// }
116 /// ]
117 /// );
118 /// # }
119 /// ```
120 pub fn new(
121 video_id: String,
122 url: String,
123 language: String,
124 language_code: String,
125 is_generated: bool,
126 translation_languages: Vec<TranslationLanguage>,
127 ) -> Self {
128 let translation_languages_map = translation_languages
129 .iter()
130 .map(|lang| (lang.language_code.clone(), lang.language.clone()))
131 .collect();
132
133 Self {
134 video_id,
135 url,
136 language,
137 language_code,
138 is_generated,
139 translation_languages,
140 translation_languages_map,
141 }
142 }
143
144 /// Fetches the actual transcript content from YouTube.
145 ///
146 /// This method retrieves the transcript text and timing information from YouTube
147 /// using YouTube's internal InnerTube API, which provides reliable access to
148 /// transcript data even when YouTube updates their external API requirements.
149 ///
150 /// # Parameters
151 ///
152 /// * `client` - HTTP client for making requests to YouTube
153 /// * `preserve_formatting` - Whether to preserve HTML formatting in the transcript
154 /// (e.g., bold, italic, etc.)
155 ///
156 /// # Returns
157 ///
158 /// * `Result<FetchedTranscript, CouldNotRetrieveTranscript>` - The fetched transcript or an error
159 ///
160 /// # Errors
161 ///
162 /// This method will return an error if:
163 /// - The network request to YouTube fails
164 /// - YouTube returns a non-OK status code
165 /// - The transcript data cannot be parsed
166 ///
167 /// # Example
168 ///
169 /// ```rust,no_run
170 /// # use reqwest::Client;
171 /// # use yt_transcript_rs::YouTubeTranscriptApi;
172 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
173 /// let client = Client::new();
174 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
175 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
176 /// let transcript = transcript_list.find_transcript(&["en"])?;
177 ///
178 /// // Fetch without preserving formatting
179 /// let plain_transcript = transcript.fetch(&client, false).await?;
180 ///
181 /// // Fetch and preserve HTML formatting like <b>bold</b> text
182 /// let formatted_transcript = transcript.fetch(&client, true).await?;
183 ///
184 /// // Access the full text
185 /// println!("Transcript: {}", plain_transcript.text());
186 ///
187 /// // Or iterate through individual segments
188 /// for segment in plain_transcript.parts() {
189 /// println!("[{:.1}s]: {}", segment.start, segment.text);
190 /// }
191 /// # Ok(())
192 /// # }
193 /// ```
194 pub async fn fetch(
195 &self,
196 client: &Client,
197 preserve_formatting: bool,
198 ) -> Result<FetchedTranscript, CouldNotRetrieveTranscript> {
199 // Use InnerTube API directly - this is now the only reliable method
200 let innertube_client = InnerTubeClient::new(client.clone());
201
202 // Get fresh transcript URLs from InnerTube API
203 let data = innertube_client
204 .get_transcript_list(&self.video_id)
205 .await
206 .map_err(|e| CouldNotRetrieveTranscript {
207 video_id: self.video_id.clone(),
208 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
209 format!("InnerTube API failed: {}", e),
210 )),
211 })?;
212
213 // Extract caption tracks from the InnerTube response
214 let captions = data
215 .get("captions")
216 .ok_or_else(|| CouldNotRetrieveTranscript {
217 video_id: self.video_id.clone(),
218 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
219 "No captions found in InnerTube response".to_string(),
220 )),
221 })?;
222
223 let player_captions_renderer =
224 captions
225 .get("playerCaptionsTracklistRenderer")
226 .ok_or_else(|| CouldNotRetrieveTranscript {
227 video_id: self.video_id.clone(),
228 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
229 "No playerCaptionsTracklistRenderer found".to_string(),
230 )),
231 })?;
232
233 let caption_tracks = player_captions_renderer
234 .get("captionTracks")
235 .and_then(|ct| ct.as_array())
236 .ok_or_else(|| CouldNotRetrieveTranscript {
237 video_id: self.video_id.clone(),
238 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
239 "No caption tracks found in InnerTube response".to_string(),
240 )),
241 })?;
242
243 // Find the matching transcript URL for our language
244 let mut matching_url = None;
245 for track in caption_tracks {
246 if let Some(language_code) = track.get("languageCode").and_then(|lc| lc.as_str()) {
247 if language_code == self.language_code {
248 if let Some(base_url) = track.get("baseUrl").and_then(|url| url.as_str()) {
249 matching_url = Some(base_url.to_string());
250 break;
251 }
252 }
253 }
254 }
255
256 let transcript_url = matching_url.ok_or_else(|| CouldNotRetrieveTranscript {
257 video_id: self.video_id.clone(),
258 reason: Some(CouldNotRetrieveTranscriptReason::NoTranscriptFound {
259 requested_language_codes: vec![self.language_code.clone()],
260 transcript_data: crate::transcript_list::TranscriptList::new(
261 self.video_id.clone(),
262 HashMap::new(),
263 HashMap::new(),
264 vec![],
265 ),
266 }),
267 })?;
268
269 // Fetch transcript content using the fresh URL from InnerTube
270 let response =
271 client
272 .get(&transcript_url)
273 .send()
274 .await
275 .map_err(|e| CouldNotRetrieveTranscript {
276 video_id: self.video_id.clone(),
277 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
278 format!("Failed to fetch transcript: {}", e),
279 )),
280 })?;
281
282 if response.status() != reqwest::StatusCode::OK {
283 return Err(CouldNotRetrieveTranscript {
284 video_id: self.video_id.clone(),
285 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
286 format!("YouTube returned status code {}", response.status()),
287 )),
288 });
289 }
290
291 let text = response
292 .text()
293 .await
294 .map_err(|e| CouldNotRetrieveTranscript {
295 video_id: self.video_id.clone(),
296 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
297 format!("Failed to read transcript response: {}", e),
298 )),
299 })?;
300
301 if text.is_empty() {
302 return Err(CouldNotRetrieveTranscript {
303 video_id: self.video_id.clone(),
304 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeRequestFailed(
305 "YouTube returned empty transcript content. This may indicate additional restrictions or API changes.".to_string()
306 )),
307 });
308 }
309
310 let snippets = TranscriptParser::new(preserve_formatting)
311 .parse(&text)
312 .map_err(|e| CouldNotRetrieveTranscript {
313 video_id: self.video_id.clone(),
314 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
315 format!("Failed to parse transcript XML: {}", e),
316 )),
317 })?;
318
319 Ok(FetchedTranscript {
320 snippets,
321 video_id: self.video_id.clone(),
322 language: self.language.clone(),
323 language_code: self.language_code.clone(),
324 is_generated: self.is_generated,
325 })
326 }
327
328 /// Checks if this transcript can be translated to other languages.
329 ///
330 /// This method determines whether YouTube offers translation capabilities
331 /// for this transcript. Not all transcripts are translatable.
332 ///
333 /// # Returns
334 ///
335 /// * `bool` - `true` if this transcript can be translated, `false` otherwise
336 ///
337 /// # Example
338 ///
339 /// ```rust,no_run
340 /// # use yt_transcript_rs::YouTubeTranscriptApi;
341 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
342 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
343 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
344 /// let transcript = transcript_list.find_transcript(&["en"])?;
345 ///
346 /// if transcript.is_translatable() {
347 /// println!("This transcript can be translated to other languages");
348 ///
349 /// // Available translation languages
350 /// for lang in &transcript.translation_languages {
351 /// println!("- {} ({})", lang.language, lang.language_code);
352 /// }
353 /// } else {
354 /// println!("This transcript cannot be translated");
355 /// }
356 /// # Ok(())
357 /// # }
358 /// ```
359 pub fn is_translatable(&self) -> bool {
360 !self.translation_languages.is_empty()
361 }
362
363 /// Creates a translated version of this transcript in the specified language.
364 ///
365 /// This method creates a new `Transcript` instance representing the same content
366 /// but translated to the requested language. Note that this doesn't actually perform
367 /// the translation yet - the translation happens when you call `fetch()` on the
368 /// returned transcript.
369 ///
370 /// # Parameters
371 ///
372 /// * `language_code` - The target language code to translate to (e.g., "es", "fr", "de")
373 ///
374 /// # Returns
375 ///
376 /// * `Result<Self, CouldNotRetrieveTranscript>` - A new transcript object representing
377 /// the translation, or an error
378 ///
379 /// # Errors
380 ///
381 /// This method will return an error if:
382 /// - The transcript is not translatable
383 /// - The requested language is not available for translation
384 ///
385 /// # Example
386 ///
387 /// ```rust,no_run
388 /// # use reqwest::Client;
389 /// # use yt_transcript_rs::YouTubeTranscriptApi;
390 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
391 /// let client = Client::new();
392 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
393 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
394 /// let transcript = transcript_list.find_transcript(&["en"])?;
395 ///
396 /// // Create Spanish translation
397 /// if transcript.is_translatable() {
398 /// // Translate to Spanish
399 /// let spanish_transcript = transcript.translate("es")?;
400 ///
401 /// // Fetch the translated content
402 /// let spanish_content = spanish_transcript.fetch(&client, false).await?;
403 /// println!("Spanish translation: {}", spanish_content.text());
404 /// }
405 /// # Ok(())
406 /// # }
407 /// ```
408 pub fn translate(&self, language_code: &str) -> Result<Self, CouldNotRetrieveTranscript> {
409 if !self.is_translatable() {
410 return Err(CouldNotRetrieveTranscript {
411 video_id: self.video_id.clone(),
412 reason: Some(CouldNotRetrieveTranscriptReason::TranslationUnavailable(
413 "This transcript cannot be translated".to_string(),
414 )),
415 });
416 }
417
418 if !self.translation_languages_map.contains_key(language_code) {
419 let available_langs = self
420 .translation_languages
421 .iter()
422 .map(|l| format!("{} ({})", l.language, l.language_code))
423 .collect::<Vec<_>>()
424 .join(", ");
425
426 return Err(CouldNotRetrieveTranscript {
427 video_id: self.video_id.clone(),
428 reason: Some(
429 CouldNotRetrieveTranscriptReason::TranslationLanguageUnavailable(format!(
430 "Translation to '{}' is not available. Available languages: {}",
431 language_code, available_langs
432 )),
433 ),
434 });
435 }
436
437 let language = self
438 .translation_languages_map
439 .get(language_code)
440 .cloned()
441 .unwrap();
442
443 let translated_url = format!("{}&tlang={}", self.url, language_code);
444
445 Ok(Self {
446 video_id: self.video_id.clone(),
447 url: translated_url,
448 language,
449 language_code: language_code.to_string(),
450 is_generated: self.is_generated,
451 translation_languages: self.translation_languages.clone(),
452 translation_languages_map: self.translation_languages_map.clone(),
453 })
454 }
455
456 /// Translates this transcript and fetches the result in a single operation.
457 ///
458 /// This convenience method combines the `translate` and `fetch` operations.
459 ///
460 /// # Parameters
461 ///
462 /// * `client` - HTTP client for making requests to YouTube
463 /// * `language_code` - The target language code to translate to
464 /// * `preserve_formatting` - Whether to preserve HTML formatting
465 ///
466 /// # Returns
467 ///
468 /// * `Result<FetchedTranscript, CouldNotRetrieveTranscript>` - The fetched translated transcript or an error
469 ///
470 /// # Example
471 ///
472 /// ```rust,no_run
473 /// # use reqwest::Client;
474 /// # use yt_transcript_rs::YouTubeTranscriptApi;
475 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
476 /// let client = Client::new();
477 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
478 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
479 /// let transcript = transcript_list.find_transcript(&["en"])?;
480 ///
481 /// if transcript.is_translatable() {
482 /// // Translate to Spanish and fetch in one step
483 /// let spanish_content = transcript.translate_and_fetch(&client, "es", false).await?;
484 /// println!("Spanish translation: {}", spanish_content.text());
485 /// }
486 /// # Ok(())
487 /// # }
488 /// ```
489 pub async fn translate_and_fetch(
490 &self,
491 client: &Client,
492 language_code: &str,
493 preserve_formatting: bool,
494 ) -> Result<FetchedTranscript, CouldNotRetrieveTranscript> {
495 let translated = self.translate(language_code)?;
496 translated.fetch(client, preserve_formatting).await
497 }
498
499 /// Returns the human-readable language name of this transcript.
500 ///
501 /// # Returns
502 ///
503 /// * `&str` - The language name (e.g., "English", "EspaƱol")
504 pub fn language(&self) -> &str {
505 &self.language
506 }
507
508 /// Returns the language code of this transcript.
509 ///
510 /// # Returns
511 ///
512 /// * `&str` - The language code (e.g., "en", "es", "fr-CA")
513 pub fn language_code(&self) -> &str {
514 &self.language_code
515 }
516
517 /// Checks if this transcript was automatically generated by YouTube.
518 ///
519 /// # Returns
520 ///
521 /// * `bool` - `true` if automatically generated, `false` if manually created
522 pub fn is_generated(&self) -> bool {
523 self.is_generated
524 }
525}
526
527impl fmt::Display for Transcript {
528 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
529 let translation_desc = if self.is_translatable() {
530 "[TRANSLATABLE]"
531 } else {
532 ""
533 };
534 write!(
535 f,
536 "{} ({}){}",
537 self.language_code, self.language, translation_desc
538 )
539 }
540}