yt_transcript_rs/transcript_list.rs
1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::fmt;
4
5use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
6use crate::models::TranslationLanguage;
7use crate::transcript::Transcript;
8
/// # TranscriptList
///
/// A collection of available transcripts for a YouTube video.
///
/// This struct provides access to all transcripts available for a video, including:
/// - Manually created transcripts
/// - Automatically generated transcripts (caption tracks whose `kind` is `"asr"`)
/// - Available translation languages for translatable transcripts
///
/// Manually created and automatically generated transcripts are kept in two separate
/// maps, each keyed by language code, so the same language may be present in both.
/// Keeping them apart lets callers prefer a manually created transcript over an
/// automatically generated one for the same language.
///
/// ## Usage Example
///
/// ```rust,no_run
/// # use yt_transcript_rs::YouTubeTranscriptApi;
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let api = YouTubeTranscriptApi::new(None, None, None)?;
///
/// // Get a list of all available transcripts for a video
/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
///
/// // Print all available transcripts (the `Display` output already starts with
/// // "Available transcripts: ", so no extra prefix is needed)
/// println!("{}", transcript_list);
///
/// // Find a transcript, trying "en" first and then "en-US"
/// let transcript = transcript_list.find_transcript(&["en", "en-US"])?;
///
/// // Or specifically find a manually created transcript
/// let manual_transcript = transcript_list.find_manually_created_transcript(&["en"])?;
///
/// // Or retrieve an automatically generated transcript
/// let auto_transcript = transcript_list.find_generated_transcript(&["en"])?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TranscriptList {
    /// The YouTube video ID this transcript list belongs to
    pub video_id: String,

    /// Manually created transcripts, keyed by language code
    pub manually_created_transcripts: HashMap<String, Transcript>,

    /// Automatically generated transcripts, keyed by language code
    pub generated_transcripts: HashMap<String, Transcript>,

    /// Languages that translatable transcripts of this video can be translated into
    pub translation_languages: Vec<TranslationLanguage>,
}
60
61impl TranscriptList {
62 /// Creates a new TranscriptList with the provided components.
63 ///
64 /// # Parameters
65 ///
66 /// * `video_id` - The YouTube video ID this transcript list belongs to
67 /// * `manually_created_transcripts` - Map of language codes to manually created transcripts
68 /// * `generated_transcripts` - Map of language codes to automatically generated transcripts
69 /// * `translation_languages` - List of languages available for translation
70 ///
71 /// # Returns
72 ///
73 /// A new `TranscriptList` instance
74 pub fn new(
75 video_id: String,
76 manually_created_transcripts: HashMap<String, Transcript>,
77 generated_transcripts: HashMap<String, Transcript>,
78 translation_languages: Vec<TranslationLanguage>,
79 ) -> Self {
80 Self {
81 video_id,
82 manually_created_transcripts,
83 generated_transcripts,
84 translation_languages,
85 }
86 }
87
88 /// Creates a TranscriptList from YouTube's caption JSON data.
89 ///
90 /// This method parses YouTube's internal caption data structure to extract:
91 /// - Available transcripts (both manual and automatic)
92 /// - Their respective language codes and names
93 /// - Information about available translation languages
94 ///
95 /// # Parameters
96 ///
97 /// * `video_id` - The YouTube video ID
98 /// * `video_page_html` - JSON data extracted from YouTube's page containing caption information
99 ///
100 /// # Returns
101 ///
102 /// * `Result<Self, CouldNotRetrieveTranscript>` - A transcript list or an error
103 ///
104 /// # Errors
105 ///
106 /// Returns an error if the caption data cannot be properly parsed.
107 pub fn build(
108 video_id: String,
109 video_page_html: &serde_json::Value,
110 ) -> Result<Self, CouldNotRetrieveTranscript> {
111 let transcript_list = Self::build_without_client(video_id, video_page_html)?;
112
113 Ok(transcript_list)
114 }
115
116 /// Creates a TranscriptList from YouTube's caption JSON data without requiring a client.
117 ///
118 /// This method is similar to `build` but doesn't take a client parameter, making it
119 /// suitable for use in serialization/deserialization contexts.
120 ///
121 /// # Parameters
122 ///
123 /// * `video_id` - The YouTube video ID
124 /// * `video_page_html` - JSON data extracted from YouTube's page containing caption information
125 ///
126 /// # Returns
127 ///
128 /// * `Result<Self, CouldNotRetrieveTranscript>` - A transcript list or an error
129 ///
130 /// # Errors
131 ///
132 /// Returns an error if the caption data cannot be properly parsed.
133 pub fn build_without_client(
134 video_id: String,
135 video_page_html: &serde_json::Value,
136 ) -> Result<Self, CouldNotRetrieveTranscript> {
137 // Extract translation languages
138 let empty_vec = vec![];
139 let translation_languages_json = match video_page_html.get("translationLanguages") {
140 Some(val) => val.as_array().unwrap_or(&empty_vec),
141 None => &empty_vec,
142 };
143
144 let translation_languages = translation_languages_json
145 .iter()
146 .filter_map(|lang| {
147 let language_name = lang.get("languageName")?.get("simpleText")?.as_str()?;
148 let language_code = lang.get("languageCode")?.as_str()?;
149
150 Some(TranslationLanguage {
151 language: language_name.to_string(),
152 language_code: language_code.to_string(),
153 })
154 })
155 .collect::<Vec<_>>();
156
157 // Extract transcripts
158 let caption_tracks = match video_page_html.get("captionTracks") {
159 Some(val) => val.as_array().unwrap_or(&empty_vec),
160 None => &empty_vec,
161 };
162
163 let mut manually_created_transcripts = HashMap::new();
164 let mut generated_transcripts = HashMap::new();
165
166 for caption in caption_tracks {
167 let is_asr = caption
168 .get("kind")
169 .and_then(|k| k.as_str())
170 .map(|k| k == "asr")
171 .unwrap_or(false);
172
173 let language_code = match caption.get("languageCode").and_then(|lc| lc.as_str()) {
174 Some(code) => code.to_string(),
175 None => continue,
176 };
177
178 let base_url = match caption.get("baseUrl").and_then(|url| url.as_str()) {
179 Some(url) => url.to_string(),
180 None => continue,
181 };
182
183 let name = match caption
184 .get("name")
185 .and_then(|n| n.get("simpleText"))
186 .and_then(|st| st.as_str())
187 {
188 Some(name) => name.to_string(),
189 None => continue,
190 };
191
192 let is_translatable = caption
193 .get("isTranslatable")
194 .and_then(|t| t.as_bool())
195 .unwrap_or(false);
196
197 let tl = if is_translatable {
198 translation_languages.clone()
199 } else {
200 vec![]
201 };
202
203 let transcript = Transcript::new(
204 video_id.clone(),
205 base_url,
206 name,
207 language_code.clone(),
208 is_asr,
209 tl,
210 );
211
212 if is_asr {
213 generated_transcripts.insert(language_code, transcript);
214 } else {
215 manually_created_transcripts.insert(language_code, transcript);
216 }
217 }
218
219 Ok(TranscriptList::new(
220 video_id,
221 manually_created_transcripts,
222 generated_transcripts,
223 translation_languages,
224 ))
225 }
226
227 /// Finds a transcript matching one of the specified language codes.
228 ///
229 /// This method searches for transcripts in the order of priority:
230 /// 1. Manually created transcripts with the specified language codes (in order)
231 /// 2. Automatically generated transcripts with the specified language codes (in order)
232 ///
233 /// # Parameters
234 ///
235 /// * `language_codes` - Array of language codes to search for, in order of preference
236 ///
237 /// # Returns
238 ///
239 /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
240 ///
241 /// # Errors
242 ///
243 /// Returns an error if no transcript is found for any of the specified language codes.
244 ///
245 /// # Example
246 ///
247 /// ```rust,no_run
248 /// # use yt_transcript_rs::YouTubeTranscriptApi;
249 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
250 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
251 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
252 ///
253 /// // Try to find English, fall back to Spanish, then auto-generated English
254 /// let transcript = transcript_list.find_transcript(&["en", "es", "en-US"])?;
255 /// # Ok(())
256 /// # }
257 /// ```
258 pub fn find_transcript(
259 &self,
260 language_codes: &[&str],
261 ) -> Result<Transcript, CouldNotRetrieveTranscript> {
262 self.find_transcript_in_maps(
263 language_codes,
264 &[
265 &self.manually_created_transcripts,
266 &self.generated_transcripts,
267 ],
268 )
269 }
270
271 /// Finds a manually created transcript matching one of the specified language codes.
272 ///
273 /// This method only searches the manually created transcripts, skipping any
274 /// automatically generated ones. This is useful when you want to ensure you're
275 /// getting a human-created transcript for better accuracy.
276 ///
277 /// # Parameters
278 ///
279 /// * `language_codes` - Array of language codes to search for, in order of preference
280 ///
281 /// # Returns
282 ///
283 /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
284 ///
285 /// # Errors
286 ///
287 /// Returns an error if no manually created transcript is found for any of the
288 /// specified language codes.
289 ///
290 /// # Example
291 ///
292 /// ```rust,no_run
293 /// # use yt_transcript_rs::YouTubeTranscriptApi;
294 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
295 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
296 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
297 ///
298 /// // Only look for manually created transcripts
299 /// match transcript_list.find_manually_created_transcript(&["en"]) {
300 /// Ok(transcript) => {
301 /// println!("Found manual transcript!");
302 /// },
303 /// Err(_) => {
304 /// println!("No manual transcript available, falling back to auto-generated");
305 /// let auto_transcript = transcript_list.find_generated_transcript(&["en"])?;
306 /// }
307 /// }
308 /// # Ok(())
309 /// # }
310 /// ```
311 pub fn find_manually_created_transcript(
312 &self,
313 language_codes: &[&str],
314 ) -> Result<Transcript, CouldNotRetrieveTranscript> {
315 self.find_transcript_in_maps(language_codes, &[&self.manually_created_transcripts])
316 }
317
318 /// Finds an automatically generated transcript matching one of the specified language codes.
319 ///
320 /// This method only searches the automatically generated transcripts, skipping any
321 /// manually created ones. This might be useful in rare cases where you specifically
322 /// want the auto-generated version.
323 ///
324 /// # Parameters
325 ///
326 /// * `language_codes` - Array of language codes to search for, in order of preference
327 ///
328 /// # Returns
329 ///
330 /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
331 ///
332 /// # Errors
333 ///
334 /// Returns an error if no automatically generated transcript is found for any of the
335 /// specified language codes.
336 pub fn find_generated_transcript(
337 &self,
338 language_codes: &[&str],
339 ) -> Result<Transcript, CouldNotRetrieveTranscript> {
340 self.find_transcript_in_maps(language_codes, &[&self.generated_transcripts])
341 }
342
343 /// Helper method to find a transcript in multiple transcript maps.
344 ///
345 /// This internal method is used by the public transcript finding methods to search
346 /// through the provided maps of transcripts for the first match with the specified
347 /// language codes.
348 ///
349 /// # Parameters
350 ///
351 /// * `language_codes` - Array of language codes to search for, in order of preference
352 /// * `transcript_maps` - Array of transcript maps to search through, in order of priority
353 ///
354 /// # Returns
355 ///
356 /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
357 ///
358 /// # Errors
359 ///
360 /// Returns an error if no transcript is found for any of the specified language codes
361 /// in any of the provided transcript maps.
362 fn find_transcript_in_maps(
363 &self,
364 language_codes: &[&str],
365 transcript_maps: &[&HashMap<String, Transcript>],
366 ) -> Result<Transcript, CouldNotRetrieveTranscript> {
367 for lang_code in language_codes {
368 for transcript_map in transcript_maps {
369 if let Some(transcript) = transcript_map.get(*lang_code) {
370 return Ok(transcript.clone());
371 }
372 }
373 }
374
375 Err(CouldNotRetrieveTranscript {
376 video_id: self.video_id.clone(),
377 reason: Some(CouldNotRetrieveTranscriptReason::NoTranscriptFound {
378 requested_language_codes: language_codes.iter().map(|&s| s.to_string()).collect(),
379 transcript_data: self.clone(),
380 }),
381 })
382 }
383
384 /// Returns a reference to all available transcripts.
385 ///
386 /// This method provides access to both manually created and automatically generated
387 /// transcripts as an iterator.
388 ///
389 /// # Returns
390 ///
391 /// An iterator over references to all available transcripts.
392 ///
393 /// # Example
394 ///
395 /// ```rust,no_run
396 /// # use yt_transcript_rs::YouTubeTranscriptApi;
397 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
398 /// let api = YouTubeTranscriptApi::new(None, None, None)?;
399 /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
400 ///
401 /// // Print info about all available transcripts
402 /// for transcript in transcript_list.transcripts() {
403 /// println!("Language: {} ({}), Auto-generated: {}",
404 /// transcript.language(),
405 /// transcript.language_code(),
406 /// transcript.is_generated());
407 /// }
408 /// # Ok(())
409 /// # }
410 /// ```
411 pub fn transcripts(&self) -> impl Iterator<Item = &Transcript> {
412 self.into_iter()
413 }
414}
415
416impl fmt::Display for TranscriptList {
417 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418 let mut transcript_strings = Vec::new();
419
420 // Add manually created transcripts
421 for transcript in self.manually_created_transcripts.values() {
422 transcript_strings.push(format!("{}", transcript));
423 }
424
425 // Add generated transcripts
426 for transcript in self.generated_transcripts.values() {
427 transcript_strings.push(format!("{}", transcript));
428 }
429
430 // Format the output
431 let language_desc = if transcript_strings.is_empty() {
432 "No transcripts found".to_string()
433 } else {
434 format!("Available transcripts: {}", transcript_strings.join(", "))
435 };
436
437 write!(f, "{}", language_desc)
438 }
439}
440
441impl IntoIterator for TranscriptList {
442 type Item = Transcript;
443 type IntoIter = std::vec::IntoIter<Self::Item>;
444
445 fn into_iter(self) -> Self::IntoIter {
446 let mut transcripts = Vec::new();
447 transcripts.extend(self.manually_created_transcripts.into_values());
448 transcripts.extend(self.generated_transcripts.into_values());
449 transcripts.into_iter()
450 }
451}
452
453impl<'a> IntoIterator for &'a TranscriptList {
454 type Item = &'a Transcript;
455 type IntoIter = std::iter::Chain<
456 std::iter::Map<
457 std::collections::hash_map::Values<'a, String, Transcript>,
458 fn(&'a Transcript) -> &'a Transcript,
459 >,
460 std::iter::Map<
461 std::collections::hash_map::Values<'a, String, Transcript>,
462 fn(&'a Transcript) -> &'a Transcript,
463 >,
464 >;
465
466 fn into_iter(self) -> Self::IntoIter {
467 fn id(t: &Transcript) -> &Transcript {
468 t
469 }
470 self.manually_created_transcripts
471 .values()
472 .map(id as _)
473 .chain(self.generated_transcripts.values().map(id as _))
474 }
475}