yt_transcript_rs/
transcript_list.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::fmt;
4
5use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
6use crate::models::TranslationLanguage;
7use crate::transcript::Transcript;
8
9/// # TranscriptList
10///
11/// A collection of available transcripts for a YouTube video.
12///
13/// This struct provides access to all transcripts available for a video, including:
14/// - Manually created transcripts (by the video owner or contributors)
15/// - Automatically generated transcripts (created by YouTube's speech recognition)
16/// - Available translation languages for translatable transcripts
17///
18/// The `TranscriptList` differentiates between manually created and automatically generated
19/// transcripts, as the manually created ones tend to be more accurate. This allows you
20/// to prioritize manually created transcripts over automatically generated ones.
21///
22/// ## Usage Example
23///
24/// ```rust,no_run
25/// # use yt_transcript_rs::YouTubeTranscriptApi;
26/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
27/// let api = YouTubeTranscriptApi::new(None, None, None)?;
28///
29/// // Get a list of all available transcripts for a video
30/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
31///
32/// // Print all available transcripts
33/// println!("Available transcripts: {}", transcript_list);
34///
35/// // Find a transcript in a specific language (prioritizing English)
36/// let transcript = transcript_list.find_transcript(&["en", "en-US"])?;
37///
38/// // Or specifically find a manually created transcript
39/// let manual_transcript = transcript_list.find_manually_created_transcript(&["en"])?;
40///
41/// // Or retrieve an automatically generated transcript
42/// let auto_transcript = transcript_list.find_generated_transcript(&["en"])?;
43/// # Ok(())
44/// # }
45/// ```
46#[derive(Debug, Clone, Deserialize, Serialize)]
47pub struct TranscriptList {
48    /// The YouTube video ID this transcript list belongs to
49    pub video_id: String,
50
51    /// Map of language codes to manually created transcripts
52    pub manually_created_transcripts: HashMap<String, Transcript>,
53
54    /// Map of language codes to automatically generated transcripts
55    pub generated_transcripts: HashMap<String, Transcript>,
56
57    /// List of languages available for translation
58    pub translation_languages: Vec<TranslationLanguage>,
59}
60
61impl TranscriptList {
62    /// Creates a new TranscriptList with the provided components.
63    ///
64    /// # Parameters
65    ///
66    /// * `video_id` - The YouTube video ID this transcript list belongs to
67    /// * `manually_created_transcripts` - Map of language codes to manually created transcripts
68    /// * `generated_transcripts` - Map of language codes to automatically generated transcripts
69    /// * `translation_languages` - List of languages available for translation
70    ///
71    /// # Returns
72    ///
73    /// A new `TranscriptList` instance
74    pub fn new(
75        video_id: String,
76        manually_created_transcripts: HashMap<String, Transcript>,
77        generated_transcripts: HashMap<String, Transcript>,
78        translation_languages: Vec<TranslationLanguage>,
79    ) -> Self {
80        Self {
81            video_id,
82            manually_created_transcripts,
83            generated_transcripts,
84            translation_languages,
85        }
86    }
87
88    /// Creates a TranscriptList from YouTube's caption JSON data.
89    ///
90    /// This method parses YouTube's internal caption data structure to extract:
91    /// - Available transcripts (both manual and automatic)
92    /// - Their respective language codes and names
93    /// - Information about available translation languages
94    ///
95    /// # Parameters
96    ///
97    /// * `video_id` - The YouTube video ID
98    /// * `video_page_html` - JSON data extracted from YouTube's page containing caption information
99    ///
100    /// # Returns
101    ///
102    /// * `Result<Self, CouldNotRetrieveTranscript>` - A transcript list or an error
103    ///
104    /// # Errors
105    ///
106    /// Returns an error if the caption data cannot be properly parsed.
107    pub fn build(
108        video_id: String,
109        video_page_html: &serde_json::Value,
110    ) -> Result<Self, CouldNotRetrieveTranscript> {
111        let transcript_list = Self::build_without_client(video_id, video_page_html)?;
112
113        Ok(transcript_list)
114    }
115
116    /// Creates a TranscriptList from YouTube's caption JSON data without requiring a client.
117    ///
118    /// This method is similar to `build` but doesn't take a client parameter, making it
119    /// suitable for use in serialization/deserialization contexts.
120    ///
121    /// # Parameters
122    ///
123    /// * `video_id` - The YouTube video ID
124    /// * `video_page_html` - JSON data extracted from YouTube's page containing caption information
125    ///
126    /// # Returns
127    ///
128    /// * `Result<Self, CouldNotRetrieveTranscript>` - A transcript list or an error
129    ///
130    /// # Errors
131    ///
132    /// Returns an error if the caption data cannot be properly parsed.
133    pub fn build_without_client(
134        video_id: String,
135        video_page_html: &serde_json::Value,
136    ) -> Result<Self, CouldNotRetrieveTranscript> {
137        // Extract translation languages
138        let empty_vec = vec![];
139        let translation_languages_json = match video_page_html.get("translationLanguages") {
140            Some(val) => val.as_array().unwrap_or(&empty_vec),
141            None => &empty_vec,
142        };
143
144        let translation_languages = translation_languages_json
145            .iter()
146            .filter_map(|lang| {
147                let language_name = lang.get("languageName")?.get("simpleText")?.as_str()?;
148                let language_code = lang.get("languageCode")?.as_str()?;
149
150                Some(TranslationLanguage {
151                    language: language_name.to_string(),
152                    language_code: language_code.to_string(),
153                })
154            })
155            .collect::<Vec<_>>();
156
157        // Extract transcripts
158        let caption_tracks = match video_page_html.get("captionTracks") {
159            Some(val) => val.as_array().unwrap_or(&empty_vec),
160            None => &empty_vec,
161        };
162
163        let mut manually_created_transcripts = HashMap::new();
164        let mut generated_transcripts = HashMap::new();
165
166        for caption in caption_tracks {
167            let is_asr = caption
168                .get("kind")
169                .and_then(|k| k.as_str())
170                .map(|k| k == "asr")
171                .unwrap_or(false);
172
173            let language_code = match caption.get("languageCode").and_then(|lc| lc.as_str()) {
174                Some(code) => code.to_string(),
175                None => continue,
176            };
177
178            let base_url = match caption.get("baseUrl").and_then(|url| url.as_str()) {
179                Some(url) => url.to_string(),
180                None => continue,
181            };
182
183            let name = match caption
184                .get("name")
185                .and_then(|n| n.get("simpleText"))
186                .and_then(|st| st.as_str())
187            {
188                Some(name) => name.to_string(),
189                None => continue,
190            };
191
192            let is_translatable = caption
193                .get("isTranslatable")
194                .and_then(|t| t.as_bool())
195                .unwrap_or(false);
196
197            let tl = if is_translatable {
198                translation_languages.clone()
199            } else {
200                vec![]
201            };
202
203            let transcript = Transcript::new(
204                video_id.clone(),
205                base_url,
206                name,
207                language_code.clone(),
208                is_asr,
209                tl,
210            );
211
212            if is_asr {
213                generated_transcripts.insert(language_code, transcript);
214            } else {
215                manually_created_transcripts.insert(language_code, transcript);
216            }
217        }
218
219        Ok(TranscriptList::new(
220            video_id,
221            manually_created_transcripts,
222            generated_transcripts,
223            translation_languages,
224        ))
225    }
226
227    /// Finds a transcript matching one of the specified language codes.
228    ///
229    /// This method searches for transcripts in the order of priority:
230    /// 1. Manually created transcripts with the specified language codes (in order)
231    /// 2. Automatically generated transcripts with the specified language codes (in order)
232    ///
233    /// # Parameters
234    ///
235    /// * `language_codes` - Array of language codes to search for, in order of preference
236    ///
237    /// # Returns
238    ///
239    /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
240    ///
241    /// # Errors
242    ///
243    /// Returns an error if no transcript is found for any of the specified language codes.
244    ///
245    /// # Example
246    ///
247    /// ```rust,no_run
248    /// # use yt_transcript_rs::YouTubeTranscriptApi;
249    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
250    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
251    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
252    ///
253    /// // Try to find English, fall back to Spanish, then auto-generated English
254    /// let transcript = transcript_list.find_transcript(&["en", "es", "en-US"])?;
255    /// # Ok(())
256    /// # }
257    /// ```
258    pub fn find_transcript(
259        &self,
260        language_codes: &[&str],
261    ) -> Result<Transcript, CouldNotRetrieveTranscript> {
262        self.find_transcript_in_maps(
263            language_codes,
264            &[
265                &self.manually_created_transcripts,
266                &self.generated_transcripts,
267            ],
268        )
269    }
270
271    /// Finds a manually created transcript matching one of the specified language codes.
272    ///
273    /// This method only searches the manually created transcripts, skipping any
274    /// automatically generated ones. This is useful when you want to ensure you're
275    /// getting a human-created transcript for better accuracy.
276    ///
277    /// # Parameters
278    ///
279    /// * `language_codes` - Array of language codes to search for, in order of preference
280    ///
281    /// # Returns
282    ///
283    /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
284    ///
285    /// # Errors
286    ///
287    /// Returns an error if no manually created transcript is found for any of the
288    /// specified language codes.
289    ///
290    /// # Example
291    ///
292    /// ```rust,no_run
293    /// # use yt_transcript_rs::YouTubeTranscriptApi;
294    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
295    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
296    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
297    ///
298    /// // Only look for manually created transcripts
299    /// match transcript_list.find_manually_created_transcript(&["en"]) {
300    ///     Ok(transcript) => {
301    ///         println!("Found manual transcript!");
302    ///     },
303    ///     Err(_) => {
304    ///         println!("No manual transcript available, falling back to auto-generated");
305    ///         let auto_transcript = transcript_list.find_generated_transcript(&["en"])?;
306    ///     }
307    /// }
308    /// # Ok(())
309    /// # }
310    /// ```
311    pub fn find_manually_created_transcript(
312        &self,
313        language_codes: &[&str],
314    ) -> Result<Transcript, CouldNotRetrieveTranscript> {
315        self.find_transcript_in_maps(language_codes, &[&self.manually_created_transcripts])
316    }
317
318    /// Finds an automatically generated transcript matching one of the specified language codes.
319    ///
320    /// This method only searches the automatically generated transcripts, skipping any
321    /// manually created ones. This might be useful in rare cases where you specifically
322    /// want the auto-generated version.
323    ///
324    /// # Parameters
325    ///
326    /// * `language_codes` - Array of language codes to search for, in order of preference
327    ///
328    /// # Returns
329    ///
330    /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
331    ///
332    /// # Errors
333    ///
334    /// Returns an error if no automatically generated transcript is found for any of the
335    /// specified language codes.
336    pub fn find_generated_transcript(
337        &self,
338        language_codes: &[&str],
339    ) -> Result<Transcript, CouldNotRetrieveTranscript> {
340        self.find_transcript_in_maps(language_codes, &[&self.generated_transcripts])
341    }
342
343    /// Helper method to find a transcript in multiple transcript maps.
344    ///
345    /// This internal method is used by the public transcript finding methods to search
346    /// through the provided maps of transcripts for the first match with the specified
347    /// language codes.
348    ///
349    /// # Parameters
350    ///
351    /// * `language_codes` - Array of language codes to search for, in order of preference
352    /// * `transcript_maps` - Array of transcript maps to search through, in order of priority
353    ///
354    /// # Returns
355    ///
356    /// * `Result<Transcript, CouldNotRetrieveTranscript>` - Matching transcript or an error
357    ///
358    /// # Errors
359    ///
360    /// Returns an error if no transcript is found for any of the specified language codes
361    /// in any of the provided transcript maps.
362    fn find_transcript_in_maps(
363        &self,
364        language_codes: &[&str],
365        transcript_maps: &[&HashMap<String, Transcript>],
366    ) -> Result<Transcript, CouldNotRetrieveTranscript> {
367        for lang_code in language_codes {
368            for transcript_map in transcript_maps {
369                if let Some(transcript) = transcript_map.get(*lang_code) {
370                    return Ok(transcript.clone());
371                }
372            }
373        }
374
375        Err(CouldNotRetrieveTranscript {
376            video_id: self.video_id.clone(),
377            reason: Some(CouldNotRetrieveTranscriptReason::NoTranscriptFound {
378                requested_language_codes: language_codes.iter().map(|&s| s.to_string()).collect(),
379                transcript_data: self.clone(),
380            }),
381        })
382    }
383
384    /// Returns a reference to all available transcripts.
385    ///
386    /// This method provides access to both manually created and automatically generated
387    /// transcripts as an iterator.
388    ///
389    /// # Returns
390    ///
391    /// An iterator over references to all available transcripts.
392    ///
393    /// # Example
394    ///
395    /// ```rust,no_run
396    /// # use yt_transcript_rs::YouTubeTranscriptApi;
397    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
398    /// let api = YouTubeTranscriptApi::new(None, None, None)?;
399    /// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
400    ///
401    /// // Print info about all available transcripts
402    /// for transcript in transcript_list.transcripts() {
403    ///     println!("Language: {} ({}), Auto-generated: {}",
404    ///         transcript.language(),
405    ///         transcript.language_code(),
406    ///         transcript.is_generated());
407    /// }
408    /// # Ok(())
409    /// # }
410    /// ```
411    pub fn transcripts(&self) -> impl Iterator<Item = &Transcript> {
412        self.into_iter()
413    }
414}
415
416impl fmt::Display for TranscriptList {
417    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418        let mut transcript_strings = Vec::new();
419
420        // Add manually created transcripts
421        for transcript in self.manually_created_transcripts.values() {
422            transcript_strings.push(format!("{}", transcript));
423        }
424
425        // Add generated transcripts
426        for transcript in self.generated_transcripts.values() {
427            transcript_strings.push(format!("{}", transcript));
428        }
429
430        // Format the output
431        let language_desc = if transcript_strings.is_empty() {
432            "No transcripts found".to_string()
433        } else {
434            format!("Available transcripts: {}", transcript_strings.join(", "))
435        };
436
437        write!(f, "{}", language_desc)
438    }
439}
440
441impl IntoIterator for TranscriptList {
442    type Item = Transcript;
443    type IntoIter = std::vec::IntoIter<Self::Item>;
444
445    fn into_iter(self) -> Self::IntoIter {
446        let mut transcripts = Vec::new();
447        transcripts.extend(self.manually_created_transcripts.into_values());
448        transcripts.extend(self.generated_transcripts.into_values());
449        transcripts.into_iter()
450    }
451}
452
453impl<'a> IntoIterator for &'a TranscriptList {
454    type Item = &'a Transcript;
455    type IntoIter = std::iter::Chain<
456        std::iter::Map<
457            std::collections::hash_map::Values<'a, String, Transcript>,
458            fn(&'a Transcript) -> &'a Transcript,
459        >,
460        std::iter::Map<
461            std::collections::hash_map::Values<'a, String, Transcript>,
462            fn(&'a Transcript) -> &'a Transcript,
463        >,
464    >;
465
466    fn into_iter(self) -> Self::IntoIter {
467        fn id(t: &Transcript) -> &Transcript {
468            t
469        }
470        self.manually_created_transcripts
471            .values()
472            .map(id as _)
473            .chain(self.generated_transcripts.values().map(id as _))
474    }
475}