yt_transcript_rs/
fetched_transcript.rs

1use serde::{Deserialize, Serialize};
2/// Fetched transcript representation and processing.
3///
4/// This module contains the `FetchedTranscript` type, which represents a fully retrieved
5/// transcript from YouTube including all text segments with their timing information.
6/// Unlike the `Transcript` type which serves as a handle for fetching, this type
7/// contains the actual transcript content.
8///
9/// The module provides methods for working with complete transcripts, including
10/// accessing individual segments, formatting the full text, and serializing to
11/// various formats.
12use std::collections::HashMap;
13use std::iter::Iterator;
14use std::vec::IntoIter;
15
16use crate::models::FetchedTranscriptSnippet;
17
/// A fully fetched transcript: its timed text snippets plus identifying metadata.
///
/// This struct represents a successfully fetched transcript from YouTube,
/// containing both the text content (divided into timed segments) and
/// metadata about the transcript.
///
/// A `FetchedTranscript` is typically obtained by calling `fetch()` on a `Transcript`
/// object. It holds the actual transcript content, whereas `Transcript` is more
/// like a handle for fetching.
///
/// # Features
///
/// * Contains all text segments with their timing information
/// * Provides metadata about the transcript (video ID, language, language code,
///   and whether it was auto-generated)
/// * Can be iterated over, either by reference (`&transcript`) or by value,
///   to access individual segments
/// * Derives `Serialize` / `Deserialize`, so it can be stored or transmitted
///   with any serde-compatible format
///
/// # Example
///
/// ```rust,no_run
/// # use yt_transcript_rs::YouTubeTranscriptApi;
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let api = YouTubeTranscriptApi::new(None, None, None)?;
/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
/// let transcript = transcript_list.find_transcript(&["en"])?;
///
/// // Fetch the actual transcript content
/// let client = reqwest::Client::new();
/// let fetched = transcript.fetch(&client, false).await?;
///
/// // Access the full text
/// println!("Full transcript: {}", fetched.text());
///
/// // Or work with individual segments
/// for segment in &fetched {
///     println!("[{:.1}s - {:.1}s]: {}",
///         segment.start,
///         segment.start + segment.duration,
///         segment.text);
/// }
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FetchedTranscript {
    /// The transcript snippets (text segments with timing information).
    ///
    /// Methods on this type read the vector in its stored order; in particular
    /// `duration()` treats the final element as the end of the transcript.
    pub snippets: Vec<FetchedTranscriptSnippet>,

    /// YouTube video ID this transcript belongs to.
    pub video_id: String,

    /// Human-readable language name (e.g., "English", "Español").
    pub language: String,

    /// Language code (e.g., "en", "fr", "es-MX").
    pub language_code: String,

    /// Whether this transcript was automatically generated by YouTube.
    ///
    /// `true` indicates an auto-generated transcript (using speech recognition),
    /// while `false` indicates a manually created transcript (typically more accurate).
    pub is_generated: bool,
}
81
82impl FetchedTranscript {
83    /// Converts the transcript to a raw data format suitable for serialization.
84    ///
85    /// This method transforms the transcript into a vector of hashmaps containing
86    /// the text, start time, and duration for each segment. This format is useful
87    /// for JSON serialization or for integrating with other systems.
88    ///
89    /// # Returns
90    ///
91    /// A vector of hashmaps, each representing one transcript segment with keys:
92    /// - "text": The segment text
93    /// - "start": The start time in seconds
94    /// - "duration": The segment duration in seconds
95    ///
96    /// # Example
97    ///
98    /// ```rust,no_run
99    /// # use yt_transcript_rs::YouTubeTranscriptApi;
100    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
101    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
102    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
103    /// # let transcript = transcript_list.find_transcript(&["en"])?;
104    /// # let client = reqwest::Client::new();
105    /// # let fetched = transcript.fetch(&client, false).await?;
106    /// // Convert to raw data (array of objects)
107    /// let raw_data = fetched.to_raw_data();
108    ///
109    /// // Serialize to JSON
110    /// let json = serde_json::to_string_pretty(&raw_data)?;
111    /// println!("JSON transcript:\n{}", json);
112    /// # Ok(())
113    /// # }
114    /// ```
115    pub fn to_raw_data(&self) -> Vec<HashMap<String, serde_json::Value>> {
116        self.snippets
117            .iter()
118            .map(|snippet| {
119                let mut map = HashMap::new();
120                map.insert(
121                    "text".to_string(),
122                    serde_json::Value::String(snippet.text.clone()),
123                );
124                map.insert(
125                    "start".to_string(),
126                    serde_json::Value::Number(serde_json::Number::from_f64(snippet.start).unwrap()),
127                );
128                map.insert(
129                    "duration".to_string(),
130                    serde_json::Value::Number(
131                        serde_json::Number::from_f64(snippet.duration).unwrap(),
132                    ),
133                );
134                map
135            })
136            .collect()
137    }
138
139    /// Returns the full transcript text as a single string.
140    ///
141    /// This method combines all transcript segments into a single string,
142    /// with each segment separated by a space.
143    ///
144    /// # Returns
145    ///
146    /// A String containing the full transcript text.
147    ///
148    /// # Example
149    ///
150    /// ```rust,no_run
151    /// # use yt_transcript_rs::YouTubeTranscriptApi;
152    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
153    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
154    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
155    /// # let transcript = transcript_list.find_transcript(&["en"])?;
156    /// # let client = reqwest::Client::new();
157    /// # let fetched = transcript.fetch(&client, false).await?;
158    /// // Get the full text as a single string
159    /// let full_text = fetched.text();
160    /// println!("Transcript: {}", full_text);
161    /// # Ok(())
162    /// # }
163    /// ```
164    pub fn text(&self) -> String {
165        self.snippets
166            .iter()
167            .map(|snippet| snippet.text.clone())
168            .collect::<Vec<String>>()
169            .join(" ")
170    }
171
172    /// Returns a reference to the individual transcript segments.
173    ///
174    /// This method provides access to the raw transcript segments, each containing
175    /// text with its corresponding timing information.
176    ///
177    /// # Returns
178    ///
179    /// A slice of `FetchedTranscriptSnippet` objects.
180    ///
181    /// # Example
182    ///
183    /// ```rust,no_run
184    /// # use yt_transcript_rs::YouTubeTranscriptApi;
185    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
186    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
187    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
188    /// # let transcript = transcript_list.find_transcript(&["en"])?;
189    /// # let client = reqwest::Client::new();
190    /// # let fetched = transcript.fetch(&client, false).await?;
191    /// // Access individual segments
192    /// for segment in fetched.parts() {
193    ///     // Find segments mentioning a specific word
194    ///     if segment.text.to_lowercase().contains("never") {
195    ///         println!("Found at {}s: {}", segment.start, segment.text);
196    ///     }
197    /// }
198    /// # Ok(())
199    /// # }
200    /// ```
201    pub fn parts(&self) -> &[FetchedTranscriptSnippet] {
202        &self.snippets
203    }
204
205    /// Returns the language of this transcript.
206    ///
207    /// # Returns
208    ///
209    /// The human-readable language name (e.g., "English", "Español")
210    pub fn language(&self) -> &str {
211        &self.language
212    }
213
214    /// Returns the language code of this transcript.
215    ///
216    /// # Returns
217    ///
218    /// The language code (e.g., "en", "es", "fr-CA")
219    pub fn language_code(&self) -> &str {
220        &self.language_code
221    }
222
223    /// Returns whether this transcript was automatically generated.
224    ///
225    /// # Returns
226    ///
227    /// `true` if automatically generated by YouTube, `false` if manually created
228    pub fn is_generated(&self) -> bool {
229        self.is_generated
230    }
231
232    /// Returns the total duration of the transcript in seconds.
233    ///
234    /// This calculates the end time of the last segment in the transcript.
235    ///
236    /// # Returns
237    ///
238    /// The total duration in seconds as a f64, or 0.0 if the transcript is empty.
239    ///
240    /// # Example
241    ///
242    /// ```rust,no_run
243    /// # use yt_transcript_rs::YouTubeTranscriptApi;
244    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
245    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
246    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
247    /// # let transcript = transcript_list.find_transcript(&["en"])?;
248    /// # let client = reqwest::Client::new();
249    /// # let fetched = transcript.fetch(&client, false).await?;
250    /// println!("Video duration: {:.2} seconds", fetched.duration());
251    /// # Ok(())
252    /// # }
253    /// ```
254    pub fn duration(&self) -> f64 {
255        if self.snippets.is_empty() {
256            return 0.0;
257        }
258
259        let last = &self.snippets[self.snippets.len() - 1];
260        last.start + last.duration
261    }
262}
263
264impl IntoIterator for FetchedTranscript {
265    type Item = FetchedTranscriptSnippet;
266    type IntoIter = IntoIter<Self::Item>;
267
268    /// Creates an iterator that takes ownership of the transcript.
269    ///
270    /// This allows iterating over and consuming the transcript segments.
271    fn into_iter(self) -> Self::IntoIter {
272        self.snippets.into_iter()
273    }
274}
275
276impl<'a> IntoIterator for &'a FetchedTranscript {
277    type Item = &'a FetchedTranscriptSnippet;
278    type IntoIter = std::slice::Iter<'a, FetchedTranscriptSnippet>;
279
280    /// Creates an iterator that borrows the transcript.
281    ///
282    /// This allows iterating over the transcript segments without taking ownership.
283    fn into_iter(self) -> Self::IntoIter {
284        self.snippets.iter()
285    }
286}
287
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds the three-segment, manually created English fixture used by every test.
    fn create_test_transcript() -> FetchedTranscript {
        // (text, start, duration) for each segment, in playback order.
        let rows = [
            ("Hello world", 0.0, 3.5),
            ("This is a test", 3.5, 2.8),
            ("of the transcript system", 6.3, 4.2),
        ];

        FetchedTranscript {
            snippets: rows
                .iter()
                .map(|&(text, start, duration)| FetchedTranscriptSnippet {
                    text: text.to_string(),
                    start,
                    duration,
                })
                .collect(),
            video_id: "test123".to_string(),
            language: "English".to_string(),
            language_code: "en".to_string(),
            is_generated: false,
        }
    }

    #[test]
    fn test_to_raw_data() {
        let raw_data = create_test_transcript().to_raw_data();
        assert_eq!(raw_data.len(), 3);

        // First entry
        let first = &raw_data[0];
        assert_eq!(first["text"], json!("Hello world"));
        assert_eq!(first["start"], json!(0.0));
        assert_eq!(first["duration"], json!(3.5));

        // Last entry
        let last = &raw_data[2];
        assert_eq!(last["text"], json!("of the transcript system"));
        assert_eq!(last["start"], json!(6.3));
        assert_eq!(last["duration"], json!(4.2));
    }

    #[test]
    fn test_text() {
        let expected = "Hello world This is a test of the transcript system";
        assert_eq!(create_test_transcript().text(), expected);
    }

    #[test]
    fn test_parts() {
        let transcript = create_test_transcript();
        let segments = transcript.parts();
        assert_eq!(segments.len(), 3);

        let (head, middle, tail) = (&segments[0], &segments[1], &segments[2]);
        assert_eq!(head.text, "Hello world");
        assert_eq!(middle.start, 3.5);
        assert_eq!(tail.duration, 4.2);
    }

    #[test]
    fn test_language_getters() {
        let transcript = create_test_transcript();

        assert_eq!(
            (transcript.language(), transcript.language_code()),
            ("English", "en")
        );
        assert!(!transcript.is_generated());
    }

    #[test]
    fn test_duration() {
        // The last segment starts at 6.3 and lasts 4.2, so the transcript ends at 10.5.
        assert_eq!(create_test_transcript().duration(), 10.5);

        // A transcript without any segments reports zero length.
        let empty_transcript = FetchedTranscript {
            snippets: Vec::new(),
            video_id: "empty123".to_string(),
            ..create_test_transcript()
        };
        assert_eq!(empty_transcript.duration(), 0.0);
    }

    #[test]
    fn test_into_iterator() {
        let transcript = create_test_transcript();

        // Borrowing iteration: every segment is visited and looks sane.
        let mut visited = 0usize;
        for segment in &transcript {
            assert!(segment.start >= 0.0);
            assert!(segment.duration > 0.0);
            assert!(!segment.text.is_empty());
            visited += 1;
        }
        assert_eq!(visited, 3);

        // Consuming iteration: segments come out by value, in order.
        let texts: Vec<String> = transcript
            .into_iter()
            .map(|segment| segment.text)
            .collect();
        assert_eq!(
            texts,
            ["Hello world", "This is a test", "of the transcript system"]
        );
    }

    #[test]
    fn test_serialization() {
        // Serialize and look for each metadata field in the compact JSON output.
        let serialized = serde_json::to_string(&create_test_transcript()).unwrap();
        let has = |fragment: &str| serialized.contains(fragment);
        assert!(has("\"video_id\":\"test123\""));
        assert!(has("\"language\":\"English\""));
        assert!(has("\"language_code\":\"en\""));
        assert!(has("\"is_generated\":false"));

        // Round-trip back into a transcript and spot-check the contents.
        let deserialized: FetchedTranscript = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.video_id, "test123");
        assert_eq!(deserialized.language, "English");
        assert_eq!(deserialized.snippets.len(), 3);
        assert_eq!(deserialized.snippets[0].text, "Hello world");
    }
}
431}