yt_transcript_rs/fetched_transcript.rs
//! Fetched transcript representation and processing.
//!
//! This module contains the `FetchedTranscript` type, which represents a fully retrieved
//! transcript from YouTube, including all text segments with their timing information.
//! Unlike the `Transcript` type, which serves as a handle for fetching, this type
//! contains the actual transcript content.
//!
//! The module provides methods for working with complete transcripts, including
//! accessing individual segments, formatting the full text, and serializing to
//! various formats.

use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use std::iter::Iterator;
14use std::vec::IntoIter;
15
16use crate::models::FetchedTranscriptSnippet;
17
18/// A complete transcript with all the snippets and metadata.
19///
20/// This struct represents a successfully fetched transcript from YouTube,
21/// containing both the full text content (divided into timed segments) and
22/// metadata about the transcript.
23///
24/// A `FetchedTranscript` is typically obtained by calling `fetch()` on a `Transcript`
25/// object. It provides the actual transcript content, whereas `Transcript` is more
26/// like a handle for fetching.
27///
28/// # Features
29///
30/// * Contains all text segments with their timing information
31/// * Provides metadata about the transcript (language, source, etc.)
32/// * Can be iterated over to access individual segments
33/// * Supports conversion to various formats for storage or display
34///
35/// # Example
36///
37/// ```rust,no_run
38/// # use yt_transcript_rs::YouTubeTranscriptApi;
39/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
40/// let api = YouTubeTranscriptApi::new(None, None, None)?;
41/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
42/// let transcript = transcript_list.find_transcript(&["en"])?;
43///
44/// // Fetch the actual transcript content
45/// let client = reqwest::Client::new();
46/// let fetched = transcript.fetch(&client, false).await?;
47///
48/// // Access the full text
49/// println!("Full transcript: {}", fetched.text());
50///
51/// // Or work with individual segments
52/// for segment in &fetched {
53/// println!("[{:.1}s - {:.1}s]: {}",
54/// segment.start,
55/// segment.start + segment.duration,
56/// segment.text);
57/// }
58/// # Ok(())
59/// # }
60/// ```
61#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct FetchedTranscript {
63 /// The list of transcript snippets (text segments with timing information).
64 pub snippets: Vec<FetchedTranscriptSnippet>,
65
66 /// YouTube video ID this transcript belongs to.
67 pub video_id: String,
68
69 /// Human-readable language name (e.g., "English", "Español").
70 pub language: String,
71
72 /// Language code (e.g., "en", "fr", "es-MX").
73 pub language_code: String,
74
75 /// Whether this transcript was automatically generated by YouTube.
76 ///
77 /// `true` indicates an auto-generated transcript (using speech recognition),
78 /// while `false` indicates a manually created transcript (typically more accurate).
79 pub is_generated: bool,
80}
81
82impl FetchedTranscript {
83 /// Converts the transcript to a raw data format suitable for serialization.
84 ///
85 /// This method transforms the transcript into a vector of hashmaps containing
86 /// the text, start time, and duration for each segment. This format is useful
87 /// for JSON serialization or for integrating with other systems.
88 ///
89 /// # Returns
90 ///
91 /// A vector of hashmaps, each representing one transcript segment with keys:
92 /// - "text": The segment text
93 /// - "start": The start time in seconds
94 /// - "duration": The segment duration in seconds
95 ///
96 /// # Example
97 ///
98 /// ```rust,no_run
99 /// # use yt_transcript_rs::YouTubeTranscriptApi;
100 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
101 /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
102 /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
103 /// # let transcript = transcript_list.find_transcript(&["en"])?;
104 /// # let client = reqwest::Client::new();
105 /// # let fetched = transcript.fetch(&client, false).await?;
106 /// // Convert to raw data (array of objects)
107 /// let raw_data = fetched.to_raw_data();
108 ///
109 /// // Serialize to JSON
110 /// let json = serde_json::to_string_pretty(&raw_data)?;
111 /// println!("JSON transcript:\n{}", json);
112 /// # Ok(())
113 /// # }
114 /// ```
115 pub fn to_raw_data(&self) -> Vec<HashMap<String, serde_json::Value>> {
116 self.snippets
117 .iter()
118 .map(|snippet| {
119 let mut map = HashMap::new();
120 map.insert(
121 "text".to_string(),
122 serde_json::Value::String(snippet.text.clone()),
123 );
124 map.insert(
125 "start".to_string(),
126 serde_json::Value::Number(serde_json::Number::from_f64(snippet.start).unwrap()),
127 );
128 map.insert(
129 "duration".to_string(),
130 serde_json::Value::Number(
131 serde_json::Number::from_f64(snippet.duration).unwrap(),
132 ),
133 );
134 map
135 })
136 .collect()
137 }
138
139 /// Returns the full transcript text as a single string.
140 ///
141 /// This method combines all transcript segments into a single string,
142 /// with each segment separated by a space.
143 ///
144 /// # Returns
145 ///
146 /// A String containing the full transcript text.
147 ///
148 /// # Example
149 ///
150 /// ```rust,no_run
151 /// # use yt_transcript_rs::YouTubeTranscriptApi;
152 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
153 /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
154 /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
155 /// # let transcript = transcript_list.find_transcript(&["en"])?;
156 /// # let client = reqwest::Client::new();
157 /// # let fetched = transcript.fetch(&client, false).await?;
158 /// // Get the full text as a single string
159 /// let full_text = fetched.text();
160 /// println!("Transcript: {}", full_text);
161 /// # Ok(())
162 /// # }
163 /// ```
164 pub fn text(&self) -> String {
165 self.snippets
166 .iter()
167 .map(|snippet| snippet.text.clone())
168 .collect::<Vec<String>>()
169 .join(" ")
170 }
171
172 /// Returns a reference to the individual transcript segments.
173 ///
174 /// This method provides access to the raw transcript segments, each containing
175 /// text with its corresponding timing information.
176 ///
177 /// # Returns
178 ///
179 /// A slice of `FetchedTranscriptSnippet` objects.
180 ///
181 /// # Example
182 ///
183 /// ```rust,no_run
184 /// # use yt_transcript_rs::YouTubeTranscriptApi;
185 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
186 /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
187 /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
188 /// # let transcript = transcript_list.find_transcript(&["en"])?;
189 /// # let client = reqwest::Client::new();
190 /// # let fetched = transcript.fetch(&client, false).await?;
191 /// // Access individual segments
192 /// for segment in fetched.parts() {
193 /// // Find segments mentioning a specific word
194 /// if segment.text.to_lowercase().contains("never") {
195 /// println!("Found at {}s: {}", segment.start, segment.text);
196 /// }
197 /// }
198 /// # Ok(())
199 /// # }
200 /// ```
201 pub fn parts(&self) -> &[FetchedTranscriptSnippet] {
202 &self.snippets
203 }
204
205 /// Returns the language of this transcript.
206 ///
207 /// # Returns
208 ///
209 /// The human-readable language name (e.g., "English", "Español")
210 pub fn language(&self) -> &str {
211 &self.language
212 }
213
214 /// Returns the language code of this transcript.
215 ///
216 /// # Returns
217 ///
218 /// The language code (e.g., "en", "es", "fr-CA")
219 pub fn language_code(&self) -> &str {
220 &self.language_code
221 }
222
223 /// Returns whether this transcript was automatically generated.
224 ///
225 /// # Returns
226 ///
227 /// `true` if automatically generated by YouTube, `false` if manually created
228 pub fn is_generated(&self) -> bool {
229 self.is_generated
230 }
231
232 /// Returns the total duration of the transcript in seconds.
233 ///
234 /// This calculates the end time of the last segment in the transcript.
235 ///
236 /// # Returns
237 ///
238 /// The total duration in seconds as a f64, or 0.0 if the transcript is empty.
239 ///
240 /// # Example
241 ///
242 /// ```rust,no_run
243 /// # use yt_transcript_rs::YouTubeTranscriptApi;
244 /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
245 /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
246 /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
247 /// # let transcript = transcript_list.find_transcript(&["en"])?;
248 /// # let client = reqwest::Client::new();
249 /// # let fetched = transcript.fetch(&client, false).await?;
250 /// println!("Video duration: {:.2} seconds", fetched.duration());
251 /// # Ok(())
252 /// # }
253 /// ```
254 pub fn duration(&self) -> f64 {
255 if self.snippets.is_empty() {
256 return 0.0;
257 }
258
259 let last = &self.snippets[self.snippets.len() - 1];
260 last.start + last.duration
261 }
262}
263
264impl IntoIterator for FetchedTranscript {
265 type Item = FetchedTranscriptSnippet;
266 type IntoIter = IntoIter<Self::Item>;
267
268 /// Creates an iterator that takes ownership of the transcript.
269 ///
270 /// This allows iterating over and consuming the transcript segments.
271 fn into_iter(self) -> Self::IntoIter {
272 self.snippets.into_iter()
273 }
274}
275
276impl<'a> IntoIterator for &'a FetchedTranscript {
277 type Item = &'a FetchedTranscriptSnippet;
278 type IntoIter = std::slice::Iter<'a, FetchedTranscriptSnippet>;
279
280 /// Creates an iterator that borrows the transcript.
281 ///
282 /// This allows iterating over the transcript segments without taking ownership.
283 fn into_iter(self) -> Self::IntoIter {
284 self.snippets.iter()
285 }
286}
287
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a single snippet from its text and timing.
    fn snippet(text: &str, start: f64, duration: f64) -> FetchedTranscriptSnippet {
        FetchedTranscriptSnippet {
            text: text.to_string(),
            start,
            duration,
        }
    }

    /// Builds the three-snippet English transcript shared by the tests below.
    fn sample_transcript() -> FetchedTranscript {
        FetchedTranscript {
            snippets: vec![
                snippet("Hello world", 0.0, 3.5),
                snippet("This is a test", 3.5, 2.8),
                snippet("of the transcript system", 6.3, 4.2),
            ],
            video_id: "test123".to_string(),
            language: "English".to_string(),
            language_code: "en".to_string(),
            is_generated: false,
        }
    }

    #[test]
    fn test_to_raw_data() {
        let records = sample_transcript().to_raw_data();

        assert_eq!(records.len(), 3);

        // First record
        let first = &records[0];
        assert_eq!(first["text"], json!("Hello world"));
        assert_eq!(first["start"], json!(0.0));
        assert_eq!(first["duration"], json!(3.5));

        // Last record
        let last = &records[2];
        assert_eq!(last["text"], json!("of the transcript system"));
        assert_eq!(last["start"], json!(6.3));
        assert_eq!(last["duration"], json!(4.2));
    }

    #[test]
    fn test_text() {
        let joined = sample_transcript().text();

        assert_eq!(joined, "Hello world This is a test of the transcript system");
    }

    #[test]
    fn test_parts() {
        let transcript = sample_transcript();
        let segments = transcript.parts();

        assert_eq!(segments.len(), 3);
        assert_eq!(segments[0].text, "Hello world");
        assert_eq!(segments[1].start, 3.5);
        assert_eq!(segments[2].duration, 4.2);
    }

    #[test]
    fn test_language_getters() {
        let transcript = sample_transcript();

        assert_eq!(transcript.language(), "English");
        assert_eq!(transcript.language_code(), "en");
        assert!(!transcript.is_generated());
    }

    #[test]
    fn test_duration() {
        // The last snippet starts at 6.3 and lasts 4.2, so the total is 10.5.
        assert_eq!(sample_transcript().duration(), 10.5);

        // A transcript without snippets has no duration.
        let no_snippets = FetchedTranscript {
            snippets: Vec::new(),
            video_id: "empty123".to_string(),
            language: "English".to_string(),
            language_code: "en".to_string(),
            is_generated: false,
        };

        assert_eq!(no_snippets.duration(), 0.0);
    }

    #[test]
    fn test_into_iterator() {
        let transcript = sample_transcript();

        // Borrowing iteration leaves the transcript usable afterwards.
        let mut visited = 0;
        for segment in &transcript {
            assert!(segment.start >= 0.0);
            assert!(segment.duration > 0.0);
            assert!(!segment.text.is_empty());
            visited += 1;
        }
        assert_eq!(visited, 3);

        // Consuming iteration hands out the owned snippets in order.
        let texts: Vec<String> = transcript
            .into_iter()
            .map(|segment| segment.text)
            .collect();
        assert_eq!(texts.len(), 3);
        assert_eq!(texts[0], "Hello world");
        assert_eq!(texts[1], "This is a test");
        assert_eq!(texts[2], "of the transcript system");
    }

    #[test]
    fn test_serialization() {
        let transcript = sample_transcript();

        // Serialization emits every metadata field.
        let encoded = serde_json::to_string(&transcript).unwrap();
        let expected_fragments = [
            "\"video_id\":\"test123\"",
            "\"language\":\"English\"",
            "\"language_code\":\"en\"",
            "\"is_generated\":false",
        ];
        for fragment in &expected_fragments {
            assert!(encoded.contains(*fragment));
        }

        // Deserialization restores the metadata and the snippets.
        let decoded: FetchedTranscript = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.video_id, "test123");
        assert_eq!(decoded.language, "English");
        assert_eq!(decoded.snippets.len(), 3);
        assert_eq!(decoded.snippets[0].text, "Hello world");
    }
}
431}