// yt-transcript-rs 0.1.8

use serde::{Deserialize, Serialize};
/// Fetched transcript representation and processing.
///
/// This module contains the `FetchedTranscript` type, which represents a fully retrieved
/// transcript from YouTube including all text segments with their timing information.
/// Unlike the `Transcript` type which serves as a handle for fetching, this type
/// contains the actual transcript content.
///
/// The module provides methods for working with complete transcripts, including
/// accessing individual segments, formatting the full text, and serializing to
/// various formats.
use std::collections::HashMap;
use std::iter::Iterator;
use std::vec::IntoIter;

use crate::models::FetchedTranscriptSnippet;

/// A fully fetched transcript: its timed text snippets plus identifying metadata.
///
/// This struct represents a successfully fetched transcript from YouTube,
/// containing both the full text content (divided into timed segments) and
/// metadata about the transcript.
///
/// A `FetchedTranscript` is typically obtained by calling `fetch()` on a `Transcript`
/// object. It provides the actual transcript content, whereas `Transcript` is more
/// like a handle for fetching.
///
/// # Features
///
/// * Contains all text segments with their timing information
/// * Provides metadata about the transcript (language, source, etc.)
/// * Implements `IntoIterator` both by value and by reference, so the snippets
///   can be consumed (`for s in fetched`) or borrowed (`for s in &fetched`)
/// * Derives `Serialize` / `Deserialize`, so it can be round-tripped through any
///   serde data format
///
/// # Example
///
/// ```rust,no_run
/// # use yt_transcript_rs::YouTubeTranscriptApi;
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let api = YouTubeTranscriptApi::new(None, None, None)?;
/// let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
/// let transcript = transcript_list.find_transcript(&["en"])?;
///
/// // Fetch the actual transcript content
/// let client = reqwest::Client::new();
/// let fetched = transcript.fetch(&client, false).await?;
///
/// // Access the full text
/// println!("Full transcript: {}", fetched.text());
///
/// // Or work with individual segments
/// for segment in &fetched {
///     println!("[{:.1}s - {:.1}s]: {}",
///         segment.start,
///         segment.start + segment.duration,
///         segment.text);
/// }
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FetchedTranscript {
    /// The list of transcript snippets (text segments with timing information).
    ///
    /// The order of this vector is significant: iteration and `text()` follow it,
    /// and `duration()` treats the last element as the end of the transcript.
    pub snippets: Vec<FetchedTranscriptSnippet>,

    /// YouTube video ID this transcript belongs to.
    pub video_id: String,

    /// Human-readable language name (e.g., "English", "Español").
    pub language: String,

    /// Language code (e.g., "en", "fr", "es-MX").
    pub language_code: String,

    /// Whether this transcript was automatically generated by YouTube.
    ///
    /// `true` indicates an auto-generated transcript (using speech recognition),
    /// while `false` indicates a manually created transcript (typically more accurate).
    pub is_generated: bool,
}

impl FetchedTranscript {
    /// Converts the transcript to a raw data format suitable for serialization.
    ///
    /// This method transforms the transcript into a vector of hashmaps containing
    /// the text, start time, and duration for each segment. This format is useful
    /// for JSON serialization or for integrating with other systems.
    ///
    /// # Returns
    ///
    /// A vector of hashmaps, each representing one transcript segment with keys:
    /// - "text": The segment text
    /// - "start": The start time in seconds
    /// - "duration": The segment duration in seconds
    ///
    /// JSON has no representation for NaN or infinite numbers. If a snippet's
    /// `start` or `duration` is not finite, the corresponding entry is emitted as
    /// JSON `null` rather than causing a panic.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use yt_transcript_rs::YouTubeTranscriptApi;
    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
    /// # let transcript = transcript_list.find_transcript(&["en"])?;
    /// # let client = reqwest::Client::new();
    /// # let fetched = transcript.fetch(&client, false).await?;
    /// // Convert to raw data (array of objects)
    /// let raw_data = fetched.to_raw_data();
    ///
    /// // Serialize to JSON
    /// let json = serde_json::to_string_pretty(&raw_data)?;
    /// println!("JSON transcript:\n{}", json);
    /// # Ok(())
    /// # }
    /// ```
    pub fn to_raw_data(&self) -> Vec<HashMap<String, serde_json::Value>> {
        self.snippets
            .iter()
            .map(|snippet| {
                // Exactly three keys per segment, so size the map up front.
                let mut map = HashMap::with_capacity(3);
                map.insert(
                    "text".to_string(),
                    serde_json::Value::String(snippet.text.clone()),
                );
                // `Value::from(f64)` yields `Value::Number` for finite input and
                // `Value::Null` for NaN/±infinity, so malformed timing data can
                // never panic here.
                map.insert("start".to_string(), serde_json::Value::from(snippet.start));
                map.insert(
                    "duration".to_string(),
                    serde_json::Value::from(snippet.duration),
                );
                map
            })
            .collect()
    }

    /// Returns the full transcript text as a single string.
    ///
    /// This method combines all transcript segments into a single string,
    /// with each segment separated by a space.
    ///
    /// # Returns
    ///
    /// A String containing the full transcript text. An empty transcript yields
    /// an empty string.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use yt_transcript_rs::YouTubeTranscriptApi;
    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
    /// # let transcript = transcript_list.find_transcript(&["en"])?;
    /// # let client = reqwest::Client::new();
    /// # let fetched = transcript.fetch(&client, false).await?;
    /// // Get the full text as a single string
    /// let full_text = fetched.text();
    /// println!("Transcript: {}", full_text);
    /// # Ok(())
    /// # }
    /// ```
    pub fn text(&self) -> String {
        // Borrow each segment's text; `join` performs the single allocation
        // needed for the result, so no per-segment clone is required.
        self.snippets
            .iter()
            .map(|snippet| snippet.text.as_str())
            .collect::<Vec<&str>>()
            .join(" ")
    }

    /// Returns a reference to the individual transcript segments.
    ///
    /// This method provides access to the raw transcript segments, each containing
    /// text with its corresponding timing information.
    ///
    /// # Returns
    ///
    /// A slice of `FetchedTranscriptSnippet` objects.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use yt_transcript_rs::YouTubeTranscriptApi;
    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
    /// # let transcript = transcript_list.find_transcript(&["en"])?;
    /// # let client = reqwest::Client::new();
    /// # let fetched = transcript.fetch(&client, false).await?;
    /// // Access individual segments
    /// for segment in fetched.parts() {
    ///     // Find segments mentioning a specific word
    ///     if segment.text.to_lowercase().contains("never") {
    ///         println!("Found at {}s: {}", segment.start, segment.text);
    ///     }
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn parts(&self) -> &[FetchedTranscriptSnippet] {
        &self.snippets
    }

    /// Returns the language of this transcript.
    ///
    /// # Returns
    ///
    /// The human-readable language name (e.g., "English", "Español")
    pub fn language(&self) -> &str {
        &self.language
    }

    /// Returns the language code of this transcript.
    ///
    /// # Returns
    ///
    /// The language code (e.g., "en", "es", "fr-CA")
    pub fn language_code(&self) -> &str {
        &self.language_code
    }

    /// Returns whether this transcript was automatically generated.
    ///
    /// # Returns
    ///
    /// `true` if automatically generated by YouTube, `false` if manually created
    pub fn is_generated(&self) -> bool {
        self.is_generated
    }

    /// Returns the total duration of the transcript in seconds.
    ///
    /// This calculates the end time of the last segment in the transcript
    /// (its `start` plus its `duration`). Earlier segments are not inspected.
    ///
    /// # Returns
    ///
    /// The total duration in seconds as a f64, or 0.0 if the transcript is empty.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use yt_transcript_rs::YouTubeTranscriptApi;
    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// # let api = YouTubeTranscriptApi::new(None, None, None)?;
    /// # let transcript_list = api.list_transcripts("dQw4w9WgXcQ").await?;
    /// # let transcript = transcript_list.find_transcript(&["en"])?;
    /// # let client = reqwest::Client::new();
    /// # let fetched = transcript.fetch(&client, false).await?;
    /// println!("Video duration: {:.2} seconds", fetched.duration());
    /// # Ok(())
    /// # }
    /// ```
    pub fn duration(&self) -> f64 {
        // `last()` is `None` for an empty transcript, which maps to 0.0.
        self.snippets
            .last()
            .map_or(0.0, |last| last.start + last.duration)
    }
}

impl IntoIterator for FetchedTranscript {
    type Item = FetchedTranscriptSnippet;
    type IntoIter = IntoIter<FetchedTranscriptSnippet>;

    /// Consumes the transcript and yields its snippets by value, in stored order.
    ///
    /// The metadata fields are dropped; only the snippet vector is handed to the
    /// returned iterator.
    fn into_iter(self) -> Self::IntoIter {
        let Self { snippets, .. } = self;
        snippets.into_iter()
    }
}

impl<'a> IntoIterator for &'a FetchedTranscript {
    type Item = &'a FetchedTranscriptSnippet;
    type IntoIter = std::slice::Iter<'a, FetchedTranscriptSnippet>;

    /// Yields shared references to the snippets, in stored order.
    ///
    /// The transcript itself is only borrowed, so it remains usable afterwards.
    fn into_iter(self) -> Self::IntoIter {
        let segments: &'a [FetchedTranscriptSnippet] = self.snippets.as_slice();
        segments.iter()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a single snippet from its text and timing values.
    fn snippet(text: &str, start: f64, duration: f64) -> FetchedTranscriptSnippet {
        FetchedTranscriptSnippet {
            text: text.to_string(),
            start,
            duration,
        }
    }

    /// Three-segment English, manually created transcript shared by all tests.
    fn create_test_transcript() -> FetchedTranscript {
        let snippets = vec![
            snippet("Hello world", 0.0, 3.5),
            snippet("This is a test", 3.5, 2.8),
            snippet("of the transcript system", 6.3, 4.2),
        ];

        FetchedTranscript {
            snippets,
            video_id: "test123".to_string(),
            language: "English".to_string(),
            language_code: "en".to_string(),
            is_generated: false,
        }
    }

    #[test]
    fn test_to_raw_data() {
        let rows = create_test_transcript().to_raw_data();
        assert_eq!(rows.len(), 3);

        // First segment
        let first = &rows[0];
        assert_eq!(first["text"], json!("Hello world"));
        assert_eq!(first["start"], json!(0.0));
        assert_eq!(first["duration"], json!(3.5));

        // Last segment
        let last = &rows[2];
        assert_eq!(last["text"], json!("of the transcript system"));
        assert_eq!(last["start"], json!(6.3));
        assert_eq!(last["duration"], json!(4.2));
    }

    #[test]
    fn test_text() {
        let joined = create_test_transcript().text();

        assert_eq!(
            joined,
            "Hello world This is a test of the transcript system"
        );
    }

    #[test]
    fn test_parts() {
        let transcript = create_test_transcript();
        let segments = transcript.parts();

        assert_eq!(segments.len(), 3);
        assert_eq!(segments[0].text, "Hello world");
        assert_eq!(segments[1].start, 3.5);
        assert_eq!(segments[2].duration, 4.2);
    }

    #[test]
    fn test_language_getters() {
        let transcript = create_test_transcript();

        assert_eq!(transcript.language(), "English");
        assert_eq!(transcript.language_code(), "en");
        assert!(!transcript.is_generated());
    }

    #[test]
    fn test_duration() {
        // The final segment starts at 6.3 and lasts 4.2, giving an end time of 10.5.
        assert_eq!(create_test_transcript().duration(), 10.5);

        // A transcript without any snippets reports zero duration.
        let empty_transcript = FetchedTranscript {
            snippets: Vec::new(),
            video_id: "empty123".to_string(),
            ..create_test_transcript()
        };
        assert_eq!(empty_transcript.duration(), 0.0);
    }

    #[test]
    fn test_into_iterator() {
        let transcript = create_test_transcript();

        // Borrowing iteration: every segment is visited and the transcript survives.
        let mut visited = 0;
        for seg in &transcript {
            assert!(seg.start >= 0.0);
            assert!(seg.duration > 0.0);
            assert!(!seg.text.is_empty());
            visited += 1;
        }
        assert_eq!(visited, 3);

        // Consuming iteration: segments come out by value, in their original order.
        let owned: Vec<FetchedTranscriptSnippet> = transcript.into_iter().collect();
        assert_eq!(owned.len(), 3);
        let texts: Vec<&str> = owned.iter().map(|seg| seg.text.as_str()).collect();
        assert_eq!(
            texts,
            vec![
                "Hello world",
                "This is a test",
                "of the transcript system"
            ]
        );
    }

    #[test]
    fn test_serialization() {
        let original = create_test_transcript();

        // Serialize and check that each metadata field appears in the JSON output.
        let encoded = serde_json::to_string(&original).unwrap();
        let expected_fragments = [
            "\"video_id\":\"test123\"",
            "\"language\":\"English\"",
            "\"language_code\":\"en\"",
            "\"is_generated\":false",
        ];
        for fragment in expected_fragments.iter() {
            assert!(encoded.contains(*fragment));
        }

        // Deserialize and check that the data round-trips.
        let decoded: FetchedTranscript = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.video_id, "test123");
        assert_eq!(decoded.language, "English");
        assert_eq!(decoded.snippets.len(), 3);
        assert_eq!(decoded.snippets[0].text, "Hello world");
    }
}