yt_transcript_rs/
transcript_parser.rs

1use anyhow::Result;
2use regex::Regex;
3
4use crate::models::FetchedTranscriptSnippet;
5
6/// # TranscriptParser
7///
8/// Parses YouTube transcript XML data into structured transcript snippets.
9///
10/// This parser handles YouTube's XML format for transcripts and can:
11/// - Extract text content, timing information, and duration
12/// - Optionally preserve specified HTML formatting tags
13/// - Remove unwanted HTML tags
14///
15/// ## Usage Example
16///
17/// ```rust,no_run
18/// use yt_transcript_rs::transcript_parser::TranscriptParser;
19///
20/// // Create a parser that strips all formatting
21/// let parser = TranscriptParser::new(false);
22///
23/// // Or create a parser that preserves certain formatting tags (bold, italic, etc.)
24/// let formatting_parser = TranscriptParser::new(true);
25///
26/// // Parse XML transcript data
27/// let xml = r#"
28///     <transcript>
29///         <text start="0.0" dur="1.0">This is a transcript</text>
30///         <text start="1.0" dur="1.5">With multiple entries</text>
31///     </transcript>
32/// "#;
33///
34/// let snippets = parser.parse(xml).unwrap();
35/// ```
36#[derive(Debug)]
37/// Parser for YouTube transcript XML data
38pub struct TranscriptParser {
39    /// Whether to preserve specified formatting tags in the transcript
40    preserve_formatting: bool,
41    /// Regex pattern for matching HTML tags
42    html_regex: Regex,
43}
44
45impl TranscriptParser {
46    /// List of HTML formatting tags that can be preserved when `preserve_formatting` is enabled.
47    ///
48    /// These tags are commonly used for text formatting and can be preserved in the transcript:
49    /// - strong, b: Bold text
50    /// - em, i: Italic text
51    /// - mark: Highlighted text
52    /// - small: Smaller text
53    /// - del: Deleted/strikethrough text
54    /// - ins: Inserted/underlined text
55    /// - sub: Subscript
56    /// - sup: Superscript
57    const FORMATTING_TAGS: [&'static str; 10] = [
58        "strong", // important (bold)
59        "em",     // emphasized (italic)
60        "b",      // bold
61        "i",      // italic
62        "mark",   // highlighted
63        "small",  // smaller
64        "del",    // deleted/strikethrough
65        "ins",    // inserted/underlined
66        "sub",    // subscript
67        "sup",    // superscript
68    ];
69
70    /// Creates a new transcript parser.
71    ///
72    /// # Parameters
73    ///
74    /// * `preserve_formatting` - If `true`, certain HTML formatting tags (like bold, italic) will be
75    ///   kept in the transcript. If `false`, all HTML tags will be removed.
76    ///
77    /// # Returns
78    ///
79    /// A new `TranscriptParser` instance configured according to the formatting preference.
80    ///
81    /// # Example
82    ///
83    /// ```rust,no_run
84    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
85    /// // Create a parser that removes all HTML tags
86    /// let plain_parser = TranscriptParser::new(false);
87    ///
88    /// // Create a parser that preserves formatting tags
89    /// let formatted_parser = TranscriptParser::new(true);
90    /// ```
91    pub fn new(preserve_formatting: bool) -> Self {
92        // Use a simple regex that matches all HTML tags - we'll handle the preservation logic separately
93        let html_regex = Regex::new(r"<[^>]*>").unwrap();
94
95        Self {
96            preserve_formatting,
97            html_regex,
98        }
99    }
100
101    /// Parses YouTube transcript XML into a collection of transcript snippets.
102    ///
103    /// This method takes raw XML data from YouTube transcripts and processes it into
104    /// structured `FetchedTranscriptSnippet` objects that contain:
105    /// - Text content (with optional formatting)
106    /// - Start time in seconds
107    /// - Duration in seconds
108    ///
109    /// # Parameters
110    ///
111    /// * `raw_data` - The raw XML string containing transcript data from YouTube
112    ///
113    /// # Returns
114    ///
115    /// * `Result<Vec<FetchedTranscriptSnippet>, anyhow::Error>` - A vector of transcript snippets on success,
116    ///   or an error if parsing fails
117    ///
118    /// # Errors
119    ///
120    /// This function will return an error if:
121    /// - The XML data is malformed and cannot be parsed
122    /// - Required attributes are missing or invalid
123    ///
124    /// # Example
125    ///
126    /// ```rust,no_run
127    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
128    /// # let xml = "<transcript><text start=\"0.0\" dur=\"1.0\">Hello</text></transcript>";
129    /// let parser = TranscriptParser::new(false);
130    /// let snippets = parser.parse(xml).unwrap();
131    ///
132    /// for snippet in snippets {
133    ///     println!("[{:.1}-{:.1}s] {}",
134    ///         snippet.start,
135    ///         snippet.start + snippet.duration,
136    ///         snippet.text);
137    /// }
138    /// ```
139    pub fn parse(&self, raw_data: &str) -> Result<Vec<FetchedTranscriptSnippet>, anyhow::Error> {
140        let mut snippets = Vec::new();
141
142        // Parse XML using roxmltree
143        let document = roxmltree::Document::parse(raw_data)?;
144        let transcript_elem = document.root_element();
145
146        // Process each text element in the transcript
147        for text_elem in transcript_elem
148            .children()
149            .filter(|n| n.has_tag_name("text"))
150        {
151            // Extract start time (defaults to 0.0 if missing or invalid)
152            let start = text_elem
153                .attribute("start")
154                .and_then(|s| s.parse::<f64>().ok())
155                .unwrap_or(0.0);
156
157            // Extract duration (defaults to 0.0 if missing or invalid)
158            let duration = text_elem
159                .attribute("dur")
160                .and_then(|s| s.parse::<f64>().ok())
161                .unwrap_or(0.0);
162
163            // Get text directly from the node
164            let text = if let Some(text) = text_elem.text() {
165                text.to_string()
166            } else {
167                String::new()
168            };
169
170            // Process the text based on formatting preferences
171            let text = if self.preserve_formatting {
172                // Keep specified formatting tags, remove others
173                self.process_with_formatting(&text)
174            } else {
175                // Remove all HTML tags
176                self.html_regex.replace_all(&text, "").to_string()
177            };
178
179            // Create and add the snippet to our collection
180            snippets.push(FetchedTranscriptSnippet {
181                text,
182                start,
183                duration,
184            });
185        }
186
187        Ok(snippets)
188    }
189
190    /// Processes text to preserve only specific allowed HTML formatting tags.
191    ///
192    /// This method:
193    /// 1. Identifies all HTML tags in the text
194    /// 2. Keeps only the tags listed in `FORMATTING_TAGS`
195    /// 3. Removes all other HTML tags
196    ///
197    /// # Parameters
198    ///
199    /// * `text` - The text containing HTML tags to process
200    ///
201    /// # Returns
202    ///
203    /// A string with only the allowed formatting tags preserved and all others removed.
204    ///
205    /// # Example (internal usage)
206    ///
207    /// ```rust,no_run
208    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
209    /// # let parser = TranscriptParser::new(true);
210    /// # let input = "<b>Bold</b> and <span>span</span> and <i>italic</i>";
211    /// // Only <b> and <i> tags would be preserved, <span> would be removed
212    /// let result = parser.process_with_formatting(input);
213    /// // Result would be "<b>Bold</b> and span and <i>italic</i>"
214    /// ```
215    pub fn process_with_formatting(&self, text: &str) -> String {
216        let mut result = text.to_string();
217
218        // First pass: collect all HTML tags
219        let tag_matches: Vec<(usize, usize, String)> = self
220            .html_regex
221            .find_iter(text)
222            .map(|m| {
223                let tag_content = &text[m.start()..m.end()];
224                (m.start(), m.end(), tag_content.to_string())
225            })
226            .collect();
227
228        // Second pass: only keep allowed formatting tags
229        let mut offset = 0;
230        for (start, end, tag) in tag_matches {
231            let adjusted_start = start - offset;
232            let adjusted_end = end - offset;
233
234            // Check if this tag should be preserved based on our allowed list
235            let keep_tag = Self::FORMATTING_TAGS.iter().any(|&allowed_tag| {
236                let open_tag = format!("<{}", allowed_tag);
237                let close_tag = format!("</{}", allowed_tag);
238                tag.starts_with(&open_tag) || tag.starts_with(&close_tag)
239            });
240
241            if !keep_tag {
242                // Remove tag that's not in the allowed list
243                result.replace_range(adjusted_start..adjusted_end, "");
244                offset += adjusted_end - adjusted_start;
245            }
246        }
247
248        result
249    }
250}