yt_transcript_rs/transcript_parser.rs
1use anyhow::Result;
2use regex::Regex;
3
4use crate::models::FetchedTranscriptSnippet;
5
6/// # TranscriptParser
7///
8/// Parses YouTube transcript XML data into structured transcript snippets.
9///
10/// This parser handles YouTube's XML format for transcripts and can:
11/// - Extract text content, timing information, and duration
12/// - Optionally preserve specified HTML formatting tags
13/// - Remove unwanted HTML tags
14///
15/// ## Usage Example
16///
17/// ```rust,no_run
18/// use yt_transcript_rs::transcript_parser::TranscriptParser;
19///
20/// // Create a parser that strips all formatting
21/// let parser = TranscriptParser::new(false);
22///
23/// // Or create a parser that preserves certain formatting tags (bold, italic, etc.)
24/// let formatting_parser = TranscriptParser::new(true);
25///
26/// // Parse XML transcript data
27/// let xml = r#"
28/// <transcript>
29/// <text start="0.0" dur="1.0">This is a transcript</text>
30/// <text start="1.0" dur="1.5">With multiple entries</text>
31/// </transcript>
32/// "#;
33///
34/// let snippets = parser.parse(xml).unwrap();
35/// ```
36#[derive(Debug)]
37/// Parser for YouTube transcript XML data
38pub struct TranscriptParser {
39 /// Whether to preserve specified formatting tags in the transcript
40 preserve_formatting: bool,
41 /// Regex pattern for matching HTML tags
42 html_regex: Regex,
43}
44
45impl TranscriptParser {
46 /// List of HTML formatting tags that can be preserved when `preserve_formatting` is enabled.
47 ///
48 /// These tags are commonly used for text formatting and can be preserved in the transcript:
49 /// - strong, b: Bold text
50 /// - em, i: Italic text
51 /// - mark: Highlighted text
52 /// - small: Smaller text
53 /// - del: Deleted/strikethrough text
54 /// - ins: Inserted/underlined text
55 /// - sub: Subscript
56 /// - sup: Superscript
57 const FORMATTING_TAGS: [&'static str; 10] = [
58 "strong", // important (bold)
59 "em", // emphasized (italic)
60 "b", // bold
61 "i", // italic
62 "mark", // highlighted
63 "small", // smaller
64 "del", // deleted/strikethrough
65 "ins", // inserted/underlined
66 "sub", // subscript
67 "sup", // superscript
68 ];
69
70 /// Creates a new transcript parser.
71 ///
72 /// # Parameters
73 ///
74 /// * `preserve_formatting` - If `true`, certain HTML formatting tags (like bold, italic) will be
75 /// kept in the transcript. If `false`, all HTML tags will be removed.
76 ///
77 /// # Returns
78 ///
79 /// A new `TranscriptParser` instance configured according to the formatting preference.
80 ///
81 /// # Example
82 ///
83 /// ```rust,no_run
84 /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
85 /// // Create a parser that removes all HTML tags
86 /// let plain_parser = TranscriptParser::new(false);
87 ///
88 /// // Create a parser that preserves formatting tags
89 /// let formatted_parser = TranscriptParser::new(true);
90 /// ```
91 pub fn new(preserve_formatting: bool) -> Self {
92 // Use a simple regex that matches all HTML tags - we'll handle the preservation logic separately
93 let html_regex = Regex::new(r"<[^>]*>").unwrap();
94
95 Self {
96 preserve_formatting,
97 html_regex,
98 }
99 }
100
101 /// Parses YouTube transcript XML into a collection of transcript snippets.
102 ///
103 /// This method takes raw XML data from YouTube transcripts and processes it into
104 /// structured `FetchedTranscriptSnippet` objects that contain:
105 /// - Text content (with optional formatting)
106 /// - Start time in seconds
107 /// - Duration in seconds
108 ///
109 /// # Parameters
110 ///
111 /// * `raw_data` - The raw XML string containing transcript data from YouTube
112 ///
113 /// # Returns
114 ///
115 /// * `Result<Vec<FetchedTranscriptSnippet>, anyhow::Error>` - A vector of transcript snippets on success,
116 /// or an error if parsing fails
117 ///
118 /// # Errors
119 ///
120 /// This function will return an error if:
121 /// - The XML data is malformed and cannot be parsed
122 /// - Required attributes are missing or invalid
123 ///
124 /// # Example
125 ///
126 /// ```rust,no_run
127 /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
128 /// # let xml = "<transcript><text start=\"0.0\" dur=\"1.0\">Hello</text></transcript>";
129 /// let parser = TranscriptParser::new(false);
130 /// let snippets = parser.parse(xml).unwrap();
131 ///
132 /// for snippet in snippets {
133 /// println!("[{:.1}-{:.1}s] {}",
134 /// snippet.start,
135 /// snippet.start + snippet.duration,
136 /// snippet.text);
137 /// }
138 /// ```
139 pub fn parse(&self, raw_data: &str) -> Result<Vec<FetchedTranscriptSnippet>, anyhow::Error> {
140 let mut snippets = Vec::new();
141
142 // Parse XML using roxmltree
143 let document = roxmltree::Document::parse(raw_data)?;
144 let transcript_elem = document.root_element();
145
146 // Process each text element in the transcript
147 for text_elem in transcript_elem
148 .children()
149 .filter(|n| n.has_tag_name("text"))
150 {
151 // Extract start time (defaults to 0.0 if missing or invalid)
152 let start = text_elem
153 .attribute("start")
154 .and_then(|s| s.parse::<f64>().ok())
155 .unwrap_or(0.0);
156
157 // Extract duration (defaults to 0.0 if missing or invalid)
158 let duration = text_elem
159 .attribute("dur")
160 .and_then(|s| s.parse::<f64>().ok())
161 .unwrap_or(0.0);
162
163 // Get text directly from the node
164 let text = if let Some(text) = text_elem.text() {
165 text.to_string()
166 } else {
167 String::new()
168 };
169
170 // Process the text based on formatting preferences
171 let text = if self.preserve_formatting {
172 // Keep specified formatting tags, remove others
173 self.process_with_formatting(&text)
174 } else {
175 // Remove all HTML tags
176 self.html_regex.replace_all(&text, "").to_string()
177 };
178
179 // Create and add the snippet to our collection
180 snippets.push(FetchedTranscriptSnippet {
181 text,
182 start,
183 duration,
184 });
185 }
186
187 Ok(snippets)
188 }
189
190 /// Processes text to preserve only specific allowed HTML formatting tags.
191 ///
192 /// This method:
193 /// 1. Identifies all HTML tags in the text
194 /// 2. Keeps only the tags listed in `FORMATTING_TAGS`
195 /// 3. Removes all other HTML tags
196 ///
197 /// # Parameters
198 ///
199 /// * `text` - The text containing HTML tags to process
200 ///
201 /// # Returns
202 ///
203 /// A string with only the allowed formatting tags preserved and all others removed.
204 ///
205 /// # Example (internal usage)
206 ///
207 /// ```rust,no_run
208 /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
209 /// # let parser = TranscriptParser::new(true);
210 /// # let input = "<b>Bold</b> and <span>span</span> and <i>italic</i>";
211 /// // Only <b> and <i> tags would be preserved, <span> would be removed
212 /// let result = parser.process_with_formatting(input);
213 /// // Result would be "<b>Bold</b> and span and <i>italic</i>"
214 /// ```
215 pub fn process_with_formatting(&self, text: &str) -> String {
216 let mut result = text.to_string();
217
218 // First pass: collect all HTML tags
219 let tag_matches: Vec<(usize, usize, String)> = self
220 .html_regex
221 .find_iter(text)
222 .map(|m| {
223 let tag_content = &text[m.start()..m.end()];
224 (m.start(), m.end(), tag_content.to_string())
225 })
226 .collect();
227
228 // Second pass: only keep allowed formatting tags
229 let mut offset = 0;
230 for (start, end, tag) in tag_matches {
231 let adjusted_start = start - offset;
232 let adjusted_end = end - offset;
233
234 // Check if this tag should be preserved based on our allowed list
235 let keep_tag = Self::FORMATTING_TAGS.iter().any(|&allowed_tag| {
236 let open_tag = format!("<{}", allowed_tag);
237 let close_tag = format!("</{}", allowed_tag);
238 tag.starts_with(&open_tag) || tag.starts_with(&close_tag)
239 });
240
241 if !keep_tag {
242 // Remove tag that's not in the allowed list
243 result.replace_range(adjusted_start..adjusted_end, "");
244 offset += adjusted_end - adjusted_start;
245 }
246 }
247
248 result
249 }
250}