yt_transcript_rs/
transcript_parser.rs

1use crate::models::FetchedTranscriptSnippet;
2use anyhow::Result;
3use html_escape::decode_html_entities;
4use quick_xml::events::Event;
5use quick_xml::reader::Reader;
6use regex::Regex;
7use scraper::{Html, Selector};
8use serde::{Deserialize, Serialize};
9use std::io::Cursor;
10
11#[derive(Debug, Serialize, Deserialize)]
12struct Transcript {
13    #[serde(rename = "text")]
14    texts: Vec<Text>,
15}
16
17#[derive(Debug, Serialize, Deserialize)]
18struct Text {
19    #[serde(rename = "@start")]
20    start: String,
21
22    #[serde(rename = "@dur")]
23    duration: String,
24
25    // Text content of the element
26    #[serde(rename = "$text")]
27    content: String,
28}
29
30/// # TranscriptParser
31///
32/// Parses YouTube transcript XML data into structured transcript snippets.
33///
34/// This parser handles YouTube's XML format for transcripts and can:
35/// - Extract text content, timing information, and duration
36/// - Optionally preserve specified HTML formatting tags
37/// - Remove unwanted HTML tags
38///
39/// ## Usage Example
40///
41/// ```rust,no_run
42/// use yt_transcript_rs::transcript_parser::TranscriptParser;
43///
44/// // Create a parser that strips all formatting
45/// let parser = TranscriptParser::new(false);
46///
47/// // Or create a parser that preserves certain formatting tags (bold, italic, etc.)
48/// let formatting_parser = TranscriptParser::new(true);
49///
50/// // Parse XML transcript data
51/// let xml = r#"
52///     <transcript>
53///         <text start="0.0" dur="1.0">This is a transcript</text>
54///         <text start="1.0" dur="1.5">With multiple entries</text>
55///     </transcript>
56/// "#;
57///
58/// let snippets = parser.parse(xml).unwrap();
59/// ```
60/// Parser for YouTube transcript XML data
61#[derive(Debug)]
62pub struct TranscriptParser {
63    /// Whether to preserve specified formatting tags in the transcript
64    preserve_formatting: bool,
65    /// Regex pattern for matching HTML tags
66    html_regex: Regex,
67    /// Format for link processing (default is "{text} ({url})")
68    link_format: String,
69}
70
71impl TranscriptParser {
72    /// List of HTML formatting tags that can be preserved when `preserve_formatting` is enabled.
73    ///
74    /// These tags are commonly used for text formatting and can be preserved in the transcript:
75    /// - strong, b: Bold text
76    /// - em, i: Italic text
77    /// - mark: Highlighted text
78    /// - small: Smaller text
79    /// - del: Deleted/strikethrough text
80    /// - ins: Inserted/underlined text
81    /// - sub: Subscript
82    /// - sup: Superscript
83    /// - span: Generic inline container
84    /// - a: Hyperlink
85    const FORMATTING_TAGS: [&'static str; 12] = [
86        "strong", // important (bold)
87        "em",     // emphasized (italic)
88        "b",      // bold
89        "i",      // italic
90        "mark",   // highlighted
91        "small",  // smaller
92        "del",    // deleted/strikethrough
93        "ins",    // inserted/underlined
94        "sub",    // subscript
95        "sup",    // superscript
96        "span",   // generic inline container
97        "a",      // hyperlink
98    ];
99
100    /// Creates a new transcript parser with additional configuration options.
101    ///
102    /// # Parameters
103    ///
104    /// * `preserve_formatting` - If `true`, certain HTML formatting tags (like bold, italic) will be
105    ///   kept in the transcript. If `false`, all HTML tags will be removed.
106    /// * `link_format` - A format string for rendering links. Must contain `{text}` and `{url}` placeholders.
107    ///   For example, "{text} ({url})" will render as "Google (https://google.com)".
108    ///
109    /// # Returns
110    ///
111    /// A new `TranscriptParser` instance configured according to the preferences.
112    ///
113    /// # Example
114    ///
115    /// ```rust,no_run
116    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
117    /// # let result = TranscriptParser::with_config(false, "[{text}]({url})");
118    /// ```
119    pub fn with_config(
120        preserve_formatting: bool,
121        link_format: &str,
122    ) -> Result<Self, anyhow::Error> {
123        if !link_format.contains("{text}") || !link_format.contains("{url}") {
124            return Err(anyhow::anyhow!(
125                "Link format must contain {{text}} and {{url}} placeholders"
126            ));
127        }
128
129        let html_regex = Regex::new(r"<[^>]*>").unwrap();
130
131        Ok(Self {
132            preserve_formatting,
133            html_regex,
134            link_format: link_format.to_string(),
135        })
136    }
137
138    /// Creates a new transcript parser.
139    ///
140    /// # Parameters
141    ///
142    /// * `preserve_formatting` - If `true`, certain HTML formatting tags (like bold, italic) will be
143    ///   kept in the transcript. If `false`, all HTML tags will be removed.
144    ///
145    /// # Returns
146    ///
147    /// A new `TranscriptParser` instance configured according to the formatting preference.
148    ///
149    /// # Example
150    ///
151    /// ```rust,no_run
152    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
153    /// // Create a parser that removes all HTML tags
154    /// let plain_parser = TranscriptParser::new(false);
155    ///
156    /// // Create a parser that preserves formatting tags
157    /// let formatted_parser = TranscriptParser::new(true);
158    /// ```
159    pub fn new(preserve_formatting: bool) -> Self {
160        // Use a simple regex that matches all HTML tags - we'll handle the preservation logic separately
161        let html_regex = Regex::new(r"<[^>]*>").unwrap();
162
163        Self {
164            preserve_formatting,
165            html_regex,
166            link_format: "{text} ({url})".to_string(),
167        }
168    }
169
170    /// Parses YouTube transcript XML into a collection of transcript snippets.
171    ///
172    /// This method takes raw XML data from YouTube transcripts and processes it into
173    /// structured `FetchedTranscriptSnippet` objects that contain:
174    /// - Text content (with optional formatting)
175    /// - Start time in seconds
176    /// - Duration in seconds
177    ///
178    /// # Parameters
179    ///
180    /// * `raw_data` - The raw XML string containing transcript data from YouTube
181    ///
182    /// # Returns
183    ///
184    /// * `Result<Vec<FetchedTranscriptSnippet>, anyhow::Error>` - A vector of transcript snippets on success,
185    ///   or an error if parsing fails
186    ///
187    /// # Errors
188    ///
189    /// This function will return an error if:
190    /// - The XML data is malformed and cannot be parsed
191    /// - Required attributes are missing or invalid
192    ///
193    /// # Example
194    ///
195    /// ```rust,no_run
196    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
197    /// # let xml = "<transcript><text start=\"0.0\" dur=\"1.0\">Hello</text></transcript>";
198    /// let parser = TranscriptParser::new(false);
199    /// let snippets = parser.parse(xml).unwrap();
200    ///
201    /// for snippet in snippets {
202    ///     println!("[{:.1}-{:.1}s] {}",
203    ///         snippet.start,
204    ///         snippet.start + snippet.duration,
205    ///         snippet.text);
206    /// }
207    /// ```
208    pub fn parse(&self, raw_data: &str) -> Result<Vec<FetchedTranscriptSnippet>, anyhow::Error> {
209        let mut reader = Reader::from_reader(Cursor::new(raw_data));
210
211        // Don't trim text to preserve original spacing
212        reader.config_mut().trim_text(false);
213
214        let mut buf = Vec::new();
215
216        let mut snippets = Vec::new();
217        let mut in_text = false;
218        let mut start = String::new();
219        let mut duration = String::new();
220        let mut content = String::new();
221
222        loop {
223            match reader.read_event_into(&mut buf) {
224                Ok(Event::Start(e)) => {
225                    let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
226
227                    if tag_name == "text" {
228                        in_text = true;
229
230                        // Process attributes
231                        for attr in e.attributes().flatten() {
232                            let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
233                            let value = String::from_utf8_lossy(&attr.value).to_string();
234
235                            if key == "start" {
236                                start = value;
237                            } else if key == "dur" {
238                                duration = value;
239                            }
240                        }
241                    } else if in_text {
242                        // This is an HTML tag inside the text content
243                        // Reconstruct the full tag with attributes
244                        let mut tag_with_attrs = format!("<{}", tag_name);
245
246                        for attr in e.attributes().flatten() {
247                            let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
248                            let value = String::from_utf8_lossy(&attr.value).to_string();
249                            tag_with_attrs.push_str(&format!(" {}=\"{}\"", key, value));
250                        }
251
252                        tag_with_attrs.push('>');
253                        content.push_str(&tag_with_attrs);
254                    }
255                }
256                Ok(Event::Text(e)) => {
257                    if in_text {
258                        // Handle XML entities by using unescape
259                        match e.unescape() {
260                            Ok(text) => content.push_str(&text),
261                            Err(_) => content.push_str(&String::from_utf8_lossy(e.as_ref())),
262                        }
263                    }
264                }
265                Ok(Event::CData(e)) => {
266                    if in_text {
267                        content.push_str(&String::from_utf8_lossy(e.as_ref()));
268                    }
269                }
270                Ok(Event::End(e)) => {
271                    let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
272
273                    if tag_name == "text" {
274                        in_text = false;
275
276                        // Process content based on formatting preferences
277                        let processed_text = if self.preserve_formatting {
278                            // When preserving formatting, keep HTML tags based on allowed list
279                            self.process_with_formatting(&content)
280                        } else {
281                            // When removing formatting, use our entity-preserving HTML processor
282                            self.html_to_plain_text(&content)
283                        };
284
285                        // Create and add the snippet
286                        snippets.push(FetchedTranscriptSnippet {
287                            text: processed_text,
288                            start: start.parse::<f64>().unwrap_or(0.0),
289                            duration: duration.parse::<f64>().unwrap_or(0.0),
290                        });
291
292                        // Clear for next item
293                        start.clear();
294                        duration.clear();
295                        content.clear();
296                    } else if in_text {
297                        // This is a closing HTML tag inside the text content
298                        content.push_str(&format!("</{}>", tag_name));
299                    }
300                }
301                Ok(Event::Eof) => break,
302                Err(e) => {
303                    return Err(anyhow::anyhow!(
304                        "Error at position {}: {:?}",
305                        reader.buffer_position(),
306                        e
307                    ));
308                }
309                _ => (),
310            }
311            buf.clear();
312        }
313
314        Ok(snippets)
315    }
316
317    /// Converts HTML to plain text while properly handling entities and spacing.
318    /// This implementation uses the scraper library for robust HTML parsing.
319    fn html_to_plain_text(&self, html: &str) -> String {
320        // Create a mutable copy of the HTML string
321        let mut html_string = html.to_string();
322
323        // Parse the HTML fragment
324        let fragment = Html::parse_fragment(&html_string);
325
326        // Create the link selector
327        let link_selector = Selector::parse("a").unwrap();
328
329        // Extract links and replace them in the text
330        for link in fragment.select(&link_selector) {
331            if let Some(href) = link.value().attr("href") {
332                let link_text = link.text().collect::<String>().trim().to_string();
333
334                // Only process non-empty links
335                if !link_text.is_empty() && !href.is_empty() {
336                    // Format link according to configured format
337                    let link_html = link.html();
338                    let formatted = self
339                        .link_format
340                        .replace("{text}", &link_text)
341                        .replace("{url}", href);
342                    html_string = html_string.replace(&link_html, &formatted);
343                }
344            }
345        }
346
347        // Re-parse with replaced links
348        let fragment = Html::parse_fragment(&html_string);
349        let text_content = fragment.root_element().text().collect::<Vec<_>>().join("");
350
351        // Decode HTML entities
352        let decoded = decode_html_entities(&text_content).to_string();
353
354        // Clean up multiple spaces
355        let space_regex = Regex::new(r"\s{2,}").unwrap();
356        let clean_result = space_regex.replace_all(&decoded, " ");
357
358        // Final trimming
359        clean_result.trim().to_string()
360    }
361
362    /// Processes text to preserve only specific allowed HTML formatting tags.
363    ///
364    /// This method:
365    /// 1. Identifies all HTML tags in the text
366    /// 2. Keeps only the tags listed in `FORMATTING_TAGS`
367    /// 3. Removes all other HTML tags
368    ///
369    /// # Parameters
370    ///
371    /// * `text` - The text containing HTML tags to process
372    ///
373    /// # Returns
374    ///
375    /// A string with only the allowed formatting tags preserved and all others removed.
376    ///
377    /// # Example (internal usage)
378    ///
379    /// ```rust,no_run
380    /// # use yt_transcript_rs::transcript_parser::TranscriptParser;
381    /// # let parser = TranscriptParser::new(true);
382    /// # let input = "<b>Bold</b> and <span>span</span> and <i>italic</i>";
383    /// // Only <b> and <i> tags would be preserved, <span> would be removed
384    /// let result = parser.process_with_formatting(input);
385    /// // Result would be "<b>Bold</b> and span and <i>italic</i>"
386    /// ```
387    pub fn process_with_formatting(&self, text: &str) -> String {
388        let mut result = text.to_string();
389
390        // First pass: collect all HTML tags
391        let tag_matches: Vec<(usize, usize, String)> = self
392            .html_regex
393            .find_iter(text)
394            .map(|m| {
395                let tag_content = &text[m.start()..m.end()];
396                (m.start(), m.end(), tag_content.to_string())
397            })
398            .collect();
399
400        // Second pass: only keep allowed formatting tags
401        let mut offset = 0;
402        for (start, end, tag) in tag_matches {
403            let adjusted_start = start - offset;
404            let adjusted_end = end - offset;
405
406            // Extract the tag name without attributes for comparison
407            let tag_name = if let Some(space_pos) = tag.find(|c: char| c.is_whitespace()) {
408                // Handle tags with attributes: <tag attr="value">
409                let closing_bracket = tag.find('>').unwrap_or(tag.len());
410                let name_end = space_pos.min(closing_bracket);
411                if tag.starts_with("</") {
412                    // Closing tag
413                    tag[2..name_end].to_string()
414                } else {
415                    // Opening tag
416                    tag[1..name_end].to_string()
417                }
418            } else {
419                // Handle simple tags without attributes: <tag> or </tag>
420                if tag.starts_with("</") {
421                    // Closing tag without attributes
422                    let end_pos = tag.find('>').unwrap_or(tag.len());
423                    tag[2..end_pos].to_string()
424                } else {
425                    // Opening tag without attributes
426                    let end_pos = tag.find('>').unwrap_or(tag.len());
427                    tag[1..end_pos].to_string()
428                }
429            };
430
431            // Check if this tag should be preserved based on our allowed list
432            let keep_tag = Self::FORMATTING_TAGS.contains(&tag_name.as_str());
433
434            if !keep_tag {
435                // Remove tag that's not in the allowed list
436                result.replace_range(adjusted_start..adjusted_end, "");
437                offset += adjusted_end - adjusted_start;
438            }
439        }
440
441        result
442    }
443}
444
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain `<text>` entries yield one snippet each with parsed timing values.
    #[test]
    fn test_parse_basic_transcript() {
        let parser = TranscriptParser::new(false);

        let xml = r#"
        <transcript>
            <text start="0.0" dur="1.0">This is a transcript</text>
            <text start="1.0" dur="1.5">With multiple entries</text>
        </transcript>
        "#;

        let snippets = parser.parse(xml).unwrap();
        assert_eq!(snippets.len(), 2);
        assert_eq!(snippets[0].text, "This is a transcript");
        assert_eq!(snippets[0].start, 0.0);
        assert_eq!(snippets[0].duration, 1.0);
        assert_eq!(snippets[1].text, "With multiple entries");
        assert_eq!(snippets[1].start, 1.0);
        assert_eq!(snippets[1].duration, 1.5);
    }

    /// Inline formatting tags are kept or stripped depending on the parser mode.
    #[test]
    fn test_parse_with_html_formatting() {
        let xml_content = r#"<?xml version="1.0" encoding="utf-8" ?>
        <transcript>
            <text start="12.645" dur="1.37">So in <b>college</b>,</text>
            <text start="15.349" dur="1.564">I was a <i>government</i> major,</text>
            <text start="16.937" dur="2.462">which means <b>I had to write</b> <i>a lot</i> of <b>papers</b>.</text>
        </transcript>"#;

        // Formatting preserved
        let parser_with_formatting = TranscriptParser::new(true);
        let formatted_snippets = parser_with_formatting.parse(xml_content).unwrap();

        assert_eq!(formatted_snippets.len(), 3);
        assert_eq!(formatted_snippets[0].text, "So in <b>college</b>,");
        assert_eq!(
            formatted_snippets[1].text,
            "I was a <i>government</i> major,"
        );
        assert_eq!(
            formatted_snippets[2].text,
            "which means <b>I had to write</b> <i>a lot</i> of <b>papers</b>."
        );

        // Formatting removed
        let plain_parser = TranscriptParser::new(false);
        let plain_snippets = plain_parser.parse(xml_content).unwrap();

        assert_eq!(plain_snippets.len(), 3);
        assert_eq!(plain_snippets[0].text, "So in college,");
        assert_eq!(plain_snippets[1].text, "I was a government major,");
        assert_eq!(
            plain_snippets[2].text,
            "which means I had to write a lot of papers."
        );
    }

    /// Attributes on inline tags survive in formatting mode; links are rendered with
    /// the default link format in plain mode.
    #[test]
    fn test_parse_with_html_attributes() {
        let xml_with_attributes = r#"<?xml version="1.0" encoding="utf-8" ?>
        <transcript>
            <text start="10.0" dur="2.0">This has a <span class="highlight" style="color:red">colored span</span> with attributes.</text>
            <text start="12.5" dur="3.0">And a <a href="https://example.com" target="_blank">link</a> with multiple attributes.</text>
            <text start="16.0" dur="2.5">And <b id="bold1" data-test="value">bold with attributes</b> should work too.</text>
        </transcript>"#;

        // Formatting preserved
        let parser_with_attributes = TranscriptParser::new(true);
        let formatted_with_attributes = parser_with_attributes.parse(xml_with_attributes).unwrap();

        assert_eq!(formatted_with_attributes.len(), 3);
        assert_eq!(
            formatted_with_attributes[0].text,
            "This has a <span class=\"highlight\" style=\"color:red\">colored span</span> with attributes."
        );
        assert_eq!(
            formatted_with_attributes[1].text,
            "And a <a href=\"https://example.com\" target=\"_blank\">link</a> with multiple attributes."
        );
        assert_eq!(
            formatted_with_attributes[2].text,
            "And <b id=\"bold1\" data-test=\"value\">bold with attributes</b> should work too."
        );

        // Formatting removed
        let plain_parser = TranscriptParser::new(false);
        let plain_with_attributes = plain_parser.parse(xml_with_attributes).unwrap();

        assert_eq!(plain_with_attributes.len(), 3);
        assert_eq!(
            plain_with_attributes[0].text,
            "This has a colored span with attributes."
        );
        assert_eq!(
            plain_with_attributes[1].text,
            "And a link (https://example.com) with multiple attributes."
        );
        assert_eq!(
            plain_with_attributes[2].text,
            "And bold with attributes should work too."
        );
    }

    /// Empty documents, empty entries and self-closing inline tags are handled gracefully.
    #[test]
    fn test_edge_cases() {
        let parser = TranscriptParser::new(true);

        // Empty transcript
        let empty_xml = "<transcript></transcript>";
        let empty_result = parser.parse(empty_xml).unwrap();
        assert_eq!(empty_result.len(), 0);

        // Transcript with an empty text element
        let empty_text_xml = "<transcript><text start=\"0.0\" dur=\"1.0\"></text></transcript>";
        let empty_text_result = parser.parse(empty_text_xml).unwrap();
        assert_eq!(empty_text_result.len(), 1);
        assert_eq!(empty_text_result[0].text, "");

        // Self-closing tags (which YouTube doesn't use, but good to test)
        let self_closing_xml =
            "<transcript><text start=\"0.0\" dur=\"1.0\">This has a <br/> tag</text></transcript>";
        let self_closing_result = parser.parse(self_closing_xml).unwrap();
        assert_eq!(self_closing_result.len(), 1);

        // The space before and after <br/> may vary
        let text = self_closing_result[0].text.clone();
        assert!(
            text.contains("This has a") && text.contains("tag"),
            "Actual: {}",
            text
        );

        // Same input in plain mode; spacing around the removed tag may vary here too
        let plain_parser = TranscriptParser::new(false);
        let plain_result = plain_parser.parse(self_closing_xml).unwrap();

        assert!(
            plain_result[0].text.contains("This has a") && plain_result[0].text.contains("tag"),
            "Actual: {}",
            plain_result[0].text
        );
    }

    /// The examples shown in the rustdoc comments behave as documented.
    #[test]
    fn test_doc_examples() {
        // Example from the TranscriptParser struct documentation
        let xml = r#"
        <transcript>
            <text start="0.0" dur="1.0">This is a transcript</text>
            <text start="1.0" dur="1.5">With multiple entries</text>
        </transcript>
        "#;

        let parser = TranscriptParser::new(false);
        let snippets = parser.parse(xml).unwrap();
        assert_eq!(snippets.len(), 2);

        // Example from the parse method documentation
        let simple_xml = "<transcript><text start=\"0.0\" dur=\"1.0\">Hello</text></transcript>";
        let simple_parser = TranscriptParser::new(false);
        let simple_snippets = simple_parser.parse(simple_xml).unwrap();
        assert_eq!(simple_snippets.len(), 1);
        assert_eq!(simple_snippets[0].text, "Hello");
        assert_eq!(simple_snippets[0].start, 0.0);
        assert_eq!(simple_snippets[0].duration, 1.0);
    }

    /// Durations of all snippets add up to the expected total.
    #[test]
    fn test_total_duration_calculation() {
        let xml_content = r#"<?xml version="1.0" encoding="utf-8" ?>
        <transcript>
            <text start="12.645" dur="1.37">So in <b>college</b>,</text>
            <text start="15.349" dur="1.564">I was a <i>government</i> major,</text>
            <text start="16.937" dur="2.462">which means <b>I had to write</b> <i>a lot</i> of <b>papers</b>.</text>
        </transcript>"#;

        let parser = TranscriptParser::new(true);
        let snippets = parser.parse(xml_content).unwrap();

        let total_duration: f64 = snippets.iter().map(|snippet| snippet.duration).sum();

        // Use approximate comparison for floating point values (within 0.001)
        assert!(
            (total_duration - 5.396).abs() < 0.001,
            "Total duration {} should be approximately 5.396 seconds",
            total_duration
        );
    }

    /// A leading XML declaration does not disturb parsing.
    #[test]
    fn test_parse_xml_with_version_declaration() {
        let xml_with_declaration = r#"<?xml version="1.0" encoding="utf-8" ?>
        <transcript>
            <text start="1.0" dur="2.0">Text with XML declaration</text>
        </transcript>"#;

        let parser = TranscriptParser::new(false);
        let snippets = parser.parse(xml_with_declaration).unwrap();

        assert_eq!(snippets.len(), 1);
        assert_eq!(snippets[0].text, "Text with XML declaration");
        assert_eq!(snippets[0].start, 1.0);
        assert_eq!(snippets[0].duration, 2.0);
    }

    /// XML and HTML entities are decoded in both parser modes.
    #[test]
    fn test_parse_with_xml_entities() {
        let xml_with_entities = r#"<?xml version="1.0" encoding="utf-8" ?>
        <transcript>
            <text start="1.0" dur="2.0">I couldn&#39;t quite do stuff.</text>
            <text start="3.0" dur="2.5">Let&#39;s try &amp; test some entities.</text>
            <text start="5.5" dur="3.0">Special characters: &lt;tag&gt; and &quot;quotes&quot;</text>
            <text start="8.5" dur="2.0">French accents: caf&eacute; &agrave; la cr&egrave;me</text>
            <text start="10.5" dur="1.5">Euro symbol: &euro; and degree: &deg;C</text>
        </transcript>"#;

        // Plain text mode (formatting removed)
        let plain_parser = TranscriptParser::new(false);
        let plain_snippets = plain_parser.parse(xml_with_entities).unwrap();

        assert_eq!(plain_snippets.len(), 5);

        // Entities are resolved by the XML reader and the HTML-to-text step.
        // An escaped "<tag>" turns into a real tag after unescaping and is stripped.
        assert_eq!(plain_snippets[0].text, "I couldn't quite do stuff.");
        assert_eq!(plain_snippets[1].text, "Let's try & test some entities.");
        assert_eq!(plain_snippets[2].text, "Special characters: and \"quotes\"");
        assert_eq!(plain_snippets[3].text, "French accents: café à la crème");
        assert_eq!(plain_snippets[4].text, "Euro symbol: € and degree: °C");

        // Formatting preserved
        let formatting_parser = TranscriptParser::new(true);
        let formatted_snippets = formatting_parser.parse(xml_with_entities).unwrap();

        assert_eq!(formatted_snippets.len(), 5);

        // In formatting mode, we still preserve structure but entities are decoded
        assert_eq!(formatted_snippets[0].text, "I couldn't quite do stuff.");
        assert_eq!(
            formatted_snippets[1].text,
            "Let's try & test some entities."
        );
        assert_eq!(
            formatted_snippets[2].text,
            "Special characters:  and \"quotes\""
        );
    }

    /// Allowed tags are kept (with attributes); everything else is stripped.
    #[test]
    fn test_process_with_formatting() {
        let parser = TranscriptParser::new(true);

        // Basic formatting: every tag here is on the allowed list
        let input = "<b>Bold</b> and <span>span</span> and <i>italic</i>";
        let result = parser.process_with_formatting(input);
        assert_eq!(
            result,
            "<b>Bold</b> and <span>span</span> and <i>italic</i>"
        );

        // Unwanted tags are removed, their inner text stays
        let input2 = "This has <div>unwanted</div> tags but <b>keeps</b> the <i>allowed</i> ones.";
        let result2 = parser.process_with_formatting(input2);
        assert_eq!(
            result2,
            "This has unwanted tags but <b>keeps</b> the <i>allowed</i> ones."
        );

        // Attributes on allowed tags are preserved
        let input3 =
            "<b id=\"test\">Bold with ID</b> and <i style=\"color:red\">Colored italic</i>";
        let result3 = parser.process_with_formatting(input3);
        assert_eq!(
            result3,
            "<b id=\"test\">Bold with ID</b> and <i style=\"color:red\">Colored italic</i>"
        );
    }

    /// `with_config` accepts only link formats that contain both placeholders.
    #[test]
    fn test_with_config_requires_both_placeholders() {
        assert!(TranscriptParser::with_config(false, "{text}").is_err());
        assert!(TranscriptParser::with_config(false, "{url}").is_err());
        assert!(TranscriptParser::with_config(false, "").is_err());
        assert!(TranscriptParser::with_config(true, "[{text}]({url})").is_ok());
    }

    /// A custom link format is applied when links are rendered as plain text.
    #[test]
    fn test_with_config_custom_link_format() {
        let xml = r#"<?xml version="1.0" encoding="utf-8" ?>
        <transcript>
            <text start="12.5" dur="3.0">And a <a href="https://example.com" target="_blank">link</a> with multiple attributes.</text>
        </transcript>"#;

        let parser = TranscriptParser::with_config(false, "[{text}]({url})").unwrap();
        let snippets = parser.parse(xml).unwrap();

        assert_eq!(snippets.len(), 1);
        assert_eq!(
            snippets[0].text,
            "And a [link](https://example.com) with multiple attributes."
        );
    }
}
yt_transcript_rs/transcript_parser.rs

yt_transcript_rs/
transcript_parser.rs