1use crate::models::FetchedTranscriptSnippet;
2use anyhow::Result;
3use html_escape::decode_html_entities;
4use quick_xml::events::Event;
5use quick_xml::reader::Reader;
6use regex::Regex;
7use scraper::{Html, Selector};
8use serde::{Deserialize, Serialize};
9use std::io::Cursor;
10
/// Serde model of a whole transcript document: the list of its `text` entries.
///
/// NOTE(review): in the part of the file reviewed, nothing constructs or deserialises this
/// type (`TranscriptParser::parse` walks XML events directly) — confirm whether it is still
/// needed.
#[derive(Debug, Serialize, Deserialize)]
struct Transcript {
    /// Every entry of the transcript, (de)serialised under the name `text`.
    #[serde(rename = "text")]
    texts: Vec<Text>,
}
16
/// Serde model of a single transcript entry.
///
/// All three values are kept as raw strings; no numeric parsing happens here.
/// The `@…` / `$text` names presumably follow quick-xml's serde convention (attribute vs.
/// element text) — TODO confirm against the quick-xml serde documentation.
#[derive(Debug, Serialize, Deserialize)]
struct Text {
    /// Value (de)serialised as `@start`.
    #[serde(rename = "@start")]
    start: String,

    /// Value (de)serialised as `@dur`.
    #[serde(rename = "@dur")]
    duration: String,

    /// Value (de)serialised as `$text`.
    #[serde(rename = "$text")]
    content: String,
}
29
/// Parser that turns raw transcript XML into a list of `FetchedTranscriptSnippet`s.
///
/// Build one with `TranscriptParser::new` or `TranscriptParser::with_config`.
#[derive(Debug)]
pub struct TranscriptParser {
    /// When `true`, `parse` keeps whitelisted inline tags in the snippet text;
    /// when `false`, snippets are reduced to plain text.
    preserve_formatting: bool,
    /// Compiled pattern matching a single markup tag (`<[^>]*>`).
    html_regex: Regex,
    /// Template used to render links in plain-text mode; it contains the `{text}` and
    /// `{url}` placeholders.
    link_format: String,
}
70
71impl TranscriptParser {
72 const FORMATTING_TAGS: [&'static str; 12] = [
86 "strong", "em", "b", "i", "mark", "small", "del", "ins", "sub", "sup", "span", "a", ];
99
100 pub fn with_config(
120 preserve_formatting: bool,
121 link_format: &str,
122 ) -> Result<Self, anyhow::Error> {
123 if !link_format.contains("{text}") || !link_format.contains("{url}") {
124 return Err(anyhow::anyhow!(
125 "Link format must contain {{text}} and {{url}} placeholders"
126 ));
127 }
128
129 let html_regex = Regex::new(r"<[^>]*>").unwrap();
130
131 Ok(Self {
132 preserve_formatting,
133 html_regex,
134 link_format: link_format.to_string(),
135 })
136 }
137
138 pub fn new(preserve_formatting: bool) -> Self {
160 let html_regex = Regex::new(r"<[^>]*>").unwrap();
162
163 Self {
164 preserve_formatting,
165 html_regex,
166 link_format: "{text} ({url})".to_string(),
167 }
168 }
169
170 pub fn parse(&self, raw_data: &str) -> Result<Vec<FetchedTranscriptSnippet>, anyhow::Error> {
209 let mut reader = Reader::from_reader(Cursor::new(raw_data));
210
211 reader.config_mut().trim_text(false);
213
214 let mut buf = Vec::new();
215
216 let mut snippets = Vec::new();
217 let mut in_text = false;
218 let mut start = String::new();
219 let mut duration = String::new();
220 let mut content = String::new();
221
222 loop {
223 match reader.read_event_into(&mut buf) {
224 Ok(Event::Start(e)) => {
225 let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
226
227 if tag_name == "text" {
228 in_text = true;
229
230 for attr in e.attributes().flatten() {
232 let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
233 let value = String::from_utf8_lossy(&attr.value).to_string();
234
235 if key == "start" {
236 start = value;
237 } else if key == "dur" {
238 duration = value;
239 }
240 }
241 } else if in_text {
242 let mut tag_with_attrs = format!("<{}", tag_name);
245
246 for attr in e.attributes().flatten() {
247 let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
248 let value = String::from_utf8_lossy(&attr.value).to_string();
249 tag_with_attrs.push_str(&format!(" {}=\"{}\"", key, value));
250 }
251
252 tag_with_attrs.push('>');
253 content.push_str(&tag_with_attrs);
254 }
255 }
256 Ok(Event::Text(e)) => {
257 if in_text {
258 match e.unescape() {
260 Ok(text) => content.push_str(&text),
261 Err(_) => content.push_str(&String::from_utf8_lossy(e.as_ref())),
262 }
263 }
264 }
265 Ok(Event::CData(e)) => {
266 if in_text {
267 content.push_str(&String::from_utf8_lossy(e.as_ref()));
268 }
269 }
270 Ok(Event::End(e)) => {
271 let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
272
273 if tag_name == "text" {
274 in_text = false;
275
276 let processed_text = if self.preserve_formatting {
278 self.process_with_formatting(&content)
280 } else {
281 self.html_to_plain_text(&content)
283 };
284
285 snippets.push(FetchedTranscriptSnippet {
287 text: processed_text,
288 start: start.parse::<f64>().unwrap_or(0.0),
289 duration: duration.parse::<f64>().unwrap_or(0.0),
290 });
291
292 start.clear();
294 duration.clear();
295 content.clear();
296 } else if in_text {
297 content.push_str(&format!("</{}>", tag_name));
299 }
300 }
301 Ok(Event::Eof) => break,
302 Err(e) => {
303 return Err(anyhow::anyhow!(
304 "Error at position {}: {:?}",
305 reader.buffer_position(),
306 e
307 ));
308 }
309 _ => (),
310 }
311 buf.clear();
312 }
313
314 Ok(snippets)
315 }
316
317 fn html_to_plain_text(&self, html: &str) -> String {
320 let mut html_string = html.to_string();
322
323 let fragment = Html::parse_fragment(&html_string);
325
326 let link_selector = Selector::parse("a").unwrap();
328
329 for link in fragment.select(&link_selector) {
331 if let Some(href) = link.value().attr("href") {
332 let link_text = link.text().collect::<String>().trim().to_string();
333
334 if !link_text.is_empty() && !href.is_empty() {
336 let link_html = link.html();
338 let formatted = self
339 .link_format
340 .replace("{text}", &link_text)
341 .replace("{url}", href);
342 html_string = html_string.replace(&link_html, &formatted);
343 }
344 }
345 }
346
347 let fragment = Html::parse_fragment(&html_string);
349 let text_content = fragment.root_element().text().collect::<Vec<_>>().join("");
350
351 let decoded = decode_html_entities(&text_content).to_string();
353
354 let space_regex = Regex::new(r"\s{2,}").unwrap();
356 let clean_result = space_regex.replace_all(&decoded, " ");
357
358 clean_result.trim().to_string()
360 }
361
362 pub fn process_with_formatting(&self, text: &str) -> String {
388 let mut result = text.to_string();
389
390 let tag_matches: Vec<(usize, usize, String)> = self
392 .html_regex
393 .find_iter(text)
394 .map(|m| {
395 let tag_content = &text[m.start()..m.end()];
396 (m.start(), m.end(), tag_content.to_string())
397 })
398 .collect();
399
400 let mut offset = 0;
402 for (start, end, tag) in tag_matches {
403 let adjusted_start = start - offset;
404 let adjusted_end = end - offset;
405
406 let tag_name = if let Some(space_pos) = tag.find(|c: char| c.is_whitespace()) {
408 let closing_bracket = tag.find('>').unwrap_or(tag.len());
410 let name_end = space_pos.min(closing_bracket);
411 if tag.starts_with("</") {
412 tag[2..name_end].to_string()
414 } else {
415 tag[1..name_end].to_string()
417 }
418 } else {
419 if tag.starts_with("</") {
421 let end_pos = tag.find('>').unwrap_or(tag.len());
423 tag[2..end_pos].to_string()
424 } else {
425 let end_pos = tag.find('>').unwrap_or(tag.len());
427 tag[1..end_pos].to_string()
428 }
429 };
430
431 let keep_tag = Self::FORMATTING_TAGS.contains(&tag_name.as_str());
433
434 if !keep_tag {
435 result.replace_range(adjusted_start..adjusted_end, "");
437 offset += adjusted_end - adjusted_start;
438 }
439 }
440
441 result
442 }
443}
444
#[cfg(test)]
mod tests {
    use super::*;

    // Two plain entries: text, start and duration are read for each.
    #[test]
    fn test_parse_basic_transcript() {
        let parser = TranscriptParser::new(false);

        let xml = r#"
 <transcript>
 <text start="0.0" dur="1.0">This is a transcript</text>
 <text start="1.0" dur="1.5">With multiple entries</text>
 </transcript>
 "#;

        let snippets = parser.parse(xml).unwrap();
        assert_eq!(snippets.len(), 2);
        assert_eq!(snippets[0].text, "This is a transcript");
        assert_eq!(snippets[0].start, 0.0);
        assert_eq!(snippets[0].duration, 1.0);
        assert_eq!(snippets[1].text, "With multiple entries");
        assert_eq!(snippets[1].start, 1.0);
        assert_eq!(snippets[1].duration, 1.5);
    }

    // Inline tags are kept in formatting mode and stripped in plain mode.
    #[test]
    fn test_parse_with_html_formatting() {
        let xml_content = r#"<?xml version="1.0" encoding="utf-8" ?>
 <transcript>
 <text start="12.645" dur="1.37">So in <b>college</b>,</text>
 <text start="15.349" dur="1.564">I was a <i>government</i> major,</text>
 <text start="16.937" dur="2.462">which means <b>I had to write</b> <i>a lot</i> of <b>papers</b>.</text>
 </transcript>"#;

        let parser_with_formatting = TranscriptParser::new(true);
        let formatted_snippets = parser_with_formatting.parse(xml_content).unwrap();

        assert_eq!(formatted_snippets.len(), 3);
        println!("Formatted 0: '{}'", formatted_snippets[0].text);
        println!("Formatted 1: '{}'", formatted_snippets[1].text);
        println!("Formatted 2: '{}'", formatted_snippets[2].text);

        assert_eq!(formatted_snippets[0].text, "So in <b>college</b>,");
        assert_eq!(
            formatted_snippets[1].text,
            "I was a <i>government</i> major,"
        );
        assert_eq!(
            formatted_snippets[2].text,
            "which means <b>I had to write</b> <i>a lot</i> of <b>papers</b>."
        );

        let plain_parser = TranscriptParser::new(false);
        let plain_snippets = plain_parser.parse(xml_content).unwrap();

        assert_eq!(plain_snippets.len(), 3);
        println!("Plain 0: '{}'", plain_snippets[0].text);
        println!("Plain 1: '{}'", plain_snippets[1].text);
        println!("Plain 2: '{}'", plain_snippets[2].text);

        assert_eq!(plain_snippets[0].text, "So in college,");
        assert_eq!(plain_snippets[1].text, "I was a government major,");
        assert_eq!(
            plain_snippets[2].text,
            "which means I had to write a lot of papers."
        );
    }

    // Attributes on kept tags survive formatting mode; plain mode renders links with
    // the default `{text} ({url})` template.
    #[test]
    fn test_parse_with_html_attributes() {
        let xml_with_attributes = r#"<?xml version="1.0" encoding="utf-8" ?>
 <transcript>
 <text start="10.0" dur="2.0">This has a <span class="highlight" style="color:red">colored span</span> with attributes.</text>
 <text start="12.5" dur="3.0">And a <a href="https://example.com" target="_blank">link</a> with multiple attributes.</text>
 <text start="16.0" dur="2.5">And <b id="bold1" data-test="value">bold with attributes</b> should work too.</text>
 </transcript>"#;

        let parser_with_attributes = TranscriptParser::new(true);
        let formatted_with_attributes = parser_with_attributes.parse(xml_with_attributes).unwrap();

        assert_eq!(formatted_with_attributes.len(), 3);
        println!(
            "Formatted with attributes 0: '{}'",
            formatted_with_attributes[0].text
        );
        println!(
            "Formatted with attributes 1: '{}'",
            formatted_with_attributes[1].text
        );
        println!(
            "Formatted with attributes 2: '{}'",
            formatted_with_attributes[2].text
        );

        assert_eq!(
            formatted_with_attributes[0].text,
            "This has a <span class=\"highlight\" style=\"color:red\">colored span</span> with attributes."
        );
        assert_eq!(
            formatted_with_attributes[1].text,
            "And a <a href=\"https://example.com\" target=\"_blank\">link</a> with multiple attributes."
        );
        assert_eq!(
            formatted_with_attributes[2].text,
            "And <b id=\"bold1\" data-test=\"value\">bold with attributes</b> should work too."
        );

        let plain_parser = TranscriptParser::new(false);
        let plain_with_attributes = plain_parser.parse(xml_with_attributes).unwrap();

        assert_eq!(plain_with_attributes.len(), 3);
        println!(
            "Plain with attributes 0: '{}'",
            plain_with_attributes[0].text
        );
        println!(
            "Plain with attributes 1: '{}'",
            plain_with_attributes[1].text
        );
        println!(
            "Plain with attributes 2: '{}'",
            plain_with_attributes[2].text
        );

        assert_eq!(
            plain_with_attributes[0].text,
            "This has a colored span with attributes."
        );
        assert_eq!(
            plain_with_attributes[1].text,
            "And a link (https://example.com) with multiple attributes."
        );
        assert_eq!(
            plain_with_attributes[2].text,
            "And bold with attributes should work too."
        );
    }

    // Empty transcript, empty entry, and a self-closing tag inside an entry.
    #[test]
    fn test_edge_cases() {
        let parser = TranscriptParser::new(true);

        let empty_xml = "<transcript></transcript>";
        let empty_result = parser.parse(empty_xml).unwrap();
        assert_eq!(empty_result.len(), 0);

        let empty_text_xml = "<transcript><text start=\"0.0\" dur=\"1.0\"></text></transcript>";
        let empty_text_result = parser.parse(empty_text_xml).unwrap();
        assert_eq!(empty_text_result.len(), 1);
        assert_eq!(empty_text_result[0].text, "");

        let self_closing_xml =
            "<transcript><text start=\"0.0\" dur=\"1.0\">This has a <br/> tag</text></transcript>";
        let self_closing_result = parser.parse(self_closing_xml).unwrap();
        assert_eq!(self_closing_result.len(), 1);

        println!("Self-closing formatted: '{}'", self_closing_result[0].text);

        let text = self_closing_result[0].text.clone();
        assert!(
            text.contains("This has a") && text.contains("tag"),
            "Actual: {}",
            text
        );

        let plain_parser = TranscriptParser::new(false);
        let plain_result = plain_parser.parse(self_closing_xml).unwrap();

        println!("Self-closing plain: '{}'", plain_result[0].text);

        assert!(
            plain_result[0].text.contains("This has a") && plain_result[0].text.contains("tag"),
            "Actual: {}",
            plain_result[0].text
        );
    }

    // Mirrors the usage shown in the documentation examples.
    #[test]
    fn test_doc_examples() {
        let xml = r#"
 <transcript>
 <text start="0.0" dur="1.0">This is a transcript</text>
 <text start="1.0" dur="1.5">With multiple entries</text>
 </transcript>
 "#;

        let parser = TranscriptParser::new(false);
        let snippets = parser.parse(xml).unwrap();
        assert_eq!(snippets.len(), 2);

        let simple_xml = "<transcript><text start=\"0.0\" dur=\"1.0\">Hello</text></transcript>";
        let simple_parser = TranscriptParser::new(false);
        let simple_snippets = simple_parser.parse(simple_xml).unwrap();
        assert_eq!(simple_snippets.len(), 1);
        assert_eq!(simple_snippets[0].text, "Hello");
        assert_eq!(simple_snippets[0].start, 0.0);
        assert_eq!(simple_snippets[0].duration, 1.0);
    }

    // Durations are parsed as floats and can be summed across snippets.
    #[test]
    fn test_total_duration_calculation() {
        let xml_content = r#"<?xml version="1.0" encoding="utf-8" ?>
 <transcript>
 <text start="12.645" dur="1.37">So in <b>college</b>,</text>
 <text start="15.349" dur="1.564">I was a <i>government</i> major,</text>
 <text start="16.937" dur="2.462">which means <b>I had to write</b> <i>a lot</i> of <b>papers</b>.</text>
 </transcript>"#;

        let parser = TranscriptParser::new(true);
        let snippets = parser.parse(xml_content).unwrap();

        let total_duration: f64 = snippets.iter().map(|snippet| snippet.duration).sum();

        assert!(
            (total_duration - 5.396).abs() < 0.001,
            "Total duration {} should be approximately 5.396 seconds",
            total_duration
        );
    }

    // A leading XML declaration does not disturb parsing.
    #[test]
    fn test_parse_xml_with_version_declaration() {
        let xml_with_declaration = r#"<?xml version="1.0" encoding="utf-8" ?>
 <transcript>
 <text start="1.0" dur="2.0">Text with XML declaration</text>
 </transcript>"#;

        let parser = TranscriptParser::new(false);
        let snippets = parser.parse(xml_with_declaration).unwrap();

        assert_eq!(snippets.len(), 1);
        assert_eq!(snippets[0].text, "Text with XML declaration");
        assert_eq!(snippets[0].start, 1.0);
        assert_eq!(snippets[0].duration, 2.0);
    }

    // Escaped entities are decoded; an escaped `<tag>` becomes markup after decoding and
    // is therefore stripped in both modes.
    #[test]
    fn test_parse_with_xml_entities() {
        // `&` and `<tag>` must be escaped: written literally, the document is not
        // well-formed (the `<tag>` element is never closed) and `parse` returns an error.
        let xml_with_entities = r#"<?xml version="1.0" encoding="utf-8" ?>
 <transcript>
 <text start="1.0" dur="2.0">I couldn't quite do stuff.</text>
 <text start="3.0" dur="2.5">Let's try &amp; test some entities.</text>
 <text start="5.5" dur="3.0">Special characters: &lt;tag&gt; and "quotes"</text>
 <text start="8.5" dur="2.0">French accents: café à la crème</text>
 <text start="10.5" dur="1.5">Euro symbol: € and degree: °C</text>
 </transcript>"#;

        let plain_parser = TranscriptParser::new(false);
        let plain_snippets = plain_parser.parse(xml_with_entities).unwrap();

        assert_eq!(plain_snippets.len(), 5);

        println!("Entity test 0: '{}'", plain_snippets[0].text);
        println!("Entity test 1: '{}'", plain_snippets[1].text);
        println!("Entity test 2: '{}'", plain_snippets[2].text);
        println!("Entity test 3: '{}'", plain_snippets[3].text);
        println!("Entity test 4: '{}'", plain_snippets[4].text);

        assert_eq!(plain_snippets[0].text, "I couldn't quite do stuff.");
        assert_eq!(plain_snippets[1].text, "Let's try & test some entities.");
        // Plain mode collapses the gap left by the stripped tag into one space.
        assert_eq!(plain_snippets[2].text, "Special characters: and \"quotes\"");
        assert_eq!(plain_snippets[3].text, "French accents: café à la crème");
        assert_eq!(plain_snippets[4].text, "Euro symbol: € and degree: °C");

        let formatting_parser = TranscriptParser::new(true);
        let formatted_snippets = formatting_parser.parse(xml_with_entities).unwrap();

        assert_eq!(formatted_snippets.len(), 5);

        assert_eq!(formatted_snippets[0].text, "I couldn't quite do stuff.");
        assert_eq!(
            formatted_snippets[1].text,
            "Let's try & test some entities."
        );
        // Formatting mode strips the tag but does not collapse whitespace, so the spaces
        // on both sides of it remain (two consecutive spaces).
        assert_eq!(
            formatted_snippets[2].text,
            "Special characters:  and \"quotes\""
        );
    }

    // Direct checks of the tag whitelist, independent of XML parsing.
    #[test]
    fn test_process_with_formatting() {
        let parser = TranscriptParser::new(true);

        let input = "<b>Bold</b> and <span>span</span> and <i>italic</i>";
        let result = parser.process_with_formatting(input);
        assert_eq!(
            result,
            "<b>Bold</b> and <span>span</span> and <i>italic</i>"
        );

        let input2 = "This has <div>unwanted</div> tags but <b>keeps</b> the <i>allowed</i> ones.";
        let result2 = parser.process_with_formatting(input2);
        assert_eq!(
            result2,
            "This has unwanted tags but <b>keeps</b> the <i>allowed</i> ones."
        );

        let input3 =
            "<b id=\"test\">Bold with ID</b> and <i style=\"color:red\">Colored italic</i>";
        let result3 = parser.process_with_formatting(input3);
        assert_eq!(
            result3,
            "<b id=\"test\">Bold with ID</b> and <i style=\"color:red\">Colored italic</i>"
        );
    }
}