use anyhow::Result;
use regex::Regex;
use crate::models::FetchedTranscriptSnippet;
/// Converts raw transcript XML into a list of [`FetchedTranscriptSnippet`]s.
///
/// Markup embedded in each snippet's text is either removed entirely or
/// reduced to a small set of inline formatting tags, depending on how the
/// parser was constructed.
#[derive(Debug)]
pub struct TranscriptParser {
    /// When `true`, `parse` routes snippet text through
    /// `process_with_formatting` instead of stripping every tag.
    preserve_formatting: bool,
    /// Compiled pattern used to locate tag-like `<...>` spans in snippet text.
    html_regex: Regex,
}
impl TranscriptParser {
    /// Names of the inline formatting tags that survive
    /// [`process_with_formatting`](Self::process_with_formatting).
    const FORMATTING_TAGS: [&'static str; 10] = [
        "strong", "em", "b", "i", "mark", "small", "del", "ins", "sub", "sup",
    ];

    /// Creates a parser.
    ///
    /// `preserve_formatting` selects what [`parse`](Self::parse) does with
    /// markup found in snippet text: `true` keeps the tags listed in
    /// `FORMATTING_TAGS` and drops the rest, `false` drops every tag.
    ///
    /// # Panics
    ///
    /// Panics only if the built-in tag pattern fails to compile, which would
    /// be a programming error rather than a runtime condition.
    pub fn new(preserve_formatting: bool) -> Self {
        let html_regex =
            Regex::new(r"<[^>]*>").expect("hard-coded HTML tag pattern is a valid regex");
        Self {
            preserve_formatting,
            html_regex,
        }
    }

    /// Parses transcript XML into snippets.
    ///
    /// Every direct child of the root element whose tag name is `text`
    /// becomes one snippet, in document order:
    ///
    /// * `start` and `duration` are read from the `start` and `dur`
    ///   attributes. A missing or non-numeric attribute yields `0.0`
    ///   (deliberately lenient; no error is raised).
    /// * `text` is the element's text with markup handled according to the
    ///   `preserve_formatting` flag. An element without text yields an empty
    ///   string.
    ///
    /// # Errors
    ///
    /// Returns an error when `raw_data` cannot be parsed as XML.
    pub fn parse(&self, raw_data: &str) -> Result<Vec<FetchedTranscriptSnippet>, anyhow::Error> {
        let document = roxmltree::Document::parse(raw_data)?;

        let snippets = document
            .root_element()
            .children()
            .filter(|node| node.has_tag_name("text"))
            .map(|text_elem| {
                // NOTE(review): `Node::text` is documented as returning the first
                // text child only; text after a nested element would be ignored.
                // Confirm transcripts never contain real child elements here.
                let raw_text = text_elem.text().unwrap_or_default();

                let text = if self.preserve_formatting {
                    self.process_with_formatting(raw_text)
                } else {
                    self.html_regex.replace_all(raw_text, "").into_owned()
                };

                FetchedTranscriptSnippet {
                    text,
                    start: Self::parse_seconds(text_elem.attribute("start")),
                    duration: Self::parse_seconds(text_elem.attribute("dur")),
                }
            })
            .collect::<Vec<_>>();

        Ok(snippets)
    }

    /// Removes every tag from `text` except the inline formatting tags listed
    /// in `FORMATTING_TAGS`.
    ///
    /// A tag is kept (verbatim, including any attributes) when its *name* is
    /// one of the allowed names, whether it is an opening tag (`<b>`), a
    /// closing tag (`</b>`) or self-closing (`<b/>`). Names are compared
    /// exactly and ASCII-case-insensitively, so `<br>` or `<img>` are removed
    /// even though they begin with an allowed name. Text outside tags is
    /// copied through unchanged.
    ///
    /// This method does not consult the `preserve_formatting` flag; it always
    /// applies the whitelist.
    ///
    /// ```text
    /// "<b>bold</b><br><span>x</span>"  ->  "<b>bold</b>x"
    /// ```
    pub fn process_with_formatting(&self, text: &str) -> String {
        let mut result = String::with_capacity(text.len());
        // Byte offset in `text` just past the last tag handled so far.
        let mut cursor = 0;

        for tag in self.html_regex.find_iter(text) {
            // Plain text between the previous tag and this one.
            result.push_str(&text[cursor..tag.start()]);
            if Self::is_formatting_tag(tag.as_str()) {
                result.push_str(tag.as_str());
            }
            cursor = tag.end();
        }

        // Trailing text after the final tag (or all of `text` if there were none).
        result.push_str(&text[cursor..]);
        result
    }

    /// Parses an optional attribute value as seconds, falling back to `0.0`
    /// when the attribute is absent or is not a valid number.
    fn parse_seconds(raw: Option<&str>) -> f64 {
        raw.and_then(|value| value.parse::<f64>().ok())
            .unwrap_or(0.0)
    }

    /// Reports whether `tag` (a full `<...>` span) names an allowed formatting tag.
    fn is_formatting_tag(tag: &str) -> bool {
        let inner = tag
            .strip_prefix('<')
            .and_then(|rest| rest.strip_suffix('>'))
            .unwrap_or(tag);
        // Closing tags carry a leading slash before the name.
        let inner = inner.strip_prefix('/').unwrap_or(inner);

        // The tag name ends at the first whitespace or '/' (attributes or a
        // self-closing slash follow), otherwise it runs to the end.
        let name_end = inner
            .find(|c: char| c.is_ascii_whitespace() || c == '/')
            .unwrap_or(inner.len());
        let name = &inner[..name_end];

        Self::FORMATTING_TAGS
            .iter()
            .any(|&allowed| name.eq_ignore_ascii_case(allowed))
    }
}