use crate::types::TranscriptSegment;
/// Parses caption XML into a list of transcript segments, in document order.
///
/// Two element layouts are recognised, selected by a substring probe on the
/// whole input:
///
/// * If `xml` contains `<p t="`, every `<p t="…">…</p>` element is read and its
///   `t` / `d` attributes are parsed as integer milliseconds.
/// * Otherwise every `<text …>…</text>` element is read and its `start` / `dur`
///   attributes are parsed as fractional seconds and converted to milliseconds.
///
/// An element whose start attribute is missing or unparsable is skipped. A
/// missing or unparsable duration becomes `0`. Segment text is the element's
/// inner text with markup stripped by `strip_tags`.
///
/// This is a substring scanner, not an XML parser: malformed input never
/// panics here, it simply yields fewer (or no) segments.
pub fn parse_caption_xml(xml: &str) -> Vec<TranscriptSegment> {
    // Shared tail of both layouts. The text is only extracted once we know the
    // element has a usable start time, so rejected elements cost nothing.
    let build = |element: &str, start: Option<u64>, duration: Option<u64>| {
        start.map(|start_ms| TranscriptSegment {
            text: strip_tags(extract_inner_text(element)),
            start_ms,
            duration_ms: duration.unwrap_or(0),
        })
    };

    if xml.contains("<p t=\"") {
        RegexIter::new(xml, "<p t=\"", "</p>")
            .filter_map(|element| {
                let millis = |prefix: &str| {
                    extract_attr(element, prefix, "\"").and_then(|v| v.parse::<u64>().ok())
                };
                build(element, millis("t=\""), millis("d=\""))
            })
            .collect()
    } else {
        RegexIter::new(xml, "<text ", "</text>")
            .filter_map(|element| {
                let millis =
                    |prefix: &str| extract_attr(element, prefix, "\"").and_then(seconds_to_ms);
                build(element, millis("start=\""), millis("dur=\""))
            })
            .collect()
    }
}

/// Converts a decimal-seconds attribute value to whole milliseconds.
///
/// Returns `None` when `raw` is not a number or is NaN / infinite (Rust's `f64`
/// parser accepts the spellings `nan` and `inf`). The product is rounded to the
/// nearest millisecond rather than truncated, because binary floating point can
/// land a hair below the intended integer (e.g. `569.9999…` instead of `570`).
fn seconds_to_ms(raw: &str) -> Option<u64> {
    let seconds = raw.parse::<f64>().ok()?;
    if !seconds.is_finite() {
        return None;
    }
    // Float-to-int `as` saturates: negative values clamp to 0 and oversized
    // values clamp to `u64::MAX`, so this cast cannot wrap.
    Some((seconds * 1000.0).round() as u64)
}
/// Iterator over the substrings of `haystack` that start with `start_tag` and
/// run through the next occurrence of `end_tag` (both delimiters included).
///
/// Despite the name, matching is plain substring search; no regular expression
/// is involved. Matches are yielded left to right and never overlap.
struct RegexIter<'a> {
    /// Text being scanned.
    haystack: &'a str,
    /// Literal that opens a match.
    start_tag: &'a str,
    /// Literal that closes a match.
    end_tag: &'a str,
    /// Byte offset into `haystack` at which the next search resumes.
    pos: usize,
}

impl<'a> RegexIter<'a> {
    /// Creates an iterator positioned at the beginning of `haystack`.
    fn new(haystack: &'a str, start_tag: &'a str, end_tag: &'a str) -> Self {
        RegexIter {
            pos: 0,
            haystack,
            start_tag,
            end_tag,
        }
    }
}

impl<'a> Iterator for RegexIter<'a> {
    type Item = &'a str;

    /// Returns the next delimited slice, or `None` once no further
    /// `start_tag` … `end_tag` pair exists at or after the cursor.
    fn next(&mut self) -> Option<&'a str> {
        // Copy the shared reference out so the returned slice borrows the
        // underlying text rather than `self`.
        let haystack: &'a str = self.haystack;

        let begin = self.pos + haystack[self.pos..].find(self.start_tag)?;
        let from_open = &haystack[begin..];

        // The closing delimiter is searched for from the start of the opening
        // one; the cursor is left untouched when it is missing.
        let len = from_open.find(self.end_tag)? + self.end_tag.len();

        self.pos = begin + len;
        Some(&from_open[..len])
    }
}
/// Returns the text found between the first occurrence of `prefix` in `s` and
/// the first occurrence of `suffix` after it.
///
/// Yields `None` when `prefix` does not occur, or when no `suffix` follows it.
/// The search is purely textual: `prefix` is matched wherever it first appears
/// in `s`.
fn extract_attr<'a>(s: &'a str, prefix: &str, suffix: &str) -> Option<&'a str> {
    let (_, after_prefix) = s.split_once(prefix)?;
    let (value, _) = after_prefix.split_once(suffix)?;
    Some(value)
}
/// Returns what lies between the first `>` and the last `<` of `s`.
///
/// For a complete element such as `<p a="1">body</p>` this is `body`, with any
/// nested markup left in place. If `s` has no `>` it is returned unchanged; if
/// nothing after the first `>` contains a `<`, everything after that `>` is
/// returned.
fn extract_inner_text(s: &str) -> &str {
    let after_open = match s.split_once('>') {
        Some((_, tail)) => tail,
        None => return s,
    };
    after_open
        .rsplit_once('<')
        .map_or(after_open, |(inner, _)| inner)
}
/// Removes markup from `s` and decodes a small set of character entities.
///
/// Everything from a `<` up to and including the next `>` is discarded. A `>`
/// that does not close a tag is ordinary text and is kept. After the tags are
/// gone, the entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;` are replaced
/// by the characters they stand for; any other entity is left as written.
///
/// Decoding happens after tag removal so that a decoded `<` or `>` is never
/// mistaken for markup.
fn strip_tags(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            // Only a `>` that terminates a tag is markup; a bare one falls
            // through to the text arm below.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }
    // NOTE(review): `&amp;` is decoded first, so doubly-escaped input such as
    // `&amp;#39;` collapses all the way to `'`. That ordering is kept as-is on
    // the assumption that callers rely on it — confirm before reordering.
    result
        .replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `<text start="…" dur="…">` layout: times are decimal seconds and must
    /// come back as milliseconds.
    #[test]
    fn test_parse_caption_xml_format1() {
        let xml = r#"<?xml version="1.0"?>
            <transcript>
            <text start="18.04" dur="3.56">We're no strangers to love</text>
            <text start="22.56" dur="3.22">You know the rules and so do I</text>
            </transcript>"#;
        let segs = parse_caption_xml(xml);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].text, "We're no strangers to love");
        assert_eq!(segs[0].start_ms, 18040);
        assert_eq!(segs[0].duration_ms, 3560);
        assert_eq!(segs[1].text, "You know the rules and so do I");
        assert_eq!(segs[1].start_ms, 22560);
    }

    /// `<p t="…" d="…">` layout: times are already integer milliseconds.
    #[test]
    fn test_parse_caption_xml_format2() {
        let xml = r#"<timedtext><body>
            <p t="5000" d="2000">Hello world</p>
            <p t="8000" d="1500">Goodbye</p>
            </body></timedtext>"#;
        let segs = parse_caption_xml(xml);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].text, "Hello world");
        assert_eq!(segs[0].start_ms, 5000);
        assert_eq!(segs[0].duration_ms, 2000);
        assert_eq!(segs[1].start_ms, 8000);
    }

    /// Escaped characters in the caption body must be decoded. The input has
    /// to contain real entities, otherwise this test cannot fail when decoding
    /// is broken.
    #[test]
    fn test_parse_caption_xml_html_entities() {
        let xml = r#"<transcript>
            <text start="1.0" dur="1.0">Tom &amp; Jerry</text>
            <text start="2.0" dur="1.0">It&#39;s &quot;great&quot;</text>
            </transcript>"#;
        let segs = parse_caption_xml(xml);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].text, "Tom & Jerry");
        assert_eq!(segs[1].text, "It's \"great\"");
    }

    /// Markup nested inside a caption is removed but its text is kept.
    #[test]
    fn test_parse_caption_xml_nested_tags() {
        let xml = r#"<timedtext><body>
            <p t="0" d="100">Hello <s>there</s></p>
            </body></timedtext>"#;
        let segs = parse_caption_xml(xml);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello there");
    }

    /// Input without any caption elements yields no segments.
    #[test]
    fn test_parse_caption_xml_empty() {
        assert!(parse_caption_xml("").is_empty());
        assert!(parse_caption_xml("<transcript></transcript>").is_empty());
    }
}