use quick_xml::events::Event;
use quick_xml::Reader;
/// A single attribute captured from an element tag.
#[derive(Debug, Clone)]
pub struct Attr {
/// Attribute name. `parse` fills this from `attr.key.local_name()`.
pub name: String,
/// Attribute value, copied from the bytes of `attr.value` as UTF-8.
// NOTE(review): `parse` does not run any entity decoding on this value, so
// references such as `&amp;` presumably stay as written — confirm.
pub value: String,
}
/// One element of the parsed document tree.
#[derive(Debug, Clone)]
pub struct Node {
/// Element name. `parse` fills this from the tag's `local_name()`.
pub name: String,
/// Attributes in the order they were read from the tag.
pub attrs: Vec<Attr>,
/// Text content: every non-blank text, CDATA and reference segment of this
/// element, each trimmed and appended in document order.
pub text: String,
/// `true` once at least one non-blank CDATA segment was appended to `text`.
pub text_has_cdata: bool,
/// Child elements in document order.
pub children: Vec<Node>,
/// 1-based line of the tag start as computed by `byte_offset_to_line_col`;
/// the `__empty__` placeholder root uses `0`.
pub line: u32,
/// 1-based column of the tag start, counted in bytes from the start of the
/// line; the `__empty__` placeholder root uses `0`.
pub col: u32,
}
impl Node {
fn new(name: String, attrs: Vec<Attr>, line: u32, col: u32) -> Self {
Node {
name,
attrs,
text: String::new(),
text_has_cdata: false,
children: Vec::new(),
line,
col,
}
}
pub fn attr(&self, name: &str) -> Option<&str> {
self.attrs
.iter()
.find(|a| a.name == name)
.map(|a| a.value.as_str())
}
pub fn has_child(&self, name: &str) -> bool {
self.children.iter().any(|c| c.name == name)
}
pub fn children_named<'a>(&'a self, name: &'a str) -> impl Iterator<Item = &'a Node> {
self.children.iter().filter(move |c| c.name == name)
}
pub fn child(&self, name: &str) -> Option<&Node> {
self.children.iter().find(|c| c.name == name)
}
pub fn find_descendant(&self, name: &str) -> Option<&Node> {
for child in &self.children {
if child.name == name {
return Some(child);
}
if let Some(found) = child.find_descendant(name) {
return Some(found);
}
}
None
}
pub fn has_descendant(&self, name: &str) -> bool {
self.find_descendant(name).is_some()
}
}
/// Result of `parse`: the element tree plus an optional error description.
#[derive(Debug, Clone)]
pub struct VastDocument {
/// Top-level element. When no element was read at all, `parse` stores a
/// placeholder node named `__empty__` here.
pub root: Node,
/// Human-readable description of the failure that stopped parsing, or
/// `None` when `parse` recorded no error. `root` is populated either way.
pub parse_error: Option<String>,
}
impl VastDocument {
    /// Returns the root element when it is named exactly `VAST`, otherwise
    /// `None`.
    pub fn vast_root(&self) -> Option<&Node> {
        Some(&self.root).filter(|root| root.name == "VAST")
    }
}
/// Converts a byte offset into a 1-based `(line, column)` pair.
///
/// `offset` is clamped to `input.len()`. Lines are separated by `\n` only,
/// and the column counts bytes (not characters) since the last `\n` before
/// the offset.
fn byte_offset_to_line_col(input: &[u8], offset: usize) -> (u32, u32) {
    let end = offset.min(input.len());
    // Walk the newlines before `end`, tracking the line number and the index
    // just past the most recent newline.
    let (line, line_start) = input[..end]
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .fold((1u32, 0usize), |(line, _), (idx, _)| (line + 1, idx + 1));
    (line, (end - line_start) as u32 + 1)
}
/// Appends one piece of character data to `node.text`.
///
/// The segment is trimmed first; a segment that is blank after trimming is
/// ignored entirely and leaves the CDATA flag untouched. `from_cdata` marks
/// the node as containing CDATA once a non-blank CDATA segment is appended.
///
/// Each segment is trimmed on its own, so whitespace at the edges of a
/// segment is dropped even when another segment follows it.
fn append_text_segment(node: &mut Node, segment: &str, from_cdata: bool) {
    let content = segment.trim();
    if !content.is_empty() {
        node.text += content;
        node.text_has_cdata |= from_cdata;
    }
}
/// Decodes the body of an XML reference (the text between `&` and `;`).
///
/// Supported forms:
/// * numeric character references: `#NNN` (decimal) and `#xHHH` / `#XHHH`
///   (hexadecimal);
/// * the five predefined entities `amp`, `lt`, `gt`, `apos` and `quot`.
///
/// Returns `None` for anything else, including:
/// * numeric references containing a non-digit character. A leading `+` is
///   rejected explicitly, because the integer parsers would accept it;
/// * code point 0, which is never a legal XML character;
/// * values that are not Unicode scalar values (surrogates, out of range).
fn decode_general_reference(reference: &str) -> Option<String> {
    if let Some(number) = reference.strip_prefix('#') {
        let (digits, radix) = match number
            .strip_prefix('x')
            .or_else(|| number.strip_prefix('X'))
        {
            Some(hex) => (hex, 16),
            None => (number, 10),
        };
        // Validate the digits up front: `from_str_radix` alone would let a
        // sign character through.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        let codepoint = u32::from_str_radix(digits, radix).ok()?;
        if codepoint == 0 {
            return None;
        }
        return char::from_u32(codepoint).map(|ch| ch.to_string());
    }

    let replacement = match reference {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "apos" => "'",
        "quot" => "\"",
        _ => return None,
    };
    Some(replacement.to_owned())
}
pub fn parse(input: &str) -> VastDocument {
let input_bytes = input.as_bytes();
let mut reader = Reader::from_str(input);
reader.config_mut().trim_text(true);
let mut stack: Vec<Node> = Vec::new();
let mut parse_error: Option<String> = None;
loop {
match reader.read_event() {
Ok(Event::Start(e)) => {
let end_pos = reader.buffer_position() as usize;
let tag_bytes = e.as_ref();
let tag_len = tag_bytes.len() + 2; let start_pos = end_pos.saturating_sub(tag_len);
let (line, col) = byte_offset_to_line_col(input_bytes, start_pos);
let name = std::str::from_utf8(e.local_name().as_ref())
.unwrap_or("")
.to_owned();
let mut attrs = Vec::new();
for attr in e.attributes().flatten() {
let key = std::str::from_utf8(attr.key.local_name().as_ref())
.unwrap_or("")
.to_owned();
let val = std::str::from_utf8(attr.value.as_ref())
.unwrap_or("")
.to_owned();
attrs.push(Attr {
name: key,
value: val,
});
}
stack.push(Node::new(name, attrs, line, col));
}
Ok(Event::End(_)) if stack.len() > 1 => {
let finished = stack.pop().unwrap();
stack.last_mut().unwrap().children.push(finished);
}
Ok(Event::End(_)) => {}
Ok(Event::Empty(e)) => {
let end_pos = reader.buffer_position() as usize;
let tag_bytes = e.as_ref();
let tag_len = tag_bytes.len() + 3; let start_pos = end_pos.saturating_sub(tag_len);
let (line, col) = byte_offset_to_line_col(input_bytes, start_pos);
let name = std::str::from_utf8(e.local_name().as_ref())
.unwrap_or("")
.to_owned();
let mut attrs = Vec::new();
for attr in e.attributes().flatten() {
let key = std::str::from_utf8(attr.key.local_name().as_ref())
.unwrap_or("")
.to_owned();
let val = std::str::from_utf8(attr.value.as_ref())
.unwrap_or("")
.to_owned();
attrs.push(Attr {
name: key,
value: val,
});
}
let node = Node::new(name, attrs, line, col);
if let Some(parent) = stack.last_mut() {
parent.children.push(node);
} else {
stack.push(node);
}
}
Ok(Event::Text(e)) => {
if let Some(node) = stack.last_mut() {
if let Ok(text) = e.xml10_content() {
append_text_segment(node, text.as_ref(), false);
}
}
}
Ok(Event::CData(e)) => {
if let Some(node) = stack.last_mut() {
let bytes = e.into_inner();
if let Ok(text) = std::str::from_utf8(&bytes) {
append_text_segment(node, text, true);
}
}
}
Ok(Event::GeneralRef(e)) => {
if let Some(node) = stack.last_mut() {
if let Ok(reference) = std::str::from_utf8(e.as_ref()) {
if let Some(decoded) = decode_general_reference(reference) {
append_text_segment(node, &decoded, false);
} else {
let raw = format!("&{};", reference);
append_text_segment(node, &raw, false);
}
}
}
}
Ok(Event::Eof) => {
if stack.len() > 1 {
let unclosed = stack
.iter()
.skip(1)
.map(|node| node.name.as_str())
.collect::<Vec<_>>()
.join(" > ");
parse_error = Some(format!(
"XML parse error at end of document: unexpected EOF with unclosed element(s): {}",
unclosed
));
}
break;
}
Err(e) => {
parse_error = Some(format!(
"XML parse error at position {}: {}",
reader.error_position(),
e
));
break;
}
_ => {}
}
}
let root = if stack.is_empty() {
Node::new("__empty__".to_owned(), Vec::new(), 0, 0)
} else {
while stack.len() > 1 {
let node = stack.pop().unwrap();
stack.last_mut().unwrap().children.push(node);
}
stack.pop().unwrap()
};
VastDocument { root, parse_error }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Inputs that must all produce a parse error.
    const MALFORMED: [&str; 4] = [
        r#"<VAST version="4.0"><Ad><InLine><AdSystem>Broken"#,
        r#"<VAST version="4.0"><Ad></VAST>"#,
        r#"<VAST version="4.0"><Ad><InLine></Ad></InLine></VAST>"#,
        r#"<VAST version="4.0"><Ad id="broken><InLine /></Ad></VAST>"#,
    ];

    /// Follows `path` from `start`, taking the first child with each name.
    /// Panics when a step is missing.
    fn descend<'a>(start: &'a Node, path: &[&str]) -> &'a Node {
        path.iter()
            .fold(start, |node, name| node.child(name).unwrap())
    }

    #[test]
    fn parses_minimal_vast() {
        let doc = parse(r#"<VAST version="4.1"></VAST>"#);
        assert!(doc.parse_error.is_none());
        assert_eq!(doc.root.name, "VAST");
        assert_eq!(doc.root.attr("version"), Some("4.1"));
    }

    #[test]
    fn parses_self_closing_child() {
        let doc = parse(r#"<VAST version="4.1"><Ad id="1"/></VAST>"#);
        assert!(doc.root.has_child("Ad"));
    }

    #[test]
    fn captures_cdata_text() {
        let doc = parse(
            r#"<VAST version="4.1"><Ad><InLine><Impression><![CDATA[https://example.com/imp]]></Impression></InLine></Ad></VAST>"#,
        );
        let imp = descend(&doc.root, &["Ad", "InLine", "Impression"]);
        assert_eq!(imp.text, "https://example.com/imp");
        assert!(imp.text_has_cdata);
    }

    #[test]
    fn concatenates_split_cdata_text() {
        let doc = parse(
            r#"<VAST version="4.1"><Ad><InLine><Impression><![CDATA[https://example.com/imp?a=1]]><![CDATA[&b=2]]></Impression></InLine></Ad></VAST>"#,
        );
        let imp = descend(&doc.root, &["Ad", "InLine", "Impression"]);
        assert_eq!(imp.text, "https://example.com/imp?a=1&b=2");
        assert!(imp.text_has_cdata);
    }

    #[test]
    fn unexpected_eof_sets_parse_error() {
        let doc = parse(r#"<VAST version="4.0"><Ad><InLine><AdSystem>Broken"#);
        assert!(doc.parse_error.is_some());
    }

    #[test]
    fn malformed_inputs_set_parse_error_across_multiple_shapes() {
        for xml in MALFORMED {
            let doc = parse(xml);
            assert!(doc.parse_error.is_some(), "expected parse error for {xml}");
        }
    }

    #[test]
    fn preserves_entity_references_in_text() {
        let doc = parse(
            r#"<VAST version="2.0"><Ad><InLine><AdSystem>Test</AdSystem><AdTitle>Test</AdTitle><Impression>https://t.example.com/imp</Impression><Creatives><Creative><Linear><Duration>00:00:30</Duration><MediaFiles><MediaFile delivery="progressive" type="video/mp4" width="640" height="360">https://cdn.example.com/ad.mp4</MediaFile></MediaFiles></Linear></Creative></Creatives><Extensions><Extension>alpha&beta</Extension></Extensions></InLine></Ad></VAST>"#,
        );
        let extension = descend(&doc.root, &["Ad", "InLine", "Extensions", "Extension"]);
        assert_eq!(extension.text, "alpha&beta");
        assert!(!extension.text_has_cdata);
    }

    #[test]
    fn sets_parse_error_on_malformed_xml() {
        let doc = parse(r#"<VAST version="4.1"><Ad></VAST>"#);
        assert!(doc.parse_error.is_some());
    }
}