use crate::infer;
use crate::parser::{Confidence, FormatParser, STRONG, TEXT};
use crate::table::TableBuilder;
use ax_core::{AxError, Column, Value};
use roxmltree::{Document, Node};
use std::collections::BTreeMap;
/// Format parser for XML documents.
///
/// A stateless unit type: all behaviour lives in its inherent and
/// `FormatParser` implementations.
#[derive(Clone, Debug, Default)]
pub struct XmlParser;
fn find_records<'a, 'input>(doc: &'a Document<'input>) -> Vec<Node<'a, 'input>> {
let mut best: Vec<Node> = Vec::new();
for parent in doc.descendants().filter(Node::is_element) {
let mut groups: Vec<(&str, Vec<Node>)> = Vec::new();
for child in parent.children().filter(Node::is_element) {
let name = child.tag_name().name();
match groups.iter_mut().find(|(n, _)| *n == name) {
Some(group) => group.1.push(child),
None => groups.push((name, vec![child])),
}
}
for (_, nodes) in groups {
if nodes.len() >= 2 && nodes.len() > best.len() {
best = nodes;
}
}
}
if best.is_empty() {
best.push(doc.root_element());
}
best
}
/// Returns `true` when `node` has no element children.
///
/// Other child node kinds do not affect the result; only elements are checked.
fn is_leaf(node: &Node) -> bool {
    node.children().all(|child| !child.is_element())
}
impl XmlParser {
    /// Wraps any displayable failure in an `AxError::Parse`.
    ///
    /// The error's `format` is this parser's id and its `message` is the
    /// `Display` rendering of `msg`.
    fn err(&self, msg: impl std::fmt::Display) -> AxError {
        let format = String::from(self.id());
        let message = msg.to_string();
        AxError::Parse { format, message }
    }
}
impl FormatParser for XmlParser {
    /// Identifier of this parser; also used as the `format` of its parse errors.
    fn id(&self) -> &'static str {
        "xml"
    }

    /// File extensions claimed by this parser.
    fn extensions(&self) -> &'static [&'static str] {
        &["xml", "nessus"]
    }

    /// Guesses from content whether `bytes` holds an XML document.
    ///
    /// A leading UTF-8 byte-order mark and leading whitespace are ignored.
    /// Returns `Some(STRONG)` when the text then starts with an `<?xml`
    /// declaration, `Some(TEXT)` when it starts with `<` followed by an ASCII
    /// letter or `_`, and `None` otherwise (including when `bytes` is not
    /// valid UTF-8).
    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
        let text = std::str::from_utf8(bytes).ok()?;
        // U+FEFF is not whitespace as far as `trim_start` is concerned, so it
        // has to be removed explicitly; otherwise a BOM-prefixed file could
        // never be recognised by content.
        let text = text.strip_prefix('\u{feff}').unwrap_or(text);
        let trimmed = text.trim_start();
        if trimmed.starts_with("<?xml") {
            return Some(STRONG);
        }
        let after = trimmed.strip_prefix('<')?;
        after
            .starts_with(|c: char| c.is_ascii_alphabetic() || c == '_')
            .then_some(TEXT)
    }

    /// Parses `bytes` as XML and turns the records found by `find_records`
    /// into table columns.
    ///
    /// Each record contributes one row: its attributes, plus every child
    /// element that has no element children of its own. Values go through
    /// `infer::infer_scalar`; a leaf whose trimmed text is empty becomes
    /// `Value::Null`. Keys are local names, so within one record a later
    /// entry with the same name replaces an earlier one (a leaf child
    /// replaces a same-named attribute, and the last of several same-named
    /// leaves wins).
    ///
    /// # Errors
    ///
    /// Returns `AxError::Parse` when `bytes` is not valid UTF-8 or when
    /// `roxmltree` rejects the document.
    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
        let doc = Document::parse(text).map_err(|e| self.err(e))?;

        let mut builder = TableBuilder::new();
        for record in find_records(&doc) {
            let mut row: BTreeMap<String, Value> = BTreeMap::new();

            for attr in record.attributes() {
                row.insert(attr.name().to_string(), infer::infer_scalar(attr.value()));
            }

            // Nested (non-leaf) children are skipped rather than flattened.
            for child in record.children().filter(Node::is_element) {
                if is_leaf(&child) {
                    let text = child.text().unwrap_or("").trim();
                    let cell = if text.is_empty() {
                        Value::Null
                    } else {
                        infer::infer_scalar(text)
                    };
                    row.insert(child.tag_name().name().to_string(), cell);
                }
            }

            builder.push_row(row);
        }
        Ok(builder.finish())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// Three `<vuln>` records, each with two attributes and two leaf children.
    const REPORT: &str = r#"<?xml version="1.0"?>
<vulns>
<vuln id="1" severity="high"><name>SQLi</name><port>443</port></vuln>
<vuln id="2" severity="low"><name>XSS</name><port>80</port></vuln>
<vuln id="3" severity="high"><name>RCE</name><port>22</port></vuln>
</vulns>"#;

    /// Parses `s` with `XmlParser`, panicking if parsing fails.
    fn parse(s: &str) -> Vec<Column> {
        let outcome = XmlParser.parse("-", s.as_bytes());
        outcome.unwrap()
    }

    /// Looks up the column called `name`, panicking when it is absent.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        match cols.iter().find(|candidate| candidate.name == name) {
            Some(found) => found,
            None => panic!("missing column {name}"),
        }
    }

    /// Reports whether any column is called `name`.
    fn has_column(cols: &[Column], name: &str) -> bool {
        cols.iter().any(|candidate| candidate.name == name)
    }

    #[test]
    fn repeated_element_becomes_rows_with_attrs_and_leaf_children() {
        let table = parse(REPORT);

        let id = col(&table, "id");
        assert_eq!(id.cells.len(), 3);
        assert_eq!(id.ty, ColType::Int);
        assert_eq!(id.cells, vec![Value::Int(1), Value::Int(2), Value::Int(3)]);

        let severity = col(&table, "severity");
        assert_eq!(
            severity.cells,
            vec![
                Value::Str("high".into()),
                Value::Str("low".into()),
                Value::Str("high".into())
            ]
        );

        let name = col(&table, "name");
        assert_eq!(name.cells[0], Value::Str("SQLi".into()));

        let port = col(&table, "port");
        assert_eq!(port.ty, ColType::Int);
        assert_eq!(
            port.cells,
            vec![Value::Int(443), Value::Int(80), Value::Int(22)]
        );
    }

    #[test]
    fn picks_the_deepest_repeated_group_not_a_leaf() {
        // The three `<vuln>` siblings are the records: four columns in total,
        // with one `name` cell per record.
        let table = parse(REPORT);
        assert_eq!(table.len(), 4);
        assert_eq!(col(&table, "name").cells.len(), 3);
    }

    #[test]
    fn tie_breaks_to_the_first_repeated_group() {
        let table = parse(r#"<root><a x="1"/><a x="2"/><b y="3"/><b y="4"/></root>"#);
        let x = col(&table, "x");
        assert_eq!(x.cells, vec![Value::Int(1), Value::Int(2)]);
        assert!(!has_column(&table, "y"), "the later group must lose");
    }

    #[test]
    fn non_leaf_children_are_not_flattened() {
        let table = parse(
            "<items><item id=\"1\"><meta><sub>d</sub></meta><tag>a</tag></item>\
             <item id=\"2\"><meta><sub>d</sub></meta><tag>b</tag></item></items>",
        );
        let tag = col(&table, "tag");
        assert_eq!(
            tag.cells,
            vec![Value::Str("a".into()), Value::Str("b".into())]
        );
        assert!(!has_column(&table, "meta"), "non-leaf child skipped");
    }

    #[test]
    fn no_repetition_treats_root_as_one_record() {
        let table = parse("<config><host>web01</host><port>8080</port></config>");
        let host = col(&table, "host");
        assert_eq!(host.cells, vec![Value::Str("web01".into())]);
        let port = col(&table, "port");
        assert_eq!(port.cells, vec![Value::Int(8080)]);
    }

    #[test]
    fn empty_leaf_is_null() {
        let table = parse("<r><a>x</a><b></b></r>");
        assert_eq!(col(&table, "a").cells[0], Value::Str("x".into()));
        assert_eq!(col(&table, "b").cells[0], Value::Null);
    }

    #[test]
    fn malformed_xml_errors() {
        let unclosed = XmlParser.parse("-", b"<unclosed>");
        assert!(matches!(unclosed, Err(AxError::Parse { .. })));

        let not_xml = XmlParser.parse("-", b"not xml at all");
        assert!(matches!(not_xml, Err(AxError::Parse { .. })));
    }

    #[test]
    fn sniff_recognizes_xml() {
        let sniff = |input: &[u8]| XmlParser.sniff(input);

        assert_eq!(sniff(REPORT.as_bytes()), Some(STRONG));
        assert_eq!(sniff(b"<vulns><vuln/></vulns>"), Some(TEXT));
        assert_eq!(sniff(b" <root>x</root>"), Some(TEXT));
        assert_eq!(sniff(b"<34>Oct syslog"), None);
        assert_eq!(sniff(b"{\"a\":1}"), None);
        assert_eq!(sniff(b"a,b,c\n1,2,3"), None);
    }

    #[test]
    fn claims_xml_extensions() {
        let claimed = XmlParser.extensions();
        assert_eq!(claimed, &["xml", "nessus"]);
    }

    #[test]
    fn resolves_by_extension_and_content() {
        let registry = crate::parser::ParserRegistry::default();

        let by_xml_ext = registry.resolve("scan.xml", b"zz").unwrap();
        assert_eq!(by_xml_ext.id(), "xml");

        let by_nessus_ext = registry.resolve("scan.nessus", b"zz").unwrap();
        assert_eq!(by_nessus_ext.id(), "xml");

        let by_content = registry.resolve("-", REPORT.as_bytes()).unwrap();
        assert_eq!(by_content.id(), "xml");
    }
}