anomalyx_normalize/parsers/
xml.rs1use crate::infer;
15use crate::parser::{Confidence, FormatParser, STRONG, TEXT};
16use crate::table::TableBuilder;
17use ax_core::{AxError, Column, Value};
18use roxmltree::{Document, Node};
19use std::collections::BTreeMap;
20
/// Stateless parser for XML input.
///
/// Carries no data; all behaviour lives in its `FormatParser` implementation.
#[derive(Clone, Debug, Default)]
pub struct XmlParser;
23
24fn find_records<'a, 'input>(doc: &'a Document<'input>) -> Vec<Node<'a, 'input>> {
28 let mut best: Vec<Node> = Vec::new();
29 for parent in doc.descendants().filter(Node::is_element) {
30 let mut groups: Vec<(&str, Vec<Node>)> = Vec::new();
32 for child in parent.children().filter(Node::is_element) {
33 let name = child.tag_name().name();
34 match groups.iter_mut().find(|(n, _)| *n == name) {
35 Some(group) => group.1.push(child),
36 None => groups.push((name, vec![child])),
37 }
38 }
39 for (_, nodes) in groups {
40 if nodes.len() >= 2 && nodes.len() > best.len() {
41 best = nodes;
42 }
43 }
44 }
45 if best.is_empty() {
46 best.push(doc.root_element());
47 }
48 best
49}
50
51fn is_leaf(node: &Node) -> bool {
53 !node.children().any(|c| c.is_element())
54}
55
56impl XmlParser {
57 fn err(&self, msg: impl std::fmt::Display) -> AxError {
58 AxError::Parse {
59 format: self.id().to_string(),
60 message: msg.to_string(),
61 }
62 }
63}
64
65impl FormatParser for XmlParser {
66 fn id(&self) -> &'static str {
67 "xml"
68 }
69 fn extensions(&self) -> &'static [&'static str] {
70 &["xml", "nessus"]
71 }
72 fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
73 let text = std::str::from_utf8(bytes).ok()?;
74 let trimmed = text.trim_start();
75 if trimmed.starts_with("<?xml") {
76 return Some(STRONG);
77 }
78 let after = trimmed.strip_prefix('<')?;
81 after
82 .starts_with(|c: char| c.is_ascii_alphabetic() || c == '_')
83 .then_some(TEXT)
84 }
85 fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
86 let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
87 let doc = Document::parse(text).map_err(|e| self.err(e))?;
88 let mut builder = TableBuilder::new();
89 for record in find_records(&doc) {
90 let mut row: BTreeMap<String, Value> = BTreeMap::new();
91 for attr in record.attributes() {
92 row.insert(attr.name().to_string(), infer::infer_scalar(attr.value()));
93 }
94 for child in record.children().filter(Node::is_element) {
95 if is_leaf(&child) {
96 let text = child.text().unwrap_or("").trim();
97 let cell = if text.is_empty() {
98 Value::Null
99 } else {
100 infer::infer_scalar(text)
101 };
102 row.insert(child.tag_name().name().to_string(), cell);
103 }
104 }
105 builder.push_row(row);
106 }
107 Ok(builder.finish())
108 }
109}
110
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// Three `<vuln>` records, each with two attributes and two leaf children.
    const REPORT: &str = r#"<?xml version="1.0"?>
<vulns>
 <vuln id="1" severity="high"><name>SQLi</name><port>443</port></vuln>
 <vuln id="2" severity="low"><name>XSS</name><port>80</port></vuln>
 <vuln id="3" severity="high"><name>RCE</name><port>22</port></vuln>
</vulns>"#;

    /// Parses `s` with `XmlParser`, panicking if parsing fails.
    fn parse(s: &str) -> Vec<Column> {
        XmlParser.parse("-", s.as_bytes()).unwrap()
    }

    /// Looks up a column by name, panicking with the name when it is absent.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        match cols.iter().find(|c| c.name == name) {
            Some(found) => found,
            None => panic!("missing column {name}"),
        }
    }

    #[test]
    fn repeated_element_becomes_rows_with_attrs_and_leaf_children() {
        let cols = parse(REPORT);

        let id = col(&cols, "id");
        assert_eq!(id.cells.len(), 3);
        assert_eq!(id.ty, ColType::Int);
        assert_eq!(id.cells, vec![Value::Int(1), Value::Int(2), Value::Int(3)]);

        let severity = col(&cols, "severity");
        assert_eq!(
            severity.cells,
            vec![
                Value::Str("high".into()),
                Value::Str("low".into()),
                Value::Str("high".into())
            ]
        );

        let name = col(&cols, "name");
        assert_eq!(name.cells[0], Value::Str("SQLi".into()));

        let port = col(&cols, "port");
        assert_eq!(port.ty, ColType::Int);
        assert_eq!(
            port.cells,
            vec![Value::Int(443), Value::Int(80), Value::Int(22)]
        );
    }

    #[test]
    fn picks_the_deepest_repeated_group_not_a_leaf() {
        let cols = parse(REPORT);
        // id, severity, name, port
        assert_eq!(cols.len(), 4);
        assert_eq!(col(&cols, "name").cells.len(), 3);
    }

    #[test]
    fn tie_breaks_to_the_first_repeated_group() {
        let cols = parse(r#"<root><a x="1"/><a x="2"/><b y="3"/><b y="4"/></root>"#);
        let x = col(&cols, "x");
        assert_eq!(x.cells, vec![Value::Int(1), Value::Int(2)]);
        assert!(
            !cols.iter().any(|c| c.name == "y"),
            "the later group must lose"
        );
    }

    #[test]
    fn non_leaf_children_are_not_flattened() {
        let cols = parse(
            "<items><item id=\"1\"><meta><sub>d</sub></meta><tag>a</tag></item>\
             <item id=\"2\"><meta><sub>d</sub></meta><tag>b</tag></item></items>",
        );
        let tag = col(&cols, "tag");
        assert_eq!(
            tag.cells,
            vec![Value::Str("a".into()), Value::Str("b".into())]
        );
        assert!(
            !cols.iter().any(|c| c.name == "meta"),
            "non-leaf child skipped"
        );
    }

    #[test]
    fn no_repetition_treats_root_as_one_record() {
        let cols = parse("<config><host>web01</host><port>8080</port></config>");
        let host = col(&cols, "host");
        let port = col(&cols, "port");
        assert_eq!(host.cells, vec![Value::Str("web01".into())]);
        assert_eq!(port.cells, vec![Value::Int(8080)]);
    }

    #[test]
    fn empty_leaf_is_null() {
        let cols = parse("<r><a>x</a><b></b></r>");
        let filled = &col(&cols, "a").cells[0];
        let empty = &col(&cols, "b").cells[0];
        assert_eq!(*filled, Value::Str("x".into()));
        assert_eq!(*empty, Value::Null);
    }

    #[test]
    fn malformed_xml_errors() {
        let unclosed = XmlParser.parse("-", b"<unclosed>");
        assert!(matches!(unclosed, Err(AxError::Parse { .. })));

        let not_xml = XmlParser.parse("-", b"not xml at all");
        assert!(matches!(not_xml, Err(AxError::Parse { .. })));
    }

    #[test]
    fn sniff_recognizes_xml() {
        let parser = XmlParser;
        assert_eq!(parser.sniff(REPORT.as_bytes()), Some(STRONG));
        assert_eq!(parser.sniff(b"<vulns><vuln/></vulns>"), Some(TEXT));
        assert_eq!(parser.sniff(b" <root>x</root>"), Some(TEXT));
        assert_eq!(parser.sniff(b"<34>Oct syslog"), None);
        assert_eq!(parser.sniff(b"{\"a\":1}"), None);
        assert_eq!(parser.sniff(b"a,b,c\n1,2,3"), None);
    }

    #[test]
    fn claims_xml_extensions() {
        assert_eq!(XmlParser.extensions(), &["xml", "nessus"]);
    }

    #[test]
    fn resolves_by_extension_and_content() {
        let reg = crate::parser::ParserRegistry::default();

        let by_xml_ext = reg.resolve("scan.xml", b"zz").unwrap();
        assert_eq!(by_xml_ext.id(), "xml");

        let by_nessus_ext = reg.resolve("scan.nessus", b"zz").unwrap();
        assert_eq!(by_nessus_ext.id(), "xml");

        let by_content = reg.resolve("-", REPORT.as_bytes()).unwrap();
        assert_eq!(by_content.id(), "xml");
    }
}