Skip to main content

anomalyx_normalize/parsers/
xml.rs

1//! XML parser — scan reports (Nessus/OpenVAS), SOAP, and configs.
2//!
3//! XML is a tree, so we flatten it to rows by finding the **record element**:
4//! the most-repeated group of sibling elements (e.g. `<ReportItem>` under a
5//! Nessus `<ReportHost>`, or `<vuln>` in a report). Each such element becomes a
6//! row whose columns are its **attributes** plus its **leaf child elements**
7//! (text-only children), type-inferred. With no repetition the document is a
8//! single record (e.g. a config) and the root element becomes one row.
9//!
10//! Flattened this way it feeds `structural` and `dist` like any other corpus.
11//! Detected by an `<?xml` declaration (`STRONG`) or a leading element tag
12//! (`TEXT`); extensions `.xml` / `.nessus`.
13
14use crate::infer;
15use crate::parser::{Confidence, FormatParser, STRONG, TEXT};
16use crate::table::TableBuilder;
17use ax_core::{AxError, Column, Value};
18use roxmltree::{Document, Node};
19use std::collections::BTreeMap;
20
/// Stateless XML format parser; all behaviour lives in its `FormatParser` impl.
#[derive(Clone, Debug, Default)]
pub struct XmlParser;
23
24/// Selects the record elements: the largest group of same-named sibling elements
25/// (first such group in document order on a tie). Falls back to the root element
26/// as a single record when nothing repeats.
27fn find_records<'a, 'input>(doc: &'a Document<'input>) -> Vec<Node<'a, 'input>> {
28    let mut best: Vec<Node> = Vec::new();
29    for parent in doc.descendants().filter(Node::is_element) {
30        // Group this parent's direct element children by tag, preserving order.
31        let mut groups: Vec<(&str, Vec<Node>)> = Vec::new();
32        for child in parent.children().filter(Node::is_element) {
33            let name = child.tag_name().name();
34            match groups.iter_mut().find(|(n, _)| *n == name) {
35                Some(group) => group.1.push(child),
36                None => groups.push((name, vec![child])),
37            }
38        }
39        for (_, nodes) in groups {
40            if nodes.len() >= 2 && nodes.len() > best.len() {
41                best = nodes;
42            }
43        }
44    }
45    if best.is_empty() {
46        best.push(doc.root_element());
47    }
48    best
49}
50
51/// Is this element a leaf (no child elements of its own)?
52fn is_leaf(node: &Node) -> bool {
53    !node.children().any(|c| c.is_element())
54}
55
56impl XmlParser {
57    fn err(&self, msg: impl std::fmt::Display) -> AxError {
58        AxError::Parse {
59            format: self.id().to_string(),
60            message: msg.to_string(),
61        }
62    }
63}
64
65impl FormatParser for XmlParser {
66    fn id(&self) -> &'static str {
67        "xml"
68    }
69    fn extensions(&self) -> &'static [&'static str] {
70        &["xml", "nessus"]
71    }
72    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
73        let text = std::str::from_utf8(bytes).ok()?;
74        let trimmed = text.trim_start();
75        if trimmed.starts_with("<?xml") {
76            return Some(STRONG);
77        }
78        // A leading element tag (`<` then a name char) is XML; `<` then a digit is
79        // a syslog priority, not XML.
80        let after = trimmed.strip_prefix('<')?;
81        after
82            .starts_with(|c: char| c.is_ascii_alphabetic() || c == '_')
83            .then_some(TEXT)
84    }
85    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
86        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
87        let doc = Document::parse(text).map_err(|e| self.err(e))?;
88        let mut builder = TableBuilder::new();
89        for record in find_records(&doc) {
90            let mut row: BTreeMap<String, Value> = BTreeMap::new();
91            for attr in record.attributes() {
92                row.insert(attr.name().to_string(), infer::infer_scalar(attr.value()));
93            }
94            for child in record.children().filter(Node::is_element) {
95                if is_leaf(&child) {
96                    let text = child.text().unwrap_or("").trim();
97                    let cell = if text.is_empty() {
98                        Value::Null
99                    } else {
100                        infer::infer_scalar(text)
101                    };
102                    row.insert(child.tag_name().name().to_string(), cell);
103                }
104            }
105            builder.push_row(row);
106        }
107        Ok(builder.finish())
108    }
109}
110
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// Fixture: three `<vuln>` records, each with two attributes and two leaf children.
    const REPORT: &str = r#"<?xml version="1.0"?>
<vulns>
  <vuln id="1" severity="high"><name>SQLi</name><port>443</port></vuln>
  <vuln id="2" severity="low"><name>XSS</name><port>80</port></vuln>
  <vuln id="3" severity="high"><name>RCE</name><port>22</port></vuln>
</vulns>"#;

    /// Runs the parser over `s`, panicking if it does not parse.
    fn parse(s: &str) -> Vec<Column> {
        let parsed = XmlParser.parse("-", s.as_bytes());
        parsed.unwrap()
    }

    /// Looks a column up by name, panicking with the name when it is absent.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        match cols.iter().find(|c| c.name == name) {
            Some(found) => found,
            None => panic!("missing column {name}"),
        }
    }

    #[test]
    fn repeated_element_becomes_rows_with_attrs_and_leaf_children() {
        let cols = parse(REPORT);

        // One row per <vuln>; the `id` attribute is an Int column.
        let id = col(&cols, "id");
        assert_eq!(id.cells.len(), 3);
        assert_eq!(id.ty, ColType::Int);
        assert_eq!(id.cells, vec![Value::Int(1), Value::Int(2), Value::Int(3)]);

        // The `severity` attribute stays a string column.
        let severity = col(&cols, "severity");
        assert_eq!(
            severity.cells,
            vec![
                Value::Str("high".into()),
                Value::Str("low".into()),
                Value::Str("high".into())
            ]
        );

        // Leaf children: `name` is Str, `port` is Int.
        let name = col(&cols, "name");
        assert_eq!(name.cells[0], Value::Str("SQLi".into()));
        let port = col(&cols, "port");
        assert_eq!(port.ty, ColType::Int);
        assert_eq!(
            port.cells,
            vec![Value::Int(443), Value::Int(80), Value::Int(22)]
        );
    }

    #[test]
    fn picks_the_deepest_repeated_group_not_a_leaf() {
        // <name> and <port> appear three times document-wide, yet only <vuln>
        // repeats among SIBLINGS, so the rows are the vulns and not the names.
        let cols = parse(REPORT);
        assert_eq!(cols.len(), 4); // id, severity, name, port columns
        assert_eq!(col(&cols, "name").cells.len(), 3);
    }

    #[test]
    fn tie_breaks_to_the_first_repeated_group() {
        // <a> and <b> both repeat twice under the same parent; <a> comes first and
        // must win, so the table has `x` and no `y`.
        let cols = parse(r#"<root><a x="1"/><a x="2"/><b y="3"/><b y="4"/></root>"#);
        let x = col(&cols, "x");
        assert_eq!(x.cells, vec![Value::Int(1), Value::Int(2)]);
        assert!(
            !cols.iter().any(|c| c.name == "y"),
            "the later group must lose"
        );
    }

    #[test]
    fn non_leaf_children_are_not_flattened() {
        // <meta> has a child element of its own, so it is not a leaf and yields no
        // column; the leaf <tag> does.
        let cols = parse(
            "<items><item id=\"1\"><meta><sub>d</sub></meta><tag>a</tag></item>\
             <item id=\"2\"><meta><sub>d</sub></meta><tag>b</tag></item></items>",
        );
        let tag = col(&cols, "tag");
        assert_eq!(
            tag.cells,
            vec![Value::Str("a".into()), Value::Str("b".into())]
        );
        assert!(
            !cols.iter().any(|c| c.name == "meta"),
            "non-leaf child skipped"
        );
    }

    #[test]
    fn no_repetition_treats_root_as_one_record() {
        let cols = parse("<config><host>web01</host><port>8080</port></config>");
        let host = col(&cols, "host");
        let port = col(&cols, "port");
        assert_eq!(host.cells, vec![Value::Str("web01".into())]);
        assert_eq!(port.cells, vec![Value::Int(8080)]);
    }

    #[test]
    fn empty_leaf_is_null() {
        let cols = parse("<r><a>x</a><b></b></r>");
        let filled = col(&cols, "a");
        let empty = col(&cols, "b");
        assert_eq!(filled.cells[0], Value::Str("x".into()));
        assert_eq!(empty.cells[0], Value::Null);
    }

    #[test]
    fn malformed_xml_errors() {
        let inputs: [&[u8]; 2] = [b"<unclosed>", b"not xml at all"];
        for bad in inputs {
            let outcome = XmlParser.parse("-", bad);
            assert!(matches!(outcome, Err(AxError::Parse { .. })));
        }
    }

    #[test]
    fn sniff_recognizes_xml() {
        let cases = [
            (REPORT.as_bytes(), Some(STRONG)),              // <?xml
            (&b"<vulns><vuln/></vulns>"[..], Some(TEXT)),   // bare element
            (&b"  <root>x</root>"[..], Some(TEXT)),         // leading whitespace
            (&b"<34>Oct syslog"[..], None),                 // syslog priority, not XML
            (&b"{\"a\":1}"[..], None),
            (&b"a,b,c\n1,2,3"[..], None),
        ];
        for (input, expected) in cases {
            assert_eq!(XmlParser.sniff(input), expected);
        }
    }

    #[test]
    fn claims_xml_extensions() {
        let claimed = XmlParser.extensions();
        assert_eq!(claimed, &["xml", "nessus"]);
    }

    #[test]
    fn resolves_by_extension_and_content() {
        let reg = crate::parser::ParserRegistry::default();
        // First two resolve on extension alone; the last on sniffed content.
        let cases = [
            ("scan.xml", &b"zz"[..]),
            ("scan.nessus", &b"zz"[..]),
            ("-", REPORT.as_bytes()),
        ];
        for (source, bytes) in cases {
            assert_eq!(reg.resolve(source, bytes).unwrap().id(), "xml");
        }
    }
}