Skip to main content

zerodds_xml/
parser.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 ZeroDDS Contributors
3//! Generic well-formed XML loader for DDS-XML 1.0 §7.1.
4//!
5//! Cluster-F foundation: provides a roxmltree-based parser that
6//! adheres to the well-formedness rules from §7.1.1 (UTF-8,
7//! whitespace-tolerant, comment-stripping, namespace-aware) and stores the
8//! result in a generic [`DdsXmlDocument`] container.
9//!
10//! Building-block-specific decoders (QoS library, types, domains,
11//! participants, applications, samples) build on this container
12//! (Cluster G/H/I/J in `docs/spec-coverage/zerodds-xml-1.0.open.md`).
13
14use crate::errors::XmlError;
15use alloc::collections::BTreeMap;
16use alloc::format;
17use alloc::string::{String, ToString};
18use alloc::vec::Vec;
19
/// Namespace URI that DDS-XML 1.0 building-block top-level elements live in.
///
/// Spec ref: §7.3.x ("targetNamespace = http://www.omg.org/spec/DDS-XML").
pub const DDS_XML_NS: &str = "http://www.omg.org/spec/DDS-XML";

/// DoS cap: maximum number of element children a single node may have.
pub const MAX_LIST_ELEMENTS: usize = 1_024;

/// DoS cap: maximum number of element nodes in one document.
pub const MAX_TOTAL_ELEMENTS: usize = 64 * 1_024;

/// DoS cap: maximum nesting depth of the element tree.
///
/// Keeps the recursive `build_element` call from overflowing the stack
/// when it is fed adversarial, deeply nested XML (TS-1 finding 3,
/// `docs/test-harness/plan.md`). Realistic DDS-XML profiles nest
/// 4-8 levels; even complex `<application>`/`<participant>`
/// structures stay below 32.
pub const MAX_TREE_DEPTH: usize = 64;
39
40/// Generic in-memory container for a DDS-XML document per §7.1.
41///
42/// An element is mapped as an [`XmlElement`] (tag, attributes, children, text).
43/// Building-block decoders navigate over this tree and
44/// produce typed structures (e.g. QoS profiles in Cluster G).
45#[derive(Debug, Clone, PartialEq, Eq)]
46pub struct DdsXmlDocument {
47    /// Root element of the document (e.g. `<dds>`, `<qos_library>`,
48    /// `<domain_participant_library>`).
49    pub root: XmlElement,
50}
51
52impl DdsXmlDocument {
53    /// Returns the local tag name of the root (without the namespace prefix).
54    #[must_use]
55    pub fn root_name(&self) -> &str {
56        &self.root.name
57    }
58}
59
/// One XML element with its attributes, per §7.1.4 / §7.1.5.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct XmlElement {
    /// Local tag name (the namespace prefix is not part of it).
    pub name: String,
    /// Namespace URI of the element per §7.1.3, if it has one.
    pub namespace: Option<String>,
    /// Attributes keyed by name. A `BTreeMap` is used so that iteration
    /// order is deterministic (sorted by attribute name).
    pub attributes: BTreeMap<String, String>,
    /// Child elements, in document order.
    pub children: Vec<XmlElement>,
    /// Direct text content that precedes the first child element.
    /// Empty when the element carries no such text.
    pub text: String,
}

impl XmlElement {
    /// Looks up the first child element whose local name equals `name`.
    ///
    /// Returns `None` when no child carries that name.
    #[must_use]
    pub fn child(&self, name: &str) -> Option<&XmlElement> {
        self.children
            .iter()
            .find(|candidate| candidate.name.as_str() == name)
    }

    /// Iterates, in document order, over every child element whose local
    /// name equals `name`.
    pub fn children_named<'a>(
        &'a self,
        name: &'a str,
    ) -> impl Iterator<Item = &'a XmlElement> + 'a {
        self.children
            .iter()
            .filter(move |candidate| candidate.name.as_str() == name)
    }

    /// Looks up an attribute value by name.
    ///
    /// Returns `None` when the element has no attribute called `name`.
    #[must_use]
    pub fn attribute(&self, name: &str) -> Option<&str> {
        let value = self.attributes.get(name)?;
        Some(value.as_str())
    }

    /// Iterates over the `<element>` children, as used by the IDL sequence
    /// mapping (Spec §7.2.4.1) and the IDL array mapping (§7.2.5), which
    /// shares the same element tag.
    ///
    /// Spec OMG DDS-XML 1.0 §7.2.4.1: "The complexType contains zero
    /// or more elements named element. Nested inside each element is
    /// the XSD schema obtained from mapping the IDL type of the
    /// element itself."
    ///
    /// Spec §7.2.5: "The XML representation of IDL arrays is the same
    /// as it would be for IDL sequences of the same element type." —
    /// so arrays are read through the very same `<element>` tag.
    ///
    /// The IDL-PSM mapping (`qos_parser`, `sample`) relies on this to walk
    /// generic sequences and arrays.
    pub fn sequence_elements(&self) -> impl Iterator<Item = &XmlElement> + '_ {
        self.children_named("element")
    }
}
118
119/// Parses a DDS-XML 1.0 document per §7.1.
120///
121/// Well-formedness guarantees:
122/// * XML declaration optional (Spec §7.1.1).
123/// * Whitespace-tolerant (roxmltree normalizes text).
124/// * Comments are stripped (`<!-- ... -->` are not delivered by roxmltree
125///   as element nodes).
126/// * Namespace-aware: the top-level namespace is carried in the root
127///   (Spec §7.3.x: `targetNamespace = http://www.omg.org/spec/DDS-XML`).
128///
129/// DoS caps per Cluster F: at most [`MAX_LIST_ELEMENTS`] children per
130/// node, at most [`MAX_TOTAL_ELEMENTS`] nodes total.
131///
132/// # Errors
133/// * [`XmlError::InvalidXml`] — XML not well-formed.
134/// * [`XmlError::LimitExceeded`] — a DoS cap was hit.
135pub fn parse_xml_tree(xml: &str) -> Result<DdsXmlDocument, XmlError> {
136    // Pre-validation: deep nesting would cause a stack overflow in the
137    // recursive `roxmltree` parser — we reject
138    // inputs over `MAX_TREE_DEPTH` *before* the parser call.
139    // TS-1 finding 3.
140    precheck_depth(xml)?;
141    let opts = roxmltree::ParsingOptions {
142        allow_dtd: false,
143        ..roxmltree::ParsingOptions::default()
144    };
145    let doc = roxmltree::Document::parse_with_options(xml, opts)
146        .map_err(|e| XmlError::InvalidXml(e.to_string()))?;
147    let mut counter: usize = 0;
148    let root = build_element(doc.root_element(), &mut counter, 0)?;
149    Ok(DdsXmlDocument { root })
150}
151
152/// Byte-level pre-check of the tag nesting depth.
153///
154/// Walks the bytes and counts:
155/// * `<` without `/`, `!`, `?` directly after it: depth + 1
156/// * `</`: depth - 1
157/// * `<!` or `<?`: comment / PI / DTD — skipped
158/// * `/>`: self-closing — depth + 1 - 1 (net 0)
159///
160/// Heuristic, but an **upper bound** on the real tag depth —
161/// if this bound is already over `MAX_TREE_DEPTH`, the
162/// downstream recursive parser is unsafe.
163fn precheck_depth(xml: &str) -> Result<(), XmlError> {
164    let bytes = xml.as_bytes();
165    let mut depth: i64 = 0;
166    let mut max_seen: i64 = 0;
167    let mut i = 0;
168    while i < bytes.len() {
169        if bytes[i] != b'<' {
170            i += 1;
171            continue;
172        }
173        // Look-ahead after the first character.
174        let next = bytes.get(i + 1).copied();
175        match next {
176            Some(b'/') => {
177                depth = depth.saturating_sub(1);
178                i += 2;
179            }
180            Some(b'!') | Some(b'?') => {
181                // Skip to the next `>` (comment or PI).
182                i += 2;
183                while i < bytes.len() && bytes[i] != b'>' {
184                    i += 1;
185                }
186            }
187            _ => {
188                // Opening tag — check if self-closing: scan to `>`
189                // and see whether the byte before it is `/`.
190                let start = i;
191                i += 1;
192                while i < bytes.len() && bytes[i] != b'>' {
193                    i += 1;
194                }
195                let self_closing = i > start && bytes.get(i - 1) == Some(&b'/');
196                if !self_closing {
197                    depth += 1;
198                    if depth > max_seen {
199                        max_seen = depth;
200                    }
201                    if depth > MAX_TREE_DEPTH as i64 {
202                        return Err(XmlError::LimitExceeded(format!(
203                            "tag nesting exceeds {MAX_TREE_DEPTH} — refusing to parse to \
204                             protect against stack overflow"
205                        )));
206                    }
207                }
208            }
209        }
210        // Skip `>` (at the end of the match arm, i is on `>`).
211        if i < bytes.len() && bytes[i] == b'>' {
212            i += 1;
213        }
214    }
215    Ok(())
216}
217
218/// Recursive tree build from a roxmltree node.
219///
220/// `depth` counts the current nesting depth (root = 0) and
221/// protects against stack overflow via [`MAX_TREE_DEPTH`].
222///
223/// zerodds-lint: recursion-depth = xml-tree-depth (capped by `MAX_TREE_DEPTH`;
224/// additionally secured against wide/tall DoS by `MAX_TOTAL_ELEMENTS` and
225/// `MAX_LIST_ELEMENTS`).
226fn build_element(
227    node: roxmltree::Node<'_, '_>,
228    counter: &mut usize,
229    depth: usize,
230) -> Result<XmlElement, XmlError> {
231    if depth > MAX_TREE_DEPTH {
232        return Err(XmlError::LimitExceeded(format!(
233            "tree depth exceeds {MAX_TREE_DEPTH} — refusing to build to protect against \
234             stack overflow"
235        )));
236    }
237    *counter += 1;
238    if *counter > MAX_TOTAL_ELEMENTS {
239        return Err(XmlError::LimitExceeded(format!(
240            "document exceeds {MAX_TOTAL_ELEMENTS} elements"
241        )));
242    }
243
244    let tag = node.tag_name();
245    let mut element = XmlElement {
246        name: tag.name().to_string(),
247        namespace: tag.namespace().map(ToString::to_string),
248        attributes: BTreeMap::new(),
249        children: Vec::new(),
250        text: String::new(),
251    };
252
253    // §7.1.5 Tab.7.2 — take over the attributes.
254    for attr in node.attributes() {
255        element
256            .attributes
257            .insert(attr.name().to_string(), attr.value().to_string());
258    }
259
260    // §7.1.4 Tab.7.1 — first text content (before the first element child).
261    // roxmltree delivers text as its own nodes between elements.
262    if let Some(text) = node.text() {
263        let trimmed = text.trim();
264        if !trimmed.is_empty() {
265            element.text = trimmed.to_string();
266        }
267    }
268
269    // Child element nodes (comments + whitespace text are
270    // filtered out automatically).
271    let mut child_count: usize = 0;
272    for child_node in node.children().filter(roxmltree::Node::is_element) {
273        child_count += 1;
274        if child_count > MAX_LIST_ELEMENTS {
275            return Err(XmlError::LimitExceeded(format!(
276                "<{}> has more than {MAX_LIST_ELEMENTS} children",
277                element.name
278            )));
279        }
280        element
281            .children
282            .push(build_element(child_node, counter, depth + 1)?);
283    }
284
285    Ok(element)
286}
287
// Unit tests for the generic XML loader: well-formedness handling, the
// `XmlElement` lookup helpers, and every DoS cap (`MAX_LIST_ELEMENTS`,
// `MAX_TREE_DEPTH`, `MAX_TOTAL_ELEMENTS`).
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    #[test]
    fn parse_minimal_document() {
        let xml = r#"<root/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root_name(), "root");
        assert!(doc.root.children.is_empty());
    }

    #[test]
    fn parse_with_xml_declaration() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root_name(), "root");
    }

    #[test]
    fn parse_namespace_aware() {
        let xml = r#"<dds xmlns="http://www.omg.org/spec/DDS-XML"/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.namespace.as_deref(), Some(DDS_XML_NS));
    }

    #[test]
    fn comments_stripped() {
        let xml = r#"<root>
            <!-- this is a comment -->
            <child>value</child>
            <!-- another -->
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.children.len(), 1);
        assert_eq!(doc.root.children[0].name, "child");
        assert_eq!(doc.root.children[0].text, "value");
    }

    #[test]
    fn whitespace_tolerant() {
        let xml = r#"
            <root>
                <child>  hello  </child>
            </root>
        "#;
        let doc = parse_xml_tree(xml).expect("parse");
        // text is trimmed
        assert_eq!(doc.root.children[0].text, "hello");
    }

    #[test]
    fn attributes_preserved() {
        let xml = r#"<profile name="P1" base_name="P0"/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.attribute("name"), Some("P1"));
        assert_eq!(doc.root.attribute("base_name"), Some("P0"));
        assert_eq!(doc.root.attribute("missing"), None);
    }

    #[test]
    fn invalid_xml_rejected() {
        let xml = "<root><unclosed></root>";
        let err = parse_xml_tree(xml).expect_err("invalid");
        assert!(matches!(err, XmlError::InvalidXml(_)));
    }

    #[test]
    fn dtd_rejected() {
        // §7.1.1 — XML 1.0 Fifth Edition; DTDs are allowed, but we
        // forbid them for security reasons (XXE avoidance).
        let xml = r#"<?xml version="1.0"?>
<!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
<root>&xxe;</root>"#;
        let err = parse_xml_tree(xml).expect_err("dtd");
        assert!(matches!(err, XmlError::InvalidXml(_)));
    }

    #[test]
    fn child_helper() {
        let xml = r#"<root><a/><b/><a/></root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.child("a").map(|c| c.name.as_str()), Some("a"));
        assert_eq!(doc.root.children_named("a").count(), 2);
        assert_eq!(doc.root.children_named("missing").count(), 0);
    }

    #[test]
    fn list_dos_cap() {
        // Build XML with MAX_LIST_ELEMENTS+1 children.
        let mut xml = String::from("<root>");
        for _ in 0..(MAX_LIST_ELEMENTS + 1) {
            xml.push_str("<c/>");
        }
        xml.push_str("</root>");
        let err = parse_xml_tree(&xml).expect_err("dos");
        assert!(matches!(err, XmlError::LimitExceeded(_)));
    }

    #[test]
    fn depth_at_cap_is_accepted() {
        // Exactly MAX_TREE_DEPTH nested elements are still within the cap.
        let mut xml = String::new();
        for _ in 0..MAX_TREE_DEPTH {
            xml.push_str("<n>");
        }
        for _ in 0..MAX_TREE_DEPTH {
            xml.push_str("</n>");
        }
        let doc = parse_xml_tree(&xml).expect("parse");
        assert_eq!(doc.root_name(), "n");
    }

    #[test]
    fn depth_dos_cap() {
        // One level more than MAX_TREE_DEPTH must be refused.
        let mut xml = String::new();
        for _ in 0..(MAX_TREE_DEPTH + 1) {
            xml.push_str("<n>");
        }
        for _ in 0..(MAX_TREE_DEPTH + 1) {
            xml.push_str("</n>");
        }
        let err = parse_xml_tree(&xml).expect_err("dos");
        assert!(matches!(err, XmlError::LimitExceeded(_)));
    }

    #[test]
    fn total_elements_dos_cap() {
        // Every node stays within MAX_LIST_ELEMENTS children and the tree
        // is only three levels deep, but the document as a whole holds
        // more than MAX_TOTAL_ELEMENTS elements.
        let groups = MAX_TOTAL_ELEMENTS / MAX_LIST_ELEMENTS + 1;
        assert!(groups <= MAX_LIST_ELEMENTS);
        let mut xml = String::from("<root>");
        for _ in 0..groups {
            xml.push_str("<g>");
            for _ in 0..MAX_LIST_ELEMENTS {
                xml.push_str("<c/>");
            }
            xml.push_str("</g>");
        }
        xml.push_str("</root>");
        let err = parse_xml_tree(&xml).expect_err("dos");
        assert!(matches!(err, XmlError::LimitExceeded(_)));
    }

    #[test]
    fn nested_structure() {
        let xml = r#"<root>
            <profile name="P1">
                <history>
                    <kind>KEEP_LAST_HISTORY_QOS</kind>
                    <depth>10</depth>
                </history>
            </profile>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let profile = doc.root.child("profile").expect("profile");
        assert_eq!(profile.attribute("name"), Some("P1"));
        let history = profile.child("history").expect("history");
        assert_eq!(
            history.child("kind").map(|c| c.text.as_str()),
            Some("KEEP_LAST_HISTORY_QOS")
        );
        assert_eq!(history.child("depth").map(|c| c.text.as_str()), Some("10"));
    }

    // ---- §7.2.4.1 + §7.2.5 sequence_elements ------------------------

    #[test]
    fn sequence_elements_iterates_element_tag_children() {
        // Spec §7.2.4.1: an IDL sequence is represented as a <complexType> with
        // 0..n <element> children.
        let xml = r#"<root>
            <ports>
                <element>7400</element>
                <element>7401</element>
                <element>7402</element>
            </ports>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let ports = doc.root.child("ports").expect("ports");
        let texts: Vec<&str> = ports.sequence_elements().map(|e| e.text.as_str()).collect();
        assert_eq!(texts, vec!["7400", "7401", "7402"]);
    }

    #[test]
    fn sequence_elements_skips_non_element_tagged_children() {
        // Other tags such as <kind>, <depth> are ignored — only the
        // <element> tag counts as a sequence entry.
        let xml = r#"<root>
            <history>
                <kind>KEEP_LAST_HISTORY_QOS</kind>
                <depth>10</depth>
                <element>not-a-real-history-field</element>
            </history>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let hist = doc.root.child("history").expect("hist");
        let texts: Vec<&str> = hist.sequence_elements().map(|e| e.text.as_str()).collect();
        assert_eq!(texts, vec!["not-a-real-history-field"]);
    }

    #[test]
    fn sequence_elements_empty_for_zero_children() {
        let xml = r#"<root><list></list></root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let list = doc.root.child("list").expect("list");
        assert_eq!(list.sequence_elements().count(), 0);
    }

    #[test]
    fn array_uses_same_element_tag_as_sequence() {
        // Spec §7.2.5: IDL array mapping is identical to IDL sequence
        // mapping. So the same `sequence_elements` iterator must
        // also work for fixed-size IDL arrays.
        let xml = r#"<root>
            <coords_3d>
                <element>1.0</element>
                <element>2.0</element>
                <element>3.0</element>
            </coords_3d>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let arr = doc.root.child("coords_3d").expect("array");
        let texts: Vec<&str> = arr.sequence_elements().map(|e| e.text.as_str()).collect();
        assert_eq!(texts.len(), 3, "IDL-Array[3] = 3 <element>-Children");
        assert_eq!(texts, vec!["1.0", "2.0", "3.0"]);
    }
}