Skip to main content

zerodds_xml/
parser.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 ZeroDDS Contributors
//! Generic well-formed XML loader for DDS-XML 1.0 §7.1.
//!
//! Cluster-F foundation: provides a roxmltree-based parser that follows
//! the well-formedness rules of §7.1.1 (UTF-8, whitespace-tolerant,
//! comment stripping, namespace-aware) and stores the result in a
//! generic [`DdsXmlDocument`] container.
//!
//! Building-block-specific decoders (QoS library, types, domains,
//! participants, applications, samples) build on top of this container
//! (clusters G/H/I/J in `docs/spec-coverage/zerodds-xml-1.0.open.md`).
13
14use crate::errors::XmlError;
15use alloc::collections::BTreeMap;
16use alloc::format;
17use alloc::string::{String, ToString};
18use alloc::vec::Vec;
19
/// DDS-XML 1.0 specification namespace used by the building-block
/// top-level elements.
///
/// Spec ref: §7.3.x ("targetNamespace = http://www.omg.org/spec/DDS-XML").
pub const DDS_XML_NS: &str = "http://www.omg.org/spec/DDS-XML";

/// DoS cap for list-like content: maximum number of child elements a
/// single node may have.
pub const MAX_LIST_ELEMENTS: usize = 1024;

/// DoS cap for the total number of element nodes in one document.
pub const MAX_TOTAL_ELEMENTS: usize = 64 * 1024;

/// DoS cap for the recursive tree depth.
///
/// Protects the recursive `build_element` call against stack overflow
/// on adversarial, deeply nested XML input (TS-1 finding 3,
/// `docs/test-harness/plan.md`). Realistic DDS-XML profiles are 4-8
/// levels deep; even complex `<application>`/`<participant>` nesting
/// stays below 32.
pub const MAX_TREE_DEPTH: usize = 64;
39
40/// Generischer In-Memory-Container fuer ein DDS-XML-Dokument nach §7.1.
41///
42/// Ein Element wird als [`XmlElement`] (Tag, Attribute, Children, Text)
43/// abgebildet. Building-Block-Decoder navigieren ueber diesen Baum und
44/// erzeugen typed Strukturen (z.B. QoS-Profile in Cluster G).
45#[derive(Debug, Clone, PartialEq, Eq)]
46pub struct DdsXmlDocument {
47    /// Wurzel-Element des Dokuments (z.B. `<dds>`, `<qos_library>`,
48    /// `<domain_participant_library>`).
49    pub root: XmlElement,
50}
51
52impl DdsXmlDocument {
53    /// Liefert den lokalen Tag-Namen der Wurzel (ohne Namespace-Prefix).
54    #[must_use]
55    pub fn root_name(&self) -> &str {
56        &self.root.name
57    }
58}
59
/// A single XML element as per §7.1.4 / §7.1.5 (element + attributes).
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct XmlElement {
    /// Local tag name (without namespace prefix).
    pub name: String,
    /// Optional namespace URI as per §7.1.3.
    pub namespace: Option<String>,
    /// Attributes (name -> value). A `BTreeMap` keeps them sorted by
    /// name so that iteration order is deterministic.
    pub attributes: BTreeMap<String, String>,
    /// Child elements in document order.
    pub children: Vec<XmlElement>,
    /// Direct text content: the contiguous text run immediately before
    /// the first child element. Empty when there is no text.
    pub text: String,
}
76
77impl XmlElement {
78    /// Liefert den ersten Kind mit dem gegebenen lokalen Namen, falls
79    /// vorhanden.
80    #[must_use]
81    pub fn child(&self, name: &str) -> Option<&XmlElement> {
82        self.children.iter().find(|c| c.name == name)
83    }
84
85    /// Liefert alle Kinder mit dem gegebenen lokalen Namen.
86    pub fn children_named<'a>(
87        &'a self,
88        name: &'a str,
89    ) -> impl Iterator<Item = &'a XmlElement> + 'a {
90        self.children.iter().filter(move |c| c.name == name)
91    }
92
93    /// Liefert den Wert eines Attributs, falls vorhanden.
94    #[must_use]
95    pub fn attribute(&self, name: &str) -> Option<&str> {
96        self.attributes.get(name).map(String::as_str)
97    }
98
99    /// Iteriert ueber `<element>`-Kinder gemaess Spec §7.2.4.1
100    /// (IDL-Sequence-Mapping) und §7.2.5 (IDL-Array-Mapping = Sequence-
101    /// Mapping mit gleichem Element-Tag).
102    ///
103    /// Spec OMG DDS-XML 1.0 §7.2.4.1: "The complexType contains zero
104    /// or more elements named element. Nested inside each element is
105    /// the XSD schema obtained from mapping the IDL type of the
106    /// element itself."
107    ///
108    /// Spec §7.2.5: "The XML representation of IDL arrays is the same
109    /// as it would be for IDL sequences of the same element type." —
110    /// d.h. `<element>`-Tag wird auch fuer Arrays genutzt.
111    ///
112    /// Wird vom IDL-PSM-Mapping (`qos_parser`, `sample`) genutzt, um
113    /// generische Sequenzen/Arrays zu iterieren.
114    pub fn sequence_elements(&self) -> impl Iterator<Item = &XmlElement> + '_ {
115        self.children_named("element")
116    }
117}
118
119/// Parses a DDS-XML 1.0 document gemaess §7.1.
120///
121/// Wohlgeformtheits-Garantien:
122/// * XML-Declaration optional (Spec §7.1.1).
123/// * Whitespace-tolerant (roxmltree normalisiert Text).
124/// * Kommentare werden gestrippt (`<!-- ... -->` werden von roxmltree
125///   nicht als Element-Knoten geliefert).
126/// * Namespace-Aware: Top-Level-Namespace wird im Root mitgefuehrt
127///   (Spec §7.3.x: `targetNamespace = http://www.omg.org/spec/DDS-XML`).
128///
129/// DoS-Caps gemaess Cluster-F: maximal [`MAX_LIST_ELEMENTS`] Children pro
130/// Knoten, maximal [`MAX_TOTAL_ELEMENTS`] Knoten insgesamt.
131///
132/// # Errors
133/// * [`XmlError::InvalidXml`] — XML nicht wohlgeformt.
134/// * [`XmlError::LimitExceeded`] — DoS-Cap getroffen.
135pub fn parse_xml_tree(xml: &str) -> Result<DdsXmlDocument, XmlError> {
136    // Pre-Validation: tiefe Verschachtelung wuerde im rekursiven
137    // `roxmltree`-Parser zu Stack-Overflow fuehren — wir lehnen
138    // Inputs ueber `MAX_TREE_DEPTH` *vor* dem Parser-Call ab.
139    // TS-1-Finding 3.
140    precheck_depth(xml)?;
141    let opts = roxmltree::ParsingOptions {
142        allow_dtd: false,
143        ..roxmltree::ParsingOptions::default()
144    };
145    let doc = roxmltree::Document::parse_with_options(xml, opts)
146        .map_err(|e| XmlError::InvalidXml(e.to_string()))?;
147    let mut counter: usize = 0;
148    let root = build_element(doc.root_element(), &mut counter, 0)?;
149    Ok(DdsXmlDocument { root })
150}
151
152/// Byte-level Vor-Pruefung der Tag-Verschachtelungstiefe.
153///
154/// Geht Bytes durch und zaehlt:
155/// * `<` ohne `/`, `!`, `?` direkt danach: depth + 1
156/// * `</`: depth - 1
157/// * `<!` oder `<?`: Kommentar / PI / DTD — uebersprungen
158/// * `/>`: self-closing — depth + 1 - 1 (netto 0)
159///
160/// Heuristisch, aber ein **Upper Bound** auf die echte Tag-Tiefe —
161/// wenn dieser Bound bereits ueber `MAX_TREE_DEPTH` liegt, ist der
162/// nachgelagerte rekursive Parser unsicher.
163fn precheck_depth(xml: &str) -> Result<(), XmlError> {
164    let bytes = xml.as_bytes();
165    let mut depth: i64 = 0;
166    let mut max_seen: i64 = 0;
167    let mut i = 0;
168    while i < bytes.len() {
169        if bytes[i] != b'<' {
170            i += 1;
171            continue;
172        }
173        // Look-ahead nach dem ersten Zeichen.
174        let next = bytes.get(i + 1).copied();
175        match next {
176            Some(b'/') => {
177                depth = depth.saturating_sub(1);
178                i += 2;
179            }
180            Some(b'!') | Some(b'?') => {
181                // Skip bis zum naechsten `>` (Kommentar oder PI).
182                i += 2;
183                while i < bytes.len() && bytes[i] != b'>' {
184                    i += 1;
185                }
186            }
187            _ => {
188                // Opening-Tag — pruefe ob self-closing: skanne bis `>`
189                // und schau, ob das Byte davor `/` ist.
190                let start = i;
191                i += 1;
192                while i < bytes.len() && bytes[i] != b'>' {
193                    i += 1;
194                }
195                let self_closing = i > start && bytes.get(i - 1) == Some(&b'/');
196                if !self_closing {
197                    depth += 1;
198                    if depth > max_seen {
199                        max_seen = depth;
200                    }
201                    if depth > MAX_TREE_DEPTH as i64 {
202                        return Err(XmlError::LimitExceeded(format!(
203                            "tag nesting exceeds {MAX_TREE_DEPTH} — refusing to parse to \
204                             protect against stack overflow"
205                        )));
206                    }
207                }
208            }
209        }
210        // `>` ueberspringen (am Ende des match arms steht i auf `>`).
211        if i < bytes.len() && bytes[i] == b'>' {
212            i += 1;
213        }
214    }
215    Ok(())
216}
217
218/// Rekursiver Baum-Aufbau aus einem roxmltree-Knoten.
219///
220/// `depth` zaehlt die aktuelle Verschachtelungstiefe (Wurzel = 0) und
221/// schuetzt vor Stack-Overflow ueber [`MAX_TREE_DEPTH`].
222///
223/// zerodds-lint: recursion-depth = xml-tree-depth (durch `MAX_TREE_DEPTH`
224/// gecappt; durch `MAX_TOTAL_ELEMENTS` und `MAX_LIST_ELEMENTS`
225/// zusaetzlich gegen Wide-/Tall-DoS abgesichert).
226fn build_element(
227    node: roxmltree::Node<'_, '_>,
228    counter: &mut usize,
229    depth: usize,
230) -> Result<XmlElement, XmlError> {
231    if depth > MAX_TREE_DEPTH {
232        return Err(XmlError::LimitExceeded(format!(
233            "tree depth exceeds {MAX_TREE_DEPTH} — refusing to build to protect against \
234             stack overflow"
235        )));
236    }
237    *counter += 1;
238    if *counter > MAX_TOTAL_ELEMENTS {
239        return Err(XmlError::LimitExceeded(format!(
240            "document exceeds {MAX_TOTAL_ELEMENTS} elements"
241        )));
242    }
243
244    let tag = node.tag_name();
245    let mut element = XmlElement {
246        name: tag.name().to_string(),
247        namespace: tag.namespace().map(ToString::to_string),
248        attributes: BTreeMap::new(),
249        children: Vec::new(),
250        text: String::new(),
251    };
252
253    // §7.1.5 Tab.7.2 — Attribute uebernehmen.
254    for attr in node.attributes() {
255        element
256            .attributes
257            .insert(attr.name().to_string(), attr.value().to_string());
258    }
259
260    // §7.1.4 Tab.7.1 — Erster Text-Inhalt (vor dem ersten Element-Kind).
261    // roxmltree liefert Text als eigene Knoten zwischen Elementen.
262    if let Some(text) = node.text() {
263        let trimmed = text.trim();
264        if !trimmed.is_empty() {
265            element.text = trimmed.to_string();
266        }
267    }
268
269    // Children-Element-Knoten (Comments + Whitespace-Text werden
270    // automatisch gefiltert).
271    let mut child_count: usize = 0;
272    for child_node in node.children().filter(roxmltree::Node::is_element) {
273        child_count += 1;
274        if child_count > MAX_LIST_ELEMENTS {
275            return Err(XmlError::LimitExceeded(format!(
276                "<{}> has more than {MAX_LIST_ELEMENTS} children",
277                element.name
278            )));
279        }
280        element
281            .children
282            .push(build_element(child_node, counter, depth + 1)?);
283    }
284
285    Ok(element)
286}
287
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    #[test]
    fn parse_minimal_document() {
        let xml = r#"<root/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root_name(), "root");
        assert!(doc.root.children.is_empty());
    }

    #[test]
    fn parse_with_xml_declaration() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root_name(), "root");
    }

    #[test]
    fn parse_namespace_aware() {
        let xml = r#"<dds xmlns="http://www.omg.org/spec/DDS-XML"/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.namespace.as_deref(), Some(DDS_XML_NS));
    }

    #[test]
    fn comments_stripped() {
        let xml = r#"<root>
            <!-- this is a comment -->
            <child>value</child>
            <!-- another -->
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.children.len(), 1);
        assert_eq!(doc.root.children[0].name, "child");
        assert_eq!(doc.root.children[0].text, "value");
    }

    #[test]
    fn whitespace_tolerant() {
        let xml = r#"
            <root>
                <child>  hello  </child>
            </root>
        "#;
        let doc = parse_xml_tree(xml).expect("parse");
        // The stored text is trimmed.
        assert_eq!(doc.root.children[0].text, "hello");
    }

    #[test]
    fn attributes_preserved() {
        let xml = r#"<profile name="P1" base_name="P0"/>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.attribute("name"), Some("P1"));
        assert_eq!(doc.root.attribute("base_name"), Some("P0"));
        assert_eq!(doc.root.attribute("missing"), None);
    }

    #[test]
    fn invalid_xml_rejected() {
        let xml = "<root><unclosed></root>";
        let err = parse_xml_tree(xml).expect_err("invalid");
        assert!(matches!(err, XmlError::InvalidXml(_)));
    }

    #[test]
    fn dtd_rejected() {
        // §7.1.1 — XML 1.0 Fifth Edition; DTDs are allowed by the spec,
        // but we forbid them for security reasons (XXE avoidance).
        let xml = r#"<?xml version="1.0"?>
<!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
<root>&xxe;</root>"#;
        let err = parse_xml_tree(xml).expect_err("dtd");
        assert!(matches!(err, XmlError::InvalidXml(_)));
    }

    #[test]
    fn child_helper() {
        let xml = r#"<root><a/><b/><a/></root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        assert_eq!(doc.root.child("a").map(|c| c.name.as_str()), Some("a"));
        assert_eq!(doc.root.children_named("a").count(), 2);
        assert_eq!(doc.root.children_named("missing").count(), 0);
    }

    #[test]
    fn list_dos_cap() {
        // Build XML with MAX_LIST_ELEMENTS+1 children.
        let mut xml = String::from("<root>");
        for _ in 0..(MAX_LIST_ELEMENTS + 1) {
            xml.push_str("<c/>");
        }
        xml.push_str("</root>");
        let err = parse_xml_tree(&xml).expect_err("dos");
        assert!(matches!(err, XmlError::LimitExceeded(_)));
    }

    #[test]
    fn total_elements_dos_cap() {
        // Every <g> stays within MAX_LIST_ELEMENTS, but all leaves
        // together with the groups and the root exceed
        // MAX_TOTAL_ELEMENTS.
        let groups = MAX_TOTAL_ELEMENTS / MAX_LIST_ELEMENTS;
        let mut xml = String::from("<root>");
        for _ in 0..groups {
            xml.push_str("<g>");
            for _ in 0..MAX_LIST_ELEMENTS {
                xml.push_str("<c/>");
            }
            xml.push_str("</g>");
        }
        xml.push_str("</root>");
        let err = parse_xml_tree(&xml).expect_err("dos");
        assert!(matches!(err, XmlError::LimitExceeded(_)));
    }

    #[test]
    fn depth_dos_cap() {
        // One nesting level more than MAX_TREE_DEPTH must be refused.
        let levels = MAX_TREE_DEPTH + 1;
        let mut xml = String::new();
        for _ in 0..levels {
            xml.push_str("<n>");
        }
        for _ in 0..levels {
            xml.push_str("</n>");
        }
        let err = parse_xml_tree(&xml).expect_err("dos");
        assert!(matches!(err, XmlError::LimitExceeded(_)));
    }

    #[test]
    fn depth_at_cap_accepted() {
        // Exactly MAX_TREE_DEPTH nested elements are still accepted.
        let mut xml = String::new();
        for _ in 0..MAX_TREE_DEPTH {
            xml.push_str("<n>");
        }
        for _ in 0..MAX_TREE_DEPTH {
            xml.push_str("</n>");
        }
        let doc = parse_xml_tree(&xml).expect("parse");
        let mut levels = 1;
        let mut current = &doc.root;
        while let Some(next) = current.child("n") {
            levels += 1;
            current = next;
        }
        assert_eq!(levels, MAX_TREE_DEPTH);
    }

    #[test]
    fn nested_structure() {
        let xml = r#"<root>
            <profile name="P1">
                <history>
                    <kind>KEEP_LAST_HISTORY_QOS</kind>
                    <depth>10</depth>
                </history>
            </profile>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let profile = doc.root.child("profile").expect("profile");
        assert_eq!(profile.attribute("name"), Some("P1"));
        let history = profile.child("history").expect("history");
        assert_eq!(
            history.child("kind").map(|c| c.text.as_str()),
            Some("KEEP_LAST_HISTORY_QOS")
        );
        assert_eq!(history.child("depth").map(|c| c.text.as_str()), Some("10"));
    }

    // ---- §7.2.4.1 + §7.2.5 sequence_elements ------------------------

    #[test]
    fn sequence_elements_iterates_element_tag_children() {
        // Spec §7.2.4.1: an IDL sequence is represented as a
        // <complexType> with 0..n <element> children.
        let xml = r#"<root>
            <ports>
                <element>7400</element>
                <element>7401</element>
                <element>7402</element>
            </ports>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let ports = doc.root.child("ports").expect("ports");
        let texts: Vec<&str> = ports.sequence_elements().map(|e| e.text.as_str()).collect();
        assert_eq!(texts, vec!["7400", "7401", "7402"]);
    }

    #[test]
    fn sequence_elements_skips_non_element_tagged_children() {
        // Other tags such as <kind> or <depth> are ignored — only the
        // <element> tag counts as a sequence entry.
        let xml = r#"<root>
            <history>
                <kind>KEEP_LAST_HISTORY_QOS</kind>
                <depth>10</depth>
                <element>not-a-real-history-field</element>
            </history>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let hist = doc.root.child("history").expect("hist");
        let texts: Vec<&str> = hist.sequence_elements().map(|e| e.text.as_str()).collect();
        assert_eq!(texts, vec!["not-a-real-history-field"]);
    }

    #[test]
    fn sequence_elements_empty_for_zero_children() {
        let xml = r#"<root><list></list></root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let list = doc.root.child("list").expect("list");
        assert_eq!(list.sequence_elements().count(), 0);
    }

    #[test]
    fn array_uses_same_element_tag_as_sequence() {
        // Spec §7.2.5: the IDL array mapping is identical to the IDL
        // sequence mapping, so the same `sequence_elements` iterator
        // must also work for fixed-size IDL arrays.
        let xml = r#"<root>
            <coords_3d>
                <element>1.0</element>
                <element>2.0</element>
                <element>3.0</element>
            </coords_3d>
        </root>"#;
        let doc = parse_xml_tree(xml).expect("parse");
        let arr = doc.root.child("coords_3d").expect("array");
        let texts: Vec<&str> = arr.sequence_elements().map(|e| e.text.as_str()).collect();
        assert_eq!(texts.len(), 3, "IDL-Array[3] = 3 <element>-Children");
        assert_eq!(texts, vec!["1.0", "2.0", "3.0"]);
    }
}