Skip to main content

xdoc/parser/
mod.rs

1//! Safe XML parsing into the core model.
2//!
3//! This module converts XML parser events into the `core::Document` tree. It
4//! rejects DTDs and unresolved general entities by default using the shared
5//! `security` module policy.
6
7use std::collections::{BTreeMap, BTreeSet};
8use std::io::Read;
9use std::str;
10
11use quick_xml::events::{BytesStart, Event};
12use quick_xml::reader::Reader;
13use quick_xml::XmlVersion;
14
15use crate::core::{
16    validate_namespace_binding, Attribute, Document, ErrorKind, NamespaceDeclaration, QName, Span,
17    XmlError, XmlResult, XML_NAMESPACE_URI,
18};
19use crate::security::{EntityPolicy, ParserSecurityConfig, SecurityLimits};
20
/// Parser configuration with safe defaults for the MVP.
///
/// Values are adjusted through the consuming `with_*` builder methods and read
/// back through the accessor methods.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParserConfig {
    /// When `false`, comment events are skipped instead of being stored in the tree.
    preserve_comments: bool,
    /// When `true`, CDATA sections are stored as CDATA nodes; otherwise as plain text nodes.
    preserve_cdata: bool,
    /// Shared security policy: entity handling plus the size/depth/node limits.
    security: ParserSecurityConfig,
}
28
29impl ParserConfig {
30    pub fn new() -> Self {
31        Self::default()
32    }
33
34    pub fn with_preserve_comments(mut self, preserve: bool) -> Self {
35        self.preserve_comments = preserve;
36        self
37    }
38
39    pub fn with_preserve_cdata(mut self, preserve: bool) -> Self {
40        self.preserve_cdata = preserve;
41        self
42    }
43
44    pub fn with_max_document_bytes(mut self, limit: usize) -> Self {
45        let limits = self
46            .security
47            .limits()
48            .clone()
49            .with_max_document_bytes(limit);
50        self.security = self.security.with_limits(limits);
51        self
52    }
53
54    pub fn with_max_text_bytes(mut self, limit: usize) -> Self {
55        let limits = self.security.limits().clone().with_max_text_bytes(limit);
56        self.security = self.security.with_limits(limits);
57        self
58    }
59
60    pub fn with_max_depth(mut self, limit: usize) -> Self {
61        let limits = self.security.limits().clone().with_max_depth(limit);
62        self.security = self.security.with_limits(limits);
63        self
64    }
65
66    pub fn with_max_nodes(mut self, limit: usize) -> Self {
67        let limits = self.security.limits().clone().with_max_nodes(limit);
68        self.security = self.security.with_limits(limits);
69        self
70    }
71
72    pub fn with_security(mut self, security: ParserSecurityConfig) -> Self {
73        self.security = security;
74        self
75    }
76
77    pub fn preserve_comments(&self) -> bool {
78        self.preserve_comments
79    }
80
81    pub fn preserve_cdata(&self) -> bool {
82        self.preserve_cdata
83    }
84
85    pub fn security(&self) -> &ParserSecurityConfig {
86        &self.security
87    }
88
89    fn limits(&self) -> &SecurityLimits {
90        self.security.limits()
91    }
92}
93
94impl Default for ParserConfig {
95    fn default() -> Self {
96        Self {
97            preserve_comments: true,
98            preserve_cdata: true,
99            security: ParserSecurityConfig::default(),
100        }
101    }
102}
103
104/// Parses XML from a string using safe default configuration.
105pub fn parse_str(xml: &str) -> XmlResult<Document> {
106    parse_str_with_config(xml, &ParserConfig::default())
107}
108
109/// Parses XML from a string using an explicit configuration.
110pub fn parse_str_with_config(xml: &str, config: &ParserConfig) -> XmlResult<Document> {
111    config.limits().check_document_size(xml.len())?;
112
113    let mut reader = Reader::from_str(xml);
114    reader.config_mut().trim_text(false);
115    reader.config_mut().expand_empty_elements = false;
116    reader.config_mut().check_end_names = true;
117
118    parse_events(xml, &mut reader, config)
119}
120
121/// Parses XML from a reader using safe default configuration.
122pub fn parse_reader(reader: impl Read) -> XmlResult<Document> {
123    parse_reader_with_config(reader, &ParserConfig::default())
124}
125
126/// Parses XML from a reader using an explicit configuration.
127pub fn parse_reader_with_config(
128    mut reader: impl Read,
129    config: &ParserConfig,
130) -> XmlResult<Document> {
131    let mut bytes = Vec::new();
132    let limit = config.limits().max_document_bytes() as u64 + 1;
133    reader
134        .by_ref()
135        .take(limit)
136        .read_to_end(&mut bytes)
137        .map_err(|error| XmlError::new(ErrorKind::Io, error.to_string()))?;
138
139    config.limits().check_document_size(bytes.len())?;
140
141    let xml = String::from_utf8(bytes).map_err(|error| {
142        XmlError::new(
143            ErrorKind::Parse,
144            format!("XML input must be valid UTF-8: {error}"),
145        )
146    })?;
147
148    parse_str_with_config(&xml, config)
149}
150
151fn parse_events(
152    xml: &str,
153    reader: &mut Reader<&[u8]>,
154    config: &ParserConfig,
155) -> XmlResult<Document> {
156    let mut state = ParserState::new(config);
157
158    loop {
159        let event = reader.read_event().map_err(|error| {
160            parse_error_with_position(xml, reader.error_position() as usize, error.to_string())
161        })?;
162
163        match event {
164            Event::Start(start) => state.start_element(start, reader, xml)?,
165            Event::Empty(start) => {
166                state.start_element(start, reader, xml)?;
167                state.end_element();
168            }
169            Event::End(_) => state.end_element(),
170            Event::Text(text) => {
171                let value = text.xml10_content().map_err(|error| {
172                    parse_error_with_position(
173                        xml,
174                        reader.error_position() as usize,
175                        error.to_string(),
176                    )
177                })?;
178                state.text(value.as_ref())?;
179            }
180            Event::CData(cdata) => {
181                let value = cdata.decode().map_err(|error| {
182                    parse_error_with_position(
183                        xml,
184                        reader.error_position() as usize,
185                        error.to_string(),
186                    )
187                })?;
188                state.cdata(value.as_ref())?;
189            }
190            Event::Comment(comment) => {
191                if config.preserve_comments {
192                    let value = comment.xml10_content().map_err(|error| {
193                        parse_error_with_position(
194                            xml,
195                            reader.error_position() as usize,
196                            error.to_string(),
197                        )
198                    })?;
199                    state.comment(value.as_ref())?;
200                }
201            }
202            Event::PI(pi) => {
203                let content = str::from_utf8(pi.content()).map_err(|error| {
204                    parse_error_with_position(
205                        xml,
206                        reader.error_position() as usize,
207                        error.to_string(),
208                    )
209                })?;
210                let target = str::from_utf8(pi.target()).map_err(|error| {
211                    parse_error_with_position(
212                        xml,
213                        reader.error_position() as usize,
214                        error.to_string(),
215                    )
216                })?;
217                state.processing_instruction(target, processing_instruction_data(content))?;
218            }
219            Event::Decl(_) => {}
220            Event::DocType(_) => {
221                config
222                    .security()
223                    .entity_policy()
224                    .reject_doctype()
225                    .map_err(|error| {
226                        error.with_span(span_for_byte(xml, reader.error_position() as usize))
227                    })?;
228            }
229            Event::GeneralRef(reference) => {
230                if let Some(ch) = reference.resolve_char_ref().map_err(|error| {
231                    parse_error_with_position(
232                        xml,
233                        reader.error_position() as usize,
234                        error.to_string(),
235                    )
236                })? {
237                    state.text(&ch.to_string())?;
238                } else {
239                    let name = reference.decode().map_err(|error| {
240                        parse_error_with_position(
241                            xml,
242                            reader.error_position() as usize,
243                            error.to_string(),
244                        )
245                    })?;
246                    let value = match predefined_entity(name.as_ref()) {
247                        Some(value) => value,
248                        None => {
249                            return Err(unresolved_entity_error(
250                                config.security().entity_policy(),
251                                name.as_ref(),
252                                span_for_byte(xml, reader.error_position() as usize),
253                            ));
254                        }
255                    };
256                    state.text(value)?;
257                }
258            }
259            Event::Eof => break,
260        }
261    }
262
263    state.finish()
264}
265
/// Mutable state threaded through the event loop while a document is built.
struct ParserState<'a> {
    /// Configuration supplying the limits and the CDATA preference.
    config: &'a ParserConfig,
    /// The document under construction; returned by `finish`.
    document: Document,
    /// Ids of the currently open elements, innermost last. Empty outside the root.
    stack: Vec<crate::core::NodeId>,
    /// Namespace scopes: one initial empty scope plus one per open element.
    namespace_stack: Vec<NamespaceScope>,
    /// Number of nodes counted so far, checked against the node limit.
    node_count: usize,
}
273
274impl<'a> ParserState<'a> {
275    fn new(config: &'a ParserConfig) -> Self {
276        Self {
277            config,
278            document: Document::new(),
279            stack: Vec::new(),
280            namespace_stack: vec![NamespaceScope::default()],
281            node_count: 0,
282        }
283    }
284
285    fn start_element(
286        &mut self,
287        start: BytesStart<'_>,
288        reader: &Reader<&[u8]>,
289        xml: &str,
290    ) -> XmlResult<()> {
291        let declarations = namespace_declarations(&start, reader, xml)?;
292        let scope = self
293            .namespace_stack
294            .last()
295            .expect("root namespace scope exists")
296            .with_declarations(&declarations);
297        let name = qname_from_raw(start.name().as_ref(), &scope, true)?;
298        let id = match self.stack.last().copied() {
299            Some(parent) => self.document.add_element(parent, name)?,
300            None => self.document.add_root_element(name)?,
301        };
302
303        self.count_node()?;
304        self.config.limits().check_depth(self.stack.len() + 1)?;
305
306        for declaration in declarations {
307            let core_declaration = match declaration.prefix {
308                Some(prefix) => NamespaceDeclaration::prefixed(prefix, declaration.uri)?,
309                None => NamespaceDeclaration::default(declaration.uri)?,
310            };
311            self.document
312                .add_namespace_declaration(id, core_declaration)?;
313        }
314
315        let mut attribute_names = BTreeSet::new();
316        for attribute in start.attributes() {
317            let attribute =
318                attribute.map_err(|error| parse_error_with_position(xml, 0, error.to_string()))?;
319            if is_namespace_declaration(attribute.key.as_ref()) {
320                continue;
321            }
322            let name = qname_from_raw(attribute.key.as_ref(), &scope, false)?;
323            if !attribute_names.insert(expanded_attribute_name(&name)) {
324                return Err(XmlError::new(
325                    ErrorKind::Parse,
326                    format!(
327                        "duplicate attribute `{}` by expanded name",
328                        name.lexical_name()
329                    ),
330                ));
331            }
332            let value = attribute
333                .decoded_and_normalized_value(XmlVersion::Explicit1_0, reader.decoder())
334                .map_err(|error| parse_error_with_position(xml, 0, error.to_string()))?;
335            self.document
336                .add_attribute(id, Attribute::new(name, value.as_ref()))?;
337        }
338
339        self.stack.push(id);
340        self.namespace_stack.push(scope);
341        Ok(())
342    }
343
344    fn end_element(&mut self) {
345        self.stack.pop();
346        self.namespace_stack.pop();
347    }
348
349    fn text(&mut self, value: &str) -> XmlResult<()> {
350        if value.is_empty() {
351            return Ok(());
352        }
353        if self.stack.is_empty() {
354            if value.trim().is_empty() {
355                return Ok(());
356            }
357            return Err(XmlError::new(
358                ErrorKind::Parse,
359                "non-whitespace text outside the document root is not allowed",
360            ));
361        }
362        self.check_text_limit(value)?;
363        let parent = self.current_parent()?;
364        self.document.add_text(parent, value)?;
365        self.count_node()
366    }
367
368    fn cdata(&mut self, value: &str) -> XmlResult<()> {
369        if value.is_empty() {
370            return Ok(());
371        }
372        if self.stack.is_empty() {
373            return Err(XmlError::new(
374                ErrorKind::Parse,
375                "CDATA outside the document root is not allowed",
376            ));
377        }
378        self.check_text_limit(value)?;
379        let parent = self.current_parent()?;
380        if self.config.preserve_cdata {
381            self.document.add_cdata(parent, value)?;
382        } else {
383            self.document.add_text(parent, value)?;
384        }
385        self.count_node()
386    }
387
388    fn comment(&mut self, value: &str) -> XmlResult<()> {
389        if self.stack.is_empty() {
390            return Ok(());
391        }
392        self.check_text_limit(value)?;
393        let parent = self.current_parent()?;
394        self.document.add_comment(parent, value)?;
395        self.count_node()
396    }
397
398    fn processing_instruction(&mut self, target: &str, data: Option<&str>) -> XmlResult<()> {
399        if self.stack.is_empty() {
400            return Ok(());
401        }
402        let parent = self.current_parent()?;
403        self.document
404            .add_processing_instruction(parent, target, data)?;
405        self.count_node()
406    }
407
408    fn finish(self) -> XmlResult<Document> {
409        if !self.stack.is_empty() {
410            return Err(XmlError::new(
411                ErrorKind::Parse,
412                "XML document ended before closing all elements",
413            ));
414        }
415        if self.document.root().is_none() {
416            return Err(XmlError::new(
417                ErrorKind::Parse,
418                "XML document must contain one root element",
419            ));
420        }
421        Ok(self.document)
422    }
423
424    fn current_parent(&self) -> XmlResult<crate::core::NodeId> {
425        self.stack.last().copied().ok_or_else(|| {
426            XmlError::new(
427                ErrorKind::Parse,
428                "XML content outside the document root is not supported",
429            )
430        })
431    }
432
433    fn count_node(&mut self) -> XmlResult<()> {
434        self.node_count += 1;
435        self.config.limits().check_nodes(self.node_count)
436    }
437
438    fn check_text_limit(&self, value: &str) -> XmlResult<()> {
439        self.config.limits().check_text_size(value.len())
440    }
441}
442
/// The namespace bindings in scope for one element.
#[derive(Debug, Clone, Default)]
struct NamespaceScope {
    /// URI of the default namespace, or `None` when there is none in scope.
    default_namespace: Option<String>,
    /// Prefix-to-URI bindings declared with `xmlns:prefix="..."`.
    prefixed: BTreeMap<String, String>,
}

impl NamespaceScope {
    /// Returns a copy of this scope extended with `declarations`.
    ///
    /// Later declarations override earlier bindings of the same prefix. A
    /// default declaration with an empty URI (`xmlns=""`) removes the default
    /// namespace for the new scope, as required by Namespaces in XML 1.0, so
    /// unprefixed descendant elements end up in no namespace rather than in an
    /// empty-string namespace.
    fn with_declarations(&self, declarations: &[ParsedNamespaceDeclaration]) -> Self {
        let mut next = self.clone();
        for declaration in declarations {
            match &declaration.prefix {
                Some(prefix) => {
                    next.prefixed
                        .insert(prefix.clone(), declaration.uri.clone());
                }
                // `xmlns=""` undeclares the default namespace.
                None if declaration.uri.is_empty() => {
                    next.default_namespace = None;
                }
                None => {
                    next.default_namespace = Some(declaration.uri.clone());
                }
            }
        }
        next
    }
}

/// A namespace declaration as read from a start tag, before it is converted
/// into the core model's declaration type.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ParsedNamespaceDeclaration {
    /// The declared prefix, or `None` for a default (`xmlns="..."`) declaration.
    prefix: Option<String>,
    /// The decoded namespace URI.
    uri: String,
}
472
473fn namespace_declarations(
474    start: &BytesStart<'_>,
475    reader: &Reader<&[u8]>,
476    xml: &str,
477) -> XmlResult<Vec<ParsedNamespaceDeclaration>> {
478    let mut declarations = Vec::new();
479    for attribute in start.attributes() {
480        let attribute =
481            attribute.map_err(|error| parse_error_with_position(xml, 0, error.to_string()))?;
482        let raw_name = attribute.key.as_ref();
483        if !is_namespace_declaration(raw_name) {
484            continue;
485        }
486        let uri = attribute
487            .decoded_and_normalized_value(XmlVersion::Explicit1_0, reader.decoder())
488            .map_err(|error| parse_error_with_position(xml, 0, error.to_string()))?;
489        let prefix = raw_name
490            .strip_prefix(b"xmlns:")
491            .map(bytes_to_string)
492            .transpose()?;
493        validate_namespace_binding(prefix.as_deref(), uri.as_ref())?;
494        declarations.push(ParsedNamespaceDeclaration {
495            prefix,
496            uri: uri.into_owned(),
497        });
498    }
499    Ok(declarations)
500}
501
502fn qname_from_raw(raw: &[u8], scope: &NamespaceScope, default_applies: bool) -> XmlResult<QName> {
503    let raw = bytes_to_string(raw)?;
504    match raw.split_once(':') {
505        Some((prefix, local)) => {
506            let uri = if prefix == "xml" {
507                XML_NAMESPACE_URI
508            } else {
509                scope.prefixed.get(prefix).ok_or_else(|| {
510                    XmlError::new(
511                        ErrorKind::UnknownNamespacePrefix,
512                        format!("namespace prefix `{prefix}` is not declared"),
513                    )
514                })?
515            };
516            QName::qualified(prefix, local, uri)
517        }
518        None if default_applies => match &scope.default_namespace {
519            Some(uri) => QName::namespaced(raw, uri),
520            None => QName::new(raw),
521        },
522        None => QName::new(raw),
523    }
524}
525
526fn expanded_attribute_name(name: &QName) -> (Option<String>, String) {
527    (
528        name.namespace_uri().map(|uri| uri.as_str().to_owned()),
529        name.local().to_owned(),
530    )
531}
532
/// Reports whether a raw attribute name is `xmlns` or starts with `xmlns:`.
fn is_namespace_declaration(raw_name: &[u8]) -> bool {
    // After removing `xmlns`, either nothing is left or the remainder starts with `:`.
    matches!(raw_name.strip_prefix(b"xmlns"), Some([] | [b':', ..]))
}
536
537fn bytes_to_string(bytes: &[u8]) -> XmlResult<String> {
538    str::from_utf8(bytes)
539        .map(str::to_owned)
540        .map_err(|error| XmlError::new(ErrorKind::Parse, error.to_string()))
541}
542
/// Maps the empty string to `None` and any other string to `Some` of itself.
fn empty_to_none(value: &str) -> Option<&str> {
    Some(value).filter(|text| !text.is_empty())
}
550
551fn processing_instruction_data(value: &str) -> Option<&str> {
552    let value = value
553        .strip_prefix(' ')
554        .or_else(|| value.strip_prefix('\t'))
555        .or_else(|| value.strip_prefix('\r'))
556        .or_else(|| value.strip_prefix('\n'))
557        .unwrap_or(value);
558    empty_to_none(value)
559}
560
/// Returns the replacement text of one of the five predefined XML entities
/// (`lt`, `gt`, `amp`, `apos`, `quot`), or `None` for any other name.
fn predefined_entity(name: &str) -> Option<&'static str> {
    const PREDEFINED: [(&str, &str); 5] = [
        ("lt", "<"),
        ("gt", ">"),
        ("amp", "&"),
        ("apos", "'"),
        ("quot", "\""),
    ];

    PREDEFINED
        .iter()
        .find(|(entity, _)| *entity == name)
        .map(|&(_, replacement)| replacement)
}
571
572fn unresolved_entity_error(policy: &EntityPolicy, name: &str, span: Span) -> XmlError {
573    match policy.reject_external_entity(name) {
574        Err(error) => error.with_span(span),
575        Ok(()) => XmlError::new(
576            ErrorKind::Parse,
577            format!("external entity resolution is not implemented for `&{name};`"),
578        )
579        .with_span(span),
580    }
581}
582
583fn parse_error_with_position(
584    xml: &str,
585    byte_position: usize,
586    message: impl Into<String>,
587) -> XmlError {
588    XmlError::new(ErrorKind::Parse, message).with_span(span_for_byte(xml, byte_position))
589}
590
591fn span_for_byte(xml: &str, byte_position: usize) -> Span {
592    let mut line = 1;
593    let mut column = 1;
594    for (index, ch) in xml.char_indices() {
595        if index >= byte_position {
596            break;
597        }
598        if ch == '\n' {
599            line += 1;
600            column = 1;
601        } else {
602            column += 1;
603        }
604    }
605    Span::new(line, column)
606}
607
// Unit tests for the parser: well-formedness rules around the root element,
// namespace resolution, comment/CDATA handling, entity policy, limits, error
// spans, and a round trip through the writer.
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::*;
    use crate::core::{Attribute, NamespaceDeclaration, NodeKind};
    use crate::writer::to_string_compact;

    // A nested element with text content produces element -> element -> text.
    #[test]
    fn parser_parse_str_reads_simple_xml() -> XmlResult<()> {
        let document = parse_str("<Root><Child>value</Child></Root>")?;
        let root = document.root().expect("root");
        let [child] = document.children(root)? else {
            panic!("expected one child");
        };
        let [text] = document.children(*child)? else {
            panic!("expected one text child");
        };

        assert!(matches!(document.node(*text)?.kind(), NodeKind::Text(value) if value == "value"));
        Ok(())
    }

    // The reader entry point yields the same document as the string entry point would.
    #[test]
    fn parser_parse_reader_reads_xml() -> XmlResult<()> {
        let document = parse_reader(Cursor::new("<Root><Child/></Root>"))?;

        assert_eq!(to_string_compact(&document)?, "<Root><Child/></Root>");
        Ok(())
    }

    // Whitespace before and after the root element is accepted and not stored.
    #[test]
    fn parser_whitespace_around_root_is_allowed() -> XmlResult<()> {
        let document = parse_str("  \n\t<Root/>  \n")?;

        assert_eq!(to_string_compact(&document)?, "<Root/>");
        Ok(())
    }

    // The XML declaration, comments and processing instructions outside the root
    // are accepted and do not appear in the serialized document.
    #[test]
    fn parser_xml_declaration_and_boundary_misc_are_allowed() -> XmlResult<()> {
        let document = parse_str(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- before -->\n<?before ok?>\n<Root/>\n<!-- after -->\n<?after ok?>",
        )?;

        assert_eq!(to_string_compact(&document)?, "<Root/>");
        Ok(())
    }

    // The separator between PI target and data is not kept as part of the data,
    // so writing the document back does not add a second space.
    #[test]
    fn parser_processing_instruction_roundtrips_without_accumulating_separator_space(
    ) -> XmlResult<()> {
        let document = parse_str("<Root><?format keep?></Root>")?;

        assert_eq!(
            to_string_compact(&document)?,
            "<Root><?format keep?></Root>"
        );
        Ok(())
    }

    // Non-whitespace text before or after the root element is a parse error.
    #[test]
    fn parser_rejects_non_whitespace_text_outside_root() {
        let before = parse_str("text<Root/>").expect_err("text before root must fail");
        let after = parse_str("<Root/>text").expect_err("text after root must fail");

        assert_eq!(before.kind(), &ErrorKind::Parse);
        assert!(before.message().contains("outside the document root"));
        assert_eq!(after.kind(), &ErrorKind::Parse);
        assert!(after.message().contains("outside the document root"));
    }

    // Inputs without any element are rejected with the "root element" error.
    #[test]
    fn parser_empty_document_requires_root() {
        let empty = parse_str("").expect_err("empty document must fail");
        let whitespace = parse_str(" \n\t ").expect_err("whitespace-only document must fail");
        let comment =
            parse_str("<!-- only comment -->").expect_err("comment-only document must fail");
        let pi = parse_str("<?xml-stylesheet href=\"style.xsl\"?>")
            .expect_err("PI-only document must fail");

        for error in [empty, whitespace, comment, pi] {
            assert_eq!(error.kind(), &ErrorKind::Parse);
            assert!(error.message().contains("root element"));
        }
    }

    // Prefixed names resolve through their prefix, unprefixed child elements take
    // the default namespace, and unprefixed attributes take no namespace.
    #[test]
    fn parser_namespaces_preserves_qnames_and_attributes() -> XmlResult<()> {
        let document = parse_str(
            r#"<doc:Root xmlns="urn:default" xmlns:doc="urn:doc" doc:id="A1"><Child plain="yes"/></doc:Root>"#,
        )?;
        let root = document.root().expect("root");
        let root_node = document.node(root)?;
        let NodeKind::Element(root_element) = root_node.kind() else {
            panic!("expected root element");
        };

        assert_eq!(
            root_element.name().prefix().map(|prefix| prefix.as_str()),
            Some("doc")
        );
        assert_eq!(
            root_element.name().namespace_uri().map(|uri| uri.as_str()),
            Some("urn:doc")
        );
        assert_eq!(root_element.namespace_declarations().len(), 2);
        assert_eq!(root_element.attributes()[0].name().lexical_name(), "doc:id");

        let child = document.children(root)?[0];
        let NodeKind::Element(child_element) = document.node(child)?.kind() else {
            panic!("expected child element");
        };
        assert_eq!(
            child_element.name().namespace_uri().map(|uri| uri.as_str()),
            Some("urn:default")
        );
        assert_eq!(child_element.attributes()[0].name().namespace_uri(), None);
        Ok(())
    }

    // The `xml` prefix works without a declaration and maps to the XML namespace URI.
    #[test]
    fn parser_namespace_reserved_xml_prefix_is_implicit() -> XmlResult<()> {
        let document = parse_str(r#"<Root xml:lang="en" xml:space="preserve"/>"#)?;
        let root = document.root().expect("root");
        let NodeKind::Element(element) = document.node(root)?.kind() else {
            panic!("expected root element");
        };

        assert_eq!(element.attributes().len(), 2);
        assert_eq!(element.attributes()[0].name().lexical_name(), "xml:lang");
        assert_eq!(
            element.attributes()[0]
                .name()
                .namespace_uri()
                .map(|uri| uri.as_str()),
            Some(XML_NAMESPACE_URI)
        );
        assert_eq!(element.attributes()[1].name().lexical_name(), "xml:space");
        Ok(())
    }

    // Rebinding `xml`, binding another prefix to the XML namespace, declaring the
    // `xmlns` prefix, or using the xmlns namespace URI are all rejected.
    #[test]
    fn parser_namespace_rejects_reserved_declaration_misuse() {
        let cases = [
            r#"<Root xmlns:xml="urn:wrong"/>"#,
            r#"<Root xmlns:doc="http://www.w3.org/XML/1998/namespace"/>"#,
            r#"<Root xmlns:xmlns="urn:any"/>"#,
            r#"<Root xmlns="http://www.w3.org/2000/xmlns/"/>"#,
        ];

        for xml in cases {
            let error = parse_str(xml).expect_err("reserved namespace misuse must fail");
            assert_eq!(error.kind(), &ErrorKind::InvalidNamespace, "{xml}");
        }
    }

    // Attributes collide when namespace URI and local name match, even if the
    // prefixes differ.
    #[test]
    fn parser_namespace_rejects_duplicate_attributes_by_expanded_name() {
        let direct = parse_str(r#"<Root id="1" id="2"/>"#)
            .expect_err("duplicate unqualified attributes must fail");
        let expanded = parse_str(r#"<Root xmlns:a="urn:x" xmlns:b="urn:x" a:id="1" b:id="2"/>"#)
            .expect_err("duplicate expanded attributes must fail");

        assert_eq!(direct.kind(), &ErrorKind::Parse);
        assert!(direct.message().contains("duplicate"));
        assert_eq!(expanded.kind(), &ErrorKind::Parse);
        assert!(expanded.message().contains("duplicate"));
    }

    // The default namespace applies to the element name but not to its unprefixed attribute.
    #[test]
    fn parser_namespace_default_does_not_apply_to_attributes() -> XmlResult<()> {
        let document = parse_str(r#"<Root xmlns="urn:root" id="A1"/>"#)?;
        let root = document.root().expect("root");
        let NodeKind::Element(element) = document.node(root)?.kind() else {
            panic!("expected root element");
        };

        assert_eq!(
            element.name().namespace_uri().map(|uri| uri.as_str()),
            Some("urn:root")
        );
        assert_eq!(element.attributes()[0].name().namespace_uri(), None);
        Ok(())
    }

    // Comments are kept by default and dropped when `preserve_comments` is off.
    #[test]
    fn parser_comments_can_be_preserved_or_discarded() -> XmlResult<()> {
        let preserved = parse_str("<Root><!-- note --><Child/></Root>")?;
        assert!(matches!(
            preserved.node(preserved.children(preserved.root().unwrap())?[0])?.kind(),
            NodeKind::Comment(comment) if comment == " note "
        ));

        let discarded = parse_str_with_config(
            "<Root><!-- note --><Child/></Root>",
            &ParserConfig::default().with_preserve_comments(false),
        )?;
        assert_eq!(discarded.children(discarded.root().unwrap())?.len(), 1);
        Ok(())
    }

    // With the default configuration a CDATA section becomes a CDATA node with its raw content.
    #[test]
    fn parser_preserves_cdata() -> XmlResult<()> {
        let document = parse_str("<Root><![CDATA[a < b]]></Root>")?;
        let root = document.root().expect("root");
        let child = document.children(root)?[0];

        assert!(matches!(document.node(child)?.kind(), NodeKind::CData(value) if value == "a < b"));
        Ok(())
    }

    // A DOCTYPE is rejected under the default security policy.
    #[test]
    fn parser_security_rejects_external_entities_by_default() {
        let error = parse_str(r#"<!DOCTYPE Root SYSTEM "file:///tmp/x"><Root/>"#)
            .expect_err("doctype must be blocked");

        assert_eq!(error.kind(), &ErrorKind::Parse);
        assert!(error.message().contains("DOCTYPE"));
    }

    // The five predefined entities are resolved to their characters while parsing.
    #[test]
    fn parser_entity_predefined_references_are_resolved() -> XmlResult<()> {
        let document = parse_str("<Root>&lt;&amp;&gt;&apos;&quot;</Root>")?;

        assert_eq!(
            to_string_compact(&document)?,
            "<Root>&lt;&amp;&gt;'\"</Root>"
        );
        Ok(())
    }

    // Any other general entity reference fails, and the error carries a span.
    #[test]
    fn parser_entity_unknown_reference_is_rejected_by_default() {
        let error = parse_str("<Root>&xxe;</Root>").expect_err("unknown entity must fail");

        assert_eq!(error.kind(), &ErrorKind::Parse);
        assert!(error.message().contains("disabled by default"));
        assert!(error.span().is_some());
    }

    // Even when the policy allows external entities, resolution is not implemented
    // and the reference is reported as an error instead of panicking.
    #[test]
    fn parser_entity_permissive_policy_still_rejects_unimplemented_resolution() {
        let security = ParserSecurityConfig::default()
            .with_entity_policy(EntityPolicy::secure().with_external_entities(true));
        let config = ParserConfig::default().with_security(security);

        let error = parse_str_with_config("<Root>&xxe;</Root>", &config)
            .expect_err("unknown entity must fail without panic");

        assert_eq!(error.kind(), &ErrorKind::Parse);
        assert!(error.message().contains("not implemented"));
        assert!(error.span().is_some());
    }

    // A depth limit of 1 admits the root element but not a child element.
    #[test]
    fn parser_respects_max_depth() {
        let config = ParserConfig::default().with_max_depth(1);
        let error =
            parse_str_with_config("<Root><Child/></Root>", &config).expect_err("depth must fail");

        assert_eq!(error.kind(), &ErrorKind::Parse);
        assert!(error.message().contains("depth"));
    }

    // Limits supplied through `with_security` are honoured: 7 bytes exceed a 6-byte limit.
    #[test]
    fn parser_consumes_shared_security_config() {
        let security = ParserSecurityConfig::default()
            .with_limits(SecurityLimits::default().with_max_document_bytes(6));
        let config = ParserConfig::default().with_security(security);

        let error = parse_str_with_config("<Root/>", &config).expect_err("size must fail");

        assert_eq!(error.kind(), &ErrorKind::Parse);
        assert!(error.message().contains("maximum size"));
    }

    // Malformed XML (mismatched end tag) yields a parse error with a span.
    #[test]
    fn parser_reports_span_for_malformed_xml() {
        let error = parse_str("<Root>\n  <Child></Root>").expect_err("malformed XML must fail");

        assert_eq!(error.kind(), &ErrorKind::Parse);
        assert!(error.span().is_some());
    }

    // A document built through the core API survives write -> parse -> write unchanged.
    #[test]
    fn parser_roundtrip_reads_writer_output() -> XmlResult<()> {
        let mut document = Document::new();
        let root = document.add_root_element(QName::qualified("doc", "Root", "urn:doc")?)?;
        document
            .add_namespace_declaration(root, NamespaceDeclaration::prefixed("doc", "urn:doc")?)?;
        document.add_attribute(root, Attribute::new(QName::new("id")?, "A1"))?;
        document.add_text(root, "value")?;

        let xml = to_string_compact(&document)?;
        let parsed = parse_str(&xml)?;

        assert_eq!(to_string_compact(&parsed)?, xml);
        Ok(())
    }
}