roxmltree/
parse.rs

1use alloc::string::{String, ToString};
2use alloc::{vec, vec::Vec};
3use alloc::borrow::Cow;
4use core::ops::Range;
5use core::mem::take;
6use core::fmt;
7use memchr::{memchr, memchr2, memchr_iter};
8
9use crate::{
10    AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
11    NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
12    XMLNS,
13};
14
15use crate::tokenizer::{self, Reference, StrSpan, Stream};
16
17type Result<T> = core::result::Result<T, Error>;
18
/// A list of all possible errors.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum Error {
    /// The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.
    InvalidXmlPrefixUri(TextPos),

    /// Only the `xmlns:xml` attribute can have the <http://www.w3.org/XML/1998/namespace> URI.
    UnexpectedXmlUri(TextPos),

    /// The <http://www.w3.org/2000/xmlns/> URI must not be declared.
    UnexpectedXmlnsUri(TextPos),

    /// `xmlns` can't be used as an element prefix.
    InvalidElementNamePrefix(TextPos),

    /// A namespace was already defined on this element.
    DuplicatedNamespace(String, TextPos),

    /// An unknown namespace.
    ///
    /// Indicates that an element or an attribute has an unknown qualified name prefix.
    ///
    /// The first value is a prefix.
    UnknownNamespace(String, TextPos),

    /// Incorrect tree structure.
    ///
    /// expected, actual, position
    UnexpectedCloseTag(String, String, TextPos),

    /// Entity value starts with a close tag.
    ///
    /// Example:
    /// ```xml
    /// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
    /// <root>&p;</root>
    /// ```
    UnexpectedEntityCloseTag(TextPos),

    /// A reference to an entity that was not defined in the DTD.
    UnknownEntityReference(String, TextPos),

    /// A malformed entity reference.
    ///
    /// A `&` character inside an attribute value or text indicates an entity reference.
    /// Otherwise, the document is not well-formed.
    MalformedEntityReference(TextPos),

    /// A possible entity reference loop.
    ///
    /// The current depth limit is 10. The max number of references per reference is 255.
    EntityReferenceLoop(TextPos),

    /// Attribute value cannot have a `<` character.
    InvalidAttributeValue(TextPos),

    /// An element has duplicated attributes.
    ///
    /// This also includes namespace resolving.
    /// So an element like this will lead to an error.
    /// ```xml
    /// <e xmlns:n1='http://www.w3.org' xmlns:n2='http://www.w3.org' n1:a='b1' n2:a='b2'/>
    /// ```
    DuplicatedAttribute(String, TextPos),

    /// The XML document must have at least one element.
    NoRootNode,

    /// The root node was opened but never closed.
    UnclosedRootNode,

    /// An XML document can have only one XML declaration
    /// and it must be at the start of the document.
    UnexpectedDeclaration(TextPos),

    /// An XML with DTD detected.
    ///
    /// This error will be emitted only when `ParsingOptions::allow_dtd` is set to `false`.
    DtdDetected,

    /// Indicates that the [`ParsingOptions::nodes_limit`] was reached.
    NodesLimitReached,

    /// Indicates that too many attributes were parsed.
    AttributesLimitReached,

    /// Indicates that too many namespaces were parsed.
    NamespacesLimitReached,

    /// An invalid name.
    InvalidName(TextPos),

    /// A non-XML character has occurred.
    ///
    /// Valid characters are: <https://www.w3.org/TR/xml/#char32>
    NonXmlChar(char, TextPos),

    /// An invalid/unexpected character.
    ///
    /// expected, actual, position
    InvalidChar(u8, u8, TextPos),

    /// An invalid/unexpected character.
    ///
    /// expected, actual, position
    InvalidChar2(&'static str, u8, TextPos),

    /// An unexpected string.
    ///
    /// Contains what string was expected.
    InvalidString(&'static str, TextPos),

    /// An invalid ExternalID in the DTD.
    InvalidExternalID(TextPos),

    /// The given entity resolver yielded an error.
    ///
    /// position, stringified resolver error
    EntityResolver(TextPos, String),

    /// A comment cannot contain `--` or end with `-`.
    InvalidComment(TextPos),

    /// A Character Data node contains an invalid data.
    ///
    /// Currently, only `]]>` is not allowed.
    InvalidCharacterData(TextPos),

    /// An unknown token.
    UnknownToken(TextPos),

    /// The stream ended earlier than we expected.
    ///
    /// Should only appear on invalid input data.
    UnexpectedEndOfStream,
}
153
154impl Error {
155    /// Returns the error position.
156    pub fn pos(&self) -> TextPos {
157        match *self {
158            Error::InvalidXmlPrefixUri(pos) => pos,
159            Error::UnexpectedXmlUri(pos) => pos,
160            Error::UnexpectedXmlnsUri(pos) => pos,
161            Error::InvalidElementNamePrefix(pos) => pos,
162            Error::DuplicatedNamespace(_, pos) => pos,
163            Error::UnknownNamespace(_, pos) => pos,
164            Error::UnexpectedCloseTag(_, _, pos) => pos,
165            Error::UnexpectedEntityCloseTag(pos) => pos,
166            Error::UnknownEntityReference(_, pos) => pos,
167            Error::MalformedEntityReference(pos) => pos,
168            Error::EntityReferenceLoop(pos) => pos,
169            Error::InvalidAttributeValue(pos) => pos,
170            Error::DuplicatedAttribute(_, pos) => pos,
171            Error::NoRootNode => TextPos::new(1, 1),
172            Error::UnclosedRootNode => TextPos::new(1, 1),
173            Error::UnexpectedDeclaration(pos) => pos,
174            Error::DtdDetected => TextPos::new(1, 1),
175            Error::NodesLimitReached => TextPos::new(1, 1),
176            Error::AttributesLimitReached => TextPos::new(1, 1),
177            Error::NamespacesLimitReached => TextPos::new(1, 1),
178            Error::InvalidName(pos) => pos,
179            Error::NonXmlChar(_, pos) => pos,
180            Error::InvalidChar(_, _, pos) => pos,
181            Error::InvalidChar2(_, _, pos) => pos,
182            Error::InvalidString(_, pos) => pos,
183            Error::InvalidExternalID(pos) => pos,
184            Error::EntityResolver(pos, _) => pos,
185            Error::InvalidComment(pos) => pos,
186            Error::InvalidCharacterData(pos) => pos,
187            Error::UnknownToken(pos) => pos,
188            Error::UnexpectedEndOfStream => TextPos::new(1, 1),
189        }
190    }
191}
192
193impl core::fmt::Display for Error {
194    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
195        match self {
196            Error::InvalidXmlPrefixUri(pos) => {
197                write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
198            }
199            Error::UnexpectedXmlUri(pos) => {
200                write!(
201                    f,
202                    "the 'xml' namespace URI is used for not 'xml' prefix at {}",
203                    pos
204                )
205            }
206            Error::UnexpectedXmlnsUri(pos) => {
207                write!(
208                    f,
209                    "the 'xmlns' URI is used at {}, but it must not be declared",
210                    pos
211                )
212            }
213            Error::InvalidElementNamePrefix(pos) => {
214                write!(
215                    f,
216                    "the 'xmlns' prefix is used at {}, but it must not be",
217                    pos
218                )
219            }
220            Error::DuplicatedNamespace(ref name, pos) => {
221                write!(f, "namespace '{}' at {} is already defined", name, pos)
222            }
223            Error::UnknownNamespace(ref name, pos) => {
224                write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
225            }
226            Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
227                write!(
228                    f,
229                    "expected '{}' tag, not '{}' at {}",
230                    expected, actual, pos
231                )
232            }
233            Error::UnexpectedEntityCloseTag(pos) => {
234                write!(f, "unexpected close tag at {}", pos)
235            }
236            Error::MalformedEntityReference(pos) => {
237                write!(f, "malformed entity reference at {}", pos)
238            }
239            Error::UnknownEntityReference(ref name, pos) => {
240                write!(f, "unknown entity reference '{}' at {}", name, pos)
241            }
242            Error::EntityReferenceLoop(pos) => {
243                write!(f, "a possible entity reference loop is detected at {}", pos)
244            }
245            Error::InvalidAttributeValue(pos) => {
246                write!(f, "unescaped '<' found at {}", pos)
247            }
248            Error::DuplicatedAttribute(ref name, pos) => {
249                write!(f, "attribute '{}' at {} is already defined", name, pos)
250            }
251            Error::NoRootNode => {
252                write!(f, "the document does not have a root node")
253            }
254            Error::UnclosedRootNode => {
255                write!(f, "the root node was opened but never closed")
256            }
257            Error::UnexpectedDeclaration(pos) => {
258                write!(f, "unexpected XML declaration at {}", pos)
259            }
260            Error::DtdDetected => {
261                write!(f, "XML with DTD detected")
262            }
263            Error::NodesLimitReached => {
264                write!(f, "nodes limit reached")
265            }
266            Error::AttributesLimitReached => {
267                write!(f, "more than 2^32 attributes were parsed")
268            }
269            Error::NamespacesLimitReached => {
270                write!(f, "more than 2^16 unique namespaces were parsed")
271            }
272            Error::InvalidName(pos) => {
273                write!(f, "invalid name token at {}", pos)
274            }
275            Error::NonXmlChar(c, pos) => {
276                write!(f, "a non-XML character {:?} found at {}", c, pos)
277            }
278            Error::InvalidChar(expected, actual, pos) => {
279                write!(
280                    f,
281                    "expected '{}' not '{}' at {}",
282                    *expected as char, *actual as char, pos
283                )
284            }
285            Error::InvalidChar2(expected, actual, pos) => {
286                write!(
287                    f,
288                    "expected {} not '{}' at {}",
289                    expected, *actual as char, pos
290                )
291            }
292            Error::InvalidString(expected, pos) => {
293                write!(f, "expected '{}' at {}", expected, pos)
294            }
295            Error::InvalidExternalID(pos) => {
296                write!(f, "invalid ExternalID at {}", pos)
297            }
298            Error::EntityResolver(pos, msg) => {
299                write!(f, "entity resolver failed at {}: {}", pos, msg)
300            }
301            Error::InvalidComment(pos) => {
302                write!(f, "comment at {} contains '--'", pos)
303            }
304            Error::InvalidCharacterData(pos) => {
305                write!(f, "']]>' at {} is not allowed inside a character data", pos)
306            }
307            Error::UnknownToken(pos) => {
308                write!(f, "unknown token at {}", pos)
309            }
310            Error::UnexpectedEndOfStream => {
311                write!(f, "unexpected end of stream")
312            }
313        }
314    }
315}
316
// Only compiled when the `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    // Returns the same fixed message for every variant; the per-variant text
    // comes from the `Display` implementation.
    fn description(&self) -> &str {
        "an XML parsing error"
    }
}
323
/// Parsing options.
///
/// Use [`Default::default`] to get the default configuration and override
/// individual fields as needed.
pub struct ParsingOptions<'input> {
    /// Allow DTD parsing.
    ///
    /// When set to `false`, XML with DTD will cause an error.
    /// Empty DTD block is not an error.
    ///
    /// Currently, there is no option to simply skip DTD.
    /// Mainly because you will get `UnknownEntityReference` error later anyway.
    ///
    /// This flag is set to `false` by default for security reasons,
    /// but `roxmltree` still has checks for billion laughs attack,
    /// so this is just an extra security measure.
    ///
    /// Default: false
    pub allow_dtd: bool,

    /// Sets the maximum number of nodes to parse.
    ///
    /// Useful when dealing with random input to limit memory usage.
    ///
    /// Default: u32::MAX (no limit)
    pub nodes_limit: u32,

    /// Function to resolve external entities.
    ///
    /// See [`EntityResolver`] for the signature
    /// and the expected behaviour.
    ///
    /// Default: None
    pub entity_resolver: Option<&'input EntityResolver<'input>>,
}
354
/// Function to resolve external entities.
///
/// This function is passed the optional public ID
/// and the mandatory URI of the external entity.
///
/// It is expected to yield the content defining
/// the URI, possibly preceded by a text declaration.
///
/// If it yields [`None`], the entity is discarded
/// and will yield errors if referenced during parsing.
///
/// Errors must be stringified so they can be propagated
/// via [`Error::EntityResolver`].
pub type EntityResolver<'input> =
    dyn Fn(Option<&str>, &str) -> core::result::Result<Option<&'input str>, String> + 'input;
370
371impl Default for ParsingOptions<'_> {
372    fn default() -> Self {
373        ParsingOptions {
374            allow_dtd: false,
375            nodes_limit: u32::MAX,
376            entity_resolver: None,
377        }
378    }
379}
380
381impl fmt::Debug for ParsingOptions<'_> {
382    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
383        let entity_resolver = if self.entity_resolver.is_some() {
384            "Some(..)"
385        } else {
386            "None"
387        };
388
389        fmt.debug_struct("ParsingOptions")
390            .field("allow_dtd", &self.allow_dtd)
391            .field("nodes_limit", &self.nodes_limit)
392            .field("entity_resolver", &entity_resolver)
393            .finish()
394    }
395}
396
/// An attribute collected while the current start tag is being parsed.
///
/// Only attributes that are not namespace declarations are stored this way.
struct TempAttributeData<'input> {
    /// Prefix part of the attribute's qualified name.
    prefix: &'input str,
    /// Local part of the attribute's qualified name.
    local: &'input str,
    /// Attribute value after normalization.
    value: StringStorage<'input>,
    /// Range of the attribute as reported by the tokenizer.
    range: Range<usize>,
    /// Length of the qualified name as reported by the tokenizer
    /// (only with the `positions` feature).
    #[cfg(feature = "positions")]
    qname_len: u16,
    /// `eq_len` value reported by the tokenizer (only with the `positions` feature).
    #[cfg(feature = "positions")]
    eq_len: u8,
}
407
impl<'input> Document<'input> {
    /// Parses the input XML string.
    ///
    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
    /// UTF-8 string.
    ///
    /// This is a shorthand for `Document::parse_with_options(data, ParsingOptions::default())`.
    ///
    /// # Examples
    ///
    /// ```
    /// let doc = roxmltree::Document::parse("<e/>").unwrap();
    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
    /// ```
    #[inline]
    pub fn parse(text: &'input str) -> Result<Self> {
        Self::parse_with_options(text, ParsingOptions::default())
    }

    /// Parses the input XML string using the selected options.
    ///
    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
    /// UTF-8 string.
    ///
    /// # Examples
    ///
    /// ```
    /// let opt = roxmltree::ParsingOptions::default();
    /// let doc = roxmltree::Document::parse_with_options("<e/>", opt).unwrap();
    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
    /// ```
    #[inline]
    pub fn parse_with_options(text: &'input str, opt: ParsingOptions<'input>) -> Result<Self> {
        parse(text, opt)
    }
}
444
/// An entity recorded from an entity declaration token.
struct Entity<'input> {
    /// Entity name.
    name: &'input str,
    /// Entity definition as a span of the input.
    value: StrSpan<'input>,
}
449
/// Name and position of the most recently seen element start tag.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    /// Prefix part of the tag's qualified name.
    prefix: &'input str,
    /// Local part of the tag's qualified name.
    name: &'input str,
    /// Position reported with the element start token;
    /// used as the start of the element's range.
    pos: usize,
    /// `pos + 1`; passed along when the tag's namespace prefix is resolved.
    prefix_pos: usize,
}
457
458impl<'input> TagNameSpan<'input> {
459    #[inline]
460    fn new_null() -> Self {
461        Self {
462            prefix: "",
463            name: "",
464            pos: 0,
465            prefix_pos: 0,
466        }
467    }
468}
469
/// An entity loop detector.
///
/// Limits:
/// - Entities depth is 10.
/// - Maximum number of entity references per entity reference is 255.
///
/// Basically, if a text or an attribute has an entity reference and this reference
/// has more than 10 nested references - this is an error.
///
/// This is useful for simple loops like:
///
/// ```text
/// <!ENTITY a '&b;'>
/// <!ENTITY b '&a;'>
/// ```
///
/// And, if a text or an attribute has an entity reference and it references more
/// than 255 references - this is an error.
///
/// This is useful for cases like billion laughs attack, where depth can be pretty small,
/// but the number of references is exponentially increasing:
///
/// ```text
/// <!ENTITY lol "lol">
/// <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
/// <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
/// <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
/// <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
/// ```
#[derive(Default)]
struct LoopDetector {
    /// References depth.
    depth: u8,
    /// Number of references resolved by the root reference.
    ///
    /// Only counted while `depth` is non-zero and reset to zero
    /// whenever `depth` drops back to zero.
    references: u8,
}
506
507impl LoopDetector {
508    #[inline]
509    fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
510        if self.depth < 10 {
511            self.depth += 1;
512            Ok(())
513        } else {
514            Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
515        }
516    }
517
518    #[inline]
519    fn dec_depth(&mut self) {
520        if self.depth > 0 {
521            self.depth -= 1;
522        }
523
524        // Reset references count after reaching zero depth.
525        if self.depth == 0 {
526            self.references = 0;
527        }
528    }
529
530    #[inline]
531    fn inc_references(&mut self, stream: &Stream) -> Result<()> {
532        if self.depth == 0 {
533            // Allow infinite amount of references at zero depth.
534            Ok(())
535        } else {
536            if self.references == u8::MAX {
537                return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
538            }
539
540            self.references += 1;
541            Ok(())
542        }
543    }
544}
545
/// Mutable state shared by the token handlers while a document is being built.
struct Context<'input> {
    /// Options the document is parsed with.
    opt: ParsingOptions<'input>,
    /// Length of the namespaces list recorded after the previous element was processed;
    /// used as the starting index when checking for duplicated namespace declarations.
    namespace_start_idx: usize,
    /// Regular (non-namespace) attributes collected for the tag being parsed.
    current_attributes: Vec<TempAttributeData<'input>>,
    /// Nodes whose `next_subtree` link is set once the next node is appended.
    awaiting_subtree: Vec<NodeId>,
    /// Stack of tag prefixes of the currently open elements;
    /// starts with a single empty prefix for the root node.
    parent_prefixes: Vec<&'input str>,
    /// Entities declared so far.
    entities: Vec<Entity<'input>>,
    /// Text fragments appended since the last non-text node;
    /// merged into a single text node when more than one is present.
    after_text: Vec<Cow<'input, str>>,
    /// Node that newly appended nodes become children of.
    parent_id: NodeId,
    /// Name of the most recently seen element start tag.
    tag_name: TagNameSpan<'input>,
    /// Guard against entity reference loops.
    loop_detector: LoopDetector,
    /// The document being built.
    doc: Document<'input>,
}
559
560impl<'input> Context<'input> {
561    fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
562        if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
563            return Err(Error::NodesLimitReached);
564        }
565
566        #[cfg(not(feature = "positions"))]
567        let _ = range;
568
569        let new_child_id = NodeId::from(self.doc.nodes.len());
570
571        let appending_element = matches!(kind, NodeKind::Element { .. });
572        self.doc.nodes.push(NodeData {
573            parent: Some(self.parent_id),
574            prev_sibling: None,
575            next_subtree: None,
576            last_child: None,
577            kind,
578            #[cfg(feature = "positions")]
579            range,
580        });
581
582        let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
583        self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
584        self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);
585
586        for id in &self.awaiting_subtree {
587            self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
588        }
589        self.awaiting_subtree.clear();
590
591        if !appending_element {
592            self.awaiting_subtree
593                .push(NodeId::from(self.doc.nodes.len() - 1));
594        }
595
596        Ok(new_child_id)
597    }
598
599    fn append_text(
600        &mut self,
601        text: Cow<'input, str>,
602        range: Range<usize>,
603    ) -> Result<()> {
604        if self.after_text.is_empty() {
605            let text = match &text {
606                Cow::Borrowed(text) => StringStorage::Borrowed(text),
607                Cow::Owned(text) => StringStorage::new_owned(text.as_str()),
608            };
609
610            self.append_node(NodeKind::Text(text), range)?;
611        }
612
613        self.after_text.push(text);
614        Ok(())
615    }
616
617    #[cold]
618    #[inline(never)]
619    fn merge_text(&mut self) {
620        let node = &mut self.doc.nodes.last_mut().unwrap();
621
622        let text = match &mut node.kind {
623            NodeKind::Text(text) => text,
624            _ => unreachable!(),
625        };
626
627        *text = StringStorage::new_owned(&self.after_text.join(""));
628    }
629
630    #[inline]
631    fn reset_after_text(&mut self) {
632        if self.after_text.is_empty() {
633            return;
634        }
635
636        if self.after_text.len() > 1 {
637            self.merge_text();
638        }
639
640        self.after_text.clear();
641    }
642}
643
644fn parse<'input>(text: &'input str, opt: ParsingOptions<'input>) -> Result<Document<'input>> {
645    // Trying to guess rough nodes and attributes amount.
646    let nodes_capacity = memchr_iter(b'<', text.as_bytes()).count();
647    let attributes_capacity = memchr_iter(b'=', text.as_bytes()).count();
648
649    // Init document.
650    let mut doc = Document {
651        text,
652        nodes: Vec::with_capacity(nodes_capacity),
653        attributes: Vec::with_capacity(attributes_capacity),
654        namespaces: Namespaces::default(),
655    };
656
657    // Add a root node.
658    doc.nodes.push(NodeData {
659        parent: None,
660        prev_sibling: None,
661        next_subtree: None,
662        last_child: None,
663        kind: NodeKind::Root,
664        #[cfg(feature = "positions")]
665        range: 0..text.len(),
666    });
667
668    doc.namespaces
669        .push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;
670
671    let allow_dtd = opt.allow_dtd;
672
673    let mut ctx = Context {
674        opt,
675        namespace_start_idx: 1,
676        current_attributes: Vec::with_capacity(16),
677        entities: Vec::new(),
678        awaiting_subtree: Vec::new(),
679        parent_prefixes: vec![""],
680        after_text: Vec::with_capacity(1),
681        parent_id: NodeId::new(0),
682        tag_name: TagNameSpan::new_null(),
683        loop_detector: LoopDetector::default(),
684        doc,
685    };
686
687    tokenizer::parse(text, allow_dtd, &mut ctx)?;
688
689    let mut doc = ctx.doc;
690    if !doc.root().children().any(|n| n.is_element()) {
691        return Err(Error::NoRootNode);
692    }
693
694    if ctx.parent_prefixes.len() > 1 {
695        return Err(Error::UnclosedRootNode);
696    }
697
698    doc.nodes.shrink_to_fit();
699    doc.attributes.shrink_to_fit();
700    doc.namespaces.shrink_to_fit();
701
702    Ok(doc)
703}
704
705impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
706    #[inline(always)]
707    fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
708        match token {
709            tokenizer::Token::ProcessingInstruction(target, value, range) => {
710                self.reset_after_text();
711                let pi = NodeKind::PI(PI { target, value });
712                self.append_node(pi, range)?;
713            }
714            tokenizer::Token::Comment(text, range) => {
715                self.reset_after_text();
716                self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
717            }
718            tokenizer::Token::EntityDeclaration(name, definition) => {
719                self.entities.push(Entity {
720                    name,
721                    value: definition,
722                });
723            }
724            tokenizer::Token::ElementStart(prefix, local, start) => {
725                self.reset_after_text();
726
727                if prefix == XMLNS {
728                    let pos = self.doc.text_pos_at(start + 1);
729                    return Err(Error::InvalidElementNamePrefix(pos));
730                }
731
732                self.tag_name = TagNameSpan {
733                    prefix,
734                    name: local,
735                    pos: start,
736                    prefix_pos: start + 1,
737                };
738            }
739            tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
740                process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
741            }
742            tokenizer::Token::ElementEnd(end, range) => {
743                self.reset_after_text();
744                process_element(end, range, self)?;
745            }
746            tokenizer::Token::Text(text, range) => {
747                process_text(text, range, self)?;
748            }
749            tokenizer::Token::Cdata(text, range) => {
750                process_cdata(text, range, self)?;
751            }
752        }
753
754        Ok(())
755    }
756
757    fn resolve_entity(&mut self, pub_id: Option<&str>, uri: &str) -> core::result::Result<Option<&'input str>, String> {
758        match &mut self.opt.entity_resolver {
759            Some(entity_resolver) => entity_resolver(pub_id, uri),
760            None => Ok(None),
761        }
762    }
763
764}
765
766#[allow(clippy::too_many_arguments)]
767fn process_attribute<'input>(
768    range: Range<usize>,
769    qname_len: u16,
770    eq_len: u8,
771    prefix: &'input str,
772    local: &'input str,
773    value: StrSpan<'input>,
774    ctx: &mut Context<'input>,
775) -> Result<()> {
776    let value = normalize_attribute(value, ctx)?;
777
778    if prefix == XMLNS {
779        // The xmlns namespace MUST NOT be declared as the default namespace.
780        if value.as_str() == NS_XMLNS_URI {
781            let pos = ctx.doc.text_pos_at(range.start);
782            return Err(Error::UnexpectedXmlnsUri(pos));
783        }
784
785        let is_xml_ns_uri = value.as_str() == NS_XML_URI;
786
787        // The prefix 'xml' is by definition bound to the namespace name
788        // http://www.w3.org/XML/1998/namespace.
789        // It MUST NOT be bound to any other namespace name.
790        if local == NS_XML_PREFIX {
791            if !is_xml_ns_uri {
792                let pos = ctx.doc.text_pos_at(range.start);
793                return Err(Error::InvalidXmlPrefixUri(pos));
794            }
795        } else {
796            // The xml namespace MUST NOT be bound to a non-xml prefix.
797            if is_xml_ns_uri {
798                let pos = ctx.doc.text_pos_at(range.start);
799                return Err(Error::UnexpectedXmlUri(pos));
800            }
801        }
802
803        // Check for duplicated namespaces.
804        if ctx
805            .doc
806            .namespaces
807            .exists(ctx.namespace_start_idx, Some(local))
808        {
809            let pos = ctx.doc.text_pos_at(range.start);
810            return Err(Error::DuplicatedNamespace(local.to_string(), pos));
811        }
812
813        // Xml namespace should not be added to the namespaces.
814        if !is_xml_ns_uri {
815            ctx.doc.namespaces.push_ns(Some(local), value)?;
816        }
817    } else if local == XMLNS {
818        // The xml namespace MUST NOT be declared as the default namespace.
819        if value.as_str() == NS_XML_URI {
820            let pos = ctx.doc.text_pos_at(range.start);
821            return Err(Error::UnexpectedXmlUri(pos));
822        }
823
824        // The xmlns namespace MUST NOT be declared as the default namespace.
825        if value.as_str() == NS_XMLNS_URI {
826            let pos = ctx.doc.text_pos_at(range.start);
827            return Err(Error::UnexpectedXmlnsUri(pos));
828        }
829
830        ctx.doc.namespaces.push_ns(None, value)?;
831    } else {
832        #[cfg(not(feature = "positions"))]
833        let _ = (qname_len, eq_len);
834
835        ctx.current_attributes.push(TempAttributeData {
836            prefix,
837            local,
838            value,
839            range,
840            #[cfg(feature = "positions")]
841            qname_len,
842            #[cfg(feature = "positions")]
843            eq_len,
844        });
845    }
846
847    Ok(())
848}
849
850fn process_element<'input>(
851    end_token: tokenizer::ElementEnd<'input>,
852    token_range: Range<usize>,
853    ctx: &mut Context<'input>,
854) -> Result<()> {
855    if ctx.tag_name.name.is_empty() {
856        // May occur in XML like this:
857        // <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
858        // <root>&p;</root>
859
860        if let tokenizer::ElementEnd::Close(..) = end_token {
861            return Err(Error::UnexpectedEntityCloseTag(
862                ctx.doc.text_pos_at(token_range.start),
863            ));
864        } else {
865            unreachable!("should be already checked by the tokenizer");
866        }
867    }
868
869    let namespaces = ctx.resolve_namespaces();
870    ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();
871
872    let attributes = resolve_attributes(namespaces, ctx)?;
873
874    match end_token {
875        tokenizer::ElementEnd::Empty => {
876            let tag_ns_idx = get_ns_idx_by_prefix(
877                namespaces,
878                ctx.tag_name.prefix_pos,
879                ctx.tag_name.prefix,
880                &ctx.doc,
881            )?;
882            let new_element_id = ctx.append_node(
883                NodeKind::Element {
884                    tag_name: ExpandedNameIndexed {
885                        namespace_idx: tag_ns_idx,
886                        local_name: ctx.tag_name.name,
887                    },
888                    attributes,
889                    namespaces,
890                },
891                ctx.tag_name.pos..token_range.end,
892            )?;
893            ctx.awaiting_subtree.push(new_element_id);
894        }
895        tokenizer::ElementEnd::Close(prefix, local) => {
896            let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
897            // should never panic as we start with the single prefix of the
898            // root node and always push another one when changing the parent
899            let parent_prefix = *ctx.parent_prefixes.last().unwrap();
900
901            #[cfg(feature = "positions")]
902            {
903                parent_node.range.end = token_range.end;
904            }
905
906            if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
907                if prefix != parent_prefix || local != tag_name.local_name {
908                    return Err(Error::UnexpectedCloseTag(
909                        gen_qname_string(parent_prefix, tag_name.local_name),
910                        gen_qname_string(prefix, local),
911                        ctx.doc.text_pos_at(token_range.start),
912                    ));
913                }
914            }
915            ctx.awaiting_subtree.push(ctx.parent_id);
916
917            if let Some(id) = parent_node.parent {
918                ctx.parent_id = id;
919                ctx.parent_prefixes.pop();
920                debug_assert!(!ctx.parent_prefixes.is_empty());
921            } else {
922                // May occur in XML like this:
923                // <!DOCTYPE test [ <!ENTITY p '<p></p></p>'> ]>
924                // <p>&p;&p;
925
926                return Err(Error::UnexpectedEntityCloseTag(
927                    ctx.doc.text_pos_at(token_range.start),
928                ));
929            }
930        }
931        tokenizer::ElementEnd::Open => {
932            let tag_ns_idx = get_ns_idx_by_prefix(
933                namespaces,
934                ctx.tag_name.prefix_pos,
935                ctx.tag_name.prefix,
936                &ctx.doc,
937            )?;
938            ctx.parent_id = ctx.append_node(
939                NodeKind::Element {
940                    tag_name: ExpandedNameIndexed {
941                        namespace_idx: tag_ns_idx,
942                        local_name: ctx.tag_name.name,
943                    },
944                    attributes,
945                    namespaces,
946                },
947                ctx.tag_name.pos..token_range.end,
948            )?;
949            ctx.parent_prefixes.push(ctx.tag_name.prefix);
950        }
951    }
952
953    Ok(())
954}
955
956impl Context<'_> {
957    fn resolve_namespaces(&mut self) -> ShortRange {
958        if let NodeKind::Element { ref namespaces, .. } =
959            self.doc.nodes[self.parent_id.get_usize()].kind
960        {
961            let parent_ns = *namespaces;
962            if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
963                return parent_ns;
964            }
965
966            for i in parent_ns.to_urange() {
967                if !self.doc.namespaces.exists(
968                    self.namespace_start_idx,
969                    self.doc
970                        .namespaces
971                        .get(self.doc.namespaces.tree_order[i])
972                        .name,
973                ) {
974                    self.doc.namespaces.push_ref(i);
975                }
976            }
977        }
978
979        (self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
980    }
981}
982
983fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
984    if ctx.current_attributes.is_empty() {
985        return Ok(ShortRange::new(0, 0));
986    }
987
988    if ctx.doc.attributes.len() + ctx.current_attributes.len() >= u32::MAX as usize {
989        return Err(Error::AttributesLimitReached);
990    }
991
992    let start_idx = ctx.doc.attributes.len();
993
994    for attr in ctx.current_attributes.drain(..) {
995        let namespace_idx = if attr.prefix == NS_XML_PREFIX {
996            // The prefix 'xml' is by definition bound to the namespace name
997            // http://www.w3.org/XML/1998/namespace. This namespace is added
998            // to the document on creation and is always element 0.
999            Some(NamespaceIdx(0))
1000        } else if attr.prefix.is_empty() {
1001            // 'The namespace name for an unprefixed attribute name
1002            // always has no value.'
1003            None
1004        } else {
1005            get_ns_idx_by_prefix(namespaces, attr.range.start, attr.prefix, &ctx.doc)?
1006        };
1007
1008        let attr_name = ExpandedNameIndexed {
1009            namespace_idx,
1010            local_name: attr.local,
1011        };
1012
1013        // Check for duplicated attributes.
1014        if ctx.doc.attributes[start_idx..].iter().any(|attr| {
1015            attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
1016        }) {
1017            let pos = ctx.doc.text_pos_at(attr.range.start);
1018            return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
1019        }
1020
1021        ctx.doc.attributes.push(AttributeData {
1022            name: attr_name,
1023            value: attr.value,
1024            #[cfg(feature = "positions")]
1025            range: attr.range,
1026            #[cfg(feature = "positions")]
1027            qname_len: attr.qname_len,
1028            #[cfg(feature = "positions")]
1029            eq_len: attr.eq_len,
1030        });
1031    }
1032
1033    Ok((start_idx..ctx.doc.attributes.len()).into())
1034}
1035
1036fn process_text<'input>(
1037    text: &'input str,
1038    range: Range<usize>,
1039    ctx: &mut Context<'input>,
1040) -> Result<()> {
1041    // Add text as is if it has only valid characters.
1042    if memchr2(b'&', b'\r', text.as_bytes()).is_none() {
1043        ctx.append_text(Cow::Borrowed(text), range)?;
1044        return Ok(());
1045    }
1046
1047    let mut text_buffer = TextBuffer::new();
1048    let mut is_as_is = false; // TODO: explain
1049    let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
1050    while !stream.at_end() {
1051        match parse_next_chunk(&mut stream, &ctx.entities)? {
1052            NextChunk::Byte(c) => {
1053                if is_as_is {
1054                    text_buffer.push_raw(c);
1055                    is_as_is = false;
1056                } else {
1057                    text_buffer.push_from_text(c, stream.at_end());
1058                }
1059            }
1060            NextChunk::Char(c) => {
1061                for b in CharToBytes::new(c) {
1062                    if ctx.loop_detector.depth > 0 {
1063                        text_buffer.push_from_text(b, stream.at_end());
1064                    } else {
1065                        // Characters not from entity should be added as is.
1066                        // Not sure why... At least `lxml` produces the same result.
1067                        text_buffer.push_raw(b);
1068                        is_as_is = true;
1069                    }
1070                }
1071            }
1072            NextChunk::Text(fragment) => {
1073                is_as_is = false;
1074
1075                if !text_buffer.is_empty() {
1076                    ctx.append_text(Cow::Owned(text_buffer.finish()), range.clone())?;
1077                }
1078
1079                ctx.loop_detector.inc_references(&stream)?;
1080                ctx.loop_detector.inc_depth(&stream)?;
1081
1082                let text = if fragment.range().start == 0 {
1083                    fragment.as_str()
1084                } else {
1085                    ctx.doc.text
1086                };
1087
1088                let mut stream = Stream::from_substr(text, fragment.range());
1089                let prev_tag_name = ctx.tag_name;
1090                ctx.tag_name = TagNameSpan::new_null();
1091                tokenizer::parse_content(&mut stream, ctx)?;
1092                ctx.tag_name = prev_tag_name;
1093                text_buffer.clear();
1094
1095                ctx.loop_detector.dec_depth();
1096            }
1097        }
1098    }
1099
1100    if !text_buffer.is_empty() {
1101        ctx.append_text(Cow::Owned(text_buffer.finish()), range)?;
1102    }
1103
1104    Ok(())
1105}
1106
1107// While the whole purpose of CDATA is to indicate to an XML library that this text
1108// has to be stored as is, carriage return (`\r`) is still has to be replaced with `\n`.
1109fn process_cdata<'input>(
1110    mut text: &'input str,
1111    range: Range<usize>,
1112    ctx: &mut Context<'input>,
1113) -> Result<()> {
1114    let mut pos = memchr(b'\r', text.as_bytes());
1115
1116    // Add text as is if it has only valid characters.
1117    if pos.is_none() {
1118        ctx.append_text(Cow::Borrowed(text), range)?;
1119        return Ok(());
1120    }
1121
1122    let mut buf = String::new();
1123
1124    while let Some(pos1) = pos {
1125        let (line, rest) = text.split_at(pos1);
1126
1127        buf.push_str(line);
1128        buf.push('\n');
1129
1130        text = if rest.as_bytes().get(1) == Some(&b'\n') {
1131            &rest[2..]
1132        } else {
1133            &rest[1..]
1134        };
1135
1136        pos = memchr(b'\r', text.as_bytes());
1137    }
1138
1139    buf.push_str(text);
1140
1141    ctx.append_text(Cow::Owned(buf), range)?;
1142    Ok(())
1143}
1144
1145enum NextChunk<'a> {
1146    Byte(u8),
1147    Char(char),
1148    Text(StrSpan<'a>),
1149}
1150
1151fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
1152    debug_assert!(!stream.at_end());
1153
1154    // Safe, because we already checked that stream is not at the end.
1155    // But we have an additional `debug_assert` above just in case.
1156    let c = stream.curr_byte_unchecked();
1157
1158    // Check for character/entity references.
1159    if c == b'&' {
1160        let start = stream.pos();
1161        match stream.consume_reference() {
1162            Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
1163            Some(Reference::Entity(name)) => entities
1164                .iter()
1165                .find(|e| e.name == name)
1166                .map(|e| NextChunk::Text(e.value))
1167                .ok_or_else(|| {
1168                    let pos = stream.gen_text_pos_from(start);
1169                    Error::UnknownEntityReference(name.into(), pos)
1170                }),
1171            None => {
1172                let pos = stream.gen_text_pos_from(start);
1173                Err(Error::MalformedEntityReference(pos))
1174            }
1175        }
1176    } else {
1177        stream.advance(1);
1178        Ok(NextChunk::Byte(c))
1179    }
1180}
1181
1182// https://www.w3.org/TR/REC-xml/#AVNormalize
1183fn normalize_attribute<'input>(
1184    text: StrSpan<'input>,
1185    ctx: &mut Context<'input>,
1186) -> Result<StringStorage<'input>> {
1187    // We assume that `&` indicates an entity or a character reference.
1188    // But in rare cases it can be just an another character.
1189    if memchr2(b'&', b'\t', text.as_str().as_bytes()).is_some() || memchr2(b'\n', b'\r', text.as_str().as_bytes()).is_some() {
1190        let mut text_buffer = TextBuffer::new();
1191        _normalize_attribute(text, &mut text_buffer, ctx)?;
1192        Ok(StringStorage::new_owned(&text_buffer.finish()))
1193    } else {
1194        Ok(StringStorage::Borrowed(text.as_str()))
1195    }
1196}
1197
1198fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
1199    let mut stream = Stream::from_substr(ctx.doc.text, text.range());
1200    while !stream.at_end() {
1201        // Safe, because we already checked that the stream is not at the end.
1202        let c = stream.curr_byte_unchecked();
1203
1204        if c != b'&' {
1205            stream.advance(1);
1206            buffer.push_from_attr(c, stream.curr_byte().ok());
1207            continue;
1208        }
1209
1210        // Check for character/entity references.
1211        let start = stream.pos();
1212        match stream.consume_reference() {
1213            Some(Reference::Char(ch)) => {
1214                for b in CharToBytes::new(ch) {
1215                    if ctx.loop_detector.depth > 0 {
1216                        // Escaped `<` inside an ENTITY is an error.
1217                        // Escaped `<` outside an ENTITY is ok.
1218                        if b == b'<' {
1219                            return Err(Error::InvalidAttributeValue(
1220                                stream.gen_text_pos_from(start),
1221                            ));
1222                        }
1223
1224                        buffer.push_from_attr(b, None);
1225                    } else {
1226                        // Characters not from entity should be added as is.
1227                        // Not sure why... At least `lxml` produces the same results.
1228                        buffer.push_raw(b);
1229                    }
1230                }
1231            }
1232            Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
1233                Some(entity) => {
1234                    ctx.loop_detector.inc_references(&stream)?;
1235                    ctx.loop_detector.inc_depth(&stream)?;
1236                    _normalize_attribute(entity.value, buffer, ctx)?;
1237                    ctx.loop_detector.dec_depth();
1238                }
1239                None => {
1240                    let pos = stream.gen_text_pos_from(start);
1241                    return Err(Error::UnknownEntityReference(name.into(), pos));
1242                }
1243            },
1244            None => {
1245                let pos = stream.gen_text_pos_from(start);
1246                return Err(Error::MalformedEntityReference(pos));
1247            }
1248        }
1249    }
1250
1251    Ok(())
1252}
1253
1254fn get_ns_idx_by_prefix(
1255    namespaces: ShortRange,
1256    prefix_pos: usize,
1257    prefix: &str,
1258    doc: &Document<'_>,
1259) -> Result<Option<NamespaceIdx>> {
1260    // Prefix CAN be empty when the default namespace was defined.
1261    //
1262    // Example:
1263    // <e xmlns='http://www.w3.org'/>
1264    let prefix_opt = if prefix.is_empty() {
1265        None
1266    } else {
1267        Some(prefix)
1268    };
1269
1270    let idx = doc.namespaces.tree_order[namespaces.to_urange()]
1271        .iter()
1272        .find(|idx| doc.namespaces.get(**idx).name == prefix_opt);
1273
1274    match idx {
1275        Some(idx) => Ok(Some(*idx)),
1276        None => {
1277            if !prefix.is_empty() {
1278                // If an URI was not found and prefix IS NOT empty than
1279                // we have an unknown namespace.
1280                //
1281                // Example:
1282                // <e random:a='b'/>
1283                let pos = doc.text_pos_at(prefix_pos);
1284                Err(Error::UnknownNamespace(prefix.to_string(), pos))
1285            } else {
1286                // If an URI was not found and prefix IS empty than
1287                // an element or an attribute doesn't have a namespace.
1288                //
1289                // Example:
1290                // <e a='b'/>
1291                Ok(None)
1292            }
1293        }
1294    }
1295}
1296
/// Builds a qualified name: `local` alone when `prefix` is empty,
/// `prefix:local` otherwise.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    match prefix {
        "" => local.to_string(),
        _ => {
            let mut qname = String::with_capacity(prefix.len() + 1 + local.len());
            qname.push_str(prefix);
            qname.push(':');
            qname.push_str(local);
            qname
        }
    }
}
1304
/// Iterate over the UTF-8 encoding of a `char` byte by byte.
struct CharToBytes {
    /// Storage for the encoded character; only the first `len` bytes are meaningful.
    buf: [u8; 4],
    /// Number of bytes the character occupies in UTF-8 (1 to 4).
    len: u8,
    /// Index of the next byte to yield.
    idx: u8,
}

impl CharToBytes {
    /// Encodes `c` as UTF-8 and positions the iterator at its first byte.
    #[inline]
    fn new(c: char) -> Self {
        let mut buf = [0; 4];
        let len = c.encode_utf8(&mut buf).len() as u8;

        CharToBytes { buf, len, idx: 0 }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    /// Yields the next encoded byte, or `None` once all bytes were returned.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let encoded = &self.buf[..usize::from(self.len)];
        let byte = *encoded.get(usize::from(self.idx))?;
        self.idx += 1;
        Some(byte)
    }
}
1340
/// A byte buffer used to assemble normalized text and attribute values.
struct TextBuffer {
    buffer: Vec<u8>,
}

impl TextBuffer {
    /// Creates an empty buffer with a small preallocated capacity.
    #[inline]
    fn new() -> Self {
        TextBuffer {
            buffer: Vec::with_capacity(32),
        }
    }

    /// Appends `c` without any normalization.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends a byte of an attribute value, applying whitespace normalization.
    ///
    /// `next` is the byte following `current`, if any.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        let normalized = match (current, next) {
            // \r in \r\n should be ignored.
            (b'\r', Some(b'\n')) => return,
            // \n, \r and \t should be converted into spaces.
            (b'\n', _) | (b'\r', _) | (b'\t', _) => b' ',
            (other, _) => other,
        };

        self.buffer.push(normalized);
    }

    // Translate \r\n and any \r that is not followed by \n into a single \n character.
    //
    // A `\r` is stored as is at first and rewritten once the next byte arrives;
    // `at_end` tells that there is no next byte, so a `\r` is converted right away.
    //
    // https://www.w3.org/TR/xml/#sec-line-ends
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        // A buffered \r always turns into \n, whatever follows it.
        let prev_was_cr = match self.buffer.last_mut() {
            Some(last) if *last == b'\r' => {
                *last = b'\n';
                true
            }
            _ => false,
        };

        // The \n of a \r\n pair is already represented by the rewritten \r.
        if prev_was_cr && c == b'\n' {
            return;
        }

        let out = if at_end && c == b'\r' { b'\n' } else { c };
        self.buffer.push(out);
    }

    /// Removes all buffered bytes.
    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    /// Returns `true` when nothing is buffered.
    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Takes the buffered bytes out as a `String`, leaving the buffer empty.
    #[inline]
    fn finish(&mut self) -> String {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        String::from_utf8(take(&mut self.buffer)).unwrap()
    }
}
roxmltree/parse.rs

roxmltree/
parse.rs