instant_xml/
de.rs

1use std::borrow::Cow;
2use std::collections::{BTreeMap, VecDeque};
3use std::str::{self, FromStr};
4
5use xmlparser::{ElementEnd, Token, Tokenizer};
6
7use crate::impls::CowStrAccumulator;
8use crate::{Error, Id};
9
/// Cursor over the nodes that belong to a single XML element.
///
/// The deserializer pulls nodes from the shared [`Context`] and stops once the close tag
/// matching its own element has been consumed.
pub struct Deserializer<'cx, 'xml> {
    /// Local (unprefixed) name of the element being read.
    pub(crate) local: &'xml str,
    /// Namespace prefix of the element being read, if it has one.
    prefix: Option<&'xml str>,
    /// Depth of the context's element stack at the time this deserializer was created.
    level: usize,
    /// Set once the matching close tag has been seen; no further nodes are yielded afterwards.
    done: bool,
    /// Parsing state shared with parent and nested deserializers.
    context: &'cx mut Context<'xml>,
}
17
18impl<'cx, 'xml> Deserializer<'cx, 'xml> {
19    pub(crate) fn new(element: Element<'xml>, context: &'cx mut Context<'xml>) -> Self {
20        let level = context.stack.len();
21        Self {
22            local: element.local,
23            prefix: element.prefix,
24            level,
25            done: false,
26            context,
27        }
28    }
29
30    pub fn take_str(&mut self) -> Result<Option<Cow<'xml, str>>, Error> {
31        loop {
32            match self.next() {
33                Some(Ok(Node::AttributeValue(s))) => return Ok(Some(s)),
34                Some(Ok(Node::Text(s))) => return Ok(Some(s)),
35                Some(Ok(Node::Attribute(_))) => continue,
36                Some(Ok(node)) => return Err(Error::ExpectedScalar(format!("{node:?}"))),
37                Some(Err(e)) => return Err(e),
38                None => return Ok(None),
39            }
40        }
41    }
42
43    pub fn nested<'a>(&'a mut self, element: Element<'xml>) -> Deserializer<'a, 'xml>
44    where
45        'cx: 'a,
46    {
47        Deserializer::new(element, self.context)
48    }
49
50    pub fn ignore(&mut self) -> Result<(), Error> {
51        loop {
52            match self.next() {
53                Some(Err(e)) => return Err(e),
54                Some(Ok(Node::Open(element))) => {
55                    let mut nested = self.nested(element);
56                    nested.ignore()?;
57                }
58                Some(_) => continue,
59                None => return Ok(()),
60            }
61        }
62    }
63
64    pub fn for_node<'a>(&'a mut self, node: Node<'xml>) -> Deserializer<'a, 'xml>
65    where
66        'cx: 'a,
67    {
68        self.context.records.push_front(node);
69        Deserializer {
70            local: self.local,
71            prefix: self.prefix,
72            level: self.level,
73            done: self.done,
74            context: self.context,
75        }
76    }
77
78    pub fn parent(&self) -> Id<'xml> {
79        Id {
80            ns: match self.prefix {
81                Some(ns) => self.context.lookup(ns).unwrap(),
82                None => self.context.default_ns(),
83            },
84            name: self.local,
85        }
86    }
87
88    #[inline]
89    pub fn element_id(&self, element: &Element<'xml>) -> Result<Id<'xml>, Error> {
90        self.context.element_id(element)
91    }
92
93    #[inline]
94    pub fn attribute_id(&self, attr: &Attribute<'xml>) -> Result<Id<'xml>, Error> {
95        self.context.attribute_id(attr)
96    }
97}
98
99impl<'xml> Iterator for Deserializer<'_, 'xml> {
100    type Item = Result<Node<'xml>, Error>;
101
102    fn next(&mut self) -> Option<Self::Item> {
103        if self.done {
104            return None;
105        }
106
107        let (prefix, local) = match self.context.next() {
108            Some(Ok(Node::Close { prefix, local })) => (prefix, local),
109            item => return item,
110        };
111
112        if self.context.stack.len() == self.level - 1
113            && local == self.local
114            && prefix == self.prefix
115        {
116            self.done = true;
117            return None;
118        }
119
120        Some(Err(Error::UnexpectedState("close element mismatch")))
121    }
122}
123
/// Parsing state shared by all deserializers working on one XML document.
pub(crate) struct Context<'xml> {
    /// Tokenizer producing the raw XML tokens.
    parser: Tokenizer<'xml>,
    /// One entry per element that has been started and not yet closed, outermost first.
    stack: Vec<Level<'xml>>,
    /// Nodes that are handed out before any further tokens are read from the parser.
    records: VecDeque<Node<'xml>>,
}
129
130impl<'xml> Context<'xml> {
131    pub(crate) fn new(input: &'xml str) -> Result<(Self, Element<'xml>), Error> {
132        let mut new = Self {
133            parser: Tokenizer::from(input),
134            stack: Vec::new(),
135            records: VecDeque::new(),
136        };
137
138        let root = match new.next() {
139            Some(result) => match result? {
140                Node::Open(element) => element,
141                _ => return Err(Error::UnexpectedState("first node does not open element")),
142            },
143            None => return Err(Error::UnexpectedEndOfStream),
144        };
145
146        Ok((new, root))
147    }
148
149    pub(crate) fn element_id(&self, element: &Element<'xml>) -> Result<Id<'xml>, Error> {
150        Ok(Id {
151            ns: match (element.default_ns, element.prefix) {
152                (_, Some(prefix)) => match self.lookup(prefix) {
153                    Some(ns) => ns,
154                    None => return Err(Error::UnknownPrefix(prefix.to_owned())),
155                },
156                (Some(ns), None) => ns,
157                (None, None) => self.default_ns(),
158            },
159            name: element.local,
160        })
161    }
162
163    fn attribute_id(&self, attr: &Attribute<'xml>) -> Result<Id<'xml>, Error> {
164        Ok(Id {
165            ns: match attr.prefix {
166                Some(ns) => self
167                    .lookup(ns)
168                    .ok_or_else(|| Error::UnknownPrefix(ns.to_owned()))?,
169                None => "",
170            },
171            name: attr.local,
172        })
173    }
174
175    fn default_ns(&self) -> &'xml str {
176        self.stack
177            .iter()
178            .rev()
179            .find_map(|level| level.default_ns)
180            .unwrap_or("")
181    }
182
183    fn lookup(&self, prefix: &str) -> Option<&'xml str> {
184        // The prefix xml is by definition bound to the namespace
185        // name http://www.w3.org/XML/1998/namespace
186        // See https://www.w3.org/TR/xml-names/#ns-decl
187        if prefix == "xml" {
188            return Some("http://www.w3.org/XML/1998/namespace");
189        }
190
191        self.stack
192            .iter()
193            .rev()
194            .find_map(|level| level.prefixes.get(prefix).copied())
195    }
196}
197
198impl<'xml> Iterator for Context<'xml> {
199    type Item = Result<Node<'xml>, Error>;
200
201    fn next(&mut self) -> Option<Self::Item> {
202        if let Some(record) = self.records.pop_front() {
203            if let Node::Close { .. } = &record {
204                self.stack.pop();
205            }
206            return Some(Ok(record));
207        }
208
209        loop {
210            match self.parser.next()? {
211                Ok(Token::ElementStart { prefix, local, .. }) => {
212                    let prefix = prefix.as_str();
213                    self.stack.push(Level {
214                        local: local.as_str(),
215                        prefix: match prefix.is_empty() {
216                            true => None,
217                            false => Some(prefix),
218                        },
219                        default_ns: None,
220                        prefixes: BTreeMap::new(),
221                    });
222                }
223                Ok(Token::ElementEnd { end, .. }) => match end {
224                    ElementEnd::Open => {
225                        let level = match self.stack.last() {
226                            Some(level) => level,
227                            None => {
228                                return Some(Err(Error::UnexpectedState(
229                                    "opening element with no parent",
230                                )))
231                            }
232                        };
233
234                        let element = Element {
235                            local: level.local,
236                            prefix: level.prefix,
237                            default_ns: level.default_ns,
238                        };
239
240                        return Some(Ok(Node::Open(element)));
241                    }
242                    ElementEnd::Close(prefix, v) => {
243                        let level = match self.stack.pop() {
244                            Some(level) => level,
245                            None => {
246                                return Some(Err(Error::UnexpectedState(
247                                    "closing element without parent",
248                                )))
249                            }
250                        };
251
252                        let prefix = match prefix.is_empty() {
253                            true => None,
254                            false => Some(prefix.as_str()),
255                        };
256
257                        match v.as_str() == level.local && prefix == level.prefix {
258                            true => {
259                                return Some(Ok(Node::Close {
260                                    prefix,
261                                    local: level.local,
262                                }))
263                            }
264                            false => {
265                                return Some(Err(Error::UnexpectedState("close element mismatch")))
266                            }
267                        }
268                    }
269                    ElementEnd::Empty => {
270                        let level = match self.stack.last() {
271                            Some(level) => level,
272                            None => {
273                                return Some(Err(Error::UnexpectedState(
274                                    "opening element with no parent",
275                                )))
276                            }
277                        };
278
279                        self.records.push_back(Node::Close {
280                            prefix: level.prefix,
281                            local: level.local,
282                        });
283
284                        let element = Element {
285                            local: level.local,
286                            prefix: level.prefix,
287                            default_ns: level.default_ns,
288                        };
289
290                        return Some(Ok(Node::Open(element)));
291                    }
292                },
293                Ok(Token::Attribute {
294                    prefix,
295                    local,
296                    value,
297                    ..
298                }) => {
299                    if prefix.is_empty() && local.as_str() == "xmlns" {
300                        match self.stack.last_mut() {
301                            Some(level) => level.default_ns = Some(value.as_str()),
302                            None => {
303                                return Some(Err(Error::UnexpectedState(
304                                    "attribute without element context",
305                                )))
306                            }
307                        }
308                    } else if prefix.as_str() == "xmlns" {
309                        match self.stack.last_mut() {
310                            Some(level) => {
311                                level.prefixes.insert(local.as_str(), value.as_str());
312                            }
313                            None => {
314                                return Some(Err(Error::UnexpectedState(
315                                    "attribute without element context",
316                                )))
317                            }
318                        }
319                    } else {
320                        let value = match decode(value.as_str()) {
321                            Ok(value) => value,
322                            Err(e) => return Some(Err(e)),
323                        };
324
325                        self.records.push_back(Node::Attribute(Attribute {
326                            prefix: match prefix.is_empty() {
327                                true => None,
328                                false => Some(prefix.as_str()),
329                            },
330                            local: local.as_str(),
331                            value,
332                        }));
333                    }
334                }
335                Ok(Token::Text { text }) => {
336                    return Some(decode(text.as_str()).map(Node::Text));
337                }
338                Ok(Token::Cdata { text, .. }) => {
339                    return Some(Ok(Node::Text(Cow::Borrowed(text.as_str()))));
340                }
341                Ok(token @ Token::Declaration { .. }) => {
342                    if !self.stack.is_empty() {
343                        return Some(Err(Error::UnexpectedToken(format!("{token:?}"))));
344                    }
345                }
346                Ok(Token::Comment { .. }) => continue,
347                Ok(token) => return Some(Err(Error::UnexpectedToken(format!("{token:?}")))),
348                Err(e) => return Some(Err(Error::Parse(e))),
349            }
350        }
351    }
352}
353
354pub fn borrow_cow_str<'a, 'xml: 'a>(
355    into: &mut CowStrAccumulator<'xml, 'a>,
356    field: &'static str,
357    deserializer: &mut Deserializer<'_, 'xml>,
358) -> Result<(), Error> {
359    if into.inner.is_some() {
360        return Err(Error::DuplicateValue(field));
361    }
362
363    match deserializer.take_str()? {
364        Some(value) => into.inner = Some(value),
365        None => return Ok(()),
366    };
367
368    deserializer.ignore()?;
369    Ok(())
370}
371
372pub fn borrow_cow_slice_u8<'xml>(
373    into: &mut Option<Cow<'xml, [u8]>>,
374    field: &'static str,
375    deserializer: &mut Deserializer<'_, 'xml>,
376) -> Result<(), Error> {
377    if into.is_some() {
378        return Err(Error::DuplicateValue(field));
379    }
380
381    if let Some(value) = deserializer.take_str()? {
382        *into = Some(match value {
383            Cow::Borrowed(v) => Cow::Borrowed(v.as_bytes()),
384            Cow::Owned(v) => Cow::Owned(v.into_bytes()),
385        });
386    }
387
388    deserializer.ignore()?;
389    Ok(())
390}
391
392fn decode(input: &str) -> Result<Cow<'_, str>, Error> {
393    let mut result = String::with_capacity(input.len());
394    let (mut state, mut last_end) = (DecodeState::Normal, 0);
395    for (i, &b) in input.as_bytes().iter().enumerate() {
396        // use a state machine to find entities
397        state = match (state, b) {
398            (DecodeState::Normal, b'&') => DecodeState::Entity([0; 6], 0),
399            (DecodeState::Normal, _) => DecodeState::Normal,
400            (DecodeState::Entity(chars, len), b';') => {
401                let decoded = match &chars[..len] {
402                    [b'a', b'm', b'p'] => '&',
403                    [b'a', b'p', b'o', b's'] => '\'',
404                    [b'g', b't'] => '>',
405                    [b'l', b't'] => '<',
406                    [b'q', b'u', b'o', b't'] => '"',
407                    [b'#', b'x' | b'X', hex @ ..] => {
408                        // Hexadecimal character reference e.g. "&#x007c;" -> '|'
409                        str::from_utf8(hex)
410                            .ok()
411                            .and_then(|hex_str| u32::from_str_radix(hex_str, 16).ok())
412                            .and_then(char::from_u32)
413                            .filter(valid_xml_character)
414                            .ok_or_else(|| {
415                                Error::InvalidEntity(
416                                    String::from_utf8_lossy(&chars[..len]).into_owned(),
417                                )
418                            })?
419                    }
420                    [b'#', decimal @ ..] => {
421                        // Decimal character reference e.g. "&#1234;" -> 'Ӓ'
422                        str::from_utf8(decimal)
423                            .ok()
424                            .and_then(|decimal_str| u32::from_str(decimal_str).ok())
425                            .and_then(char::from_u32)
426                            .filter(valid_xml_character)
427                            .ok_or_else(|| {
428                                Error::InvalidEntity(
429                                    String::from_utf8_lossy(&chars[..len]).into_owned(),
430                                )
431                            })?
432                    }
433                    _ => {
434                        return Err(Error::InvalidEntity(
435                            String::from_utf8_lossy(&chars[..len]).into_owned(),
436                        ))
437                    }
438                };
439
440                let start = i - (len + 1); // current position - (length of entity characters + 1 for '&')
441                if last_end < start {
442                    // Unwrap should be safe: `last_end` and `start` must be at character boundaries.
443                    result.push_str(input.get(last_end..start).unwrap());
444                }
445
446                last_end = i + 1;
447                result.push(decoded);
448                DecodeState::Normal
449            }
450            (DecodeState::Entity(mut chars, len), b) => {
451                if len >= 6 {
452                    let mut bytes = Vec::with_capacity(7);
453                    bytes.extend(&chars[..len]);
454                    bytes.push(b);
455                    return Err(Error::InvalidEntity(
456                        String::from_utf8_lossy(&bytes).into_owned(),
457                    ));
458                }
459
460                chars[len] = b;
461                DecodeState::Entity(chars, len + 1)
462            }
463        };
464    }
465
466    // Unterminated entity (& without ;) at end of input
467    if let DecodeState::Entity(chars, len) = state {
468        return Err(Error::InvalidEntity(
469            String::from_utf8_lossy(&chars[..len]).into_owned(),
470        ));
471    }
472
473    Ok(match result.is_empty() {
474        true => Cow::Borrowed(input),
475        false => {
476            // Unwrap should be safe: `last_end` and `input.len()` must be at character boundaries.
477            result.push_str(input.get(last_end..input.len()).unwrap());
478            Cow::Owned(result)
479        }
480    })
481}
482
483#[derive(Debug)]
484enum DecodeState {
485    Normal,
486    Entity([u8; 6], usize),
487}
488
489/// Valid character ranges per <https://www.w3.org/TR/xml/#NT-Char>
490fn valid_xml_character(c: &char) -> bool {
491    matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}')
492}
493
/// A single item produced while reading an XML document.
#[derive(Debug)]
pub enum Node<'xml> {
    /// An attribute of the most recently opened element.
    Attribute(Attribute<'xml>),
    /// A bare attribute value; treated as a scalar by `Deserializer::take_str`.
    AttributeValue(Cow<'xml, str>),
    /// The end of an element, carrying its prefix and local name.
    Close {
        prefix: Option<&'xml str>,
        local: &'xml str,
    },
    /// Character data, from either a text or a CDATA section.
    Text(Cow<'xml, str>),
    /// The start of an element.
    Open(Element<'xml>),
}
505
/// Name and namespace information of an element that has just been opened.
#[derive(Debug)]
pub struct Element<'xml> {
    /// Local (unprefixed) name.
    local: &'xml str,
    /// Default namespace declared on this element itself via `xmlns="..."`, if any.
    default_ns: Option<&'xml str>,
    /// Namespace prefix, if the element name has one.
    prefix: Option<&'xml str>,
}
512
/// Bookkeeping for one element on the context's stack of open elements.
#[derive(Debug)]
struct Level<'xml> {
    /// Local (unprefixed) name of the element.
    local: &'xml str,
    /// Namespace prefix of the element, if any.
    prefix: Option<&'xml str>,
    /// Default namespace declared on this element via `xmlns="..."`, if any.
    default_ns: Option<&'xml str>,
    /// Prefix-to-namespace bindings declared on this element via `xmlns:prefix="..."`.
    prefixes: BTreeMap<&'xml str, &'xml str>,
}
520
/// An attribute of an element, other than a namespace declaration.
#[derive(Debug)]
pub struct Attribute<'xml> {
    /// Namespace prefix of the attribute name, if any.
    pub prefix: Option<&'xml str>,
    /// Local (unprefixed) attribute name.
    pub local: &'xml str,
    /// Attribute value with entity and character references already decoded.
    pub value: Cow<'xml, str>,
}
527
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises `decode` with inputs that must decode successfully and inputs that must fail.
    #[test]
    fn test_decode() {
        // Predefined entities, including multi-byte text around them
        decode_ok("foo", "foo");
        decode_ok("foo &amp; bar", "foo & bar");
        decode_ok("foo &lt; bar", "foo < bar");
        decode_ok("foo &gt; bar", "foo > bar");
        decode_ok("foo &quot; bar", "foo \" bar");
        decode_ok("foo &apos; bar", "foo ' bar");
        decode_ok("foo &amp;lt; bar", "foo &lt; bar");
        decode_ok("&amp; foo", "& foo");
        decode_ok("foo &amp;", "foo &");
        decode_ok("cbdtéda&amp;sü", "cbdtéda&sü");
        // Decimal character references
        decode_ok("&#1234;", "Ӓ");
        decode_ok("foo &#9; bar", "foo \t bar");
        decode_ok("foo &#124; bar", "foo | bar");
        decode_ok("foo &#1234; bar", "foo Ӓ bar");
        // Hexadecimal character references
        decode_ok("&#xc4;", "Ä");
        decode_ok("&#x00c4;", "Ä");
        decode_ok("foo &#x9; bar", "foo \t bar");
        decode_ok("foo &#x007c; bar", "foo | bar");
        decode_ok("foo &#xc4; bar", "foo Ä bar");
        decode_ok("foo &#x00c4; bar", "foo Ä bar");
        decode_ok("foo &#x10de; bar", "foo პ bar");

        // Unterminated, empty and unknown references
        decode_err("&");
        decode_err("&#");
        decode_err("&#;");
        decode_err("foo&");
        decode_err("&bar");
        decode_err("&foo;");
        decode_err("&foobar;");
        decode_err("cbdtéd&ampü");
    }

    /// Asserts that decoding `input` succeeds and yields `expected`.
    fn decode_ok(input: &str, expected: &'static str) {
        assert_eq!(super::decode(input).unwrap(), expected, "{input:?}");
    }

    /// Asserts that decoding `input` fails.
    fn decode_err(input: &str) {
        assert!(super::decode(input).is_err(), "{input:?}");
    }
}