instant_xml/
de.rs

1use std::borrow::Cow;
2use std::collections::{BTreeMap, VecDeque};
3use std::str::{self, FromStr};
4
5use xmlparser::{ElementEnd, Token, Tokenizer};
6
7use crate::impls::CowStrAccumulator;
8use crate::{Error, Id};
9
10pub struct Deserializer<'cx, 'xml> {
11    pub(crate) local: &'xml str,
12    prefix: Option<&'xml str>,
13    level: usize,
14    done: bool,
15    context: &'cx mut Context<'xml>,
16}
17
18impl<'cx, 'xml> Deserializer<'cx, 'xml> {
19    pub(crate) fn new(element: Element<'xml>, context: &'cx mut Context<'xml>) -> Self {
20        let level = context.stack.len();
21        if !element.empty {
22            context.stack.push(element.level);
23        }
24
25        Self {
26            local: element.local,
27            prefix: element.prefix,
28            level,
29            done: false,
30            context,
31        }
32    }
33
34    pub fn take_str(&mut self) -> Result<Option<Cow<'xml, str>>, Error> {
35        loop {
36            match self.next() {
37                Some(Ok(Node::AttributeValue(s))) => return Ok(Some(s)),
38                Some(Ok(Node::Text(s))) => return Ok(Some(s)),
39                Some(Ok(Node::Attribute(_))) => continue,
40                Some(Ok(node)) => return Err(Error::ExpectedScalar(format!("{node:?}"))),
41                Some(Err(e)) => return Err(e),
42                None => return Ok(None),
43            }
44        }
45    }
46
47    pub fn nested<'a>(&'a mut self, element: Element<'xml>) -> Deserializer<'a, 'xml>
48    where
49        'cx: 'a,
50    {
51        Deserializer::new(element, self.context)
52    }
53
54    pub fn ignore(&mut self) -> Result<(), Error> {
55        loop {
56            match self.next() {
57                Some(Err(e)) => return Err(e),
58                Some(Ok(Node::Open(element))) => {
59                    let mut nested = self.nested(element);
60                    nested.ignore()?;
61                }
62                Some(_) => continue,
63                None => return Ok(()),
64            }
65        }
66    }
67
68    pub fn for_node<'a>(&'a mut self, node: Node<'xml>) -> Deserializer<'a, 'xml>
69    where
70        'cx: 'a,
71    {
72        self.context.records.push_front(node);
73        Deserializer {
74            local: self.local,
75            prefix: self.prefix,
76            level: self.level,
77            done: self.done,
78            context: self.context,
79        }
80    }
81
82    pub fn parent(&self) -> Id<'xml> {
83        Id {
84            ns: match self.prefix {
85                Some(ns) => self.context.lookup(ns).unwrap(),
86                None => self.context.default_ns(),
87            },
88            name: self.local,
89        }
90    }
91
92    #[inline]
93    pub fn element_id(&self, element: &Element<'xml>) -> Result<Id<'xml>, Error> {
94        self.context.element_id(element)
95    }
96
97    #[inline]
98    pub fn attribute_id(&self, attr: &Attribute<'xml>) -> Result<Id<'xml>, Error> {
99        self.context.attribute_id(attr)
100    }
101}
102
103impl<'xml> Iterator for Deserializer<'_, 'xml> {
104    type Item = Result<Node<'xml>, Error>;
105
106    fn next(&mut self) -> Option<Self::Item> {
107        if self.done {
108            return None;
109        }
110
111        let (prefix, local) = match self.context.next() {
112            Some(Ok(Node::Close { prefix, local })) => (prefix, local),
113            item => return item,
114        };
115
116        if self.context.stack.len() == self.level && local == self.local && prefix == self.prefix {
117            self.done = true;
118            return None;
119        }
120
121        Some(Err(Error::UnexpectedState("close element mismatch")))
122    }
123}
124
125pub(crate) struct Context<'xml> {
126    parser: Tokenizer<'xml>,
127    stack: Vec<Level<'xml>>,
128    records: VecDeque<Node<'xml>>,
129}
130
131impl<'xml> Context<'xml> {
132    pub(crate) fn new(input: &'xml str) -> Result<(Self, Element<'xml>), Error> {
133        let mut new = Self {
134            parser: Tokenizer::from(input),
135            stack: Vec::new(),
136            records: VecDeque::new(),
137        };
138
139        let root = match new.next() {
140            Some(result) => match result? {
141                Node::Open(element) => element,
142                _ => return Err(Error::UnexpectedState("first node does not open element")),
143            },
144            None => return Err(Error::UnexpectedEndOfStream),
145        };
146
147        Ok((new, root))
148    }
149
150    pub(crate) fn element_id(&self, element: &Element<'xml>) -> Result<Id<'xml>, Error> {
151        Ok(Id {
152            ns: match (element.default_ns, element.prefix) {
153                (_, Some(prefix)) => match element.level.prefixes.get(prefix) {
154                    Some(ns) => ns,
155                    None => match self.lookup(prefix) {
156                        Some(ns) => ns,
157                        None => return Err(Error::UnknownPrefix(prefix.to_owned())),
158                    },
159                },
160                (Some(ns), None) => ns,
161                (None, None) => self.default_ns(),
162            },
163            name: element.local,
164        })
165    }
166
167    fn attribute_id(&self, attr: &Attribute<'xml>) -> Result<Id<'xml>, Error> {
168        Ok(Id {
169            ns: match attr.prefix {
170                Some(ns) => self
171                    .lookup(ns)
172                    .ok_or_else(|| Error::UnknownPrefix(ns.to_owned()))?,
173                None => "",
174            },
175            name: attr.local,
176        })
177    }
178
179    fn default_ns(&self) -> &'xml str {
180        self.stack
181            .iter()
182            .rev()
183            .find_map(|level| level.default_ns)
184            .unwrap_or("")
185    }
186
187    fn lookup(&self, prefix: &str) -> Option<&'xml str> {
188        // The prefix xml is by definition bound to the namespace
189        // name http://www.w3.org/XML/1998/namespace
190        // See https://www.w3.org/TR/xml-names/#ns-decl
191        if prefix == "xml" {
192            return Some("http://www.w3.org/XML/1998/namespace");
193        }
194
195        self.stack
196            .iter()
197            .rev()
198            .find_map(|level| level.prefixes.get(prefix).copied())
199    }
200}
201
202impl<'xml> Iterator for Context<'xml> {
203    type Item = Result<Node<'xml>, Error>;
204
205    fn next(&mut self) -> Option<Self::Item> {
206        if let Some(record) = self.records.pop_front() {
207            return Some(Ok(record));
208        }
209
210        let mut current = None;
211        loop {
212            let token = match self.parser.next() {
213                Some(v) => v,
214                None => return None,
215            };
216
217            match token {
218                Ok(Token::ElementStart { prefix, local, .. }) => {
219                    let prefix = prefix.as_str();
220                    current = Some(Level {
221                        local: local.as_str(),
222                        prefix: match prefix.is_empty() {
223                            true => None,
224                            false => Some(prefix),
225                        },
226                        default_ns: None,
227                        prefixes: BTreeMap::new(),
228                    });
229                }
230                Ok(Token::ElementEnd { end, .. }) => match end {
231                    ElementEnd::Open => {
232                        let level = match current {
233                            Some(level) => level,
234                            None => {
235                                return Some(Err(Error::UnexpectedState(
236                                    "opening element with no parent",
237                                )))
238                            }
239                        };
240
241                        let element = Element {
242                            local: level.local,
243                            prefix: level.prefix,
244                            default_ns: level.default_ns,
245                            level,
246                            empty: false,
247                        };
248
249                        return Some(Ok(Node::Open(element)));
250                    }
251                    ElementEnd::Close(prefix, v) => {
252                        let level = match self.stack.pop() {
253                            Some(level) => level,
254                            None => {
255                                return Some(Err(Error::UnexpectedState(
256                                    "closing element without parent",
257                                )))
258                            }
259                        };
260
261                        let prefix = match prefix.is_empty() {
262                            true => None,
263                            false => Some(prefix.as_str()),
264                        };
265
266                        match v.as_str() == level.local && prefix == level.prefix {
267                            true => {
268                                return Some(Ok(Node::Close {
269                                    prefix,
270                                    local: level.local,
271                                }))
272                            }
273                            false => {
274                                return Some(Err(Error::UnexpectedState("close element mismatch")))
275                            }
276                        }
277                    }
278                    ElementEnd::Empty => {
279                        let level = match current {
280                            Some(level) => level,
281                            None => {
282                                return Some(Err(Error::UnexpectedState(
283                                    "opening element with no parent",
284                                )))
285                            }
286                        };
287
288                        self.records.push_back(Node::Close {
289                            prefix: level.prefix,
290                            local: level.local,
291                        });
292
293                        let element = Element {
294                            local: level.local,
295                            prefix: level.prefix,
296                            default_ns: level.default_ns,
297                            level,
298                            empty: true,
299                        };
300
301                        return Some(Ok(Node::Open(element)));
302                    }
303                },
304                Ok(Token::Attribute {
305                    prefix,
306                    local,
307                    value,
308                    ..
309                }) => {
310                    if prefix.is_empty() && local.as_str() == "xmlns" {
311                        match &mut current {
312                            Some(level) => level.default_ns = Some(value.as_str()),
313                            None => {
314                                return Some(Err(Error::UnexpectedState(
315                                    "attribute without element context",
316                                )))
317                            }
318                        }
319                    } else if prefix.as_str() == "xmlns" {
320                        match &mut current {
321                            Some(level) => {
322                                level.prefixes.insert(local.as_str(), value.as_str());
323                            }
324                            None => {
325                                return Some(Err(Error::UnexpectedState(
326                                    "attribute without element context",
327                                )))
328                            }
329                        }
330                    } else {
331                        let value = match decode(value.as_str()) {
332                            Ok(value) => value,
333                            Err(e) => return Some(Err(e)),
334                        };
335
336                        self.records.push_back(Node::Attribute(Attribute {
337                            prefix: match prefix.is_empty() {
338                                true => None,
339                                false => Some(prefix.as_str()),
340                            },
341                            local: local.as_str(),
342                            value,
343                        }));
344                    }
345                }
346                Ok(Token::Text { text }) => {
347                    return Some(decode(text.as_str()).map(Node::Text));
348                }
349                Ok(Token::Cdata { text, .. }) => {
350                    return Some(Ok(Node::Text(Cow::Borrowed(text.as_str()))));
351                }
352                Ok(Token::Declaration { .. }) => {
353                    if !self.stack.is_empty() {
354                        return Some(Err(Error::UnexpectedToken(format!("{token:?}"))));
355                    }
356                }
357                Ok(Token::Comment { .. }) => continue,
358                Ok(token) => return Some(Err(Error::UnexpectedToken(format!("{token:?}")))),
359                Err(e) => return Some(Err(Error::Parse(e))),
360            }
361        }
362    }
363}
364
365pub fn borrow_cow_str<'a, 'xml: 'a>(
366    into: &mut CowStrAccumulator<'xml, 'a>,
367    field: &'static str,
368    deserializer: &mut Deserializer<'_, 'xml>,
369) -> Result<(), Error> {
370    if into.inner.is_some() {
371        return Err(Error::DuplicateValue(field));
372    }
373
374    match deserializer.take_str()? {
375        Some(value) => into.inner = Some(value),
376        None => return Ok(()),
377    };
378
379    deserializer.ignore()?;
380    Ok(())
381}
382
383pub fn borrow_cow_slice_u8<'xml>(
384    into: &mut Option<Cow<'xml, [u8]>>,
385    field: &'static str,
386    deserializer: &mut Deserializer<'_, 'xml>,
387) -> Result<(), Error> {
388    if into.is_some() {
389        return Err(Error::DuplicateValue(field));
390    }
391
392    if let Some(value) = deserializer.take_str()? {
393        *into = Some(match value {
394            Cow::Borrowed(v) => Cow::Borrowed(v.as_bytes()),
395            Cow::Owned(v) => Cow::Owned(v.into_bytes()),
396        });
397    }
398
399    deserializer.ignore()?;
400    Ok(())
401}
402
403fn decode(input: &str) -> Result<Cow<'_, str>, Error> {
404    let mut result = String::with_capacity(input.len());
405    let (mut state, mut last_end) = (DecodeState::Normal, 0);
406    for (i, &b) in input.as_bytes().iter().enumerate() {
407        // use a state machine to find entities
408        state = match (state, b) {
409            (DecodeState::Normal, b'&') => DecodeState::Entity([0; 6], 0),
410            (DecodeState::Normal, _) => DecodeState::Normal,
411            (DecodeState::Entity(chars, len), b';') => {
412                let decoded = match &chars[..len] {
413                    [b'a', b'm', b'p'] => '&',
414                    [b'a', b'p', b'o', b's'] => '\'',
415                    [b'g', b't'] => '>',
416                    [b'l', b't'] => '<',
417                    [b'q', b'u', b'o', b't'] => '"',
418                    [b'#', b'x' | b'X', hex @ ..] => {
419                        // Hexadecimal character reference e.g. "&#x007c;" -> '|'
420                        str::from_utf8(hex)
421                            .ok()
422                            .and_then(|hex_str| u32::from_str_radix(hex_str, 16).ok())
423                            .and_then(char::from_u32)
424                            .filter(valid_xml_character)
425                            .ok_or_else(|| {
426                                Error::InvalidEntity(
427                                    String::from_utf8_lossy(&chars[..len]).into_owned(),
428                                )
429                            })?
430                    }
431                    [b'#', decimal @ ..] => {
432                        // Decimal character reference e.g. "&#1234;" -> 'Ӓ'
433                        str::from_utf8(decimal)
434                            .ok()
435                            .and_then(|decimal_str| u32::from_str(decimal_str).ok())
436                            .and_then(char::from_u32)
437                            .filter(valid_xml_character)
438                            .ok_or_else(|| {
439                                Error::InvalidEntity(
440                                    String::from_utf8_lossy(&chars[..len]).into_owned(),
441                                )
442                            })?
443                    }
444                    _ => {
445                        return Err(Error::InvalidEntity(
446                            String::from_utf8_lossy(&chars[..len]).into_owned(),
447                        ))
448                    }
449                };
450
451                let start = i - (len + 1); // current position - (length of entity characters + 1 for '&')
452                if last_end < start {
453                    // Unwrap should be safe: `last_end` and `start` must be at character boundaries.
454                    result.push_str(input.get(last_end..start).unwrap());
455                }
456
457                last_end = i + 1;
458                result.push(decoded);
459                DecodeState::Normal
460            }
461            (DecodeState::Entity(mut chars, len), b) => {
462                if len >= 6 {
463                    let mut bytes = Vec::with_capacity(7);
464                    bytes.extend(&chars[..len]);
465                    bytes.push(b);
466                    return Err(Error::InvalidEntity(
467                        String::from_utf8_lossy(&bytes).into_owned(),
468                    ));
469                }
470
471                chars[len] = b;
472                DecodeState::Entity(chars, len + 1)
473            }
474        };
475    }
476
477    // Unterminated entity (& without ;) at end of input
478    if let DecodeState::Entity(chars, len) = state {
479        return Err(Error::InvalidEntity(
480            String::from_utf8_lossy(&chars[..len]).into_owned(),
481        ));
482    }
483
484    Ok(match result.is_empty() {
485        true => Cow::Borrowed(input),
486        false => {
487            // Unwrap should be safe: `last_end` and `input.len()` must be at character boundaries.
488            result.push_str(input.get(last_end..input.len()).unwrap());
489            Cow::Owned(result)
490        }
491    })
492}
493
494#[derive(Debug)]
495enum DecodeState {
496    Normal,
497    Entity([u8; 6], usize),
498}
499
/// Reports whether `c` lies in one of the character ranges XML permits.
///
/// Valid character ranges per <https://www.w3.org/TR/xml/#NT-Char>
fn valid_xml_character(c: &char) -> bool {
    match u32::from(*c) {
        // Tab, line feed and carriage return are the only permitted control characters.
        0x9 | 0xA | 0xD => true,
        0x20..=0xD7FF | 0xE000..=0xFFFD | 0x1_0000..=0x10_FFFF => true,
        _ => false,
    }
}
504
505#[derive(Debug)]
506pub enum Node<'xml> {
507    Attribute(Attribute<'xml>),
508    AttributeValue(Cow<'xml, str>),
509    Close {
510        prefix: Option<&'xml str>,
511        local: &'xml str,
512    },
513    Text(Cow<'xml, str>),
514    Open(Element<'xml>),
515}
516
517#[derive(Debug)]
518pub struct Element<'xml> {
519    local: &'xml str,
520    default_ns: Option<&'xml str>,
521    prefix: Option<&'xml str>,
522    level: Level<'xml>,
523    empty: bool,
524}
525
/// Namespace scope of one element: its name plus the declarations made on it.
#[derive(Debug)]
struct Level<'xml> {
    /// Local name of the element that introduced this scope.
    local: &'xml str,
    /// Namespace prefix of that element, if any.
    prefix: Option<&'xml str>,
    /// Default namespace declared on the element (`xmlns="…"`), if any.
    default_ns: Option<&'xml str>,
    /// Prefix-to-namespace bindings declared on the element (`xmlns:p="…"`).
    prefixes: BTreeMap<&'xml str, &'xml str>,
}
533
/// An attribute as it appears on an element, with its value already decoded.
#[derive(Debug)]
pub struct Attribute<'xml> {
    /// Namespace prefix of the attribute, if it was written with one.
    pub prefix: Option<&'xml str>,
    /// Local name of the attribute.
    pub local: &'xml str,
    /// Attribute value with entity references resolved.
    pub value: Cow<'xml, str>,
}
540
#[cfg(test)]
mod tests {
    use super::*;

    /// Inputs that must decode successfully, paired with the expected output.
    const DECODABLE: &[(&str, &str)] = &[
        ("foo", "foo"),
        ("foo &amp; bar", "foo & bar"),
        ("foo &lt; bar", "foo < bar"),
        ("foo &gt; bar", "foo > bar"),
        ("foo &quot; bar", "foo \" bar"),
        ("foo &apos; bar", "foo ' bar"),
        ("foo &amp;lt; bar", "foo &lt; bar"),
        ("&amp; foo", "& foo"),
        ("foo &amp;", "foo &"),
        ("cbdtéda&amp;sü", "cbdtéda&sü"),
        // Decimal character references
        ("&#1234;", "Ӓ"),
        ("foo &#9; bar", "foo \t bar"),
        ("foo &#124; bar", "foo | bar"),
        ("foo &#1234; bar", "foo Ӓ bar"),
        // Hexadecimal character references
        ("&#xc4;", "Ä"),
        ("&#x00c4;", "Ä"),
        ("foo &#x9; bar", "foo \t bar"),
        ("foo &#x007c; bar", "foo | bar"),
        ("foo &#xc4; bar", "foo Ä bar"),
        ("foo &#x00c4; bar", "foo Ä bar"),
        ("foo &#x10de; bar", "foo პ bar"),
    ];

    /// Inputs that must be rejected.
    const MALFORMED: &[&str] = &[
        "&",
        "&#",
        "&#;",
        "foo&",
        "&bar",
        "&foo;",
        "&foobar;",
        "cbdtéd&ampü",
    ];

    #[test]
    fn test_decode() {
        for &(input, expected) in DECODABLE {
            decode_ok(input, expected);
        }

        for &input in MALFORMED {
            decode_err(input);
        }
    }

    /// Asserts that `input` decodes to `expected`.
    fn decode_ok(input: &str, expected: &'static str) {
        let decoded = super::decode(input).unwrap();
        assert_eq!(decoded, expected, "{input:?}");
    }

    /// Asserts that decoding `input` fails.
    fn decode_err(input: &str) {
        let outcome = super::decode(input);
        assert!(outcome.is_err(), "{input:?}");
    }
}