fast_xml/
reader.rs

1//! A module to handle `Reader`
2
3#[cfg(feature = "encoding")]
4use std::borrow::Cow;
5use std::io::{self, BufRead, BufReader};
6use std::{fs::File, path::Path, str::from_utf8};
7
8#[cfg(feature = "encoding")]
9use encoding_rs::{Encoding, UTF_16BE, UTF_16LE};
10
11use crate::errors::{Error, Result};
12use crate::events::attributes::Attribute;
13use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
14
15use memchr;
16
/// Position of the reader relative to markup delimiters.
///
/// The state decides what the next call to the event reader does.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TagState {
    /// Text up to a `<` has been read; the next read parses markup up to `>`.
    Opened,
    /// Markup has been fully read (also the initial state); the next read
    /// collects text up to the next `<`.
    Closed,
    /// A self-closed tag was reported as a `Start` event; the next read emits
    /// the synthetic `End` event for it.
    Empty,
    /// Either Eof or Errored
    Exit,
}
25
26/// A low level encoding-agnostic XML event reader.
27///
28/// Consumes a `BufRead` and streams XML `Event`s.
29///
30/// # Examples
31///
32/// ```
33/// use fast_xml::Reader;
34/// use fast_xml::events::Event;
35///
36/// let xml = r#"<tag1 att1 = "test">
37///                 <tag2><!--Test comment-->Test</tag2>
38///                 <tag2>Test 2</tag2>
39///             </tag1>"#;
40/// let mut reader = Reader::from_str(xml);
41/// reader.trim_text(true);
42/// let mut count = 0;
43/// let mut txt = Vec::new();
44/// let mut buf = Vec::new();
45/// loop {
46///     match reader.read_event(&mut buf) {
47///         Ok(Event::Start(ref e)) => {
48///             match e.name() {
49///                 b"tag1" => println!("attributes values: {:?}",
50///                                     e.attributes().map(|a| a.unwrap().value)
51///                                     .collect::<Vec<_>>()),
52///                 b"tag2" => count += 1,
53///                 _ => (),
54///             }
55///         },
56///         Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()),
57///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
58///         Ok(Event::Eof) => break,
59///         _ => (),
60///     }
61///     buf.clear();
62/// }
63/// ```
#[derive(Clone)]
pub struct Reader<R: BufRead> {
    /// Underlying input source the events are read from.
    pub(crate) reader: R,
    /// Current byte position in the input, useful for debugging errors.
    /// Exposed (with an adjustment) through `buffer_position()`.
    buf_position: usize,
    /// Current parser state (see `TagState`); selects what the next read does.
    tag_state: TagState,
    /// Expand empty (self-closed) elements into an opening and a closing event.
    expand_empty_elements: bool,
    /// Trim leading whitespace in `Text` events; the event is skipped
    /// entirely when markup follows the whitespace directly.
    trim_text_start: bool,
    /// Trim trailing whitespace in `Text` events.
    trim_text_end: bool,
    /// Trim trailing whitespace from markup names in closing tags `</a >`.
    trim_markup_names_in_closing_tags: bool,
    /// Check that `End` nodes match the last `Start` node.
    check_end_names: bool,
    /// Check that comments do not contain `--` (`false` by default).
    check_comments: bool,
    /// All currently Started elements which didn't have a matching
    /// End element yet.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when cursor at the `|` position buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in the [`Self::opened_starts`]
    /// (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
    /// for that field for details
    opened_starts: Vec<usize>,
    /// Namespace bookkeeping used by the namespace-aware read and resolve methods.
    ns_resolver: NamespaceResolver,
    #[cfg(feature = "encoding")]
    /// The encoding specified in the XML; defaults to UTF-8.
    encoding: &'static Encoding,
    #[cfg(feature = "encoding")]
    /// Whether the encoding has been determined, either from the XML
    /// declaration or from a byte order mark.
    is_encoding_set: bool,
}
114
115impl<R: BufRead> Reader<R> {
116    /// Creates a `Reader` that reads from a reader implementing `BufRead`.
117    pub fn from_reader(reader: R) -> Reader<R> {
118        Reader {
119            reader,
120            opened_buffer: Vec::new(),
121            opened_starts: Vec::new(),
122            tag_state: TagState::Closed,
123            expand_empty_elements: false,
124            trim_text_start: false,
125            trim_text_end: false,
126            trim_markup_names_in_closing_tags: true,
127            check_end_names: true,
128            buf_position: 0,
129            check_comments: false,
130            ns_resolver: NamespaceResolver::default(),
131            #[cfg(feature = "encoding")]
132            encoding: ::encoding_rs::UTF_8,
133            #[cfg(feature = "encoding")]
134            is_encoding_set: false,
135        }
136    }
137
138    /// Changes whether empty elements should be split into an `Open` and a `Close` event.
139    ///
140    /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are
141    /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the
142    /// default), those tags are represented by an [`Empty`] event instead.
143    ///
144    /// Note, that setting this to `true` will lead to additional allocates that
145    /// needed to store tag name for an [`End`] event. There is no additional
146    /// allocation, however, if [`Self::check_end_names()`] is also set.
147    ///
148    /// (`false` by default)
149    ///
150    /// [`Empty`]: events/enum.Event.html#variant.Empty
151    /// [`Start`]: events/enum.Event.html#variant.Start
152    /// [`End`]: events/enum.Event.html#variant.End
153    pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader<R> {
154        self.expand_empty_elements = val;
155        self
156    }
157
158    /// Changes whether whitespace before and after character data should be removed.
159    ///
160    /// When set to `true`, all [`Text`] events are trimmed. If they are empty, no event will be
161    /// pushed.
162    ///
163    /// (`false` by default)
164    ///
165    /// [`Text`]: events/enum.Event.html#variant.Text
166    pub fn trim_text(&mut self, val: bool) -> &mut Reader<R> {
167        self.trim_text_start = val;
168        self.trim_text_end = val;
169        self
170    }
171
172    /// Changes whether whitespace after character data should be removed.
173    ///
174    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
175    ///
176    /// (`false` by default)
177    ///
178    /// [`Text`]: events/enum.Event.html#variant.Text
179    pub fn trim_text_end(&mut self, val: bool) -> &mut Reader<R> {
180        self.trim_text_end = val;
181        self
182    }
183
184    /// Changes whether trailing whitespaces after the markup name are trimmed in closing tags
185    /// `</a >`.
186    ///
187    /// If true the emitted [`End`] event is stripped of trailing whitespace after the markup name.
188    ///
189    /// Note that if set to `false` and `check_end_names` is true the comparison of markup names is
190    /// going to fail erronously if a closing tag contains trailing whitespaces.
191    ///
192    /// (`true` by default)
193    ///
194    /// [`End`]: events/enum.Event.html#variant.End
195    pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader<R> {
196        self.trim_markup_names_in_closing_tags = val;
197        self
198    }
199
200    /// Changes whether mismatched closing tag names should be detected.
201    ///
202    /// When set to `false`, it won't check if a closing tag matches the corresponding opening tag.
203    /// For example, `<mytag></different_tag>` will be permitted.
204    ///
205    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
206    ///
207    /// Note that the emitted [`End`] event will not be modified if this is disabled, ie. it will
208    /// contain the data of the mismatched end tag.
209    ///
210    /// Note, that setting this to `true` will lead to additional allocates that
211    /// needed to store tag name for an [`End`] event. There is no additional
212    /// allocation, however, if [`Self::expand_empty_elements()`] is also set.
213    ///
214    /// (`true` by default)
215    ///
216    /// [`End`]: events/enum.Event.html#variant.End
217    pub fn check_end_names(&mut self, val: bool) -> &mut Reader<R> {
218        self.check_end_names = val;
219        self
220    }
221
222    /// Changes whether comments should be validated.
223    ///
224    /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which
225    /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't
226    /// really care about comment correctness, thus the default value is `false` to improve
227    /// performance.
228    ///
229    /// (`false` by default)
230    ///
231    /// [`Comment`]: events/enum.Event.html#variant.Comment
232    pub fn check_comments(&mut self, val: bool) -> &mut Reader<R> {
233        self.check_comments = val;
234        self
235    }
236
237    /// Gets the current byte position in the input data.
238    ///
239    /// Useful when debugging errors.
240    pub fn buffer_position(&self) -> usize {
241        // when internal state is Opened, we have actually read until '<',
242        // which we don't want to show
243        if let TagState::Opened = self.tag_state {
244            self.buf_position - 1
245        } else {
246            self.buf_position
247        }
248    }
249
250    /// private function to read until '<' is found
251    /// return a `Text` event
252    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
253    where
254        R: XmlSource<'i, B>,
255    {
256        self.tag_state = TagState::Opened;
257
258        if self.trim_text_start {
259            self.reader.skip_whitespace(&mut self.buf_position)?;
260            if self.reader.skip_one(b'<', &mut self.buf_position)? {
261                return self.read_event_buffered(buf);
262            }
263        }
264
265        match self
266            .reader
267            .read_bytes_until(b'<', buf, &mut self.buf_position)
268        {
269            Ok(Some(bytes)) if self.trim_text_end => {
270                // Skip the ending '<
271                let len = bytes
272                    .iter()
273                    .rposition(|&b| !is_whitespace(b))
274                    .map_or_else(|| bytes.len(), |p| p + 1);
275                Ok(Event::Text(BytesText::from_escaped(&bytes[..len])))
276            }
277            Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))),
278            Ok(None) => Ok(Event::Eof),
279            Err(e) => Err(e),
280        }
281    }
282
283    /// Private function to read until `>` is found. This function expects that
284    /// it was called just after encounter a `<` symbol.
285    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
286    where
287        R: XmlSource<'i, B>,
288    {
289        self.tag_state = TagState::Closed;
290
291        match self.reader.peek_one() {
292            // `<!` - comment, CDATA or DOCTYPE declaration
293            Ok(Some(b'!')) => match self.reader.read_bang_element(buf, &mut self.buf_position) {
294                Ok(None) => Ok(Event::Eof),
295                Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes),
296                Err(e) => Err(e),
297            },
298            // `</` - closing tag
299            Ok(Some(b'/')) => match self
300                .reader
301                .read_bytes_until(b'>', buf, &mut self.buf_position)
302            {
303                Ok(None) => Ok(Event::Eof),
304                Ok(Some(bytes)) => self.read_end(bytes),
305                Err(e) => Err(e),
306            },
307            // `<?` - processing instruction
308            Ok(Some(b'?')) => match self
309                .reader
310                .read_bytes_until(b'>', buf, &mut self.buf_position)
311            {
312                Ok(None) => Ok(Event::Eof),
313                Ok(Some(bytes)) => self.read_question_mark(bytes),
314                Err(e) => Err(e),
315            },
316            // `<...` - opening or self-closed tag
317            Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) {
318                Ok(None) => Ok(Event::Eof),
319                Ok(Some(bytes)) => self.read_start(bytes),
320                Err(e) => Err(e),
321            },
322            Ok(None) => Ok(Event::Eof),
323            Err(e) => Err(e),
324        }
325    }
326
327    /// reads `BytesElement` starting with a `/`,
328    /// if `self.check_end_names`, checks that element matches last opened element
329    /// return `End` event
330    fn read_end<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
331        // XML standard permits whitespaces after the markup name in closing tags.
332        // Let's strip them from the buffer before comparing tag names.
333        let name = if self.trim_markup_names_in_closing_tags {
334            if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) {
335                let (name, _) = buf[1..].split_at(pos_end_name + 1);
336                name
337            } else {
338                &buf[1..]
339            }
340        } else {
341            &buf[1..]
342        };
343        if self.check_end_names {
344            let mismatch_err = |expected: &[u8], found: &[u8], buf_position: &mut usize| {
345                *buf_position -= buf.len();
346                Err(Error::EndEventMismatch {
347                    expected: from_utf8(expected).unwrap_or("").to_owned(),
348                    found: from_utf8(found).unwrap_or("").to_owned(),
349                })
350            };
351            match self.opened_starts.pop() {
352                Some(start) => {
353                    let expected = &self.opened_buffer[start..];
354                    if name != expected {
355                        mismatch_err(expected, name, &mut self.buf_position)
356                    } else {
357                        self.opened_buffer.truncate(start);
358                        Ok(Event::End(BytesEnd::borrowed(name)))
359                    }
360                }
361                None => mismatch_err(b"", &buf[1..], &mut self.buf_position),
362            }
363        } else {
364            Ok(Event::End(BytesEnd::borrowed(name)))
365        }
366    }
367
368    /// reads `BytesElement` starting with a `!`,
369    /// return `Comment`, `CData` or `DocType` event
370    fn read_bang<'a, 'b>(&'a mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
371        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
372            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
373        };
374
375        let len = buf.len();
376        match bang_type {
377            BangType::Comment if buf.starts_with(b"!--") => {
378                if self.check_comments {
379                    // search if '--' not in comments
380                    if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
381                        .position(|p| buf[3 + p + 1] == b'-')
382                    {
383                        self.buf_position += len - p;
384                        return Err(Error::UnexpectedToken("--".to_string()));
385                    }
386                }
387                Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2])))
388            }
389            BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
390                Ok(Event::CData(BytesCData::new(&buf[8..])))
391            }
392            BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
393                let start = buf[8..]
394                    .iter()
395                    .position(|b| !is_whitespace(*b))
396                    .unwrap_or_else(|| len - 8);
397                debug_assert!(start < len - 8, "DocType must have a name");
398                Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..])))
399            }
400            _ => Err(bang_type.to_err()),
401        }
402    }
403
404    /// reads `BytesElement` starting with a `?`,
405    /// return `Decl` or `PI` event
406    fn read_question_mark<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
407        let len = buf.len();
408        if len > 2 && buf[len - 1] == b'?' {
409            if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
410                let event = BytesDecl::from_start(BytesStart::borrowed(&buf[1..len - 1], 3));
411
412                // Try getting encoding from the declaration event
413                #[cfg(feature = "encoding")]
414                if let Some(enc) = event.encoder() {
415                    self.encoding = enc;
416                    self.is_encoding_set = true;
417                }
418
419                Ok(Event::Decl(event))
420            } else {
421                Ok(Event::PI(BytesText::from_escaped(&buf[1..len - 1])))
422            }
423        } else {
424            self.buf_position -= len;
425            Err(Error::UnexpectedEof("XmlDecl".to_string()))
426        }
427    }
428
429    #[inline]
430    fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
431        self.tag_state = TagState::Closed;
432        let name = self
433            .opened_buffer
434            .split_off(self.opened_starts.pop().unwrap());
435        Ok(Event::End(BytesEnd::owned(name)))
436    }
437
438    /// reads `BytesElement` starting with any character except `/`, `!` or ``?`
439    /// return `Start` or `Empty` event
440    fn read_start<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
441        // TODO: do this directly when reading bufreader ...
442        let len = buf.len();
443        let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len);
444        if let Some(&b'/') = buf.last() {
445            let end = if name_end < len { name_end } else { len - 1 };
446            if self.expand_empty_elements {
447                self.tag_state = TagState::Empty;
448                self.opened_starts.push(self.opened_buffer.len());
449                self.opened_buffer.extend(&buf[..end]);
450                Ok(Event::Start(BytesStart::borrowed(&buf[..len - 1], end)))
451            } else {
452                Ok(Event::Empty(BytesStart::borrowed(&buf[..len - 1], end)))
453            }
454        } else {
455            if self.check_end_names {
456                self.opened_starts.push(self.opened_buffer.len());
457                self.opened_buffer.extend(&buf[..name_end]);
458            }
459            Ok(Event::Start(BytesStart::borrowed(buf, name_end)))
460        }
461    }
462
463    /// Reads the next `Event`.
464    ///
465    /// This is the main entry point for reading XML `Event`s.
466    ///
467    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
468    /// internally).
469    ///
470    /// Having the possibility to control the internal buffers gives you some additional benefits
471    /// such as:
472    ///
473    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
474    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
475    ///   end of your loop).
476    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
477    ///
478    /// # Examples
479    ///
480    /// ```
481    /// use fast_xml::Reader;
482    /// use fast_xml::events::Event;
483    ///
484    /// let xml = r#"<tag1 att1 = "test">
485    ///                 <tag2><!--Test comment-->Test</tag2>
486    ///                 <tag2>Test 2</tag2>
487    ///             </tag1>"#;
488    /// let mut reader = Reader::from_str(xml);
489    /// reader.trim_text(true);
490    /// let mut count = 0;
491    /// let mut buf = Vec::new();
492    /// let mut txt = Vec::new();
493    /// loop {
494    ///     match reader.read_event(&mut buf) {
495    ///         Ok(Event::Start(ref e)) => count += 1,
496    ///         Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).expect("Error!")),
497    ///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
498    ///         Ok(Event::Eof) => break,
499    ///         _ => (),
500    ///     }
501    ///     buf.clear();
502    /// }
503    /// println!("Found {} start events", count);
504    /// println!("Text events: {:?}", txt);
505    /// ```
506    #[inline]
507    pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
508        self.read_event_buffered(buf)
509    }
510
511    /// Read text into the given buffer, and return an event that borrows from
512    /// either that buffer or from the input itself, based on the type of the
513    /// reader.
514    fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
515    where
516        R: XmlSource<'i, B>,
517    {
518        let event = match self.tag_state {
519            TagState::Opened => self.read_until_close(buf),
520            TagState::Closed => self.read_until_open(buf),
521            TagState::Empty => self.close_expanded_empty(),
522            TagState::Exit => return Ok(Event::Eof),
523        };
524        match event {
525            Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit,
526            _ => {}
527        }
528        event
529    }
530
531    /// Resolves a potentially qualified **event name** into (namespace name, local name).
532    ///
533    /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined
534    /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix
535    /// can be defined on the same element as the attribute in question.
536    ///
537    /// *Unqualified* event inherits the current *default namespace*.
538    #[inline]
539    pub fn event_namespace<'a, 'b, 'c>(
540        &'a self,
541        qname: &'b [u8],
542        namespace_buffer: &'c [u8],
543    ) -> (Option<&'c [u8]>, &'b [u8]) {
544        self.ns_resolver.resolve(qname, namespace_buffer, true)
545    }
546
547    /// Resolves a potentially qualified **attribute name** into (namespace name, local name).
548    ///
549    /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined
550    /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix
551    /// can be defined on the same element as the attribute in question.
552    ///
553    /// *Unqualified* attribute names do *not* inherit the current *default namespace*.
554    #[inline]
555    pub fn attribute_namespace<'a, 'b, 'c>(
556        &'a self,
557        qname: &'b [u8],
558        namespace_buffer: &'c [u8],
559    ) -> (Option<&'c [u8]>, &'b [u8]) {
560        self.ns_resolver.resolve(qname, namespace_buffer, false)
561    }
562
563    /// Reads the next event and resolves its namespace (if applicable).
564    ///
565    /// # Examples
566    ///
567    /// ```
568    /// use std::str::from_utf8;
569    /// use fast_xml::Reader;
570    /// use fast_xml::events::Event;
571    ///
572    /// let xml = r#"<x:tag1 xmlns:x="www.xxxx" xmlns:y="www.yyyy" att1 = "test">
573    ///                 <y:tag2><!--Test comment-->Test</y:tag2>
574    ///                 <y:tag2>Test 2</y:tag2>
575    ///             </x:tag1>"#;
576    /// let mut reader = Reader::from_str(xml);
577    /// reader.trim_text(true);
578    /// let mut count = 0;
579    /// let mut buf = Vec::new();
580    /// let mut ns_buf = Vec::new();
581    /// let mut txt = Vec::new();
582    /// loop {
583    ///     match reader.read_namespaced_event(&mut buf, &mut ns_buf) {
584    ///         Ok((ref ns, Event::Start(ref e))) => {
585    ///             count += 1;
586    ///             match (*ns, e.local_name()) {
587    ///                 (Some(b"www.xxxx"), b"tag1") => (),
588    ///                 (Some(b"www.yyyy"), b"tag2") => (),
589    ///                 (ns, n) => panic!("Namespace and local name mismatch"),
590    ///             }
591    ///             println!("Resolved namespace: {:?}", ns.and_then(|ns| from_utf8(ns).ok()));
592    ///         }
593    ///         Ok((_, Event::Text(e))) => {
594    ///             txt.push(e.unescape_and_decode(&reader).expect("Error!"))
595    ///         },
596    ///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
597    ///         Ok((_, Event::Eof)) => break,
598    ///         _ => (),
599    ///     }
600    ///     buf.clear();
601    /// }
602    /// println!("Found {} start events", count);
603    /// println!("Text events: {:?}", txt);
604    /// ```
605    pub fn read_namespaced_event<'a, 'b, 'c>(
606        &'a mut self,
607        buf: &'b mut Vec<u8>,
608        namespace_buffer: &'c mut Vec<u8>,
609    ) -> Result<(Option<&'c [u8]>, Event<'b>)> {
610        self.ns_resolver.pop(namespace_buffer);
611        match self.read_event(buf) {
612            Ok(Event::Eof) => Ok((None, Event::Eof)),
613            Ok(Event::Start(e)) => {
614                self.ns_resolver.push(&e, namespace_buffer);
615                Ok((
616                    self.ns_resolver.find(e.name(), &**namespace_buffer),
617                    Event::Start(e),
618                ))
619            }
620            Ok(Event::Empty(e)) => {
621                // For empty elements we need to 'artificially' keep the namespace scope on the
622                // stack until the next `next()` call occurs.
623                // Otherwise the caller has no chance to use `resolve` in the context of the
624                // namespace declarations that are 'in scope' for the empty element alone.
625                // Ex: <img rdf:nodeID="abc" xmlns:rdf="urn:the-rdf-uri" />
626                self.ns_resolver.push(&e, namespace_buffer);
627                // notify next `read_namespaced_event()` invocation that it needs to pop this
628                // namespace scope
629                self.ns_resolver.pending_pop = true;
630                Ok((
631                    self.ns_resolver.find(e.name(), &**namespace_buffer),
632                    Event::Empty(e),
633                ))
634            }
635            Ok(Event::End(e)) => {
636                // notify next `read_namespaced_event()` invocation that it needs to pop this
637                // namespace scope
638                self.ns_resolver.pending_pop = true;
639                Ok((
640                    self.ns_resolver.find(e.name(), &**namespace_buffer),
641                    Event::End(e),
642                ))
643            }
644            Ok(e) => Ok((None, e)),
645            Err(e) => Err(e),
646        }
647    }
648
649    /// Returns the `Reader`s encoding.
650    ///
651    /// The used encoding may change after parsing the XML declaration.
652    ///
653    /// This encoding will be used by [`decode`].
654    ///
655    /// [`decode`]: #method.decode
656    #[cfg(feature = "encoding")]
657    pub fn encoding(&self) -> &'static Encoding {
658        self.encoding
659    }
660
661    /// Decodes a slice using the encoding specified in the XML declaration.
662    ///
663    /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
664    /// `U+FFFD REPLACEMENT CHARACTER`.
665    ///
666    /// If no encoding is specified, defaults to UTF-8.
667    #[inline]
668    #[cfg(feature = "encoding")]
669    pub fn decode<'b, 'c>(&'b self, bytes: &'c [u8]) -> Cow<'c, str> {
670        self.encoding.decode(bytes).0
671    }
672
673    /// Decodes a UTF8 slice without BOM (Byte order mark) regardless of XML declaration.
674    ///
675    /// Decode `bytes` without BOM and with malformed sequences replaced with the
676    /// `U+FFFD REPLACEMENT CHARACTER`.
677    ///
678    /// # Note
679    ///
680    /// If you instead want to use XML declared encoding, use the `encoding` feature
681    #[inline]
682    #[cfg(not(feature = "encoding"))]
683    pub fn decode_without_bom<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
684        if bytes.starts_with(b"\xEF\xBB\xBF") {
685            from_utf8(&bytes[3..]).map_err(Error::Utf8)
686        } else {
687            from_utf8(bytes).map_err(Error::Utf8)
688        }
689    }
690
691    /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration.
692    ///
693    /// Decode `bytes` without BOM and with malformed sequences replaced with the
694    /// `U+FFFD REPLACEMENT CHARACTER`.
695    ///
696    /// If no encoding is specified, defaults to UTF-8.
697    #[inline]
698    #[cfg(feature = "encoding")]
699    pub fn decode_without_bom<'b, 'c>(&'b mut self, mut bytes: &'c [u8]) -> Cow<'c, str> {
700        if self.is_encoding_set {
701            return self.encoding.decode_with_bom_removal(bytes).0;
702        }
703        if bytes.starts_with(b"\xEF\xBB\xBF") {
704            self.is_encoding_set = true;
705            bytes = &bytes[3..];
706        } else if bytes.starts_with(b"\xFF\xFE") {
707            self.is_encoding_set = true;
708            self.encoding = UTF_16LE;
709            bytes = &bytes[2..];
710        } else if bytes.starts_with(b"\xFE\xFF") {
711            self.is_encoding_set = true;
712            self.encoding = UTF_16BE;
713            bytes = &bytes[3..];
714        };
715        self.encoding.decode_without_bom_handling(bytes).0
716    }
717
718    /// Decodes a UTF8 slice regardless of XML declaration.
719    ///
720    /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
721    /// `U+FFFD REPLACEMENT CHARACTER`.
722    ///
723    /// # Note
724    ///
725    /// If you instead want to use XML declared encoding, use the `encoding` feature
726    #[inline]
727    #[cfg(not(feature = "encoding"))]
728    pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
729        from_utf8(bytes).map_err(Error::Utf8)
730    }
731
732    /// Get utf8 decoder
733    #[cfg(feature = "encoding")]
734    pub fn decoder(&self) -> Decoder {
735        Decoder {
736            encoding: self.encoding,
737        }
738    }
739
740    /// Get utf8 decoder
741    #[cfg(not(feature = "encoding"))]
742    pub fn decoder(&self) -> Decoder {
743        Decoder
744    }
745
746    /// Reads until end element is found
747    ///
748    /// Manages nested cases where parent and child elements have the same name
749    pub fn read_to_end<K: AsRef<[u8]>>(&mut self, end: K, buf: &mut Vec<u8>) -> Result<()> {
750        let mut depth = 0;
751        let end = end.as_ref();
752        loop {
753            match self.read_event(buf) {
754                Ok(Event::End(ref e)) if e.name() == end => {
755                    if depth == 0 {
756                        return Ok(());
757                    }
758                    depth -= 1;
759                }
760                Ok(Event::Start(ref e)) if e.name() == end => depth += 1,
761                Err(e) => return Err(e),
762                Ok(Event::Eof) => {
763                    return Err(Error::UnexpectedEof(format!("</{:?}>", from_utf8(end))));
764                }
765                _ => (),
766            }
767            buf.clear();
768        }
769    }
770
771    /// Reads optional text between start and end tags.
772    ///
773    /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
774    /// `String`. If the next event is an [`End`] event, returns the empty string. In all other
775    /// cases, returns an error.
776    ///
777    /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
778    /// if none is specified).
779    ///
780    /// # Examples
781    ///
782    /// ```
783    /// # use pretty_assertions::assert_eq;
784    /// use fast_xml::Reader;
785    /// use fast_xml::events::Event;
786    ///
787    /// let mut xml = Reader::from_reader(b"
788    ///     <a>&lt;b&gt;</a>
789    ///     <a></a>
790    /// " as &[u8]);
791    /// xml.trim_text(true);
792    ///
793    /// let expected = ["<b>", ""];
794    /// for &content in expected.iter() {
795    ///     match xml.read_event(&mut Vec::new()) {
796    ///         Ok(Event::Start(ref e)) => {
797    ///             assert_eq!(&xml.read_text(e.name(), &mut Vec::new()).unwrap(), content);
798    ///         },
799    ///         e => panic!("Expecting Start event, found {:?}", e),
800    ///     }
801    /// }
802    /// ```
803    ///
804    /// [`Text`]: events/enum.Event.html#variant.Text
805    /// [`End`]: events/enum.Event.html#variant.End
806    pub fn read_text<K: AsRef<[u8]>>(&mut self, end: K, buf: &mut Vec<u8>) -> Result<String> {
807        let s = match self.read_event(buf) {
808            Ok(Event::Text(e)) => e.unescape_and_decode(self),
809            Ok(Event::End(ref e)) if e.name() == end.as_ref() => return Ok("".to_string()),
810            Err(e) => return Err(e),
811            Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
812            _ => return Err(Error::TextNotFound),
813        };
814        self.read_to_end(end, buf)?;
815        s
816    }
817
818    /// Consumes `Reader` returning the underlying reader
819    ///
820    /// Can be used to compute line and column of a parsing error position
821    ///
822    /// # Examples
823    ///
824    /// ```
825    /// # use pretty_assertions::assert_eq;
826    /// use std::{str, io::Cursor};
827    /// use fast_xml::Reader;
828    /// use fast_xml::events::Event;
829    ///
830    /// let xml = r#"<tag1 att1 = "test">
831    ///                 <tag2><!--Test comment-->Test</tag2>
832    ///                 <tag3>Test 2</tag3>
833    ///             </tag1>"#;
834    /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
835    /// let mut buf = Vec::new();
836    ///
837    /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
838    ///     let end_pos = reader.buffer_position();
839    ///     let mut cursor = reader.into_inner();
840    ///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
841    ///         .expect("can't make a string");
842    ///     let mut line = 1;
843    ///     let mut column = 0;
844    ///     for c in s.chars() {
845    ///         if c == '\n' {
846    ///             line += 1;
847    ///             column = 0;
848    ///         } else {
849    ///             column += 1;
850    ///         }
851    ///     }
852    ///     (line, column)
853    /// }
854    ///
855    /// loop {
856    ///     match reader.read_event(&mut buf) {
857    ///         Ok(Event::Start(ref e)) => match e.name() {
858    ///             b"tag1" | b"tag2" => (),
859    ///             tag => {
860    ///                 assert_eq!(b"tag3", tag);
861    ///                 assert_eq!((3, 22), into_line_and_column(reader));
862    ///                 break;
863    ///             }
864    ///         },
865    ///         Ok(Event::Eof) => unreachable!(),
866    ///         _ => (),
867    ///     }
868    ///     buf.clear();
869    /// }
870    /// ```
871    pub fn into_inner(self) -> R {
872        self.reader
873    }
874
875    /// Gets a reference to the underlying reader.
876    pub fn get_ref(&self) -> &R {
877        &self.reader
878    }
879
880    /// Gets a mutable reference to the underlying reader.
881    pub fn get_mut(&mut self) -> &mut R {
882        &mut self.reader
883    }
884}
885
886impl Reader<BufReader<File>> {
887    /// Creates an XML reader from a file path.
888    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Reader<BufReader<File>>> {
889        let file = File::open(path).map_err(Error::Io)?;
890        let reader = BufReader::new(file);
891        Ok(Reader::from_reader(reader))
892    }
893}
894
895impl<'a> Reader<&'a [u8]> {
896    /// Creates an XML reader from a string slice.
897    pub fn from_str(s: &'a str) -> Reader<&'a [u8]> {
898        Reader::from_reader(s.as_bytes())
899    }
900
901    /// Creates an XML reader from a slice of bytes.
902    pub fn from_bytes(s: &'a [u8]) -> Reader<&'a [u8]> {
903        Reader::from_reader(s)
904    }
905
906    /// Read an event that borrows from the input rather than a buffer.
907    #[inline]
908    pub fn read_event_unbuffered(&mut self) -> Result<Event<'a>> {
909        self.read_event_buffered(())
910    }
911
912    /// Reads until end element is found
913    ///
914    /// Manages nested cases where parent and child elements have the same name
915    pub fn read_to_end_unbuffered<K: AsRef<[u8]>>(&mut self, end: K) -> Result<()> {
916        let mut depth = 0;
917        let end = end.as_ref();
918        loop {
919            match self.read_event_unbuffered() {
920                Ok(Event::End(ref e)) if e.name() == end => {
921                    if depth == 0 {
922                        return Ok(());
923                    }
924                    depth -= 1;
925                }
926                Ok(Event::Start(ref e)) if e.name() == end => depth += 1,
927                Err(e) => return Err(e),
928                Ok(Event::Eof) => {
929                    return Err(Error::UnexpectedEof(format!("</{:?}>", from_utf8(end))));
930                }
931                _ => (),
932            }
933        }
934    }
935}
936
/// An input source from which a reader pulls raw XML fragments as borrowed
/// slices.
///
/// There are two implementors of this trait:
///
/// - a generic one for every `BufRead`, which copies the consumed part of the
///   input into a provided buffer of type `B` and returns data that borrows
///   from that buffer;
/// - one for `&[u8]`, which returns data borrowed directly from `Self` without
///   copying. This implementation allows zero-copy deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Reads input until `byte` is found or the end of input is reached.
    ///
    /// Returns the slice of data read up to `byte`; the matched byte itself is
    /// consumed but is not included in the result.
    /// If the input (`Self`) is exhausted, returns `None`.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let mut position = 0;
    /// let mut input = b"abc*def".as_ref();
    /// //                    ^= 4
    ///
    /// assert_eq!(
    ///     input.read_bytes_until(b'*', (), &mut position).unwrap(),
    ///     Some(b"abc".as_ref())
    /// );
    /// assert_eq!(position, 4); // position after the symbol matched
    /// ```
    ///
    /// # Parameters
    /// - `byte`: Byte to search for
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bytes_until(
        &mut self,
        byte: u8,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<&'r [u8]>>;

    /// Reads input until a comment, CDATA section or DOCTYPE declaration is
    /// finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the detected kind of markup together with the slice of data read
    /// up to the closing `>`, which is not included in the result.
    ///
    /// If the input (`Self`) is exhausted and nothing was read, returns `None`.
    ///
    /// NOTE(review): both implementations in this file report an error rather
    /// than `None` when the input ends before the markup is closed; confirm
    /// whether `None` is still part of the contract.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<(BangType, &'r [u8])>>;

    /// Reads input until the XML element is closed by a `>` symbol.
    /// Returns `Some(buffer)` containing the data between `<` and `>`, or
    /// `None` if the end of input was reached and nothing was read.
    ///
    /// Derived from `read_until`, but modified to handle XML attributes
    /// using a minimal state machine, so that a `>` inside a quoted attribute
    /// value does not close the element.
    ///
    /// Attribute values are [defined] as follows:
    /// ```plain
    /// AttValue := '"' (([^<&"]) | Reference)* '"'
    ///           | "'" (([^<&']) | Reference)* "'"
    /// ```
    /// (`Reference` is something like `&quot;`, but we don't care about
    /// escaped characters at this level)
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
    /// [events]: crate::events::Event
    fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;

    /// Consumes and discards whitespace until the next non-whitespace byte or
    /// the end of input. `position` is increased by the amount of bytes skipped.
    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;

    /// Consumes one byte if it is equal to `byte` and returns `true` in that
    /// case; otherwise consumes nothing and returns `false`. `position` is
    /// increased by one when the byte was consumed.
    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;

    /// Returns the next byte without consuming it, or `None` at the end of
    /// input.
    fn peek_one(&mut self) -> Result<Option<u8>>;
}
1036
1037/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
1038/// `Vec<u8>` as buffer that will be borrowed by events.
1039impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
1040    #[inline]
1041    fn read_bytes_until(
1042        &mut self,
1043        byte: u8,
1044        buf: &'b mut Vec<u8>,
1045        position: &mut usize,
1046    ) -> Result<Option<&'b [u8]>> {
1047        let mut read = 0;
1048        let mut done = false;
1049        let start = buf.len();
1050        while !done {
1051            let used = {
1052                let available = match self.fill_buf() {
1053                    Ok(n) if n.is_empty() => break,
1054                    Ok(n) => n,
1055                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1056                    Err(e) => {
1057                        *position += read;
1058                        return Err(Error::Io(e));
1059                    }
1060                };
1061
1062                match memchr::memchr(byte, available) {
1063                    Some(i) => {
1064                        buf.extend_from_slice(&available[..i]);
1065                        done = true;
1066                        i + 1
1067                    }
1068                    None => {
1069                        buf.extend_from_slice(available);
1070                        available.len()
1071                    }
1072                }
1073            };
1074            self.consume(used);
1075            read += used;
1076        }
1077        *position += read;
1078
1079        if read == 0 {
1080            Ok(None)
1081        } else {
1082            Ok(Some(&buf[start..]))
1083        }
1084    }
1085
1086    fn read_bang_element(
1087        &mut self,
1088        buf: &'b mut Vec<u8>,
1089        position: &mut usize,
1090    ) -> Result<Option<(BangType, &'b [u8])>> {
1091        // Peeked one bang ('!') before being called, so it's guaranteed to
1092        // start with it.
1093        let start = buf.len();
1094        let mut read = 1;
1095        buf.push(b'!');
1096        self.consume(1);
1097
1098        let bang_type = BangType::new(self.peek_one()?)?;
1099
1100        loop {
1101            match self.fill_buf() {
1102                // Note: Do not update position, so the error points to
1103                // somewhere sane rather than at the EOF
1104                Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
1105                Ok(available) => {
1106                    if let Some((consumed, used)) = bang_type.parse(available, read) {
1107                        buf.extend_from_slice(consumed);
1108
1109                        self.consume(used);
1110                        read += used;
1111
1112                        *position += read;
1113                        break;
1114                    } else {
1115                        buf.extend_from_slice(available);
1116
1117                        let used = available.len();
1118                        self.consume(used);
1119                        read += used;
1120                    }
1121                }
1122                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1123                Err(e) => {
1124                    *position += read;
1125                    return Err(Error::Io(e));
1126                }
1127            }
1128        }
1129
1130        if read == 0 {
1131            Ok(None)
1132        } else {
1133            Ok(Some((bang_type, &buf[start..])))
1134        }
1135    }
1136
1137    #[inline]
1138    fn read_element(
1139        &mut self,
1140        buf: &'b mut Vec<u8>,
1141        position: &mut usize,
1142    ) -> Result<Option<&'b [u8]>> {
1143        let mut state = ReadElementState::Elem;
1144        let mut read = 0;
1145
1146        let start = buf.len();
1147        loop {
1148            match self.fill_buf() {
1149                Ok(n) if n.is_empty() => break,
1150                Ok(available) => {
1151                    if let Some((consumed, used)) = state.change(available) {
1152                        buf.extend_from_slice(consumed);
1153
1154                        self.consume(used);
1155                        read += used;
1156
1157                        *position += read;
1158                        break;
1159                    } else {
1160                        buf.extend_from_slice(available);
1161
1162                        let used = available.len();
1163                        self.consume(used);
1164                        read += used;
1165                    }
1166                }
1167                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1168                Err(e) => {
1169                    *position += read;
1170                    return Err(Error::Io(e));
1171                }
1172            };
1173        }
1174
1175        if read == 0 {
1176            Ok(None)
1177        } else {
1178            Ok(Some(&buf[start..]))
1179        }
1180    }
1181
1182    /// Consume and discard all the whitespace until the next non-whitespace
1183    /// character or EOF.
1184    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
1185        loop {
1186            break match self.fill_buf() {
1187                Ok(n) => {
1188                    let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
1189                    if count > 0 {
1190                        self.consume(count);
1191                        *position += count;
1192                        continue;
1193                    } else {
1194                        Ok(())
1195                    }
1196                }
1197                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1198                Err(e) => Err(Error::Io(e)),
1199            };
1200        }
1201    }
1202
1203    /// Consume and discard one character if it matches the given byte. Return
1204    /// true if it matched.
1205    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
1206        match self.peek_one()? {
1207            Some(b) if b == byte => {
1208                *position += 1;
1209                self.consume(1);
1210                Ok(true)
1211            }
1212            _ => Ok(false),
1213        }
1214    }
1215
1216    /// Return one character without consuming it, so that future `read_*` calls
1217    /// will still include it. On EOF, return None.
1218    fn peek_one(&mut self) -> Result<Option<u8>> {
1219        loop {
1220            break match self.fill_buf() {
1221                Ok(n) if n.is_empty() => Ok(None),
1222                Ok(n) => Ok(Some(n[0])),
1223                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1224                Err(e) => Err(Error::Io(e)),
1225            };
1226        }
1227    }
1228}
1229
1230/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
1231/// that will be borrowed by events. This implementation provides a zero-copy deserialization
1232impl<'a> XmlSource<'a, ()> for &'a [u8] {
1233    fn read_bytes_until(
1234        &mut self,
1235        byte: u8,
1236        _buf: (),
1237        position: &mut usize,
1238    ) -> Result<Option<&'a [u8]>> {
1239        if self.is_empty() {
1240            return Ok(None);
1241        }
1242
1243        Ok(Some(if let Some(i) = memchr::memchr(byte, self) {
1244            *position += i + 1;
1245            let bytes = &self[..i];
1246            *self = &self[i + 1..];
1247            bytes
1248        } else {
1249            *position += self.len();
1250            let bytes = &self[..];
1251            *self = &[];
1252            bytes
1253        }))
1254    }
1255
1256    fn read_bang_element(
1257        &mut self,
1258        _buf: (),
1259        position: &mut usize,
1260    ) -> Result<Option<(BangType, &'a [u8])>> {
1261        // Peeked one bang ('!') before being called, so it's guaranteed to
1262        // start with it.
1263        debug_assert_eq!(self[0], b'!');
1264
1265        let bang_type = BangType::new(self[1..].first().copied())?;
1266
1267        if let Some((bytes, i)) = bang_type.parse(self, 0) {
1268            *position += i;
1269            *self = &self[i..];
1270            return Ok(Some((bang_type, bytes)));
1271        }
1272
1273        // Note: Do not update position, so the error points to
1274        // somewhere sane rather than at the EOF
1275        Err(bang_type.to_err())
1276    }
1277
1278    fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<Option<&'a [u8]>> {
1279        if self.is_empty() {
1280            return Ok(None);
1281        }
1282
1283        let mut state = ReadElementState::Elem;
1284
1285        if let Some((bytes, i)) = state.change(self) {
1286            *position += i;
1287            *self = &self[i..];
1288            return Ok(Some(bytes));
1289        }
1290
1291        // Note: Do not update position, so the error points to a sane place
1292        // rather than at the EOF.
1293        Err(Error::UnexpectedEof("Element".to_string()))
1294
1295        // FIXME: Figure out why the other one works without UnexpectedEof
1296    }
1297
1298    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
1299        let whitespaces = self
1300            .iter()
1301            .position(|b| !is_whitespace(*b))
1302            .unwrap_or(self.len());
1303        *position += whitespaces;
1304        *self = &self[whitespaces..];
1305        Ok(())
1306    }
1307
1308    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
1309        if self.first() == Some(&byte) {
1310            *self = &self[1..];
1311            *position += 1;
1312            Ok(true)
1313        } else {
1314            Ok(false)
1315        }
1316    }
1317
1318    fn peek_one(&mut self) -> Result<Option<u8>> {
1319        Ok(self.first().copied())
1320    }
1321}
1322
1323/// Possible elements started with `<!`
1324#[derive(Debug, PartialEq)]
1325enum BangType {
1326    /// <![CDATA[...]]>
1327    CData,
1328    /// <!--...-->
1329    Comment,
1330    /// <!DOCTYPE...>
1331    DocType,
1332}
1333impl BangType {
1334    #[inline(always)]
1335    fn new(byte: Option<u8>) -> Result<Self> {
1336        Ok(match byte {
1337            Some(b'[') => Self::CData,
1338            Some(b'-') => Self::Comment,
1339            Some(b'D') | Some(b'd') => Self::DocType,
1340            Some(b) => return Err(Error::UnexpectedBang(b)),
1341            None => return Err(Error::UnexpectedEof("Bang".to_string())),
1342        })
1343    }
1344
1345    /// If element is finished, returns its content up to `>` symbol and
1346    /// an index of this symbol, otherwise returns `None`
1347    #[inline(always)]
1348    fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
1349        for i in memchr::memchr_iter(b'>', chunk) {
1350            match self {
1351                // Need to read at least 6 symbols (`!---->`) for properly finished comment
1352                // <!----> - XML comment
1353                //  012345 - i
1354                Self::Comment => {
1355                    if offset + i > 4 && chunk[..i].ends_with(b"--") {
1356                        // We cannot strip last `--` from the buffer because we need it in case of
1357                        // check_comments enabled option. XML standard requires that comment
1358                        // will not end with `--->` sequence because this is a special case of
1359                        // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1360                        return Some((&chunk[..i], i + 1)); // +1 for `>`
1361                    }
1362                }
1363                Self::CData => {
1364                    if chunk[..i].ends_with(b"]]") {
1365                        return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
1366                    }
1367                }
1368                Self::DocType => {
1369                    let content = &chunk[..i];
1370                    let balance = memchr::memchr2_iter(b'<', b'>', content)
1371                        .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
1372                        .sum::<i32>();
1373                    if balance == 0 {
1374                        return Some((content, i + 1)); // +1 for `>`
1375                    }
1376                }
1377            }
1378        }
1379        None
1380    }
1381    #[inline]
1382    fn to_err(self) -> Error {
1383        let bang_str = match self {
1384            Self::CData => "CData",
1385            Self::Comment => "Comment",
1386            Self::DocType => "DOCTYPE",
1387        };
1388        Error::UnexpectedEof(bang_str.to_string())
1389    }
1390}
1391
1392/// State machine for the [`XmlSource::read_element`]
1393#[derive(Clone, Copy)]
1394enum ReadElementState {
1395    /// The initial state (inside element, but outside of attribute value)
1396    Elem,
1397    /// Inside a single-quoted attribute value
1398    SingleQ,
1399    /// Inside a double-quoted attribute value
1400    DoubleQ,
1401}
1402impl ReadElementState {
1403    /// Changes state by analyzing part of input.
1404    /// Returns a tuple with part of chunk up to element closing symbol `>`
1405    /// and a position after that symbol or `None` if such symbol was not found
1406    #[inline(always)]
1407    fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1408        for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) {
1409            *self = match (*self, chunk[i]) {
1410                // only allowed to match `>` while we are in state `Elem`
1411                (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
1412                (Self::Elem, b'\'') => Self::SingleQ,
1413                (Self::Elem, b'\"') => Self::DoubleQ,
1414
1415                // the only end_byte that gets us out if the same character
1416                (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
1417
1418                // all other bytes: no state change
1419                _ => *self,
1420            };
1421        }
1422        None
1423    }
1424}
1425
/// Tells whether the byte is one of the whitespace bytes recognised here:
/// space, carriage return, line feed or horizontal tab.
#[inline]
pub(crate) fn is_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\r' | b'\n' | b'\t')
}
1434
/// One namespace binding, stored as offsets into the shared namespace buffer.
///
/// Defines a mapping from *[namespace prefix]* to *[namespace name]*.
/// If the prefix is empty, the entry defines a *default namespace* binding
/// that applies to unprefixed element names (unprefixed attribute names do not
/// bind to any namespace and their processing depends on the element in which
/// they are defined).
///
/// [namespace prefix]: https://www.w3.org/TR/xml-names11/#dt-prefix
/// [namespace name]: https://www.w3.org/TR/xml-names11/#dt-NSName
#[derive(Debug, Clone)]
struct NamespaceEntry {
    /// Index of the namespace in the buffer
    start: usize,
    /// Length of the prefix
    /// * if greater than zero, then binds this namespace to the slice
    ///   `[start..start + prefix_len]` in the buffer.
    /// * else defines the current default namespace.
    prefix_len: usize,
    /// The length of a namespace name (the URI) of this namespace declaration.
    /// The name starts just after the prefix and extends for `value_len` bytes.
    ///
    /// The XML standard [specifies] that an empty namespace value 'removes' a namespace declaration
    /// for the extent of its scope. For prefix declarations that's not very interesting, but it is
    /// vital for default namespace declarations. With `xmlns=""` you can revert back to the default
    /// behaviour of leaving unqualified element names unqualified.
    ///
    /// [specifies]: https://www.w3.org/TR/xml-names11/#scoping
    value_len: usize,
    /// Level of nesting at which this namespace was declared. The declaring element is included,
    /// i.e., a declaration on the document root has `level = 1`.
    /// This is used to pop the namespace when the element gets closed.
    level: i32,
}

impl NamespaceEntry {
    /// Returns the namespace name (the URI) of this entry as a slice of the
    /// namespace buffer.
    ///
    /// Returns `None` if the stored name is empty, i.e. the namespace for this
    /// prefix was explicitly removed from scope using `xmlns[:prefix]=""`
    #[inline]
    fn namespace<'b>(&self, buffer: &'b [u8]) -> Option<&'b [u8]> {
        match self.value_len {
            0 => None,
            len => {
                // the name is stored right behind the prefix
                let from = self.start + self.prefix_len;
                Some(&buffer[from..from + len])
            }
        }
    }

    /// Tells whether this binding applies to the potentially qualified name.
    ///
    /// A default namespace entry matches every name without a `:`; a prefixed
    /// entry matches names that consist of its prefix followed by a `:`.
    #[inline]
    fn is_match(&self, buffer: &[u8], qname: &[u8]) -> bool {
        if self.prefix_len == 0 {
            return !qname.contains(&b':');
        }
        // Check the position of the colon first, so the prefix is compared
        // only for names of a suitable shape.
        qname.get(self.prefix_len) == Some(&b':')
            && qname.starts_with(&buffer[self.start..self.start + self.prefix_len])
    }
}
1496
1497/// A namespace management buffer.
1498///
1499/// Holds all internal logic to push/pop namespaces with their levels.
1500#[derive(Debug, Default, Clone)]
1501struct NamespaceResolver {
1502    /// A stack of namespace bindings to prefixes that currently in scope
1503    bindings: Vec<NamespaceEntry>,
1504    /// The number of open tags at the moment. We need to keep track of this to know which namespace
1505    /// declarations to remove when we encounter an `End` event.
1506    nesting_level: i32,
1507    /// For `Empty` events keep the 'scope' of the element on the stack artificially. That way, the
1508    /// consumer has a chance to use `resolve` in the context of the empty element. We perform the
1509    /// pop as the first operation in the next `next()` call.
1510    pending_pop: bool,
1511}
1512
1513impl NamespaceResolver {
1514    /// Finds a [namespace name] for a given qualified name of element, borrow it
1515    /// from the specified buffer.
1516    ///
1517    /// Returns `None`, if:
1518    /// - name is unqualified
1519    /// - prefix not found in the current scope
1520    /// - prefix was [unbound] using `xmlns:prefix=""`
1521    ///
1522    /// # Lifetimes
1523    ///
1524    /// - `'n`: lifetime of an element name
1525    /// - `'b`: lifetime of a namespaces buffer, where all found namespaces are stored
1526    ///
1527    /// [namespace name]: https://www.w3.org/TR/xml-names11/#dt-NSName
1528    /// [unbound]: https://www.w3.org/TR/xml-names11/#scoping
1529    #[inline]
1530    fn find<'n, 'b>(&self, element_name: &'n [u8], buffer: &'b [u8]) -> Option<&'b [u8]> {
1531        self.bindings
1532            .iter()
1533            .rfind(|n| n.is_match(buffer, element_name))
1534            .and_then(|n| n.namespace(buffer))
1535    }
1536
1537    /// Ends a top-most scope by popping all [namespace binding], that was added by
1538    /// last call to [`Self::push()`].
1539    ///
1540    /// [namespace binding]: https://www.w3.org/TR/xml-names11/#dt-NSDecl
1541    fn pop(&mut self, buffer: &mut Vec<u8>) {
1542        if !self.pending_pop {
1543            return;
1544        }
1545        self.pending_pop = false;
1546        self.nesting_level -= 1;
1547        let current_level = self.nesting_level;
1548        // from the back (most deeply nested scope), look for the first scope that is still valid
1549        match self.bindings.iter().rposition(|n| n.level <= current_level) {
1550            // none of the namespaces are valid, remove all of them
1551            None => {
1552                buffer.clear();
1553                self.bindings.clear();
1554            }
1555            // drop all namespaces past the last valid namespace
1556            Some(last_valid_pos) => {
1557                if let Some(len) = self.bindings.get(last_valid_pos + 1).map(|n| n.start) {
1558                    buffer.truncate(len);
1559                    self.bindings.truncate(last_valid_pos + 1);
1560                }
1561            }
1562        }
1563    }
1564
1565    /// Begins a new scope and add to it all [namespace bindings] that found in
1566    /// the specified start element.
1567    ///
1568    /// [namespace binding]: https://www.w3.org/TR/xml-names11/#dt-NSDecl
1569    fn push(&mut self, start: &BytesStart, buffer: &mut Vec<u8>) {
1570        self.nesting_level += 1;
1571        let level = self.nesting_level;
1572        // adds new namespaces for attributes starting with 'xmlns:' and for the 'xmlns'
1573        // (default namespace) attribute.
1574        for a in start.attributes().with_checks(false) {
1575            if let Ok(Attribute { key: k, value: v }) = a {
1576                if k.starts_with(b"xmlns") {
1577                    match k.get(5) {
1578                        None => {
1579                            let start = buffer.len();
1580                            buffer.extend_from_slice(&*v);
1581                            self.bindings.push(NamespaceEntry {
1582                                start,
1583                                prefix_len: 0,
1584                                value_len: v.len(),
1585                                level,
1586                            });
1587                        }
1588                        Some(&b':') => {
1589                            let start = buffer.len();
1590                            buffer.extend_from_slice(&k[6..]);
1591                            buffer.extend_from_slice(&*v);
1592                            self.bindings.push(NamespaceEntry {
1593                                start,
1594                                prefix_len: k.len() - 6,
1595                                value_len: v.len(),
1596                                level,
1597                            });
1598                        }
1599                        _ => break,
1600                    }
1601                }
1602            } else {
1603                break;
1604            }
1605        }
1606    }
1607
1608    /// Resolves a potentially qualified **attribute name** into (namespace name, local name).
1609    ///
1610    /// *Qualified* attribute names have the form `prefix:local-name` where the `prefix` is defined
1611    /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix
1612    /// can be defined on the same element as the attribute in question.
1613    ///
1614    /// *Unqualified* attribute names do *not* inherit the current *default namespace*.
1615    ///
1616    /// # Lifetimes
1617    ///
1618    /// - `'n`: lifetime of an attribute or an element name
1619    /// - `'b`: lifetime of a namespaces buffer, where all found namespaces are stored
1620    #[inline]
1621    fn resolve<'n, 'b>(
1622        &self,
1623        qname: &'n [u8],
1624        buffer: &'b [u8],
1625        use_default: bool,
1626    ) -> (Option<&'b [u8]>, &'n [u8]) {
1627        self.bindings
1628            .iter()
1629            .rfind(|n| n.is_match(buffer, qname))
1630            .map_or((None, qname), |n| {
1631                let len = n.prefix_len;
1632                if len > 0 {
1633                    (n.namespace(buffer), &qname[len + 1..])
1634                } else if use_default {
1635                    (n.namespace(buffer), qname)
1636                } else {
1637                    (None, qname)
1638                }
1639            })
1640    }
1641}
1642
/// Decoder that turns raw bytes read from the XML input into strings.
///
/// This variant is compiled when the `encoding` feature is disabled. It carries no
/// state, and its decoding methods only accept valid UTF-8.
#[cfg(not(feature = "encoding"))]
#[derive(Clone, Copy, Debug)]
pub struct Decoder;
1647
/// Decoder that turns raw bytes read from the XML input into strings.
///
/// This variant is compiled when the `encoding` feature is enabled. It decodes with
/// the stored `encoding`, which is not necessarily UTF-8.
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
pub struct Decoder {
    /// Encoding that `decode` uses to convert bytes into text.
    encoding: &'static Encoding,
}
1654
1655impl Decoder {
1656    #[cfg(not(feature = "encoding"))]
1657    pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
1658        from_utf8(bytes).map_err(Error::Utf8)
1659    }
1660
1661    #[cfg(not(feature = "encoding"))]
1662    pub fn decode_owned<'c>(&self, bytes: Vec<u8>) -> Result<String> {
1663        String::from_utf8(bytes).map_err(|e| Error::Utf8(e.utf8_error()))
1664    }
1665
1666    #[cfg(feature = "encoding")]
1667    pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> {
1668        self.encoding.decode(bytes).0
1669    }
1670}
1671
1672#[cfg(test)]
1673mod test {
1674    macro_rules! check {
1675        ($buf:expr) => {
1676            mod read_bytes_until {
1677                use crate::reader::XmlSource;
1678                // Use Bytes for printing bytes as strings for ASCII range
1679                use crate::utils::Bytes;
1680                use pretty_assertions::assert_eq;
1681
1682                /// Checks that search in the empty buffer returns `None`
1683                #[test]
1684                fn empty() {
1685                    let buf = $buf;
1686                    let mut position = 0;
1687                    let mut input = b"".as_ref();
1688                    //                ^= 0
1689
1690                    assert_eq!(
1691                        input
1692                            .read_bytes_until(b'*', buf, &mut position)
1693                            .unwrap()
1694                            .map(Bytes),
1695                        None
1696                    );
1697                    assert_eq!(position, 0);
1698                }
1699
1700                /// Checks that search in the buffer non-existent value returns entire buffer
1701                /// as a result and set `position` to `len()`
1702                #[test]
1703                fn non_existent() {
1704                    let buf = $buf;
1705                    let mut position = 0;
1706                    let mut input = b"abcdef".as_ref();
1707                    //                      ^= 6
1708
1709                    assert_eq!(
1710                        input
1711                            .read_bytes_until(b'*', buf, &mut position)
1712                            .unwrap()
1713                            .map(Bytes),
1714                        Some(Bytes(b"abcdef"))
1715                    );
1716                    assert_eq!(position, 6);
1717                }
1718
1719                /// Checks that search in the buffer an element that is located in the front of
1720                /// buffer returns empty slice as a result and set `position` to one symbol
1721                /// after match (`1`)
1722                #[test]
1723                fn at_the_start() {
1724                    let buf = $buf;
1725                    let mut position = 0;
1726                    let mut input = b"*abcdef".as_ref();
1727                    //                 ^= 1
1728
1729                    assert_eq!(
1730                        input
1731                            .read_bytes_until(b'*', buf, &mut position)
1732                            .unwrap()
1733                            .map(Bytes),
1734                        Some(Bytes(b""))
1735                    );
1736                    assert_eq!(position, 1); // position after the symbol matched
1737                }
1738
1739                /// Checks that search in the buffer an element that is located in the middle of
1740                /// buffer returns slice before that symbol as a result and set `position` to one
1741                /// symbol after match
1742                #[test]
1743                fn inside() {
1744                    let buf = $buf;
1745                    let mut position = 0;
1746                    let mut input = b"abc*def".as_ref();
1747                    //                    ^= 4
1748
1749                    assert_eq!(
1750                        input
1751                            .read_bytes_until(b'*', buf, &mut position)
1752                            .unwrap()
1753                            .map(Bytes),
1754                        Some(Bytes(b"abc"))
1755                    );
1756                    assert_eq!(position, 4); // position after the symbol matched
1757                }
1758
1759                /// Checks that search in the buffer an element that is located in the end of
1760                /// buffer returns slice before that symbol as a result and set `position` to one
1761                /// symbol after match (`len()`)
1762                #[test]
1763                fn in_the_end() {
1764                    let buf = $buf;
1765                    let mut position = 0;
1766                    let mut input = b"abcdef*".as_ref();
1767                    //                       ^= 7
1768
1769                    assert_eq!(
1770                        input
1771                            .read_bytes_until(b'*', buf, &mut position)
1772                            .unwrap()
1773                            .map(Bytes),
1774                        Some(Bytes(b"abcdef"))
1775                    );
1776                    assert_eq!(position, 7); // position after the symbol matched
1777                }
1778            }
1779
1780            mod read_bang_element {
1781                /// Checks that reading CDATA content works correctly
1782                mod cdata {
1783                    use crate::errors::Error;
1784                    use crate::reader::{BangType, XmlSource};
1785                    use crate::utils::Bytes;
1786                    use pretty_assertions::assert_eq;
1787
1788                    /// Checks that if input begins like CDATA element, but CDATA start sequence
1789                    /// is not finished, parsing ends with an error
1790                    #[test]
1791                    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1792                    fn not_properly_start() {
1793                        let buf = $buf;
1794                        let mut position = 0;
1795                        let mut input = b"![]]>other content".as_ref();
1796                        //                ^= 0
1797
1798                        match input.read_bang_element(buf, &mut position) {
1799                            Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1800                            x => assert!(
1801                                false,
1802                                r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1803                                x
1804                            ),
1805                        }
1806                        assert_eq!(position, 0);
1807                    }
1808
1809                    /// Checks that if CDATA startup sequence was matched, but an end sequence
1810                    /// is not found, parsing ends with an error
1811                    #[test]
1812                    fn not_closed() {
1813                        let buf = $buf;
1814                        let mut position = 0;
1815                        let mut input = b"![CDATA[other content".as_ref();
1816                        //                ^= 0
1817
1818                        match input.read_bang_element(buf, &mut position) {
1819                            Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1820                            x => assert!(
1821                                false,
1822                                r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1823                                x
1824                            ),
1825                        }
1826                        assert_eq!(position, 0);
1827                    }
1828
1829                    /// Checks that CDATA element without content inside parsed successfully
1830                    #[test]
1831                    fn empty() {
1832                        let buf = $buf;
1833                        let mut position = 0;
1834                        let mut input = b"![CDATA[]]>other content".as_ref();
1835                        //                           ^= 11
1836
1837                        assert_eq!(
1838                            input
1839                                .read_bang_element(buf, &mut position)
1840                                .unwrap()
1841                                .map(|(ty, data)| (ty, Bytes(data))),
1842                            Some((BangType::CData, Bytes(b"![CDATA[")))
1843                        );
1844                        assert_eq!(position, 11);
1845                    }
1846
1847                    /// Checks that CDATA element with content parsed successfully.
1848                    /// Additionally checks that sequences inside CDATA that may look like
1849                    /// a CDATA end sequence do not interrupt CDATA parsing
1850                    #[test]
1851                    fn with_content() {
1852                        let buf = $buf;
1853                        let mut position = 0;
1854                        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1855                        //                                            ^= 28
1856
1857                        assert_eq!(
1858                            input
1859                                .read_bang_element(buf, &mut position)
1860                                .unwrap()
1861                                .map(|(ty, data)| (ty, Bytes(data))),
1862                            Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
1863                        );
1864                        assert_eq!(position, 28);
1865                    }
1866                }
1867
1868                /// Checks that reading XML comments works correctly. According to the [specification],
1869                /// comment data can contain any sequence except `--`:
1870                ///
1871                /// ```peg
1872                /// comment = '<--' (!'--' char)* '-->';
1873                /// char = [#x1-#x2C]
1874                ///      / [#x2E-#xD7FF]
1875                ///      / [#xE000-#xFFFD]
1876                ///      / [#x10000-#x10FFFF]
1877                /// ```
1878                ///
1879                /// The presence of this limitation, however, is simply a poorly designed specification
1880                /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1881                /// presence of these sequences by default. This tests allow such content.
1882                ///
1883                /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1884                mod comment {
1885                    use crate::errors::Error;
1886                    use crate::reader::{BangType, XmlSource};
1887                    use crate::utils::Bytes;
1888                    use pretty_assertions::assert_eq;
1889
1890                    #[test]
1891                    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1892                    fn not_properly_start() {
1893                        let buf = $buf;
1894                        let mut position = 0;
1895                        let mut input = b"!- -->other content".as_ref();
1896                        //                ^= 0
1897
1898                        match input.read_bang_element(buf, &mut position) {
1899                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1900                            x => assert!(
1901                                false,
1902                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1903                                x
1904                            ),
1905                        }
1906                        assert_eq!(position, 0);
1907                    }
1908
1909                    #[test]
1910                    fn not_properly_end() {
1911                        let buf = $buf;
1912                        let mut position = 0;
1913                        let mut input = b"!->other content".as_ref();
1914                        //                ^= 0
1915
1916                        match input.read_bang_element(buf, &mut position) {
1917                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1918                            x => assert!(
1919                                false,
1920                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1921                                x
1922                            ),
1923                        }
1924                        assert_eq!(position, 0);
1925                    }
1926
1927                    #[test]
1928                    fn not_closed1() {
1929                        let buf = $buf;
1930                        let mut position = 0;
1931                        let mut input = b"!--other content".as_ref();
1932                        //                ^= 0
1933
1934                        match input.read_bang_element(buf, &mut position) {
1935                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1936                            x => assert!(
1937                                false,
1938                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1939                                x
1940                            ),
1941                        }
1942                        assert_eq!(position, 0);
1943                    }
1944
1945                    #[test]
1946                    fn not_closed2() {
1947                        let buf = $buf;
1948                        let mut position = 0;
1949                        let mut input = b"!-->other content".as_ref();
1950                        //                ^= 0
1951
1952                        match input.read_bang_element(buf, &mut position) {
1953                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1954                            x => assert!(
1955                                false,
1956                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1957                                x
1958                            ),
1959                        }
1960                        assert_eq!(position, 0);
1961                    }
1962
1963                    #[test]
1964                    fn not_closed3() {
1965                        let buf = $buf;
1966                        let mut position = 0;
1967                        let mut input = b"!--->other content".as_ref();
1968                        //                ^= 0
1969
1970                        match input.read_bang_element(buf, &mut position) {
1971                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1972                            x => assert!(
1973                                false,
1974                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1975                                x
1976                            ),
1977                        }
1978                        assert_eq!(position, 0);
1979                    }
1980
1981                    #[test]
1982                    fn empty() {
1983                        let buf = $buf;
1984                        let mut position = 0;
1985                        let mut input = b"!---->other content".as_ref();
1986                        //                      ^= 6
1987
1988                        assert_eq!(
1989                            input
1990                                .read_bang_element(buf, &mut position)
1991                                .unwrap()
1992                                .map(|(ty, data)| (ty, Bytes(data))),
1993                            Some((BangType::Comment, Bytes(b"!----")))
1994                        );
1995                        assert_eq!(position, 6);
1996                    }
1997
1998                    #[test]
1999                    fn with_content() {
2000                        let buf = $buf;
2001                        let mut position = 0;
2002                        let mut input = b"!--->comment<--->other content".as_ref();
2003                        //                                 ^= 17
2004
2005                        assert_eq!(
2006                            input
2007                                .read_bang_element(buf, &mut position)
2008                                .unwrap()
2009                                .map(|(ty, data)| (ty, Bytes(data))),
2010                            Some((BangType::Comment, Bytes(b"!--->comment<---")))
2011                        );
2012                        assert_eq!(position, 17);
2013                    }
2014                }
2015
2016                /// Checks that reading DOCTYPE definition works correctly
2017                mod doctype {
2018                    mod uppercase {
2019                        use crate::errors::Error;
2020                        use crate::reader::{BangType, XmlSource};
2021                        use crate::utils::Bytes;
2022                        use pretty_assertions::assert_eq;
2023
2024                        #[test]
2025                        fn not_properly_start() {
2026                            let buf = $buf;
2027                            let mut position = 0;
2028                            let mut input = b"!D other content".as_ref();
2029                            //                ^= 0
2030
2031                            match input.read_bang_element(buf, &mut position) {
2032                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2033                                x => assert!(
2034                                    false,
2035                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2036                                    x
2037                                ),
2038                            }
2039                            assert_eq!(position, 0);
2040                        }
2041
2042                        #[test]
2043                        fn without_space() {
2044                            let buf = $buf;
2045                            let mut position = 0;
2046                            let mut input = b"!DOCTYPEother content".as_ref();
2047                            //                ^= 0
2048
2049                            match input.read_bang_element(buf, &mut position) {
2050                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2051                                x => assert!(
2052                                    false,
2053                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2054                                    x
2055                                ),
2056                            }
2057                            assert_eq!(position, 0);
2058                        }
2059
2060                        #[test]
2061                        fn empty() {
2062                            let buf = $buf;
2063                            let mut position = 0;
2064                            let mut input = b"!DOCTYPE>other content".as_ref();
2065                            //                         ^= 9
2066
2067                            assert_eq!(
2068                                input
2069                                    .read_bang_element(buf, &mut position)
2070                                    .unwrap()
2071                                    .map(|(ty, data)| (ty, Bytes(data))),
2072                                Some((BangType::DocType, Bytes(b"!DOCTYPE")))
2073                            );
2074                            assert_eq!(position, 9);
2075                        }
2076
2077                        #[test]
2078                        fn not_closed() {
2079                            let buf = $buf;
2080                            let mut position = 0;
2081                            let mut input = b"!DOCTYPE other content".as_ref();
2082                            //                ^= 0
2083
2084                            match input.read_bang_element(buf, &mut position) {
2085                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2086                                x => assert!(
2087                                    false,
2088                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2089                                    x
2090                                ),
2091                            }
2092                            assert_eq!(position, 0);
2093                        }
2094                    }
2095
2096                    mod lowercase {
2097                        use crate::errors::Error;
2098                        use crate::reader::{BangType, XmlSource};
2099                        use crate::utils::Bytes;
2100                        use pretty_assertions::assert_eq;
2101
2102                        #[test]
2103                        fn not_properly_start() {
2104                            let buf = $buf;
2105                            let mut position = 0;
2106                            let mut input = b"!d other content".as_ref();
2107                            //                ^= 0
2108
2109                            match input.read_bang_element(buf, &mut position) {
2110                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2111                                x => assert!(
2112                                    false,
2113                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2114                                    x
2115                                ),
2116                            }
2117                            assert_eq!(position, 0);
2118                        }
2119
2120                        #[test]
2121                        fn without_space() {
2122                            let buf = $buf;
2123                            let mut position = 0;
2124                            let mut input = b"!doctypeother content".as_ref();
2125                            //                ^= 0
2126
2127                            match input.read_bang_element(buf, &mut position) {
2128                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2129                                x => assert!(
2130                                    false,
2131                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2132                                    x
2133                                ),
2134                            }
2135                            assert_eq!(position, 0);
2136                        }
2137
2138                        #[test]
2139                        fn empty() {
2140                            let buf = $buf;
2141                            let mut position = 0;
2142                            let mut input = b"!doctype>other content".as_ref();
2143                            //                         ^= 9
2144
2145                            assert_eq!(
2146                                input
2147                                    .read_bang_element(buf, &mut position)
2148                                    .unwrap()
2149                                    .map(|(ty, data)| (ty, Bytes(data))),
2150                                Some((BangType::DocType, Bytes(b"!doctype")))
2151                            );
2152                            assert_eq!(position, 9);
2153                        }
2154
2155                        #[test]
2156                        fn not_closed() {
2157                            let buf = $buf;
2158                            let mut position = 0;
2159                            let mut input = b"!doctype other content".as_ref();
2160                            //                ^= 0
2161
2162                            match input.read_bang_element(buf, &mut position) {
2163                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2164                                x => assert!(
2165                                    false,
2166                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2167                                    x
2168                                ),
2169                            }
2170                            assert_eq!(position, 0);
2171                        }
2172                    }
2173                }
2174            }
2175
2176            mod read_element {
2177                use crate::reader::XmlSource;
2178                use crate::utils::Bytes;
2179                use pretty_assertions::assert_eq;
2180
2181                /// Checks that nothing was read from empty buffer
2182                #[test]
2183                fn empty() {
2184                    let buf = $buf;
2185                    let mut position = 0;
2186                    let mut input = b"".as_ref();
2187                    //                ^= 0
2188
2189                    assert_eq!(input.read_element(buf, &mut position).unwrap().map(Bytes), None);
2190                    assert_eq!(position, 0);
2191                }
2192
2193                mod open {
2194                    use crate::reader::XmlSource;
2195                    use crate::utils::Bytes;
2196                    use pretty_assertions::assert_eq;
2197
2198                    #[test]
2199                    fn empty_tag() {
2200                        let buf = $buf;
2201                        let mut position = 0;
2202                        let mut input = b">".as_ref();
2203                        //                 ^= 1
2204
2205                        assert_eq!(
2206                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2207                            Some(Bytes(b""))
2208                        );
2209                        assert_eq!(position, 1);
2210                    }
2211
2212                    #[test]
2213                    fn normal() {
2214                        let buf = $buf;
2215                        let mut position = 0;
2216                        let mut input = b"tag>".as_ref();
2217                        //                    ^= 4
2218
2219                        assert_eq!(
2220                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2221                            Some(Bytes(b"tag"))
2222                        );
2223                        assert_eq!(position, 4);
2224                    }
2225
2226                    #[test]
2227                    fn empty_ns_empty_tag() {
2228                        let buf = $buf;
2229                        let mut position = 0;
2230                        let mut input = b":>".as_ref();
2231                        //                  ^= 2
2232
2233                        assert_eq!(
2234                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2235                            Some(Bytes(b":"))
2236                        );
2237                        assert_eq!(position, 2);
2238                    }
2239
2240                    #[test]
2241                    fn empty_ns() {
2242                        let buf = $buf;
2243                        let mut position = 0;
2244                        let mut input = b":tag>".as_ref();
2245                        //                     ^= 5
2246
2247                        assert_eq!(
2248                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2249                            Some(Bytes(b":tag"))
2250                        );
2251                        assert_eq!(position, 5);
2252                    }
2253
2254                    #[test]
2255                    fn with_attributes() {
2256                        let buf = $buf;
2257                        let mut position = 0;
2258                        let mut input = br#"tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
2259                        //                                                        ^= 38
2260
2261                        assert_eq!(
2262                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2263                            Some(Bytes(br#"tag  attr-1=">"  attr2  =  '>'  3attr"#))
2264                        );
2265                        assert_eq!(position, 38);
2266                    }
2267                }
2268
2269                mod self_closed {
2270                    use crate::reader::XmlSource;
2271                    use crate::utils::Bytes;
2272                    use pretty_assertions::assert_eq;
2273
2274                    #[test]
2275                    fn empty_tag() {
2276                        let buf = $buf;
2277                        let mut position = 0;
2278                        let mut input = b"/>".as_ref();
2279                        //                  ^= 2
2280
2281                        assert_eq!(
2282                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2283                            Some(Bytes(b"/"))
2284                        );
2285                        assert_eq!(position, 2);
2286                    }
2287
2288                    #[test]
2289                    fn normal() {
2290                        let buf = $buf;
2291                        let mut position = 0;
2292                        let mut input = b"tag/>".as_ref();
2293                        //                     ^= 5
2294
2295                        assert_eq!(
2296                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2297                            Some(Bytes(b"tag/"))
2298                        );
2299                        assert_eq!(position, 5);
2300                    }
2301
2302                    #[test]
2303                    fn empty_ns_empty_tag() {
2304                        let buf = $buf;
2305                        let mut position = 0;
2306                        let mut input = b":/>".as_ref();
2307                        //                   ^= 3
2308
2309                        assert_eq!(
2310                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2311                            Some(Bytes(b":/"))
2312                        );
2313                        assert_eq!(position, 3);
2314                    }
2315
2316                    #[test]
2317                    fn empty_ns() {
2318                        let buf = $buf;
2319                        let mut position = 0;
2320                        let mut input = b":tag/>".as_ref();
2321                        //                      ^= 6
2322
2323                        assert_eq!(
2324                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2325                            Some(Bytes(b":tag/"))
2326                        );
2327                        assert_eq!(position, 6);
2328                    }
2329
2330                    #[test]
2331                    fn with_attributes() {
2332                        let buf = $buf;
2333                        let mut position = 0;
2334                        let mut input = br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/>"#.as_ref();
2335                        //                                                           ^= 41
2336
2337                        assert_eq!(
2338                            input.read_element(buf, &mut position).unwrap().map(Bytes),
2339                            Some(Bytes(br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/"#))
2340                        );
2341                        assert_eq!(position, 41);
2342                    }
2343                }
2344            }
2345
2346            mod issue_344 {
2347                use crate::errors::Error;
2348
2349                #[test]
2350                fn cdata() {
2351                    let doc = "![]]>";
2352                    let mut reader = crate::Reader::from_str(doc);
2353
2354                    match reader.read_until_close($buf) {
2355                        Err(Error::UnexpectedEof(s)) if s == "CData" => {}
2356                        x => assert!(
2357                            false,
2358                            r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
2359                            x
2360                        ),
2361                    }
2362                }
2363
2364                #[test]
2365                fn comment() {
2366                    let doc = "!- -->";
2367                    let mut reader = crate::Reader::from_str(doc);
2368
2369                    match reader.read_until_close($buf) {
2370                        Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
2371                        x => assert!(
2372                            false,
2373                            r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
2374                            x
2375                        ),
2376                    }
2377                }
2378
2379                #[test]
2380                fn doctype_uppercase() {
2381                    let doc = "!D>";
2382                    let mut reader = crate::Reader::from_str(doc);
2383
2384                    match reader.read_until_close($buf) {
2385                        Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2386                        x => assert!(
2387                            false,
2388                            r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2389                            x
2390                        ),
2391                    }
2392                }
2393
2394                #[test]
2395                fn doctype_lowercase() {
2396                    let doc = "!d>";
2397                    let mut reader = crate::Reader::from_str(doc);
2398
2399                    match reader.read_until_close($buf) {
2400                        Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2401                        x => assert!(
2402                            false,
2403                            r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2404                            x
2405                        ),
2406                    }
2407                }
2408            }
2409        };
2410    }
2411
2412    /// Tests for reader that generates events that borrow from the provided buffer
2413    mod buffered {
2414        check!(&mut Vec::new());
2415    }
2416
2417    /// Tests for reader that generates events that borrow from the input
2418    mod borrowed {
2419        check!(());
2420    }
2421}
fast_xml/reader.rs

fast_xml/
reader.rs