Skip to main content

quick_xml/reader/
slice_reader.rs

//! This is an implementation of [`Reader`] for reading from a `&[u8]` as the
//! underlying byte stream. This implementation does not need an intermediate
//! buffer, because events can borrow directly from the byte slice itself.
4
5use std::io;
6
7#[cfg(feature = "encoding")]
8use crate::encoding::DetectedEncoding;
9#[cfg(feature = "encoding")]
10use crate::reader::EncodingRef;
11#[cfg(feature = "encoding")]
12use encoding_rs;
13
14use crate::errors::{Error, Result};
15use crate::events::{BytesText, Event};
16use crate::name::QName;
17use crate::parser::Parser;
18use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
19use crate::utils::is_whitespace;
20
21/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
22/// This implementation supports not using an intermediate buffer as the byte slice
23/// itself can be used to borrow from.
24impl<'a> Reader<&'a [u8]> {
25    /// Creates an XML reader from a string slice.
26    #[allow(clippy::should_implement_trait)]
27    pub fn from_str(s: &'a str) -> Self {
28        // Rust strings are guaranteed to be UTF-8, so lock the encoding
29        #[cfg(feature = "encoding")]
30        {
31            let mut reader = Self::from_reader(s.as_bytes());
32            reader.state.encoding = EncodingRef::Explicit(encoding_rs::UTF_8);
33            reader
34        }
35
36        #[cfg(not(feature = "encoding"))]
37        Self::from_reader(s.as_bytes())
38    }
39
40    /// Read an event that borrows from the input rather than a buffer.
41    ///
42    /// There is no asynchronous `read_event_async()` version of this function,
43    /// because it is not necessary -- the contents are already in memory and no IO
44    /// is needed, therefore there is no potential for blocking.
45    ///
46    /// # Examples
47    ///
48    /// ```
49    /// # use pretty_assertions::assert_eq;
50    /// use quick_xml::events::Event;
51    /// use quick_xml::reader::Reader;
52    ///
53    /// let mut reader = Reader::from_str(r#"
54    ///     <tag1 att1 = "test">
55    ///        <tag2><!--Test comment-->Test</tag2>
56    ///        <tag2>Test 2</tag2>
57    ///     </tag1>
58    /// "#);
59    /// reader.config_mut().trim_text(true);
60    ///
61    /// let mut count = 0;
62    /// let mut txt = Vec::new();
63    /// loop {
64    ///     match reader.read_event().unwrap() {
65    ///         Event::Start(e) => count += 1,
66    ///         Event::Text(e) => txt.push(e.decode().unwrap().into_owned()),
67    ///         Event::Eof => break,
68    ///         _ => (),
69    ///     }
70    /// }
71    /// assert_eq!(count, 3);
72    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
73    /// ```
74    #[inline]
75    pub fn read_event(&mut self) -> Result<Event<'a>> {
76        self.read_event_impl(())
77    }
78
79    /// Reads until end element is found. This function is supposed to be called
80    /// after you already read a [`Start`] event.
81    ///
82    /// Returns a span that cover content between `>` of an opening tag and `<` of
83    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
84    /// this method was called after reading expanded [`Start`] event.
85    ///
86    /// Manages nested cases where parent and child elements have the _literally_
87    /// same name.
88    ///
89    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
90    /// will be returned. In particularly, that error will be returned if you call
91    /// this method without consuming the corresponding [`Start`] event first.
92    ///
93    /// The `end` parameter should contain name of the end element _in the reader
94    /// encoding_. It is good practice to always get that parameter using
95    /// [`BytesStart::to_end()`] method.
96    ///
97    /// The correctness of the skipped events does not checked, if you disabled
98    /// the [`check_end_names`] option.
99    ///
100    /// There is no asynchronous `read_to_end_async()` version of this function,
101    /// because it is not necessary -- the contents are already in memory and no IO
102    /// is needed, therefore there is no potential for blocking.
103    ///
104    /// # Namespaces
105    ///
106    /// While the `Reader` does not support namespace resolution, namespaces
107    /// does not change the algorithm for comparing names. Although the names
108    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
109    /// same namespace, are semantically equivalent, `</b:name>` cannot close
110    /// `<a:name>`, because according to [the specification]
111    ///
112    /// > The end of every element that begins with a **start-tag** MUST be marked
113    /// > by an **end-tag** containing a name that echoes the element's type as
114    /// > given in the **start-tag**
115    ///
116    /// # Examples
117    ///
118    /// This example shows, how you can skip XML content after you read the
119    /// start event.
120    ///
121    /// ```
122    /// # use pretty_assertions::assert_eq;
123    /// use quick_xml::events::{BytesStart, Event};
124    /// use quick_xml::reader::Reader;
125    ///
126    /// let mut reader = Reader::from_str(r#"
127    ///     <outer>
128    ///         <inner>
129    ///             <inner></inner>
130    ///             <inner/>
131    ///             <outer></outer>
132    ///             <outer/>
133    ///         </inner>
134    ///     </outer>
135    /// "#);
136    /// reader.config_mut().trim_text(true);
137    ///
138    /// let start = BytesStart::new("outer");
139    /// let end   = start.to_end().into_owned();
140    ///
141    /// // First, we read a start event...
142    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
143    ///
144    /// // ...then, we could skip all events to the corresponding end event.
145    /// // This call will correctly handle nested <outer> elements.
146    /// // Note, however, that this method does not handle namespaces.
147    /// reader.read_to_end(end.name()).unwrap();
148    ///
149    /// // At the end we should get an Eof event, because we ate the whole XML
150    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
151    /// ```
152    ///
153    /// [`Start`]: Event::Start
154    /// [`End`]: Event::End
155    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
156    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
157    /// [`check_end_names`]: crate::reader::Config::check_end_names
158    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
159    pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
160        Ok(read_to_end!(self, end, (), read_event_impl, {}))
161    }
162
163    /// Reads content between start and end tags, including any markup. This
164    /// function is supposed to be called after you already read a [`Start`] event.
165    ///
166    /// Manages nested cases where parent and child elements have the _literally_
167    /// same name.
168    ///
169    /// This method does not unescape read data, instead it returns content
170    /// "as is" of the XML document. This is because it has no idea what text
171    /// it reads, and if, for example, it contains CDATA section, attempt to
172    /// unescape it content will spoil data.
173    ///
174    /// Any text will be decoded using the XML current [`decoder()`].
175    ///
176    /// Actually, this method perform the following code:
177    ///
178    /// ```ignore
179    /// let span = reader.read_to_end(end)?;
180    /// let text = reader.decoder().decode(&reader.inner_slice[span]);
181    /// ```
182    ///
183    /// # Examples
184    ///
185    /// This example shows, how you can read a HTML content from your XML document.
186    ///
187    /// ```
188    /// # use pretty_assertions::assert_eq;
189    /// # use std::borrow::Cow;
190    /// use quick_xml::events::{BytesStart, Event};
191    /// use quick_xml::reader::Reader;
192    ///
193    /// let mut reader = Reader::from_str("
194    ///     <html>
195    ///         <title>This is a HTML text</title>
196    ///         <p>Usual XML rules does not apply inside it
197    ///         <p>For example, elements not needed to be &quot;closed&quot;
198    ///     </html>
199    /// ");
200    /// reader.config_mut().trim_text(true);
201    ///
202    /// let start = BytesStart::new("html");
203    /// let end   = start.to_end().into_owned();
204    ///
205    /// // First, we read a start event...
206    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
207    /// // ...and disable checking of end names because we expect HTML further...
208    /// reader.config_mut().check_end_names = false;
209    ///
210    /// // ...then, we could read text content until close tag.
211    /// // This call will correctly handle nested <html> elements.
212    /// let text = reader.read_text(end.name()).unwrap();
213    /// let text = text.decode().unwrap();
214    /// assert_eq!(text, r#"
215    ///         <title>This is a HTML text</title>
216    ///         <p>Usual XML rules does not apply inside it
217    ///         <p>For example, elements not needed to be &quot;closed&quot;
218    ///     "#);
219    /// assert!(matches!(text, Cow::Borrowed(_)));
220    ///
221    /// // Now we can enable checks again
222    /// reader.config_mut().check_end_names = true;
223    ///
224    /// // At the end we should get an Eof event, because we ate the whole XML
225    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
226    /// ```
227    ///
228    /// [`Start`]: Event::Start
229    /// [`decoder()`]: Self::decoder()
230    pub fn read_text(&mut self, end: QName) -> Result<BytesText<'a>> {
231        // self.reader will be changed, so store original reference
232        let buffer = self.reader;
233        let span = self.read_to_end(end)?;
234
235        let len = span.end - span.start;
236        // SAFETY: `span` can only contain indexes up to usize::MAX because it
237        // was created from offsets from a single &[u8] slice
238        Ok(BytesText::wrap(&buffer[0..len as usize], self.decoder()))
239    }
240}
241
242////////////////////////////////////////////////////////////////////////////////////////////////////
243
244/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
245/// that will be borrowed by events. This implementation provides a zero-copy deserialization
246impl<'a> XmlSource<'a, ()> for &'a [u8] {
247    #[cfg(not(feature = "encoding"))]
248    #[inline]
249    fn remove_utf8_bom(&mut self) -> io::Result<()> {
250        if self.starts_with(crate::encoding::UTF8_BOM) {
251            *self = &self[crate::encoding::UTF8_BOM.len()..];
252        }
253        Ok(())
254    }
255
256    #[cfg(feature = "encoding")]
257    #[inline]
258    fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>> {
259        if let Some(detected) = crate::encoding::detect_encoding(self) {
260            *self = &self[detected.bom_len() as usize..];
261            return Ok(Some(detected));
262        }
263        Ok(None)
264    }
265
266    #[inline]
267    fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
268        // Search for start of markup or an entity or character reference
269        match memchr::memchr2(b'<', b'&', self) {
270            Some(0) if self[0] == b'<' => ReadTextResult::Markup(()),
271            // Do not consume `&` because it may be lone and we would be need to
272            // return it as part of Text event
273            Some(0) => ReadTextResult::Ref(()),
274            Some(i) if self[i] == b'<' => {
275                let (bytes, rest) = self.split_at(i);
276                *self = rest;
277                *position += i as u64;
278                ReadTextResult::UpToMarkup(bytes)
279            }
280            Some(i) => {
281                let (bytes, rest) = self.split_at(i);
282                *self = rest;
283                *position += i as u64;
284                ReadTextResult::UpToRef(bytes)
285            }
286            None => {
287                let bytes = &self[..];
288                *self = &[];
289                *position += bytes.len() as u64;
290                ReadTextResult::UpToEof(bytes)
291            }
292        }
293    }
294
295    #[inline]
296    fn read_ref(&mut self, _buf: (), position: &mut u64) -> ReadRefResult<'a> {
297        debug_assert!(
298            self.starts_with(b"&"),
299            "`read_ref` must be called at `&`:\n{:?}",
300            crate::utils::Bytes(self)
301        );
302        // Search for the end of reference or a start of another reference or a markup
303        match memchr::memchr3(b';', b'&', b'<', &self[1..]) {
304            Some(i) if self[i + 1] == b';' => {
305                // +1 for the start `&`
306                // +1 for the end `;`
307                let end = i + 2;
308                let (bytes, rest) = self.split_at(end);
309                *self = rest;
310                *position += end as u64;
311
312                ReadRefResult::Ref(bytes)
313            }
314            // Do not consume `&` because it may be lone and we would be need to
315            // return it as part of Text event
316            Some(i) => {
317                let is_amp = self[i + 1] == b'&';
318                let (bytes, rest) = self.split_at(i + 1);
319                *self = rest;
320                *position += i as u64 + 1;
321
322                if is_amp {
323                    ReadRefResult::UpToRef(bytes)
324                } else {
325                    ReadRefResult::UpToMarkup(bytes)
326                }
327            }
328            None => {
329                let bytes = &self[..];
330                *self = &[];
331                *position += bytes.len() as u64;
332
333                ReadRefResult::UpToEof(bytes)
334            }
335        }
336    }
337
338    #[inline]
339    fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
340    where
341        P: Parser,
342    {
343        if let Some(i) = parser.feed(self) {
344            let used = i + 1; // +1 for `>`
345            *position += used as u64;
346            let (bytes, rest) = self.split_at(used);
347            *self = rest;
348            return Ok(bytes);
349        }
350
351        *position += self.len() as u64;
352        Err(Error::Syntax(parser.eof_error(self)))
353    }
354
355    #[inline]
356    fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
357        // Peeked one bang ('!') before being called, so it's guaranteed to
358        // start with it.
359        debug_assert!(
360            self.starts_with(b"<!"),
361            "`read_bang_element` must be called at `<!`:\n{:?}",
362            crate::utils::Bytes(self)
363        );
364
365        let mut bang_type = BangType::new(self.get(2).copied())?;
366
367        if let Some(i) = bang_type.feed(&[], self) {
368            let consumed = i + 1; // +1 for `>`
369            *position += consumed as u64;
370            let (bytes, rest) = self.split_at(consumed);
371            *self = rest;
372            return Ok((bang_type, bytes));
373        }
374
375        *position += self.len() as u64;
376        Err(Error::Syntax(bang_type.to_err()))
377    }
378
379    #[inline]
380    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
381        let whitespaces = self
382            .iter()
383            .position(|b| !is_whitespace(*b))
384            .unwrap_or(self.len());
385        *position += whitespaces as u64;
386        *self = &self[whitespaces..];
387        Ok(())
388    }
389
390    #[inline]
391    fn peek_one(&mut self) -> io::Result<Option<u8>> {
392        debug_assert!(
393            self.starts_with(b"<"),
394            "markup must start from '<':\n{:?}",
395            crate::utils::Bytes(self)
396        );
397        Ok(self.get(1).copied())
398    }
399}
400
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the value received from the test
    /// back unchanged.
    fn identity<B>(buffer: B) -> B {
        buffer
    }

    // Instantiates the tests generated by the `check!` macro for the slice
    // reader, with `identity` as the buffer constructor.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        0,
        ()
    );
}