Skip to main content

quick_xml/reader/
buffered_reader.rs

//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
//! the underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::{BytesText, Event};
10use crate::name::QName;
11use crate::parser::Parser;
12use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
13use crate::utils::is_whitespace;
14
15macro_rules! impl_buffered_source {
16    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
17        #[cfg(not(feature = "encoding"))]
18        #[inline]
19        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
20            use crate::encoding::UTF8_BOM;
21
22            loop {
23                break match self $(.$reader)? .fill_buf() $(.$await)? {
24                    Ok(n) => {
25                        if n.starts_with(UTF8_BOM) {
26                            self $(.$reader)? .consume(UTF8_BOM.len());
27                        }
28                        Ok(())
29                    },
30                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
31                    Err(e) => Err(e),
32                };
33            }
34        }
35
36        #[cfg(feature = "encoding")]
37        #[inline]
38        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
39            loop {
40                break match self $(.$reader)? .fill_buf() $(.$await)? {
41                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
42                        self $(.$reader)? .consume(bom_len);
43                        Ok(Some(enc))
44                    } else {
45                        Ok(None)
46                    },
47                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
48                    Err(e) => Err(e),
49                };
50            }
51        }
52
53        #[inline]
54        $($async)? fn read_text $(<$lf>)? (
55            &mut self,
56            buf: &'b mut Vec<u8>,
57            position: &mut u64,
58        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
59            let mut read = 0;
60            let start = buf.len();
61            loop {
62                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
63                    Ok(n) if n.is_empty() => break,
64                    Ok(n) => n,
65                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
66                    Err(e) => {
67                        *position += read;
68                        return ReadTextResult::Err(e);
69                    }
70                };
71
72                // Search for start of markup or an entity or character reference
73                match memchr::memchr2(b'<', b'&', available) {
74                    // Special handling is needed only on the first iteration.
75                    // On next iterations we already read something and should emit Text event
76                    Some(0) if read == 0 && available[0] == b'<' => return ReadTextResult::Markup(buf),
77                    // Do not consume `&` because it may be lone and we would be need to
78                    // return it as part of Text event
79                    Some(0) if read == 0 => return ReadTextResult::Ref(buf),
80                    Some(i) if available[i] == b'<' => {
81                        buf.extend_from_slice(&available[..i]);
82
83                        self $(.$reader)? .consume(i);
84                        read += i as u64;
85
86                        *position += read;
87                        return ReadTextResult::UpToMarkup(&buf[start..]);
88                    }
89                    Some(i) => {
90                        buf.extend_from_slice(&available[..i]);
91
92                        self $(.$reader)? .consume(i);
93                        read += i as u64;
94
95                        *position += read;
96                        return ReadTextResult::UpToRef(&buf[start..]);
97                    }
98                    None => {
99                        buf.extend_from_slice(available);
100
101                        let used = available.len();
102                        self $(.$reader)? .consume(used);
103                        read += used as u64;
104                    }
105                }
106            }
107
108            *position += read;
109            ReadTextResult::UpToEof(&buf[start..])
110        }
111
112        #[inline]
113        $($async)? fn read_ref $(<$lf>)? (
114            &mut self,
115            buf: &'b mut Vec<u8>,
116            position: &mut u64,
117        ) -> ReadRefResult<'b> {
118            let mut read = 0;
119            let start = buf.len();
120            loop {
121                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
122                    Ok(n) if n.is_empty() => break,
123                    Ok(n) => n,
124                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
125                    Err(e) => {
126                        *position += read;
127                        return ReadRefResult::Err(e);
128                    }
129                };
130                // `read_ref` called when the first character is `&`, so we
131                // should explicitly skip it at first iteration lest we confuse
132                // it with the end
133                if read == 0 {
134                    debug_assert!(
135                        available.starts_with(b"&"),
136                        "`read_ref` must be called at `&`:\n{:?}",
137                        crate::utils::Bytes(available)
138                    );
139                    // If that ampersand is lone, then it will be part of text
140                    // and we should keep it
141                    buf.push(b'&');
142                    self $(.$reader)? .consume(1);
143                    read += 1;
144                    continue;
145                }
146
147                match memchr::memchr3(b';', b'&', b'<', available) {
148                    Some(i) if available[i] == b';' => {
149                        // +1 -- skip the end `;`
150                        let used = i + 1;
151
152                        buf.extend_from_slice(&available[..used]);
153                        self $(.$reader)? .consume(used);
154                        read += used as u64;
155
156                        *position += read;
157
158                        return ReadRefResult::Ref(&buf[start..]);
159                    }
160                    // Do not consume `&` because it may be lone and we would be need to
161                    // return it as part of Text event
162                    Some(i) => {
163                        let is_amp = available[i] == b'&';
164                        buf.extend_from_slice(&available[..i]);
165
166                        self $(.$reader)? .consume(i);
167                        read += i as u64;
168
169                        *position += read;
170
171                        return if is_amp {
172                            ReadRefResult::UpToRef(&buf[start..])
173                        } else {
174                            ReadRefResult::UpToMarkup(&buf[start..])
175                        };
176                    }
177                    None => {
178                        buf.extend_from_slice(available);
179
180                        let used = available.len();
181                        self $(.$reader)? .consume(used);
182                        read += used as u64;
183                    }
184                }
185            }
186
187            *position += read;
188            ReadRefResult::UpToEof(&buf[start..])
189        }
190
191        #[inline]
192        $($async)? fn read_with<$($lf,)? P: Parser>(
193            &mut self,
194            mut parser: P,
195            buf: &'b mut Vec<u8>,
196            position: &mut u64,
197        ) -> Result<&'b [u8]> {
198            let mut read = 1;
199            let start = buf.len();
200            // '<' was consumed in peek_one(), but not placed in buf
201            buf.push(b'<');
202            loop {
203                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
204                    Ok(n) if n.is_empty() => break,
205                    Ok(n) => n,
206                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
207                    Err(e) => {
208                        *position += read;
209                        return Err(Error::Io(e.into()));
210                    }
211                };
212
213                if let Some(i) = parser.feed(available) {
214                    let used = i + 1; // +1 for `>`
215                    buf.extend_from_slice(&available[..used]);
216
217                    self $(.$reader)? .consume(used);
218                    read += used as u64;
219
220                    *position += read;
221                    return Ok(&buf[start..]);
222                }
223
224                // The `>` symbol not yet found, continue reading
225                buf.extend_from_slice(available);
226
227                let used = available.len();
228                self $(.$reader)? .consume(used);
229                read += used as u64;
230            }
231
232            *position += read;
233            Err(Error::Syntax(parser.eof_error(&buf[start..])))
234        }
235
236        #[inline]
237        $($async)? fn read_bang_element $(<$lf>)? (
238            &mut self,
239            buf: &'b mut Vec<u8>,
240            position: &mut u64,
241        ) -> Result<(BangType, &'b [u8])> {
242            // Peeked '<!' before being called, so it's guaranteed to start with it.
243            let start = buf.len();
244            let mut read = 2;
245            // '<' was consumed in peek_one(), but not placed in buf
246            buf.push(b'<');
247            buf.push(b'!');
248            self $(.$reader)? .consume(1);
249
250            let mut bang_type = loop {
251                break match self $(.$reader)? .fill_buf() $(.$await)? {
252                    Ok(n) => BangType::new(n.first().cloned())?,
253                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
254                    Err(e) => return Err(Error::Io(e.into())),
255                };
256            };
257
258            loop {
259                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
260                    Ok(n) if n.is_empty() => break,
261                    Ok(n) => n,
262                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
263                    Err(e) => {
264                        *position += read;
265                        return Err(Error::Io(e.into()));
266                    }
267                };
268                // We only parse from start because we don't want to consider
269                // whatever is in the buffer before the bang element
270                if let Some(i) = bang_type.feed(&buf[start..], available) {
271                    let consumed = i + 1; // +1 for `>`
272                    buf.extend_from_slice(&available[..consumed]);
273
274                    self $(.$reader)? .consume(consumed);
275                    read += consumed as u64;
276
277                    *position += read;
278                    return Ok((bang_type, &buf[start..]));
279                }
280
281                // The `>` symbol not yet found, continue reading
282                buf.extend_from_slice(available);
283
284                let used = available.len();
285                self $(.$reader)? .consume(used);
286                read += used as u64;
287            }
288
289            *position += read;
290            Err(Error::Syntax(bang_type.to_err()))
291        }
292
293        #[inline]
294        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
295            loop {
296                break match self $(.$reader)? .fill_buf() $(.$await)? {
297                    Ok(n) => {
298                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
299                        if count > 0 {
300                            self $(.$reader)? .consume(count);
301                            *position += count as u64;
302                            continue;
303                        } else {
304                            Ok(())
305                        }
306                    }
307                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
308                    Err(e) => Err(e),
309                };
310            }
311        }
312
313        #[inline]
314        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
315            // That method is called only when available buffer starts from '<'
316            // We need to consume it
317            self $(.$reader)? .consume(1);
318            let available = loop {
319                break match self $(.$reader)? .fill_buf() $(.$await)? {
320                    Ok(n) => n,
321                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
322                    Err(e) => return Err(e),
323                };
324            };
325            Ok(available.first().cloned())
326        }
327    };
328}
329
330// Make it public for use in async implementations.
331// New rustc reports
332// > warning: the item `impl_buffered_source` is imported redundantly
333// so make it public only when async feature is enabled
334#[cfg(feature = "async-tokio")]
335pub(super) use impl_buffered_source;
336
337/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
338/// `Vec<u8>` as buffer that will be borrowed by events.
339impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
340    impl_buffered_source!();
341}
342
343////////////////////////////////////////////////////////////////////////////////////////////////////
344
345/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
346impl<R: BufRead> Reader<R> {
347    /// Reads the next `Event`.
348    ///
349    /// This is the main entry point for reading XML `Event`s.
350    ///
351    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
352    /// internally).
353    ///
354    /// Having the possibility to control the internal buffers gives you some additional benefits
355    /// such as:
356    ///
357    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
358    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
359    ///   end of your loop).
360    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
361    ///
362    /// # Examples
363    ///
364    /// ```
365    /// # use pretty_assertions::assert_eq;
366    /// use quick_xml::events::Event;
367    /// use quick_xml::reader::Reader;
368    ///
369    /// let xml = r#"<tag1 att1 = "test">
370    ///                 <tag2><!--Test comment-->Test</tag2>
371    ///                 <tag2>Test 2</tag2>
372    ///              </tag1>"#;
373    /// let mut reader = Reader::from_str(xml);
374    /// reader.config_mut().trim_text(true);
375    /// let mut count = 0;
376    /// let mut buf = Vec::new();
377    /// let mut txt = Vec::new();
378    /// loop {
379    ///     match reader.read_event_into(&mut buf) {
380    ///         Ok(Event::Start(_)) => count += 1,
381    ///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
382    ///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
383    ///         Ok(Event::Eof) => break,
384    ///         _ => (),
385    ///     }
386    ///     buf.clear();
387    /// }
388    /// assert_eq!(count, 3);
389    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
390    /// ```
391    #[inline]
392    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
393        self.read_event_impl(buf)
394    }
395
396    /// Reads until end element is found using provided buffer as intermediate
397    /// storage for events content. This function is supposed to be called after
398    /// you already read a [`Start`] event.
399    ///
400    /// Returns a span that cover content between `>` of an opening tag and `<` of
401    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
402    /// this method was called after reading expanded [`Start`] event.
403    ///
404    /// Manages nested cases where parent and child elements have the _literally_
405    /// same name.
406    ///
407    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
408    /// will be returned. In particularly, that error will be returned if you call
409    /// this method without consuming the corresponding [`Start`] event first.
410    ///
411    /// If your reader created from a string slice or byte array slice, it is
412    /// better to use [`read_to_end()`] method, because it will not copy bytes
413    /// into intermediate buffer.
414    ///
415    /// The provided `buf` buffer will be filled only by one event content at time.
416    /// Before reading of each event the buffer will be cleared. If you know an
417    /// appropriate size of each event, you can preallocate the buffer to reduce
418    /// number of reallocations.
419    ///
420    /// The `end` parameter should contain name of the end element _in the reader
421    /// encoding_. It is good practice to always get that parameter using
422    /// [`BytesStart::to_end()`] method.
423    ///
424    /// The correctness of the skipped events does not checked, if you disabled
425    /// the [`check_end_names`] option.
426    ///
427    /// # Namespaces
428    ///
429    /// While the `Reader` does not support namespace resolution, namespaces
430    /// does not change the algorithm for comparing names. Although the names
431    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
432    /// same namespace, are semantically equivalent, `</b:name>` cannot close
433    /// `<a:name>`, because according to [the specification]
434    ///
435    /// > The end of every element that begins with a **start-tag** MUST be marked
436    /// > by an **end-tag** containing a name that echoes the element's type as
437    /// > given in the **start-tag**
438    ///
439    /// # Examples
440    ///
441    /// This example shows, how you can skip XML content after you read the
442    /// start event.
443    ///
444    /// ```
445    /// # use pretty_assertions::assert_eq;
446    /// use quick_xml::events::{BytesStart, Event};
447    /// use quick_xml::reader::Reader;
448    ///
449    /// let mut reader = Reader::from_str(r#"
450    ///     <outer>
451    ///         <inner>
452    ///             <inner></inner>
453    ///             <inner/>
454    ///             <outer></outer>
455    ///             <outer/>
456    ///         </inner>
457    ///     </outer>
458    /// "#);
459    /// reader.config_mut().trim_text(true);
460    /// let mut buf = Vec::new();
461    ///
462    /// let start = BytesStart::new("outer");
463    /// let end   = start.to_end().into_owned();
464    ///
465    /// // First, we read a start event...
466    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
467    ///
468    /// // ...then, we could skip all events to the corresponding end event.
469    /// // This call will correctly handle nested <outer> elements.
470    /// // Note, however, that this method does not handle namespaces.
471    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
472    ///
473    /// // At the end we should get an Eof event, because we ate the whole XML
474    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
475    /// ```
476    ///
477    /// [`Start`]: Event::Start
478    /// [`End`]: Event::End
479    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
480    /// [`read_to_end()`]: Self::read_to_end
481    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
482    /// [`check_end_names`]: crate::reader::Config::check_end_names
483    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
484    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
485        Ok(read_to_end!(self, end, buf, read_event_impl, {
486            buf.clear();
487        }))
488    }
489
490    /// Reads content between start and end tags, including any markup using
491    /// provided buffer as intermediate storage for events content. This function
492    /// is supposed to be called after you already read a [`Start`] event.
493    ///
494    /// Manages nested cases where parent and child elements have the _literally_
495    /// same name.
496    ///
497    /// This method does not unescape read data, instead it returns content
498    /// "as is" of the XML document. This is because it has no idea what text
499    /// it reads, and if, for example, it contains CDATA section, attempt to
500    /// unescape it content will spoil data.
501    ///
502    /// If your reader created from a string slice or byte array slice, it is
503    /// better to use [`read_text()`] method, because it will not copy bytes
504    /// into intermediate buffer.
505    ///
506    /// # Examples
507    ///
508    /// This example shows, how you can read a HTML content from your XML document.
509    ///
510    /// ```
511    /// # use pretty_assertions::assert_eq;
512    /// # use std::borrow::Cow;
513    /// use quick_xml::events::{BytesStart, Event};
514    /// use quick_xml::reader::Reader;
515    ///
516    /// let mut reader = Reader::from_reader("
517    ///     <html>
518    ///         <title>This is a HTML text</title>
519    ///         <p>Usual XML rules does not apply inside it
520    ///         <p>For example, elements not needed to be &quot;closed&quot;
521    ///     </html>
522    /// ".as_bytes());
523    /// reader.config_mut().trim_text(true);
524    ///
525    /// let start = BytesStart::new("html");
526    /// let end   = start.to_end().into_owned();
527    ///
528    /// let mut buf = Vec::new();
529    ///
530    /// // First, we read a start event...
531    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
532    /// // ...and disable checking of end names because we expect HTML further...
533    /// reader.config_mut().check_end_names = false;
534    ///
535    /// // ...then, we could read text content until close tag.
536    /// // This call will correctly handle nested <html> elements.
537    /// let text = reader.read_text_into(end.name(), &mut buf).unwrap();
538    /// let text = text.decode().unwrap();
539    /// assert_eq!(text, r#"
540    ///         <title>This is a HTML text</title>
541    ///         <p>Usual XML rules does not apply inside it
542    ///         <p>For example, elements not needed to be &quot;closed&quot;
543    ///     "#);
544    /// assert!(matches!(text, Cow::Borrowed(_)));
545    ///
546    /// // Now we can enable checks again
547    /// reader.config_mut().check_end_names = true;
548    ///
549    /// // At the end we should get an Eof event, because we ate the whole XML
550    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
551    /// ```
552    ///
553    /// [`Start`]: Event::Start
554    /// [`read_text()`]: Self::read_text()
555    pub fn read_text_into<'b>(
556        &mut self,
557        end: QName,
558        buf: &'b mut Vec<u8>,
559    ) -> Result<BytesText<'b>> {
560        let start = buf.len();
561        let span = read_to_end!(self, end, buf, read_event_impl, {});
562
563        let len = span.end - span.start;
564        // SAFETY: `buf` may contain not more than isize::MAX bytes and because it is
565        // not cleared when reading event, length of the returned span should fit into
566        // usize (because otherwise we panic at appending to the buffer before that point)
567        let end = start + len as usize;
568
569        Ok(BytesText::wrap(&buf[start..end], self.decoder()))
570    }
571}
572
573impl Reader<BufReader<File>> {
574    /// Creates an XML reader from a file path.
575    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
576        let file = File::open(path)?;
577        let reader = BufReader::new(file);
578        Ok(Self::from_reader(reader))
579    }
580}
581
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default source constructor: hands the test input back unchanged
    fn identity<T>(value: T) -> T {
        value
    }

    // Instantiate the shared reader test suite for the buffered implementation
    check! {
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        1,
        &mut Vec::new()
    }
}