Skip to main content

quick_xml/reader/
buffered_reader.rs

1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::encoding;
9use crate::errors::{Error, Result};
10use crate::events::{BytesText, Event};
11use crate::name::QName;
12use crate::parser::Parser;
13use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
14use crate::utils::is_whitespace;
15
// Generates the bodies of the buffered-source methods (BOM removal / encoding
// detection, text, reference, markup and bang-element reading, whitespace
// skipping and one-byte peeking) on top of a `fill_buf()` / `consume()` pair.
//
// The macro takes an optional argument group:
//
// - `$lf`     -- lifetime added as a generic parameter to the methods that
//                borrow `buf` for `'b`
// - `$reader` -- field of `self` that holds the actual reader; when present the
//                calls go to `self.$reader` instead of `self`
// - `$async`  -- token placed before `fn` (used to emit `async fn`)
// - `$await`  -- token appended as `.$await` after every `fill_buf()` call
//
// When invoked without arguments, plain synchronous methods that call
// `fill_buf()` / `consume()` directly on `self` are generated.
//
// Every loop retries `fill_buf()` when it fails with
// `io::ErrorKind::Interrupted`; any other I/O error is returned to the caller.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        // Consumes `encoding::UTF8_BOM` from the start of the input, if the
        // currently buffered bytes begin with it. Compiled only when the
        // `encoding` feature is disabled.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        if n.starts_with(encoding::UTF8_BOM) {
                            self $(.$reader)? .consume(encoding::UTF8_BOM.len());
                        }
                        Ok(())
                    },
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        // Runs `encoding::detect_encoding` over the currently buffered bytes.
        // When something is detected, `detected.bom_len()` bytes are consumed
        // and the detection result is returned; otherwise nothing is consumed
        // and `None` is returned. Compiled only when the `encoding` feature is
        // enabled.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<encoding::DetectedEncoding>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => if let Some(detected) = encoding::detect_encoding(n) {
                        self $(.$reader)? .consume(detected.bom_len());
                        Ok(Some(detected))
                    } else {
                        Ok(None)
                    },
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        // Appends input to `buf` until a `<`, a `&` or the end of input is met.
        //
        // - `Markup(buf)` / `Ref(buf)`: the very first byte is `<` / `&`;
        //   nothing is consumed and nothing is appended to `buf`
        // - `UpToMarkup` / `UpToRef`: the bytes before the `<` / `&` were
        //   appended and consumed; the delimiter itself stays in the input
        // - `UpToEof`: the input ended; everything read was appended
        // - `Err`: an I/O error other than `Interrupted`
        //
        // The returned slices start at the length `buf` had on entry.
        // `*position` is advanced by the number of bytes consumed.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            // Number of bytes consumed by this call
            let mut read = 0;
            // Earlier content of `buf` is not part of the result
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // An empty buffer means end of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return ReadTextResult::Err(e);
                    }
                };

                // Search for start of markup or an entity or character reference
                match memchr::memchr2(b'<', b'&', available) {
                    // Special handling is needed only on the first iteration.
                    // On next iterations we already read something and should emit Text event
                    Some(0) if read == 0 && available[0] == b'<' => return ReadTextResult::Markup(buf),
                    // Do not consume `&` because it may be lone and we would need to
                    // return it as part of Text event
                    Some(0) if read == 0 => return ReadTextResult::Ref(buf),
                    // Text followed by markup: take the text, leave the `<` in the input
                    Some(i) if available[i] == b'<' => {
                        buf.extend_from_slice(&available[..i]);

                        self $(.$reader)? .consume(i);
                        read += i as u64;

                        *position += read;
                        return ReadTextResult::UpToMarkup(&buf[start..]);
                    }
                    // Text followed by `&`: take the text, leave the `&` in the input
                    Some(i) => {
                        buf.extend_from_slice(&available[..i]);

                        self $(.$reader)? .consume(i);
                        read += i as u64;

                        *position += read;
                        return ReadTextResult::UpToRef(&buf[start..]);
                    }
                    // No delimiter in this chunk: take all of it and refill
                    None => {
                        buf.extend_from_slice(available);

                        let used = available.len();
                        self $(.$reader)? .consume(used);
                        read += used as u64;
                    }
                }
            }

            *position += read;
            ReadTextResult::UpToEof(&buf[start..])
        }

        // Appends to `buf` a reference candidate that starts with `&`.
        //
        // - `Ref`: a `;` was found; the slice spans from `&` to `;` inclusive
        // - `UpToRef` / `UpToMarkup`: another `&` / a `<` was met before any `;`;
        //   that delimiter is left in the input
        // - `UpToEof`: the input ended before any of `;`, `&`, `<`
        // - `Err`: an I/O error other than `Interrupted`
        //
        // The returned slices start at the length `buf` had on entry.
        // `*position` is advanced by the number of bytes consumed.
        #[inline]
        $($async)? fn read_ref $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadRefResult<'b> {
            // Number of bytes consumed by this call
            let mut read = 0;
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // An empty buffer means end of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return ReadRefResult::Err(e);
                    }
                };
                // `read_ref` called when the first character is `&`, so we
                // should explicitly skip it at first iteration lest we confuse
                // it with the end
                if read == 0 {
                    debug_assert!(
                        available.starts_with(b"&"),
                        "`read_ref` must be called at `&`:\n{:?}",
                        crate::utils::Bytes(available)
                    );
                    // If that ampersand is lone, then it will be part of text
                    // and we should keep it
                    buf.push(b'&');
                    self $(.$reader)? .consume(1);
                    read += 1;
                    // Re-fetch the buffer so that the search below starts after the `&`
                    continue;
                }

                match memchr::memchr3(b';', b'&', b'<', available) {
                    Some(i) if available[i] == b';' => {
                        // +1 -- skip the end `;`
                        let used = i + 1;

                        buf.extend_from_slice(&available[..used]);
                        self $(.$reader)? .consume(used);
                        read += used as u64;

                        *position += read;

                        return ReadRefResult::Ref(&buf[start..]);
                    }
                    // Do not consume `&` because it may be lone and we would need to
                    // return it as part of Text event
                    Some(i) => {
                        // Remember which delimiter stopped us before `available` goes away
                        let is_amp = available[i] == b'&';
                        buf.extend_from_slice(&available[..i]);

                        self $(.$reader)? .consume(i);
                        read += i as u64;

                        *position += read;

                        return if is_amp {
                            ReadRefResult::UpToRef(&buf[start..])
                        } else {
                            ReadRefResult::UpToMarkup(&buf[start..])
                        };
                    }
                    // No delimiter in this chunk: take all of it and refill
                    None => {
                        buf.extend_from_slice(available);

                        let used = available.len();
                        self $(.$reader)? .consume(used);
                        read += used as u64;
                    }
                }
            }

            *position += read;
            ReadRefResult::UpToEof(&buf[start..])
        }

        // Appends markup to `buf`, feeding every chunk of input to `parser`
        // until it reports where the markup ends.
        //
        // On success the returned slice starts with the `<` pushed below and
        // runs through the byte at the index reported by `parser.feed()`.
        // If the input ends first, `Error::Syntax(parser.eof_error(..))` is
        // returned. `*position` is advanced by the bytes consumed plus one for
        // the `<` that was consumed before this method was called.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            // Starts from 1 to account for the already consumed `<`
            let mut read = 1;
            let start = buf.len();
            // '<' was consumed in peek_one(), but not placed in buf
            buf.push(b'<');
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // An empty buffer means end of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::from(e));
                    }
                };

                if let Some(i) = parser.feed(available) {
                    let used = i + 1; // +1 for `>`
                    buf.extend_from_slice(&available[..used]);

                    self $(.$reader)? .consume(used);
                    read += used as u64;

                    *position += read;
                    return Ok(&buf[start..]);
                }

                // The `>` symbol not yet found, continue reading
                buf.extend_from_slice(available);

                let used = available.len();
                self $(.$reader)? .consume(used);
                read += used as u64;
            }

            *position += read;
            Err(Error::Syntax(parser.eof_error(&buf[start..])))
        }

        // Appends to `buf` a construct that starts with `<!` and returns it
        // together with the `BangType` selected by the byte that follows `!`.
        //
        // If the input ends before `bang_type.feed()` reports the end of the
        // construct, `Error::Syntax(bang_type.to_err())` is returned.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // Peeked '<!' before being called, so it's guaranteed to start with it.
            let start = buf.len();
            // Starts from 2 to account for `<` and `!`
            let mut read = 2;
            // '<' was consumed in peek_one(), but not placed in buf
            buf.push(b'<');
            buf.push(b'!');
            // Drop the `!` from the input (the `<` is already gone)
            self $(.$reader)? .consume(1);

            // The first byte after `<!` (or `None` at the end of input) selects
            // the kind of the construct.
            // NOTE(review): unlike the error paths in the loop below, an I/O
            // error here is returned without adding `read` to `*position` --
            // confirm that this is intentional.
            let mut bang_type = loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => BangType::new(n.first().cloned())?,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(Error::from(e)),
                };
            };

            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // An empty buffer means end of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::from(e));
                    }
                };
                // We only parse from start because we don't want to consider
                // whatever is in the buffer before the bang element
                if let Some(i) = bang_type.feed(&buf[start..], available) {
                    let consumed = i + 1; // +1 for `>`
                    buf.extend_from_slice(&available[..consumed]);

                    self $(.$reader)? .consume(consumed);
                    read += consumed as u64;

                    *position += read;
                    return Ok((bang_type, &buf[start..]));
                }

                // The `>` symbol not yet found, continue reading
                buf.extend_from_slice(available);

                let used = available.len();
                self $(.$reader)? .consume(used);
                read += used as u64;
            }

            *position += read;
            Err(Error::Syntax(bang_type.to_err()))
        }

        // Consumes bytes for which `is_whitespace` returns `true` until the
        // buffered input starts with another byte or is empty. `*position` is
        // advanced by the number of bytes skipped.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        // Length of the whitespace prefix; the whole chunk if it
                        // contains nothing else (0 for an empty chunk)
                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                        if count > 0 {
                            self $(.$reader)? .consume(count);
                            *position += count as u64;
                            // The next chunk may start with whitespace as well
                            continue;
                        } else {
                            Ok(())
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        // Consumes one byte and returns the byte that follows it without
        // consuming that one, or `None` if the input ended.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            // That method is called only when available buffer starts from '<'
            // We need to consume it
            self $(.$reader)? .consume(1);
            let available = loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                };
            };
            Ok(available.first().cloned())
        }
    };
}
328
329// Make it public for use in async implementations.
330// New rustc reports
331// > warning: the item `impl_buffered_source` is imported redundantly
332// so make it public only when async feature is enabled
333#[cfg(feature = "async-tokio")]
334pub(super) use impl_buffered_source;
335
/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
/// `Vec<u8>` as buffer that will be borrowed by events.
impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
    // Invoked without arguments: the generated methods are synchronous and
    // call `fill_buf()` / `consume()` directly on `self`.
    impl_buffered_source!();
}
341
342////////////////////////////////////////////////////////////////////////////////////////////////////
343
344/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
345impl<R: BufRead> Reader<R> {
346    /// Reads the next `Event`.
347    ///
348    /// This is the main entry point for reading XML `Event`s.
349    ///
350    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
351    /// internally).
352    ///
353    /// Having the possibility to control the internal buffers gives you some additional benefits
354    /// such as:
355    ///
356    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
357    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
358    ///   end of your loop).
359    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
360    ///
361    /// # Examples
362    ///
363    /// ```
364    /// # use pretty_assertions::assert_eq;
365    /// use quick_xml::events::Event;
366    /// use quick_xml::reader::Reader;
367    ///
368    /// let xml = r#"<tag1 att1 = "test">
369    ///                 <tag2><!--Test comment-->Test</tag2>
370    ///                 <tag2>Test 2</tag2>
371    ///              </tag1>"#;
372    /// let mut reader = Reader::from_str(xml);
373    /// reader.config_mut().trim_text(true);
374    /// let mut count = 0;
375    /// let mut buf = Vec::new();
376    /// let mut txt = Vec::new();
377    /// loop {
378    ///     match reader.read_event_into(&mut buf) {
379    ///         Ok(Event::Start(_)) => count += 1,
380    ///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
381    ///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
382    ///         Ok(Event::Eof) => break,
383    ///         _ => (),
384    ///     }
385    ///     buf.clear();
386    /// }
387    /// assert_eq!(count, 3);
388    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
389    /// ```
390    #[inline]
391    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
392        self.read_event_impl(buf)
393    }
394
395    /// Reads until end element is found using provided buffer as intermediate
396    /// storage for events content. This function is supposed to be called after
397    /// you already read a [`Start`] event.
398    ///
399    /// Returns a span that cover content between `>` of an opening tag and `<` of
400    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
401    /// this method was called after reading expanded [`Start`] event.
402    ///
403    /// Manages nested cases where parent and child elements have the _literally_
404    /// same name.
405    ///
406    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
407    /// will be returned. In particularly, that error will be returned if you call
408    /// this method without consuming the corresponding [`Start`] event first.
409    ///
410    /// If your reader created from a string slice or byte array slice, it is
411    /// better to use [`read_to_end()`] method, because it will not copy bytes
412    /// into intermediate buffer.
413    ///
414    /// The provided `buf` buffer will be filled only by one event content at time.
415    /// Before reading of each event the buffer will be cleared. If you know an
416    /// appropriate size of each event, you can preallocate the buffer to reduce
417    /// number of reallocations.
418    ///
419    /// The `end` parameter should contain name of the end element _in the reader
420    /// encoding_. It is good practice to always get that parameter using
421    /// [`BytesStart::to_end()`] method.
422    ///
423    /// The correctness of the skipped events does not checked, if you disabled
424    /// the [`check_end_names`] option.
425    ///
426    /// # Namespaces
427    ///
428    /// While the `Reader` does not support namespace resolution, namespaces
429    /// does not change the algorithm for comparing names. Although the names
430    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
431    /// same namespace, are semantically equivalent, `</b:name>` cannot close
432    /// `<a:name>`, because according to [the specification]
433    ///
434    /// > The end of every element that begins with a **start-tag** MUST be marked
435    /// > by an **end-tag** containing a name that echoes the element's type as
436    /// > given in the **start-tag**
437    ///
438    /// # Examples
439    ///
440    /// This example shows, how you can skip XML content after you read the
441    /// start event.
442    ///
443    /// ```
444    /// # use pretty_assertions::assert_eq;
445    /// use quick_xml::events::{BytesStart, Event};
446    /// use quick_xml::reader::Reader;
447    ///
448    /// let mut reader = Reader::from_str(r#"
449    ///     <outer>
450    ///         <inner>
451    ///             <inner></inner>
452    ///             <inner/>
453    ///             <outer></outer>
454    ///             <outer/>
455    ///         </inner>
456    ///     </outer>
457    /// "#);
458    /// reader.config_mut().trim_text(true);
459    /// let mut buf = Vec::new();
460    ///
461    /// let start = BytesStart::new("outer");
462    /// let end   = start.to_end().into_owned();
463    ///
464    /// // First, we read a start event...
465    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
466    ///
467    /// // ...then, we could skip all events to the corresponding end event.
468    /// // This call will correctly handle nested <outer> elements.
469    /// // Note, however, that this method does not handle namespaces.
470    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
471    ///
472    /// // At the end we should get an Eof event, because we ate the whole XML
473    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
474    /// ```
475    ///
476    /// [`Start`]: Event::Start
477    /// [`End`]: Event::End
478    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
479    /// [`read_to_end()`]: Self::read_to_end
480    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
481    /// [`check_end_names`]: crate::reader::Config::check_end_names
482    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
483    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
484        Ok(read_to_end!(self, end, buf, read_event_impl, {
485            buf.clear();
486        }))
487    }
488
489    /// Reads content between start and end tags, including any markup using
490    /// provided buffer as intermediate storage for events content. This function
491    /// is supposed to be called after you already read a [`Start`] event.
492    ///
493    /// Manages nested cases where parent and child elements have the _literally_
494    /// same name.
495    ///
496    /// This method does not unescape read data, instead it returns content
497    /// "as is" of the XML document. This is because it has no idea what text
498    /// it reads, and if, for example, it contains CDATA section, attempt to
499    /// unescape it content will spoil data.
500    ///
501    /// If your reader created from a string slice or byte array slice, it is
502    /// better to use [`read_text()`] method, because it will not copy bytes
503    /// into intermediate buffer.
504    ///
505    /// # Examples
506    ///
507    /// This example shows, how you can read a HTML content from your XML document.
508    ///
509    /// ```
510    /// # use pretty_assertions::assert_eq;
511    /// # use std::borrow::Cow;
512    /// use quick_xml::events::{BytesStart, Event};
513    /// use quick_xml::reader::Reader;
514    ///
515    /// let mut reader = Reader::from_reader("
516    ///     <html>
517    ///         <title>This is a HTML text</title>
518    ///         <p>Usual XML rules does not apply inside it
519    ///         <p>For example, elements not needed to be &quot;closed&quot;
520    ///     </html>
521    /// ".as_bytes());
522    /// reader.config_mut().trim_text(true);
523    ///
524    /// let start = BytesStart::new("html");
525    /// let end   = start.to_end().into_owned();
526    ///
527    /// let mut buf = Vec::new();
528    ///
529    /// // First, we read a start event...
530    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
531    /// // ...and disable checking of end names because we expect HTML further...
532    /// reader.config_mut().check_end_names = false;
533    ///
534    /// // ...then, we could read text content until close tag.
535    /// // This call will correctly handle nested <html> elements.
536    /// let text = reader.read_text_into(end.name(), &mut buf).unwrap();
537    /// let text = text.decode().unwrap();
538    /// assert_eq!(text, r#"
539    ///         <title>This is a HTML text</title>
540    ///         <p>Usual XML rules does not apply inside it
541    ///         <p>For example, elements not needed to be &quot;closed&quot;
542    ///     "#);
543    /// assert!(matches!(text, Cow::Borrowed(_)));
544    ///
545    /// // Now we can enable checks again
546    /// reader.config_mut().check_end_names = true;
547    ///
548    /// // At the end we should get an Eof event, because we ate the whole XML
549    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
550    /// ```
551    ///
552    /// [`Start`]: Event::Start
553    /// [`read_text()`]: Self::read_text()
554    pub fn read_text_into<'b>(
555        &mut self,
556        end: QName,
557        buf: &'b mut Vec<u8>,
558    ) -> Result<BytesText<'b>> {
559        let start = buf.len();
560        let span = read_to_end!(self, end, buf, read_event_impl, {});
561
562        let len = span.end - span.start;
563        // SAFETY: `buf` may contain not more than isize::MAX bytes and because it is
564        // not cleared when reading event, length of the returned span should fit into
565        // usize (because otherwise we panic at appending to the buffer before that point)
566        let end = start + len as usize;
567
568        Ok(BytesText::wrap(&buf[start..end], self.decoder()))
569    }
570}
571
572impl Reader<BufReader<File>> {
573    /// Creates an XML reader from a file path.
574    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
575        let file = File::open(path)?;
576        let reader = BufReader::new(file);
577        Ok(Self::from_reader(reader))
578    }
579}
580
// Unit tests for the buffered (`BufRead` + `Vec<u8>`) source.
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: just passes through the byte array from the test
    fn identity<T>(input: T) -> T {
        input
    }

    // `check!` is defined in `crate::reader::test`; presumably it expands to
    // the reader test suite shared by all sources, parametrized here with the
    // buffered variant. The meaning of each argument is defined by that macro
    // -- TODO confirm there.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        1,
        &mut Vec::new()
    );
}