quick_xml/reader/buffered_reader.rs
1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::{BytesText, Event};
10use crate::name::QName;
11use crate::parser::Parser;
12use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
13use crate::utils::is_whitespace;
14
// Generates the bodies of the `XmlSource` methods for a buffered reader.
//
// Invoked without arguments the methods are synchronous and call `fill_buf()` /
// `consume()` directly on `self`. When arguments are supplied, they provide the
// lifetime for the generic methods, the field of `self` that holds the reader
// and the `async` / `await` keywords to insert.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        // Drops the UTF-8 byte order mark if the buffered data starts with it.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(head) => {
                        if head.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    // An interrupted read is simply retried
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }

        // Asks `crate::encoding::detect_encoding` to inspect the buffered data;
        // when it reports an encoding, the BOM bytes it reports are consumed.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(head) => {
                        return Ok(match crate::encoding::detect_encoding(head) {
                            Some((encoding, bom_len)) => {
                                self $(.$reader)? .consume(bom_len);
                                Some(encoding)
                            }
                            None => None,
                        });
                    }
                    // An interrupted read is simply retried
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }

        // Copies text into `buf` until `<`, `&` or the end of input is reached.
        // `position` is advanced by the number of bytes consumed from the reader.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            // The text is appended after whatever `buf` already holds
            let text_start = buf.len();
            // Number of bytes taken from the reader by this call
            let mut consumed = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if !bytes.is_empty() => bytes,
                    // Nothing left to read
                    Ok(_) => break,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return ReadTextResult::Err(e);
                    }
                };

                // Look for the start of markup or of an entity / character reference
                match memchr::memchr2(b'<', b'&', chunk) {
                    // The two arms below can fire only on the first iteration: once
                    // something was read, a Text event must be emitted first
                    Some(0) if consumed == 0 && chunk[0] == b'<' => return ReadTextResult::Markup(buf),
                    // `&` is left in the reader: it may be a lone ampersand that has
                    // to be returned as part of a Text event
                    Some(0) if consumed == 0 => return ReadTextResult::Ref(buf),
                    // The delimiter itself is not consumed, only the text before it
                    Some(i) => {
                        let at_markup = chunk[i] == b'<';
                        buf.extend_from_slice(&chunk[..i]);

                        self $(.$reader)? .consume(i);
                        consumed += i as u64;

                        *position += consumed;
                        return if at_markup {
                            ReadTextResult::UpToMarkup(&buf[text_start..])
                        } else {
                            ReadTextResult::UpToRef(&buf[text_start..])
                        };
                    }
                    // No delimiter in this chunk: take all of it and read more
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            ReadTextResult::UpToEof(&buf[text_start..])
        }

        // Copies a reference that starts at `&` into `buf`, stopping after `;` or
        // before the next `&` / `<`, or at the end of input.
        // `position` is advanced by the number of bytes consumed from the reader.
        #[inline]
        $($async)? fn read_ref $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadRefResult<'b> {
            // The reference is appended after whatever `buf` already holds
            let ref_start = buf.len();
            // Number of bytes taken from the reader by this call
            let mut consumed = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if !bytes.is_empty() => bytes,
                    // Nothing left to read
                    Ok(_) => break,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return ReadRefResult::Err(e);
                    }
                };
                // `read_ref` is called when the first character is `&`. It must be
                // skipped explicitly on the first iteration, otherwise the search
                // below would mistake it for the end of the reference
                if consumed == 0 {
                    debug_assert!(
                        chunk.starts_with(b"&"),
                        "`read_ref` must be called at `&`:\n{:?}",
                        crate::utils::Bytes(chunk)
                    );
                    // A lone ampersand becomes part of the text, so it is kept
                    buf.push(b'&');
                    self $(.$reader)? .consume(1);
                    consumed += 1;
                    continue;
                }

                match memchr::memchr3(b';', b'&', b'<', chunk) {
                    Some(i) => {
                        let terminator = chunk[i];
                        // The closing `;` belongs to the reference and is consumed.
                        // `&` and `<` are left in the reader: `&` may be lone and
                        // would then have to be returned as part of a Text event
                        let used = if terminator == b';' { i + 1 } else { i };

                        buf.extend_from_slice(&chunk[..used]);
                        self $(.$reader)? .consume(used);
                        consumed += used as u64;

                        *position += consumed;

                        return match terminator {
                            b';' => ReadRefResult::Ref(&buf[ref_start..]),
                            b'&' => ReadRefResult::UpToRef(&buf[ref_start..]),
                            _ => ReadRefResult::UpToMarkup(&buf[ref_start..]),
                        };
                    }
                    // No terminator in this chunk: take all of it and read more
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            ReadRefResult::UpToEof(&buf[ref_start..])
        }

        // Copies markup into `buf` until `parser` reports the position of the
        // closing `>`. If the input ends first, the syntax error produced by
        // `parser.eof_error()` is returned.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            let markup_start = buf.len();
            // '<' was consumed in peek_one(), but not placed in buf,
            // so it is added here and counted as already read
            buf.push(b'<');
            let mut consumed = 1;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if !bytes.is_empty() => bytes,
                    // Nothing left to read
                    Ok(_) => break,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return Err(Error::Io(e.into()));
                    }
                };

                match parser.feed(chunk) {
                    Some(i) => {
                        let used = i + 1; // +1 for `>`
                        buf.extend_from_slice(&chunk[..used]);

                        self $(.$reader)? .consume(used);
                        consumed += used as u64;

                        *position += consumed;
                        return Ok(&buf[markup_start..]);
                    }
                    // The `>` symbol not yet found, continue reading
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(parser.eof_error(&buf[markup_start..])))
        }

        // Copies markup that starts with `<!` into `buf` up to and including its
        // closing `>`, and returns it together with the detected `BangType`.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // Peeked '<!' before being called, so it's guaranteed to start with it.
            let markup_start = buf.len();
            // '<' was consumed in peek_one(), but not placed in buf; the '!' is
            // consumed right here. Both are counted as already read
            buf.extend_from_slice(b"<!");
            let mut consumed = 2;
            self $(.$reader)? .consume(1);

            // The byte that follows `<!` (if any) determines the kind of markup
            let mut bang_type = loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => break BangType::new(bytes.first().copied())?,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(Error::Io(e.into())),
                }
            };

            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if !bytes.is_empty() => bytes,
                    // Nothing left to read
                    Ok(_) => break,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return Err(Error::Io(e.into()));
                    }
                };
                // Only the part of the buffer that belongs to this element is
                // passed: whatever was in `buf` before it must not be considered
                match bang_type.feed(&buf[markup_start..], chunk) {
                    Some(i) => {
                        let used = i + 1; // +1 for `>`
                        buf.extend_from_slice(&chunk[..used]);

                        self $(.$reader)? .consume(used);
                        consumed += used as u64;

                        *position += consumed;
                        return Ok((bang_type, &buf[markup_start..]));
                    }
                    // The `>` symbol not yet found, continue reading
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(bang_type.to_err()))
        }

        // Consumes leading whitespace bytes, advancing `position` by their count.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        let skipped = chunk.iter().take_while(|b| is_whitespace(**b)).count();
                        // Either the next byte is not a whitespace or the input ended
                        if skipped == 0 {
                            return Ok(());
                        }
                        // The whitespace may continue in the next chunk, so loop again
                        self $(.$reader)? .consume(skipped);
                        *position += skipped as u64;
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }

        // Consumes one byte and returns the byte that follows it without
        // consuming that one; `None` when there is no following byte.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            // That method is called only when available buffer starts from '<'
            // We need to consume it
            self $(.$reader)? .consume(1);
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => return Ok(chunk.first().copied()),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }
    };
}
329
330// Make it public for use in async implementations.
331// New rustc reports
332// > warning: the item `impl_buffered_source` is imported redundantly
333// so make it public only when async feature is enabled
334#[cfg(feature = "async-tokio")]
335pub(super) use impl_buffered_source;
336
337/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
338/// `Vec<u8>` as buffer that will be borrowed by events.
339impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
340 impl_buffered_source!();
341}
342
343////////////////////////////////////////////////////////////////////////////////////////////////////
344
345/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
346impl<R: BufRead> Reader<R> {
347 /// Reads the next `Event`.
348 ///
349 /// This is the main entry point for reading XML `Event`s.
350 ///
351 /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
352 /// internally).
353 ///
354 /// Having the possibility to control the internal buffers gives you some additional benefits
355 /// such as:
356 ///
357 /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
358 /// you can call `buf.clear()` once you are done with processing the event (typically at the
359 /// end of your loop).
360 /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
361 ///
362 /// # Examples
363 ///
364 /// ```
365 /// # use pretty_assertions::assert_eq;
366 /// use quick_xml::events::Event;
367 /// use quick_xml::reader::Reader;
368 ///
369 /// let xml = r#"<tag1 att1 = "test">
370 /// <tag2><!--Test comment-->Test</tag2>
371 /// <tag2>Test 2</tag2>
372 /// </tag1>"#;
373 /// let mut reader = Reader::from_str(xml);
374 /// reader.config_mut().trim_text(true);
375 /// let mut count = 0;
376 /// let mut buf = Vec::new();
377 /// let mut txt = Vec::new();
378 /// loop {
379 /// match reader.read_event_into(&mut buf) {
380 /// Ok(Event::Start(_)) => count += 1,
381 /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
382 /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
383 /// Ok(Event::Eof) => break,
384 /// _ => (),
385 /// }
386 /// buf.clear();
387 /// }
388 /// assert_eq!(count, 3);
389 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
390 /// ```
391 #[inline]
392 pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
393 self.read_event_impl(buf)
394 }
395
396 /// Reads until end element is found using provided buffer as intermediate
397 /// storage for events content. This function is supposed to be called after
398 /// you already read a [`Start`] event.
399 ///
400 /// Returns a span that cover content between `>` of an opening tag and `<` of
401 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
402 /// this method was called after reading expanded [`Start`] event.
403 ///
404 /// Manages nested cases where parent and child elements have the _literally_
405 /// same name.
406 ///
407 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
408 /// will be returned. In particularly, that error will be returned if you call
409 /// this method without consuming the corresponding [`Start`] event first.
410 ///
411 /// If your reader created from a string slice or byte array slice, it is
412 /// better to use [`read_to_end()`] method, because it will not copy bytes
413 /// into intermediate buffer.
414 ///
415 /// The provided `buf` buffer will be filled only by one event content at time.
416 /// Before reading of each event the buffer will be cleared. If you know an
417 /// appropriate size of each event, you can preallocate the buffer to reduce
418 /// number of reallocations.
419 ///
420 /// The `end` parameter should contain name of the end element _in the reader
421 /// encoding_. It is good practice to always get that parameter using
422 /// [`BytesStart::to_end()`] method.
423 ///
424 /// The correctness of the skipped events does not checked, if you disabled
425 /// the [`check_end_names`] option.
426 ///
427 /// # Namespaces
428 ///
429 /// While the `Reader` does not support namespace resolution, namespaces
430 /// does not change the algorithm for comparing names. Although the names
431 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
432 /// same namespace, are semantically equivalent, `</b:name>` cannot close
433 /// `<a:name>`, because according to [the specification]
434 ///
435 /// > The end of every element that begins with a **start-tag** MUST be marked
436 /// > by an **end-tag** containing a name that echoes the element's type as
437 /// > given in the **start-tag**
438 ///
439 /// # Examples
440 ///
441 /// This example shows, how you can skip XML content after you read the
442 /// start event.
443 ///
444 /// ```
445 /// # use pretty_assertions::assert_eq;
446 /// use quick_xml::events::{BytesStart, Event};
447 /// use quick_xml::reader::Reader;
448 ///
449 /// let mut reader = Reader::from_str(r#"
450 /// <outer>
451 /// <inner>
452 /// <inner></inner>
453 /// <inner/>
454 /// <outer></outer>
455 /// <outer/>
456 /// </inner>
457 /// </outer>
458 /// "#);
459 /// reader.config_mut().trim_text(true);
460 /// let mut buf = Vec::new();
461 ///
462 /// let start = BytesStart::new("outer");
463 /// let end = start.to_end().into_owned();
464 ///
465 /// // First, we read a start event...
466 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
467 ///
468 /// // ...then, we could skip all events to the corresponding end event.
469 /// // This call will correctly handle nested <outer> elements.
470 /// // Note, however, that this method does not handle namespaces.
471 /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
472 ///
473 /// // At the end we should get an Eof event, because we ate the whole XML
474 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
475 /// ```
476 ///
477 /// [`Start`]: Event::Start
478 /// [`End`]: Event::End
479 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
480 /// [`read_to_end()`]: Self::read_to_end
481 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
482 /// [`check_end_names`]: crate::reader::Config::check_end_names
483 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
484 pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
485 Ok(read_to_end!(self, end, buf, read_event_impl, {
486 buf.clear();
487 }))
488 }
489
490 /// Reads content between start and end tags, including any markup using
491 /// provided buffer as intermediate storage for events content. This function
492 /// is supposed to be called after you already read a [`Start`] event.
493 ///
494 /// Manages nested cases where parent and child elements have the _literally_
495 /// same name.
496 ///
497 /// This method does not unescape read data, instead it returns content
498 /// "as is" of the XML document. This is because it has no idea what text
499 /// it reads, and if, for example, it contains CDATA section, attempt to
500 /// unescape it content will spoil data.
501 ///
502 /// If your reader created from a string slice or byte array slice, it is
503 /// better to use [`read_text()`] method, because it will not copy bytes
504 /// into intermediate buffer.
505 ///
506 /// # Examples
507 ///
508 /// This example shows, how you can read a HTML content from your XML document.
509 ///
510 /// ```
511 /// # use pretty_assertions::assert_eq;
512 /// # use std::borrow::Cow;
513 /// use quick_xml::events::{BytesStart, Event};
514 /// use quick_xml::reader::Reader;
515 ///
516 /// let mut reader = Reader::from_reader("
517 /// <html>
518 /// <title>This is a HTML text</title>
519 /// <p>Usual XML rules does not apply inside it
520 /// <p>For example, elements not needed to be "closed"
521 /// </html>
522 /// ".as_bytes());
523 /// reader.config_mut().trim_text(true);
524 ///
525 /// let start = BytesStart::new("html");
526 /// let end = start.to_end().into_owned();
527 ///
528 /// let mut buf = Vec::new();
529 ///
530 /// // First, we read a start event...
531 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
532 /// // ...and disable checking of end names because we expect HTML further...
533 /// reader.config_mut().check_end_names = false;
534 ///
535 /// // ...then, we could read text content until close tag.
536 /// // This call will correctly handle nested <html> elements.
537 /// let text = reader.read_text_into(end.name(), &mut buf).unwrap();
538 /// let text = text.decode().unwrap();
539 /// assert_eq!(text, r#"
540 /// <title>This is a HTML text</title>
541 /// <p>Usual XML rules does not apply inside it
542 /// <p>For example, elements not needed to be "closed"
543 /// "#);
544 /// assert!(matches!(text, Cow::Borrowed(_)));
545 ///
546 /// // Now we can enable checks again
547 /// reader.config_mut().check_end_names = true;
548 ///
549 /// // At the end we should get an Eof event, because we ate the whole XML
550 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
551 /// ```
552 ///
553 /// [`Start`]: Event::Start
554 /// [`read_text()`]: Self::read_text()
555 pub fn read_text_into<'b>(
556 &mut self,
557 end: QName,
558 buf: &'b mut Vec<u8>,
559 ) -> Result<BytesText<'b>> {
560 let start = buf.len();
561 let span = read_to_end!(self, end, buf, read_event_impl, {});
562
563 let len = span.end - span.start;
564 // SAFETY: `buf` may contain not more than isize::MAX bytes and because it is
565 // not cleared when reading event, length of the returned span should fit into
566 // usize (because otherwise we panic at appending to the buffer before that point)
567 let end = start + len as usize;
568
569 Ok(BytesText::wrap(&buf[start..end], self.decoder()))
570 }
571}
572
573impl Reader<BufReader<File>> {
574 /// Creates an XML reader from a file path.
575 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
576 let file = File::open(path)?;
577 let reader = BufReader::new(file);
578 Ok(Self::from_reader(reader))
579 }
580}
581
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the byte array from the test back
    /// unchanged.
    fn identity<T>(source: T) -> T {
        source
    }

    // Instantiates the shared reader tests with a fresh `Vec` as the event buffer
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        1,
        &mut Vec::new()
    );
}