quick_xml/reader/buffered_reader.rs
//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
//! the underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::encoding;
9use crate::errors::{Error, Result};
10use crate::events::{BytesText, Event};
11use crate::name::QName;
12use crate::parser::Parser;
13use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
14use crate::utils::is_whitespace;
15
// Generates the bodies of the `XmlSource` methods for a buffered byte source.
//
// Invoked without arguments, the methods are synchronous and call `fill_buf()` /
// `consume()` directly on `self`. Invoked with `(lifetime, reader, async, await)`,
// every method is prefixed with `$async`, the calls go through `self.$reader`
// and each `fill_buf()` is followed by `.$await`.
//
// All methods retry `fill_buf()` when it fails with `io::ErrorKind::Interrupted`.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Drops the UTF-8 BOM from the front of the stream when the buffered
        /// data starts with it; otherwise nothing is consumed.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(head) => {
                        if head.starts_with(encoding::UTF8_BOM) {
                            self $(.$reader)? .consume(encoding::UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }

        /// Runs encoding detection on the buffered data. When an encoding is
        /// detected, `bom_len()` bytes are consumed and the detection result is
        /// returned; otherwise nothing is consumed and `None` is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<encoding::DetectedEncoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(head) => {
                        return Ok(match encoding::detect_encoding(head) {
                            Some(found) => {
                                self $(.$reader)? .consume(found.bom_len());
                                Some(found)
                            }
                            None => None,
                        });
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }

        /// Copies text into `buf` until a `<`, a `&` or the end of input is found.
        ///
        /// The stopping byte is never consumed. `position` is advanced by the
        /// number of bytes taken from the reader. If the very first byte is
        /// already `<` or `&`, nothing is read and `buf` is handed back inside
        /// `Markup` / `Ref`.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            let start = buf.len();
            let mut consumed = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return ReadTextResult::Err(err);
                    }
                };

                // Look for the start of markup or of an entity / character reference
                match memchr::memchr2(b'<', b'&', chunk) {
                    // Plain text only: take the whole chunk and ask for more
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                    // Only on the first iteration is there no text to emit. The `&`
                    // is not consumed, because it may be lone and then it has to be
                    // returned as part of a Text event
                    Some(0) if consumed == 0 => {
                        return if chunk[0] == b'<' {
                            ReadTextResult::Markup(buf)
                        } else {
                            ReadTextResult::Ref(buf)
                        };
                    }
                    // Some text was accumulated (now or on previous iterations):
                    // emit everything before the special byte
                    Some(at) => {
                        let at_markup = chunk[at] == b'<';
                        buf.extend_from_slice(&chunk[..at]);

                        self $(.$reader)? .consume(at);
                        consumed += at as u64;

                        *position += consumed;
                        let text = &buf[start..];
                        return if at_markup {
                            ReadTextResult::UpToMarkup(text)
                        } else {
                            ReadTextResult::UpToRef(text)
                        };
                    }
                }
            }

            *position += consumed;
            ReadTextResult::UpToEof(&buf[start..])
        }

        /// Copies a reference that starts with `&` into `buf`.
        ///
        /// Reading stops after the closing `;` (which is consumed), or right
        /// before the next `&` or `<` (which is not consumed), or at the end of
        /// input. `position` is advanced by the number of bytes taken from the
        /// reader.
        #[inline]
        $($async)? fn read_ref $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadRefResult<'b> {
            let start = buf.len();
            let mut consumed = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return ReadRefResult::Err(err);
                    }
                };

                // `read_ref` is called when the first byte is `&`. Step over it
                // explicitly on the first iteration, otherwise the search below
                // would mistake it for the end of the reference
                if consumed == 0 {
                    debug_assert!(
                        chunk.starts_with(b"&"),
                        "`read_ref` must be called at `&`:\n{:?}",
                        crate::utils::Bytes(chunk)
                    );
                    // A lone ampersand becomes part of the text, so it is kept
                    buf.push(b'&');
                    self $(.$reader)? .consume(1);
                    consumed += 1;
                    continue;
                }

                match memchr::memchr3(b';', b'&', b'<', chunk) {
                    // No terminator yet: take the whole chunk and ask for more
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                    Some(at) => {
                        let stop = chunk[at];
                        // The closing `;` belongs to the reference. A `&` or `<`
                        // is left in the reader: the `&` may be lone and would
                        // then have to be returned as part of a Text event
                        let take = if stop == b';' { at + 1 } else { at };
                        buf.extend_from_slice(&chunk[..take]);

                        self $(.$reader)? .consume(take);
                        consumed += take as u64;

                        *position += consumed;
                        let bytes = &buf[start..];
                        return match stop {
                            b';' => ReadRefResult::Ref(bytes),
                            b'&' => ReadRefResult::UpToRef(bytes),
                            _ => ReadRefResult::UpToMarkup(bytes),
                        };
                    }
                }
            }

            *position += consumed;
            ReadRefResult::UpToEof(&buf[start..])
        }

        /// Copies markup into `buf` until `parser` reports the index of the
        /// closing `>`; that byte is consumed and copied as well.
        ///
        /// A `<` is pushed to `buf` first and counted in `position`. If the
        /// input ends before the parser finds the end, the parser's EOF error
        /// is returned as a syntax error.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            let start = buf.len();
            // '<' was consumed in peek_one(), but not placed in buf
            let mut consumed = 1;
            buf.push(b'<');
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return Err(Error::from(err));
                    }
                };

                match parser.feed(chunk) {
                    Some(gt) => {
                        // +1 to include the `>` itself
                        let take = gt + 1;
                        buf.extend_from_slice(&chunk[..take]);

                        self $(.$reader)? .consume(take);
                        consumed += take as u64;

                        *position += consumed;
                        return Ok(&buf[start..]);
                    }
                    // The `>` symbol is not found yet, continue reading
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(parser.eof_error(&buf[start..])))
        }

        /// Copies a `<!...>` construct into `buf` and returns it together with
        /// its `BangType`.
        ///
        /// The kind is decided by `BangType::new` from the first byte after
        /// `<!`, then data is copied until `BangType::feed` reports the index of
        /// the closing `>`. If the input ends first, `BangType::to_err` supplies
        /// the syntax error.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // `<!` was peeked before this method is called, so the input is
            // guaranteed to start with it
            let start = buf.len();
            let mut consumed = 2;
            // '<' was consumed in peek_one(), but not placed in buf
            buf.push(b'<');
            buf.push(b'!');
            // Step over the byte that follows the already consumed '<'
            self $(.$reader)? .consume(1);

            let mut bang_type = loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => break BangType::new(bytes.first().cloned())?,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(Error::from(err)),
                }
            };

            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return Err(Error::from(err));
                    }
                };

                // Only `buf[start..]` is passed: whatever was in the buffer
                // before the bang element must not be considered
                match bang_type.feed(&buf[start..], chunk) {
                    Some(gt) => {
                        // +1 to include the `>` itself
                        let take = gt + 1;
                        buf.extend_from_slice(&chunk[..take]);

                        self $(.$reader)? .consume(take);
                        consumed += take as u64;

                        *position += consumed;
                        return Ok((bang_type, &buf[start..]));
                    }
                    // The `>` symbol is not found yet, continue reading
                    None => {
                        let len = chunk.len();
                        buf.extend_from_slice(chunk);

                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(bang_type.to_err()))
        }

        /// Consumes leading whitespace (as defined by `is_whitespace`) and adds
        /// the number of skipped bytes to `position`. Stops at the first
        /// non-whitespace byte or when no more data is available.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        let blanks = chunk.iter().take_while(|&&byte| is_whitespace(byte)).count();
                        if blanks == 0 {
                            return Ok(());
                        }
                        // The whole chunk may have been whitespace, so look at
                        // the next one as well
                        self $(.$reader)? .consume(blanks);
                        *position += blanks as u64;
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }

        /// Consumes one byte and returns the byte that follows it without
        /// consuming that one, or `None` if no more data is available.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            // This method is called only when the available buffer starts with
            // '<', which has to be consumed
            self $(.$reader)? .consume(1);
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => return Ok(bytes.first().cloned()),
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }
    };
}
328
329// Make it public for use in async implementations.
330// New rustc reports
331// > warning: the item `impl_buffered_source` is imported redundantly
332// so make it public only when async feature is enabled
333#[cfg(feature = "async-tokio")]
334pub(super) use impl_buffered_source;
335
336/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
337/// `Vec<u8>` as buffer that will be borrowed by events.
338impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
339 impl_buffered_source!();
340}
341
342////////////////////////////////////////////////////////////////////////////////////////////////////
343
344/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
345impl<R: BufRead> Reader<R> {
346 /// Reads the next `Event`.
347 ///
348 /// This is the main entry point for reading XML `Event`s.
349 ///
350 /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
351 /// internally).
352 ///
353 /// Having the possibility to control the internal buffers gives you some additional benefits
354 /// such as:
355 ///
356 /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
357 /// you can call `buf.clear()` once you are done with processing the event (typically at the
358 /// end of your loop).
359 /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
360 ///
361 /// # Examples
362 ///
363 /// ```
364 /// # use pretty_assertions::assert_eq;
365 /// use quick_xml::events::Event;
366 /// use quick_xml::reader::Reader;
367 ///
368 /// let xml = r#"<tag1 att1 = "test">
369 /// <tag2><!--Test comment-->Test</tag2>
370 /// <tag2>Test 2</tag2>
371 /// </tag1>"#;
372 /// let mut reader = Reader::from_str(xml);
373 /// reader.config_mut().trim_text(true);
374 /// let mut count = 0;
375 /// let mut buf = Vec::new();
376 /// let mut txt = Vec::new();
377 /// loop {
378 /// match reader.read_event_into(&mut buf) {
379 /// Ok(Event::Start(_)) => count += 1,
380 /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
381 /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
382 /// Ok(Event::Eof) => break,
383 /// _ => (),
384 /// }
385 /// buf.clear();
386 /// }
387 /// assert_eq!(count, 3);
388 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
389 /// ```
390 #[inline]
391 pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
392 self.read_event_impl(buf)
393 }
394
395 /// Reads until end element is found using provided buffer as intermediate
396 /// storage for events content. This function is supposed to be called after
397 /// you already read a [`Start`] event.
398 ///
399 /// Returns a span that cover content between `>` of an opening tag and `<` of
400 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
401 /// this method was called after reading expanded [`Start`] event.
402 ///
403 /// Manages nested cases where parent and child elements have the _literally_
404 /// same name.
405 ///
406 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
407 /// will be returned. In particularly, that error will be returned if you call
408 /// this method without consuming the corresponding [`Start`] event first.
409 ///
410 /// If your reader created from a string slice or byte array slice, it is
411 /// better to use [`read_to_end()`] method, because it will not copy bytes
412 /// into intermediate buffer.
413 ///
414 /// The provided `buf` buffer will be filled only by one event content at time.
415 /// Before reading of each event the buffer will be cleared. If you know an
416 /// appropriate size of each event, you can preallocate the buffer to reduce
417 /// number of reallocations.
418 ///
419 /// The `end` parameter should contain name of the end element _in the reader
420 /// encoding_. It is good practice to always get that parameter using
421 /// [`BytesStart::to_end()`] method.
422 ///
423 /// The correctness of the skipped events does not checked, if you disabled
424 /// the [`check_end_names`] option.
425 ///
426 /// # Namespaces
427 ///
428 /// While the `Reader` does not support namespace resolution, namespaces
429 /// does not change the algorithm for comparing names. Although the names
430 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
431 /// same namespace, are semantically equivalent, `</b:name>` cannot close
432 /// `<a:name>`, because according to [the specification]
433 ///
434 /// > The end of every element that begins with a **start-tag** MUST be marked
435 /// > by an **end-tag** containing a name that echoes the element's type as
436 /// > given in the **start-tag**
437 ///
438 /// # Examples
439 ///
440 /// This example shows, how you can skip XML content after you read the
441 /// start event.
442 ///
443 /// ```
444 /// # use pretty_assertions::assert_eq;
445 /// use quick_xml::events::{BytesStart, Event};
446 /// use quick_xml::reader::Reader;
447 ///
448 /// let mut reader = Reader::from_str(r#"
449 /// <outer>
450 /// <inner>
451 /// <inner></inner>
452 /// <inner/>
453 /// <outer></outer>
454 /// <outer/>
455 /// </inner>
456 /// </outer>
457 /// "#);
458 /// reader.config_mut().trim_text(true);
459 /// let mut buf = Vec::new();
460 ///
461 /// let start = BytesStart::new("outer");
462 /// let end = start.to_end().into_owned();
463 ///
464 /// // First, we read a start event...
465 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
466 ///
467 /// // ...then, we could skip all events to the corresponding end event.
468 /// // This call will correctly handle nested <outer> elements.
469 /// // Note, however, that this method does not handle namespaces.
470 /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
471 ///
472 /// // At the end we should get an Eof event, because we ate the whole XML
473 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
474 /// ```
475 ///
476 /// [`Start`]: Event::Start
477 /// [`End`]: Event::End
478 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
479 /// [`read_to_end()`]: Self::read_to_end
480 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
481 /// [`check_end_names`]: crate::reader::Config::check_end_names
482 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
483 pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
484 Ok(read_to_end!(self, end, buf, read_event_impl, {
485 buf.clear();
486 }))
487 }
488
489 /// Reads content between start and end tags, including any markup using
490 /// provided buffer as intermediate storage for events content. This function
491 /// is supposed to be called after you already read a [`Start`] event.
492 ///
493 /// Manages nested cases where parent and child elements have the _literally_
494 /// same name.
495 ///
496 /// This method does not unescape read data, instead it returns content
497 /// "as is" of the XML document. This is because it has no idea what text
498 /// it reads, and if, for example, it contains CDATA section, attempt to
499 /// unescape it content will spoil data.
500 ///
501 /// If your reader created from a string slice or byte array slice, it is
502 /// better to use [`read_text()`] method, because it will not copy bytes
503 /// into intermediate buffer.
504 ///
505 /// # Examples
506 ///
507 /// This example shows, how you can read a HTML content from your XML document.
508 ///
509 /// ```
510 /// # use pretty_assertions::assert_eq;
511 /// # use std::borrow::Cow;
512 /// use quick_xml::events::{BytesStart, Event};
513 /// use quick_xml::reader::Reader;
514 ///
515 /// let mut reader = Reader::from_reader("
516 /// <html>
517 /// <title>This is a HTML text</title>
518 /// <p>Usual XML rules does not apply inside it
519 /// <p>For example, elements not needed to be "closed"
520 /// </html>
521 /// ".as_bytes());
522 /// reader.config_mut().trim_text(true);
523 ///
524 /// let start = BytesStart::new("html");
525 /// let end = start.to_end().into_owned();
526 ///
527 /// let mut buf = Vec::new();
528 ///
529 /// // First, we read a start event...
530 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
531 /// // ...and disable checking of end names because we expect HTML further...
532 /// reader.config_mut().check_end_names = false;
533 ///
534 /// // ...then, we could read text content until close tag.
535 /// // This call will correctly handle nested <html> elements.
536 /// let text = reader.read_text_into(end.name(), &mut buf).unwrap();
537 /// let text = text.decode().unwrap();
538 /// assert_eq!(text, r#"
539 /// <title>This is a HTML text</title>
540 /// <p>Usual XML rules does not apply inside it
541 /// <p>For example, elements not needed to be "closed"
542 /// "#);
543 /// assert!(matches!(text, Cow::Borrowed(_)));
544 ///
545 /// // Now we can enable checks again
546 /// reader.config_mut().check_end_names = true;
547 ///
548 /// // At the end we should get an Eof event, because we ate the whole XML
549 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
550 /// ```
551 ///
552 /// [`Start`]: Event::Start
553 /// [`read_text()`]: Self::read_text()
554 pub fn read_text_into<'b>(
555 &mut self,
556 end: QName,
557 buf: &'b mut Vec<u8>,
558 ) -> Result<BytesText<'b>> {
559 let start = buf.len();
560 let span = read_to_end!(self, end, buf, read_event_impl, {});
561
562 let len = span.end - span.start;
563 // SAFETY: `buf` may contain not more than isize::MAX bytes and because it is
564 // not cleared when reading event, length of the returned span should fit into
565 // usize (because otherwise we panic at appending to the buffer before that point)
566 let end = start + len as usize;
567
568 Ok(BytesText::wrap(&buf[start..end], self.decoder()))
569 }
570}
571
572impl Reader<BufReader<File>> {
573 /// Creates an XML reader from a file path.
574 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
575 let file = File::open(path)?;
576 let reader = BufReader::new(file);
577 Ok(Self::from_reader(reader))
578 }
579}
580
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the value received from the test back
    /// unchanged.
    fn identity<T>(source: T) -> T {
        source
    }

    // Instantiate the shared reader tests from the `check!` macro
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        1,
        &mut Vec::new()
    );
}