fast_xml/reader.rs
1//! A module to handle `Reader`
2
3#[cfg(feature = "encoding")]
4use std::borrow::Cow;
5use std::io::{self, BufRead, BufReader};
6use std::{fs::File, path::Path, str::from_utf8};
7
8#[cfg(feature = "encoding")]
9use encoding_rs::{Encoding, UTF_16BE, UTF_16LE};
10
11use crate::errors::{Error, Result};
12use crate::events::attributes::Attribute;
13use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
14
15use memchr;
16
/// Position of the parser relative to the markup it is currently consuming.
///
/// `Reader::read_event_buffered` dispatches on this value to pick the
/// low-level routine that produces the next event.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TagState {
    /// Set when the reader starts collecting text up to a `<`; the next read
    /// parses the markup that follows up to its closing `>`.
    Opened,
    /// Set when the reader starts parsing markup; the next read collects
    /// character data up to the next `<`.
    Closed,
    /// A self-closing tag was reported as a `Start` event and the matching
    /// synthetic `End` event is still owed to the caller.
    Empty,
    /// Either Eof or Errored
    Exit,
}
25
26/// A low level encoding-agnostic XML event reader.
27///
28/// Consumes a `BufRead` and streams XML `Event`s.
29///
30/// # Examples
31///
32/// ```
33/// use fast_xml::Reader;
34/// use fast_xml::events::Event;
35///
36/// let xml = r#"<tag1 att1 = "test">
37/// <tag2><!--Test comment-->Test</tag2>
38/// <tag2>Test 2</tag2>
39/// </tag1>"#;
40/// let mut reader = Reader::from_str(xml);
41/// reader.trim_text(true);
42/// let mut count = 0;
43/// let mut txt = Vec::new();
44/// let mut buf = Vec::new();
45/// loop {
46/// match reader.read_event(&mut buf) {
47/// Ok(Event::Start(ref e)) => {
48/// match e.name() {
49/// b"tag1" => println!("attributes values: {:?}",
50/// e.attributes().map(|a| a.unwrap().value)
51/// .collect::<Vec<_>>()),
52/// b"tag2" => count += 1,
53/// _ => (),
54/// }
55/// },
56/// Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()),
57/// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
58/// Ok(Event::Eof) => break,
59/// _ => (),
60/// }
61/// buf.clear();
62/// }
63/// ```
#[derive(Clone)]
pub struct Reader<R: BufRead> {
    /// The underlying input source that events are read from.
    pub(crate) reader: R,
    /// Current position in the input, in bytes; useful for debugging errors.
    /// Exposed (with an adjustment) through `buffer_position()`.
    buf_position: usize,
    /// Current parser state; selects the routine used to read the next event.
    tag_state: TagState,
    /// Expand an empty element into an opening and a closing element.
    expand_empty_elements: bool,
    /// Trim leading whitespace in `Text` events; skip the event entirely if
    /// the text is empty.
    trim_text_start: bool,
    /// Trim trailing whitespace in `Text` events.
    trim_text_end: bool,
    /// Trim trailing whitespace from markup names in closing tags (`</a >`).
    trim_markup_names_in_closing_tags: bool,
    /// Check that every `End` node matches the last `Start` node.
    check_end_names: bool,
    /// Check that comments do not contain `--` (`false` by default).
    check_comments: bool,
    /// Names of all currently started elements which do not have a matching
    /// `End` element yet, stored back to back.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when the cursor is at the `|` position, the buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in
    /// [`Self::opened_starts`] (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Start indexes of the opened names inside [`Self::opened_buffer`]. See
    /// the documentation of that field for details.
    opened_starts: Vec<usize>,
    /// Helper that manages namespace scopes and resolves prefixes.
    ns_resolver: NamespaceResolver,
    #[cfg(feature = "encoding")]
    /// The encoding used for decoding; defaults to UTF-8 and may be replaced
    /// by the one named in the XML declaration or implied by a BOM.
    encoding: &'static Encoding,
    #[cfg(feature = "encoding")]
    /// Whether the encoding has already been determined (from the XML
    /// declaration or from a byte order mark).
    is_encoding_set: bool,
}
114
115impl<R: BufRead> Reader<R> {
116 /// Creates a `Reader` that reads from a reader implementing `BufRead`.
117 pub fn from_reader(reader: R) -> Reader<R> {
118 Reader {
119 reader,
120 opened_buffer: Vec::new(),
121 opened_starts: Vec::new(),
122 tag_state: TagState::Closed,
123 expand_empty_elements: false,
124 trim_text_start: false,
125 trim_text_end: false,
126 trim_markup_names_in_closing_tags: true,
127 check_end_names: true,
128 buf_position: 0,
129 check_comments: false,
130 ns_resolver: NamespaceResolver::default(),
131 #[cfg(feature = "encoding")]
132 encoding: ::encoding_rs::UTF_8,
133 #[cfg(feature = "encoding")]
134 is_encoding_set: false,
135 }
136 }
137
138 /// Changes whether empty elements should be split into an `Open` and a `Close` event.
139 ///
140 /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are
141 /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the
142 /// default), those tags are represented by an [`Empty`] event instead.
143 ///
144 /// Note, that setting this to `true` will lead to additional allocates that
145 /// needed to store tag name for an [`End`] event. There is no additional
146 /// allocation, however, if [`Self::check_end_names()`] is also set.
147 ///
148 /// (`false` by default)
149 ///
150 /// [`Empty`]: events/enum.Event.html#variant.Empty
151 /// [`Start`]: events/enum.Event.html#variant.Start
152 /// [`End`]: events/enum.Event.html#variant.End
153 pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader<R> {
154 self.expand_empty_elements = val;
155 self
156 }
157
158 /// Changes whether whitespace before and after character data should be removed.
159 ///
160 /// When set to `true`, all [`Text`] events are trimmed. If they are empty, no event will be
161 /// pushed.
162 ///
163 /// (`false` by default)
164 ///
165 /// [`Text`]: events/enum.Event.html#variant.Text
166 pub fn trim_text(&mut self, val: bool) -> &mut Reader<R> {
167 self.trim_text_start = val;
168 self.trim_text_end = val;
169 self
170 }
171
172 /// Changes whether whitespace after character data should be removed.
173 ///
174 /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
175 ///
176 /// (`false` by default)
177 ///
178 /// [`Text`]: events/enum.Event.html#variant.Text
179 pub fn trim_text_end(&mut self, val: bool) -> &mut Reader<R> {
180 self.trim_text_end = val;
181 self
182 }
183
184 /// Changes whether trailing whitespaces after the markup name are trimmed in closing tags
185 /// `</a >`.
186 ///
187 /// If true the emitted [`End`] event is stripped of trailing whitespace after the markup name.
188 ///
189 /// Note that if set to `false` and `check_end_names` is true the comparison of markup names is
190 /// going to fail erronously if a closing tag contains trailing whitespaces.
191 ///
192 /// (`true` by default)
193 ///
194 /// [`End`]: events/enum.Event.html#variant.End
195 pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader<R> {
196 self.trim_markup_names_in_closing_tags = val;
197 self
198 }
199
200 /// Changes whether mismatched closing tag names should be detected.
201 ///
202 /// When set to `false`, it won't check if a closing tag matches the corresponding opening tag.
203 /// For example, `<mytag></different_tag>` will be permitted.
204 ///
205 /// If the XML is known to be sane (already processed, etc.) this saves extra time.
206 ///
207 /// Note that the emitted [`End`] event will not be modified if this is disabled, ie. it will
208 /// contain the data of the mismatched end tag.
209 ///
210 /// Note, that setting this to `true` will lead to additional allocates that
211 /// needed to store tag name for an [`End`] event. There is no additional
212 /// allocation, however, if [`Self::expand_empty_elements()`] is also set.
213 ///
214 /// (`true` by default)
215 ///
216 /// [`End`]: events/enum.Event.html#variant.End
217 pub fn check_end_names(&mut self, val: bool) -> &mut Reader<R> {
218 self.check_end_names = val;
219 self
220 }
221
222 /// Changes whether comments should be validated.
223 ///
224 /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which
225 /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't
226 /// really care about comment correctness, thus the default value is `false` to improve
227 /// performance.
228 ///
229 /// (`false` by default)
230 ///
231 /// [`Comment`]: events/enum.Event.html#variant.Comment
232 pub fn check_comments(&mut self, val: bool) -> &mut Reader<R> {
233 self.check_comments = val;
234 self
235 }
236
237 /// Gets the current byte position in the input data.
238 ///
239 /// Useful when debugging errors.
240 pub fn buffer_position(&self) -> usize {
241 // when internal state is Opened, we have actually read until '<',
242 // which we don't want to show
243 if let TagState::Opened = self.tag_state {
244 self.buf_position - 1
245 } else {
246 self.buf_position
247 }
248 }
249
250 /// private function to read until '<' is found
251 /// return a `Text` event
252 fn read_until_open<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
253 where
254 R: XmlSource<'i, B>,
255 {
256 self.tag_state = TagState::Opened;
257
258 if self.trim_text_start {
259 self.reader.skip_whitespace(&mut self.buf_position)?;
260 if self.reader.skip_one(b'<', &mut self.buf_position)? {
261 return self.read_event_buffered(buf);
262 }
263 }
264
265 match self
266 .reader
267 .read_bytes_until(b'<', buf, &mut self.buf_position)
268 {
269 Ok(Some(bytes)) if self.trim_text_end => {
270 // Skip the ending '<
271 let len = bytes
272 .iter()
273 .rposition(|&b| !is_whitespace(b))
274 .map_or_else(|| bytes.len(), |p| p + 1);
275 Ok(Event::Text(BytesText::from_escaped(&bytes[..len])))
276 }
277 Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))),
278 Ok(None) => Ok(Event::Eof),
279 Err(e) => Err(e),
280 }
281 }
282
283 /// Private function to read until `>` is found. This function expects that
284 /// it was called just after encounter a `<` symbol.
285 fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
286 where
287 R: XmlSource<'i, B>,
288 {
289 self.tag_state = TagState::Closed;
290
291 match self.reader.peek_one() {
292 // `<!` - comment, CDATA or DOCTYPE declaration
293 Ok(Some(b'!')) => match self.reader.read_bang_element(buf, &mut self.buf_position) {
294 Ok(None) => Ok(Event::Eof),
295 Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes),
296 Err(e) => Err(e),
297 },
298 // `</` - closing tag
299 Ok(Some(b'/')) => match self
300 .reader
301 .read_bytes_until(b'>', buf, &mut self.buf_position)
302 {
303 Ok(None) => Ok(Event::Eof),
304 Ok(Some(bytes)) => self.read_end(bytes),
305 Err(e) => Err(e),
306 },
307 // `<?` - processing instruction
308 Ok(Some(b'?')) => match self
309 .reader
310 .read_bytes_until(b'>', buf, &mut self.buf_position)
311 {
312 Ok(None) => Ok(Event::Eof),
313 Ok(Some(bytes)) => self.read_question_mark(bytes),
314 Err(e) => Err(e),
315 },
316 // `<...` - opening or self-closed tag
317 Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) {
318 Ok(None) => Ok(Event::Eof),
319 Ok(Some(bytes)) => self.read_start(bytes),
320 Err(e) => Err(e),
321 },
322 Ok(None) => Ok(Event::Eof),
323 Err(e) => Err(e),
324 }
325 }
326
327 /// reads `BytesElement` starting with a `/`,
328 /// if `self.check_end_names`, checks that element matches last opened element
329 /// return `End` event
330 fn read_end<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
331 // XML standard permits whitespaces after the markup name in closing tags.
332 // Let's strip them from the buffer before comparing tag names.
333 let name = if self.trim_markup_names_in_closing_tags {
334 if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) {
335 let (name, _) = buf[1..].split_at(pos_end_name + 1);
336 name
337 } else {
338 &buf[1..]
339 }
340 } else {
341 &buf[1..]
342 };
343 if self.check_end_names {
344 let mismatch_err = |expected: &[u8], found: &[u8], buf_position: &mut usize| {
345 *buf_position -= buf.len();
346 Err(Error::EndEventMismatch {
347 expected: from_utf8(expected).unwrap_or("").to_owned(),
348 found: from_utf8(found).unwrap_or("").to_owned(),
349 })
350 };
351 match self.opened_starts.pop() {
352 Some(start) => {
353 let expected = &self.opened_buffer[start..];
354 if name != expected {
355 mismatch_err(expected, name, &mut self.buf_position)
356 } else {
357 self.opened_buffer.truncate(start);
358 Ok(Event::End(BytesEnd::borrowed(name)))
359 }
360 }
361 None => mismatch_err(b"", &buf[1..], &mut self.buf_position),
362 }
363 } else {
364 Ok(Event::End(BytesEnd::borrowed(name)))
365 }
366 }
367
368 /// reads `BytesElement` starting with a `!`,
369 /// return `Comment`, `CData` or `DocType` event
370 fn read_bang<'a, 'b>(&'a mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
371 let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
372 string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
373 };
374
375 let len = buf.len();
376 match bang_type {
377 BangType::Comment if buf.starts_with(b"!--") => {
378 if self.check_comments {
379 // search if '--' not in comments
380 if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
381 .position(|p| buf[3 + p + 1] == b'-')
382 {
383 self.buf_position += len - p;
384 return Err(Error::UnexpectedToken("--".to_string()));
385 }
386 }
387 Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2])))
388 }
389 BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
390 Ok(Event::CData(BytesCData::new(&buf[8..])))
391 }
392 BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
393 let start = buf[8..]
394 .iter()
395 .position(|b| !is_whitespace(*b))
396 .unwrap_or_else(|| len - 8);
397 debug_assert!(start < len - 8, "DocType must have a name");
398 Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..])))
399 }
400 _ => Err(bang_type.to_err()),
401 }
402 }
403
404 /// reads `BytesElement` starting with a `?`,
405 /// return `Decl` or `PI` event
406 fn read_question_mark<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
407 let len = buf.len();
408 if len > 2 && buf[len - 1] == b'?' {
409 if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
410 let event = BytesDecl::from_start(BytesStart::borrowed(&buf[1..len - 1], 3));
411
412 // Try getting encoding from the declaration event
413 #[cfg(feature = "encoding")]
414 if let Some(enc) = event.encoder() {
415 self.encoding = enc;
416 self.is_encoding_set = true;
417 }
418
419 Ok(Event::Decl(event))
420 } else {
421 Ok(Event::PI(BytesText::from_escaped(&buf[1..len - 1])))
422 }
423 } else {
424 self.buf_position -= len;
425 Err(Error::UnexpectedEof("XmlDecl".to_string()))
426 }
427 }
428
429 #[inline]
430 fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
431 self.tag_state = TagState::Closed;
432 let name = self
433 .opened_buffer
434 .split_off(self.opened_starts.pop().unwrap());
435 Ok(Event::End(BytesEnd::owned(name)))
436 }
437
438 /// reads `BytesElement` starting with any character except `/`, `!` or ``?`
439 /// return `Start` or `Empty` event
440 fn read_start<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
441 // TODO: do this directly when reading bufreader ...
442 let len = buf.len();
443 let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len);
444 if let Some(&b'/') = buf.last() {
445 let end = if name_end < len { name_end } else { len - 1 };
446 if self.expand_empty_elements {
447 self.tag_state = TagState::Empty;
448 self.opened_starts.push(self.opened_buffer.len());
449 self.opened_buffer.extend(&buf[..end]);
450 Ok(Event::Start(BytesStart::borrowed(&buf[..len - 1], end)))
451 } else {
452 Ok(Event::Empty(BytesStart::borrowed(&buf[..len - 1], end)))
453 }
454 } else {
455 if self.check_end_names {
456 self.opened_starts.push(self.opened_buffer.len());
457 self.opened_buffer.extend(&buf[..name_end]);
458 }
459 Ok(Event::Start(BytesStart::borrowed(buf, name_end)))
460 }
461 }
462
463 /// Reads the next `Event`.
464 ///
465 /// This is the main entry point for reading XML `Event`s.
466 ///
467 /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
468 /// internally).
469 ///
470 /// Having the possibility to control the internal buffers gives you some additional benefits
471 /// such as:
472 ///
473 /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
474 /// you can call `buf.clear()` once you are done with processing the event (typically at the
475 /// end of your loop).
476 /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
477 ///
478 /// # Examples
479 ///
480 /// ```
481 /// use fast_xml::Reader;
482 /// use fast_xml::events::Event;
483 ///
484 /// let xml = r#"<tag1 att1 = "test">
485 /// <tag2><!--Test comment-->Test</tag2>
486 /// <tag2>Test 2</tag2>
487 /// </tag1>"#;
488 /// let mut reader = Reader::from_str(xml);
489 /// reader.trim_text(true);
490 /// let mut count = 0;
491 /// let mut buf = Vec::new();
492 /// let mut txt = Vec::new();
493 /// loop {
494 /// match reader.read_event(&mut buf) {
495 /// Ok(Event::Start(ref e)) => count += 1,
496 /// Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).expect("Error!")),
497 /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
498 /// Ok(Event::Eof) => break,
499 /// _ => (),
500 /// }
501 /// buf.clear();
502 /// }
503 /// println!("Found {} start events", count);
504 /// println!("Text events: {:?}", txt);
505 /// ```
506 #[inline]
507 pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
508 self.read_event_buffered(buf)
509 }
510
511 /// Read text into the given buffer, and return an event that borrows from
512 /// either that buffer or from the input itself, based on the type of the
513 /// reader.
514 fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
515 where
516 R: XmlSource<'i, B>,
517 {
518 let event = match self.tag_state {
519 TagState::Opened => self.read_until_close(buf),
520 TagState::Closed => self.read_until_open(buf),
521 TagState::Empty => self.close_expanded_empty(),
522 TagState::Exit => return Ok(Event::Eof),
523 };
524 match event {
525 Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit,
526 _ => {}
527 }
528 event
529 }
530
531 /// Resolves a potentially qualified **event name** into (namespace name, local name).
532 ///
533 /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined
534 /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix
535 /// can be defined on the same element as the attribute in question.
536 ///
537 /// *Unqualified* event inherits the current *default namespace*.
538 #[inline]
539 pub fn event_namespace<'a, 'b, 'c>(
540 &'a self,
541 qname: &'b [u8],
542 namespace_buffer: &'c [u8],
543 ) -> (Option<&'c [u8]>, &'b [u8]) {
544 self.ns_resolver.resolve(qname, namespace_buffer, true)
545 }
546
547 /// Resolves a potentially qualified **attribute name** into (namespace name, local name).
548 ///
549 /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined
550 /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix
551 /// can be defined on the same element as the attribute in question.
552 ///
553 /// *Unqualified* attribute names do *not* inherit the current *default namespace*.
554 #[inline]
555 pub fn attribute_namespace<'a, 'b, 'c>(
556 &'a self,
557 qname: &'b [u8],
558 namespace_buffer: &'c [u8],
559 ) -> (Option<&'c [u8]>, &'b [u8]) {
560 self.ns_resolver.resolve(qname, namespace_buffer, false)
561 }
562
563 /// Reads the next event and resolves its namespace (if applicable).
564 ///
565 /// # Examples
566 ///
567 /// ```
568 /// use std::str::from_utf8;
569 /// use fast_xml::Reader;
570 /// use fast_xml::events::Event;
571 ///
572 /// let xml = r#"<x:tag1 xmlns:x="www.xxxx" xmlns:y="www.yyyy" att1 = "test">
573 /// <y:tag2><!--Test comment-->Test</y:tag2>
574 /// <y:tag2>Test 2</y:tag2>
575 /// </x:tag1>"#;
576 /// let mut reader = Reader::from_str(xml);
577 /// reader.trim_text(true);
578 /// let mut count = 0;
579 /// let mut buf = Vec::new();
580 /// let mut ns_buf = Vec::new();
581 /// let mut txt = Vec::new();
582 /// loop {
583 /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) {
584 /// Ok((ref ns, Event::Start(ref e))) => {
585 /// count += 1;
586 /// match (*ns, e.local_name()) {
587 /// (Some(b"www.xxxx"), b"tag1") => (),
588 /// (Some(b"www.yyyy"), b"tag2") => (),
589 /// (ns, n) => panic!("Namespace and local name mismatch"),
590 /// }
591 /// println!("Resolved namespace: {:?}", ns.and_then(|ns| from_utf8(ns).ok()));
592 /// }
593 /// Ok((_, Event::Text(e))) => {
594 /// txt.push(e.unescape_and_decode(&reader).expect("Error!"))
595 /// },
596 /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
597 /// Ok((_, Event::Eof)) => break,
598 /// _ => (),
599 /// }
600 /// buf.clear();
601 /// }
602 /// println!("Found {} start events", count);
603 /// println!("Text events: {:?}", txt);
604 /// ```
605 pub fn read_namespaced_event<'a, 'b, 'c>(
606 &'a mut self,
607 buf: &'b mut Vec<u8>,
608 namespace_buffer: &'c mut Vec<u8>,
609 ) -> Result<(Option<&'c [u8]>, Event<'b>)> {
610 self.ns_resolver.pop(namespace_buffer);
611 match self.read_event(buf) {
612 Ok(Event::Eof) => Ok((None, Event::Eof)),
613 Ok(Event::Start(e)) => {
614 self.ns_resolver.push(&e, namespace_buffer);
615 Ok((
616 self.ns_resolver.find(e.name(), &**namespace_buffer),
617 Event::Start(e),
618 ))
619 }
620 Ok(Event::Empty(e)) => {
621 // For empty elements we need to 'artificially' keep the namespace scope on the
622 // stack until the next `next()` call occurs.
623 // Otherwise the caller has no chance to use `resolve` in the context of the
624 // namespace declarations that are 'in scope' for the empty element alone.
625 // Ex: <img rdf:nodeID="abc" xmlns:rdf="urn:the-rdf-uri" />
626 self.ns_resolver.push(&e, namespace_buffer);
627 // notify next `read_namespaced_event()` invocation that it needs to pop this
628 // namespace scope
629 self.ns_resolver.pending_pop = true;
630 Ok((
631 self.ns_resolver.find(e.name(), &**namespace_buffer),
632 Event::Empty(e),
633 ))
634 }
635 Ok(Event::End(e)) => {
636 // notify next `read_namespaced_event()` invocation that it needs to pop this
637 // namespace scope
638 self.ns_resolver.pending_pop = true;
639 Ok((
640 self.ns_resolver.find(e.name(), &**namespace_buffer),
641 Event::End(e),
642 ))
643 }
644 Ok(e) => Ok((None, e)),
645 Err(e) => Err(e),
646 }
647 }
648
649 /// Returns the `Reader`s encoding.
650 ///
651 /// The used encoding may change after parsing the XML declaration.
652 ///
653 /// This encoding will be used by [`decode`].
654 ///
655 /// [`decode`]: #method.decode
656 #[cfg(feature = "encoding")]
657 pub fn encoding(&self) -> &'static Encoding {
658 self.encoding
659 }
660
661 /// Decodes a slice using the encoding specified in the XML declaration.
662 ///
663 /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
664 /// `U+FFFD REPLACEMENT CHARACTER`.
665 ///
666 /// If no encoding is specified, defaults to UTF-8.
667 #[inline]
668 #[cfg(feature = "encoding")]
669 pub fn decode<'b, 'c>(&'b self, bytes: &'c [u8]) -> Cow<'c, str> {
670 self.encoding.decode(bytes).0
671 }
672
673 /// Decodes a UTF8 slice without BOM (Byte order mark) regardless of XML declaration.
674 ///
675 /// Decode `bytes` without BOM and with malformed sequences replaced with the
676 /// `U+FFFD REPLACEMENT CHARACTER`.
677 ///
678 /// # Note
679 ///
680 /// If you instead want to use XML declared encoding, use the `encoding` feature
681 #[inline]
682 #[cfg(not(feature = "encoding"))]
683 pub fn decode_without_bom<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
684 if bytes.starts_with(b"\xEF\xBB\xBF") {
685 from_utf8(&bytes[3..]).map_err(Error::Utf8)
686 } else {
687 from_utf8(bytes).map_err(Error::Utf8)
688 }
689 }
690
691 /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration.
692 ///
693 /// Decode `bytes` without BOM and with malformed sequences replaced with the
694 /// `U+FFFD REPLACEMENT CHARACTER`.
695 ///
696 /// If no encoding is specified, defaults to UTF-8.
697 #[inline]
698 #[cfg(feature = "encoding")]
699 pub fn decode_without_bom<'b, 'c>(&'b mut self, mut bytes: &'c [u8]) -> Cow<'c, str> {
700 if self.is_encoding_set {
701 return self.encoding.decode_with_bom_removal(bytes).0;
702 }
703 if bytes.starts_with(b"\xEF\xBB\xBF") {
704 self.is_encoding_set = true;
705 bytes = &bytes[3..];
706 } else if bytes.starts_with(b"\xFF\xFE") {
707 self.is_encoding_set = true;
708 self.encoding = UTF_16LE;
709 bytes = &bytes[2..];
710 } else if bytes.starts_with(b"\xFE\xFF") {
711 self.is_encoding_set = true;
712 self.encoding = UTF_16BE;
713 bytes = &bytes[3..];
714 };
715 self.encoding.decode_without_bom_handling(bytes).0
716 }
717
718 /// Decodes a UTF8 slice regardless of XML declaration.
719 ///
720 /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
721 /// `U+FFFD REPLACEMENT CHARACTER`.
722 ///
723 /// # Note
724 ///
725 /// If you instead want to use XML declared encoding, use the `encoding` feature
726 #[inline]
727 #[cfg(not(feature = "encoding"))]
728 pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
729 from_utf8(bytes).map_err(Error::Utf8)
730 }
731
732 /// Get utf8 decoder
733 #[cfg(feature = "encoding")]
734 pub fn decoder(&self) -> Decoder {
735 Decoder {
736 encoding: self.encoding,
737 }
738 }
739
740 /// Get utf8 decoder
741 #[cfg(not(feature = "encoding"))]
742 pub fn decoder(&self) -> Decoder {
743 Decoder
744 }
745
746 /// Reads until end element is found
747 ///
748 /// Manages nested cases where parent and child elements have the same name
749 pub fn read_to_end<K: AsRef<[u8]>>(&mut self, end: K, buf: &mut Vec<u8>) -> Result<()> {
750 let mut depth = 0;
751 let end = end.as_ref();
752 loop {
753 match self.read_event(buf) {
754 Ok(Event::End(ref e)) if e.name() == end => {
755 if depth == 0 {
756 return Ok(());
757 }
758 depth -= 1;
759 }
760 Ok(Event::Start(ref e)) if e.name() == end => depth += 1,
761 Err(e) => return Err(e),
762 Ok(Event::Eof) => {
763 return Err(Error::UnexpectedEof(format!("</{:?}>", from_utf8(end))));
764 }
765 _ => (),
766 }
767 buf.clear();
768 }
769 }
770
771 /// Reads optional text between start and end tags.
772 ///
773 /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
774 /// `String`. If the next event is an [`End`] event, returns the empty string. In all other
775 /// cases, returns an error.
776 ///
777 /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
778 /// if none is specified).
779 ///
780 /// # Examples
781 ///
782 /// ```
783 /// # use pretty_assertions::assert_eq;
784 /// use fast_xml::Reader;
785 /// use fast_xml::events::Event;
786 ///
787 /// let mut xml = Reader::from_reader(b"
788 /// <a><b></a>
789 /// <a></a>
790 /// " as &[u8]);
791 /// xml.trim_text(true);
792 ///
793 /// let expected = ["<b>", ""];
794 /// for &content in expected.iter() {
795 /// match xml.read_event(&mut Vec::new()) {
796 /// Ok(Event::Start(ref e)) => {
797 /// assert_eq!(&xml.read_text(e.name(), &mut Vec::new()).unwrap(), content);
798 /// },
799 /// e => panic!("Expecting Start event, found {:?}", e),
800 /// }
801 /// }
802 /// ```
803 ///
804 /// [`Text`]: events/enum.Event.html#variant.Text
805 /// [`End`]: events/enum.Event.html#variant.End
806 pub fn read_text<K: AsRef<[u8]>>(&mut self, end: K, buf: &mut Vec<u8>) -> Result<String> {
807 let s = match self.read_event(buf) {
808 Ok(Event::Text(e)) => e.unescape_and_decode(self),
809 Ok(Event::End(ref e)) if e.name() == end.as_ref() => return Ok("".to_string()),
810 Err(e) => return Err(e),
811 Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
812 _ => return Err(Error::TextNotFound),
813 };
814 self.read_to_end(end, buf)?;
815 s
816 }
817
818 /// Consumes `Reader` returning the underlying reader
819 ///
820 /// Can be used to compute line and column of a parsing error position
821 ///
822 /// # Examples
823 ///
824 /// ```
825 /// # use pretty_assertions::assert_eq;
826 /// use std::{str, io::Cursor};
827 /// use fast_xml::Reader;
828 /// use fast_xml::events::Event;
829 ///
830 /// let xml = r#"<tag1 att1 = "test">
831 /// <tag2><!--Test comment-->Test</tag2>
832 /// <tag3>Test 2</tag3>
833 /// </tag1>"#;
834 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
835 /// let mut buf = Vec::new();
836 ///
837 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
838 /// let end_pos = reader.buffer_position();
839 /// let mut cursor = reader.into_inner();
840 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
841 /// .expect("can't make a string");
842 /// let mut line = 1;
843 /// let mut column = 0;
844 /// for c in s.chars() {
845 /// if c == '\n' {
846 /// line += 1;
847 /// column = 0;
848 /// } else {
849 /// column += 1;
850 /// }
851 /// }
852 /// (line, column)
853 /// }
854 ///
855 /// loop {
856 /// match reader.read_event(&mut buf) {
857 /// Ok(Event::Start(ref e)) => match e.name() {
858 /// b"tag1" | b"tag2" => (),
859 /// tag => {
860 /// assert_eq!(b"tag3", tag);
861 /// assert_eq!((3, 22), into_line_and_column(reader));
862 /// break;
863 /// }
864 /// },
865 /// Ok(Event::Eof) => unreachable!(),
866 /// _ => (),
867 /// }
868 /// buf.clear();
869 /// }
870 /// ```
871 pub fn into_inner(self) -> R {
872 self.reader
873 }
874
875 /// Gets a reference to the underlying reader.
876 pub fn get_ref(&self) -> &R {
877 &self.reader
878 }
879
880 /// Gets a mutable reference to the underlying reader.
881 pub fn get_mut(&mut self) -> &mut R {
882 &mut self.reader
883 }
884}
885
886impl Reader<BufReader<File>> {
887 /// Creates an XML reader from a file path.
888 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Reader<BufReader<File>>> {
889 let file = File::open(path).map_err(Error::Io)?;
890 let reader = BufReader::new(file);
891 Ok(Reader::from_reader(reader))
892 }
893}
894
895impl<'a> Reader<&'a [u8]> {
896 /// Creates an XML reader from a string slice.
897 pub fn from_str(s: &'a str) -> Reader<&'a [u8]> {
898 Reader::from_reader(s.as_bytes())
899 }
900
901 /// Creates an XML reader from a slice of bytes.
902 pub fn from_bytes(s: &'a [u8]) -> Reader<&'a [u8]> {
903 Reader::from_reader(s)
904 }
905
906 /// Read an event that borrows from the input rather than a buffer.
907 #[inline]
908 pub fn read_event_unbuffered(&mut self) -> Result<Event<'a>> {
909 self.read_event_buffered(())
910 }
911
912 /// Reads until end element is found
913 ///
914 /// Manages nested cases where parent and child elements have the same name
915 pub fn read_to_end_unbuffered<K: AsRef<[u8]>>(&mut self, end: K) -> Result<()> {
916 let mut depth = 0;
917 let end = end.as_ref();
918 loop {
919 match self.read_event_unbuffered() {
920 Ok(Event::End(ref e)) if e.name() == end => {
921 if depth == 0 {
922 return Ok(());
923 }
924 depth -= 1;
925 }
926 Ok(Event::Start(ref e)) if e.name() == end => depth += 1,
927 Err(e) => return Err(e),
928 Ok(Event::Eof) => {
929 return Err(Error::UnexpectedEof(format!("</{:?}>", from_utf8(end))));
930 }
931 _ => (),
932 }
933 }
934 }
935}
936
937/// Represents an input for a reader that can return borrowed data.
938///
939/// There are two implementors of this trait: generic one that read data from
940/// `Self`, copies some part of it into a provided buffer of type `B` and then
941/// returns data that borrow from that buffer.
942///
943/// The other implementor is for `&[u8]` and instead of copying data returns
944/// borrowed data from `Self` instead. This implementation allows zero-copy
945/// deserialization.
946///
947/// # Parameters
948/// - `'r`: lifetime of a buffer from which events will borrow
949/// - `B`: a type of a buffer that can be used to store data read from `Self` and
950/// from which events can borrow
951trait XmlSource<'r, B> {
952 /// Read input until `byte` is found or end of input is reached.
953 ///
954 /// Returns a slice of data read up to `byte`, which does not include into result.
955 /// If input (`Self`) is exhausted, returns `None`.
956 ///
957 /// # Example
958 ///
959 /// ```ignore
960 /// let mut position = 0;
961 /// let mut input = b"abc*def".as_ref();
962 /// // ^= 4
963 ///
964 /// assert_eq!(
965 /// input.read_bytes_until(b'*', (), &mut position).unwrap(),
966 /// Some(b"abc".as_ref())
967 /// );
968 /// assert_eq!(position, 4); // position after the symbol matched
969 /// ```
970 ///
971 /// # Parameters
972 /// - `byte`: Byte for search
973 /// - `buf`: Buffer that could be filled from an input (`Self`) and
974 /// from which [events] could borrow their data
975 /// - `position`: Will be increased by amount of bytes consumed
976 ///
977 /// [events]: crate::events::Event
978 fn read_bytes_until(
979 &mut self,
980 byte: u8,
981 buf: B,
982 position: &mut usize,
983 ) -> Result<Option<&'r [u8]>>;
984
985 /// Read input until comment, CDATA or processing instruction is finished.
986 ///
987 /// This method expect that `<` already was read.
988 ///
989 /// Returns a slice of data read up to end of comment, CDATA or processing
990 /// instruction (`>`), which does not include into result.
991 ///
992 /// If input (`Self`) is exhausted and nothing was read, returns `None`.
993 ///
994 /// # Parameters
995 /// - `buf`: Buffer that could be filled from an input (`Self`) and
996 /// from which [events] could borrow their data
997 /// - `position`: Will be increased by amount of bytes consumed
998 ///
999 /// [events]: crate::events::Event
1000 fn read_bang_element(
1001 &mut self,
1002 buf: B,
1003 position: &mut usize,
1004 ) -> Result<Option<(BangType, &'r [u8])>>;
1005
1006 /// Read input until XML element is closed by approaching a `>` symbol.
1007 /// Returns `Some(buffer)` that contains a data between `<` and `>` or
1008 /// `None` if end-of-input was reached and nothing was read.
1009 ///
1010 /// Derived from `read_until`, but modified to handle XML attributes
1011 /// using a minimal state machine.
1012 ///
1013 /// Attribute values are [defined] as follows:
1014 /// ```plain
1015 /// AttValue := '"' (([^<&"]) | Reference)* '"'
1016 /// | "'" (([^<&']) | Reference)* "'"
1017 /// ```
1018 /// (`Reference` is something like `"`, but we don't care about
1019 /// escaped characters at this level)
1020 ///
1021 /// # Parameters
1022 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1023 /// from which [events] could borrow their data
1024 /// - `position`: Will be increased by amount of bytes consumed
1025 ///
1026 /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
1027 /// [events]: crate::events::Event
1028 fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;
1029
1030 fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;
1031
1032 fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;
1033
1034 fn peek_one(&mut self) -> Result<Option<u8>>;
1035}
1036
1037/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
1038/// `Vec<u8>` as buffer that will be borrowed by events.
1039impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
1040 #[inline]
1041 fn read_bytes_until(
1042 &mut self,
1043 byte: u8,
1044 buf: &'b mut Vec<u8>,
1045 position: &mut usize,
1046 ) -> Result<Option<&'b [u8]>> {
1047 let mut read = 0;
1048 let mut done = false;
1049 let start = buf.len();
1050 while !done {
1051 let used = {
1052 let available = match self.fill_buf() {
1053 Ok(n) if n.is_empty() => break,
1054 Ok(n) => n,
1055 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1056 Err(e) => {
1057 *position += read;
1058 return Err(Error::Io(e));
1059 }
1060 };
1061
1062 match memchr::memchr(byte, available) {
1063 Some(i) => {
1064 buf.extend_from_slice(&available[..i]);
1065 done = true;
1066 i + 1
1067 }
1068 None => {
1069 buf.extend_from_slice(available);
1070 available.len()
1071 }
1072 }
1073 };
1074 self.consume(used);
1075 read += used;
1076 }
1077 *position += read;
1078
1079 if read == 0 {
1080 Ok(None)
1081 } else {
1082 Ok(Some(&buf[start..]))
1083 }
1084 }
1085
1086 fn read_bang_element(
1087 &mut self,
1088 buf: &'b mut Vec<u8>,
1089 position: &mut usize,
1090 ) -> Result<Option<(BangType, &'b [u8])>> {
1091 // Peeked one bang ('!') before being called, so it's guaranteed to
1092 // start with it.
1093 let start = buf.len();
1094 let mut read = 1;
1095 buf.push(b'!');
1096 self.consume(1);
1097
1098 let bang_type = BangType::new(self.peek_one()?)?;
1099
1100 loop {
1101 match self.fill_buf() {
1102 // Note: Do not update position, so the error points to
1103 // somewhere sane rather than at the EOF
1104 Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
1105 Ok(available) => {
1106 if let Some((consumed, used)) = bang_type.parse(available, read) {
1107 buf.extend_from_slice(consumed);
1108
1109 self.consume(used);
1110 read += used;
1111
1112 *position += read;
1113 break;
1114 } else {
1115 buf.extend_from_slice(available);
1116
1117 let used = available.len();
1118 self.consume(used);
1119 read += used;
1120 }
1121 }
1122 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1123 Err(e) => {
1124 *position += read;
1125 return Err(Error::Io(e));
1126 }
1127 }
1128 }
1129
1130 if read == 0 {
1131 Ok(None)
1132 } else {
1133 Ok(Some((bang_type, &buf[start..])))
1134 }
1135 }
1136
1137 #[inline]
1138 fn read_element(
1139 &mut self,
1140 buf: &'b mut Vec<u8>,
1141 position: &mut usize,
1142 ) -> Result<Option<&'b [u8]>> {
1143 let mut state = ReadElementState::Elem;
1144 let mut read = 0;
1145
1146 let start = buf.len();
1147 loop {
1148 match self.fill_buf() {
1149 Ok(n) if n.is_empty() => break,
1150 Ok(available) => {
1151 if let Some((consumed, used)) = state.change(available) {
1152 buf.extend_from_slice(consumed);
1153
1154 self.consume(used);
1155 read += used;
1156
1157 *position += read;
1158 break;
1159 } else {
1160 buf.extend_from_slice(available);
1161
1162 let used = available.len();
1163 self.consume(used);
1164 read += used;
1165 }
1166 }
1167 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1168 Err(e) => {
1169 *position += read;
1170 return Err(Error::Io(e));
1171 }
1172 };
1173 }
1174
1175 if read == 0 {
1176 Ok(None)
1177 } else {
1178 Ok(Some(&buf[start..]))
1179 }
1180 }
1181
1182 /// Consume and discard all the whitespace until the next non-whitespace
1183 /// character or EOF.
1184 fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
1185 loop {
1186 break match self.fill_buf() {
1187 Ok(n) => {
1188 let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
1189 if count > 0 {
1190 self.consume(count);
1191 *position += count;
1192 continue;
1193 } else {
1194 Ok(())
1195 }
1196 }
1197 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1198 Err(e) => Err(Error::Io(e)),
1199 };
1200 }
1201 }
1202
1203 /// Consume and discard one character if it matches the given byte. Return
1204 /// true if it matched.
1205 fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
1206 match self.peek_one()? {
1207 Some(b) if b == byte => {
1208 *position += 1;
1209 self.consume(1);
1210 Ok(true)
1211 }
1212 _ => Ok(false),
1213 }
1214 }
1215
1216 /// Return one character without consuming it, so that future `read_*` calls
1217 /// will still include it. On EOF, return None.
1218 fn peek_one(&mut self) -> Result<Option<u8>> {
1219 loop {
1220 break match self.fill_buf() {
1221 Ok(n) if n.is_empty() => Ok(None),
1222 Ok(n) => Ok(Some(n[0])),
1223 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1224 Err(e) => Err(Error::Io(e)),
1225 };
1226 }
1227 }
1228}
1229
1230/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
1231/// that will be borrowed by events. This implementation provides a zero-copy deserialization
1232impl<'a> XmlSource<'a, ()> for &'a [u8] {
1233 fn read_bytes_until(
1234 &mut self,
1235 byte: u8,
1236 _buf: (),
1237 position: &mut usize,
1238 ) -> Result<Option<&'a [u8]>> {
1239 if self.is_empty() {
1240 return Ok(None);
1241 }
1242
1243 Ok(Some(if let Some(i) = memchr::memchr(byte, self) {
1244 *position += i + 1;
1245 let bytes = &self[..i];
1246 *self = &self[i + 1..];
1247 bytes
1248 } else {
1249 *position += self.len();
1250 let bytes = &self[..];
1251 *self = &[];
1252 bytes
1253 }))
1254 }
1255
1256 fn read_bang_element(
1257 &mut self,
1258 _buf: (),
1259 position: &mut usize,
1260 ) -> Result<Option<(BangType, &'a [u8])>> {
1261 // Peeked one bang ('!') before being called, so it's guaranteed to
1262 // start with it.
1263 debug_assert_eq!(self[0], b'!');
1264
1265 let bang_type = BangType::new(self[1..].first().copied())?;
1266
1267 if let Some((bytes, i)) = bang_type.parse(self, 0) {
1268 *position += i;
1269 *self = &self[i..];
1270 return Ok(Some((bang_type, bytes)));
1271 }
1272
1273 // Note: Do not update position, so the error points to
1274 // somewhere sane rather than at the EOF
1275 Err(bang_type.to_err())
1276 }
1277
1278 fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<Option<&'a [u8]>> {
1279 if self.is_empty() {
1280 return Ok(None);
1281 }
1282
1283 let mut state = ReadElementState::Elem;
1284
1285 if let Some((bytes, i)) = state.change(self) {
1286 *position += i;
1287 *self = &self[i..];
1288 return Ok(Some(bytes));
1289 }
1290
1291 // Note: Do not update position, so the error points to a sane place
1292 // rather than at the EOF.
1293 Err(Error::UnexpectedEof("Element".to_string()))
1294
1295 // FIXME: Figure out why the other one works without UnexpectedEof
1296 }
1297
1298 fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
1299 let whitespaces = self
1300 .iter()
1301 .position(|b| !is_whitespace(*b))
1302 .unwrap_or(self.len());
1303 *position += whitespaces;
1304 *self = &self[whitespaces..];
1305 Ok(())
1306 }
1307
1308 fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
1309 if self.first() == Some(&byte) {
1310 *self = &self[1..];
1311 *position += 1;
1312 Ok(true)
1313 } else {
1314 Ok(false)
1315 }
1316 }
1317
1318 fn peek_one(&mut self) -> Result<Option<u8>> {
1319 Ok(self.first().copied())
1320 }
1321}
1322
1323/// Possible elements started with `<!`
1324#[derive(Debug, PartialEq)]
1325enum BangType {
1326 /// <![CDATA[...]]>
1327 CData,
1328 /// <!--...-->
1329 Comment,
1330 /// <!DOCTYPE...>
1331 DocType,
1332}
1333impl BangType {
1334 #[inline(always)]
1335 fn new(byte: Option<u8>) -> Result<Self> {
1336 Ok(match byte {
1337 Some(b'[') => Self::CData,
1338 Some(b'-') => Self::Comment,
1339 Some(b'D') | Some(b'd') => Self::DocType,
1340 Some(b) => return Err(Error::UnexpectedBang(b)),
1341 None => return Err(Error::UnexpectedEof("Bang".to_string())),
1342 })
1343 }
1344
1345 /// If element is finished, returns its content up to `>` symbol and
1346 /// an index of this symbol, otherwise returns `None`
1347 #[inline(always)]
1348 fn parse<'b>(&self, chunk: &'b [u8], offset: usize) -> Option<(&'b [u8], usize)> {
1349 for i in memchr::memchr_iter(b'>', chunk) {
1350 match self {
1351 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1352 // <!----> - XML comment
1353 // 012345 - i
1354 Self::Comment => {
1355 if offset + i > 4 && chunk[..i].ends_with(b"--") {
1356 // We cannot strip last `--` from the buffer because we need it in case of
1357 // check_comments enabled option. XML standard requires that comment
1358 // will not end with `--->` sequence because this is a special case of
1359 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1360 return Some((&chunk[..i], i + 1)); // +1 for `>`
1361 }
1362 }
1363 Self::CData => {
1364 if chunk[..i].ends_with(b"]]") {
1365 return Some((&chunk[..i - 2], i + 1)); // +1 for `>`
1366 }
1367 }
1368 Self::DocType => {
1369 let content = &chunk[..i];
1370 let balance = memchr::memchr2_iter(b'<', b'>', content)
1371 .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
1372 .sum::<i32>();
1373 if balance == 0 {
1374 return Some((content, i + 1)); // +1 for `>`
1375 }
1376 }
1377 }
1378 }
1379 None
1380 }
1381 #[inline]
1382 fn to_err(self) -> Error {
1383 let bang_str = match self {
1384 Self::CData => "CData",
1385 Self::Comment => "Comment",
1386 Self::DocType => "DOCTYPE",
1387 };
1388 Error::UnexpectedEof(bang_str.to_string())
1389 }
1390}
1391
1392/// State machine for the [`XmlSource::read_element`]
1393#[derive(Clone, Copy)]
1394enum ReadElementState {
1395 /// The initial state (inside element, but outside of attribute value)
1396 Elem,
1397 /// Inside a single-quoted attribute value
1398 SingleQ,
1399 /// Inside a double-quoted attribute value
1400 DoubleQ,
1401}
1402impl ReadElementState {
1403 /// Changes state by analyzing part of input.
1404 /// Returns a tuple with part of chunk up to element closing symbol `>`
1405 /// and a position after that symbol or `None` if such symbol was not found
1406 #[inline(always)]
1407 fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1408 for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) {
1409 *self = match (*self, chunk[i]) {
1410 // only allowed to match `>` while we are in state `Elem`
1411 (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
1412 (Self::Elem, b'\'') => Self::SingleQ,
1413 (Self::Elem, b'\"') => Self::DoubleQ,
1414
1415 // the only end_byte that gets us out if the same character
1416 (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
1417
1418 // all other bytes: no state change
1419 _ => *self,
1420 };
1421 }
1422 None
1423 }
1424}
1425
/// Checks whether the byte is a whitespace: a blank (space), a carriage
/// return, a new line or a tab. Any other byte is not a whitespace.
#[inline]
pub(crate) fn is_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\r' | b'\n' | b'\t')
}
1434
/// An entry that contains index into the buffer with namespace bindings.
///
/// Defines a mapping from *[namespace prefix]* to *[namespace name]*.
/// If prefix is empty, defines a *default namespace* binding that applies to
/// unprefixed element names (unprefixed attribute names do not bind to any
/// namespace and their processing depends on the element on which they are
/// defined).
///
/// [namespace prefix]: https://www.w3.org/TR/xml-names11/#dt-prefix
/// [namespace name]: https://www.w3.org/TR/xml-names11/#dt-NSName
#[derive(Debug, Clone)]
struct NamespaceEntry {
    /// Index of the namespace in the buffer
    start: usize,
    /// Length of the prefix
    /// * if greater than zero, then binds this namespace to the slice
    ///   `[start..start + prefix_len]` in the buffer.
    /// * else defines the current default namespace.
    prefix_len: usize,
    /// The length of a namespace name (the URI) of this namespace declaration.
    /// The name starts just after the prefix and extends for `value_len` bytes.
    ///
    /// The XML standard [specifies] that an empty namespace value 'removes' a namespace declaration
    /// for the extent of its scope. For prefix declarations that's not very interesting, but it is
    /// vital for default namespace declarations. With `xmlns=""` you can revert back to the default
    /// behaviour of leaving unqualified element names unqualified.
    ///
    /// [specifies]: https://www.w3.org/TR/xml-names11/#scoping
    value_len: usize,
    /// Level of nesting at which this namespace was declared. The declaring element is included,
    /// i.e., a declaration on the document root has `level = 1`.
    /// This is used to pop the namespace when the element gets closed.
    level: i32,
}

impl NamespaceEntry {
    /// Gets the namespace name (the URI) slice out of namespace buffer
    ///
    /// Returns `None` if namespace for this prefix was explicitly removed from
    /// scope, using `xmlns[:prefix]=""`
    ///
    /// # Panics
    ///
    /// Panics if `buffer` is too short to contain the prefix and the name of
    /// this entry.
    #[inline]
    fn namespace<'b>(&self, buffer: &'b [u8]) -> Option<&'b [u8]> {
        if self.value_len == 0 {
            None
        } else {
            // the name is stored right after the prefix
            let start = self.start + self.prefix_len;
            Some(&buffer[start..start + self.value_len])
        }
    }

    /// Check if the namespace matches the potentially qualified name
    ///
    /// A default namespace entry (without a prefix) matches every name that
    /// contains no `:`. An entry with a prefix matches the names that start
    /// with that prefix immediately followed by `:`.
    #[inline]
    fn is_match(&self, buffer: &[u8], qname: &[u8]) -> bool {
        if self.prefix_len == 0 {
            !qname.contains(&b':')
        } else {
            // the cheap check of the `:` position goes first and also rejects
            // the names that are shorter than the prefix
            qname.get(self.prefix_len) == Some(&b':')
                && qname.starts_with(&buffer[self.start..self.start + self.prefix_len])
        }
    }
}
1496
1497/// A namespace management buffer.
1498///
1499/// Holds all internal logic to push/pop namespaces with their levels.
1500#[derive(Debug, Default, Clone)]
1501struct NamespaceResolver {
1502 /// A stack of namespace bindings to prefixes that currently in scope
1503 bindings: Vec<NamespaceEntry>,
1504 /// The number of open tags at the moment. We need to keep track of this to know which namespace
1505 /// declarations to remove when we encounter an `End` event.
1506 nesting_level: i32,
1507 /// For `Empty` events keep the 'scope' of the element on the stack artificially. That way, the
1508 /// consumer has a chance to use `resolve` in the context of the empty element. We perform the
1509 /// pop as the first operation in the next `next()` call.
1510 pending_pop: bool,
1511}
1512
1513impl NamespaceResolver {
1514 /// Finds a [namespace name] for a given qualified name of element, borrow it
1515 /// from the specified buffer.
1516 ///
1517 /// Returns `None`, if:
1518 /// - name is unqualified
1519 /// - prefix not found in the current scope
1520 /// - prefix was [unbound] using `xmlns:prefix=""`
1521 ///
1522 /// # Lifetimes
1523 ///
1524 /// - `'n`: lifetime of an element name
1525 /// - `'b`: lifetime of a namespaces buffer, where all found namespaces are stored
1526 ///
1527 /// [namespace name]: https://www.w3.org/TR/xml-names11/#dt-NSName
1528 /// [unbound]: https://www.w3.org/TR/xml-names11/#scoping
1529 #[inline]
1530 fn find<'n, 'b>(&self, element_name: &'n [u8], buffer: &'b [u8]) -> Option<&'b [u8]> {
1531 self.bindings
1532 .iter()
1533 .rfind(|n| n.is_match(buffer, element_name))
1534 .and_then(|n| n.namespace(buffer))
1535 }
1536
1537 /// Ends a top-most scope by popping all [namespace binding], that was added by
1538 /// last call to [`Self::push()`].
1539 ///
1540 /// [namespace binding]: https://www.w3.org/TR/xml-names11/#dt-NSDecl
1541 fn pop(&mut self, buffer: &mut Vec<u8>) {
1542 if !self.pending_pop {
1543 return;
1544 }
1545 self.pending_pop = false;
1546 self.nesting_level -= 1;
1547 let current_level = self.nesting_level;
1548 // from the back (most deeply nested scope), look for the first scope that is still valid
1549 match self.bindings.iter().rposition(|n| n.level <= current_level) {
1550 // none of the namespaces are valid, remove all of them
1551 None => {
1552 buffer.clear();
1553 self.bindings.clear();
1554 }
1555 // drop all namespaces past the last valid namespace
1556 Some(last_valid_pos) => {
1557 if let Some(len) = self.bindings.get(last_valid_pos + 1).map(|n| n.start) {
1558 buffer.truncate(len);
1559 self.bindings.truncate(last_valid_pos + 1);
1560 }
1561 }
1562 }
1563 }
1564
1565 /// Begins a new scope and add to it all [namespace bindings] that found in
1566 /// the specified start element.
1567 ///
1568 /// [namespace binding]: https://www.w3.org/TR/xml-names11/#dt-NSDecl
1569 fn push(&mut self, start: &BytesStart, buffer: &mut Vec<u8>) {
1570 self.nesting_level += 1;
1571 let level = self.nesting_level;
1572 // adds new namespaces for attributes starting with 'xmlns:' and for the 'xmlns'
1573 // (default namespace) attribute.
1574 for a in start.attributes().with_checks(false) {
1575 if let Ok(Attribute { key: k, value: v }) = a {
1576 if k.starts_with(b"xmlns") {
1577 match k.get(5) {
1578 None => {
1579 let start = buffer.len();
1580 buffer.extend_from_slice(&*v);
1581 self.bindings.push(NamespaceEntry {
1582 start,
1583 prefix_len: 0,
1584 value_len: v.len(),
1585 level,
1586 });
1587 }
1588 Some(&b':') => {
1589 let start = buffer.len();
1590 buffer.extend_from_slice(&k[6..]);
1591 buffer.extend_from_slice(&*v);
1592 self.bindings.push(NamespaceEntry {
1593 start,
1594 prefix_len: k.len() - 6,
1595 value_len: v.len(),
1596 level,
1597 });
1598 }
1599 _ => break,
1600 }
1601 }
1602 } else {
1603 break;
1604 }
1605 }
1606 }
1607
1608 /// Resolves a potentially qualified **attribute name** into (namespace name, local name).
1609 ///
1610 /// *Qualified* attribute names have the form `prefix:local-name` where the `prefix` is defined
1611 /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix
1612 /// can be defined on the same element as the attribute in question.
1613 ///
1614 /// *Unqualified* attribute names do *not* inherit the current *default namespace*.
1615 ///
1616 /// # Lifetimes
1617 ///
1618 /// - `'n`: lifetime of an attribute or an element name
1619 /// - `'b`: lifetime of a namespaces buffer, where all found namespaces are stored
1620 #[inline]
1621 fn resolve<'n, 'b>(
1622 &self,
1623 qname: &'n [u8],
1624 buffer: &'b [u8],
1625 use_default: bool,
1626 ) -> (Option<&'b [u8]>, &'n [u8]) {
1627 self.bindings
1628 .iter()
1629 .rfind(|n| n.is_match(buffer, qname))
1630 .map_or((None, qname), |n| {
1631 let len = n.prefix_len;
1632 if len > 0 {
1633 (n.namespace(buffer), &qname[len + 1..])
1634 } else if use_default {
1635 (n.namespace(buffer), qname)
1636 } else {
1637 (None, qname)
1638 }
1639 })
1640 }
1641}
1642
1643/// Utf8 Decoder
1644#[cfg(not(feature = "encoding"))]
1645#[derive(Clone, Copy, Debug)]
1646pub struct Decoder;
1647
1648/// Utf8 Decoder
1649#[cfg(feature = "encoding")]
1650#[derive(Clone, Copy, Debug)]
1651pub struct Decoder {
1652 encoding: &'static Encoding,
1653}
1654
1655impl Decoder {
1656 #[cfg(not(feature = "encoding"))]
1657 pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
1658 from_utf8(bytes).map_err(Error::Utf8)
1659 }
1660
1661 #[cfg(not(feature = "encoding"))]
1662 pub fn decode_owned<'c>(&self, bytes: Vec<u8>) -> Result<String> {
1663 String::from_utf8(bytes).map_err(|e| Error::Utf8(e.utf8_error()))
1664 }
1665
1666 #[cfg(feature = "encoding")]
1667 pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> {
1668 self.encoding.decode(bytes).0
1669 }
1670}
1671
1672#[cfg(test)]
1673mod test {
1674 macro_rules! check {
1675 ($buf:expr) => {
1676 mod read_bytes_until {
1677 use crate::reader::XmlSource;
1678 // Use Bytes for printing bytes as strings for ASCII range
1679 use crate::utils::Bytes;
1680 use pretty_assertions::assert_eq;
1681
1682 /// Checks that search in the empty buffer returns `None`
1683 #[test]
1684 fn empty() {
1685 let buf = $buf;
1686 let mut position = 0;
1687 let mut input = b"".as_ref();
1688 // ^= 0
1689
1690 assert_eq!(
1691 input
1692 .read_bytes_until(b'*', buf, &mut position)
1693 .unwrap()
1694 .map(Bytes),
1695 None
1696 );
1697 assert_eq!(position, 0);
1698 }
1699
1700 /// Checks that search in the buffer non-existent value returns entire buffer
1701 /// as a result and set `position` to `len()`
1702 #[test]
1703 fn non_existent() {
1704 let buf = $buf;
1705 let mut position = 0;
1706 let mut input = b"abcdef".as_ref();
1707 // ^= 6
1708
1709 assert_eq!(
1710 input
1711 .read_bytes_until(b'*', buf, &mut position)
1712 .unwrap()
1713 .map(Bytes),
1714 Some(Bytes(b"abcdef"))
1715 );
1716 assert_eq!(position, 6);
1717 }
1718
1719 /// Checks that search in the buffer an element that is located in the front of
1720 /// buffer returns empty slice as a result and set `position` to one symbol
1721 /// after match (`1`)
1722 #[test]
1723 fn at_the_start() {
1724 let buf = $buf;
1725 let mut position = 0;
1726 let mut input = b"*abcdef".as_ref();
1727 // ^= 1
1728
1729 assert_eq!(
1730 input
1731 .read_bytes_until(b'*', buf, &mut position)
1732 .unwrap()
1733 .map(Bytes),
1734 Some(Bytes(b""))
1735 );
1736 assert_eq!(position, 1); // position after the symbol matched
1737 }
1738
1739 /// Checks that search in the buffer an element that is located in the middle of
1740 /// buffer returns slice before that symbol as a result and set `position` to one
1741 /// symbol after match
1742 #[test]
1743 fn inside() {
1744 let buf = $buf;
1745 let mut position = 0;
1746 let mut input = b"abc*def".as_ref();
1747 // ^= 4
1748
1749 assert_eq!(
1750 input
1751 .read_bytes_until(b'*', buf, &mut position)
1752 .unwrap()
1753 .map(Bytes),
1754 Some(Bytes(b"abc"))
1755 );
1756 assert_eq!(position, 4); // position after the symbol matched
1757 }
1758
1759 /// Checks that search in the buffer an element that is located in the end of
1760 /// buffer returns slice before that symbol as a result and set `position` to one
1761 /// symbol after match (`len()`)
1762 #[test]
1763 fn in_the_end() {
1764 let buf = $buf;
1765 let mut position = 0;
1766 let mut input = b"abcdef*".as_ref();
1767 // ^= 7
1768
1769 assert_eq!(
1770 input
1771 .read_bytes_until(b'*', buf, &mut position)
1772 .unwrap()
1773 .map(Bytes),
1774 Some(Bytes(b"abcdef"))
1775 );
1776 assert_eq!(position, 7); // position after the symbol matched
1777 }
1778 }
1779
1780 mod read_bang_element {
1781 /// Checks that reading CDATA content works correctly
1782 mod cdata {
1783 use crate::errors::Error;
1784 use crate::reader::{BangType, XmlSource};
1785 use crate::utils::Bytes;
1786 use pretty_assertions::assert_eq;
1787
1788 /// Checks that if input begins like CDATA element, but CDATA start sequence
1789 /// is not finished, parsing ends with an error
1790 #[test]
1791 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1792 fn not_properly_start() {
1793 let buf = $buf;
1794 let mut position = 0;
1795 let mut input = b"![]]>other content".as_ref();
1796 // ^= 0
1797
1798 match input.read_bang_element(buf, &mut position) {
1799 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1800 x => assert!(
1801 false,
1802 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1803 x
1804 ),
1805 }
1806 assert_eq!(position, 0);
1807 }
1808
1809 /// Checks that if CDATA startup sequence was matched, but an end sequence
1810 /// is not found, parsing ends with an error
1811 #[test]
1812 fn not_closed() {
1813 let buf = $buf;
1814 let mut position = 0;
1815 let mut input = b"![CDATA[other content".as_ref();
1816 // ^= 0
1817
1818 match input.read_bang_element(buf, &mut position) {
1819 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1820 x => assert!(
1821 false,
1822 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1823 x
1824 ),
1825 }
1826 assert_eq!(position, 0);
1827 }
1828
1829 /// Checks that CDATA element without content inside parsed successfully
1830 #[test]
1831 fn empty() {
1832 let buf = $buf;
1833 let mut position = 0;
1834 let mut input = b"![CDATA[]]>other content".as_ref();
1835 // ^= 11
1836
1837 assert_eq!(
1838 input
1839 .read_bang_element(buf, &mut position)
1840 .unwrap()
1841 .map(|(ty, data)| (ty, Bytes(data))),
1842 Some((BangType::CData, Bytes(b"![CDATA[")))
1843 );
1844 assert_eq!(position, 11);
1845 }
1846
1847 /// Checks that CDATA element with content parsed successfully.
1848 /// Additionally checks that sequences inside CDATA that may look like
1849 /// a CDATA end sequence do not interrupt CDATA parsing
1850 #[test]
1851 fn with_content() {
1852 let buf = $buf;
1853 let mut position = 0;
1854 let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1855 // ^= 28
1856
1857 assert_eq!(
1858 input
1859 .read_bang_element(buf, &mut position)
1860 .unwrap()
1861 .map(|(ty, data)| (ty, Bytes(data))),
1862 Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content")))
1863 );
1864 assert_eq!(position, 28);
1865 }
1866 }
1867
1868 /// Checks that reading XML comments works correctly. According to the [specification],
1869 /// comment data can contain any sequence except `--`:
1870 ///
1871 /// ```peg
1872 /// comment = '<--' (!'--' char)* '-->';
1873 /// char = [#x1-#x2C]
1874 /// / [#x2E-#xD7FF]
1875 /// / [#xE000-#xFFFD]
1876 /// / [#x10000-#x10FFFF]
1877 /// ```
1878 ///
1879 /// The presence of this limitation, however, is simply a poorly designed specification
1880 /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1881 /// presence of these sequences by default. This tests allow such content.
1882 ///
1883 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1884 mod comment {
1885 use crate::errors::Error;
1886 use crate::reader::{BangType, XmlSource};
1887 use crate::utils::Bytes;
1888 use pretty_assertions::assert_eq;
1889
1890 #[test]
1891 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1892 fn not_properly_start() {
1893 let buf = $buf;
1894 let mut position = 0;
1895 let mut input = b"!- -->other content".as_ref();
1896 // ^= 0
1897
1898 match input.read_bang_element(buf, &mut position) {
1899 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1900 x => assert!(
1901 false,
1902 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1903 x
1904 ),
1905 }
1906 assert_eq!(position, 0);
1907 }
1908
1909 #[test]
1910 fn not_properly_end() {
1911 let buf = $buf;
1912 let mut position = 0;
1913 let mut input = b"!->other content".as_ref();
1914 // ^= 0
1915
1916 match input.read_bang_element(buf, &mut position) {
1917 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1918 x => assert!(
1919 false,
1920 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1921 x
1922 ),
1923 }
1924 assert_eq!(position, 0);
1925 }
1926
1927 #[test]
1928 fn not_closed1() {
1929 let buf = $buf;
1930 let mut position = 0;
1931 let mut input = b"!--other content".as_ref();
1932 // ^= 0
1933
1934 match input.read_bang_element(buf, &mut position) {
1935 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1936 x => assert!(
1937 false,
1938 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1939 x
1940 ),
1941 }
1942 assert_eq!(position, 0);
1943 }
1944
1945 #[test]
1946 fn not_closed2() {
1947 let buf = $buf;
1948 let mut position = 0;
1949 let mut input = b"!-->other content".as_ref();
1950 // ^= 0
1951
1952 match input.read_bang_element(buf, &mut position) {
1953 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1954 x => assert!(
1955 false,
1956 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1957 x
1958 ),
1959 }
1960 assert_eq!(position, 0);
1961 }
1962
1963 #[test]
1964 fn not_closed3() {
1965 let buf = $buf;
1966 let mut position = 0;
1967 let mut input = b"!--->other content".as_ref();
1968 // ^= 0
1969
1970 match input.read_bang_element(buf, &mut position) {
1971 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1972 x => assert!(
1973 false,
1974 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1975 x
1976 ),
1977 }
1978 assert_eq!(position, 0);
1979 }
1980
1981 #[test]
1982 fn empty() {
1983 let buf = $buf;
1984 let mut position = 0;
1985 let mut input = b"!---->other content".as_ref();
1986 // ^= 6
1987
1988 assert_eq!(
1989 input
1990 .read_bang_element(buf, &mut position)
1991 .unwrap()
1992 .map(|(ty, data)| (ty, Bytes(data))),
1993 Some((BangType::Comment, Bytes(b"!----")))
1994 );
1995 assert_eq!(position, 6);
1996 }
1997
1998 #[test]
1999 fn with_content() {
2000 let buf = $buf;
2001 let mut position = 0;
2002 let mut input = b"!--->comment<--->other content".as_ref();
2003 // ^= 17
2004
2005 assert_eq!(
2006 input
2007 .read_bang_element(buf, &mut position)
2008 .unwrap()
2009 .map(|(ty, data)| (ty, Bytes(data))),
2010 Some((BangType::Comment, Bytes(b"!--->comment<---")))
2011 );
2012 assert_eq!(position, 17);
2013 }
2014 }
2015
2016 /// Checks that reading DOCTYPE definition works correctly
2017 mod doctype {
2018 mod uppercase {
2019 use crate::errors::Error;
2020 use crate::reader::{BangType, XmlSource};
2021 use crate::utils::Bytes;
2022 use pretty_assertions::assert_eq;
2023
2024 #[test]
2025 fn not_properly_start() {
2026 let buf = $buf;
2027 let mut position = 0;
2028 let mut input = b"!D other content".as_ref();
2029 // ^= 0
2030
2031 match input.read_bang_element(buf, &mut position) {
2032 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2033 x => assert!(
2034 false,
2035 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2036 x
2037 ),
2038 }
2039 assert_eq!(position, 0);
2040 }
2041
2042 #[test]
2043 fn without_space() {
2044 let buf = $buf;
2045 let mut position = 0;
2046 let mut input = b"!DOCTYPEother content".as_ref();
2047 // ^= 0
2048
2049 match input.read_bang_element(buf, &mut position) {
2050 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2051 x => assert!(
2052 false,
2053 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2054 x
2055 ),
2056 }
2057 assert_eq!(position, 0);
2058 }
2059
2060 #[test]
2061 fn empty() {
2062 let buf = $buf;
2063 let mut position = 0;
2064 let mut input = b"!DOCTYPE>other content".as_ref();
2065 // ^= 9
2066
2067 assert_eq!(
2068 input
2069 .read_bang_element(buf, &mut position)
2070 .unwrap()
2071 .map(|(ty, data)| (ty, Bytes(data))),
2072 Some((BangType::DocType, Bytes(b"!DOCTYPE")))
2073 );
2074 assert_eq!(position, 9);
2075 }
2076
2077 #[test]
2078 fn not_closed() {
2079 let buf = $buf;
2080 let mut position = 0;
2081 let mut input = b"!DOCTYPE other content".as_ref();
2082 // ^= 0
2083
2084 match input.read_bang_element(buf, &mut position) {
2085 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2086 x => assert!(
2087 false,
2088 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2089 x
2090 ),
2091 }
2092 assert_eq!(position, 0);
2093 }
2094 }
2095
2096 mod lowercase {
2097 use crate::errors::Error;
2098 use crate::reader::{BangType, XmlSource};
2099 use crate::utils::Bytes;
2100 use pretty_assertions::assert_eq;
2101
2102 #[test]
2103 fn not_properly_start() {
2104 let buf = $buf;
2105 let mut position = 0;
2106 let mut input = b"!d other content".as_ref();
2107 // ^= 0
2108
2109 match input.read_bang_element(buf, &mut position) {
2110 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2111 x => assert!(
2112 false,
2113 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2114 x
2115 ),
2116 }
2117 assert_eq!(position, 0);
2118 }
2119
2120 #[test]
2121 fn without_space() {
2122 let buf = $buf;
2123 let mut position = 0;
2124 let mut input = b"!doctypeother content".as_ref();
2125 // ^= 0
2126
2127 match input.read_bang_element(buf, &mut position) {
2128 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2129 x => assert!(
2130 false,
2131 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2132 x
2133 ),
2134 }
2135 assert_eq!(position, 0);
2136 }
2137
2138 #[test]
2139 fn empty() {
2140 let buf = $buf;
2141 let mut position = 0;
2142 let mut input = b"!doctype>other content".as_ref();
2143 // ^= 9
2144
2145 assert_eq!(
2146 input
2147 .read_bang_element(buf, &mut position)
2148 .unwrap()
2149 .map(|(ty, data)| (ty, Bytes(data))),
2150 Some((BangType::DocType, Bytes(b"!doctype")))
2151 );
2152 assert_eq!(position, 9);
2153 }
2154
2155 #[test]
2156 fn not_closed() {
2157 let buf = $buf;
2158 let mut position = 0;
2159 let mut input = b"!doctype other content".as_ref();
2160 // ^= 0
2161
2162 match input.read_bang_element(buf, &mut position) {
2163 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2164 x => assert!(
2165 false,
2166 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2167 x
2168 ),
2169 }
2170 assert_eq!(position, 0);
2171 }
2172 }
2173 }
2174 }
2175
2176 mod read_element {
2177 use crate::reader::XmlSource;
2178 use crate::utils::Bytes;
2179 use pretty_assertions::assert_eq;
2180
2181 /// Checks that nothing was read from empty buffer
2182 #[test]
2183 fn empty() {
2184 let buf = $buf;
2185 let mut position = 0;
2186 let mut input = b"".as_ref();
2187 // ^= 0
2188
2189 assert_eq!(input.read_element(buf, &mut position).unwrap().map(Bytes), None);
2190 assert_eq!(position, 0);
2191 }
2192
2193 mod open {
2194 use crate::reader::XmlSource;
2195 use crate::utils::Bytes;
2196 use pretty_assertions::assert_eq;
2197
2198 #[test]
2199 fn empty_tag() {
2200 let buf = $buf;
2201 let mut position = 0;
2202 let mut input = b">".as_ref();
2203 // ^= 1
2204
2205 assert_eq!(
2206 input.read_element(buf, &mut position).unwrap().map(Bytes),
2207 Some(Bytes(b""))
2208 );
2209 assert_eq!(position, 1);
2210 }
2211
2212 #[test]
2213 fn normal() {
2214 let buf = $buf;
2215 let mut position = 0;
2216 let mut input = b"tag>".as_ref();
2217 // ^= 4
2218
2219 assert_eq!(
2220 input.read_element(buf, &mut position).unwrap().map(Bytes),
2221 Some(Bytes(b"tag"))
2222 );
2223 assert_eq!(position, 4);
2224 }
2225
2226 #[test]
2227 fn empty_ns_empty_tag() {
2228 let buf = $buf;
2229 let mut position = 0;
2230 let mut input = b":>".as_ref();
2231 // ^= 2
2232
2233 assert_eq!(
2234 input.read_element(buf, &mut position).unwrap().map(Bytes),
2235 Some(Bytes(b":"))
2236 );
2237 assert_eq!(position, 2);
2238 }
2239
2240 #[test]
2241 fn empty_ns() {
2242 let buf = $buf;
2243 let mut position = 0;
2244 let mut input = b":tag>".as_ref();
2245 // ^= 5
2246
2247 assert_eq!(
2248 input.read_element(buf, &mut position).unwrap().map(Bytes),
2249 Some(Bytes(b":tag"))
2250 );
2251 assert_eq!(position, 5);
2252 }
2253
2254 #[test]
2255 fn with_attributes() {
2256 let buf = $buf;
2257 let mut position = 0;
2258 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
2259 // ^= 38
2260
2261 assert_eq!(
2262 input.read_element(buf, &mut position).unwrap().map(Bytes),
2263 Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#))
2264 );
2265 assert_eq!(position, 38);
2266 }
2267 }
2268
2269 mod self_closed {
2270 use crate::reader::XmlSource;
2271 use crate::utils::Bytes;
2272 use pretty_assertions::assert_eq;
2273
2274 #[test]
2275 fn empty_tag() {
2276 let buf = $buf;
2277 let mut position = 0;
2278 let mut input = b"/>".as_ref();
2279 // ^= 2
2280
2281 assert_eq!(
2282 input.read_element(buf, &mut position).unwrap().map(Bytes),
2283 Some(Bytes(b"/"))
2284 );
2285 assert_eq!(position, 2);
2286 }
2287
2288 #[test]
2289 fn normal() {
2290 let buf = $buf;
2291 let mut position = 0;
2292 let mut input = b"tag/>".as_ref();
2293 // ^= 5
2294
2295 assert_eq!(
2296 input.read_element(buf, &mut position).unwrap().map(Bytes),
2297 Some(Bytes(b"tag/"))
2298 );
2299 assert_eq!(position, 5);
2300 }
2301
2302 #[test]
2303 fn empty_ns_empty_tag() {
2304 let buf = $buf;
2305 let mut position = 0;
2306 let mut input = b":/>".as_ref();
2307 // ^= 3
2308
2309 assert_eq!(
2310 input.read_element(buf, &mut position).unwrap().map(Bytes),
2311 Some(Bytes(b":/"))
2312 );
2313 assert_eq!(position, 3);
2314 }
2315
2316 #[test]
2317 fn empty_ns() {
2318 let buf = $buf;
2319 let mut position = 0;
2320 let mut input = b":tag/>".as_ref();
2321 // ^= 6
2322
2323 assert_eq!(
2324 input.read_element(buf, &mut position).unwrap().map(Bytes),
2325 Some(Bytes(b":tag/"))
2326 );
2327 assert_eq!(position, 6);
2328 }
2329
2330 #[test]
2331 fn with_attributes() {
2332 let buf = $buf;
2333 let mut position = 0;
2334 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
2335 // ^= 41
2336
2337 assert_eq!(
2338 input.read_element(buf, &mut position).unwrap().map(Bytes),
2339 Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#))
2340 );
2341 assert_eq!(position, 41);
2342 }
2343 }
2344 }
2345
2346 mod issue_344 {
2347 use crate::errors::Error;
2348
2349 #[test]
2350 fn cdata() {
2351 let doc = "![]]>";
2352 let mut reader = crate::Reader::from_str(doc);
2353
2354 match reader.read_until_close($buf) {
2355 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
2356 x => assert!(
2357 false,
2358 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
2359 x
2360 ),
2361 }
2362 }
2363
2364 #[test]
2365 fn comment() {
2366 let doc = "!- -->";
2367 let mut reader = crate::Reader::from_str(doc);
2368
2369 match reader.read_until_close($buf) {
2370 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
2371 x => assert!(
2372 false,
2373 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
2374 x
2375 ),
2376 }
2377 }
2378
2379 #[test]
2380 fn doctype_uppercase() {
2381 let doc = "!D>";
2382 let mut reader = crate::Reader::from_str(doc);
2383
2384 match reader.read_until_close($buf) {
2385 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2386 x => assert!(
2387 false,
2388 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2389 x
2390 ),
2391 }
2392 }
2393
2394 #[test]
2395 fn doctype_lowercase() {
2396 let doc = "!d>";
2397 let mut reader = crate::Reader::from_str(doc);
2398
2399 match reader.read_until_close($buf) {
2400 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
2401 x => assert!(
2402 false,
2403 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
2404 x
2405 ),
2406 }
2407 }
2408 }
2409 };
2410 }
2411
2412 /// Tests for reader that generates events that borrow from the provided buffer
2413 mod buffered {
2414 check!(&mut Vec::new());
2415 }
2416
2417 /// Tests for reader that generates events that borrow from the input
2418 mod borrowed {
2419 check!(());
2420 }
2421}