quick_xml/reader/mod.rs
1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9use crate::errors::{Error, IllFormedError, SyntaxError};
10use crate::events::{BytesRef, Event};
11use crate::parser::{ElementParser, Parser, PiParser};
12use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// The current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by modifying properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether a lone ampersand character (without a paired semicolon) should be
    /// allowed in textual content. Unless enabled, in case of a dangling ampersand,
    /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
    ///
    /// Default: `false`
    ///
    /// # Example
    ///
    /// ```
    /// # use quick_xml::events::{BytesRef, BytesText, Event};
    /// # use quick_xml::reader::Reader;
    /// # use pretty_assertions::assert_eq;
    /// let mut reader = Reader::from_str("text with & & & alone");
    /// reader.config_mut().allow_dangling_amp = true;
    ///
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
    pub allow_dangling_amp: bool,

    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of a mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a `Start` and an `End` event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}

impl Config {
    /// Applies the same trimming policy to both ends of text events, i.e. sets
    /// [`trim_text_start`] and [`trim_text_end`] to `trim`.
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`trim_text_start`]: Self::trim_text_start
    /// [`trim_text_end`]: Self::trim_text_end
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    #[inline]
    pub fn trim_text(&mut self, trim: bool) {
        self.trim_text_end = trim;
        self.trim_text_start = trim;
    }

    /// Switches every well-formedness check on (`true`) or off (`false`) at once.
    /// At the moment the affected settings are:
    /// - [`check_comments`](Self::check_comments)
    /// - [`check_end_names`](Self::check_end_names)
    #[inline]
    pub fn enable_all_checks(&mut self, enable: bool) {
        self.check_end_names = enable;
        self.check_comments = enable;
    }
}

impl Default for Config {
    /// Builds the configuration with the defaults documented on each field.
    fn default() -> Self {
        Self {
            // Options that are enabled out of the box
            check_end_names: true,
            trim_markup_names_in_closing_tags: true,

            // Everything else is opt-in
            allow_dangling_amp: false,
            allow_unmatched_ends: false,
            check_comments: false,
            expand_empty_elements: false,
            trim_text_start: false,
            trim_text_end: false,
        }
    }
}
250
251////////////////////////////////////////////////////////////////////////////////////////////////////
252
/// Shared implementation of the `read_event` family of methods: runs the
/// [`ParseState`] machine stored in `$self.state` until an event or an error
/// is produced, and evaluates to that result.
///
/// Parameters:
/// - `$self`: the reader; its `state` field is read and updated
/// - `$buf`: identifier of the buffer passed to the low-level read methods;
///   it is re-bound when `read_text` hands the buffer back
/// - `$reader`: expression that evaluates to the data source
/// - `$read_until_close`: name of the method of `$self` that is called in the
///   `InsideMarkup` state
/// - `$await`: optional identifier appended as `.$await` to every call on
///   `$reader` (intended for the `await` keyword in asynchronous code)
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If the encoding is set explicitly, we do not need to detect it.
                    // For example, explicit UTF-8 is set automatically if the Reader
                    // was created using `from_str`. But we still need to remove the
                    // BOM for consistency with the path where the `encoding` feature
                    // is not enabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // No event is produced for this transition, run the loop again
                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideRef => { // Go to InsideText
                    // Remember where the reference starts to report errors at that position
                    let start = $self.state.offset;
                    match $reader.read_ref($buf, &mut $self.state.offset) $(.$await)? {
                        // Emit reference, go to InsideText state
                        ReadRefResult::Ref(bytes) => {
                            $self.state.state = ParseState::InsideText;
                            // +1 to skip start `&`
                            Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder())))
                        }
                        // Go to Done state
                        ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::Done;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToEof(_) => {
                            $self.state.state = ParseState::Done;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Do not change state, stay in InsideRef
                        ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToRef(_) => {
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Go to InsideMarkup state
                        ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::InsideMarkup;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToMarkup(_) => {
                            $self.state.state = ParseState::InsideMarkup;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        ReadRefResult::Err(e) => Err(Error::Io(e.into())),
                    }
                }
                ParseState::InsideText => { // Go to InsideMarkup, InsideRef or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::Ref(buf) => {
                            $self.state.state = ParseState::InsideRef;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToRef(bytes) => {
                            $self.state.state = ParseState::InsideRef;
                            // Return a Text event with the `bytes` content; the reference
                            // itself is handled by the InsideRef arm on the next call
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            // Do not report an empty trailing text, report Eof right away
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we have already consumed the wrong
            // data and changed the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error, as well as Eof, stops the reader for good
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
381
/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state.
///
/// Parameters:
/// - `$self`: the reader; its `state` field is read and updated
/// - `$buf`: identifier of the buffer passed to the low-level read methods
/// - `$reader`: expression that evaluates to the data source
/// - `$await`: optional identifier appended as `.$await` to every call on
///   `$reader` (intended for the `await` keyword in asynchronous code)
///
/// When reading of the markup fails, or the input ends right after `<`,
/// `last_error_offset` is set to the position of the `<` symbol.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        // The state is switched before reading, whatever the outcome of the read is
        $self.state.state = ParseState::InsideText;

        // Position just after the `<` symbol
        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we anyway allow to parse them because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which user should call manually
            // - if we just look for `>` we will parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >` which probably no existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`, but offset was increased,
                // so return it back (-1 for `<`)
                $self.state.last_error_offset = start - 1;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            // Peeking failed: report the I/O error, the error offset is not touched
            Err(e) => Err(Error::Io(e.into())),
        }
    }};
}
484
/// Generalization of the `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the `End` event named `$end` that is not paired with a
/// `Start` event of the same name read by this macro, and evaluates to the
/// span from the position at macro entry up to the beginning of that `End`
/// event. On a read error, or when `Eof` is reached first, returns the error
/// from the enclosing function. In every case the original value of
/// `trim_text_start` is restored before leaving.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // The end of the resulting span is the position taken right before the
        // matching End event is read, so that position must point exactly to the
        // beginning of the End event. If only spaces separate the previous event
        // and the End event, the position is taken before the spaces, but with
        // `trim_text_start` enabled they would be skipped without producing an
        // event. To prevent that, start trimming is temporarily disabled.
        //
        // Taking the position after the End event is not an option either: when
        // `trim_markup_names_in_closing_tags` is `true` (which is the default),
        // the real size that the End event occupies in the source is unknown, so
        // the position could not be corrected. Hence the configuration has to be
        // tweaked in any case.
        let saved_trim = {
            let config = $self.config_mut();
            let previous = config.trim_text_start;
            config.trim_text_start = false;
            previous
        };

        let span_start = $self.buffer_position();
        // Number of currently open nested elements with the same name as `$end`
        let mut nesting = 0;
        let outcome = loop {
            $clear
            let span_end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => break Err(e),

                Ok(Event::Start(e)) if e.name() == $end => nesting += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if nesting == 0 {
                        break Ok(span_start..span_end);
                    }
                    nesting -= 1;
                }
                Ok(Event::Eof) => break Err(Error::missed_end($end, $self.decoder())),
                _ => (),
            }
        };

        // Single exit point: put the user's setting back whatever the outcome is
        $self.config_mut().trim_text_start = saved_trim;
        match outcome {
            Ok(span) => span,
            Err(e) => return Err(e),
        }
    }};
}
539
540#[cfg(feature = "async-tokio")]
541mod async_tokio;
542mod buffered_reader;
543mod ns_reader;
544mod slice_reader;
545mod state;
546
547pub use ns_reader::NsReader;
548
/// Half-open range of byte offsets in the input that corresponds to some
/// piece of XML.
pub type Span = Range<u64>;
551
552////////////////////////////////////////////////////////////////////////////////////////////////////
553
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init -- "(no event)"\n --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText -- "#lt;false#gt;\n(no event)"\nText --> InsideMarkup
///     InsideText -- "(no event)"\nText --> InsideRef
///     InsideRef -- "(no event)"\nGeneralRef --> InsideText
///   end
///   InsideText -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty -- End --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. On the first read
    /// the encoding is detected (or, without the `encoding` feature, the UTF-8
    /// BOM is removed) and the reader moves to the `InsideText` state without
    /// emitting an event. The reader will never return to this state.
    Init,
    /// State after seeing the `&` symbol in textual content. Normally a
    /// `GeneralRef` event is generated, after which the reader moves to the
    /// `InsideText` state.
    ///
    /// If the reference is not closed, an `IllFormed(UnclosedReference)` error is
    /// returned, or, when [`Config::allow_dangling_amp`] is set, the data is
    /// emitted as a `Text` event. Depending on what terminated the data, the
    /// reader then stays in `InsideRef` (another `&`), moves to `InsideMarkup`
    /// (a `<`) or to `Done` (end of input).
    InsideRef,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup or the `&`
    /// symbol of a reference. All bytes before that symbol will be returned in
    /// the [`Event::Text`] event. After that the reader moves to the `InsideMarkup`
    /// or the `InsideRef` state respectively.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in the `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when an `Eof` event is generated or an error other
    /// than `Error::IllFormed` occurred (an unclosed reference at the end of input
    /// also ends here). This is the last state, the reader stays in it forever.
    Done,
}
604
/// An encoding paired with the knowledge of where that encoding came from.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using the BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by the BOM, nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match self {
            Self::Implicit(e)
            | Self::Explicit(e)
            | Self::BomDetected(e)
            | Self::XmlDetected(e) => *e,
        }
    }
    /// Returns `true` if the encoding may still be replaced by a more precise
    /// one, i.e. it was only assumed or detected from the BOM.
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
651
652////////////////////////////////////////////////////////////////////////////////////////////////////
653
/// A direct stream to the underlying reader of a [`Reader`] which keeps
/// [`Reader::buffer_position()`] up to date while data is read through it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped source of bytes
    inner: &'r mut R,
    /// Position counter that is advanced by every read or consume
    offset: &'r mut u64,
}

impl<'r, R> BinaryStream<'r, R> {
    /// Returns the current position, in bytes, in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Returns a shared reference to the underlying reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        self.inner
    }

    /// Returns a mutable reference to the underlying reader.
    ///
    /// Avoid reading from that reader directly: doing so does not update the
    /// tracked position and leads to incorrect positions of errors. Read from
    /// this stream instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        self.inner
    }
}

impl<'r, R> io::Read for BinaryStream<'r, R>
where
    R: io::Read,
{
    /// Reads from the underlying reader and advances the tracked position by
    /// the number of bytes that were actually read.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let outcome = self.inner.read(buf);
        // A failed read leaves the position untouched
        if let Ok(count) = &outcome {
            *self.offset += *count as u64;
        }
        outcome
    }
}

impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Delegates to the underlying reader. The position is not changed here,
    /// it is advanced only when the data is consumed.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Marks `amt` bytes as consumed in the underlying reader and advances the
    /// tracked position by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        let advanced = amt as u64;
        self.inner.consume(amt);
        *self.offset += advanced;
    }
}
713
714////////////////////////////////////////////////////////////////////////////////////////////////////
715
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.config_mut().trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Debug, Clone)]
pub struct Reader<R> {
    /// Source of data for parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
777
778/// Builder methods
779impl<R> Reader<R> {
780 /// Creates a `Reader` that reads from a given reader.
781 pub fn from_reader(reader: R) -> Self {
782 Self {
783 reader,
784 state: ReaderState::default(),
785 }
786 }
787
788 /// Returns reference to the parser configuration
789 pub const fn config(&self) -> &Config {
790 &self.state.config
791 }
792
793 /// Returns mutable reference to the parser configuration
794 pub fn config_mut(&mut self) -> &mut Config {
795 &mut self.state.config
796 }
797}
798
799/// Getters
800impl<R> Reader<R> {
801 /// Consumes `Reader` returning the underlying reader
802 ///
803 /// Can be used to compute line and column of a parsing error position
804 ///
805 /// # Examples
806 ///
807 /// ```
808 /// # use pretty_assertions::assert_eq;
809 /// use std::{str, io::Cursor};
810 /// use quick_xml::events::Event;
811 /// use quick_xml::reader::Reader;
812 ///
813 /// let xml = r#"<tag1 att1 = "test">
814 /// <tag2><!--Test comment-->Test</tag2>
815 /// <tag3>Test 2</tag3>
816 /// </tag1>"#;
817 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
818 /// let mut buf = Vec::new();
819 ///
820 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
821 /// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
822 /// let end_pos = reader.buffer_position() as usize;
823 /// let mut cursor = reader.into_inner();
824 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
825 /// .expect("can't make a string");
826 /// let mut line = 1;
827 /// let mut column = 0;
828 /// for c in s.chars() {
829 /// if c == '\n' {
830 /// line += 1;
831 /// column = 0;
832 /// } else {
833 /// column += 1;
834 /// }
835 /// }
836 /// (line, column)
837 /// }
838 ///
839 /// loop {
840 /// match reader.read_event_into(&mut buf) {
841 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
842 /// b"tag1" | b"tag2" => (),
843 /// tag => {
844 /// assert_eq!(b"tag3", tag);
845 /// assert_eq!((3, 22), into_line_and_column(reader));
846 /// break;
847 /// }
848 /// },
849 /// Ok(Event::Eof) => unreachable!(),
850 /// _ => (),
851 /// }
852 /// buf.clear();
853 /// }
854 /// ```
855 pub fn into_inner(self) -> R {
856 self.reader
857 }
858
    /// Gets a shared reference to the underlying reader, without consuming
    /// this `Reader`.
    pub const fn get_ref(&self) -> &R {
        &self.reader
    }
863
    /// Gets a mutable reference to the underlying reader.
    ///
    /// Avoid reading from this reader because that will not update the reader's
    /// position and will lead to incorrect positions of errors. If you want to
    /// read, use [`stream()`] instead.
    ///
    /// [`stream()`]: Self::stream
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.reader
    }
874
875 /// Gets the byte position in the input data just after the last emitted event
876 /// (i.e. this is position where data of last event ends).
877 ///
878 /// Note, that for text events which is originally ended with whitespace characters
879 /// (` `, `\t`, `\r`, and `\n`) if [`Config::trim_text_end`] is set this is position
880 /// before trim, not the position of the last byte of the [`Event::Text`] content.
881 pub const fn buffer_position(&self) -> u64 {
882 // when internal state is InsideMarkup, we have actually read until '<',
883 // which we don't want to show
884 if let ParseState::InsideMarkup = self.state.state {
885 self.state.offset - 1
886 } else {
887 self.state.offset
888 }
889 }
890
891 /// Gets the last error byte position in the input data. If there is no errors
892 /// yet, returns `0`.
893 ///
894 /// Unlike `buffer_position` it will point to the place where it is rational
895 /// to report error to the end user. For example, all [`SyntaxError`]s are
896 /// reported when the parser sees EOF inside of some kind of markup. The
897 /// `buffer_position()` will point to the last byte of input which is not
898 /// very useful. `error_position()` will point to the start of corresponding
899 /// markup element (i. e. to the `<` character).
900 ///
901 /// This position is always `<= buffer_position()`.
902 pub const fn error_position(&self) -> u64 {
903 self.state.last_error_offset
904 }
905
906 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
907 ///
908 /// If [`encoding`] feature is enabled, the used encoding may change after
909 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
910 ///
911 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
912 /// defaults to UTF-8.
913 ///
914 /// [`encoding`]: ../index.html#encoding
915 #[inline]
916 pub const fn decoder(&self) -> Decoder {
917 self.state.decoder()
918 }
919
920 /// Get the direct access to the underlying reader, but tracks the amount of
921 /// read data and update [`Reader::buffer_position()`] accordingly.
922 ///
923 /// Note, that this method gives you access to the internal reader and read
924 /// data will not be returned in any subsequent events read by `read_event`
925 /// family of methods.
926 ///
927 /// # Example
928 ///
929 /// This example demonstrates how to read stream raw bytes from an XML document.
930 /// This could be used to implement streaming read of text, or to read raw binary
931 /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
932 /// valid XML, but XML-derived file formats exist where such documents are valid).
933 ///
934 /// ```
935 /// # use pretty_assertions::assert_eq;
936 /// use std::io::{BufRead, Read};
937 /// use quick_xml::events::{BytesEnd, BytesStart, Event};
938 /// use quick_xml::reader::Reader;
939 ///
940 /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
941 /// // ^ ^ ^ ^
942 /// // 0 5 21 27
943 ///
944 /// assert_eq!(
945 /// (reader.read_event().unwrap(), reader.buffer_position()),
946 /// // 5 - end of the `<tag>`
947 /// (Event::Start(BytesStart::new("tag")), 5)
948 /// );
949 ///
950 /// // Reading directly from underlying reader will not update position
951 /// // let mut inner = reader.get_mut();
952 ///
953 /// // Reading from the stream() advances position
954 /// let mut inner = reader.stream();
955 ///
956 /// // Read binary data. We must know its size
957 /// let mut binary = [0u8; 16];
958 /// inner.read_exact(&mut binary).unwrap();
959 /// assert_eq!(&binary, b"binary << data&>");
960 /// // 21 - end of the `binary << data&>`
961 /// assert_eq!(inner.offset(), 21);
962 /// assert_eq!(reader.buffer_position(), 21);
963 ///
964 /// assert_eq!(
965 /// (reader.read_event().unwrap(), reader.buffer_position()),
966 /// // 27 - end of the `</tag>`
967 /// (Event::End(BytesEnd::new("tag")), 27)
968 /// );
969 ///
970 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
971 /// ```
972 #[inline]
973 pub fn stream(&mut self) -> BinaryStream<R> {
974 BinaryStream {
975 inner: &mut self.reader,
976 offset: &mut self.state.offset,
977 }
978 }
979}
980
/// Private sync reading methods
impl<R> Reader<R> {
    /// Reads the next event from the underlying source.
    ///
    /// Text is read into the given buffer, and the returned event borrows
    /// either from that buffer or from the input itself, depending on the
    /// type of the reader (see the [`XmlSource`] implementation for `R`).
    ///
    /// The whole body is generated by the `read_event_impl!` macro, which is
    /// given the name of the method used to read markup (`read_until_close`).
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it is called just after a `<` symbol was encountered.
    ///
    /// The whole body is generated by the `read_until_close!` macro.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
1002
1003////////////////////////////////////////////////////////////////////////////////////////////////////
1004
/// Result of an attempt to read XML textual data from the source.
///
/// # Parameters
/// - `'r`: lifetime of the returned text blocks
/// - `B`: type of the buffer that is handed back to the caller when no text
///   was read
#[derive(Debug)]
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte. `<` was consumed.
    /// Contains the buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Markup(B),
    /// Start of reference (`&` character) was found in the first byte.
    /// `&` was not consumed.
    /// Contains the buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Ref(B),
    /// Contains text block up to start of markup (`<` character). `<` was consumed.
    UpToMarkup(&'r [u8]),
    /// Contains text block up to start of reference (`&` character).
    /// `&` was not consumed.
    UpToRef(&'r [u8]),
    /// Contains text block up to EOF, neither start of markup (`<` character)
    /// nor start of reference (`&` character) was found.
    UpToEof(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1028
/// Result of an attempt to read a general reference from the reader.
///
/// Every variant that carries data includes the starting `&` in it.
#[derive(Debug)]
enum ReadRefResult<'r> {
    /// Contains text block up to end of reference (`;` character).
    /// Result includes start `&`, but not end `;`.
    Ref(&'r [u8]),
    /// Contains text block up to EOF. Neither end of reference (`;`), start of
    /// another reference (`&`) nor start of markup (`<`) characters were found.
    /// Result includes start `&`.
    UpToEof(&'r [u8]),
    /// Contains text block up to next possible reference (`&` character).
    /// Result includes start `&`.
    UpToRef(&'r [u8]),
    /// Contains text block up to start of markup (`<` character).
    /// Result includes start `&`.
    UpToMarkup(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1048
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait: a generic one that reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]` and, instead of copying data, returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> io::Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;

    /// Read input until start of markup (the `<`) is found, start of general entity
    /// reference (the `&`) is found or end of input is reached.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

    /// Read input until end of general reference (the `;`) is found, start of
    /// another general reference (the `&`) is found, start of markup (the `<`)
    /// is found or end of input is reached.
    ///
    /// This method must be called when current character is `&`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_ref(&mut self, buf: B, position: &mut u64) -> ReadRefResult<'r>;

    /// Read input until the thing recognized by the given `parser` (for example,
    /// a processing instruction or an element) is finished.
    ///
    /// This method expects that the start sequence of a parser was already read.
    ///
    /// Returns a slice of data read up to the end of the thing being parsed.
    /// The end of thing and the returned content is determined by the used parser.
    ///
    /// If input (`Self`) is exhausted and no bytes were read, or if the specified
    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
    ///
    /// # Parameters
    /// - `parser`: Determines where the thing being read ends
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// A `P` type parameter is used to preserve state between calls to the underlying
    /// reader which provides bytes fed into the parser.
    ///
    /// [events]: crate::events::Event
    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
    where
        P: Parser;

    /// Read input until a comment, CDATA section or DOCTYPE declaration is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of the markup found together with a slice of data read
    /// up to its closing `>`; the `>` itself is not included in the result.
    ///
    /// Failures are reported through the `Err` variant (the return type cannot
    /// express "nothing was read" in any other way). The unit tests in this file
    /// expect an `Error::Syntax` when the end of the markup is not found.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut u64,
    ) -> Result<(BangType, &'r [u8]), Error>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> io::Result<Option<u8>>;
}
1151
/// Possible elements started with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Contains balance of `<` (+1) and `>` (-1)
    /// seen so far inside of the declaration
    DocType(i32),
}
1162impl BangType {
1163 #[inline(always)]
1164 const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1165 Ok(match byte {
1166 Some(b'[') => Self::CData,
1167 Some(b'-') => Self::Comment,
1168 Some(b'D') | Some(b'd') => Self::DocType(0),
1169 _ => return Err(SyntaxError::InvalidBangMarkup),
1170 })
1171 }
1172
1173 /// If element is finished, returns its content up to `>` symbol and
1174 /// an index of this symbol, otherwise returns `None`
1175 ///
1176 /// # Parameters
1177 /// - `buf`: buffer with data consumed on previous iterations
1178 /// - `chunk`: data read on current iteration and not yet consumed from reader
1179 #[inline(always)]
1180 fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1181 match self {
1182 Self::Comment => {
1183 for i in memchr::memchr_iter(b'>', chunk) {
1184 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1185 // <!----> - XML comment
1186 // 012345 - i
1187 if buf.len() + i > 4 {
1188 if chunk[..i].ends_with(b"--") {
1189 // We cannot strip last `--` from the buffer because we need it in case of
1190 // check_comments enabled option. XML standard requires that comment
1191 // will not end with `--->` sequence because this is a special case of
1192 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1193 return Some((&chunk[..i], i + 1)); // +1 for `>`
1194 }
1195 // End sequence `-|->` was splitted at |
1196 // buf --/ \-- chunk
1197 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1198 return Some((&chunk[..i], i + 1)); // +1 for `>`
1199 }
1200 // End sequence `--|>` was splitted at |
1201 // buf --/ \-- chunk
1202 if i == 0 && buf.ends_with(b"--") {
1203 return Some((&[], i + 1)); // +1 for `>`
1204 }
1205 }
1206 }
1207 }
1208 Self::CData => {
1209 for i in memchr::memchr_iter(b'>', chunk) {
1210 if chunk[..i].ends_with(b"]]") {
1211 return Some((&chunk[..i], i + 1)); // +1 for `>`
1212 }
1213 // End sequence `]|]>` was splitted at |
1214 // buf --/ \-- chunk
1215 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1216 return Some((&chunk[..i], i + 1)); // +1 for `>`
1217 }
1218 // End sequence `]]|>` was splitted at |
1219 // buf --/ \-- chunk
1220 if i == 0 && buf.ends_with(b"]]") {
1221 return Some((&[], i + 1)); // +1 for `>`
1222 }
1223 }
1224 }
1225 Self::DocType(ref mut balance) => {
1226 for i in memchr::memchr2_iter(b'<', b'>', chunk) {
1227 if chunk[i] == b'<' {
1228 *balance += 1;
1229 } else {
1230 if *balance == 0 {
1231 return Some((&chunk[..i], i + 1)); // +1 for `>`
1232 }
1233 *balance -= 1;
1234 }
1235 }
1236 }
1237 }
1238 None
1239 }
1240 #[inline]
1241 const fn to_err(&self) -> SyntaxError {
1242 match self {
1243 Self::CData => SyntaxError::UnclosedCData,
1244 Self::Comment => SyntaxError::UnclosedComment,
1245 Self::DocType(_) => SyntaxError::UnclosedDoctype,
1246 }
1247 }
1248}
1249
1250////////////////////////////////////////////////////////////////////////////////////////////////////
1251
1252#[cfg(test)]
1253mod test {
1254 /// Checks the internal implementation of the various reader methods
1255 macro_rules! check {
1256 (
1257 #[$test:meta]
1258 $read_event:ident,
1259 $read_until_close:ident,
1260 // constructor of the XML source on which internal functions will be called
1261 $source:path,
1262 // constructor of the buffer to which read data will stored
1263 $buf:expr
1264 $(, $async:ident, $await:ident)?
1265 ) => {
1266 mod read_bang_element {
1267 use super::*;
1268 use crate::errors::{Error, SyntaxError};
1269 use crate::reader::BangType;
1270 use crate::utils::Bytes;
1271
1272 /// Checks that reading CDATA content works correctly
1273 mod cdata {
1274 use super::*;
1275 use pretty_assertions::assert_eq;
1276
1277 /// Checks that if input begins like CDATA element, but CDATA start sequence
1278 /// is not finished, parsing ends with an error
1279 #[$test]
1280 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1281 $($async)? fn not_properly_start() {
1282 let buf = $buf;
1283 let mut position = 1;
1284 let mut input = b"![]]>other content".as_ref();
1285 // ^= 1
1286
1287 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1288 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1289 x => panic!(
1290 "Expected `Err(Syntax(_))`, but got `{:?}`",
1291 x
1292 ),
1293 }
1294 assert_eq!(position, 1);
1295 }
1296
1297 /// Checks that if CDATA startup sequence was matched, but an end sequence
1298 /// is not found, parsing ends with an error
1299 #[$test]
1300 $($async)? fn not_closed() {
1301 let buf = $buf;
1302 let mut position = 1;
1303 let mut input = b"![CDATA[other content".as_ref();
1304 // ^= 1 ^= 22
1305
1306 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1307 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1308 x => panic!(
1309 "Expected `Err(Syntax(_))`, but got `{:?}`",
1310 x
1311 ),
1312 }
1313 assert_eq!(position, 22);
1314 }
1315
1316 /// Checks that CDATA element without content inside parsed successfully
1317 #[$test]
1318 $($async)? fn empty() {
1319 let buf = $buf;
1320 let mut position = 1;
1321 let mut input = b"![CDATA[]]>other content".as_ref();
1322 // ^= 1 ^= 12
1323
1324 let (ty, bytes) = $source(&mut input)
1325 .read_bang_element(buf, &mut position)
1326 $(.$await)?
1327 .unwrap();
1328 assert_eq!(
1329 (ty, Bytes(bytes)),
1330 (BangType::CData, Bytes(b"![CDATA[]]"))
1331 );
1332 assert_eq!(position, 12);
1333 }
1334
1335 /// Checks that CDATA element with content parsed successfully.
1336 /// Additionally checks that sequences inside CDATA that may look like
1337 /// a CDATA end sequence do not interrupt CDATA parsing
1338 #[$test]
1339 $($async)? fn with_content() {
1340 let buf = $buf;
1341 let mut position = 1;
1342 let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1343 // ^= 1 ^= 29
1344
1345 let (ty, bytes) = $source(&mut input)
1346 .read_bang_element(buf, &mut position)
1347 $(.$await)?
1348 .unwrap();
1349 assert_eq!(
1350 (ty, Bytes(bytes)),
1351 (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))
1352 );
1353 assert_eq!(position, 29);
1354 }
1355 }
1356
1357 /// Checks that reading XML comments works correctly. According to the [specification],
1358 /// comment data can contain any sequence except `--`:
1359 ///
1360 /// ```peg
1361 /// comment = '<--' (!'--' char)* '-->';
1362 /// char = [#x1-#x2C]
1363 /// / [#x2E-#xD7FF]
1364 /// / [#xE000-#xFFFD]
1365 /// / [#x10000-#x10FFFF]
1366 /// ```
1367 ///
1368 /// The presence of this limitation, however, is simply a poorly designed specification
1369 /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1370 /// presence of these sequences by default. This tests allow such content.
1371 ///
1372 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1373 mod comment {
1374 use super::*;
1375 use pretty_assertions::assert_eq;
1376
1377 #[$test]
1378 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1379 $($async)? fn not_properly_start() {
1380 let buf = $buf;
1381 let mut position = 1;
1382 let mut input = b"!- -->other content".as_ref();
1383 // ^= 1
1384
1385 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1386 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1387 x => panic!(
1388 "Expected `Err(Syntax(_))`, but got `{:?}`",
1389 x
1390 ),
1391 }
1392 assert_eq!(position, 1);
1393 }
1394
1395 #[$test]
1396 $($async)? fn not_properly_end() {
1397 let buf = $buf;
1398 let mut position = 1;
1399 let mut input = b"!->other content".as_ref();
1400 // ^= 1 ^= 17
1401
1402 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1403 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1404 x => panic!(
1405 "Expected `Err(Syntax(_))`, but got `{:?}`",
1406 x
1407 ),
1408 }
1409 assert_eq!(position, 17);
1410 }
1411
1412 #[$test]
1413 $($async)? fn not_closed1() {
1414 let buf = $buf;
1415 let mut position = 1;
1416 let mut input = b"!--other content".as_ref();
1417 // ^= 1 ^= 17
1418
1419 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1420 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1421 x => panic!(
1422 "Expected `Err(Syntax(_))`, but got `{:?}`",
1423 x
1424 ),
1425 }
1426 assert_eq!(position, 17);
1427 }
1428
1429 #[$test]
1430 $($async)? fn not_closed2() {
1431 let buf = $buf;
1432 let mut position = 1;
1433 let mut input = b"!-->other content".as_ref();
1434 // ^= 1 ^= 18
1435
1436 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1437 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1438 x => panic!(
1439 "Expected `Err(Syntax(_))`, but got `{:?}`",
1440 x
1441 ),
1442 }
1443 assert_eq!(position, 18);
1444 }
1445
1446 #[$test]
1447 $($async)? fn not_closed3() {
1448 let buf = $buf;
1449 let mut position = 1;
1450 let mut input = b"!--->other content".as_ref();
1451 // ^= 1 ^= 19
1452
1453 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1454 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1455 x => panic!(
1456 "Expected `Err(Syntax(_))`, but got `{:?}`",
1457 x
1458 ),
1459 }
1460 assert_eq!(position, 19);
1461 }
1462
1463 #[$test]
1464 $($async)? fn empty() {
1465 let buf = $buf;
1466 let mut position = 1;
1467 let mut input = b"!---->other content".as_ref();
1468 // ^= 1 ^= 7
1469
1470 let (ty, bytes) = $source(&mut input)
1471 .read_bang_element(buf, &mut position)
1472 $(.$await)?
1473 .unwrap();
1474 assert_eq!(
1475 (ty, Bytes(bytes)),
1476 (BangType::Comment, Bytes(b"!----"))
1477 );
1478 assert_eq!(position, 7);
1479 }
1480
1481 #[$test]
1482 $($async)? fn with_content() {
1483 let buf = $buf;
1484 let mut position = 1;
1485 let mut input = b"!--->comment<--->other content".as_ref();
1486 // ^= 1 ^= 18
1487
1488 let (ty, bytes) = $source(&mut input)
1489 .read_bang_element(buf, &mut position)
1490 $(.$await)?
1491 .unwrap();
1492 assert_eq!(
1493 (ty, Bytes(bytes)),
1494 (BangType::Comment, Bytes(b"!--->comment<---"))
1495 );
1496 assert_eq!(position, 18);
1497 }
1498 }
1499
1500 /// Checks that reading DOCTYPE definition works correctly
1501 mod doctype {
1502 use super::*;
1503
1504 mod uppercase {
1505 use super::*;
1506 use pretty_assertions::assert_eq;
1507
1508 #[$test]
1509 $($async)? fn not_properly_start() {
1510 let buf = $buf;
1511 let mut position = 1;
1512 let mut input = b"!D other content".as_ref();
1513 // ^= 1 ^= 17
1514
1515 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1516 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1517 x => panic!(
1518 "Expected `Err(Syntax(_))`, but got `{:?}`",
1519 x
1520 ),
1521 }
1522 assert_eq!(position, 17);
1523 }
1524
1525 #[$test]
1526 $($async)? fn without_space() {
1527 let buf = $buf;
1528 let mut position = 1;
1529 let mut input = b"!DOCTYPEother content".as_ref();
1530 // ^= 1 ^= 22
1531
1532 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1533 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1534 x => panic!(
1535 "Expected `Err(Syntax(_))`, but got `{:?}`",
1536 x
1537 ),
1538 }
1539 assert_eq!(position, 22);
1540 }
1541
1542 #[$test]
1543 $($async)? fn empty() {
1544 let buf = $buf;
1545 let mut position = 1;
1546 let mut input = b"!DOCTYPE>other content".as_ref();
1547 // ^= 1 ^= 10
1548
1549 let (ty, bytes) = $source(&mut input)
1550 .read_bang_element(buf, &mut position)
1551 $(.$await)?
1552 .unwrap();
1553 assert_eq!(
1554 (ty, Bytes(bytes)),
1555 (BangType::DocType(0), Bytes(b"!DOCTYPE"))
1556 );
1557 assert_eq!(position, 10);
1558 }
1559
1560 #[$test]
1561 $($async)? fn not_closed() {
1562 let buf = $buf;
1563 let mut position = 1;
1564 let mut input = b"!DOCTYPE other content".as_ref();
1565 // ^= 1 ^23
1566
1567 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1568 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1569 x => panic!(
1570 "Expected `Err(Syntax(_))`, but got `{:?}`",
1571 x
1572 ),
1573 }
1574 assert_eq!(position, 23);
1575 }
1576 }
1577
1578 mod lowercase {
1579 use super::*;
1580 use pretty_assertions::assert_eq;
1581
1582 #[$test]
1583 $($async)? fn not_properly_start() {
1584 let buf = $buf;
1585 let mut position = 1;
1586 let mut input = b"!d other content".as_ref();
1587 // ^= 1 ^= 17
1588
1589 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1590 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1591 x => panic!(
1592 "Expected `Err(Syntax(_))`, but got `{:?}`",
1593 x
1594 ),
1595 }
1596 assert_eq!(position, 17);
1597 }
1598
1599 #[$test]
1600 $($async)? fn without_space() {
1601 let buf = $buf;
1602 let mut position = 1;
1603 let mut input = b"!doctypeother content".as_ref();
1604 // ^= 1 ^= 22
1605
1606 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1607 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1608 x => panic!(
1609 "Expected `Err(Syntax(_))`, but got `{:?}`",
1610 x
1611 ),
1612 }
1613 assert_eq!(position, 22);
1614 }
1615
1616 #[$test]
1617 $($async)? fn empty() {
1618 let buf = $buf;
1619 let mut position = 1;
1620 let mut input = b"!doctype>other content".as_ref();
1621 // ^= 1 ^= 10
1622
1623 let (ty, bytes) = $source(&mut input)
1624 .read_bang_element(buf, &mut position)
1625 $(.$await)?
1626 .unwrap();
1627 assert_eq!(
1628 (ty, Bytes(bytes)),
1629 (BangType::DocType(0), Bytes(b"!doctype"))
1630 );
1631 assert_eq!(position, 10);
1632 }
1633
1634 #[$test]
1635 $($async)? fn not_closed() {
1636 let buf = $buf;
1637 let mut position = 1;
1638 let mut input = b"!doctype other content".as_ref();
1639 // ^= 1 ^= 23
1640
1641 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1642 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1643 x => panic!(
1644 "Expected `Err(Syntax(_))`, but got `{:?}`",
1645 x
1646 ),
1647 }
1648 assert_eq!(position, 23);
1649 }
1650 }
1651 }
1652 }
1653
1654 mod read_text {
1655 use super::*;
1656 use crate::reader::ReadTextResult;
1657 use crate::utils::Bytes;
1658 use pretty_assertions::assert_eq;
1659
1660 #[$test]
1661 $($async)? fn empty() {
1662 let buf = $buf;
1663 let mut position = 1;
1664 let mut input = b"".as_ref();
1665 // ^= 1
1666
1667 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1668 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"")),
1669 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1670 }
1671 assert_eq!(position, 1);
1672 }
1673
1674 #[$test]
1675 $($async)? fn markup() {
1676 let buf = $buf;
1677 let mut position = 1;
1678 let mut input = b"<".as_ref();
1679 // ^= 2
1680
1681 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1682 ReadTextResult::Markup(b) => assert_eq!(b, $buf),
1683 x => panic!("Expected `Markup(_)`, but got `{:?}`", x),
1684 }
1685 assert_eq!(position, 2);
1686 }
1687
1688 #[$test]
1689 $($async)? fn ref_() {
1690 let buf = $buf;
1691 let mut position = 1;
1692 let mut input = b"&".as_ref();
1693 // ^= 1
1694
1695 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1696 ReadTextResult::Ref(b) => assert_eq!(b, $buf),
1697 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1698 }
1699 assert_eq!(position, 1);
1700 }
1701
1702 #[$test]
1703 $($async)? fn up_to_markup() {
1704 let buf = $buf;
1705 let mut position = 1;
1706 let mut input = b"a<".as_ref();
1707 // 1 ^= 3
1708
1709 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1710 ReadTextResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1711 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1712 }
1713 assert_eq!(position, 3);
1714 }
1715
1716 #[$test]
1717 $($async)? fn up_to_ref() {
1718 let buf = $buf;
1719 let mut position = 1;
1720 let mut input = b"a&".as_ref();
1721 // ^= 2
1722
1723 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1724 ReadTextResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1725 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1726 }
1727 assert_eq!(position, 2);
1728 }
1729
1730 #[$test]
1731 $($async)? fn up_to_eof() {
1732 let buf = $buf;
1733 let mut position = 1;
1734 let mut input = b"a".as_ref();
1735 // ^= 2
1736
1737 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1738 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1739 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1740 }
1741 assert_eq!(position, 2);
1742 }
1743 }
1744
1745 mod read_ref {
1746 use super::*;
1747 use crate::reader::ReadRefResult;
1748 use crate::utils::Bytes;
1749 use pretty_assertions::assert_eq;
1750
1751 // Empty input is not allowed for `read_ref` so not tested.
1752 // Borrowed source triggers debug assertion,
1753 // buffered do nothing due to implementation details.
1754
1755 #[$test]
1756 $($async)? fn up_to_eof() {
1757 let buf = $buf;
1758 let mut position = 1;
1759 let mut input = b"&".as_ref();
1760 // ^= 2
1761
1762 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1763 ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1764 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1765 }
1766 assert_eq!(position, 2);
1767 }
1768
1769 #[$test]
1770 $($async)? fn up_to_ref() {
1771 let buf = $buf;
1772 let mut position = 1;
1773 let mut input = b"&&".as_ref();
1774 // ^= 2
1775
1776 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1777 ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1778 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1779 }
1780 assert_eq!(position, 2);
1781 }
1782
1783 #[$test]
1784 $($async)? fn up_to_markup() {
1785 let buf = $buf;
1786 let mut position = 1;
1787 let mut input = b"&<".as_ref();
1788 // ^= 3
1789
1790 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1791 ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1792 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1793 }
1794 assert_eq!(position, 3);
1795 }
1796
1797 #[$test]
1798 $($async)? fn empty_ref() {
1799 let buf = $buf;
1800 let mut position = 1;
1801 let mut input = b"&;".as_ref();
1802 // ^= 3
1803
1804 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1805 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1806 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1807 }
1808 assert_eq!(position, 3);
1809 }
1810
1811 #[$test]
1812 $($async)? fn normal() {
1813 let buf = $buf;
1814 let mut position = 1;
1815 let mut input = b"<".as_ref();
1816 // ^= 5
1817
1818 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1819 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")),
1820 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1821 }
1822 assert_eq!(position, 5);
1823 }
1824 }
1825
1826 mod read_element {
1827 use super::*;
1828 use crate::errors::{Error, SyntaxError};
1829 use crate::parser::ElementParser;
1830 use crate::utils::Bytes;
1831 use pretty_assertions::assert_eq;
1832
1833 /// Checks that nothing was read from empty buffer
1834 #[$test]
1835 $($async)? fn empty() {
1836 let buf = $buf;
1837 let mut position = 1;
1838 let mut input = b"".as_ref();
1839 // ^= 1
1840
1841 match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1842 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1843 x => panic!(
1844 "Expected `Err(Syntax(_))`, but got `{:?}`",
1845 x
1846 ),
1847 }
1848 assert_eq!(position, 1);
1849 }
1850
1851 mod open {
1852 use super::*;
1853 use pretty_assertions::assert_eq;
1854
1855 #[$test]
1856 $($async)? fn empty_tag() {
1857 let buf = $buf;
1858 let mut position = 1;
1859 let mut input = b">".as_ref();
1860 // ^= 2
1861
1862 assert_eq!(
1863 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1864 Bytes(b"")
1865 );
1866 assert_eq!(position, 2);
1867 }
1868
1869 #[$test]
1870 $($async)? fn normal() {
1871 let buf = $buf;
1872 let mut position = 1;
1873 let mut input = b"tag>".as_ref();
1874 // ^= 5
1875
1876 assert_eq!(
1877 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1878 Bytes(b"tag")
1879 );
1880 assert_eq!(position, 5);
1881 }
1882
1883 #[$test]
1884 $($async)? fn empty_ns_empty_tag() {
1885 let buf = $buf;
1886 let mut position = 1;
1887 let mut input = b":>".as_ref();
1888 // ^= 3
1889
1890 assert_eq!(
1891 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1892 Bytes(b":")
1893 );
1894 assert_eq!(position, 3);
1895 }
1896
1897 #[$test]
1898 $($async)? fn empty_ns() {
1899 let buf = $buf;
1900 let mut position = 1;
1901 let mut input = b":tag>".as_ref();
1902 // ^= 6
1903
1904 assert_eq!(
1905 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1906 Bytes(b":tag")
1907 );
1908 assert_eq!(position, 6);
1909 }
1910
1911 #[$test]
1912 $($async)? fn with_attributes() {
1913 let buf = $buf;
1914 let mut position = 1;
1915 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1916 // ^= 39
1917
1918 assert_eq!(
1919 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1920 Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
1921 );
1922 assert_eq!(position, 39);
1923 }
1924 }
1925
1926 mod self_closed {
1927 use super::*;
1928 use pretty_assertions::assert_eq;
1929
1930 #[$test]
1931 $($async)? fn empty_tag() {
1932 let buf = $buf;
1933 let mut position = 1;
1934 let mut input = b"/>".as_ref();
1935 // ^= 3
1936
1937 assert_eq!(
1938 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1939 Bytes(b"/")
1940 );
1941 assert_eq!(position, 3);
1942 }
1943
1944 #[$test]
1945 $($async)? fn normal() {
1946 let buf = $buf;
1947 let mut position = 1;
1948 let mut input = b"tag/>".as_ref();
1949 // ^= 6
1950
1951 assert_eq!(
1952 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1953 Bytes(b"tag/")
1954 );
1955 assert_eq!(position, 6);
1956 }
1957
1958 #[$test]
1959 $($async)? fn empty_ns_empty_tag() {
1960 let buf = $buf;
1961 let mut position = 1;
1962 let mut input = b":/>".as_ref();
1963 // ^= 4
1964
1965 assert_eq!(
1966 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1967 Bytes(b":/")
1968 );
1969 assert_eq!(position, 4);
1970 }
1971
1972 #[$test]
1973 $($async)? fn empty_ns() {
1974 let buf = $buf;
1975 let mut position = 1;
1976 let mut input = b":tag/>".as_ref();
1977 // ^= 7
1978
1979 assert_eq!(
1980 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1981 Bytes(b":tag/")
1982 );
1983 assert_eq!(position, 7);
1984 }
1985
1986 #[$test]
1987 $($async)? fn with_attributes() {
1988 let buf = $buf;
1989 let mut position = 1;
1990 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1991 // ^= 42
1992
1993 assert_eq!(
1994 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1995 Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
1996 );
1997 assert_eq!(position, 42);
1998 }
1999 }
2000
2001 mod close {
2002 use super::*;
2003 use pretty_assertions::assert_eq;
2004
2005 #[$test]
2006 $($async)? fn empty_tag() {
2007 let buf = $buf;
2008 let mut position = 1;
2009 let mut input = b"/ >".as_ref();
2010 // ^= 4
2011
2012 assert_eq!(
2013 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2014 Bytes(b"/ ")
2015 );
2016 assert_eq!(position, 4);
2017 }
2018
2019 #[$test]
2020 $($async)? fn normal() {
2021 let buf = $buf;
2022 let mut position = 1;
2023 let mut input = b"/tag>".as_ref();
2024 // ^= 6
2025
2026 assert_eq!(
2027 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2028 Bytes(b"/tag")
2029 );
2030 assert_eq!(position, 6);
2031 }
2032
2033 #[$test]
2034 $($async)? fn empty_ns_empty_tag() {
2035 let buf = $buf;
2036 let mut position = 1;
2037 let mut input = b"/:>".as_ref();
2038 // ^= 4
2039
2040 assert_eq!(
2041 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2042 Bytes(b"/:")
2043 );
2044 assert_eq!(position, 4);
2045 }
2046
2047 #[$test]
2048 $($async)? fn empty_ns() {
2049 let buf = $buf;
2050 let mut position = 1;
2051 let mut input = b"/:tag>".as_ref();
2052 // ^= 7
2053
2054 assert_eq!(
2055 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2056 Bytes(b"/:tag")
2057 );
2058 assert_eq!(position, 7);
2059 }
2060
2061 #[$test]
2062 $($async)? fn with_attributes() {
2063 let buf = $buf;
2064 let mut position = 1;
2065 let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
2066 // ^= 40
2067
2068 assert_eq!(
2069 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2070 Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#)
2071 );
2072 assert_eq!(position, 40);
2073 }
2074 }
2075 }
2076
2077 /// Ensures, that no empty `Text` events are generated
2078 mod $read_event {
2079 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
2080 use crate::reader::Reader;
2081 use pretty_assertions::assert_eq;
2082
2083 /// When `encoding` feature is enabled, encoding should be detected
2084 /// from BOM (UTF-8) and BOM should be stripped.
2085 ///
2086 /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
2087 /// character should be stripped for consistency
2088 #[$test]
2089 $($async)? fn bom_from_reader() {
2090 let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
2091
2092 assert_eq!(
2093 reader.$read_event($buf) $(.$await)? .unwrap(),
2094 Event::Text(BytesText::from_escaped("\u{feff}"))
2095 );
2096
2097 assert_eq!(
2098 reader.$read_event($buf) $(.$await)? .unwrap(),
2099 Event::Eof
2100 );
2101 }
2102
2103 /// When parsing from &str, encoding is fixed (UTF-8), so
2104 /// - when `encoding` feature is disabled, the behavior the
2105 /// same as in `bom_from_reader` text
2106 /// - when `encoding` feature is enabled, the behavior should
2107 /// stay consistent, so the first BOM character is stripped
2108 #[$test]
2109 $($async)? fn bom_from_str() {
2110 let mut reader = Reader::from_str("\u{feff}\u{feff}");
2111
2112 assert_eq!(
2113 reader.$read_event($buf) $(.$await)? .unwrap(),
2114 Event::Text(BytesText::from_escaped("\u{feff}"))
2115 );
2116
2117 assert_eq!(
2118 reader.$read_event($buf) $(.$await)? .unwrap(),
2119 Event::Eof
2120 );
2121 }
2122
2123 #[$test]
2124 $($async)? fn declaration() {
2125 let mut reader = Reader::from_str("<?xml ?>");
2126
2127 assert_eq!(
2128 reader.$read_event($buf) $(.$await)? .unwrap(),
2129 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
2130 );
2131 }
2132
2133 #[$test]
2134 $($async)? fn doctype() {
2135 let mut reader = Reader::from_str("<!DOCTYPE x>");
2136
2137 assert_eq!(
2138 reader.$read_event($buf) $(.$await)? .unwrap(),
2139 Event::DocType(BytesText::from_escaped("x"))
2140 );
2141 }
2142
2143 #[$test]
2144 $($async)? fn processing_instruction() {
2145 let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
2146
2147 assert_eq!(
2148 reader.$read_event($buf) $(.$await)? .unwrap(),
2149 Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
2150 );
2151 }
2152
2153 /// Lone closing tags are not allowed, so testing it together with start tag
2154 #[$test]
2155 $($async)? fn start_and_end() {
2156 let mut reader = Reader::from_str("<tag></tag>");
2157
2158 assert_eq!(
2159 reader.$read_event($buf) $(.$await)? .unwrap(),
2160 Event::Start(BytesStart::new("tag"))
2161 );
2162
2163 assert_eq!(
2164 reader.$read_event($buf) $(.$await)? .unwrap(),
2165 Event::End(BytesEnd::new("tag"))
2166 );
2167 }
2168
2169 #[$test]
2170 $($async)? fn empty() {
2171 let mut reader = Reader::from_str("<tag/>");
2172
2173 assert_eq!(
2174 reader.$read_event($buf) $(.$await)? .unwrap(),
2175 Event::Empty(BytesStart::new("tag"))
2176 );
2177 }
2178
2179 #[$test]
2180 $($async)? fn text() {
2181 let mut reader = Reader::from_str("text");
2182
2183 assert_eq!(
2184 reader.$read_event($buf) $(.$await)? .unwrap(),
2185 Event::Text(BytesText::from_escaped("text"))
2186 );
2187 }
2188
2189 #[$test]
2190 $($async)? fn cdata() {
2191 let mut reader = Reader::from_str("<![CDATA[]]>");
2192
2193 assert_eq!(
2194 reader.$read_event($buf) $(.$await)? .unwrap(),
2195 Event::CData(BytesCData::new(""))
2196 );
2197 }
2198
2199 #[$test]
2200 $($async)? fn comment() {
2201 let mut reader = Reader::from_str("<!---->");
2202
2203 assert_eq!(
2204 reader.$read_event($buf) $(.$await)? .unwrap(),
2205 Event::Comment(BytesText::from_escaped(""))
2206 );
2207 }
2208
2209 #[$test]
2210 $($async)? fn eof() {
2211 let mut reader = Reader::from_str("");
2212
2213 assert_eq!(
2214 reader.$read_event($buf) $(.$await)? .unwrap(),
2215 Event::Eof
2216 );
2217 }
2218 }
2219 };
2220 }
2221
2222 // Export macros for the child modules:
2223 // - buffered_reader
2224 // - slice_reader
2225 pub(super) use check;
2226}