quick_xml/reader/mod.rs
1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9use crate::errors::{Error, IllFormedError, SyntaxError};
10use crate::events::{BytesRef, Event};
11use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
12use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// The current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by modifying the properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether a lone ampersand character (without a paired semicolon) should be
    /// allowed in textual content. Unless enabled, in case of a dangling ampersand,
    /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
    ///
    /// Default: `false`
    ///
    /// # Example
    ///
    /// ```
    /// # use quick_xml::events::{BytesRef, BytesText, Event};
    /// # use quick_xml::reader::Reader;
    /// # use pretty_assertions::assert_eq;
    /// let mut reader = Reader::from_str("text with & &amp; & alone");
    /// reader.config_mut().allow_dangling_amp = true;
    ///
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
    pub allow_dangling_amp: bool,

    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}
203
204impl Config {
205 /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
206 ///
207 /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
208 ///
209 /// WARNING: With this option every text events will be trimmed which is
210 /// incorrect behavior when text events delimited by comments, processing
211 /// instructions or CDATA sections. To correctly trim data manually apply
212 /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
213 /// only to necessary events.
214 /// </div>
215 ///
216 /// [`trim_text_start`]: Self::trim_text_start
217 /// [`trim_text_end`]: Self::trim_text_end
218 /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
219 /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
220 #[inline]
221 pub fn trim_text(&mut self, trim: bool) {
222 self.trim_text_start = trim;
223 self.trim_text_end = trim;
224 }
225
226 /// Turn on or off all checks for well-formedness. Currently it is that settings:
227 /// - [`check_comments`](Self::check_comments)
228 /// - [`check_end_names`](Self::check_end_names)
229 #[inline]
230 pub fn enable_all_checks(&mut self, enable: bool) {
231 self.check_comments = enable;
232 self.check_end_names = enable;
233 }
234}
235
236impl Default for Config {
237 fn default() -> Self {
238 Self {
239 allow_dangling_amp: false,
240 allow_unmatched_ends: false,
241 check_comments: false,
242 check_end_names: true,
243 expand_empty_elements: false,
244 trim_markup_names_in_closing_tags: true,
245 trim_text_start: false,
246 trim_text_end: false,
247 }
248 }
249}
250
251////////////////////////////////////////////////////////////////////////////////////////////////////
252
/// Shared body of the event-reading methods: runs the [`ParseState`] machine
/// until it yields one event or an error, and evaluates to that `Result`.
///
/// Parameters:
/// - `$self`: the reader; its `state` field holds the parse state, the offsets
///   and the configuration
/// - `$buf`: the buffer passed to the `read_*` methods of `$reader`. It is
///   rebound when `read_text` hands the buffer back without producing an event
/// - `$reader`: expression for the data source
/// - `$read_until_close`: name of the method of `$self` that is called in the
///   `InsideMarkup` state
/// - `$await`: optional identifier appended as `.$await` to the calls on
///   `$reader` and to the `$read_until_close` call (presumably `await`, for
///   the async reader)
///
/// After the event is produced, any error other than `Error::IllFormed`, as
/// well as `Event::Eof`, moves the machine to the `Done` state.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If the encoding is set explicitly, we do not need to detect it.
                    // For example, explicit UTF-8 is set automatically if the Reader
                    // was created using `from_str`. But we still need to remove the
                    // BOM for consistency with the code path where the `encoding`
                    // feature is not enabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideRef => { // Go to InsideText, InsideMarkup or Done state, or stay in InsideRef
                    // Remember where the reference starts: errors are reported at that position
                    let start = $self.state.offset;
                    match $reader.read_ref($buf, &mut $self.state.offset) $(.$await)? {
                        // Emit reference, go to InsideText state
                        ReadRefResult::Ref(bytes) => {
                            $self.state.state = ParseState::InsideText;
                            // +1 to skip start `&`
                            Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder())))
                        }
                        // Go to Done state
                        ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::Done;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToEof(_) => {
                            $self.state.state = ParseState::Done;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Do not change state, stay in InsideRef
                        ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToRef(_) => {
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Go to InsideMarkup state
                        ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::InsideMarkup;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToMarkup(_) => {
                            $self.state.state = ParseState::InsideMarkup;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        ReadRefResult::Err(e) => Err(Error::Io(e.into())),
                    }
                }
                ParseState::InsideText => { // Go to InsideMarkup, InsideRef or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::Ref(buf) => {
                            $self.state.state = ParseState::InsideRef;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToRef(bytes) => {
                            $self.state.state = ParseState::InsideRef;
                            // Return Text event with `bytes` content
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            // Report Eof instead of an empty Text event
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we already consume the wrong data
            // and change the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error, as well as the end of input, finishes parsing
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
381
/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state.
///
/// If reading the markup fails, `last_error_offset` is set to the offset
/// observed on entry to the macro, so the error is reported at the start of
/// the markup rather than at the place where the input ended.
///
/// Parameters:
/// - `$self`: the reader; its `state` field is updated
/// - `$buf`: the buffer passed to the `read_*` methods of `$reader`
/// - `$reader`: expression for the data source
/// - `$await`: optional identifier appended as `.$await` to every call on
///   `$reader` (presumably `await`, for the async reader)
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::InsideText;

        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we allow parsing them anyway because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which the user should call manually
            // - if we just looked for `>` we would parse `</tag attr=">" >` as the end
            //   tag `</tag attr=">` and the text `" >`, which probably no existing
            //   parser does. This is malformed XML, however it is tolerated by some
            //   parsers (e.g. the one used by Adobe Flash) and such documents do exist
            //   in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`
                $self.state.last_error_offset = start;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(Error::Io(e.into())),
        }
    }};
}
479
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the `End` event that matches `$end` is found, counting
/// nested `Start` events with the same name so that only the end tag on the
/// initial nesting level stops the loop.
///
/// Evaluates to the range `start..end`, where `start` is the buffer position
/// at the moment the macro is entered and `end` is the buffer position taken
/// just before the matching `End` event was read.
///
/// The macro `return`s from the enclosing function with an error if reading an
/// event fails or if `Eof` is reached before the matching `End` event. On every
/// exit path the original value of `trim_text_start` is restored.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // Because we take the position after the event that precedes the End event,
        // it is important that this position indicates the beginning of the End event.
        // If there were only spaces between the last event and the End event, then we
        // would take the position before the spaces, but the spaces would be skipped
        // without generating an event if `trim_text_start` is set to `true`. To prevent
        // that we temporarily disable start text trimming.
        //
        // We also cannot take the position after getting the End event, because if
        // `trim_markup_names_in_closing_tags` is set to `true` (which is the default),
        // we do not know the real size that the End event occupies in the source
        // and cannot correct the position after the End event.
        // So, in any case we have to tweak the parser configuration.
        let config = $self.config_mut();
        let trim = config.trim_text_start;
        config.trim_text_start = false;

        let start = $self.buffer_position();
        // Number of currently open nested elements named `$end`
        let mut depth = 0;
        loop {
            $clear
            // Position before the next event; this is the end of the range
            // if that event turns out to be the matching End event
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(e);
                }

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        $self.config_mut().trim_text_start = trim;
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                _ => (),
            }
        }
    }};
}
534
// NOTE(review): the submodules are declared after the `macro_rules!` definitions
// above, which makes those macros textually visible inside them. Presumably the
// macros are expanded there; confirm before reordering these declarations.
#[cfg(feature = "async-tokio")]
mod async_tokio;
mod buffered_reader;
mod ns_reader;
mod slice_reader;
mod state;

pub use ns_reader::NsReader;

/// Range of input in bytes, that corresponds to some piece of XML.
///
/// As with any [`Range`], the start offset is inclusive and the end offset is
/// exclusive.
pub type Span = Range<u64>;
546
547////////////////////////////////////////////////////////////////////////////////////////////////////
548
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init -- "(no event)"\n --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText -- "#lt;false#gt;\n(no event)"\nText --> InsideMarkup
///     InsideText -- "(no event)"\nText --> InsideRef
///     InsideRef -- "(no event)"\nGeneralRef --> InsideText
///   end
///   InsideText -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty -- End --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Leaving this
    /// state does not produce an event: `read_event_impl!` only detects the
    /// encoding (or removes the UTF-8 BOM when the `encoding` feature is off)
    /// and moves to the `InsideText` state. The reader never returns to this
    /// state.
    Init,
    /// State after seeing the `&` symbol in textual content.
    ///
    /// If a complete reference is read, a `GeneralRef` event is emitted and the
    /// reader moves to the `InsideText` state. If the reference is not
    /// terminated, either a `Text` event (when [`Config::allow_dangling_amp`]
    /// is set) or an `UnclosedReference` error is produced, and the reader
    /// stays in `InsideRef` (the next `&` was found), moves to `InsideMarkup`
    /// (a markup was found) or moves to `Done` (the input has ended).
    InsideRef,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup or the
    /// `&` symbol of a reference. All bytes before that symbol will be returned
    /// in the [`Event::Text`] event. After that the reader moves to the
    /// `InsideMarkup` or to the `InsideRef` state respectively, or to the `Done`
    /// state if the input has ended.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in the `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when the `Eof` event is generated or an error
    /// other than `Error::IllFormed` has occurred.
    /// This is the last state, the reader stays in it forever.
    Done,
}
599
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit -- from_str --> Explicit
///   Implicit -- BOM --> BomDetected
///   Implicit -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM, nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
627#[cfg(feature = "encoding")]
628impl EncodingRef {
629 #[inline]
630 const fn encoding(&self) -> &'static Encoding {
631 match self {
632 Self::Implicit(e) => e,
633 Self::Explicit(e) => e,
634 Self::BomDetected(e) => e,
635 Self::XmlDetected(e) => e,
636 }
637 }
638 #[inline]
639 const fn can_be_refined(&self) -> bool {
640 match self {
641 Self::Implicit(_) | Self::BomDetected(_) => true,
642 Self::Explicit(_) | Self::XmlDetected(_) => false,
643 }
644 }
645}
646
647////////////////////////////////////////////////////////////////////////////////////////////////////
648
/// A direct stream to the underlying [`Reader`]'s reader which updates
/// [`Reader::buffer_position()`] when read from it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped reader that the data is actually pulled from.
    inner: &'r mut R,
    /// Byte counter that is advanced by the amount of data read or consumed
    /// through this stream.
    // NOTE(review): presumably this borrows the offset stored in the state of
    // the owning `Reader`; the code that creates the stream is not visible here.
    offset: &'r mut u64,
}
657
impl<'r, R> BinaryStream<'r, R> {
    /// Returns the current position in bytes in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Gets a reference to the underlying reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        self.inner
    }

    /// Gets a mutable reference to the underlying reader.
    ///
    /// Avoid reading from this reader directly: doing so will not update the
    /// reader's position and will lead to incorrect positions of errors. Read
    /// from this stream instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        self.inner
    }
}
680
681impl<'r, R> io::Read for BinaryStream<'r, R>
682where
683 R: io::Read,
684{
685 #[inline]
686 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
687 let amt = self.inner.read(buf)?;
688 *self.offset += amt as u64;
689 Ok(amt)
690 }
691}
692
impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Delegates to the wrapped reader. The tracked offset is not changed here,
    /// it is advanced only when the data is consumed.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Consumes `amt` bytes from the wrapped reader and advances the tracked
    /// offset by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        self.inner.consume(amt);
        *self.offset += amt as u64;
    }
}
708
709////////////////////////////////////////////////////////////////////////////////////////////////////
710
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.config_mut().trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Debug, Clone)]
pub struct Reader<R> {
    /// Source of the data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
772
/// Builder methods
impl<R> Reader<R> {
    /// Creates a `Reader` that reads from a given reader.
    ///
    /// The parse state, including the configuration, starts from
    /// `ReaderState::default()`.
    pub fn from_reader(reader: R) -> Self {
        Self {
            reader,
            state: ReaderState::default(),
        }
    }

    /// Returns a reference to the parser configuration.
    pub const fn config(&self) -> &Config {
        &self.state.config
    }

    /// Returns a mutable reference to the parser configuration.
    pub fn config_mut(&mut self) -> &mut Config {
        &mut self.state.config
    }
}
793
794/// Getters
795impl<R> Reader<R> {
796 /// Consumes `Reader` returning the underlying reader
797 ///
798 /// Can be used to compute line and column of a parsing error position
799 ///
800 /// # Examples
801 ///
802 /// ```
803 /// # use pretty_assertions::assert_eq;
804 /// use std::{str, io::Cursor};
805 /// use quick_xml::events::Event;
806 /// use quick_xml::reader::Reader;
807 ///
808 /// let xml = r#"<tag1 att1 = "test">
809 /// <tag2><!--Test comment-->Test</tag2>
810 /// <tag3>Test 2</tag3>
811 /// </tag1>"#;
812 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
813 /// let mut buf = Vec::new();
814 ///
815 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
816 /// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
817 /// let end_pos = reader.buffer_position() as usize;
818 /// let mut cursor = reader.into_inner();
819 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
820 /// .expect("can't make a string");
821 /// let mut line = 1;
822 /// let mut column = 0;
823 /// for c in s.chars() {
824 /// if c == '\n' {
825 /// line += 1;
826 /// column = 0;
827 /// } else {
828 /// column += 1;
829 /// }
830 /// }
831 /// (line, column)
832 /// }
833 ///
834 /// loop {
835 /// match reader.read_event_into(&mut buf) {
836 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
837 /// b"tag1" | b"tag2" => (),
838 /// tag => {
839 /// assert_eq!(b"tag3", tag);
840 /// assert_eq!((3, 22), into_line_and_column(reader));
841 /// break;
842 /// }
843 /// },
844 /// Ok(Event::Eof) => unreachable!(),
845 /// _ => (),
846 /// }
847 /// buf.clear();
848 /// }
849 /// ```
850 pub fn into_inner(self) -> R {
851 self.reader
852 }
853
854 /// Gets a reference to the underlying reader.
855 pub const fn get_ref(&self) -> &R {
856 &self.reader
857 }
858
    /// Gets a mutable reference to the underlying reader.
    ///
    /// Avoid reading from this reader directly: doing so will not update the
    /// reader's position and will lead to incorrect positions of errors. If you
    /// want to read, use [`stream()`] instead.
    ///
    /// [`stream()`]: Self::stream
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.reader
    }
869
    /// Gets the byte position in the input data just after the last emitted event
    /// (i.e. this is the position where the data of the last event ends).
    ///
    /// Note that for text events that originally end with whitespace characters
    /// (` `, `\t`, `\r`, and `\n`), if [`Config::trim_text_end`] is set this is the
    /// position before the trim, not the position of the last byte of the
    /// [`Event::Text`] content.
    pub const fn buffer_position(&self) -> u64 {
        self.state.offset
    }
879
880 /// Gets the last error byte position in the input data. If there is no errors
881 /// yet, returns `0`.
882 ///
883 /// Unlike `buffer_position` it will point to the place where it is rational
884 /// to report error to the end user. For example, all [`SyntaxError`]s are
885 /// reported when the parser sees EOF inside of some kind of markup. The
886 /// `buffer_position()` will point to the last byte of input which is not
887 /// very useful. `error_position()` will point to the start of corresponding
888 /// markup element (i. e. to the `<` character).
889 ///
890 /// This position is always `<= buffer_position()`.
891 pub const fn error_position(&self) -> u64 {
892 self.state.last_error_offset
893 }
894
895 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
896 ///
897 /// If [`encoding`] feature is enabled, the used encoding may change after
898 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
899 ///
900 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
901 /// defaults to UTF-8.
902 ///
903 /// [`encoding`]: ../index.html#encoding
904 #[inline]
905 pub const fn decoder(&self) -> Decoder {
906 self.state.decoder()
907 }
908
909 /// Get the direct access to the underlying reader, but tracks the amount of
910 /// read data and update [`Reader::buffer_position()`] accordingly.
911 ///
912 /// Note, that this method gives you access to the internal reader and read
913 /// data will not be returned in any subsequent events read by `read_event`
914 /// family of methods.
915 ///
916 /// # Example
917 ///
918 /// This example demonstrates how to read stream raw bytes from an XML document.
919 /// This could be used to implement streaming read of text, or to read raw binary
920 /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
921 /// valid XML, but XML-derived file formats exist where such documents are valid).
922 ///
923 /// ```
924 /// # use pretty_assertions::assert_eq;
925 /// use std::io::{BufRead, Read};
926 /// use quick_xml::events::{BytesEnd, BytesStart, Event};
927 /// use quick_xml::reader::Reader;
928 ///
929 /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
930 /// // ^ ^ ^ ^
931 /// // 0 5 21 27
932 ///
933 /// assert_eq!(
934 /// (reader.read_event().unwrap(), reader.buffer_position()),
935 /// // 5 - end of the `<tag>`
936 /// (Event::Start(BytesStart::new("tag")), 5)
937 /// );
938 ///
939 /// // Reading directly from underlying reader will not update position
940 /// // let mut inner = reader.get_mut();
941 ///
942 /// // Reading from the stream() advances position
943 /// let mut inner = reader.stream();
944 ///
945 /// // Read binary data. We must know its size
946 /// let mut binary = [0u8; 16];
947 /// inner.read_exact(&mut binary).unwrap();
948 /// assert_eq!(&binary, b"binary << data&>");
949 /// // 21 - end of the `binary << data&>`
950 /// assert_eq!(inner.offset(), 21);
951 /// assert_eq!(reader.buffer_position(), 21);
952 ///
953 /// assert_eq!(
954 /// (reader.read_event().unwrap(), reader.buffer_position()),
955 /// // 27 - end of the `</tag>`
956 /// (Event::End(BytesEnd::new("tag")), 27)
957 /// );
958 ///
959 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
960 /// ```
961 #[inline]
962 pub fn stream(&mut self) -> BinaryStream<'_, R> {
963 BinaryStream {
964 inner: &mut self.reader,
965 offset: &mut self.state.offset,
966 }
967 }
968}
969
/// Private sync reading methods
impl<R> Reader<R> {
    /// Reads the next event, using `buf` as the storage for data read from the
    /// source.
    ///
    /// The returned event borrows either from that buffer or from the input
    /// itself, depending on the [`XmlSource`] implementation of the reader.
    ///
    /// The whole body is produced by the `read_event_impl!` macro, which is
    /// defined outside of this section. It receives `self.reader` as the source
    /// and the name of the `read_until_close` method
    /// (presumably the method it calls to read markup -- confirm against the macro).
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function that reads markup until the closing `>` is found.
    /// It expects to be called right after a `<` symbol was encountered.
    ///
    /// The whole body is produced by the `read_until_close!` macro, which is
    /// defined outside of this section and receives `self.reader` as the source.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
991
992////////////////////////////////////////////////////////////////////////////////////////////////////
993
/// Result of an attempt to read XML textual data from the source.
#[derive(Debug)]
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte.
    /// `<` was not consumed: the `read_text` tests of this module expect the
    /// position to stay unchanged in this case.
    /// Contains buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Markup(B),
    /// Start of reference (`&` character) was found in the first byte.
    /// `&` was not consumed.
    /// Contains buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Ref(B),
    /// Contains text block up to start of markup (`<` character).
    /// `<` was not consumed: the `read_text` tests of this module expect the
    /// position to advance only by the length of the text.
    UpToMarkup(&'r [u8]),
    /// Contains text block up to start of reference (`&` character).
    /// `&` was not consumed.
    UpToRef(&'r [u8]),
    /// Contains text block up to EOF: neither start of markup (`<` character)
    /// nor start of reference (`&` character) was found.
    UpToEof(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1017
/// Result of an attempt to read general reference from the reader.
#[derive(Debug)]
enum ReadRefResult<'r> {
    /// Contains text block up to end of reference (`;` character).
    /// Result includes start `&`, but not end `;`. The `;` itself is consumed:
    /// the `read_ref` tests of this module expect the position to move past it.
    Ref(&'r [u8]),
    /// Contains text block up to EOF. Neither end of reference (`;`), start of
    /// another reference (`&`) nor start of markup (`<`) characters were found.
    /// Result includes start `&`.
    UpToEof(&'r [u8]),
    /// Contains text block up to next possible reference (`&` character).
    /// Result includes start `&`. The next `&` is not consumed.
    UpToRef(&'r [u8]),
    /// Contains text block up to start of markup (`<` character).
    /// Result includes start `&`. The `<` is not consumed.
    UpToMarkup(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1037
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait. The generic one reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`: instead of copying data it returns
/// data borrowed from `Self` itself. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> io::Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;

    /// Read input until start of markup (the `<`) is found, start of general entity
    /// reference (the `&`) is found or end of input is reached.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

    /// Read input until end of general reference (the `;`) is found, start of
    /// another general reference (the `&`) is found, start of markup (the `<`)
    /// is found or end of input is reached.
    ///
    /// This method must be called when current character is `&`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_ref(&mut self, buf: B, position: &mut u64) -> ReadRefResult<'r>;

    /// Read input until the construct recognized by the given `parser` is
    /// finished (the tests of this module use it with `ElementParser` to read
    /// element tags).
    ///
    /// This method expects that the start sequence of a parser was already read.
    ///
    /// Returns a slice of data read up to the end of the thing being parsed.
    /// The end of the thing and the returned content are determined by the used parser.
    ///
    /// If input (`Self`) is exhausted and no bytes were read, or if the specified
    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
    ///
    /// # Parameters
    /// - `parser`: A parser that recognizes the end of the thing being read
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// A `P` type parameter is used to preserve state between calls to the underlying
    /// reader which provides bytes fed into the parser.
    ///
    /// [events]: crate::events::Event
    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
    where
        P: Parser;

    /// Read input until a comment, CDATA section or DOCTYPE declaration is finished.
    ///
    /// The tests of this module call this method with the input positioned at
    /// the `<` that starts the markup, and the returned slice starts with that `<`.
    ///
    /// Returns the detected kind of the markup together with a slice of data
    /// read up to the closing `>`. The `>` is consumed, but it is not included
    /// into the result.
    ///
    /// If input (`Self`) is exhausted before the end of the markup is found,
    /// returns an error (the tests expect the `SyntaxError::Unclosed*` variant
    /// that corresponds to the kind of the markup).
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut u64,
    ) -> Result<(BangType, &'r [u8]), Error>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> io::Result<Option<u8>>;
}
1140
/// Possible elements started with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// CDATA section: `<![CDATA[...]]>`
    CData,
    /// Comment: `<!--...-->`
    Comment,
    /// Document type declaration: `<!DOCTYPE...>`. Holds the [`DtdParser`] state
    /// to which the input is fed while looking for the end of the declaration.
    DocType(DtdParser),
}
1151impl BangType {
1152 #[inline(always)]
1153 const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1154 Ok(match byte {
1155 Some(b'[') => Self::CData,
1156 Some(b'-') => Self::Comment,
1157 Some(b'D') | Some(b'd') => Self::DocType(DtdParser::BeforeInternalSubset(0)),
1158 _ => return Err(SyntaxError::InvalidBangMarkup),
1159 })
1160 }
1161
1162 /// If element is finished, returns its content up to `>` symbol and
1163 /// an index of this symbol, otherwise returns `None`
1164 ///
1165 /// # Parameters
1166 /// - `buf`: buffer with data consumed on previous iterations
1167 /// - `chunk`: data read on current iteration and not yet consumed from reader
1168 #[inline(always)]
1169 fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1170 match self {
1171 Self::Comment => {
1172 for i in memchr::memchr_iter(b'>', chunk) {
1173 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1174 // <!----> - XML comment
1175 // 0123456 - i
1176 if buf.len() + i > 5 {
1177 if chunk[..i].ends_with(b"--") {
1178 // We cannot strip last `--` from the buffer because we need it in case of
1179 // check_comments enabled option. XML standard requires that comment
1180 // will not end with `--->` sequence because this is a special case of
1181 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1182 return Some((&chunk[..i], i + 1)); // +1 for `>`
1183 }
1184 // End sequence `-|->` was splitted at |
1185 // buf --/ \-- chunk
1186 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1187 return Some((&chunk[..i], i + 1)); // +1 for `>`
1188 }
1189 // End sequence `--|>` was splitted at |
1190 // buf --/ \-- chunk
1191 if i == 0 && buf.ends_with(b"--") {
1192 return Some((&[], i + 1)); // +1 for `>`
1193 }
1194 }
1195 }
1196 }
1197 Self::CData => {
1198 for i in memchr::memchr_iter(b'>', chunk) {
1199 if chunk[..i].ends_with(b"]]") {
1200 return Some((&chunk[..i], i + 1)); // +1 for `>`
1201 }
1202 // End sequence `]|]>` was splitted at |
1203 // buf --/ \-- chunk
1204 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1205 return Some((&chunk[..i], i + 1)); // +1 for `>`
1206 }
1207 // End sequence `]]|>` was splitted at |
1208 // buf --/ \-- chunk
1209 if i == 0 && buf.ends_with(b"]]") {
1210 return Some((&[], i + 1)); // +1 for `>`
1211 }
1212 }
1213 }
1214 Self::DocType(ref mut parser) => return parser.feed(buf, chunk),
1215 }
1216 None
1217 }
1218 #[inline]
1219 const fn to_err(&self) -> SyntaxError {
1220 match self {
1221 Self::CData => SyntaxError::UnclosedCData,
1222 Self::Comment => SyntaxError::UnclosedComment,
1223 Self::DocType(_) => SyntaxError::UnclosedDoctype,
1224 }
1225 }
1226}
1227
1228////////////////////////////////////////////////////////////////////////////////////////////////////
1229
1230#[cfg(test)]
1231mod test {
1232 /// Checks the internal implementation of the various reader methods
1233 macro_rules! check {
1234 (
1235 #[$test:meta]
1236 $read_event:ident,
1237 $read_until_close:ident,
1238 // constructor of the XML source on which internal functions will be called
1239 $source:path,
1240 // constructor of the buffer to which read data will stored
1241 $buf:expr
1242 $(, $async:ident, $await:ident)?
1243 ) => {
1244 mod read_bang_element {
1245 use super::*;
1246 use crate::errors::{Error, SyntaxError};
1247 use crate::reader::{BangType, DtdParser};
1248 use crate::utils::Bytes;
1249
1250 /// Checks that reading CDATA content works correctly
1251 mod cdata {
1252 use super::*;
1253 use pretty_assertions::assert_eq;
1254
1255 /// Checks that if input begins like CDATA element, but CDATA start sequence
1256 /// is not finished, parsing ends with an error
1257 #[$test]
1258 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1259 $($async)? fn not_properly_start() {
1260 let buf = $buf;
1261 let mut position = 1;
1262 let mut input = b"<![]]>other content".as_ref();
1263 // ^= 1
1264
1265 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1266 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1267 x => panic!(
1268 "Expected `Err(Syntax(_))`, but got `{:?}`",
1269 x
1270 ),
1271 }
1272 assert_eq!(position, 1);
1273 }
1274
1275 /// Checks that if CDATA startup sequence was matched, but an end sequence
1276 /// is not found, parsing ends with an error
1277 #[$test]
1278 $($async)? fn not_closed() {
1279 let buf = $buf;
1280 let mut position = 0;
1281 let mut input = b"<![CDATA[other content".as_ref();
1282 // ^= 0 ^= 22
1283
1284 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1285 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1286 x => panic!(
1287 "Expected `Err(Syntax(_))`, but got `{:?}`",
1288 x
1289 ),
1290 }
1291 assert_eq!(position, 22);
1292 }
1293
1294 /// Checks that CDATA element without content inside parsed successfully
1295 #[$test]
1296 $($async)? fn empty() {
1297 let buf = $buf;
1298 let mut position = 0;
1299 let mut input = b"<![CDATA[]]>other content".as_ref();
1300 // ^= 0 ^= 12
1301
1302 let (ty, bytes) = $source(&mut input)
1303 .read_bang_element(buf, &mut position)
1304 $(.$await)?
1305 .unwrap();
1306 assert_eq!(
1307 (ty, Bytes(bytes)),
1308 (BangType::CData, Bytes(b"<![CDATA[]]"))
1309 );
1310 assert_eq!(position, 12);
1311 }
1312
1313 /// Checks that CDATA element with content parsed successfully.
1314 /// Additionally checks that sequences inside CDATA that may look like
1315 /// a CDATA end sequence do not interrupt CDATA parsing
1316 #[$test]
1317 $($async)? fn with_content() {
1318 let buf = $buf;
1319 let mut position = 0;
1320 let mut input = b"<![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1321 // ^= 0 ^= 29
1322
1323 let (ty, bytes) = $source(&mut input)
1324 .read_bang_element(buf, &mut position)
1325 $(.$await)?
1326 .unwrap();
1327 assert_eq!(
1328 (ty, Bytes(bytes)),
1329 (BangType::CData, Bytes(b"<![CDATA[cdata]] ]>content]]"))
1330 );
1331 assert_eq!(position, 29);
1332 }
1333 }
1334
1335 /// Checks that reading XML comments works correctly. According to the [specification],
1336 /// comment data can contain any sequence except `--`:
1337 ///
1338 /// ```peg
1339 /// comment = '<--' (!'--' char)* '-->';
1340 /// char = [#x1-#x2C]
1341 /// / [#x2E-#xD7FF]
1342 /// / [#xE000-#xFFFD]
1343 /// / [#x10000-#x10FFFF]
1344 /// ```
1345 ///
1346 /// The presence of this limitation, however, is simply a poorly designed specification
1347 /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1348 /// presence of these sequences by default. This tests allow such content.
1349 ///
1350 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1351 mod comment {
1352 use super::*;
1353 use pretty_assertions::assert_eq;
1354
1355 #[$test]
1356 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1357 $($async)? fn not_properly_start() {
1358 let buf = $buf;
1359 let mut position = 1;
1360 let mut input = b"<!- -->other content".as_ref();
1361 // ^= 1
1362
1363 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1364 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1365 x => panic!(
1366 "Expected `Err(Syntax(_))`, but got `{:?}`",
1367 x
1368 ),
1369 }
1370 assert_eq!(position, 1);
1371 }
1372
1373 #[$test]
1374 $($async)? fn not_properly_end() {
1375 let buf = $buf;
1376 let mut position = 0;
1377 let mut input = b"<!->other content".as_ref();
1378 // ^= 0 ^= 17
1379
1380 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1381 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1382 x => panic!(
1383 "Expected `Err(Syntax(_))`, but got `{:?}`",
1384 x
1385 ),
1386 }
1387 assert_eq!(position, 17);
1388 }
1389
1390 #[$test]
1391 $($async)? fn not_closed1() {
1392 let buf = $buf;
1393 let mut position = 0;
1394 let mut input = b"<!--other content".as_ref();
1395 // ^= 0 ^= 17
1396
1397 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1398 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1399 x => panic!(
1400 "Expected `Err(Syntax(_))`, but got `{:?}`",
1401 x
1402 ),
1403 }
1404 assert_eq!(position, 17);
1405 }
1406
1407 #[$test]
1408 $($async)? fn not_closed2() {
1409 let buf = $buf;
1410 let mut position = 0;
1411 let mut input = b"<!-->other content".as_ref();
1412 // ^= 0 ^= 18
1413
1414 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1415 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1416 x => panic!(
1417 "Expected `Err(Syntax(_))`, but got `{:?}`",
1418 x
1419 ),
1420 }
1421 assert_eq!(position, 18);
1422 }
1423
1424 #[$test]
1425 $($async)? fn not_closed3() {
1426 let buf = $buf;
1427 let mut position = 0;
1428 let mut input = b"<!--->other content".as_ref();
1429 // ^= 0 ^= 19
1430
1431 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1432 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1433 x => panic!(
1434 "Expected `Err(Syntax(_))`, but got `{:?}`",
1435 x
1436 ),
1437 }
1438 assert_eq!(position, 19);
1439 }
1440
1441 #[$test]
1442 $($async)? fn empty() {
1443 let buf = $buf;
1444 let mut position = 0;
1445 let mut input = b"<!---->other content".as_ref();
1446 // ^= 0 ^= 7
1447
1448 let (ty, bytes) = $source(&mut input)
1449 .read_bang_element(buf, &mut position)
1450 $(.$await)?
1451 .unwrap();
1452 assert_eq!(
1453 (ty, Bytes(bytes)),
1454 (BangType::Comment, Bytes(b"<!----"))
1455 );
1456 assert_eq!(position, 7);
1457 }
1458
1459 #[$test]
1460 $($async)? fn with_content() {
1461 let buf = $buf;
1462 let mut position = 0;
1463 let mut input = b"<!--->comment<--->other content".as_ref();
1464 // ^= 0 ^= 18
1465
1466 let (ty, bytes) = $source(&mut input)
1467 .read_bang_element(buf, &mut position)
1468 $(.$await)?
1469 .unwrap();
1470 assert_eq!(
1471 (ty, Bytes(bytes)),
1472 (BangType::Comment, Bytes(b"<!--->comment<---"))
1473 );
1474 assert_eq!(position, 18);
1475 }
1476 }
1477
1478 /// Checks that reading DOCTYPE definition works correctly
1479 mod doctype {
1480 use super::*;
1481
1482 mod uppercase {
1483 use super::*;
1484 use pretty_assertions::assert_eq;
1485
1486 #[$test]
1487 $($async)? fn not_properly_start() {
1488 let buf = $buf;
1489 let mut position = 0;
1490 let mut input = b"<!D other content".as_ref();
1491 // ^= 0 ^= 17
1492
1493 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1494 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1495 x => panic!(
1496 "Expected `Err(Syntax(_))`, but got `{:?}`",
1497 x
1498 ),
1499 }
1500 assert_eq!(position, 17);
1501 }
1502
1503 #[$test]
1504 $($async)? fn without_space() {
1505 let buf = $buf;
1506 let mut position = 0;
1507 let mut input = b"<!DOCTYPEother content".as_ref();
1508 // ^= 0 ^= 22
1509
1510 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1511 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1512 x => panic!(
1513 "Expected `Err(Syntax(_))`, but got `{:?}`",
1514 x
1515 ),
1516 }
1517 assert_eq!(position, 22);
1518 }
1519
1520 #[$test]
1521 $($async)? fn empty() {
1522 let buf = $buf;
1523 let mut position = 0;
1524 let mut input = b"<!DOCTYPE>other content".as_ref();
1525 // ^= 0 ^= 10
1526
1527 let (ty, bytes) = $source(&mut input)
1528 .read_bang_element(buf, &mut position)
1529 $(.$await)?
1530 .unwrap();
1531 assert_eq!(
1532 (ty, Bytes(bytes)),
1533 (BangType::DocType(DtdParser::Finished), Bytes(b"<!DOCTYPE"))
1534 );
1535 assert_eq!(position, 10);
1536 }
1537
1538 #[$test]
1539 $($async)? fn not_closed() {
1540 let buf = $buf;
1541 let mut position = 0;
1542 let mut input = b"<!DOCTYPE other content".as_ref();
1543 // ^= 0 ^23
1544
1545 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1546 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1547 x => panic!(
1548 "Expected `Err(Syntax(_))`, but got `{:?}`",
1549 x
1550 ),
1551 }
1552 assert_eq!(position, 23);
1553 }
1554 }
1555
1556 mod lowercase {
1557 use super::*;
1558 use pretty_assertions::assert_eq;
1559
1560 #[$test]
1561 $($async)? fn not_properly_start() {
1562 let buf = $buf;
1563 let mut position = 0;
1564 let mut input = b"<!d other content".as_ref();
1565 // ^= 0 ^= 17
1566
1567 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1568 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1569 x => panic!(
1570 "Expected `Err(Syntax(_))`, but got `{:?}`",
1571 x
1572 ),
1573 }
1574 assert_eq!(position, 17);
1575 }
1576
1577 #[$test]
1578 $($async)? fn without_space() {
1579 let buf = $buf;
1580 let mut position = 0;
1581 let mut input = b"<!doctypeother content".as_ref();
1582 // ^= 0 ^= 22
1583
1584 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1585 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1586 x => panic!(
1587 "Expected `Err(Syntax(_))`, but got `{:?}`",
1588 x
1589 ),
1590 }
1591 assert_eq!(position, 22);
1592 }
1593
1594 #[$test]
1595 $($async)? fn empty() {
1596 let buf = $buf;
1597 let mut position = 0;
1598 let mut input = b"<!doctype>other content".as_ref();
1599 // ^= 0 ^= 10
1600
1601 let (ty, bytes) = $source(&mut input)
1602 .read_bang_element(buf, &mut position)
1603 $(.$await)?
1604 .unwrap();
1605 assert_eq!(
1606 (ty, Bytes(bytes)),
1607 (BangType::DocType(DtdParser::Finished), Bytes(b"<!doctype"))
1608 );
1609 assert_eq!(position, 10);
1610 }
1611
1612 #[$test]
1613 $($async)? fn not_closed() {
1614 let buf = $buf;
1615 let mut position = 0;
1616 let mut input = b"<!doctype other content".as_ref();
1617 // ^= 0 ^= 23
1618
1619 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1620 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1621 x => panic!(
1622 "Expected `Err(Syntax(_))`, but got `{:?}`",
1623 x
1624 ),
1625 }
1626 assert_eq!(position, 23);
1627 }
1628 }
1629 }
1630 }
1631
1632 mod read_text {
1633 use super::*;
1634 use crate::reader::ReadTextResult;
1635 use crate::utils::Bytes;
1636 use pretty_assertions::assert_eq;
1637
1638 #[$test]
1639 $($async)? fn empty() {
1640 let buf = $buf;
1641 let mut position = 1;
1642 let mut input = b"".as_ref();
1643 // ^= 1
1644
1645 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1646 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"")),
1647 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1648 }
1649 assert_eq!(position, 1);
1650 }
1651
1652 #[$test]
1653 $($async)? fn markup() {
1654 let buf = $buf;
1655 let mut position = 1;
1656 let mut input = b"<".as_ref();
1657 // ^= 1
1658
1659 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1660 ReadTextResult::Markup(b) => assert_eq!(b, $buf),
1661 x => panic!("Expected `Markup(_)`, but got `{:?}`", x),
1662 }
1663 assert_eq!(position, 1);
1664 }
1665
1666 #[$test]
1667 $($async)? fn ref_() {
1668 let buf = $buf;
1669 let mut position = 1;
1670 let mut input = b"&".as_ref();
1671 // ^= 1
1672
1673 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1674 ReadTextResult::Ref(b) => assert_eq!(b, $buf),
1675 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1676 }
1677 assert_eq!(position, 1);
1678 }
1679
1680 #[$test]
1681 $($async)? fn up_to_markup() {
1682 let buf = $buf;
1683 let mut position = 1;
1684 let mut input = b"a<".as_ref();
1685 // ^= 2
1686
1687 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1688 ReadTextResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1689 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1690 }
1691 assert_eq!(position, 2);
1692 }
1693
1694 #[$test]
1695 $($async)? fn up_to_ref() {
1696 let buf = $buf;
1697 let mut position = 1;
1698 let mut input = b"a&".as_ref();
1699 // ^= 2
1700
1701 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1702 ReadTextResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1703 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1704 }
1705 assert_eq!(position, 2);
1706 }
1707
1708 #[$test]
1709 $($async)? fn up_to_eof() {
1710 let buf = $buf;
1711 let mut position = 1;
1712 let mut input = b"a".as_ref();
1713 // ^= 2
1714
1715 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1716 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1717 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1718 }
1719 assert_eq!(position, 2);
1720 }
1721 }
1722
1723 mod read_ref {
1724 use super::*;
1725 use crate::reader::ReadRefResult;
1726 use crate::utils::Bytes;
1727 use pretty_assertions::assert_eq;
1728
1729 // Empty input is not allowed for `read_ref` so not tested.
1730 // Borrowed source triggers debug assertion,
1731 // buffered do nothing due to implementation details.
1732
1733 #[$test]
1734 $($async)? fn up_to_eof() {
1735 let buf = $buf;
1736 let mut position = 1;
1737 let mut input = b"&".as_ref();
1738 // ^= 2
1739
1740 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1741 ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1742 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1743 }
1744 assert_eq!(position, 2);
1745 }
1746
1747 #[$test]
1748 $($async)? fn up_to_ref() {
1749 let buf = $buf;
1750 let mut position = 1;
1751 let mut input = b"&&".as_ref();
1752 // ^= 2
1753
1754 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1755 ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1756 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1757 }
1758 assert_eq!(position, 2);
1759 }
1760
1761 #[$test]
1762 $($async)? fn up_to_markup() {
1763 let buf = $buf;
1764 let mut position = 1;
1765 let mut input = b"&<".as_ref();
1766 // ^= 2
1767
1768 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1769 ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1770 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1771 }
1772 assert_eq!(position, 2);
1773 }
1774
1775 #[$test]
1776 $($async)? fn empty_ref() {
1777 let buf = $buf;
1778 let mut position = 1;
1779 let mut input = b"&;".as_ref();
1780 // ^= 3
1781
1782 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1783 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1784 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1785 }
1786 assert_eq!(position, 3);
1787 }
1788
1789 #[$test]
1790 $($async)? fn normal() {
1791 let buf = $buf;
1792 let mut position = 1;
1793 let mut input = b"<".as_ref();
1794 // ^= 5
1795
1796 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1797 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")),
1798 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1799 }
1800 assert_eq!(position, 5);
1801 }
1802 }
1803
1804 mod read_element {
1805 use super::*;
1806 use crate::errors::{Error, SyntaxError};
1807 use crate::parser::ElementParser;
1808 use crate::utils::Bytes;
1809 use pretty_assertions::assert_eq;
1810
1811 /// Checks that nothing was read from empty buffer
1812 #[$test]
1813 $($async)? fn empty() {
1814 let buf = $buf;
1815 let mut position = 1;
1816 let mut input = b"".as_ref();
1817 // ^= 1
1818
1819 match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1820 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1821 x => panic!(
1822 "Expected `Err(Syntax(_))`, but got `{:?}`",
1823 x
1824 ),
1825 }
1826 assert_eq!(position, 1);
1827 }
1828
1829 mod open {
1830 use super::*;
1831 use pretty_assertions::assert_eq;
1832
1833 #[$test]
1834 $($async)? fn empty_tag() {
1835 let buf = $buf;
1836 let mut position = 1;
1837 let mut input = b">".as_ref();
1838 // ^= 2
1839
1840 assert_eq!(
1841 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1842 Bytes(b"")
1843 );
1844 assert_eq!(position, 2);
1845 }
1846
1847 #[$test]
1848 $($async)? fn normal() {
1849 let buf = $buf;
1850 let mut position = 1;
1851 let mut input = b"tag>".as_ref();
1852 // ^= 5
1853
1854 assert_eq!(
1855 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1856 Bytes(b"tag")
1857 );
1858 assert_eq!(position, 5);
1859 }
1860
1861 #[$test]
1862 $($async)? fn empty_ns_empty_tag() {
1863 let buf = $buf;
1864 let mut position = 1;
1865 let mut input = b":>".as_ref();
1866 // ^= 3
1867
1868 assert_eq!(
1869 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1870 Bytes(b":")
1871 );
1872 assert_eq!(position, 3);
1873 }
1874
1875 #[$test]
1876 $($async)? fn empty_ns() {
1877 let buf = $buf;
1878 let mut position = 1;
1879 let mut input = b":tag>".as_ref();
1880 // ^= 6
1881
1882 assert_eq!(
1883 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1884 Bytes(b":tag")
1885 );
1886 assert_eq!(position, 6);
1887 }
1888
1889 #[$test]
1890 $($async)? fn with_attributes() {
1891 let buf = $buf;
1892 let mut position = 1;
1893 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1894 // ^= 39
1895
1896 assert_eq!(
1897 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1898 Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
1899 );
1900 assert_eq!(position, 39);
1901 }
1902 }
1903
1904 mod self_closed {
1905 use super::*;
1906 use pretty_assertions::assert_eq;
1907
1908 #[$test]
1909 $($async)? fn empty_tag() {
1910 let buf = $buf;
1911 let mut position = 1;
1912 let mut input = b"/>".as_ref();
1913 // ^= 3
1914
1915 assert_eq!(
1916 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1917 Bytes(b"/")
1918 );
1919 assert_eq!(position, 3);
1920 }
1921
1922 #[$test]
1923 $($async)? fn normal() {
1924 let buf = $buf;
1925 let mut position = 1;
1926 let mut input = b"tag/>".as_ref();
1927 // ^= 6
1928
1929 assert_eq!(
1930 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1931 Bytes(b"tag/")
1932 );
1933 assert_eq!(position, 6);
1934 }
1935
1936 #[$test]
1937 $($async)? fn empty_ns_empty_tag() {
1938 let buf = $buf;
1939 let mut position = 1;
1940 let mut input = b":/>".as_ref();
1941 // ^= 4
1942
1943 assert_eq!(
1944 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1945 Bytes(b":/")
1946 );
1947 assert_eq!(position, 4);
1948 }
1949
1950 #[$test]
1951 $($async)? fn empty_ns() {
1952 let buf = $buf;
1953 let mut position = 1;
1954 let mut input = b":tag/>".as_ref();
1955 // ^= 7
1956
1957 assert_eq!(
1958 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1959 Bytes(b":tag/")
1960 );
1961 assert_eq!(position, 7);
1962 }
1963
1964 #[$test]
1965 $($async)? fn with_attributes() {
1966 let buf = $buf;
1967 let mut position = 1;
1968 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1969 // ^= 42
1970
1971 assert_eq!(
1972 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1973 Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
1974 );
1975 assert_eq!(position, 42);
1976 }
1977 }
1978
1979 mod close {
1980 use super::*;
1981 use pretty_assertions::assert_eq;
1982
1983 #[$test]
1984 $($async)? fn empty_tag() {
1985 let buf = $buf;
1986 let mut position = 1;
1987 let mut input = b"/ >".as_ref();
1988 // ^= 4
1989
1990 assert_eq!(
1991 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1992 Bytes(b"/ ")
1993 );
1994 assert_eq!(position, 4);
1995 }
1996
1997 #[$test]
1998 $($async)? fn normal() {
1999 let buf = $buf;
2000 let mut position = 1;
2001 let mut input = b"/tag>".as_ref();
2002 // ^= 6
2003
2004 assert_eq!(
2005 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2006 Bytes(b"/tag")
2007 );
2008 assert_eq!(position, 6);
2009 }
2010
2011 #[$test]
2012 $($async)? fn empty_ns_empty_tag() {
2013 let buf = $buf;
2014 let mut position = 1;
2015 let mut input = b"/:>".as_ref();
2016 // ^= 4
2017
2018 assert_eq!(
2019 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2020 Bytes(b"/:")
2021 );
2022 assert_eq!(position, 4);
2023 }
2024
2025 #[$test]
2026 $($async)? fn empty_ns() {
2027 let buf = $buf;
2028 let mut position = 1;
2029 let mut input = b"/:tag>".as_ref();
2030 // ^= 7
2031
2032 assert_eq!(
2033 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2034 Bytes(b"/:tag")
2035 );
2036 assert_eq!(position, 7);
2037 }
2038
2039 #[$test]
2040 $($async)? fn with_attributes() {
2041 let buf = $buf;
2042 let mut position = 1;
2043 let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
2044 // ^= 40
2045
2046 assert_eq!(
2047 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2048 Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#)
2049 );
2050 assert_eq!(position, 40);
2051 }
2052 }
2053 }
2054
2055 /// Ensures, that no empty `Text` events are generated
2056 mod $read_event {
2057 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
2058 use crate::reader::Reader;
2059 use pretty_assertions::assert_eq;
2060
2061 /// When `encoding` feature is enabled, encoding should be detected
2062 /// from BOM (UTF-8) and BOM should be stripped.
2063 ///
2064 /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
2065 /// character should be stripped for consistency
2066 #[$test]
2067 $($async)? fn bom_from_reader() {
2068 let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
2069
2070 assert_eq!(
2071 reader.$read_event($buf) $(.$await)? .unwrap(),
2072 Event::Text(BytesText::from_escaped("\u{feff}"))
2073 );
2074
2075 assert_eq!(
2076 reader.$read_event($buf) $(.$await)? .unwrap(),
2077 Event::Eof
2078 );
2079 }
2080
2081 /// When parsing from &str, encoding is fixed (UTF-8), so
2082 /// - when `encoding` feature is disabled, the behavior the
2083 /// same as in `bom_from_reader` text
2084 /// - when `encoding` feature is enabled, the behavior should
2085 /// stay consistent, so the first BOM character is stripped
2086 #[$test]
2087 $($async)? fn bom_from_str() {
2088 let mut reader = Reader::from_str("\u{feff}\u{feff}");
2089
2090 assert_eq!(
2091 reader.$read_event($buf) $(.$await)? .unwrap(),
2092 Event::Text(BytesText::from_escaped("\u{feff}"))
2093 );
2094
2095 assert_eq!(
2096 reader.$read_event($buf) $(.$await)? .unwrap(),
2097 Event::Eof
2098 );
2099 }
2100
2101 #[$test]
2102 $($async)? fn declaration() {
2103 let mut reader = Reader::from_str("<?xml ?>");
2104
2105 assert_eq!(
2106 reader.$read_event($buf) $(.$await)? .unwrap(),
2107 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
2108 );
2109 }
2110
2111 #[$test]
2112 $($async)? fn doctype() {
2113 let mut reader = Reader::from_str("<!DOCTYPE x>");
2114
2115 assert_eq!(
2116 reader.$read_event($buf) $(.$await)? .unwrap(),
2117 Event::DocType(BytesText::from_escaped("x"))
2118 );
2119 }
2120
2121 #[$test]
2122 $($async)? fn processing_instruction() {
2123 let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
2124
2125 assert_eq!(
2126 reader.$read_event($buf) $(.$await)? .unwrap(),
2127 Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
2128 );
2129 }
2130
2131 /// Lone closing tags are not allowed, so testing it together with start tag
2132 #[$test]
2133 $($async)? fn start_and_end() {
2134 let mut reader = Reader::from_str("<tag></tag>");
2135
2136 assert_eq!(
2137 reader.$read_event($buf) $(.$await)? .unwrap(),
2138 Event::Start(BytesStart::new("tag"))
2139 );
2140
2141 assert_eq!(
2142 reader.$read_event($buf) $(.$await)? .unwrap(),
2143 Event::End(BytesEnd::new("tag"))
2144 );
2145 }
2146
2147 #[$test]
2148 $($async)? fn empty() {
2149 let mut reader = Reader::from_str("<tag/>");
2150
2151 assert_eq!(
2152 reader.$read_event($buf) $(.$await)? .unwrap(),
2153 Event::Empty(BytesStart::new("tag"))
2154 );
2155 }
2156
2157 #[$test]
2158 $($async)? fn text() {
2159 let mut reader = Reader::from_str("text");
2160
2161 assert_eq!(
2162 reader.$read_event($buf) $(.$await)? .unwrap(),
2163 Event::Text(BytesText::from_escaped("text"))
2164 );
2165 }
2166
2167 #[$test]
2168 $($async)? fn cdata() {
2169 let mut reader = Reader::from_str("<![CDATA[]]>");
2170
2171 assert_eq!(
2172 reader.$read_event($buf) $(.$await)? .unwrap(),
2173 Event::CData(BytesCData::new(""))
2174 );
2175 }
2176
2177 #[$test]
2178 $($async)? fn comment() {
2179 let mut reader = Reader::from_str("<!---->");
2180
2181 assert_eq!(
2182 reader.$read_event($buf) $(.$await)? .unwrap(),
2183 Event::Comment(BytesText::from_escaped(""))
2184 );
2185 }
2186
2187 #[$test]
2188 $($async)? fn eof() {
2189 let mut reader = Reader::from_str("");
2190
2191 assert_eq!(
2192 reader.$read_event($buf) $(.$await)? .unwrap(),
2193 Event::Eof
2194 );
2195 }
2196 }
2197 };
2198 }
2199
2200 // Export macros for the child modules:
2201 // - buffered_reader
2202 // - slice_reader
2203 pub(super) use check;
2204}