quick_xml/reader/mod.rs
1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9use crate::errors::{Error, IllFormedError, SyntaxError};
10use crate::events::{BytesRef, Event};
11use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
12use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// Current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by changing properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether a lone ampersand character (without a paired semicolon) should be
    /// allowed in textual content. Unless enabled, in case of a dangling ampersand,
    /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
    ///
    /// Default: `false`
    ///
    /// # Example
    ///
    /// ```
    /// # use quick_xml::events::{BytesRef, BytesText, Event};
    /// # use quick_xml::reader::Reader;
    /// # use pretty_assertions::assert_eq;
    /// let mut reader = Reader::from_str("text with & & & alone");
    /// reader.config_mut().allow_dangling_amp = true;
    ///
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
    pub allow_dangling_amp: bool,

    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both of these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both of these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}
203
204impl Config {
205 /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
206 ///
207 /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
208 ///
209 /// WARNING: With this option every text events will be trimmed which is
210 /// incorrect behavior when text events delimited by comments, processing
211 /// instructions or CDATA sections. To correctly trim data manually apply
212 /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
213 /// only to necessary events.
214 /// </div>
215 ///
216 /// [`trim_text_start`]: Self::trim_text_start
217 /// [`trim_text_end`]: Self::trim_text_end
218 /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
219 /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
220 #[inline]
221 pub fn trim_text(&mut self, trim: bool) {
222 self.trim_text_start = trim;
223 self.trim_text_end = trim;
224 }
225
226 /// Turn on or off all checks for well-formedness. Currently it is that settings:
227 /// - [`check_comments`](Self::check_comments)
228 /// - [`check_end_names`](Self::check_end_names)
229 #[inline]
230 pub fn enable_all_checks(&mut self, enable: bool) {
231 self.check_comments = enable;
232 self.check_end_names = enable;
233 }
234}
235
236impl Default for Config {
237 fn default() -> Self {
238 Self {
239 allow_dangling_amp: false,
240 allow_unmatched_ends: false,
241 check_comments: false,
242 check_end_names: true,
243 expand_empty_elements: false,
244 trim_markup_names_in_closing_tags: true,
245 trim_text_start: false,
246 trim_text_end: false,
247 }
248 }
249}
250
251////////////////////////////////////////////////////////////////////////////////////////////////////
252
/// Drives the reader state machine until it yields one event or one error.
///
/// The body loops over `$self.state.state`: arms that produce no event switch
/// the state and `continue`, every other arm `break`s out with a `Result`.
///
/// Parameters:
/// - `$self`: the reader whose `state` field is inspected and updated
/// - `$buf`: buffer handed to the `read_*` calls of `$reader`; it is re-assigned
///   when `read_text` hands the buffer back for the next iteration
/// - `$reader`: expression for the data source; it must provide `detect_encoding`
///   (or `remove_utf8_bom` without the `encoding` feature), `read_ref`,
///   `skip_whitespace` and `read_text`
/// - `$read_until_close`: name of the method of `$self` that is called in the
///   `InsideMarkup` state
/// - `$await`: optional identifier appended as `.$await` to every read call
///   (presumably `await` for the asynchronous reader)
///
/// After the loop, any error except `Error::IllFormed` and the `Eof` event
/// switch the reader to the `Done` state.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If encoding is set explicitly, we do not need to detect it. For example,
                    // explicit UTF-8 is set automatically if Reader was created using `from_str`.
                    // But we still need to remove BOM for consistency with the path where
                    // the `encoding` feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideRef => { // Go to InsideText
                    // Remember where the reference starts to report errors at that position
                    let start = $self.state.offset;
                    match $reader.read_ref($buf, &mut $self.state.offset) $(.$await)? {
                        // Emit reference, go to InsideText state
                        ReadRefResult::Ref(bytes) => {
                            $self.state.state = ParseState::InsideText;
                            // +1 to skip start `&`
                            // -1 to skip end `;`
                            Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..bytes.len() - 1], $self.decoder())))
                        }
                        // Go to Done state
                        ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::Done;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToEof(_) => {
                            $self.state.state = ParseState::Done;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Do not change state, stay in InsideRef
                        ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToRef(_) => {
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Go to InsideMarkup state
                        ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::InsideMarkup;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToMarkup(_) => {
                            $self.state.state = ParseState::InsideMarkup;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        ReadRefResult::Err(e) => Err(Error::Io(e.into())),
                    }
                }
                ParseState::InsideText => { // Go to InsideMarkup, InsideRef or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::Ref(buf) => {
                            $self.state.state = ParseState::InsideRef;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToRef(bytes) => {
                            $self.state.state = ParseState::InsideRef;
                            // Return Text event with `bytes` content; the reference
                            // itself is handled by the next read in the InsideRef state
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required.
                            // Return Text event with `bytes` content or Eof if it is empty
                            let event = $self.state.emit_text(bytes);
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we already consume the wrong data
            // and change the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error, as well as Eof, is final
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
382
/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state.
///
/// If reading fails, `last_error_offset` is set to the offset that was current
/// on entry, so the error is reported at the `<` of the broken markup.
///
/// Parameters:
/// - `$self`: the reader whose `state` field is updated and used to emit events
/// - `$buf`: buffer handed to the `read_*` calls of `$reader`
/// - `$reader`: expression for the data source; it must provide `peek_one`,
///   `read_bang_element` and `read_with`
/// - `$await`: optional identifier appended as `.$await` to every read call
///   (presumably `await` for the asynchronous reader)
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        // Switch the state before reading, so it is in effect for every outcome
        // below unless one of the `emit_*` calls changes it again
        $self.state.state = ParseState::InsideText;

        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we anyway allow to parse them because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which user should call manually
            // - if we just look for `>` we will parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >` which probably no existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`
                $self.state.last_error_offset = start;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(Error::Io(e.into())),
        }
    }};
}
480
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events with `$read_event` until the [`Event::End`] whose name equals
/// `$end` is found at nesting depth zero and evaluates to the range
/// `start..end`, where `start` is the position on entry and `end` is the
/// position taken just before the matching end event was read.
///
/// On a read error, or when [`Event::Eof`] is met first, the macro `return`s
/// an `Err` from the enclosing function. The `trim_text_start` option is
/// restored on every exit path.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // Because we take position after the event before the End event,
        // it is important that this position indicates beginning of the End event.
        // If between last event and the End event would be only spaces, then we
        // take position before the spaces, but spaces would be skipped without
        // generating event if `trim_text_start` is set to `true`. To prevent that
        // we temporarily disable start text trimming.
        //
        // We also cannot take position after getting End event, because if
        // `trim_markup_names_in_closing_tags` is set to `true` (which is the default),
        // we do not know the real size that the End event occupies in
        // the source and cannot correct the position after the End event.
        // So, we in any case should tweak parser configuration.
        let config = $self.config_mut();
        let trim = config.trim_text_start;
        config.trim_text_start = false;

        let start = $self.buffer_position();
        // Number of currently open nested elements that have the same name as `$end`
        let mut depth = 0;
        loop {
            $clear
            // Position before the next event; for the matching End event this
            // is where that event begins
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(e);
                }

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        $self.config_mut().trim_text_start = trim;
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                _ => (),
            }
        }
    }};
}
535
536#[cfg(feature = "async-tokio")]
537mod async_tokio;
538mod buffered_reader;
539mod ns_reader;
540mod slice_reader;
541mod state;
542
543pub use ns_reader::NsReader;
544
/// Range of input in bytes, that corresponds to some piece of XML.
///
/// Being a [`Range`], it is half-open: `start` is the offset of the first byte
/// of the piece and `end` is the offset just past its last byte.
pub type Span = Range<u64>;
547
548////////////////////////////////////////////////////////////////////////////////////////////////////
549
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init         -- "(no event)"\n                                       --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText   -- "#lt;false#gt;\n(no event)"\nText                    --> InsideMarkup
///     InsideRef    -- "(no event)"\nGeneralRef                             --> InsideText
///   end
///   InsideText     -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty    -- End                   --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. On the first
    /// read the byte order mark is processed (the encoding is detected from it
    /// when the `encoding` feature is enabled, otherwise a UTF-8 BOM is removed)
    /// and the reader moves to the `InsideText` state without emitting an event.
    /// The reader will never return to this state.
    Init,
    /// State after seeing the `&` symbol in textual content.
    ///
    /// If a complete reference is read, a `GeneralRef` event is emitted and the
    /// reader moves to the `InsideText` state. Otherwise the read either fails
    /// with `IllFormedError::UnclosedReference` or, if
    /// [`Config::allow_dangling_amp`] is set, emits the bytes as a `Text` event.
    /// In both of these cases the next state depends on the read result:
    /// `Done` for `UpToEof`, `InsideRef` (unchanged) for `UpToRef` and
    /// `InsideMarkup` for `UpToMarkup`.
    InsideRef,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup or the
    /// `&` symbol of a reference. All bytes before that symbol will be returned
    /// in the [`Event::Text`] event. After that the reader moves to the
    /// `InsideMarkup` or `InsideRef` state respectively, or to the `Done` state
    /// if the end of input was reached first.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in the `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when the `Eof` event is generated or when any
    /// error other than `Error::IllFormed` is returned (ill-formed errors keep
    /// the state chosen by the failed step, so parsing can be continued).
    /// This is the last state, the reader stays in it forever.
    Done,
}
600
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding regardless of how it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match self {
            Self::Implicit(e)
            | Self::Explicit(e)
            | Self::BomDetected(e)
            | Self::XmlDetected(e) => *e,
        }
    }

    /// Tells whether a later source of information is still allowed to replace
    /// this encoding. The match is kept exhaustive, so a new variant has to
    /// be classified explicitly.
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
647
648////////////////////////////////////////////////////////////////////////////////////////////////////
649
/// Byte-level view of the reader wrapped by a [`Reader`]. Every byte taken
/// through this stream is also added to the position reported by
/// [`Reader::buffer_position()`].
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// Wrapped source the bytes are pulled from
    inner: &'r mut R,
    /// Borrowed position counter, advanced by the amount of bytes read or consumed
    offset: &'r mut u64,
}

impl<'r, R> BinaryStream<'r, R> {
    /// Current position, in bytes, in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Shared access to the wrapped reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        &*self.inner
    }

    /// Exclusive access to the wrapped reader.
    ///
    /// Reading through the returned reference bypasses the position tracking,
    /// which leads to incorrect positions of errors. Read from the stream
    /// itself instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        &mut *self.inner
    }
}

impl<'r, R> io::Read for BinaryStream<'r, R>
where
    R: io::Read,
{
    /// Delegates to the wrapped reader and, on success, advances the position
    /// by the number of bytes that were actually read.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(count) => {
                *self.offset += count as u64;
                Ok(count)
            }
            Err(err) => Err(err),
        }
    }
}

impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Delegates to the wrapped reader; the position is left untouched because
    /// nothing is consumed yet.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Consumes `amt` bytes of the wrapped reader and advances the position by
    /// the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        let advanced = amt as u64;
        self.inner.consume(amt);
        *self.offset += advanced;
    }
}
709
710////////////////////////////////////////////////////////////////////////////////////////////////////
711
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.config_mut().trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Debug, Clone)]
pub struct Reader<R> {
    /// Source of data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
773
774/// Builder methods
775impl<R> Reader<R> {
776 /// Creates a `Reader` that reads from a given reader.
777 pub fn from_reader(reader: R) -> Self {
778 Self {
779 reader,
780 state: ReaderState::default(),
781 }
782 }
783
784 /// Returns reference to the parser configuration
785 pub const fn config(&self) -> &Config {
786 &self.state.config
787 }
788
789 /// Returns mutable reference to the parser configuration
790 pub fn config_mut(&mut self) -> &mut Config {
791 &mut self.state.config
792 }
793}
794
795/// Getters
796impl<R> Reader<R> {
/// Consumes `Reader` returning the underlying reader
///
/// Can be used to compute line and column of a parsing error position
///
/// # Examples
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use std::{str, io::Cursor};
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// // NOTE: the indentation inside of the string is significant, the column
/// // asserted below is counted from the start of the third line
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag3>Test 2</tag3>
///              </tag1>"#;
/// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
/// let mut buf = Vec::new();
///
/// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
///     // We know that the size cannot exceed usize::MAX because we created parser from single &[u8]
///     let end_pos = reader.buffer_position() as usize;
///     let mut cursor = reader.into_inner();
///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
///         .expect("can't make a string");
///     let mut line = 1;
///     let mut column = 0;
///     for c in s.chars() {
///         if c == '\n' {
///             line += 1;
///             column = 0;
///         } else {
///             column += 1;
///         }
///     }
///     (line, column)
/// }
///
/// loop {
///     match reader.read_event_into(&mut buf) {
///         Ok(Event::Start(ref e)) => match e.name().as_ref() {
///             b"tag1" | b"tag2" => (),
///             tag => {
///                 assert_eq!(b"tag3", tag);
///                 assert_eq!((3, 22), into_line_and_column(reader));
///                 break;
///             }
///         },
///         Ok(Event::Eof) => unreachable!(),
///         _ => (),
///     }
///     buf.clear();
/// }
/// ```
pub fn into_inner(self) -> R {
    self.reader
}
854
/// Gets a reference to the underlying reader.
///
/// The returned reference is shared, so the parser position is not affected.
pub const fn get_ref(&self) -> &R {
    &self.reader
}
859
/// Gets a mutable reference to the underlying reader.
///
/// Avoid reading from this reader because this will not update the reader's
/// position and will lead to incorrect positions of errors. If you want to
/// read, use [`stream()`] instead.
///
/// [`stream()`]: Self::stream
pub fn get_mut(&mut self) -> &mut R {
    &mut self.reader
}
870
/// Gets the byte position in the input data just after the last emitted event
/// (i.e. this is the position where the data of the last event ends).
///
/// Note that for text events which originally ended with whitespace characters
/// (` `, `\t`, `\r`, and `\n`), if [`Config::trim_text_end`] is set this is the
/// position before trim, not the position of the last byte of the
/// [`Event::Text`] content.
pub const fn buffer_position(&self) -> u64 {
    self.state.offset
}
880
881 /// Gets the last error byte position in the input data. If there is no errors
882 /// yet, returns `0`.
883 ///
884 /// Unlike `buffer_position` it will point to the place where it is rational
885 /// to report error to the end user. For example, all [`SyntaxError`]s are
886 /// reported when the parser sees EOF inside of some kind of markup. The
887 /// `buffer_position()` will point to the last byte of input which is not
888 /// very useful. `error_position()` will point to the start of corresponding
889 /// markup element (i. e. to the `<` character).
890 ///
891 /// This position is always `<= buffer_position()`.
892 pub const fn error_position(&self) -> u64 {
893 self.state.last_error_offset
894 }
895
896 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
897 ///
898 /// If [`encoding`] feature is enabled, the used encoding may change after
899 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
900 ///
901 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
902 /// defaults to UTF-8.
903 ///
904 /// [`encoding`]: ../index.html#encoding
905 #[inline]
906 pub const fn decoder(&self) -> Decoder {
907 self.state.decoder()
908 }
909
910 /// Get the direct access to the underlying reader, but tracks the amount of
911 /// read data and update [`Reader::buffer_position()`] accordingly.
912 ///
913 /// Note, that this method gives you access to the internal reader and read
914 /// data will not be returned in any subsequent events read by `read_event`
915 /// family of methods.
916 ///
917 /// # Example
918 ///
919 /// This example demonstrates how to read stream raw bytes from an XML document.
920 /// This could be used to implement streaming read of text, or to read raw binary
921 /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
922 /// valid XML, but XML-derived file formats exist where such documents are valid).
923 ///
924 /// ```
925 /// # use pretty_assertions::assert_eq;
926 /// use std::io::{BufRead, Read};
927 /// use quick_xml::events::{BytesEnd, BytesStart, Event};
928 /// use quick_xml::reader::Reader;
929 ///
930 /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
931 /// // ^ ^ ^ ^
932 /// // 0 5 21 27
933 ///
934 /// assert_eq!(
935 /// (reader.read_event().unwrap(), reader.buffer_position()),
936 /// // 5 - end of the `<tag>`
937 /// (Event::Start(BytesStart::new("tag")), 5)
938 /// );
939 ///
940 /// // Reading directly from underlying reader will not update position
941 /// // let mut inner = reader.get_mut();
942 ///
943 /// // Reading from the stream() advances position
944 /// let mut inner = reader.stream();
945 ///
946 /// // Read binary data. We must know its size
947 /// let mut binary = [0u8; 16];
948 /// inner.read_exact(&mut binary).unwrap();
949 /// assert_eq!(&binary, b"binary << data&>");
950 /// // 21 - end of the `binary << data&>`
951 /// assert_eq!(inner.offset(), 21);
952 /// assert_eq!(reader.buffer_position(), 21);
953 ///
954 /// assert_eq!(
955 /// (reader.read_event().unwrap(), reader.buffer_position()),
956 /// // 27 - end of the `</tag>`
957 /// (Event::End(BytesEnd::new("tag")), 27)
958 /// );
959 ///
960 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
961 /// ```
962 #[inline]
963 pub fn stream(&mut self) -> BinaryStream<'_, R> {
964 BinaryStream {
965 inner: &mut self.reader,
966 offset: &mut self.state.offset,
967 }
968 }
969}
970
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// The body is an expansion of the `read_event_impl!` macro, which is defined
    /// outside of this section. It receives `self`, the buffer, the underlying
    /// reader and the identifier `read_until_close` (presumably the method the
    /// expansion calls when markup is found; confirm against the macro).
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// The body is an expansion of the `read_until_close!` macro, which is defined
    /// outside of this section.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
992
993////////////////////////////////////////////////////////////////////////////////////////////////////
994
/// Result of an attempt to read XML textual data from the source.
///
/// # Parameters
/// - `'r`: lifetime of the returned text slices
/// - `B`: type of the buffer that is handed back to the caller when no text was read
#[derive(Debug)]
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte. `<` was consumed.
    /// Contains the buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Markup(B),
    /// Start of reference (`&` character) was found in the first byte.
    /// `&` was not consumed.
    /// Contains the buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Ref(B),
    /// Contains the text block up to the start of markup (`<` character). `<` was consumed.
    UpToMarkup(&'r [u8]),
    /// Contains the text block up to the start of a reference (`&` character).
    /// `&` was not consumed.
    UpToRef(&'r [u8]),
    /// Contains the text block up to EOF; neither the start of markup (`<` character)
    /// nor the start of a reference (`&` character) was found.
    UpToEof(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1018
/// Result of an attempt to read general reference from the reader.
#[derive(Debug)]
enum ReadRefResult<'r> {
    /// Contains the text block up to the end of the reference (`;` character).
    /// Result includes the start `&`.
    ///
    /// NOTE(review): this comment used to say that the end `;` is not included,
    /// but the `read_ref` tests in this file expect `b"&;"` for the input `&;`,
    /// i.e. the terminating `;` is part of the slice. Confirm against the
    /// `XmlSource` implementations.
    Ref(&'r [u8]),
    /// Contains the text block up to EOF. Neither the end of the reference (`;`),
    /// the start of another reference (`&`) nor the start of markup (`<`) was found.
    /// Result includes the start `&`.
    UpToEof(&'r [u8]),
    /// Contains the text block up to the next possible reference (`&` character).
    /// Result includes the start `&`.
    UpToRef(&'r [u8]),
    /// Contains the text block up to the start of markup (`<` character).
    /// Result includes the start `&`.
    UpToMarkup(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1038
1039/// Represents an input for a reader that can return borrowed data.
1040///
1041/// There are two implementors of this trait: generic one that read data from
1042/// `Self`, copies some part of it into a provided buffer of type `B` and then
1043/// returns data that borrow from that buffer.
1044///
1045/// The other implementor is for `&[u8]` and instead of copying data returns
1046/// borrowed data from `Self` instead. This implementation allows zero-copy
1047/// deserialization.
1048///
1049/// # Parameters
1050/// - `'r`: lifetime of a buffer from which events will borrow
1051/// - `B`: a type of a buffer that can be used to store data read from `Self` and
1052/// from which events can borrow
1053trait XmlSource<'r, B> {
1054 /// Removes UTF-8 BOM if it is present
1055 #[cfg(not(feature = "encoding"))]
1056 fn remove_utf8_bom(&mut self) -> io::Result<()>;
1057
1058 /// Determines encoding from the start of input and removes BOM if it is present
1059 #[cfg(feature = "encoding")]
1060 fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;
1061
1062 /// Read input until start of markup (the `<`) is found, start of general entity
1063 /// reference (the `&`) is found or end of input is reached.
1064 ///
1065 /// # Parameters
1066 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1067 /// from which [events] could borrow their data
1068 /// - `position`: Will be increased by amount of bytes consumed
1069 ///
1070 /// [events]: crate::events::Event
1071 fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;
1072
1073 /// Read input until end of general reference (the `;`) is found, start of
1074 /// another general reference (the `&`) is found or end of input is reached.
1075 ///
1076 /// This method must be called when current character is `&`.
1077 ///
1078 /// # Parameters
1079 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1080 /// from which [events] could borrow their data
1081 /// - `position`: Will be increased by amount of bytes consumed
1082 ///
1083 /// [events]: crate::events::Event
1084 fn read_ref(&mut self, buf: B, position: &mut u64) -> ReadRefResult<'r>;
1085
1086 /// Read input until processing instruction is finished.
1087 ///
1088 /// This method expect that start sequence of a parser already was read.
1089 ///
1090 /// Returns a slice of data read up to the end of the thing being parsed.
1091 /// The end of thing and the returned content is determined by the used parser.
1092 ///
1093 /// If input (`Self`) is exhausted and no bytes was read, or if the specified
1094 /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
1095 ///
1096 /// # Parameters
1097 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1098 /// from which [events] could borrow their data
1099 /// - `position`: Will be increased by amount of bytes consumed
1100 ///
1101 /// A `P` type parameter is used to preserve state between calls to the underlying
1102 /// reader which provides bytes fed into the parser.
1103 ///
1104 /// [events]: crate::events::Event
1105 fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
1106 where
1107 P: Parser;
1108
1109 /// Read input until comment or CDATA is finished.
1110 ///
1111 /// This method expect that `<` already was read.
1112 ///
1113 /// Returns a slice of data read up to end of comment or CDATA (`>`),
1114 /// which does not include into result.
1115 ///
1116 /// If input (`Self`) is exhausted and nothing was read, returns `None`.
1117 ///
1118 /// # Parameters
1119 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1120 /// from which [events] could borrow their data
1121 /// - `position`: Will be increased by amount of bytes consumed
1122 ///
1123 /// [events]: crate::events::Event
1124 fn read_bang_element(
1125 &mut self,
1126 buf: B,
1127 position: &mut u64,
1128 ) -> Result<(BangType, &'r [u8]), Error>;
1129
1130 /// Consume and discard all the whitespace until the next non-whitespace
1131 /// character or EOF.
1132 ///
1133 /// # Parameters
1134 /// - `position`: Will be increased by amount of bytes consumed
1135 fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;
1136
1137 /// Return one character without consuming it, so that future `read_*` calls
1138 /// will still include it. On EOF, return `None`.
1139 fn peek_one(&mut self) -> io::Result<Option<u8>>;
1140}
1141
/// Possible elements started with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Holds the [`DtdParser`] to which input is fed in order to
    /// find the end of the declaration.
    ///
    /// NOTE(review): this comment used to describe the payload as a balance of
    /// `<` (+1) and `>` (-1); confirm how `DtdParser` keeps track of that.
    DocType(DtdParser),
}
1152impl BangType {
1153 #[inline(always)]
1154 const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1155 Ok(match byte {
1156 Some(b'[') => Self::CData,
1157 Some(b'-') => Self::Comment,
1158 Some(b'D') | Some(b'd') => Self::DocType(DtdParser::BeforeInternalSubset(0)),
1159 _ => return Err(SyntaxError::InvalidBangMarkup),
1160 })
1161 }
1162
1163 /// If element is finished, returns its content up to `>` symbol and
1164 /// an index of this symbol, otherwise returns `None`
1165 ///
1166 /// # Parameters
1167 /// - `buf`: buffer with data consumed on previous iterations
1168 /// - `chunk`: data read on current iteration and not yet consumed from reader
1169 #[inline(always)]
1170 fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<usize> {
1171 match self {
1172 Self::Comment => {
1173 for i in memchr::memchr_iter(b'>', chunk) {
1174 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1175 // <!----> - XML comment
1176 // 0123456 - i
1177 if buf.len() + i > 5 {
1178 if chunk[..i].ends_with(b"--") {
1179 // We cannot strip last `--` from the buffer because we need it in case of
1180 // check_comments enabled option. XML standard requires that comment
1181 // will not end with `--->` sequence because this is a special case of
1182 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1183 return Some(i);
1184 }
1185 // End sequence `-|->` was splitted at |
1186 // buf --/ \-- chunk
1187 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1188 return Some(i);
1189 }
1190 // End sequence `--|>` was splitted at |
1191 // buf --/ \-- chunk
1192 if i == 0 && buf.ends_with(b"--") {
1193 return Some(i);
1194 }
1195 }
1196 }
1197 }
1198 Self::CData => {
1199 for i in memchr::memchr_iter(b'>', chunk) {
1200 if chunk[..i].ends_with(b"]]") {
1201 return Some(i);
1202 }
1203 // End sequence `]|]>` was splitted at |
1204 // buf --/ \-- chunk
1205 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1206 return Some(i);
1207 }
1208 // End sequence `]]|>` was splitted at |
1209 // buf --/ \-- chunk
1210 if i == 0 && buf.ends_with(b"]]") {
1211 return Some(i);
1212 }
1213 }
1214 }
1215 Self::DocType(ref mut parser) => return parser.feed(buf, chunk),
1216 }
1217 None
1218 }
1219 #[inline]
1220 const fn to_err(&self) -> SyntaxError {
1221 match self {
1222 Self::CData => SyntaxError::UnclosedCData,
1223 Self::Comment => SyntaxError::UnclosedComment,
1224 Self::DocType(_) => SyntaxError::UnclosedDoctype,
1225 }
1226 }
1227}
1228
1229////////////////////////////////////////////////////////////////////////////////////////////////////
1230
1231#[cfg(test)]
1232mod test {
1233 /// Checks the internal implementation of the various reader methods
1234 macro_rules! check {
1235 (
1236 #[$test:meta]
1237 $read_event:ident,
1238 $read_until_close:ident,
1239 // constructor of the XML source on which internal functions will be called
1240 $source:path,
1241 $skip:literal,
1242 // constructor of the buffer to which read data will stored
1243 $buf:expr
1244 $(, $async:ident, $await:ident)?
1245 ) => {
1246 mod read_bang_element {
1247 use super::*;
1248 use crate::errors::{Error, SyntaxError};
1249 use crate::reader::{BangType, DtdParser};
1250 use crate::utils::Bytes;
1251
1252 /// Checks that reading CDATA content works correctly
1253 mod cdata {
1254 use super::*;
1255 use pretty_assertions::assert_eq;
1256
1257 /// Checks that if input begins like CDATA element, but CDATA start sequence
1258 /// is not finished, parsing ends with an error
1259 #[$test]
1260 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1261 $($async)? fn not_properly_start() {
1262 let buf = $buf;
1263 let mut position = 0;
1264 let mut input = &b"<![]]>other content"[$skip..];
1265 // ^= 0
1266
1267 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1268 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1269 x => panic!(
1270 "Expected `Err(Syntax(_))`, but got `{:?}`",
1271 x
1272 ),
1273 }
1274 assert_eq!(position, 1);
1275 }
1276
1277 /// Checks that if CDATA startup sequence was matched, but an end sequence
1278 /// is not found, parsing ends with an error
1279 #[$test]
1280 $($async)? fn not_closed() {
1281 let buf = $buf;
1282 let mut position = 0;
1283 let mut input = &b"<![CDATA[other content"[$skip..];
1284 // ^= 0 ^= 22
1285
1286 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1287 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1288 x => panic!(
1289 "Expected `Err(Syntax(_))`, but got `{:?}`",
1290 x
1291 ),
1292 }
1293 assert_eq!(position, 22);
1294 }
1295
1296 /// Checks that CDATA element without content inside parsed successfully
1297 #[$test]
1298 $($async)? fn empty() {
1299 let buf = $buf;
1300 let mut position = 0;
1301 let mut input = &b"<![CDATA[]]>other content"[$skip..];
1302 // ^= 0 ^= 12
1303
1304 let (ty, bytes) = $source(&mut input)
1305 .read_bang_element(buf, &mut position)
1306 $(.$await)?
1307 .unwrap();
1308 assert_eq!(
1309 (ty, Bytes(bytes)),
1310 (BangType::CData, Bytes(b"<![CDATA[]]>"))
1311 );
1312 assert_eq!(position, 12);
1313 }
1314
1315 /// Checks that CDATA element with content parsed successfully.
1316 /// Additionally checks that sequences inside CDATA that may look like
1317 /// a CDATA end sequence do not interrupt CDATA parsing
1318 #[$test]
1319 $($async)? fn with_content() {
1320 let buf = $buf;
1321 let mut position = 0;
1322 let mut input = &b"<![CDATA[cdata]] ]>content]]>other content]]>"[$skip..];
1323 // ^= 0 ^= 29
1324
1325 let (ty, bytes) = $source(&mut input)
1326 .read_bang_element(buf, &mut position)
1327 $(.$await)?
1328 .unwrap();
1329 assert_eq!(
1330 (ty, Bytes(bytes)),
1331 (BangType::CData, Bytes(b"<![CDATA[cdata]] ]>content]]>"))
1332 );
1333 assert_eq!(position, 29);
1334 }
1335 }
1336
1337 /// Checks that reading XML comments works correctly. According to the [specification],
1338 /// comment data can contain any sequence except `--`:
1339 ///
1340 /// ```peg
                /// comment = '<!--' (!'--' char)* '-->';
1342 /// char = [#x1-#x2C]
1343 /// / [#x2E-#xD7FF]
1344 /// / [#xE000-#xFFFD]
1345 /// / [#x10000-#x10FFFF]
1346 /// ```
1347 ///
                /// This limitation, however, is simply a consequence of a poorly designed specification
                /// (maybe for the purpose of building an LL(1) XML parser), and quick-xml does not check
                /// for the presence of these sequences by default. These tests allow such content.
1351 ///
1352 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1353 mod comment {
1354 use super::*;
1355 use pretty_assertions::assert_eq;
1356
1357 #[$test]
1358 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1359 $($async)? fn not_properly_start() {
1360 let buf = $buf;
1361 let mut position = 0;
1362 let mut input = &b"<!- -->other content"[$skip..];
1363 // ^= 1
1364
1365 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1366 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1367 x => panic!(
1368 "Expected `Err(Syntax(_))`, but got `{:?}`",
1369 x
1370 ),
1371 }
1372 assert_eq!(position, 1);
1373 }
1374
1375 #[$test]
1376 $($async)? fn not_properly_end() {
1377 let buf = $buf;
1378 let mut position = 0;
1379 let mut input = &b"<!->other content"[$skip..];
1380 // ^= 0 ^= 17
1381
1382 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1383 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1384 x => panic!(
1385 "Expected `Err(Syntax(_))`, but got `{:?}`",
1386 x
1387 ),
1388 }
1389 assert_eq!(position, 17);
1390 }
1391
1392 #[$test]
1393 $($async)? fn not_closed1() {
1394 let buf = $buf;
1395 let mut position = 0;
1396 let mut input = &b"<!--other content"[$skip..];
1397 // ^= 0 ^= 17
1398
1399 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1400 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1401 x => panic!(
1402 "Expected `Err(Syntax(_))`, but got `{:?}`",
1403 x
1404 ),
1405 }
1406 assert_eq!(position, 17);
1407 }
1408
1409 #[$test]
1410 $($async)? fn not_closed2() {
1411 let buf = $buf;
1412 let mut position = 0;
1413 let mut input = &b"<!-->other content"[$skip..];
1414 // ^= 0 ^= 18
1415
1416 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1417 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1418 x => panic!(
1419 "Expected `Err(Syntax(_))`, but got `{:?}`",
1420 x
1421 ),
1422 }
1423 assert_eq!(position, 18);
1424 }
1425
1426 #[$test]
1427 $($async)? fn not_closed3() {
1428 let buf = $buf;
1429 let mut position = 0;
1430 let mut input = &b"<!--->other content"[$skip..];
1431 // ^= 0 ^= 19
1432
1433 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1434 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1435 x => panic!(
1436 "Expected `Err(Syntax(_))`, but got `{:?}`",
1437 x
1438 ),
1439 }
1440 assert_eq!(position, 19);
1441 }
1442
1443 #[$test]
1444 $($async)? fn empty() {
1445 let buf = $buf;
1446 let mut position = 0;
1447 let mut input = &b"<!---->other content"[$skip..];
1448 // ^= 0 ^= 7
1449
1450 let (ty, bytes) = $source(&mut input)
1451 .read_bang_element(buf, &mut position)
1452 $(.$await)?
1453 .unwrap();
1454 assert_eq!(
1455 (ty, Bytes(bytes)),
1456 (BangType::Comment, Bytes(b"<!---->"))
1457 );
1458 assert_eq!(position, 7);
1459 }
1460
1461 #[$test]
1462 $($async)? fn with_content() {
1463 let buf = $buf;
1464 let mut position = 0;
1465 let mut input = &b"<!--->comment<--->other content"[$skip..];
1466 // ^= 0 ^= 18
1467
1468 let (ty, bytes) = $source(&mut input)
1469 .read_bang_element(buf, &mut position)
1470 $(.$await)?
1471 .unwrap();
1472 assert_eq!(
1473 (ty, Bytes(bytes)),
1474 (BangType::Comment, Bytes(b"<!--->comment<--->"))
1475 );
1476 assert_eq!(position, 18);
1477 }
1478 }
1479
1480 /// Checks that reading DOCTYPE definition works correctly
1481 mod doctype {
1482 use super::*;
1483
1484 mod uppercase {
1485 use super::*;
1486 use pretty_assertions::assert_eq;
1487
1488 #[$test]
1489 $($async)? fn not_properly_start() {
1490 let buf = $buf;
1491 let mut position = 0;
1492 let mut input = &b"<!D other content"[$skip..];
1493 // ^= 0 ^= 17
1494
1495 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1496 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1497 x => panic!(
1498 "Expected `Err(Syntax(_))`, but got `{:?}`",
1499 x
1500 ),
1501 }
1502 assert_eq!(position, 17);
1503 }
1504
1505 #[$test]
1506 $($async)? fn without_space() {
1507 let buf = $buf;
1508 let mut position = 0;
1509 let mut input = &b"<!DOCTYPEother content"[$skip..];
1510 // ^= 0 ^= 22
1511
1512 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1513 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1514 x => panic!(
1515 "Expected `Err(Syntax(_))`, but got `{:?}`",
1516 x
1517 ),
1518 }
1519 assert_eq!(position, 22);
1520 }
1521
1522 #[$test]
1523 $($async)? fn empty() {
1524 let buf = $buf;
1525 let mut position = 0;
1526 let mut input = &b"<!DOCTYPE>other content"[$skip..];
1527 // ^= 0 ^= 10
1528
1529 let (ty, bytes) = $source(&mut input)
1530 .read_bang_element(buf, &mut position)
1531 $(.$await)?
1532 .unwrap();
1533 assert_eq!(
1534 (ty, Bytes(bytes)),
1535 (BangType::DocType(DtdParser::Finished), Bytes(b"<!DOCTYPE>"))
1536 );
1537 assert_eq!(position, 10);
1538 }
1539
1540 #[$test]
1541 $($async)? fn not_closed() {
1542 let buf = $buf;
1543 let mut position = 0;
1544 let mut input = &b"<!DOCTYPE other content"[$skip..];
1545 // ^= 0 ^23
1546
1547 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1548 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1549 x => panic!(
1550 "Expected `Err(Syntax(_))`, but got `{:?}`",
1551 x
1552 ),
1553 }
1554 assert_eq!(position, 23);
1555 }
1556 }
1557
1558 mod lowercase {
1559 use super::*;
1560 use pretty_assertions::assert_eq;
1561
1562 #[$test]
1563 $($async)? fn not_properly_start() {
1564 let buf = $buf;
1565 let mut position = 0;
1566 let mut input = &b"<!d other content"[$skip..];
1567 // ^= 0 ^= 17
1568
1569 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1570 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1571 x => panic!(
1572 "Expected `Err(Syntax(_))`, but got `{:?}`",
1573 x
1574 ),
1575 }
1576 assert_eq!(position, 17);
1577 }
1578
1579 #[$test]
1580 $($async)? fn without_space() {
1581 let buf = $buf;
1582 let mut position = 0;
1583 let mut input = &b"<!doctypeother content"[$skip..];
1584 // ^= 0 ^= 22
1585
1586 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1587 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1588 x => panic!(
1589 "Expected `Err(Syntax(_))`, but got `{:?}`",
1590 x
1591 ),
1592 }
1593 assert_eq!(position, 22);
1594 }
1595
1596 #[$test]
1597 $($async)? fn empty() {
1598 let buf = $buf;
1599 let mut position = 0;
1600 let mut input = &b"<!doctype>other content"[$skip..];
1601 // ^= 0 ^= 10
1602
1603 let (ty, bytes) = $source(&mut input)
1604 .read_bang_element(buf, &mut position)
1605 $(.$await)?
1606 .unwrap();
1607 assert_eq!(
1608 (ty, Bytes(bytes)),
1609 (BangType::DocType(DtdParser::Finished), Bytes(b"<!doctype>"))
1610 );
1611 assert_eq!(position, 10);
1612 }
1613
1614 #[$test]
1615 $($async)? fn not_closed() {
1616 let buf = $buf;
1617 let mut position = 0;
1618 let mut input = &b"<!doctype other content"[$skip..];
1619 // ^= 0 ^= 23
1620
1621 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1622 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1623 x => panic!(
1624 "Expected `Err(Syntax(_))`, but got `{:?}`",
1625 x
1626 ),
1627 }
1628 assert_eq!(position, 23);
1629 }
1630 }
1631 }
1632 }
1633
1634 mod read_text {
1635 use super::*;
1636 use crate::reader::ReadTextResult;
1637 use crate::utils::Bytes;
1638 use pretty_assertions::assert_eq;
1639
1640 #[$test]
1641 $($async)? fn empty() {
1642 let buf = $buf;
1643 let mut position = 1;
1644 let mut input = b"".as_ref();
1645 // ^= 1
1646
1647 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1648 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"")),
1649 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1650 }
1651 assert_eq!(position, 1);
1652 }
1653
1654 #[$test]
1655 $($async)? fn markup() {
1656 let buf = $buf;
1657 let mut position = 1;
1658 let mut input = b"<".as_ref();
1659 // ^= 1
1660
1661 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1662 ReadTextResult::Markup(b) => assert_eq!(b, $buf),
1663 x => panic!("Expected `Markup(_)`, but got `{:?}`", x),
1664 }
1665 assert_eq!(position, 1);
1666 }
1667
1668 #[$test]
1669 $($async)? fn ref_() {
1670 let buf = $buf;
1671 let mut position = 1;
1672 let mut input = b"&".as_ref();
1673 // ^= 1
1674
1675 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1676 ReadTextResult::Ref(b) => assert_eq!(b, $buf),
1677 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1678 }
1679 assert_eq!(position, 1);
1680 }
1681
1682 #[$test]
1683 $($async)? fn up_to_markup() {
1684 let buf = $buf;
1685 let mut position = 1;
1686 let mut input = b"a<".as_ref();
1687 // ^= 2
1688
1689 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1690 ReadTextResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1691 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1692 }
1693 assert_eq!(position, 2);
1694 }
1695
1696 #[$test]
1697 $($async)? fn up_to_ref() {
1698 let buf = $buf;
1699 let mut position = 1;
1700 let mut input = b"a&".as_ref();
1701 // ^= 2
1702
1703 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1704 ReadTextResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1705 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1706 }
1707 assert_eq!(position, 2);
1708 }
1709
1710 #[$test]
1711 $($async)? fn up_to_eof() {
1712 let buf = $buf;
1713 let mut position = 1;
1714 let mut input = b"a".as_ref();
1715 // ^= 2
1716
1717 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1718 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1719 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1720 }
1721 assert_eq!(position, 2);
1722 }
1723 }
1724
1725 mod read_ref {
1726 use super::*;
1727 use crate::reader::ReadRefResult;
1728 use crate::utils::Bytes;
1729 use pretty_assertions::assert_eq;
1730
1731 // Empty input is not allowed for `read_ref` so not tested.
1732 // Borrowed source triggers debug assertion,
1733 // buffered do nothing due to implementation details.
1734
1735 #[$test]
1736 $($async)? fn up_to_eof() {
1737 let buf = $buf;
1738 let mut position = 1;
1739 let mut input = b"&".as_ref();
1740 // ^= 2
1741
1742 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1743 ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1744 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1745 }
1746 assert_eq!(position, 2);
1747 }
1748
1749 #[$test]
1750 $($async)? fn up_to_ref() {
1751 let buf = $buf;
1752 let mut position = 1;
1753 let mut input = b"&&".as_ref();
1754 // ^= 2
1755
1756 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1757 ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1758 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1759 }
1760 assert_eq!(position, 2);
1761 }
1762
1763 #[$test]
1764 $($async)? fn up_to_markup() {
1765 let buf = $buf;
1766 let mut position = 1;
1767 let mut input = b"&<".as_ref();
1768 // ^= 2
1769
1770 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1771 ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1772 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1773 }
1774 assert_eq!(position, 2);
1775 }
1776
1777 #[$test]
1778 $($async)? fn empty_ref() {
1779 let buf = $buf;
1780 let mut position = 1;
1781 let mut input = b"&;".as_ref();
1782 // ^= 3
1783
1784 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1785 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&;")),
1786 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1787 }
1788 assert_eq!(position, 3);
1789 }
1790
1791 #[$test]
1792 $($async)? fn normal() {
1793 let buf = $buf;
1794 let mut position = 1;
1795 let mut input = b"<".as_ref();
1796 // ^= 5
1797
1798 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1799 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")),
1800 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1801 }
1802 assert_eq!(position, 5);
1803 }
1804 }
1805
1806 mod read_element {
1807 use super::*;
1808 use crate::errors::{Error, SyntaxError};
1809 use crate::parser::ElementParser;
1810 use crate::utils::Bytes;
1811 use pretty_assertions::assert_eq;
1812
                /// Checks that nothing was read from an empty buffer.
                /// `<` is read by `peek_one`, which is called before `read_with`; that is why it is
                /// in the input buffer. `peek_one`, however, does not increment the position, for
                /// simplicity of the code
1816 #[$test]
1817 $($async)? fn empty() {
1818 let buf = $buf;
1819 let mut position = 0;
1820 let mut input = &b"<"[$skip..];
1821 // ^= 1
1822
1823 match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1824 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1825 x => panic!(
1826 "Expected `Err(Syntax(_))`, but got `{:?}`",
1827 x
1828 ),
1829 }
1830 assert_eq!(position, 1);
1831 }
1832
1833 mod open {
1834 use super::*;
1835 use pretty_assertions::assert_eq;
1836
1837 #[$test]
1838 $($async)? fn empty_tag() {
1839 let buf = $buf;
1840 let mut position = 0;
1841 let mut input = &b"<>"[$skip..];
1842 // ^= 2
1843
1844 assert_eq!(
1845 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1846 Bytes(b"<>")
1847 );
1848 assert_eq!(position, 2);
1849 }
1850
1851 #[$test]
1852 $($async)? fn normal() {
1853 let buf = $buf;
1854 let mut position = 0;
1855 let mut input = &b"<tag>"[$skip..];
1856 // ^= 5
1857
1858 assert_eq!(
1859 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1860 Bytes(b"<tag>")
1861 );
1862 assert_eq!(position, 5);
1863 }
1864
1865 #[$test]
1866 $($async)? fn empty_ns_empty_tag() {
1867 let buf = $buf;
1868 let mut position = 0;
1869 let mut input = &b"<:>"[$skip..];
1870 // ^= 3
1871
1872 assert_eq!(
1873 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1874 Bytes(b"<:>")
1875 );
1876 assert_eq!(position, 3);
1877 }
1878
1879 #[$test]
1880 $($async)? fn empty_ns() {
1881 let buf = $buf;
1882 let mut position = 0;
1883 let mut input = &b"<:tag>"[$skip..];
1884 // ^= 6
1885
1886 assert_eq!(
1887 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1888 Bytes(b"<:tag>")
1889 );
1890 assert_eq!(position, 6);
1891 }
1892
1893 #[$test]
1894 $($async)? fn with_attributes() {
1895 let buf = $buf;
1896 let mut position = 0;
1897 let mut input = &br#"<tag attr-1=">" attr2 = '>' 3attr>"#[$skip..];
1898 // ^= 39
1899
1900 assert_eq!(
1901 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1902 Bytes(br#"<tag attr-1=">" attr2 = '>' 3attr>"#)
1903 );
1904 assert_eq!(position, 39);
1905 }
1906 }
1907
1908 mod self_closed {
1909 use super::*;
1910 use pretty_assertions::assert_eq;
1911
1912 #[$test]
1913 $($async)? fn empty_tag() {
1914 let buf = $buf;
1915 let mut position = 0;
1916 let mut input = &b"</>"[$skip..];
1917 // ^= 3
1918
1919 assert_eq!(
1920 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1921 Bytes(b"</>")
1922 );
1923 assert_eq!(position, 3);
1924 }
1925
1926 #[$test]
1927 $($async)? fn normal() {
1928 let buf = $buf;
1929 let mut position = 0;
1930 let mut input = &b"<tag/>"[$skip..];
1931 // ^= 6
1932
1933 assert_eq!(
1934 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1935 Bytes(b"<tag/>")
1936 );
1937 assert_eq!(position, 6);
1938 }
1939
1940 #[$test]
1941 $($async)? fn empty_ns_empty_tag() {
1942 let buf = $buf;
1943 let mut position = 0;
1944 let mut input = &b"<:/>"[$skip..];
1945 // ^= 4
1946
1947 assert_eq!(
1948 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1949 Bytes(b"<:/>")
1950 );
1951 assert_eq!(position, 4);
1952 }
1953
1954 #[$test]
1955 $($async)? fn empty_ns() {
1956 let buf = $buf;
1957 let mut position = 0;
1958 let mut input = &b"<:tag/>"[$skip..];
1959 // ^= 7
1960
1961 assert_eq!(
1962 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1963 Bytes(b"<:tag/>")
1964 );
1965 assert_eq!(position, 7);
1966 }
1967
1968 #[$test]
1969 $($async)? fn with_attributes() {
1970 let buf = $buf;
1971 let mut position = 0;
1972 let mut input = &br#"<tag attr-1="/>" attr2 = '/>' 3attr/>"#[$skip..];
1973 // ^= 42
1974
1975 assert_eq!(
1976 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1977 Bytes(br#"<tag attr-1="/>" attr2 = '/>' 3attr/>"#)
1978 );
1979 assert_eq!(position, 42);
1980 }
1981 }
1982
1983 mod close {
1984 use super::*;
1985 use pretty_assertions::assert_eq;
1986
1987 #[$test]
1988 $($async)? fn empty_tag() {
1989 let buf = $buf;
1990 let mut position = 0;
1991 let mut input = &b"</ >"[$skip..];
1992 // ^= 4
1993
1994 assert_eq!(
1995 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1996 Bytes(b"</ >")
1997 );
1998 assert_eq!(position, 4);
1999 }
2000
2001 #[$test]
2002 $($async)? fn normal() {
2003 let buf = $buf;
2004 let mut position = 0;
2005 let mut input = &b"</tag>"[$skip..];
2006 // ^= 6
2007
2008 assert_eq!(
2009 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2010 Bytes(b"</tag>")
2011 );
2012 assert_eq!(position, 6);
2013 }
2014
2015 #[$test]
2016 $($async)? fn empty_ns_empty_tag() {
2017 let buf = $buf;
2018 let mut position = 0;
2019 let mut input = &b"</:>"[$skip..];
2020 // ^= 4
2021
2022 assert_eq!(
2023 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2024 Bytes(b"</:>")
2025 );
2026 assert_eq!(position, 4);
2027 }
2028
2029 #[$test]
2030 $($async)? fn empty_ns() {
2031 let buf = $buf;
2032 let mut position = 0;
2033 let mut input = &b"</:tag>"[$skip..];
2034 // ^= 7
2035
2036 assert_eq!(
2037 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2038 Bytes(b"</:tag>")
2039 );
2040 assert_eq!(position, 7);
2041 }
2042
2043 #[$test]
2044 $($async)? fn with_attributes() {
2045 let buf = $buf;
2046 let mut position = 0;
2047 let mut input = &br#"</tag attr-1=">" attr2 = '>' 3attr>"#[$skip..];
2048 // ^= 40
2049
2050 assert_eq!(
2051 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2052 Bytes(br#"</tag attr-1=">" attr2 = '>' 3attr>"#)
2053 );
2054 assert_eq!(position, 40);
2055 }
2056 }
2057 }
2058
/// Ensures that no empty `Text` events are generated
2060 mod $read_event {
2061 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
2062 use crate::reader::Reader;
2063 use pretty_assertions::assert_eq;
2064
2065 /// When `encoding` feature is enabled, encoding should be detected
2066 /// from BOM (UTF-8) and BOM should be stripped.
2067 ///
2068 /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
2069 /// character should be stripped for consistency
2070 #[$test]
2071 $($async)? fn bom_from_reader() {
2072 let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
2073
2074 assert_eq!(
2075 reader.$read_event($buf) $(.$await)? .unwrap(),
2076 Event::Text(BytesText::from_escaped("\u{feff}"))
2077 );
2078
2079 assert_eq!(
2080 reader.$read_event($buf) $(.$await)? .unwrap(),
2081 Event::Eof
2082 );
2083 }
2084
2085 /// When parsing from &str, encoding is fixed (UTF-8), so
2086 /// - when `encoding` feature is disabled, the behavior the
2087 /// same as in `bom_from_reader` text
2088 /// - when `encoding` feature is enabled, the behavior should
2089 /// stay consistent, so the first BOM character is stripped
2090 #[$test]
2091 $($async)? fn bom_from_str() {
2092 let mut reader = Reader::from_str("\u{feff}\u{feff}");
2093
2094 assert_eq!(
2095 reader.$read_event($buf) $(.$await)? .unwrap(),
2096 Event::Text(BytesText::from_escaped("\u{feff}"))
2097 );
2098
2099 assert_eq!(
2100 reader.$read_event($buf) $(.$await)? .unwrap(),
2101 Event::Eof
2102 );
2103 }
2104
2105 #[$test]
2106 $($async)? fn declaration() {
2107 let mut reader = Reader::from_str("<?xml ?>");
2108
2109 assert_eq!(
2110 reader.$read_event($buf) $(.$await)? .unwrap(),
2111 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
2112 );
2113 }
2114
2115 #[$test]
2116 $($async)? fn doctype() {
2117 let mut reader = Reader::from_str("<!DOCTYPE x>");
2118
2119 assert_eq!(
2120 reader.$read_event($buf) $(.$await)? .unwrap(),
2121 Event::DocType(BytesText::from_escaped("x"))
2122 );
2123 }
2124
2125 #[$test]
2126 $($async)? fn processing_instruction() {
2127 let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
2128
2129 assert_eq!(
2130 reader.$read_event($buf) $(.$await)? .unwrap(),
2131 Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
2132 );
2133 }
2134
2135 /// Lone closing tags are not allowed, so testing it together with start tag
2136 #[$test]
2137 $($async)? fn start_and_end() {
2138 let mut reader = Reader::from_str("<tag></tag>");
2139
2140 assert_eq!(
2141 reader.$read_event($buf) $(.$await)? .unwrap(),
2142 Event::Start(BytesStart::new("tag"))
2143 );
2144
2145 assert_eq!(
2146 reader.$read_event($buf) $(.$await)? .unwrap(),
2147 Event::End(BytesEnd::new("tag"))
2148 );
2149 }
2150
2151 #[$test]
2152 $($async)? fn empty() {
2153 let mut reader = Reader::from_str("<tag/>");
2154
2155 assert_eq!(
2156 reader.$read_event($buf) $(.$await)? .unwrap(),
2157 Event::Empty(BytesStart::new("tag"))
2158 );
2159 }
2160
2161 #[$test]
2162 $($async)? fn text() {
2163 let mut reader = Reader::from_str("text");
2164
2165 assert_eq!(
2166 reader.$read_event($buf) $(.$await)? .unwrap(),
2167 Event::Text(BytesText::from_escaped("text"))
2168 );
2169 }
2170
2171 #[$test]
2172 $($async)? fn cdata() {
2173 let mut reader = Reader::from_str("<![CDATA[]]>");
2174
2175 assert_eq!(
2176 reader.$read_event($buf) $(.$await)? .unwrap(),
2177 Event::CData(BytesCData::new(""))
2178 );
2179 }
2180
2181 #[$test]
2182 $($async)? fn comment() {
2183 let mut reader = Reader::from_str("<!---->");
2184
2185 assert_eq!(
2186 reader.$read_event($buf) $(.$await)? .unwrap(),
2187 Event::Comment(BytesText::from_escaped(""))
2188 );
2189 }
2190
2191 #[$test]
2192 $($async)? fn eof() {
2193 let mut reader = Reader::from_str("");
2194
2195 assert_eq!(
2196 reader.$read_event($buf) $(.$await)? .unwrap(),
2197 Event::Eof
2198 );
2199 }
2200 }
2201 };
2202 }
2203
2204 // Export macros for the child modules:
2205 // - buffered_reader
2206 // - slice_reader
2207 pub(super) use check;
2208}