// quick_xml/reader/mod.rs
//! Contains the high-level interface for a pull-based XML parser.
2
#[cfg(feature = "encoding")]
use encoding_rs::Encoding;
use std::io;
use std::ops::Range;

use crate::encoding::Decoder;
#[cfg(feature = "encoding")]
use crate::encoding::DetectedEncoding;
use crate::errors::{Error, IllFormedError, SyntaxError};
use crate::events::{BytesRef, Event};
use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
use crate::reader::state::ReaderState;
15
/// A struct that holds a parser configuration.
///
/// Current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by changing properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether a lone ampersand character (without a paired semicolon) should be
    /// allowed in textual content. Unless enabled, in case of a dangling ampersand,
    /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
    ///
    /// Default: `false`
    ///
    /// # Example
    ///
    /// ```
    /// # use quick_xml::events::{BytesRef, BytesText, Event};
    /// # use quick_xml::reader::Reader;
    /// # use pretty_assertions::assert_eq;
    /// let mut reader = Reader::from_str("text with & &amp; & alone");
    /// reader.config_mut().allow_dangling_amp = true;
    ///
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
    pub allow_dangling_amp: bool,

    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations
    /// needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations
    /// needed to store the tag name for an [`End`] event. However if
    /// [`expand_empty_elements`] is also set, only one additional allocation
    /// will be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations
    /// needed to store the tag name for an [`End`] event. However if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}

impl Config {
    /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`trim_text_start`]: Self::trim_text_start
    /// [`trim_text_end`]: Self::trim_text_end
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    #[inline]
    pub fn trim_text(&mut self, trim: bool) {
        self.trim_text_start = trim;
        self.trim_text_end = trim;
    }

    /// Turn on or off all checks for well-formedness. Currently these are the
    /// following settings:
    /// - [`check_comments`](Self::check_comments)
    /// - [`check_end_names`](Self::check_end_names)
    #[inline]
    pub fn enable_all_checks(&mut self, enable: bool) {
        self.check_comments = enable;
        self.check_end_names = enable;
    }
}

impl Default for Config {
    /// Returns the configuration with every option set to the default value
    /// stated in the documentation of the corresponding field.
    fn default() -> Self {
        Self {
            allow_dangling_amp: false,
            allow_unmatched_ends: false,
            check_comments: false,
            check_end_names: true,
            expand_empty_elements: false,
            trim_markup_names_in_closing_tags: true,
            trim_text_start: false,
            trim_text_end: false,
        }
    }
}
252
253////////////////////////////////////////////////////////////////////////////////////////////////////
254
/// Shared body of the `read_event` family of methods: drives the [`ParseState`]
/// state machine until one event (or error) is produced and evaluates to it.
///
/// Parameters:
/// - `$self`: identifier of the reader; its `state` field holds the current
///   `ParseState`, the offsets, the configuration and the `emit_*` helpers
/// - `$buf`: identifier of the buffer binding. It must be assignable at the
///   expansion site, because the macro reassigns it when the source hands the
///   buffer back
/// - `$reader`: expression evaluating to the underlying source of bytes
/// - `$read_until_close`: name of the method of `$self` that is called in the
///   `InsideMarkup` state
/// - `$await` (optional): pass `await` to generate the asynchronous variant
///
/// Type names used in the body (`ParseState`, `ReadRefResult`, `ReadTextResult`,
/// ...) are resolved where the macro is expanded.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If the encoding was set explicitly, we do not need to detect it.
                    // For example, explicit UTF-8 is set automatically if the Reader was
                    // created using `from_str`. But we still need to remove the BOM for
                    // consistency with the path where the encoding feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding.encoding());
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // No event is produced by this transition, restart the loop
                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideRef => { // Go to InsideText
                    // Remember where the reference starts to report errors at `&`
                    let start = $self.state.offset;
                    match $reader.read_ref($buf, &mut $self.state.offset) $(.$await)? {
                        // Emit reference, go to InsideText state
                        ReadRefResult::Ref(bytes) => {
                            $self.state.state = ParseState::InsideText;
                            // +1 to skip start `&`
                            // -1 to skip end `;`
                            Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..bytes.len() - 1], $self.decoder())))
                        }
                        // Go to Done state
                        ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::Done;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToEof(_) => {
                            $self.state.state = ParseState::Done;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Do not change state, stay in InsideRef
                        ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToRef(_) => {
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Go to InsideMarkup state
                        ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::InsideMarkup;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToMarkup(_) => {
                            $self.state.state = ParseState::InsideMarkup;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        ReadRefResult::Err(e) => Err(Error::from(e)),
                    }
                }
                ParseState::InsideText => { // Go to InsideMarkup, InsideRef or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::Ref(buf) => {
                            $self.state.state = ParseState::InsideRef;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToRef(bytes) => {
                            $self.state.state = ParseState::InsideRef;
                            // Always emit a Text event with `bytes` content. Unlike the
                            // `UpToEof` arm below, no emptiness check is performed here
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            // Nothing left after trimming: report the end of input instead
                            // of an empty Text event
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::from(e)),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we already consume the wrong data
            // and change the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error, as well as Eof, is terminal
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
384
/// Read bytes up to the `>` and skip it. This macro is expected to be expanded
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state. The state is switched before anything
/// is read, so it is `InsideText` even when an error is returned.
///
/// When reading of the markup fails, or the input ends right after `<`,
/// `last_error_offset` is set to the offset stored before the read, so that the
/// error is reported at the `<` symbol. It is left untouched when peeking the
/// symbol itself fails.
///
/// Parameters:
/// - `$self`: identifier of the reader whose `state` is updated
/// - `$buf`: identifier of the buffer passed to the read methods of `$reader`
/// - `$reader`: expression evaluating to the underlying source of bytes
/// - `$await` (optional): pass `await` to generate the asynchronous variant
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::InsideText;

        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we anyway allow to parse them because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which user should call manually
            // - if we just look for `>` we will parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >` which probably no one existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`
                $self.state.last_error_offset = start;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(Error::from(e)),
        }
    }};
}
482
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the `End` event that closes the element named `$end` and
/// evaluates to the range from the position at which the macro started to the
/// position just before that `End` event. Nested `Start` events with the same
/// name are counted, so only the `End` event on the original nesting level
/// finishes the loop.
///
/// The macro executes `return Err(..)` in the enclosing function when reading
/// fails or when `Eof` is reached before the matching `End` event, so it can
/// be expanded only inside a function that returns a `Result`.
///
/// `trim_text_start` is switched off while the macro runs and restored to its
/// previous value on every exit path.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // Because we take position after the event before the End event,
        // it is important that this position indicates beginning of the End event.
        // If between last event and the End event would be only spaces, then we
        // take position before the spaces, but spaces would be skipped without
        // generating event if `trim_text_start` is set to `true`. To prevent that
        // we temporary disable start text trimming.
        //
        // We also cannot take position after getting End event, because if
        // `trim_markup_names_in_closing_tags` is set to `true` (which is the default),
        // we do not know the real size that the End event occupies in the source
        // and cannot correct the position after the End event.
        // So, we in any case should tweak parser configuration.
        let config = $self.config_mut();
        let trim = config.trim_text_start;
        config.trim_text_start = false;

        let start = $self.buffer_position();
        // Number of currently open nested elements with the name `$end`
        let mut depth = 0;
        loop {
            $clear
            // Position before the next event; for the final End event this is its start
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(e);
                }

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        $self.config_mut().trim_text_start = trim;
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                _ => (),
            }
        }
    }};
}
537
538#[cfg(feature = "async-tokio")]
539mod async_tokio;
540mod buffered_reader;
541mod ns_reader;
542mod slice_reader;
543mod state;
544
545pub use ns_reader::NsReader;
546
/// Range of input in bytes that corresponds to some piece of XML.
///
/// As with any [`Range`], the start offset is inclusive and the end offset is exclusive.
pub type Span = Range<u64>;
549
550////////////////////////////////////////////////////////////////////////////////////////////////////
551
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init -- "(no event)"\n --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText -- "#lt;false#gt;\n(no event)"\nText --> InsideMarkup
///     InsideText -- "(no event)"\nText --> InsideRef
///     InsideRef -- GeneralRef --> InsideText
///     InsideRef -- Text --> InsideMarkup
///   end
///   InsideText -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty -- End --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. In this state the
    /// encoding is detected (if the `encoding` feature is enabled) or the UTF-8
    /// BOM is removed (otherwise). The transition itself emits no event and the
    /// next state is always `InsideText`. The reader will never return to this
    /// state.
    Init,
    /// State after seeing the `&` symbol in textual content.
    ///
    /// If the reference is closed by `;`, a [`Event::GeneralRef`] is emitted and
    /// the reader moves to the `InsideText` state. Otherwise the ampersand is
    /// dangling: depending on [`Config::allow_dangling_amp`] either a
    /// [`Event::Text`] is emitted or an error is returned. In that case the
    /// reader stays in `InsideRef` if the next `&` was found, moves to
    /// `InsideMarkup` if `<` was found, or moves to `Done` at the end of input.
    InsideRef,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup or the
    /// `&` symbol of a reference. All bytes before that symbol will be returned
    /// in the [`Event::Text`] event. After that the reader moves to the
    /// `InsideMarkup` or `InsideRef` state respectively, or to the `Done` state
    /// if the end of input was reached.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in an `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when the `Eof` event is generated or when an
    /// error other than `Error::IllFormed` occurs (after most ill-formed errors
    /// parsing can be continued). This is the last state, the reader stays in it
    /// forever.
    Done,
}
602
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It cannot be changed
    /// either by BOM or by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the stored encoding regardless of the way it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match self {
            Self::Implicit(e) => e,
            Self::Explicit(e) => e,
            Self::BomDetected(e) => e,
            Self::XmlDetected(e) => e,
        }
    }

    /// Returns `true` for the `Implicit` and `BomDetected` variants, which may
    /// still be replaced by a more precise encoding, and `false` for the
    /// `Explicit` and `XmlDetected` variants, which are final.
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match self {
            Self::Implicit(_) | Self::BomDetected(_) => true,
            Self::Explicit(_) | Self::XmlDetected(_) => false,
        }
    }
}
649
650////////////////////////////////////////////////////////////////////////////////////////////////////
651
/// A direct stream to the underlying reader of a [`Reader`], which updates
/// [`Reader::buffer_position()`] when read from it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped source of bytes
    inner: &'r mut R,
    /// The position counter that is advanced by every read through this stream
    offset: &'r mut u64,
}

impl<'r, R> BinaryStream<'r, R> {
    /// Returns current position in bytes in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Gets a reference to the underlying reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        self.inner
    }

    /// Gets a mutable reference to the underlying reader.
    ///
    /// Avoid reading from this reader because this will not update the reader's
    /// position and will lead to incorrect positions of errors. Read from this
    /// stream instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        self.inner
    }
}

impl<'r, R> io::Read for BinaryStream<'r, R>
where
    R: io::Read,
{
    /// Reads from the underlying reader and advances the tracked offset by the
    /// number of bytes actually read. The offset is not changed on error.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let amt = self.inner.read(buf)?;
        *self.offset += amt as u64;
        Ok(amt)
    }
}

impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Returns the buffered data of the underlying reader. The tracked offset
    /// is not changed, because nothing is consumed yet.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Consumes `amt` bytes of the underlying reader and advances the tracked
    /// offset by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        self.inner.consume(amt);
        *self.offset += amt as u64;
    }
}
711
712////////////////////////////////////////////////////////////////////////////////////////////////////
713
714/// A low level encoding-agnostic XML event reader.
715///
716/// Consumes bytes and streams XML [`Event`]s.
717///
718/// This reader does not manage namespace declarations and not able to resolve
719/// prefixes. If you want these features, use the [`NsReader`].
720///
721/// # Examples
722///
723/// ```
724/// use quick_xml::events::Event;
725/// use quick_xml::reader::Reader;
726///
727/// let xml = r#"<tag1 att1 = "test">
728/// <tag2><!--Test comment-->Test</tag2>
729/// <tag2>Test 2</tag2>
730/// </tag1>"#;
731/// let mut reader = Reader::from_str(xml);
732/// reader.config_mut().trim_text(true);
733///
734/// let mut count = 0;
735/// let mut txt = Vec::new();
736/// let mut buf = Vec::new();
737///
738/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
739/// loop {
740/// // NOTE: this is the generic case when we don't know about the input BufRead.
741/// // when the input is a &str or a &[u8], we don't actually need to use another
742/// // buffer, we could directly call `reader.read_event()`
743/// match reader.read_event_into(&mut buf) {
744/// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
745/// // exits the loop when reaching end of file
746/// Ok(Event::Eof) => break,
747///
748/// Ok(Event::Start(e)) => {
749/// match e.name().as_ref() {
750/// b"tag1" => println!("attributes values: {:?}",
751/// e.attributes().map(|a| a.unwrap().value)
752/// .collect::<Vec<_>>()),
753/// b"tag2" => count += 1,
754/// _ => (),
755/// }
756/// }
757/// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
758///
759/// // There are several other `Event`s we do not consider here
760/// _ => (),
761/// }
762/// // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
763/// buf.clear();
764/// }
765/// ```
766///
767/// [`NsReader`]: crate::reader::NsReader
768#[derive(Debug, Clone)]
769pub struct Reader<R> {
770 /// Source of data for parse
771 reader: R,
772 /// Configuration and current parse state
773 state: ReaderState,
774}
775
776/// Builder methods
777impl<R> Reader<R> {
778 /// Creates a `Reader` that reads from a given reader.
779 pub fn from_reader(reader: R) -> Self {
780 Self {
781 reader,
782 state: ReaderState::default(),
783 }
784 }
785
786 /// Returns reference to the parser configuration
787 pub const fn config(&self) -> &Config {
788 &self.state.config
789 }
790
791 /// Returns mutable reference to the parser configuration
792 pub fn config_mut(&mut self) -> &mut Config {
793 &mut self.state.config
794 }
795}
796
797/// Getters
798impl<R> Reader<R> {
799 /// Consumes `Reader` returning the underlying reader
800 ///
801 /// Can be used to compute line and column of a parsing error position
802 ///
803 /// # Examples
804 ///
805 /// ```
806 /// # use pretty_assertions::assert_eq;
807 /// use std::{str, io::Cursor};
808 /// use quick_xml::events::Event;
809 /// use quick_xml::reader::Reader;
810 ///
811 /// let xml = r#"<tag1 att1 = "test">
812 /// <tag2><!--Test comment-->Test</tag2>
813 /// <tag3>Test 2</tag3>
814 /// </tag1>"#;
815 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
816 /// let mut buf = Vec::new();
817 ///
818 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
819 /// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
820 /// let end_pos = reader.buffer_position() as usize;
821 /// let mut cursor = reader.into_inner();
822 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
823 /// .expect("can't make a string");
824 /// let mut line = 1;
825 /// let mut column = 0;
826 /// for c in s.chars() {
827 /// if c == '\n' {
828 /// line += 1;
829 /// column = 0;
830 /// } else {
831 /// column += 1;
832 /// }
833 /// }
834 /// (line, column)
835 /// }
836 ///
837 /// loop {
838 /// match reader.read_event_into(&mut buf) {
839 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
840 /// b"tag1" | b"tag2" => (),
841 /// tag => {
842 /// assert_eq!(b"tag3", tag);
843 /// assert_eq!((3, 22), into_line_and_column(reader));
844 /// break;
845 /// }
846 /// },
847 /// Ok(Event::Eof) => unreachable!(),
848 /// _ => (),
849 /// }
850 /// buf.clear();
851 /// }
852 /// ```
853 pub fn into_inner(self) -> R {
854 self.reader
855 }
856
857 /// Gets a reference to the underlying reader.
858 pub const fn get_ref(&self) -> &R {
859 &self.reader
860 }
861
862 /// Gets a mutable reference to the underlying reader.
863 ///
864 /// Avoid read from this reader because this will not update reader's position
865 /// and will lead to incorrect positions of errors. If you want to read, use
866 /// [`stream()`] instead.
867 ///
868 /// [`stream()`]: Self::stream
869 pub fn get_mut(&mut self) -> &mut R {
870 &mut self.reader
871 }
872
873 /// Gets the byte position in the input data just after the last emitted event
874 /// (i.e. this is position where data of last event ends).
875 ///
876 /// Note, that for text events which is originally ended with whitespace characters
877 /// (` `, `\t`, `\r`, and `\n`) if [`Config::trim_text_end`] is set this is position
878 /// before trim, not the position of the last byte of the [`Event::Text`] content.
879 pub const fn buffer_position(&self) -> u64 {
880 self.state.offset
881 }
882
883 /// Gets the last error byte position in the input data. If there is no errors
884 /// yet, returns `0`.
885 ///
886 /// Unlike `buffer_position` it will point to the place where it is rational
887 /// to report error to the end user. For example, all [`SyntaxError`]s are
888 /// reported when the parser sees EOF inside of some kind of markup. The
889 /// `buffer_position()` will point to the last byte of input which is not
890 /// very useful. `error_position()` will point to the start of corresponding
891 /// markup element (i. e. to the `<` character).
892 ///
893 /// This position is always `<= buffer_position()`.
894 pub const fn error_position(&self) -> u64 {
895 self.state.last_error_offset
896 }
897
898 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
899 ///
900 /// If [`encoding`] feature is enabled, the used encoding may change after
901 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
902 ///
903 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
904 /// defaults to UTF-8.
905 ///
906 /// [`encoding`]: ../index.html#encoding
907 #[inline]
908 pub const fn decoder(&self) -> Decoder {
909 self.state.decoder()
910 }
911
912 /// Get the direct access to the underlying reader, but tracks the amount of
913 /// read data and update [`Reader::buffer_position()`] accordingly.
914 ///
915 /// Note, that this method gives you access to the internal reader and read
916 /// data will not be returned in any subsequent events read by `read_event`
917 /// family of methods.
918 ///
919 /// # Example
920 ///
921 /// This example demonstrates how to read stream raw bytes from an XML document.
922 /// This could be used to implement streaming read of text, or to read raw binary
923 /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
924 /// valid XML, but XML-derived file formats exist where such documents are valid).
925 ///
926 /// ```
927 /// # use pretty_assertions::assert_eq;
928 /// use std::io::{BufRead, Read};
929 /// use quick_xml::events::{BytesEnd, BytesStart, Event};
930 /// use quick_xml::reader::Reader;
931 ///
932 /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
933 /// // ^ ^ ^ ^
934 /// // 0 5 21 27
935 ///
936 /// assert_eq!(
937 /// (reader.read_event().unwrap(), reader.buffer_position()),
938 /// // 5 - end of the `<tag>`
939 /// (Event::Start(BytesStart::new("tag")), 5)
940 /// );
941 ///
942 /// // Reading directly from underlying reader will not update position
943 /// // let mut inner = reader.get_mut();
944 ///
945 /// // Reading from the stream() advances position
946 /// let mut inner = reader.stream();
947 ///
948 /// // Read binary data. We must know its size
949 /// let mut binary = [0u8; 16];
950 /// inner.read_exact(&mut binary).unwrap();
951 /// assert_eq!(&binary, b"binary << data&>");
952 /// // 21 - end of the `binary << data&>`
953 /// assert_eq!(inner.offset(), 21);
954 /// assert_eq!(reader.buffer_position(), 21);
955 ///
956 /// assert_eq!(
957 /// (reader.read_event().unwrap(), reader.buffer_position()),
958 /// // 27 - end of the `</tag>`
959 /// (Event::End(BytesEnd::new("tag")), 27)
960 /// );
961 ///
962 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
963 /// ```
964 #[inline]
965 pub fn stream(&mut self) -> BinaryStream<'_, R> {
966 BinaryStream {
967 inner: &mut self.reader,
968 offset: &mut self.state.offset,
969 }
970 }
971}
972
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// # Parameters
    /// - `'i`: lifetime of the data the returned event borrows from
    /// - `buf`: buffer of type `B` handed to the [`XmlSource`] implementation of `R`
    ///
    /// The whole body is produced by the `read_event_impl!` macro, which is
    /// given `read_until_close` as the name of the method that handles markup.
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// The whole body is produced by the `read_until_close!` macro.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
994
995////////////////////////////////////////////////////////////////////////////////////////////////////
996
/// Result of an attempt to read XML textual data from the source.
///
/// `'r` is the lifetime of the returned text slices; `B` is the type of the
/// buffer that was passed to [`XmlSource::read_text`] and is handed back by
/// the variants that did not produce any text.
#[derive(Debug)]
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte. `<` was consumed.
    /// Contains buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Markup(B),
    /// Start of reference (`&` character) was found in the first byte.
    /// `&` was not consumed.
    /// Contains buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Ref(B),
    /// Contains text block up to start of markup (`<` character). `<` was consumed.
    UpToMarkup(&'r [u8]),
    /// Contains text block up to start of reference (`&` character).
    /// `&` was not consumed.
    UpToRef(&'r [u8]),
    /// Contains text block up to EOF, neither start of markup (`<` character)
    /// nor start of reference (`&` character) was found.
    UpToEof(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1020
/// Result of an attempt to read general reference from the reader.
///
/// `'r` is the lifetime of the returned slices.
#[derive(Debug)]
enum ReadRefResult<'r> {
    /// Contains text block up to end of reference (`;` character).
    /// Result includes start `&`, but not end `;`.
    ///
    /// NOTE(review): the `read_ref::empty_ref` test in this file expects `&;`
    /// for the input `&;`, i.e. the trailing `;` appears to be part of the
    /// result. Confirm and align the sentence above.
    Ref(&'r [u8]),
    /// Contains text block up to EOF. Neither end of reference (`;`), start of
    /// another reference (`&`) nor start of markup (`<`) character was found.
    /// Result includes start `&`.
    UpToEof(&'r [u8]),
    /// Contains text block up to next possible reference (`&` character).
    /// Result includes start `&`.
    UpToRef(&'r [u8]),
    /// Contains text block up to start of markup (`<` character).
    /// Result includes start `&`.
    UpToMarkup(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1040
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait: the generic one reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`; instead of copying data it returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> io::Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>>;

    /// Read input until start of markup (the `<`) is found, start of general entity
    /// reference (the `&`) is found or end of input is reached.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

    /// Read input until end of general reference (the `;`) is found, start of
    /// another general reference (the `&`) is found or end of input is reached.
    ///
    /// This method must be called when current character is `&`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_ref(&mut self, buf: B, position: &mut u64) -> ReadRefResult<'r>;

    /// Read input until the construct recognized by `parser` is finished
    /// (the tests in this file drive it with `ElementParser`).
    ///
    /// This method expects that the start sequence of the parsed construct
    /// was already read.
    ///
    /// Returns a slice of data read up to the end of the thing being parsed.
    /// The end of thing and the returned content is determined by the used parser.
    ///
    /// If input (`Self`) is exhausted and no bytes were read, or if the specified
    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// A `P` type parameter is used to preserve state between calls to the underlying
    /// reader which provides bytes fed into the parser.
    ///
    /// [events]: crate::events::Event
    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
    where
        P: Parser;

    /// Read input until comment, CDATA or DOCTYPE is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of markup that was recognized together with a slice of
    /// the data read up to its closing `>`.
    ///
    /// NOTE(review): the tests in this file expect the returned slice to span
    /// the whole markup, from `<!` through the closing `>` (for example
    /// `<![CDATA[]]>`); an earlier wording of this comment said that `>` is
    /// not included. Confirm against the implementations.
    ///
    /// If the input (`Self`) ends before the closing sequence is found, an
    /// error is returned; the tests in this file expect `Error::Syntax` with
    /// `UnclosedCData`, `UnclosedComment` or `UnclosedDoctype` respectively.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut u64,
    ) -> Result<(BangType, &'r [u8]), Error>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> io::Result<Option<u8>>;
}
1143
1144/// Possible elements started with `<!`
1145#[derive(Debug, PartialEq)]
1146enum BangType {
1147 /// <![CDATA[...]]>
1148 CData,
1149 /// <!--...-->
1150 Comment,
1151 /// <!DOCTYPE...>. Contains balance of '<' (+1) and '>' (-1)
1152 DocType(DtdParser),
1153}
1154impl BangType {
1155 #[inline(always)]
1156 const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1157 Ok(match byte {
1158 Some(b'[') => Self::CData,
1159 Some(b'-') => Self::Comment,
1160 Some(b'D') | Some(b'd') => Self::DocType(DtdParser::BeforeInternalSubset(0)),
1161 _ => return Err(SyntaxError::InvalidBangMarkup),
1162 })
1163 }
1164
1165 /// If element is finished, returns its content up to `>` symbol and
1166 /// an index of this symbol, otherwise returns `None`
1167 ///
1168 /// # Parameters
1169 /// - `buf`: buffer with data consumed on previous iterations
1170 /// - `chunk`: data read on current iteration and not yet consumed from reader
1171 #[inline(always)]
1172 fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option<usize> {
1173 match self {
1174 Self::Comment => {
1175 for i in memchr::memchr_iter(b'>', chunk) {
1176 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1177 // <!----> - XML comment
1178 // 0123456 - i
1179 if buf.len() + i > 5 {
1180 if chunk[..i].ends_with(b"--") {
1181 // We cannot strip last `--` from the buffer because we need it in case of
1182 // check_comments enabled option. XML standard requires that comment
1183 // will not end with `--->` sequence because this is a special case of
1184 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1185 return Some(i);
1186 }
1187 // End sequence `-|->` was splitted at |
1188 // buf --/ \-- chunk
1189 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1190 return Some(i);
1191 }
1192 // End sequence `--|>` was splitted at |
1193 // buf --/ \-- chunk
1194 if i == 0 && buf.ends_with(b"--") {
1195 return Some(i);
1196 }
1197 }
1198 }
1199 }
1200 Self::CData => {
1201 for i in memchr::memchr_iter(b'>', chunk) {
1202 if chunk[..i].ends_with(b"]]") {
1203 return Some(i);
1204 }
1205 // End sequence `]|]>` was splitted at |
1206 // buf --/ \-- chunk
1207 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1208 return Some(i);
1209 }
1210 // End sequence `]]|>` was splitted at |
1211 // buf --/ \-- chunk
1212 if i == 0 && buf.ends_with(b"]]") {
1213 return Some(i);
1214 }
1215 }
1216 }
1217 Self::DocType(ref mut parser) => return parser.feed(buf, chunk),
1218 }
1219 None
1220 }
1221 #[inline]
1222 const fn to_err(&self) -> SyntaxError {
1223 match self {
1224 Self::CData => SyntaxError::UnclosedCData,
1225 Self::Comment => SyntaxError::UnclosedComment,
1226 Self::DocType(_) => SyntaxError::UnclosedDoctype,
1227 }
1228 }
1229}
1230
1231////////////////////////////////////////////////////////////////////////////////////////////////////
1232
1233#[cfg(test)]
1234mod test {
1235 /// Checks the internal implementation of the various reader methods
1236 macro_rules! check {
1237 (
1238 #[$test:meta]
1239 $read_event:ident,
1240 // constructor of the XML source on which internal functions will be called
1241 $source:path,
1242 $skip:literal,
1243 // constructor of the buffer to which read data will stored
1244 $buf:expr
1245 $(, $async:ident, $await:ident)?
1246 ) => {
1247 mod read_bang_element {
1248 use super::*;
1249 use crate::errors::{Error, SyntaxError};
1250 use crate::reader::{BangType, DtdParser};
1251 use crate::utils::Bytes;
1252
                /// Checks that reading CDATA content works correctly
                mod cdata {
                    use super::*;
                    use pretty_assertions::assert_eq;

                    /// Checks that if input begins like CDATA element, but CDATA start sequence
                    /// is not finished, parsing ends with an error
                    #[$test]
                    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
                    $($async)? fn not_properly_start() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<![]]>other content"[$skip..];
                        //                 ^= 0

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 1);
                    }

                    /// Checks that if CDATA startup sequence was matched, but an end sequence
                    /// is not found, parsing ends with an error
                    #[$test]
                    $($async)? fn not_closed() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<![CDATA[other content"[$skip..];
                        //                 ^= 0                  ^= 22

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        // The whole input is expected to be consumed
                        assert_eq!(position, 22);
                    }

                    /// Checks that CDATA element without content inside parsed successfully
                    #[$test]
                    $($async)? fn empty() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<![CDATA[]]>other content"[$skip..];
                        //                 ^= 0        ^= 12

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::CData, Bytes(b"<![CDATA[]]>"))
                        );
                        assert_eq!(position, 12);
                    }

                    /// Checks that CDATA element with content parsed successfully.
                    /// Additionally checks that sequences inside CDATA that may look like
                    /// a CDATA end sequence do not interrupt CDATA parsing
                    #[$test]
                    $($async)? fn with_content() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<![CDATA[cdata]] ]>content]]>other content]]>"[$skip..];
                        //                 ^= 0                         ^= 29

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::CData, Bytes(b"<![CDATA[cdata]] ]>content]]>"))
                        );
                        assert_eq!(position, 29);
                    }
                }
1337
                /// Checks that reading XML comments works correctly. According to the [specification],
                /// comment data can contain any sequence except `--`:
                ///
                /// ```peg
                /// comment = '<!--' (!'--' char)* '-->';
                /// char = [#x1-#x2C]
                ///      / [#x2E-#xD7FF]
                ///      / [#xE000-#xFFFD]
                ///      / [#x10000-#x10FFFF]
                /// ```
                ///
                /// The presence of this limitation, however, is simply a poorly designed specification
                /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
                /// presence of these sequences by default. These tests allow such content.
                ///
                /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
                mod comment {
                    use super::*;
                    use pretty_assertions::assert_eq;

                    /// Input that starts with `<!-` but not with `<!--` is expected to be
                    /// reported as an unclosed comment
                    #[$test]
                    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
                    $($async)? fn not_properly_start() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!- -->other content"[$skip..];
                        //                  ^= 1

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 1);
                    }

                    /// `<!->` is too short to be a comment: the whole input is consumed
                    /// and an unclosed comment is reported
                    #[$test]
                    $($async)? fn not_properly_end() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!->other content"[$skip..];
                        //                 ^= 0             ^= 17

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 17);
                    }

                    /// A comment start without any `>` after it: the whole input is
                    /// consumed and an unclosed comment is reported
                    #[$test]
                    $($async)? fn not_closed1() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!--other content"[$skip..];
                        //                 ^= 0             ^= 17

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 17);
                    }

                    /// `<!-->` does not close the comment, because the `--` of the start
                    /// sequence cannot also serve as the `--` of the end sequence
                    #[$test]
                    $($async)? fn not_closed2() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!-->other content"[$skip..];
                        //                 ^= 0              ^= 18

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 18);
                    }

                    /// `<!--->` does not close the comment either: one more `-` is needed
                    #[$test]
                    $($async)? fn not_closed3() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!--->other content"[$skip..];
                        //                 ^= 0               ^= 19

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 19);
                    }

                    /// `<!---->` is the shortest comment; it is returned in full and
                    /// reading stops right after it
                    #[$test]
                    $($async)? fn empty() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!---->other content"[$skip..];
                        //                 ^= 0   ^= 7

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::Comment, Bytes(b"<!---->"))
                        );
                        assert_eq!(position, 7);
                    }

                    /// A `>` inside of the comment that is not preceded by `--` (or that
                    /// comes too early) does not finish the comment
                    #[$test]
                    $($async)? fn with_content() {
                        let buf = $buf;
                        let mut position = 0;
                        let mut input = &b"<!--->comment<--->other content"[$skip..];
                        //                 ^= 0              ^= 18

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::Comment, Bytes(b"<!--->comment<--->"))
                        );
                        assert_eq!(position, 18);
                    }
                }
1480
                /// Checks that reading DOCTYPE definition works correctly
                mod doctype {
                    use super::*;

                    /// Tests with the `DOCTYPE` keyword written in upper case
                    mod uppercase {
                        use super::*;
                        use pretty_assertions::assert_eq;

                        /// Input that merely starts with `<!D` and has no `>`: the whole
                        /// input is consumed and an unclosed DOCTYPE is reported
                        #[$test]
                        $($async)? fn not_properly_start() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!D other content"[$skip..];
                            //                 ^= 0             ^= 17

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 17);
                        }

                        /// No space after the keyword and no `>`: the whole input is
                        /// consumed and an unclosed DOCTYPE is reported
                        #[$test]
                        $($async)? fn without_space() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!DOCTYPEother content"[$skip..];
                            //                 ^= 0                  ^= 22

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 22);
                        }

                        /// `<!DOCTYPE>` is returned in full with the parser in the
                        /// `Finished` state; reading stops right after the `>`
                        #[$test]
                        $($async)? fn empty() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!DOCTYPE>other content"[$skip..];
                            //                 ^= 0      ^= 10

                            let (ty, bytes) = $source(&mut input)
                                .read_bang_element(buf, &mut position)
                                $(.$await)?
                                .unwrap();
                            assert_eq!(
                                (ty, Bytes(bytes)),
                                (BangType::DocType(DtdParser::Finished), Bytes(b"<!DOCTYPE>"))
                            );
                            assert_eq!(position, 10);
                        }

                        /// A DOCTYPE without the closing `>`: the whole input is consumed
                        /// and an unclosed DOCTYPE is reported
                        #[$test]
                        $($async)? fn not_closed() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!DOCTYPE other content"[$skip..];
                            //                 ^= 0                   ^= 23

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 23);
                        }
                    }

                    /// Tests with the `doctype` keyword written in lower case
                    mod lowercase {
                        use super::*;
                        use pretty_assertions::assert_eq;

                        /// Input that merely starts with `<!d` and has no `>`: the whole
                        /// input is consumed and an unclosed DOCTYPE is reported
                        #[$test]
                        $($async)? fn not_properly_start() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!d other content"[$skip..];
                            //                 ^= 0             ^= 17

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 17);
                        }

                        /// No space after the keyword and no `>`: the whole input is
                        /// consumed and an unclosed DOCTYPE is reported
                        #[$test]
                        $($async)? fn without_space() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!doctypeother content"[$skip..];
                            //                 ^= 0                  ^= 22

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 22);
                        }

                        /// `<!doctype>` is returned in full with the parser in the
                        /// `Finished` state; reading stops right after the `>`
                        #[$test]
                        $($async)? fn empty() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!doctype>other content"[$skip..];
                            //                 ^= 0      ^= 10

                            let (ty, bytes) = $source(&mut input)
                                .read_bang_element(buf, &mut position)
                                $(.$await)?
                                .unwrap();
                            assert_eq!(
                                (ty, Bytes(bytes)),
                                (BangType::DocType(DtdParser::Finished), Bytes(b"<!doctype>"))
                            );
                            assert_eq!(position, 10);
                        }

                        /// A DOCTYPE without the closing `>`: the whole input is consumed
                        /// and an unclosed DOCTYPE is reported
                        #[$test]
                        $($async)? fn not_closed() {
                            let buf = $buf;
                            let mut position = 0;
                            let mut input = &b"<!doctype other content"[$skip..];
                            //                 ^= 0                   ^= 23

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 23);
                        }
                    }
                }
1633 }
1634
            /// Checks which `ReadTextResult` variant `read_text` produces for the
            /// smallest inputs of each kind. Every test starts with `position == 1`.
            mod read_text {
                use super::*;
                use crate::reader::ReadTextResult;
                use crate::utils::Bytes;
                use pretty_assertions::assert_eq;

                /// Empty input gives `UpToEof` with no bytes and leaves `position` as is
                #[$test]
                $($async)? fn empty() {
                    let buf = $buf;
                    let mut position = 1;
                    let mut input = b"".as_ref();
                    //                ^= 1

                    match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
                        ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"")),
                        x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
                    }
                    assert_eq!(position, 1);
                }

                /// `<` in the first byte gives `Markup` that hands the buffer back;
                /// `position` is expected to stay at 1
                #[$test]
                $($async)? fn markup() {
                    let buf = $buf;
                    let mut position = 1;
                    let mut input = b"<".as_ref();
                    //                ^= 1

                    match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
                        ReadTextResult::Markup(b) => assert_eq!(b, $buf),
                        x => panic!("Expected `Markup(_)`, but got `{:?}`", x),
                    }
                    assert_eq!(position, 1);
                }

                /// `&` in the first byte gives `Ref` that hands the buffer back;
                /// `position` is expected to stay at 1
                #[$test]
                $($async)? fn ref_() {
                    let buf = $buf;
                    let mut position = 1;
                    let mut input = b"&".as_ref();
                    //                ^= 1

                    match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
                        ReadTextResult::Ref(b) => assert_eq!(b, $buf),
                        x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
                    }
                    assert_eq!(position, 1);
                }

                /// Text followed by `<` gives `UpToMarkup` with the text only;
                /// `position` is expected to become 2
                #[$test]
                $($async)? fn up_to_markup() {
                    let buf = $buf;
                    let mut position = 1;
                    let mut input = b"a<".as_ref();
                    //                 ^= 2

                    match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
                        ReadTextResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
                        x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
                    }
                    assert_eq!(position, 2);
                }

                /// Text followed by `&` gives `UpToRef` with the text only;
                /// `position` is expected to become 2
                #[$test]
                $($async)? fn up_to_ref() {
                    let buf = $buf;
                    let mut position = 1;
                    let mut input = b"a&".as_ref();
                    //                 ^= 2

                    match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
                        ReadTextResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
                        x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
                    }
                    assert_eq!(position, 2);
                }

                /// Text that runs to the end of input gives `UpToEof` with the text;
                /// `position` is expected to become 2
                #[$test]
                $($async)? fn up_to_eof() {
                    let buf = $buf;
                    let mut position = 1;
                    let mut input = b"a".as_ref();
                    //                 ^= 2

                    match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
                        ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
                        x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
                    }
                    assert_eq!(position, 2);
                }
            }
1725
1726 mod read_ref {
1727 use super::*;
1728 use crate::reader::ReadRefResult;
1729 use crate::utils::Bytes;
1730 use pretty_assertions::assert_eq;
1731
1732 // Empty input is not allowed for `read_ref` so not tested.
1733 // Borrowed source triggers debug assertion,
1734 // buffered do nothing due to implementation details.
1735
1736 #[$test]
1737 $($async)? fn up_to_eof() {
1738 let buf = $buf;
1739 let mut position = 1;
1740 let mut input = b"&".as_ref();
1741 // ^= 2
1742
1743 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1744 ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1745 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1746 }
1747 assert_eq!(position, 2);
1748 }
1749
1750 #[$test]
1751 $($async)? fn up_to_ref() {
1752 let buf = $buf;
1753 let mut position = 1;
1754 let mut input = b"&&".as_ref();
1755 // ^= 2
1756
1757 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1758 ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1759 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1760 }
1761 assert_eq!(position, 2);
1762 }
1763
1764 #[$test]
1765 $($async)? fn up_to_markup() {
1766 let buf = $buf;
1767 let mut position = 1;
1768 let mut input = b"&<".as_ref();
1769 // ^= 2
1770
1771 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1772 ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1773 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1774 }
1775 assert_eq!(position, 2);
1776 }
1777
1778 #[$test]
1779 $($async)? fn empty_ref() {
1780 let buf = $buf;
1781 let mut position = 1;
1782 let mut input = b"&;".as_ref();
1783 // ^= 3
1784
1785 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1786 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&;")),
1787 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1788 }
1789 assert_eq!(position, 3);
1790 }
1791
1792 #[$test]
1793 $($async)? fn normal() {
1794 let buf = $buf;
1795 let mut position = 1;
1796 let mut input = b"<".as_ref();
1797 // ^= 5
1798
1799 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1800 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")),
1801 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1802 }
1803 assert_eq!(position, 5);
1804 }
1805 }
1806
1807 mod read_element {
1808 use super::*;
1809 use crate::errors::{Error, SyntaxError};
1810 use crate::parser::ElementParser;
1811 use crate::utils::Bytes;
1812 use pretty_assertions::assert_eq;
1813
                /// Checks that reading an element from an input that holds only `<`
                /// fails with `UnclosedTag`.
                /// `<` is read in peek_one that is called before read_with, that is why
                /// it is in the input buffer.
                /// peek_one, however, does not increment position for simplicity of the code
                #[$test]
                $($async)? fn empty() {
                    let buf = $buf;
                    let mut position = 0;
                    let mut input = &b"<"[$skip..];
                    //                  ^= 1

                    match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
                        Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
                        x => panic!(
                            "Expected `Err(Syntax(_))`, but got `{:?}`",
                            x
                        ),
                    }
                    assert_eq!(position, 1);
                }
1833
1834 mod open {
1835 use super::*;
1836 use pretty_assertions::assert_eq;
1837
1838 #[$test]
1839 $($async)? fn empty_tag() {
1840 let buf = $buf;
1841 let mut position = 0;
1842 let mut input = &b"<>"[$skip..];
1843 // ^= 2
1844
1845 assert_eq!(
1846 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1847 Bytes(b"<>")
1848 );
1849 assert_eq!(position, 2);
1850 }
1851
1852 #[$test]
1853 $($async)? fn normal() {
1854 let buf = $buf;
1855 let mut position = 0;
1856 let mut input = &b"<tag>"[$skip..];
1857 // ^= 5
1858
1859 assert_eq!(
1860 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1861 Bytes(b"<tag>")
1862 );
1863 assert_eq!(position, 5);
1864 }
1865
1866 #[$test]
1867 $($async)? fn empty_ns_empty_tag() {
1868 let buf = $buf;
1869 let mut position = 0;
1870 let mut input = &b"<:>"[$skip..];
1871 // ^= 3
1872
1873 assert_eq!(
1874 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1875 Bytes(b"<:>")
1876 );
1877 assert_eq!(position, 3);
1878 }
1879
1880 #[$test]
1881 $($async)? fn empty_ns() {
1882 let buf = $buf;
1883 let mut position = 0;
1884 let mut input = &b"<:tag>"[$skip..];
1885 // ^= 6
1886
1887 assert_eq!(
1888 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1889 Bytes(b"<:tag>")
1890 );
1891 assert_eq!(position, 6);
1892 }
1893
1894 #[$test]
1895 $($async)? fn with_attributes() {
1896 let buf = $buf;
1897 let mut position = 0;
1898 let mut input = &br#"<tag attr-1=">" attr2 = '>' 3attr>"#[$skip..];
1899 // ^= 39
1900
1901 assert_eq!(
1902 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1903 Bytes(br#"<tag attr-1=">" attr2 = '>' 3attr>"#)
1904 );
1905 assert_eq!(position, 39);
1906 }
1907 }
1908
1909 mod self_closed {
1910 use super::*;
1911 use pretty_assertions::assert_eq;
1912
1913 #[$test]
1914 $($async)? fn empty_tag() {
1915 let buf = $buf;
1916 let mut position = 0;
1917 let mut input = &b"</>"[$skip..];
1918 // ^= 3
1919
1920 assert_eq!(
1921 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1922 Bytes(b"</>")
1923 );
1924 assert_eq!(position, 3);
1925 }
1926
1927 #[$test]
1928 $($async)? fn normal() {
1929 let buf = $buf;
1930 let mut position = 0;
1931 let mut input = &b"<tag/>"[$skip..];
1932 // ^= 6
1933
1934 assert_eq!(
1935 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1936 Bytes(b"<tag/>")
1937 );
1938 assert_eq!(position, 6);
1939 }
1940
1941 #[$test]
1942 $($async)? fn empty_ns_empty_tag() {
1943 let buf = $buf;
1944 let mut position = 0;
1945 let mut input = &b"<:/>"[$skip..];
1946 // ^= 4
1947
1948 assert_eq!(
1949 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1950 Bytes(b"<:/>")
1951 );
1952 assert_eq!(position, 4);
1953 }
1954
1955 #[$test]
1956 $($async)? fn empty_ns() {
1957 let buf = $buf;
1958 let mut position = 0;
1959 let mut input = &b"<:tag/>"[$skip..];
1960 // ^= 7
1961
1962 assert_eq!(
1963 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1964 Bytes(b"<:tag/>")
1965 );
1966 assert_eq!(position, 7);
1967 }
1968
1969 #[$test]
1970 $($async)? fn with_attributes() {
1971 let buf = $buf;
1972 let mut position = 0;
1973 let mut input = &br#"<tag attr-1="/>" attr2 = '/>' 3attr/>"#[$skip..];
1974 // ^= 42
1975
1976 assert_eq!(
1977 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1978 Bytes(br#"<tag attr-1="/>" attr2 = '/>' 3attr/>"#)
1979 );
1980 assert_eq!(position, 42);
1981 }
1982 }
1983
1984 mod close {
1985 use super::*;
1986 use pretty_assertions::assert_eq;
1987
1988 #[$test]
1989 $($async)? fn empty_tag() {
1990 let buf = $buf;
1991 let mut position = 0;
1992 let mut input = &b"</ >"[$skip..];
1993 // ^= 4
1994
1995 assert_eq!(
1996 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1997 Bytes(b"</ >")
1998 );
1999 assert_eq!(position, 4);
2000 }
2001
2002 #[$test]
2003 $($async)? fn normal() {
2004 let buf = $buf;
2005 let mut position = 0;
2006 let mut input = &b"</tag>"[$skip..];
2007 // ^= 6
2008
2009 assert_eq!(
2010 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2011 Bytes(b"</tag>")
2012 );
2013 assert_eq!(position, 6);
2014 }
2015
2016 #[$test]
2017 $($async)? fn empty_ns_empty_tag() {
2018 let buf = $buf;
2019 let mut position = 0;
2020 let mut input = &b"</:>"[$skip..];
2021 // ^= 4
2022
2023 assert_eq!(
2024 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2025 Bytes(b"</:>")
2026 );
2027 assert_eq!(position, 4);
2028 }
2029
2030 #[$test]
2031 $($async)? fn empty_ns() {
2032 let buf = $buf;
2033 let mut position = 0;
2034 let mut input = &b"</:tag>"[$skip..];
2035 // ^= 7
2036
2037 assert_eq!(
2038 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2039 Bytes(b"</:tag>")
2040 );
2041 assert_eq!(position, 7);
2042 }
2043
2044 #[$test]
2045 $($async)? fn with_attributes() {
2046 let buf = $buf;
2047 let mut position = 0;
2048 let mut input = &br#"</tag attr-1=">" attr2 = '>' 3attr>"#[$skip..];
2049 // ^= 40
2050
2051 assert_eq!(
2052 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2053 Bytes(br#"</tag attr-1=">" attr2 = '>' 3attr>"#)
2054 );
2055 assert_eq!(position, 40);
2056 }
2057 }
2058 }
2059
2060 /// Ensures, that no empty `Text` events are generated
2061 mod $read_event {
2062 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
2063 use crate::reader::Reader;
2064 use pretty_assertions::assert_eq;
2065
2066 /// When `encoding` feature is enabled, encoding should be detected
2067 /// from BOM (UTF-8) and BOM should be stripped.
2068 ///
2069 /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
2070 /// character should be stripped for consistency
2071 #[$test]
2072 $($async)? fn bom_from_reader() {
2073 let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
2074
2075 assert_eq!(
2076 reader.$read_event($buf) $(.$await)? .unwrap(),
2077 Event::Text(BytesText::from_escaped("\u{feff}"))
2078 );
2079
2080 assert_eq!(
2081 reader.$read_event($buf) $(.$await)? .unwrap(),
2082 Event::Eof
2083 );
2084 }
2085
2086 /// When parsing from &str, encoding is fixed (UTF-8), so
2087 /// - when `encoding` feature is disabled, the behavior the
2088 /// same as in `bom_from_reader` text
2089 /// - when `encoding` feature is enabled, the behavior should
2090 /// stay consistent, so the first BOM character is stripped
2091 #[$test]
2092 $($async)? fn bom_from_str() {
2093 let mut reader = Reader::from_str("\u{feff}\u{feff}");
2094
2095 assert_eq!(
2096 reader.$read_event($buf) $(.$await)? .unwrap(),
2097 Event::Text(BytesText::from_escaped("\u{feff}"))
2098 );
2099
2100 assert_eq!(
2101 reader.$read_event($buf) $(.$await)? .unwrap(),
2102 Event::Eof
2103 );
2104 }
2105
2106 #[$test]
2107 $($async)? fn declaration() {
2108 let mut reader = Reader::from_str("<?xml ?>");
2109
2110 assert_eq!(
2111 reader.$read_event($buf) $(.$await)? .unwrap(),
2112 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
2113 );
2114 }
2115
2116 #[$test]
2117 $($async)? fn doctype() {
2118 let mut reader = Reader::from_str("<!DOCTYPE x>");
2119
2120 assert_eq!(
2121 reader.$read_event($buf) $(.$await)? .unwrap(),
2122 Event::DocType(BytesText::from_escaped("x"))
2123 );
2124 }
2125
2126 #[$test]
2127 $($async)? fn processing_instruction() {
2128 let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
2129
2130 assert_eq!(
2131 reader.$read_event($buf) $(.$await)? .unwrap(),
2132 Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
2133 );
2134 }
2135
2136 /// Lone closing tags are not allowed, so testing it together with start tag
2137 #[$test]
2138 $($async)? fn start_and_end() {
2139 let mut reader = Reader::from_str("<tag></tag>");
2140
2141 assert_eq!(
2142 reader.$read_event($buf) $(.$await)? .unwrap(),
2143 Event::Start(BytesStart::new("tag"))
2144 );
2145
2146 assert_eq!(
2147 reader.$read_event($buf) $(.$await)? .unwrap(),
2148 Event::End(BytesEnd::new("tag"))
2149 );
2150 }
2151
2152 #[$test]
2153 $($async)? fn empty() {
2154 let mut reader = Reader::from_str("<tag/>");
2155
2156 assert_eq!(
2157 reader.$read_event($buf) $(.$await)? .unwrap(),
2158 Event::Empty(BytesStart::new("tag"))
2159 );
2160 }
2161
2162 #[$test]
2163 $($async)? fn text() {
2164 let mut reader = Reader::from_str("text");
2165
2166 assert_eq!(
2167 reader.$read_event($buf) $(.$await)? .unwrap(),
2168 Event::Text(BytesText::from_escaped("text"))
2169 );
2170 }
2171
2172 #[$test]
2173 $($async)? fn cdata() {
2174 let mut reader = Reader::from_str("<![CDATA[]]>");
2175
2176 assert_eq!(
2177 reader.$read_event($buf) $(.$await)? .unwrap(),
2178 Event::CData(BytesCData::new(""))
2179 );
2180 }
2181
2182 #[$test]
2183 $($async)? fn comment() {
2184 let mut reader = Reader::from_str("<!---->");
2185
2186 assert_eq!(
2187 reader.$read_event($buf) $(.$await)? .unwrap(),
2188 Event::Comment(BytesText::from_escaped(""))
2189 );
2190 }
2191
2192 #[$test]
2193 $($async)? fn eof() {
2194 let mut reader = Reader::from_str("");
2195
2196 assert_eq!(
2197 reader.$read_event($buf) $(.$await)? .unwrap(),
2198 Event::Eof
2199 );
2200 }
2201 }
2202 };
2203 }
2204
2205 // Export macros for the child modules:
2206 // - buffered_reader
2207 // - slice_reader
2208 pub(super) use check;
2209}