quick_xml/reader/mod.rs
1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9#[cfg(feature = "encoding")]
10use crate::encoding::DetectedEncoding;
11use crate::errors::{Error, IllFormedError, SyntaxError};
12use crate::events::{BytesRef, Event};
13use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
14use crate::reader::state::ReaderState;
15
/// A struct that holds a parser configuration.
///
/// Current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by changing properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether a lone ampersand character (without a paired semicolon) should be
    /// allowed in textual content. Unless enabled, in case of a dangling ampersand,
    /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
    ///
    /// Default: `false`
    ///
    /// # Example
    ///
    /// ```
    /// # use quick_xml::events::{BytesRef, BytesText, Event};
    /// # use quick_xml::reader::Reader;
    /// # use pretty_assertions::assert_eq;
    /// let mut reader = Reader::from_str("text with & &amp; & alone");
    /// reader.config_mut().allow_dangling_amp = true;
    ///
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
    pub allow_dangling_amp: bool,

    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}
205
206impl Config {
207 /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
208 ///
209 /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
210 ///
211 /// WARNING: With this option every text events will be trimmed which is
212 /// incorrect behavior when text events delimited by comments, processing
213 /// instructions or CDATA sections. To correctly trim data manually apply
214 /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
215 /// only to necessary events.
216 /// </div>
217 ///
218 /// [`trim_text_start`]: Self::trim_text_start
219 /// [`trim_text_end`]: Self::trim_text_end
220 /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
221 /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
222 #[inline]
223 pub fn trim_text(&mut self, trim: bool) {
224 self.trim_text_start = trim;
225 self.trim_text_end = trim;
226 }
227
228 /// Turn on or off all checks for well-formedness. Currently it is that settings:
229 /// - [`check_comments`](Self::check_comments)
230 /// - [`check_end_names`](Self::check_end_names)
231 #[inline]
232 pub fn enable_all_checks(&mut self, enable: bool) {
233 self.check_comments = enable;
234 self.check_end_names = enable;
235 }
236}
237
238impl Default for Config {
239 fn default() -> Self {
240 Self {
241 allow_dangling_amp: false,
242 allow_unmatched_ends: false,
243 check_comments: false,
244 check_end_names: true,
245 expand_empty_elements: false,
246 trim_markup_names_in_closing_tags: true,
247 trim_text_start: false,
248 trim_text_end: false,
249 }
250 }
251}
252
253////////////////////////////////////////////////////////////////////////////////////////////////////
254
/// Shared implementation of reading one event: dispatches on the current
/// `ParseState` stored in `$self.state.state` and loops until some state
/// produces an event or an error.
///
/// Arguments:
/// - `$self`: the reader; its `state` field is inspected and updated
/// - `$buf`: the buffer passed to the `read_*` methods of `$reader`. It is
///   re-assigned when a read returns the buffer back without producing an event
/// - `$reader`: the source the bytes are read from
/// - `$read_until_close`: name of the method of `$self` that is called in the
///   `InsideMarkup` state
/// - `$await`: optional token appended (as `.$await`) to every call on
///   `$reader` and to the `$read_until_close` call
///
/// After the loop, any error except `Error::IllFormed` and the `Eof` event
/// switch the reader to the `Done` state. The macro evaluates to the produced
/// result.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        // Arms that only switch the state use `continue` to dispatch again;
        // all other arms `break` out of the loop with the result
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If encoding is set explicitly, we do not need to detect it. For example,
                    // explicit UTF-8 is set automatically if Reader was created using `from_str`.
                    // But we still need to remove the BOM for consistency with the code path
                    // used when the `encoding` feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding.encoding());
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideRef => { // Go to InsideText
                    // Remember where the reference starts to report errors at that position
                    let start = $self.state.offset;
                    match $reader.read_ref($buf, &mut $self.state.offset) $(.$await)? {
                        // Emit reference, go to InsideText state
                        ReadRefResult::Ref(bytes) => {
                            $self.state.state = ParseState::InsideText;
                            // +1 to skip start `&`
                            // -1 to skip end `;`
                            Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..bytes.len() - 1], $self.decoder())))
                        }
                        // Go to Done state
                        ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::Done;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToEof(_) => {
                            $self.state.state = ParseState::Done;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Do not change state, stay in InsideRef
                        ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToRef(_) => {
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        // Go to InsideMarkup state
                        ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
                            $self.state.state = ParseState::InsideMarkup;
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadRefResult::UpToMarkup(_) => {
                            $self.state.state = ParseState::InsideMarkup;
                            $self.state.last_error_offset = start;
                            Err(Error::IllFormed(IllFormedError::UnclosedReference))
                        }
                        ReadRefResult::Err(e) => Err(Error::from(e)),
                    }
                }
                ParseState::InsideText => { // Go to InsideMarkup, InsideRef or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::Ref(buf) => {
                            $self.state.state = ParseState::InsideRef;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToRef(bytes) => {
                            $self.state.state = ParseState::InsideRef;
                            // Emit a Text event with the `bytes` content; the reference
                            // itself is handled by the InsideRef state on the next read
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            // Do not emit an empty Text event at the end of input
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::from(e)),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we have already consumed the wrong data
            // and changed the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error and the end of input stop the parsing
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
384
/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state.
///
/// If reading of the markup fails, or the input ends right away,
/// `last_error_offset` is set to the offset that the reader had when the macro
/// was entered. An error from peeking the next byte is converted with
/// `Error::from` and returned without touching `last_error_offset`.
///
/// Arguments:
/// - `$self`: the reader; its `state` field is inspected and updated
/// - `$buf`: the buffer passed to the `read_*` methods of `$reader`
/// - `$reader`: the source the bytes are read from
/// - `$await`: optional token appended (as `.$await`) to every call on `$reader`
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        // The state is switched before reading, so it is already set
        // when any of the arms below returns
        $self.state.state = ParseState::InsideText;

        // Offset used for error reporting
        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we allow parsing them anyway because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which the user should call manually
            // - if we just looked for `>` we would parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >`, which probably no existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`
                    $self.state.last_error_offset = start;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`
                $self.state.last_error_offset = start;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(Error::from(e)),
        }
    }};
}
482
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until an `End` event with the name `$end` is found at the same
/// nesting depth at which the macro was entered: each `Start` event with that
/// name increases the depth and each matching `End` event decreases it.
///
/// Evaluates to the range `start..end`, where `start` is the buffer position
/// at the moment the macro is entered and `end` is the buffer position taken
/// just before the matching `End` event was read.
///
/// On a read error, or when `Eof` is reached before the matching `End` event,
/// the macro executes `return Err(...)` in the enclosing function.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        // Name of the method of `$self` that reads the next event
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        // Optional token appended (as `.$await`) to the `$read_event` call
        $(, $await:ident)?
    ) => {{
        // Because we take the position after the event that precedes the End event,
        // it is important that this position indicates the beginning of the End event.
        // If there were only spaces between the last event and the End event, then we
        // would take the position before the spaces, but the spaces would be skipped
        // without generating an event if `trim_text_start` is set to `true`. To prevent
        // that we temporarily disable start text trimming.
        //
        // We also cannot take the position after getting the End event, because if
        // `trim_markup_names_in_closing_tags` is set to `true` (which is the default),
        // we do not know the real size that the End event occupies in the source and
        // cannot correct the position after the End event.
        // So, in any case we should tweak the parser configuration.
        let config = $self.config_mut();
        let trim = config.trim_text_start;
        config.trim_text_start = false;

        let start = $self.buffer_position();
        let mut depth = 0;
        loop {
            $clear
            // Position before the next event; used as the end of the span
            // if that event turns out to be the matching End event
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => {
                    // Restore the original setting on every exit path
                    $self.config_mut().trim_text_start = trim;
                    return Err(e);
                }

                // Nested element with the same name: its End event must be skipped
                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        $self.config_mut().trim_text_start = trim;
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                _ => (),
            }
        }
    }};
}
537
538#[cfg(feature = "async-tokio")]
539mod async_tokio;
540mod buffered_reader;
541mod ns_reader;
542mod slice_reader;
543mod state;
544
545pub use ns_reader::NsReader;
546
/// Half-open range of byte offsets in the input that corresponds to some
/// piece of XML.
pub type Span = Range<u64>;
549
550////////////////////////////////////////////////////////////////////////////////////////////////////
551
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init         -- "(no event)"\n                                       --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText   -- "#lt;false#gt;\n(no event)"\nText                    --> InsideMarkup
///     InsideText   -- "(no event)"\nText                                   --> InsideRef
///     InsideRef    -- "(no event)"\nGeneralRef                             --> InsideText
///   end
///   InsideText -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty -- End --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. On the first
    /// read the encoding is detected (or, when the `encoding` feature is
    /// disabled, the UTF-8 BOM is removed) and the reader moves to the
    /// `InsideText` state. No event is emitted during this transition.
    /// The reader will never return to this state.
    Init,
    /// State after seeing the `&` symbol in textual content.
    ///
    /// A complete reference produces a `GeneralRef` event and moves the reader
    /// to the `InsideText` state. If the reference is not closed, a `Text`
    /// event is produced when [`Config::allow_dangling_amp`] is set, otherwise
    /// an `IllFormedError::UnclosedReference` error is returned; the reader
    /// then stays in `InsideRef` or moves to the `InsideMarkup` or `Done`
    /// state, depending on the kind of `ReadRefResult` that was read.
    InsideRef,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup. All
    /// bytes before that symbol will be returned in the [`Event::Text`] event.
    /// After that the reader moves to the `InsideMarkup` state.
    ///
    /// If the start of a reference is found first, the reader moves to the
    /// `InsideRef` state instead; if the input ends, it moves to the `Done` state.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in the `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when the `Eof` event is generated or when an
    /// error other than `Error::IllFormed` occurs. An ill-formed error by itself
    /// does not move the reader to this state, so parsing can be continued after it.
    /// This is the last state, the reader stays in it forever.
    Done,
}
602
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM, nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(encoding)
            | Self::Explicit(encoding)
            | Self::BomDetected(encoding)
            | Self::XmlDetected(encoding) => encoding,
        }
    }

    /// Tells whether the encoding may still be replaced by a more precise one:
    /// `false` for explicitly set encodings and for encodings taken from the
    /// XML declaration, `true` for implicit and BOM-detected ones.
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
649
650////////////////////////////////////////////////////////////////////////////////////////////////////
651
/// A direct stream to the underlying [`Reader`]s reader which updates
/// [`Reader::buffer_position()`] when read from it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped source of bytes
    inner: &'r mut R,
    /// Byte counter that is advanced by the amount of data read or consumed
    /// through this stream
    offset: &'r mut u64,
}

impl<'r, R> BinaryStream<'r, R> {
    /// Returns current position in bytes in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        let position: u64 = *self.offset;
        position
    }

    /// Gets a shared reference to the wrapped reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        &*self.inner
    }

    /// Gets a mutable reference to the wrapped reader.
    ///
    /// Avoid reading from that reader directly: doing so does not update the
    /// tracked position and will lead to incorrect positions of errors.
    /// Read from this stream instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        &mut *self.inner
    }
}

impl<'r, R> io::Read for BinaryStream<'r, R>
where
    R: io::Read,
{
    /// Reads from the wrapped reader and advances the tracked offset by the
    /// number of bytes actually read. The offset is untouched on error.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|count| {
            *self.offset += count as u64;
            count
        })
    }
}

impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Exposes the internal buffer of the wrapped reader; the tracked offset
    /// changes only when the data is consumed.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        io::BufRead::fill_buf(&mut *self.inner)
    }

    /// Consumes `amt` bytes from the wrapped reader, then advances the
    /// tracked offset by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        let advanced = amt as u64;
        self.inner.consume(amt);
        *self.offset += advanced;
    }
}
711
712////////////////////////////////////////////////////////////////////////////////////////////////////
713
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.config_mut().trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Debug, Clone)]
pub struct Reader<R> {
    /// Source of data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
775
776/// Builder methods
777impl<R> Reader<R> {
778 /// Creates a `Reader` that reads from a given reader.
779 pub fn from_reader(reader: R) -> Self {
780 Self {
781 reader,
782 state: ReaderState::default(),
783 }
784 }
785
786 /// Returns reference to the parser configuration
787 pub const fn config(&self) -> &Config {
788 &self.state.config
789 }
790
791 /// Returns mutable reference to the parser configuration
792 pub fn config_mut(&mut self) -> &mut Config {
793 &mut self.state.config
794 }
795}
796
797/// Getters
798impl<R> Reader<R> {
799 /// Consumes `Reader` returning the underlying reader
800 ///
801 /// Can be used to compute line and column of a parsing error position
802 ///
803 /// # Examples
804 ///
805 /// ```
806 /// # use pretty_assertions::assert_eq;
807 /// use std::{str, io::Cursor};
808 /// use quick_xml::events::Event;
809 /// use quick_xml::reader::Reader;
810 ///
811 /// let xml = r#"<tag1 att1 = "test">
812 /// <tag2><!--Test comment-->Test</tag2>
813 /// <tag3>Test 2</tag3>
814 /// </tag1>"#;
815 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
816 /// let mut buf = Vec::new();
817 ///
818 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
819 /// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
820 /// let end_pos = reader.buffer_position() as usize;
821 /// let mut cursor = reader.into_inner();
822 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
823 /// .expect("can't make a string");
824 /// let mut line = 1;
825 /// let mut column = 0;
826 /// for c in s.chars() {
827 /// if c == '\n' {
828 /// line += 1;
829 /// column = 0;
830 /// } else {
831 /// column += 1;
832 /// }
833 /// }
834 /// (line, column)
835 /// }
836 ///
837 /// loop {
838 /// match reader.read_event_into(&mut buf) {
839 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
840 /// b"tag1" | b"tag2" => (),
841 /// tag => {
842 /// assert_eq!(b"tag3", tag);
843 /// assert_eq!((3, 22), into_line_and_column(reader));
844 /// break;
845 /// }
846 /// },
847 /// Ok(Event::Eof) => unreachable!(),
848 /// _ => (),
849 /// }
850 /// buf.clear();
851 /// }
852 /// ```
853 pub fn into_inner(self) -> R {
854 self.reader
855 }
856
857 /// Gets a reference to the underlying reader.
858 pub const fn get_ref(&self) -> &R {
859 &self.reader
860 }
861
862 /// Gets a mutable reference to the underlying reader.
863 ///
864 /// Avoid read from this reader because this will not update reader's position
865 /// and will lead to incorrect positions of errors. If you want to read, use
866 /// [`stream()`] instead.
867 ///
868 /// [`stream()`]: Self::stream
869 pub fn get_mut(&mut self) -> &mut R {
870 &mut self.reader
871 }
872
873 /// Gets the byte position in the input data just after the last emitted event
874 /// (i.e. this is position where data of last event ends).
875 ///
876 /// Note, that for text events which is originally ended with whitespace characters
877 /// (` `, `\t`, `\r`, and `\n`) if [`Config::trim_text_end`] is set this is position
878 /// before trim, not the position of the last byte of the [`Event::Text`] content.
879 pub const fn buffer_position(&self) -> u64 {
880 self.state.offset
881 }
882
883 /// Gets the last error byte position in the input data. If there is no errors
884 /// yet, returns `0`.
885 ///
886 /// Unlike `buffer_position` it will point to the place where it is rational
887 /// to report error to the end user. For example, all [`SyntaxError`]s are
888 /// reported when the parser sees EOF inside of some kind of markup. The
889 /// `buffer_position()` will point to the last byte of input which is not
890 /// very useful. `error_position()` will point to the start of corresponding
891 /// markup element (i. e. to the `<` character).
892 ///
893 /// This position is always `<= buffer_position()`.
894 pub const fn error_position(&self) -> u64 {
895 self.state.last_error_offset
896 }
897
898 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
899 ///
900 /// If [`encoding`] feature is enabled, the used encoding may change after
901 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
902 ///
903 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
904 /// defaults to UTF-8.
905 ///
906 /// [`encoding`]: ../index.html#encoding
907 #[inline]
908 pub const fn decoder(&self) -> Decoder {
909 self.state.decoder()
910 }
911
912 /// Get the direct access to the underlying reader, but tracks the amount of
913 /// read data and update [`Reader::buffer_position()`] accordingly.
914 ///
915 /// Note, that this method gives you access to the internal reader and read
916 /// data will not be returned in any subsequent events read by `read_event`
917 /// family of methods.
918 ///
919 /// # Example
920 ///
921 /// This example demonstrates how to read stream raw bytes from an XML document.
922 /// This could be used to implement streaming read of text, or to read raw binary
923 /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
924 /// valid XML, but XML-derived file formats exist where such documents are valid).
925 ///
926 /// ```
927 /// # use pretty_assertions::assert_eq;
928 /// use std::io::{BufRead, Read};
929 /// use quick_xml::events::{BytesEnd, BytesStart, Event};
930 /// use quick_xml::reader::Reader;
931 ///
932 /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
933 /// // ^ ^ ^ ^
934 /// // 0 5 21 27
935 ///
936 /// assert_eq!(
937 /// (reader.read_event().unwrap(), reader.buffer_position()),
938 /// // 5 - end of the `<tag>`
939 /// (Event::Start(BytesStart::new("tag")), 5)
940 /// );
941 ///
942 /// // Reading directly from underlying reader will not update position
943 /// // let mut inner = reader.get_mut();
944 ///
945 /// // Reading from the stream() advances position
946 /// let mut inner = reader.stream();
947 ///
948 /// // Read binary data. We must know its size
949 /// let mut binary = [0u8; 16];
950 /// inner.read_exact(&mut binary).unwrap();
951 /// assert_eq!(&binary, b"binary << data&>");
952 /// // 21 - end of the `binary << data&>`
953 /// assert_eq!(inner.offset(), 21);
954 /// assert_eq!(reader.buffer_position(), 21);
955 ///
956 /// assert_eq!(
957 /// (reader.read_event().unwrap(), reader.buffer_position()),
958 /// // 27 - end of the `</tag>`
959 /// (Event::End(BytesEnd::new("tag")), 27)
960 /// );
961 ///
962 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
963 /// ```
964 #[inline]
965 pub fn stream(&mut self) -> BinaryStream<'_, R> {
966 BinaryStream {
967 inner: &mut self.reader,
968 offset: &mut self.state.offset,
969 }
970 }
971}
972
/// Private sync reading methods
///
/// Both methods are thin wrappers: their bodies are generated by the
/// `read_event_impl!` and `read_until_close!` macros, which are defined
/// outside of this block.
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// # Parameters
    /// - `buf`: buffer handed over to the `read_event_impl!` macro together with
    ///   the underlying reader; its type `B` is dictated by the [`XmlSource`]
    ///   implementation of `R`
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        // The last argument names the method to use for markup; presumably the
        // macro calls `self.read_until_close(..)` once `<` is seen — confirm
        // against the macro definition.
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// # Parameters
    /// - `buf`: buffer handed over to the `read_until_close!` macro together
    ///   with the underlying reader
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
994
995////////////////////////////////////////////////////////////////////////////////////////////////////
996
/// Result of an attempt to read XML textual data from the source.
///
/// The `Markup` and `Ref` variants report that the stop character was the very
/// first byte and give the buffer back to the caller; the `UpTo*` variants carry
/// the text that precedes the stop condition.
// NOTE(review): the `read_text` unit tests expect `position` to stay unchanged
// for the `<` byte (input `<` leaves it at 1, input `a<` moves it from 1 to 2),
// while `position` is documented to grow by the amount of consumed bytes.
// Confirm whether `<` is really consumed as the variant docs below state.
#[derive(Debug)]
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte. `<` was consumed.
    /// Contains buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Markup(B),
    /// Start of reference (`&` character) was found in the first byte.
    /// `&` was not consumed.
    /// Contains buffer that should be returned back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Ref(B),
    /// Contains text block up to start of markup (`<` character). `<` was consumed.
    UpToMarkup(&'r [u8]),
    /// Contains text block up to start of reference (`&` character).
    /// `&` was not consumed.
    UpToRef(&'r [u8]),
    /// Contains text block up to EOF; neither start of markup (`<` character)
    /// nor start of reference (`&` character) was found.
    UpToEof(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1020
/// Result of an attempt to read general reference from the reader.
///
/// Every data-carrying variant starts with the `&` character that opened the
/// (possible) reference.
#[derive(Debug)]
enum ReadRefResult<'r> {
    /// Contains text block up to end of reference (`;` character).
    /// Result includes both the start `&` and the end `;`
    /// (the unit tests expect `&;` to be returned for the input `&;`).
    Ref(&'r [u8]),
    /// Contains text block up to EOF. None of the end of reference (`;`), start of
    /// another reference (`&`) or start of markup (`<`) characters was found.
    /// Result includes start `&`.
    UpToEof(&'r [u8]),
    /// Contains text block up to next possible reference (`&` character).
    /// Result includes start `&`.
    UpToRef(&'r [u8]),
    /// Contains text block up to start of markup (`<` character).
    /// Result includes start `&`.
    UpToMarkup(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
1040
1041/// Represents an input for a reader that can return borrowed data.
1042///
1043/// There are two implementors of this trait: generic one that read data from
1044/// `Self`, copies some part of it into a provided buffer of type `B` and then
1045/// returns data that borrow from that buffer.
1046///
1047/// The other implementor is for `&[u8]` and instead of copying data returns
1048/// borrowed data from `Self` instead. This implementation allows zero-copy
1049/// deserialization.
1050///
1051/// # Parameters
1052/// - `'r`: lifetime of a buffer from which events will borrow
1053/// - `B`: a type of a buffer that can be used to store data read from `Self` and
1054/// from which events can borrow
1055trait XmlSource<'r, B> {
1056 /// Removes UTF-8 BOM if it is present
1057 #[cfg(not(feature = "encoding"))]
1058 fn remove_utf8_bom(&mut self) -> io::Result<()>;
1059
1060 /// Determines encoding from the start of input and removes BOM if it is present
1061 #[cfg(feature = "encoding")]
1062 fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>>;
1063
1064 /// Read input until start of markup (the `<`) is found, start of general entity
1065 /// reference (the `&`) is found or end of input is reached.
1066 ///
1067 /// # Parameters
1068 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1069 /// from which [events] could borrow their data
1070 /// - `position`: Will be increased by amount of bytes consumed
1071 ///
1072 /// [events]: crate::events::Event
1073 fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;
1074
1075 /// Read input until end of general reference (the `;`) is found, start of
1076 /// another general reference (the `&`) is found or end of input is reached.
1077 ///
1078 /// This method must be called when current character is `&`.
1079 ///
1080 /// # Parameters
1081 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1082 /// from which [events] could borrow their data
1083 /// - `position`: Will be increased by amount of bytes consumed
1084 ///
1085 /// [events]: crate::events::Event
1086 fn read_ref(&mut self, buf: B, position: &mut u64) -> ReadRefResult<'r>;
1087
1088 /// Read input until processing instruction is finished.
1089 ///
1090 /// This method expect that start sequence of a parser already was read.
1091 ///
1092 /// Returns a slice of data read up to the end of the thing being parsed.
1093 /// The end of thing and the returned content is determined by the used parser.
1094 ///
1095 /// If input (`Self`) is exhausted and no bytes was read, or if the specified
1096 /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
1097 ///
1098 /// # Parameters
1099 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1100 /// from which [events] could borrow their data
1101 /// - `position`: Will be increased by amount of bytes consumed
1102 ///
1103 /// A `P` type parameter is used to preserve state between calls to the underlying
1104 /// reader which provides bytes fed into the parser.
1105 ///
1106 /// [events]: crate::events::Event
1107 fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
1108 where
1109 P: Parser;
1110
1111 /// Read input until comment or CDATA is finished.
1112 ///
1113 /// This method expect that `<` already was read.
1114 ///
1115 /// Returns a slice of data read up to end of comment or CDATA (`>`),
1116 /// which does not include into result.
1117 ///
1118 /// If input (`Self`) is exhausted and nothing was read, returns `None`.
1119 ///
1120 /// # Parameters
1121 /// - `buf`: Buffer that could be filled from an input (`Self`) and
1122 /// from which [events] could borrow their data
1123 /// - `position`: Will be increased by amount of bytes consumed
1124 ///
1125 /// [events]: crate::events::Event
1126 fn read_bang_element(
1127 &mut self,
1128 buf: B,
1129 position: &mut u64,
1130 ) -> Result<(BangType, &'r [u8]), Error>;
1131
1132 /// Consume and discard all the whitespace until the next non-whitespace
1133 /// character or EOF.
1134 ///
1135 /// # Parameters
1136 /// - `position`: Will be increased by amount of bytes consumed
1137 fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;
1138
1139 /// Return one character without consuming it, so that future `read_*` calls
1140 /// will still include it. On EOF, return `None`.
1141 fn peek_one(&mut self) -> io::Result<Option<u8>>;
1142}
1143
/// Possible elements started with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Holds the state of the [`DtdParser`] to which the search
    /// for the end of the declaration is delegated.
    DocType(DtdParser),
}
1154impl BangType {
1155 #[inline(always)]
1156 const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1157 Ok(match byte {
1158 Some(b'[') => Self::CData,
1159 Some(b'-') => Self::Comment,
1160 Some(b'D') | Some(b'd') => Self::DocType(DtdParser::BeforeInternalSubset(0)),
1161 _ => return Err(SyntaxError::InvalidBangMarkup),
1162 })
1163 }
1164
1165 /// If element is finished, returns its content up to `>` symbol and
1166 /// an index of this symbol, otherwise returns `None`
1167 ///
1168 /// # Parameters
1169 /// - `buf`: buffer with data consumed on previous iterations
1170 /// - `chunk`: data read on current iteration and not yet consumed from reader
1171 #[inline(always)]
1172 fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option<usize> {
1173 match self {
1174 Self::Comment => {
1175 for i in memchr::memchr_iter(b'>', chunk) {
1176 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1177 // <!----> - XML comment
1178 // 0123456 - i
1179 if buf.len() + i > 5 {
1180 if chunk[..i].ends_with(b"--") {
1181 // We cannot strip last `--` from the buffer because we need it in case of
1182 // check_comments enabled option. XML standard requires that comment
1183 // will not end with `--->` sequence because this is a special case of
1184 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1185 return Some(i);
1186 }
1187 // End sequence `-|->` was splitted at |
1188 // buf --/ \-- chunk
1189 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1190 return Some(i);
1191 }
1192 // End sequence `--|>` was splitted at |
1193 // buf --/ \-- chunk
1194 if i == 0 && buf.ends_with(b"--") {
1195 return Some(i);
1196 }
1197 }
1198 }
1199 }
1200 Self::CData => {
1201 for i in memchr::memchr_iter(b'>', chunk) {
1202 if chunk[..i].ends_with(b"]]") {
1203 return Some(i);
1204 }
1205 // End sequence `]|]>` was splitted at |
1206 // buf --/ \-- chunk
1207 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1208 return Some(i);
1209 }
1210 // End sequence `]]|>` was splitted at |
1211 // buf --/ \-- chunk
1212 if i == 0 && buf.ends_with(b"]]") {
1213 return Some(i);
1214 }
1215 }
1216 }
1217 Self::DocType(ref mut parser) => return parser.feed(buf, chunk),
1218 }
1219 None
1220 }
1221 #[inline]
1222 const fn to_err(&self) -> SyntaxError {
1223 match self {
1224 Self::CData => SyntaxError::UnclosedCData,
1225 Self::Comment => SyntaxError::UnclosedComment,
1226 Self::DocType(_) => SyntaxError::UnclosedDoctype,
1227 }
1228 }
1229}
1230
1231////////////////////////////////////////////////////////////////////////////////////////////////////
1232
1233#[cfg(test)]
1234mod test {
1235 /// Checks the internal implementation of the various reader methods
1236 macro_rules! check {
1237 (
1238 #[$test:meta]
1239 $read_event:ident,
1240 $read_until_close:ident,
1241 // constructor of the XML source on which internal functions will be called
1242 $source:path,
1243 $skip:literal,
1244 // constructor of the buffer to which read data will stored
1245 $buf:expr
1246 $(, $async:ident, $await:ident)?
1247 ) => {
1248 mod read_bang_element {
1249 use super::*;
1250 use crate::errors::{Error, SyntaxError};
1251 use crate::reader::{BangType, DtdParser};
1252 use crate::utils::Bytes;
1253
1254 /// Checks that reading CDATA content works correctly
1255 mod cdata {
1256 use super::*;
1257 use pretty_assertions::assert_eq;
1258
1259 /// Checks that if input begins like CDATA element, but CDATA start sequence
1260 /// is not finished, parsing ends with an error
1261 #[$test]
1262 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1263 $($async)? fn not_properly_start() {
1264 let buf = $buf;
1265 let mut position = 0;
1266 let mut input = &b"<![]]>other content"[$skip..];
1267 // ^= 0
1268
1269 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1270 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1271 x => panic!(
1272 "Expected `Err(Syntax(_))`, but got `{:?}`",
1273 x
1274 ),
1275 }
1276 assert_eq!(position, 1);
1277 }
1278
1279 /// Checks that if CDATA startup sequence was matched, but an end sequence
1280 /// is not found, parsing ends with an error
1281 #[$test]
1282 $($async)? fn not_closed() {
1283 let buf = $buf;
1284 let mut position = 0;
1285 let mut input = &b"<![CDATA[other content"[$skip..];
1286 // ^= 0 ^= 22
1287
1288 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1289 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1290 x => panic!(
1291 "Expected `Err(Syntax(_))`, but got `{:?}`",
1292 x
1293 ),
1294 }
1295 assert_eq!(position, 22);
1296 }
1297
1298 /// Checks that CDATA element without content inside parsed successfully
1299 #[$test]
1300 $($async)? fn empty() {
1301 let buf = $buf;
1302 let mut position = 0;
1303 let mut input = &b"<![CDATA[]]>other content"[$skip..];
1304 // ^= 0 ^= 12
1305
1306 let (ty, bytes) = $source(&mut input)
1307 .read_bang_element(buf, &mut position)
1308 $(.$await)?
1309 .unwrap();
1310 assert_eq!(
1311 (ty, Bytes(bytes)),
1312 (BangType::CData, Bytes(b"<![CDATA[]]>"))
1313 );
1314 assert_eq!(position, 12);
1315 }
1316
1317 /// Checks that CDATA element with content parsed successfully.
1318 /// Additionally checks that sequences inside CDATA that may look like
1319 /// a CDATA end sequence do not interrupt CDATA parsing
1320 #[$test]
1321 $($async)? fn with_content() {
1322 let buf = $buf;
1323 let mut position = 0;
1324 let mut input = &b"<![CDATA[cdata]] ]>content]]>other content]]>"[$skip..];
1325 // ^= 0 ^= 29
1326
1327 let (ty, bytes) = $source(&mut input)
1328 .read_bang_element(buf, &mut position)
1329 $(.$await)?
1330 .unwrap();
1331 assert_eq!(
1332 (ty, Bytes(bytes)),
1333 (BangType::CData, Bytes(b"<![CDATA[cdata]] ]>content]]>"))
1334 );
1335 assert_eq!(position, 29);
1336 }
1337 }
1338
1339 /// Checks that reading XML comments works correctly. According to the [specification],
1340 /// comment data can contain any sequence except `--`:
1341 ///
1342 /// ```peg
1343 /// comment = '<--' (!'--' char)* '-->';
1344 /// char = [#x1-#x2C]
1345 /// / [#x2E-#xD7FF]
1346 /// / [#xE000-#xFFFD]
1347 /// / [#x10000-#x10FFFF]
1348 /// ```
1349 ///
1350 /// The presence of this limitation, however, is simply a poorly designed specification
1351 /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1352 /// presence of these sequences by default. This tests allow such content.
1353 ///
1354 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1355 mod comment {
1356 use super::*;
1357 use pretty_assertions::assert_eq;
1358
1359 #[$test]
1360 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1361 $($async)? fn not_properly_start() {
1362 let buf = $buf;
1363 let mut position = 0;
1364 let mut input = &b"<!- -->other content"[$skip..];
1365 // ^= 1
1366
1367 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1368 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1369 x => panic!(
1370 "Expected `Err(Syntax(_))`, but got `{:?}`",
1371 x
1372 ),
1373 }
1374 assert_eq!(position, 1);
1375 }
1376
1377 #[$test]
1378 $($async)? fn not_properly_end() {
1379 let buf = $buf;
1380 let mut position = 0;
1381 let mut input = &b"<!->other content"[$skip..];
1382 // ^= 0 ^= 17
1383
1384 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1385 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1386 x => panic!(
1387 "Expected `Err(Syntax(_))`, but got `{:?}`",
1388 x
1389 ),
1390 }
1391 assert_eq!(position, 17);
1392 }
1393
1394 #[$test]
1395 $($async)? fn not_closed1() {
1396 let buf = $buf;
1397 let mut position = 0;
1398 let mut input = &b"<!--other content"[$skip..];
1399 // ^= 0 ^= 17
1400
1401 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1402 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1403 x => panic!(
1404 "Expected `Err(Syntax(_))`, but got `{:?}`",
1405 x
1406 ),
1407 }
1408 assert_eq!(position, 17);
1409 }
1410
1411 #[$test]
1412 $($async)? fn not_closed2() {
1413 let buf = $buf;
1414 let mut position = 0;
1415 let mut input = &b"<!-->other content"[$skip..];
1416 // ^= 0 ^= 18
1417
1418 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1419 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1420 x => panic!(
1421 "Expected `Err(Syntax(_))`, but got `{:?}`",
1422 x
1423 ),
1424 }
1425 assert_eq!(position, 18);
1426 }
1427
1428 #[$test]
1429 $($async)? fn not_closed3() {
1430 let buf = $buf;
1431 let mut position = 0;
1432 let mut input = &b"<!--->other content"[$skip..];
1433 // ^= 0 ^= 19
1434
1435 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1436 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1437 x => panic!(
1438 "Expected `Err(Syntax(_))`, but got `{:?}`",
1439 x
1440 ),
1441 }
1442 assert_eq!(position, 19);
1443 }
1444
1445 #[$test]
1446 $($async)? fn empty() {
1447 let buf = $buf;
1448 let mut position = 0;
1449 let mut input = &b"<!---->other content"[$skip..];
1450 // ^= 0 ^= 7
1451
1452 let (ty, bytes) = $source(&mut input)
1453 .read_bang_element(buf, &mut position)
1454 $(.$await)?
1455 .unwrap();
1456 assert_eq!(
1457 (ty, Bytes(bytes)),
1458 (BangType::Comment, Bytes(b"<!---->"))
1459 );
1460 assert_eq!(position, 7);
1461 }
1462
1463 #[$test]
1464 $($async)? fn with_content() {
1465 let buf = $buf;
1466 let mut position = 0;
1467 let mut input = &b"<!--->comment<--->other content"[$skip..];
1468 // ^= 0 ^= 18
1469
1470 let (ty, bytes) = $source(&mut input)
1471 .read_bang_element(buf, &mut position)
1472 $(.$await)?
1473 .unwrap();
1474 assert_eq!(
1475 (ty, Bytes(bytes)),
1476 (BangType::Comment, Bytes(b"<!--->comment<--->"))
1477 );
1478 assert_eq!(position, 18);
1479 }
1480 }
1481
1482 /// Checks that reading DOCTYPE definition works correctly
1483 mod doctype {
1484 use super::*;
1485
1486 mod uppercase {
1487 use super::*;
1488 use pretty_assertions::assert_eq;
1489
1490 #[$test]
1491 $($async)? fn not_properly_start() {
1492 let buf = $buf;
1493 let mut position = 0;
1494 let mut input = &b"<!D other content"[$skip..];
1495 // ^= 0 ^= 17
1496
1497 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1498 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1499 x => panic!(
1500 "Expected `Err(Syntax(_))`, but got `{:?}`",
1501 x
1502 ),
1503 }
1504 assert_eq!(position, 17);
1505 }
1506
1507 #[$test]
1508 $($async)? fn without_space() {
1509 let buf = $buf;
1510 let mut position = 0;
1511 let mut input = &b"<!DOCTYPEother content"[$skip..];
1512 // ^= 0 ^= 22
1513
1514 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1515 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1516 x => panic!(
1517 "Expected `Err(Syntax(_))`, but got `{:?}`",
1518 x
1519 ),
1520 }
1521 assert_eq!(position, 22);
1522 }
1523
1524 #[$test]
1525 $($async)? fn empty() {
1526 let buf = $buf;
1527 let mut position = 0;
1528 let mut input = &b"<!DOCTYPE>other content"[$skip..];
1529 // ^= 0 ^= 10
1530
1531 let (ty, bytes) = $source(&mut input)
1532 .read_bang_element(buf, &mut position)
1533 $(.$await)?
1534 .unwrap();
1535 assert_eq!(
1536 (ty, Bytes(bytes)),
1537 (BangType::DocType(DtdParser::Finished), Bytes(b"<!DOCTYPE>"))
1538 );
1539 assert_eq!(position, 10);
1540 }
1541
1542 #[$test]
1543 $($async)? fn not_closed() {
1544 let buf = $buf;
1545 let mut position = 0;
1546 let mut input = &b"<!DOCTYPE other content"[$skip..];
1547 // ^= 0 ^23
1548
1549 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1550 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1551 x => panic!(
1552 "Expected `Err(Syntax(_))`, but got `{:?}`",
1553 x
1554 ),
1555 }
1556 assert_eq!(position, 23);
1557 }
1558 }
1559
1560 mod lowercase {
1561 use super::*;
1562 use pretty_assertions::assert_eq;
1563
1564 #[$test]
1565 $($async)? fn not_properly_start() {
1566 let buf = $buf;
1567 let mut position = 0;
1568 let mut input = &b"<!d other content"[$skip..];
1569 // ^= 0 ^= 17
1570
1571 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1572 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1573 x => panic!(
1574 "Expected `Err(Syntax(_))`, but got `{:?}`",
1575 x
1576 ),
1577 }
1578 assert_eq!(position, 17);
1579 }
1580
1581 #[$test]
1582 $($async)? fn without_space() {
1583 let buf = $buf;
1584 let mut position = 0;
1585 let mut input = &b"<!doctypeother content"[$skip..];
1586 // ^= 0 ^= 22
1587
1588 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1589 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1590 x => panic!(
1591 "Expected `Err(Syntax(_))`, but got `{:?}`",
1592 x
1593 ),
1594 }
1595 assert_eq!(position, 22);
1596 }
1597
1598 #[$test]
1599 $($async)? fn empty() {
1600 let buf = $buf;
1601 let mut position = 0;
1602 let mut input = &b"<!doctype>other content"[$skip..];
1603 // ^= 0 ^= 10
1604
1605 let (ty, bytes) = $source(&mut input)
1606 .read_bang_element(buf, &mut position)
1607 $(.$await)?
1608 .unwrap();
1609 assert_eq!(
1610 (ty, Bytes(bytes)),
1611 (BangType::DocType(DtdParser::Finished), Bytes(b"<!doctype>"))
1612 );
1613 assert_eq!(position, 10);
1614 }
1615
1616 #[$test]
1617 $($async)? fn not_closed() {
1618 let buf = $buf;
1619 let mut position = 0;
1620 let mut input = &b"<!doctype other content"[$skip..];
1621 // ^= 0 ^= 23
1622
1623 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1624 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1625 x => panic!(
1626 "Expected `Err(Syntax(_))`, but got `{:?}`",
1627 x
1628 ),
1629 }
1630 assert_eq!(position, 23);
1631 }
1632 }
1633 }
1634 }
1635
1636 mod read_text {
1637 use super::*;
1638 use crate::reader::ReadTextResult;
1639 use crate::utils::Bytes;
1640 use pretty_assertions::assert_eq;
1641
1642 #[$test]
1643 $($async)? fn empty() {
1644 let buf = $buf;
1645 let mut position = 1;
1646 let mut input = b"".as_ref();
1647 // ^= 1
1648
1649 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1650 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"")),
1651 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1652 }
1653 assert_eq!(position, 1);
1654 }
1655
1656 #[$test]
1657 $($async)? fn markup() {
1658 let buf = $buf;
1659 let mut position = 1;
1660 let mut input = b"<".as_ref();
1661 // ^= 1
1662
1663 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1664 ReadTextResult::Markup(b) => assert_eq!(b, $buf),
1665 x => panic!("Expected `Markup(_)`, but got `{:?}`", x),
1666 }
1667 assert_eq!(position, 1);
1668 }
1669
1670 #[$test]
1671 $($async)? fn ref_() {
1672 let buf = $buf;
1673 let mut position = 1;
1674 let mut input = b"&".as_ref();
1675 // ^= 1
1676
1677 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1678 ReadTextResult::Ref(b) => assert_eq!(b, $buf),
1679 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1680 }
1681 assert_eq!(position, 1);
1682 }
1683
1684 #[$test]
1685 $($async)? fn up_to_markup() {
1686 let buf = $buf;
1687 let mut position = 1;
1688 let mut input = b"a<".as_ref();
1689 // ^= 2
1690
1691 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1692 ReadTextResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1693 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1694 }
1695 assert_eq!(position, 2);
1696 }
1697
1698 #[$test]
1699 $($async)? fn up_to_ref() {
1700 let buf = $buf;
1701 let mut position = 1;
1702 let mut input = b"a&".as_ref();
1703 // ^= 2
1704
1705 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1706 ReadTextResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1707 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1708 }
1709 assert_eq!(position, 2);
1710 }
1711
1712 #[$test]
1713 $($async)? fn up_to_eof() {
1714 let buf = $buf;
1715 let mut position = 1;
1716 let mut input = b"a".as_ref();
1717 // ^= 2
1718
1719 match $source(&mut input).read_text(buf, &mut position) $(.$await)? {
1720 ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")),
1721 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1722 }
1723 assert_eq!(position, 2);
1724 }
1725 }
1726
1727 mod read_ref {
1728 use super::*;
1729 use crate::reader::ReadRefResult;
1730 use crate::utils::Bytes;
1731 use pretty_assertions::assert_eq;
1732
1733 // Empty input is not allowed for `read_ref` so not tested.
1734 // Borrowed source triggers debug assertion,
1735 // buffered do nothing due to implementation details.
1736
1737 #[$test]
1738 $($async)? fn up_to_eof() {
1739 let buf = $buf;
1740 let mut position = 1;
1741 let mut input = b"&".as_ref();
1742 // ^= 2
1743
1744 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1745 ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1746 x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
1747 }
1748 assert_eq!(position, 2);
1749 }
1750
1751 #[$test]
1752 $($async)? fn up_to_ref() {
1753 let buf = $buf;
1754 let mut position = 1;
1755 let mut input = b"&&".as_ref();
1756 // ^= 2
1757
1758 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1759 ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1760 x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
1761 }
1762 assert_eq!(position, 2);
1763 }
1764
1765 #[$test]
1766 $($async)? fn up_to_markup() {
1767 let buf = $buf;
1768 let mut position = 1;
1769 let mut input = b"&<".as_ref();
1770 // ^= 2
1771
1772 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1773 ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1774 x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
1775 }
1776 assert_eq!(position, 2);
1777 }
1778
1779 #[$test]
1780 $($async)? fn empty_ref() {
1781 let buf = $buf;
1782 let mut position = 1;
1783 let mut input = b"&;".as_ref();
1784 // ^= 3
1785
1786 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1787 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&;")),
1788 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1789 }
1790 assert_eq!(position, 3);
1791 }
1792
1793 #[$test]
1794 $($async)? fn normal() {
1795 let buf = $buf;
1796 let mut position = 1;
1797 let mut input = b"<".as_ref();
1798 // ^= 5
1799
1800 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1801 ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")),
1802 x => panic!("Expected `Ref(_)`, but got `{:?}`", x),
1803 }
1804 assert_eq!(position, 5);
1805 }
1806 }
1807
1808 mod read_element {
1809 use super::*;
1810 use crate::errors::{Error, SyntaxError};
1811 use crate::parser::ElementParser;
1812 use crate::utils::Bytes;
1813 use pretty_assertions::assert_eq;
1814
1815 /// Checks that nothing was read from empty buffer
1816 /// `<` read in peek_one that is called before read_with, that is why it in the input buffer
1817 /// peek_one, however, does not increment position for simplicity of the code
1818 #[$test]
1819 $($async)? fn empty() {
1820 let buf = $buf;
1821 let mut position = 0;
1822 let mut input = &b"<"[$skip..];
1823 // ^= 1
1824
1825 match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1826 Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1827 x => panic!(
1828 "Expected `Err(Syntax(_))`, but got `{:?}`",
1829 x
1830 ),
1831 }
1832 assert_eq!(position, 1);
1833 }
1834
1835 mod open {
1836 use super::*;
1837 use pretty_assertions::assert_eq;
1838
1839 #[$test]
1840 $($async)? fn empty_tag() {
1841 let buf = $buf;
1842 let mut position = 0;
1843 let mut input = &b"<>"[$skip..];
1844 // ^= 2
1845
1846 assert_eq!(
1847 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1848 Bytes(b"<>")
1849 );
1850 assert_eq!(position, 2);
1851 }
1852
1853 #[$test]
1854 $($async)? fn normal() {
1855 let buf = $buf;
1856 let mut position = 0;
1857 let mut input = &b"<tag>"[$skip..];
1858 // ^= 5
1859
1860 assert_eq!(
1861 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1862 Bytes(b"<tag>")
1863 );
1864 assert_eq!(position, 5);
1865 }
1866
1867 #[$test]
1868 $($async)? fn empty_ns_empty_tag() {
1869 let buf = $buf;
1870 let mut position = 0;
1871 let mut input = &b"<:>"[$skip..];
1872 // ^= 3
1873
1874 assert_eq!(
1875 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1876 Bytes(b"<:>")
1877 );
1878 assert_eq!(position, 3);
1879 }
1880
1881 #[$test]
1882 $($async)? fn empty_ns() {
1883 let buf = $buf;
1884 let mut position = 0;
1885 let mut input = &b"<:tag>"[$skip..];
1886 // ^= 6
1887
1888 assert_eq!(
1889 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1890 Bytes(b"<:tag>")
1891 );
1892 assert_eq!(position, 6);
1893 }
1894
1895 #[$test]
1896 $($async)? fn with_attributes() {
1897 let buf = $buf;
1898 let mut position = 0;
1899 let mut input = &br#"<tag attr-1=">" attr2 = '>' 3attr>"#[$skip..];
1900 // ^= 39
1901
1902 assert_eq!(
1903 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1904 Bytes(br#"<tag attr-1=">" attr2 = '>' 3attr>"#)
1905 );
1906 assert_eq!(position, 39);
1907 }
1908 }
1909
1910 mod self_closed {
1911 use super::*;
1912 use pretty_assertions::assert_eq;
1913
1914 #[$test]
1915 $($async)? fn empty_tag() {
1916 let buf = $buf;
1917 let mut position = 0;
1918 let mut input = &b"</>"[$skip..];
1919 // ^= 3
1920
1921 assert_eq!(
1922 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1923 Bytes(b"</>")
1924 );
1925 assert_eq!(position, 3);
1926 }
1927
1928 #[$test]
1929 $($async)? fn normal() {
1930 let buf = $buf;
1931 let mut position = 0;
1932 let mut input = &b"<tag/>"[$skip..];
1933 // ^= 6
1934
1935 assert_eq!(
1936 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1937 Bytes(b"<tag/>")
1938 );
1939 assert_eq!(position, 6);
1940 }
1941
1942 #[$test]
1943 $($async)? fn empty_ns_empty_tag() {
1944 let buf = $buf;
1945 let mut position = 0;
1946 let mut input = &b"<:/>"[$skip..];
1947 // ^= 4
1948
1949 assert_eq!(
1950 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1951 Bytes(b"<:/>")
1952 );
1953 assert_eq!(position, 4);
1954 }
1955
1956 #[$test]
1957 $($async)? fn empty_ns() {
1958 let buf = $buf;
1959 let mut position = 0;
1960 let mut input = &b"<:tag/>"[$skip..];
1961 // ^= 7
1962
1963 assert_eq!(
1964 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1965 Bytes(b"<:tag/>")
1966 );
1967 assert_eq!(position, 7);
1968 }
1969
1970 #[$test]
1971 $($async)? fn with_attributes() {
1972 let buf = $buf;
1973 let mut position = 0;
1974 let mut input = &br#"<tag attr-1="/>" attr2 = '/>' 3attr/>"#[$skip..];
1975 // ^= 42
1976
1977 assert_eq!(
1978 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1979 Bytes(br#"<tag attr-1="/>" attr2 = '/>' 3attr/>"#)
1980 );
1981 assert_eq!(position, 42);
1982 }
1983 }
1984
1985 mod close {
1986 use super::*;
1987 use pretty_assertions::assert_eq;
1988
1989 #[$test]
1990 $($async)? fn empty_tag() {
1991 let buf = $buf;
1992 let mut position = 0;
1993 let mut input = &b"</ >"[$skip..];
1994 // ^= 4
1995
1996 assert_eq!(
1997 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1998 Bytes(b"</ >")
1999 );
2000 assert_eq!(position, 4);
2001 }
2002
2003 #[$test]
2004 $($async)? fn normal() {
2005 let buf = $buf;
2006 let mut position = 0;
2007 let mut input = &b"</tag>"[$skip..];
2008 // ^= 6
2009
2010 assert_eq!(
2011 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2012 Bytes(b"</tag>")
2013 );
2014 assert_eq!(position, 6);
2015 }
2016
2017 #[$test]
2018 $($async)? fn empty_ns_empty_tag() {
2019 let buf = $buf;
2020 let mut position = 0;
2021 let mut input = &b"</:>"[$skip..];
2022 // ^= 4
2023
2024 assert_eq!(
2025 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2026 Bytes(b"</:>")
2027 );
2028 assert_eq!(position, 4);
2029 }
2030
2031 #[$test]
2032 $($async)? fn empty_ns() {
2033 let buf = $buf;
2034 let mut position = 0;
2035 let mut input = &b"</:tag>"[$skip..];
2036 // ^= 7
2037
2038 assert_eq!(
2039 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2040 Bytes(b"</:tag>")
2041 );
2042 assert_eq!(position, 7);
2043 }
2044
2045 #[$test]
2046 $($async)? fn with_attributes() {
2047 let buf = $buf;
2048 let mut position = 0;
2049 let mut input = &br#"</tag attr-1=">" attr2 = '>' 3attr>"#[$skip..];
2050 // ^= 40
2051
2052 assert_eq!(
2053 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
2054 Bytes(br#"</tag attr-1=">" attr2 = '>' 3attr>"#)
2055 );
2056 assert_eq!(position, 40);
2057 }
2058 }
2059 }
2060
2061 /// Ensures, that no empty `Text` events are generated
2062 mod $read_event {
2063 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
2064 use crate::reader::Reader;
2065 use pretty_assertions::assert_eq;
2066
2067 /// When `encoding` feature is enabled, encoding should be detected
2068 /// from BOM (UTF-8) and BOM should be stripped.
2069 ///
2070 /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
2071 /// character should be stripped for consistency
2072 #[$test]
2073 $($async)? fn bom_from_reader() {
2074 let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
2075
2076 assert_eq!(
2077 reader.$read_event($buf) $(.$await)? .unwrap(),
2078 Event::Text(BytesText::from_escaped("\u{feff}"))
2079 );
2080
2081 assert_eq!(
2082 reader.$read_event($buf) $(.$await)? .unwrap(),
2083 Event::Eof
2084 );
2085 }
2086
2087 /// When parsing from &str, encoding is fixed (UTF-8), so
2088 /// - when `encoding` feature is disabled, the behavior the
2089 /// same as in `bom_from_reader` text
2090 /// - when `encoding` feature is enabled, the behavior should
2091 /// stay consistent, so the first BOM character is stripped
2092 #[$test]
2093 $($async)? fn bom_from_str() {
2094 let mut reader = Reader::from_str("\u{feff}\u{feff}");
2095
2096 assert_eq!(
2097 reader.$read_event($buf) $(.$await)? .unwrap(),
2098 Event::Text(BytesText::from_escaped("\u{feff}"))
2099 );
2100
2101 assert_eq!(
2102 reader.$read_event($buf) $(.$await)? .unwrap(),
2103 Event::Eof
2104 );
2105 }
2106
2107 #[$test]
2108 $($async)? fn declaration() {
2109 let mut reader = Reader::from_str("<?xml ?>");
2110
2111 assert_eq!(
2112 reader.$read_event($buf) $(.$await)? .unwrap(),
2113 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
2114 );
2115 }
2116
2117 #[$test]
2118 $($async)? fn doctype() {
2119 let mut reader = Reader::from_str("<!DOCTYPE x>");
2120
2121 assert_eq!(
2122 reader.$read_event($buf) $(.$await)? .unwrap(),
2123 Event::DocType(BytesText::from_escaped("x"))
2124 );
2125 }
2126
2127 #[$test]
2128 $($async)? fn processing_instruction() {
2129 let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
2130
2131 assert_eq!(
2132 reader.$read_event($buf) $(.$await)? .unwrap(),
2133 Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
2134 );
2135 }
2136
2137 /// Lone closing tags are not allowed, so testing it together with start tag
2138 #[$test]
2139 $($async)? fn start_and_end() {
2140 let mut reader = Reader::from_str("<tag></tag>");
2141
2142 assert_eq!(
2143 reader.$read_event($buf) $(.$await)? .unwrap(),
2144 Event::Start(BytesStart::new("tag"))
2145 );
2146
2147 assert_eq!(
2148 reader.$read_event($buf) $(.$await)? .unwrap(),
2149 Event::End(BytesEnd::new("tag"))
2150 );
2151 }
2152
2153 #[$test]
2154 $($async)? fn empty() {
2155 let mut reader = Reader::from_str("<tag/>");
2156
2157 assert_eq!(
2158 reader.$read_event($buf) $(.$await)? .unwrap(),
2159 Event::Empty(BytesStart::new("tag"))
2160 );
2161 }
2162
2163 #[$test]
2164 $($async)? fn text() {
2165 let mut reader = Reader::from_str("text");
2166
2167 assert_eq!(
2168 reader.$read_event($buf) $(.$await)? .unwrap(),
2169 Event::Text(BytesText::from_escaped("text"))
2170 );
2171 }
2172
2173 #[$test]
2174 $($async)? fn cdata() {
2175 let mut reader = Reader::from_str("<![CDATA[]]>");
2176
2177 assert_eq!(
2178 reader.$read_event($buf) $(.$await)? .unwrap(),
2179 Event::CData(BytesCData::new(""))
2180 );
2181 }
2182
2183 #[$test]
2184 $($async)? fn comment() {
2185 let mut reader = Reader::from_str("<!---->");
2186
2187 assert_eq!(
2188 reader.$read_event($buf) $(.$await)? .unwrap(),
2189 Event::Comment(BytesText::from_escaped(""))
2190 );
2191 }
2192
2193 #[$test]
2194 $($async)? fn eof() {
2195 let mut reader = Reader::from_str("");
2196
2197 assert_eq!(
2198 reader.$read_event($buf) $(.$await)? .unwrap(),
2199 Event::Eof
2200 );
2201 }
2202 }
2203 };
2204 }
2205
2206 // Export macros for the child modules:
2207 // - buffered_reader
2208 // - slice_reader
2209 pub(super) use check;
2210}