Skip to main content

fast_xml/events/
mod.rs

1//! Defines zero-copy XML events used throughout this library.
2//!
//! An XML event often represents part of an XML element.
4//! They occur both during reading and writing and are
5//! usually used with the stream-oriented API.
6//!
7//! For example, the XML element
8//! ```xml
9//! <name attr="value">Inner text</name>
10//! ```
11//! consists of the three events `Start`, `Text` and `End`.
12//! They can also represent other parts in an XML document like the
13//! XML declaration. Each Event usually contains further information,
14//! like the tag name, the attribute or the inner text.
15//!
16//! See [`Event`] for a list of all possible events.
17//!
18//! # Reading
//! When reading an XML stream, the events are emitted by
20//! [`Reader::read_event`]. You must listen
21//! for the different types of events you are interested in.
22//!
23//! See [`Reader`] for further information.
24//!
25//! # Writing
26//! When writing the XML document, you must create the XML element
27//! by constructing the events it consists of and pass them to the writer
28//! sequentially.
29//!
30//! See [`Writer`] for further information.
31//!
32//! [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event
33//! [`Reader`]: ../reader/struct.Reader.html
34//! [`Writer`]: ../writer/struct.Writer.html
35//! [`Event`]: enum.Event.html
36
37pub mod attributes;
38
39#[cfg(feature = "encoding_rs")]
40use encoding_rs::Encoding;
41use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8};
42
43use crate::escape::{do_unescape, escape, partial_escape};
44use crate::utils::write_cow_string;
45use crate::{errors::Error, errors::Result, reader::Reader};
46use attributes::{Attribute, Attributes};
47
48#[cfg(feature = "serialize")]
49use crate::escape::EscapeError;
50
51use memchr;
52
/// Opening tag data (`Event::Start`), with optional attributes.
///
/// `<name attr="value">`.
///
/// The name can be accessed using the [`name`], [`local_name`] or [`unescaped`] methods. An
/// iterator over the attributes is returned by the [`attributes`] method.
///
/// [`name`]: #method.name
/// [`local_name`]: #method.local_name
/// [`unescaped`]: #method.unescaped
/// [`attributes`]: #method.attributes
#[derive(Clone, Eq, PartialEq)]
pub struct BytesStart<'a> {
    /// Raw content of the element (name followed by any attributes), before any UTF-8
    /// conversion. May be borrowed or owned.
    buf: Cow<'a, [u8]>,
    /// Length of the element name in bytes: the name is `buf[..name_len]` and
    /// everything after it is treated as the attribute section.
    name_len: usize,
}
71
72impl<'a> BytesStart<'a> {
73    /// Creates a new `BytesStart` from the given content (name + attributes).
74    ///
75    /// # Warning
76    ///
77    /// `&content[..name_len]` is not checked to be a valid name
78    #[inline]
79    pub fn borrowed(content: &'a [u8], name_len: usize) -> Self {
80        BytesStart {
81            buf: Cow::Borrowed(content),
82            name_len,
83        }
84    }
85
86    /// Creates a new `BytesStart` from the given name.
87    ///
88    /// # Warning
89    ///
90    /// `&content` is not checked to be a valid name
91    #[inline]
92    pub fn borrowed_name(name: &'a [u8]) -> BytesStart<'a> {
93        Self::borrowed(name, name.len())
94    }
95
96    /// Creates a new `BytesStart` from the given content (name + attributes)
97    ///
98    /// Owns its contents.
99    #[inline]
100    pub fn owned<C: Into<Vec<u8>>>(content: C, name_len: usize) -> BytesStart<'static> {
101        BytesStart {
102            buf: Cow::Owned(content.into()),
103            name_len,
104        }
105    }
106
107    /// Creates a new `BytesStart` from the given name
108    ///
109    /// Owns its contents.
110    #[inline]
111    pub fn owned_name<C: Into<Vec<u8>>>(name: C) -> BytesStart<'static> {
112        let content = name.into();
113        BytesStart {
114            name_len: content.len(),
115            buf: Cow::Owned(content),
116        }
117    }
118
119    /// Converts the event into an owned event.
120    pub fn into_owned(self) -> BytesStart<'static> {
121        Self::owned(self.buf.into_owned(), self.name_len)
122    }
123
124    /// Converts the event into an owned event without taking ownership of Event
125    pub fn to_owned(&self) -> BytesStart<'static> {
126        Self::owned(self.buf.to_owned(), self.name_len)
127    }
128
129    /// Converts the event into a borrowed event. Most useful when paired with [`to_end`].
130    ///
131    /// # Example
132    ///
133    /// ```rust
134    /// # use fast_xml::{Error, Writer};
135    /// use fast_xml::events::{BytesStart, Event};
136    ///
137    /// struct SomeStruct<'a> {
138    ///     attrs: BytesStart<'a>,
139    ///     // ...
140    /// }
141    /// # impl<'a> SomeStruct<'a> {
142    /// # fn example(&self) -> Result<(), Error> {
143    /// # let mut writer = Writer::new(Vec::new());
144    ///
145    /// writer.write_event(Event::Start(self.attrs.to_borrowed()))?;
146    /// // ...
147    /// writer.write_event(Event::End(self.attrs.to_end()))?;
148    /// # Ok(())
149    /// # }}
150    /// ```
151    ///
152    /// [`to_end`]: #method.to_end
153    pub fn to_borrowed(&self) -> BytesStart {
154        BytesStart::borrowed(&self.buf, self.name_len)
155    }
156
157    /// Creates new paired close tag
158    pub fn to_end(&self) -> BytesEnd {
159        BytesEnd::borrowed(self.name())
160    }
161
162    /// Gets the undecoded raw tag name as a `&[u8]`.
163    #[inline]
164    pub fn name(&self) -> &[u8] {
165        &self.buf[..self.name_len]
166    }
167
168    /// Gets the undecoded raw local tag name (excluding namespace) as a `&[u8]`.
169    ///
170    /// All content up to and including the first `:` character is removed from the tag name.
171    #[inline]
172    pub fn local_name(&self) -> &[u8] {
173        let name = self.name();
174        memchr::memchr(b':', name).map_or(name, |i| &name[i + 1..])
175    }
176
177    /// Gets the unescaped tag name.
178    ///
179    /// XML escape sequences like "`&lt;`" will be replaced by their unescaped characters like
180    /// "`<`".
181    ///
182    /// See also [`unescaped_with_custom_entities()`](#method.unescaped_with_custom_entities)
183    #[inline]
184    pub fn unescaped(&self) -> Result<Cow<[u8]>> {
185        self.make_unescaped(None)
186    }
187
188    /// Gets the unescaped tag name, using custom entities.
189    ///
190    /// XML escape sequences like "`&lt;`" will be replaced by their unescaped characters like
191    /// "`<`".
192    /// Additional entities can be provided in `custom_entities`.
193    ///
194    /// # Pre-condition
195    ///
196    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
197    ///
198    /// See also [`unescaped()`](#method.unescaped)
199    #[inline]
200    pub fn unescaped_with_custom_entities<'s>(
201        &'s self,
202        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
203    ) -> Result<Cow<'s, [u8]>> {
204        self.make_unescaped(Some(custom_entities))
205    }
206
207    #[inline]
208    fn make_unescaped<'s>(
209        &'s self,
210        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
211    ) -> Result<Cow<'s, [u8]>> {
212        do_unescape(&*self.buf, custom_entities).map_err(Error::EscapeError)
213    }
214
215    /// Returns the unescaped and decoded string value.
216    ///
217    /// This allocates a `String` in all cases. For performance reasons it might be a better idea to
218    /// instead use one of:
219    ///
220    /// * [`unescaped()`], as it doesn't allocate when no escape sequences are used.
221    /// * [`Reader::decode()`], as it only allocates when the decoding can't be performed otherwise.
222    ///
223    /// [`unescaped()`]: #method.unescaped
224    /// [`Reader::decode()`]: ../reader/struct.Reader.html#method.decode
225    #[inline]
226    pub fn unescape_and_decode<B: BufRead>(&self, reader: &Reader<B>) -> Result<String> {
227        self.do_unescape_and_decode_with_custom_entities(reader, None)
228    }
229
230    /// Returns the unescaped and decoded string value with custom entities.
231    ///
232    /// This allocates a `String` in all cases. For performance reasons it might be a better idea to
233    /// instead use one of:
234    ///
235    /// * [`unescaped_with_custom_entities()`], as it doesn't allocate when no escape sequences are used.
236    /// * [`Reader::decode()`], as it only allocates when the decoding can't be performed otherwise.
237    ///
238    /// [`unescaped_with_custom_entities()`]: #method.unescaped_with_custom_entities
239    /// [`Reader::decode()`]: ../reader/struct.Reader.html#method.decode
240    ///
241    /// # Pre-condition
242    ///
243    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
244    #[inline]
245    pub fn unescape_and_decode_with_custom_entities<B: BufRead>(
246        &self,
247        reader: &Reader<B>,
248        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
249    ) -> Result<String> {
250        self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities))
251    }
252
253    #[cfg(feature = "encoding")]
254    #[inline]
255    fn do_unescape_and_decode_with_custom_entities<B: BufRead>(
256        &self,
257        reader: &Reader<B>,
258        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
259    ) -> Result<String> {
260        let decoded = reader.decode(&*self);
261        let unescaped =
262            do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?;
263        String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error()))
264    }
265
266    #[cfg(not(feature = "encoding"))]
267    #[inline]
268    fn do_unescape_and_decode_with_custom_entities<B: BufRead>(
269        &self,
270        reader: &Reader<B>,
271        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
272    ) -> Result<String> {
273        let decoded = reader.decode(&*self)?;
274        let unescaped =
275            do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?;
276        String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error()))
277    }
278
279    /// Edit the name of the BytesStart in-place
280    ///
281    /// # Warning
282    ///
283    /// `name` is not checked to be a valid name
284    pub fn set_name(&mut self, name: &[u8]) -> &mut BytesStart<'a> {
285        let bytes = self.buf.to_mut();
286        bytes.splice(..self.name_len, name.iter().cloned());
287        self.name_len = name.len();
288        self
289    }
290}
291
292/// Attribute-related methods
293impl<'a> BytesStart<'a> {
294    /// Consumes `self` and yield a new `BytesStart` with additional attributes from an iterator.
295    ///
296    /// The yielded items must be convertible to [`Attribute`] using `Into`.
297    pub fn with_attributes<'b, I>(mut self, attributes: I) -> Self
298    where
299        I: IntoIterator,
300        I::Item: Into<Attribute<'b>>,
301    {
302        self.extend_attributes(attributes);
303        self
304    }
305
306    /// Add additional attributes to this tag using an iterator.
307    ///
308    /// The yielded items must be convertible to [`Attribute`] using `Into`.
309    pub fn extend_attributes<'b, I>(&mut self, attributes: I) -> &mut BytesStart<'a>
310    where
311        I: IntoIterator,
312        I::Item: Into<Attribute<'b>>,
313    {
314        for attr in attributes {
315            self.push_attribute(attr);
316        }
317        self
318    }
319
320    /// Adds an attribute to this element.
321    pub fn push_attribute<'b, A>(&mut self, attr: A)
322    where
323        A: Into<Attribute<'b>>,
324    {
325        let a = attr.into();
326        let bytes = self.buf.to_mut();
327        bytes.push(b' ');
328        bytes.extend_from_slice(a.key);
329        bytes.extend_from_slice(b"=\"");
330        bytes.extend_from_slice(&*a.value);
331        bytes.push(b'"');
332    }
333
334    /// Remove all attributes from the ByteStart
335    pub fn clear_attributes(&mut self) -> &mut BytesStart<'a> {
336        self.buf.to_mut().truncate(self.name_len);
337        self
338    }
339
340    /// Returns an iterator over the attributes of this tag.
341    pub fn attributes(&self) -> Attributes {
342        Attributes::new(&self.buf, self.name_len)
343    }
344
345    /// Returns an iterator over the HTML-like attributes of this tag (no mandatory quotes or `=`).
346    pub fn html_attributes(&self) -> Attributes {
347        Attributes::html(self, self.name_len)
348    }
349
350    /// Gets the undecoded raw string with the attributes of this tag as a `&[u8]`,
351    /// including the whitespace after the tag name if there is any.
352    #[inline]
353    pub fn attributes_raw(&self) -> &[u8] {
354        &self.buf[self.name_len..]
355    }
356
357    /// Try to get an attribute
358    pub fn try_get_attribute<N: AsRef<[u8]> + Sized>(
359        &'a self,
360        attr_name: N,
361    ) -> Result<Option<Attribute<'a>>> {
362        for a in self.attributes() {
363            let a = a?;
364            if a.key == attr_name.as_ref() {
365                return Ok(Some(a));
366            }
367        }
368        Ok(None)
369    }
370}
371
372impl<'a> std::fmt::Debug for BytesStart<'a> {
373    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
374        write!(f, "BytesStart {{ buf: ")?;
375        write_cow_string(f, &self.buf)?;
376        write!(f, ", name_len: {} }}", self.name_len)
377    }
378}
379
380////////////////////////////////////////////////////////////////////////////////////////////////////
381
/// An XML declaration (`Event::Decl`).
///
/// Thin wrapper around a [`BytesStart`] whose attributes are read as the
/// pseudo-attributes of the declaration (`version`, `encoding`, `standalone`).
///
/// [W3C XML 1.1 Prolog and Document Type Declaration](http://w3.org/TR/xml11/#sec-prolog-dtd)
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct BytesDecl<'a> {
    /// Content of the declaration, stored as a start-tag-like event.
    element: BytesStart<'a>,
}
389
390impl<'a> BytesDecl<'a> {
391    /// Creates a `BytesDecl` from a `BytesStart`
392    pub fn from_start(start: BytesStart<'a>) -> BytesDecl<'a> {
393        BytesDecl { element: start }
394    }
395
396    /// Gets xml version, excluding quotes (`'` or `"`).
397    ///
398    /// According to the [grammar], the version *must* be the first thing in the declaration.
399    /// This method tries to extract the first thing in the declaration and return it.
400    /// In case of multiple attributes value of the first one is returned.
401    ///
402    /// If version is missed in the declaration, or the first thing is not a version,
403    /// [`Error::XmlDeclWithoutVersion`] will be returned.
404    ///
405    /// # Examples
406    ///
407    /// ```
408    /// use std::borrow::Cow;
409    /// use fast_xml::Error;
410    /// use fast_xml::events::{BytesDecl, BytesStart};
411    ///
412    /// // <?xml version='1.1'?>
413    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" version='1.1'", 0));
414    /// assert_eq!(
415    ///     decl.version().unwrap(),
416    ///     Cow::Borrowed(b"1.1".as_ref())
417    /// );
418    ///
419    /// // <?xml version='1.0' version='1.1'?>
420    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" version='1.0' version='1.1'", 0));
421    /// assert_eq!(
422    ///     decl.version().unwrap(),
423    ///     Cow::Borrowed(b"1.0".as_ref())
424    /// );
425    ///
426    /// // <?xml encoding='utf-8'?>
427    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" encoding='utf-8'", 0));
428    /// match decl.version() {
429    ///     Err(Error::XmlDeclWithoutVersion(Some(key))) => assert_eq!(key, "encoding".to_string()),
430    ///     _ => assert!(false),
431    /// }
432    ///
433    /// // <?xml encoding='utf-8' version='1.1'?>
434    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" encoding='utf-8' version='1.1'", 0));
435    /// match decl.version() {
436    ///     Err(Error::XmlDeclWithoutVersion(Some(key))) => assert_eq!(key, "encoding".to_string()),
437    ///     _ => assert!(false),
438    /// }
439    ///
440    /// // <?xml?>
441    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b"", 0));
442    /// match decl.version() {
443    ///     Err(Error::XmlDeclWithoutVersion(None)) => {},
444    ///     _ => assert!(false),
445    /// }
446    /// ```
447    ///
448    /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
449    pub fn version(&self) -> Result<Cow<[u8]>> {
450        // The version *must* be the first thing in the declaration.
451        match self.element.attributes().with_checks(false).next() {
452            Some(Ok(a)) if a.key == b"version" => Ok(a.value),
453            // first attribute was not "version"
454            Some(Ok(a)) => {
455                let found = from_utf8(a.key).map_err(Error::Utf8)?.to_string();
456                Err(Error::XmlDeclWithoutVersion(Some(found)))
457            }
458            // error parsing attributes
459            Some(Err(e)) => Err(e.into()),
460            // no attributes
461            None => Err(Error::XmlDeclWithoutVersion(None)),
462        }
463    }
464
465    /// Gets xml encoding, excluding quotes (`'` or `"`).
466    ///
467    /// Although according to the [grammar] encoding must appear before `"standalone"`
468    /// and after `"version"`, this method does not check that. The first occurrence
469    /// of the attribute will be returned even if there are several. Also, method does
470    /// not restrict symbols that can forming the encoding, so the returned encoding
471    /// name may not correspond to the grammar.
472    ///
473    /// # Examples
474    ///
475    /// ```
476    /// use std::borrow::Cow;
477    /// use fast_xml::Error;
478    /// use fast_xml::events::{BytesDecl, BytesStart};
479    ///
480    /// // <?xml version='1.1'?>
481    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" version='1.1'", 0));
482    /// assert!(decl.encoding().is_none());
483    ///
484    /// // <?xml encoding='utf-8'?>
485    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" encoding='utf-8'", 0));
486    /// match decl.encoding() {
487    ///     Some(Ok(Cow::Borrowed(encoding))) => assert_eq!(encoding, b"utf-8"),
488    ///     _ => assert!(false),
489    /// }
490    ///
491    /// // <?xml encoding='something_WRONG' encoding='utf-8'?>
492    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" encoding='something_WRONG' encoding='utf-8'", 0));
493    /// match decl.encoding() {
494    ///     Some(Ok(Cow::Borrowed(encoding))) => assert_eq!(encoding, b"something_WRONG"),
495    ///     _ => assert!(false),
496    /// }
497    /// ```
498    ///
499    /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
500    pub fn encoding(&self) -> Option<Result<Cow<[u8]>>> {
501        self.element
502            .try_get_attribute("encoding")
503            .map(|a| a.map(|a| a.value))
504            .transpose()
505    }
506
507    /// Gets xml standalone, excluding quotes (`'` or `"`).
508    ///
509    /// Although according to the [grammar] standalone flag must appear after `"version"`
510    /// and `"encoding"`, this method does not check that. The first occurrence of the
511    /// attribute will be returned even if there are several. Also, method does not
512    /// restrict symbols that can forming the value, so the returned flag name may not
513    /// correspond to the grammar.
514    ///
515    /// # Examples
516    ///
517    /// ```
518    /// use std::borrow::Cow;
519    /// use fast_xml::Error;
520    /// use fast_xml::events::{BytesDecl, BytesStart};
521    ///
522    /// // <?xml version='1.1'?>
523    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" version='1.1'", 0));
524    /// assert!(decl.standalone().is_none());
525    ///
526    /// // <?xml standalone='yes'?>
527    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" standalone='yes'", 0));
528    /// match decl.standalone() {
529    ///     Some(Ok(Cow::Borrowed(encoding))) => assert_eq!(encoding, b"yes"),
530    ///     _ => assert!(false),
531    /// }
532    ///
533    /// // <?xml standalone='something_WRONG' encoding='utf-8'?>
534    /// let decl = BytesDecl::from_start(BytesStart::borrowed(b" standalone='something_WRONG' encoding='utf-8'", 0));
535    /// match decl.standalone() {
536    ///     Some(Ok(Cow::Borrowed(flag))) => assert_eq!(flag, b"something_WRONG"),
537    ///     _ => assert!(false),
538    /// }
539    /// ```
540    ///
541    /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl
542    pub fn standalone(&self) -> Option<Result<Cow<[u8]>>> {
543        self.element
544            .try_get_attribute("standalone")
545            .map(|a| a.map(|a| a.value))
546            .transpose()
547    }
548
549    /// Constructs a new `XmlDecl` from the (mandatory) _version_ (should be `1.0` or `1.1`),
550    /// the optional _encoding_ (e.g., `UTF-8`) and the optional _standalone_ (`yes` or `no`)
551    /// attribute.
552    ///
553    /// Does not escape any of its inputs. Always uses double quotes to wrap the attribute values.
554    /// The caller is responsible for escaping attribute values. Shouldn't usually be relevant since
555    /// the double quote character is not allowed in any of the attribute values.
556    pub fn new(
557        version: &[u8],
558        encoding: Option<&[u8]>,
559        standalone: Option<&[u8]>,
560    ) -> BytesDecl<'static> {
561        // Compute length of the buffer based on supplied attributes
562        // ' encoding=""'   => 12
563        let encoding_attr_len = if let Some(xs) = encoding {
564            12 + xs.len()
565        } else {
566            0
567        };
568        // ' standalone=""' => 14
569        let standalone_attr_len = if let Some(xs) = standalone {
570            14 + xs.len()
571        } else {
572            0
573        };
574        // 'xml version=""' => 14
575        let mut buf = Vec::with_capacity(14 + encoding_attr_len + standalone_attr_len);
576
577        buf.extend_from_slice(b"xml version=\"");
578        buf.extend_from_slice(version);
579
580        if let Some(encoding_val) = encoding {
581            buf.extend_from_slice(b"\" encoding=\"");
582            buf.extend_from_slice(encoding_val);
583        }
584
585        if let Some(standalone_val) = standalone {
586            buf.extend_from_slice(b"\" standalone=\"");
587            buf.extend_from_slice(standalone_val);
588        }
589        buf.push(b'"');
590
591        BytesDecl {
592            element: BytesStart::owned(buf, 3),
593        }
594    }
595
596    /// Gets the decoder struct
597    #[cfg(feature = "encoding_rs")]
598    pub fn encoder(&self) -> Option<&'static Encoding> {
599        self.encoding()
600            .and_then(|e| e.ok())
601            .and_then(|e| Encoding::for_label(&*e))
602    }
603
604    /// Converts the event into an owned event.
605    pub fn into_owned(self) -> BytesDecl<'static> {
606        BytesDecl {
607            element: self.element.into_owned(),
608        }
609    }
610}
611
612////////////////////////////////////////////////////////////////////////////////////////////////////
613
/// Closing tag data (`Event::End`): holds the name of the element being closed.
#[derive(Clone, Eq, PartialEq)]
pub struct BytesEnd<'a> {
    /// Raw tag name, borrowed or owned.
    name: Cow<'a, [u8]>,
}

impl<'a> BytesEnd<'a> {
    /// Creates a `BytesEnd` that borrows the given name.
    #[inline]
    pub fn borrowed(name: &'a [u8]) -> BytesEnd<'a> {
        let name = Cow::Borrowed(name);
        BytesEnd { name }
    }

    /// Creates a `BytesEnd` that owns the given name.
    #[inline]
    pub fn owned(name: Vec<u8>) -> BytesEnd<'static> {
        BytesEnd {
            name: Cow::from(name),
        }
    }

    /// Consumes the event and returns one that owns its name.
    pub fn into_owned(self) -> BytesEnd<'static> {
        BytesEnd::owned(self.name.into_owned())
    }

    /// Returns the raw tag name.
    #[inline]
    pub fn name(&self) -> &[u8] {
        &self.name[..]
    }

    /// Returns the local name: `name()` with any leading namespace removed
    /// (all content up to and including the first `:` character).
    ///
    /// A name without a `:` is returned unchanged.
    #[inline]
    pub fn local_name(&self) -> &[u8] {
        let full_name = self.name();
        match full_name.iter().position(|&byte| byte == b':') {
            Some(colon) => &full_name[colon + 1..],
            None => full_name,
        }
    }
}
662
663impl<'a> std::fmt::Debug for BytesEnd<'a> {
664    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
665        write!(f, "BytesEnd {{ name: ")?;
666        write_cow_string(f, &self.name)?;
667        write!(f, " }}")
668    }
669}
670
671////////////////////////////////////////////////////////////////////////////////////////////////////
672
/// Data from various events (most notably, `Event::Text`) that is stored in XML
/// in escaped form. Internally the data is kept in escaped form as well.
#[derive(Clone, Eq, PartialEq)]
pub struct BytesText<'a> {
    // Invariant: The content is always escaped.
    /// Escaped bytes of the text, borrowed or owned.
    content: Cow<'a, [u8]>,
}
680
681impl<'a> BytesText<'a> {
682    /// Creates a new `BytesText` from an escaped byte sequence.
683    #[inline]
684    pub fn from_escaped<C: Into<Cow<'a, [u8]>>>(content: C) -> Self {
685        Self {
686            content: content.into(),
687        }
688    }
689
690    /// Creates a new `BytesText` from a byte sequence. The byte sequence is
691    /// expected not to be escaped.
692    #[inline]
693    pub fn from_plain(content: &'a [u8]) -> Self {
694        Self {
695            content: escape(content),
696        }
697    }
698
699    /// Creates a new `BytesText` from an escaped string.
700    #[inline]
701    pub fn from_escaped_str<C: Into<Cow<'a, str>>>(content: C) -> Self {
702        Self::from_escaped(match content.into() {
703            Cow::Owned(o) => Cow::Owned(o.into_bytes()),
704            Cow::Borrowed(b) => Cow::Borrowed(b.as_bytes()),
705        })
706    }
707
708    /// Creates a new `BytesText` from a string. The string is expected not to
709    /// be escaped.
710    #[inline]
711    pub fn from_plain_str(content: &'a str) -> Self {
712        Self::from_plain(content.as_bytes())
713    }
714
715    /// Ensures that all data is owned to extend the object's lifetime if
716    /// necessary.
717    #[inline]
718    pub fn into_owned(self) -> BytesText<'static> {
719        BytesText {
720            content: self.content.into_owned().into(),
721        }
722    }
723
724    /// Extracts the inner `Cow` from the `BytesText` event container.
725    #[inline]
726    pub fn into_inner(self) -> Cow<'a, [u8]> {
727        self.content
728    }
729
730    /// Returns unescaped version of the text content, that can be written
731    /// as CDATA in XML
732    #[cfg(feature = "serialize")]
733    pub(crate) fn unescape(self) -> std::result::Result<BytesCData<'a>, EscapeError> {
734        //TODO: need to think about better API instead of dozens similar functions
735        // Maybe use builder pattern. After that expose function as public API
736        //FIXME: need to take into account entities defined in the document
737        Ok(BytesCData::new(match do_unescape(&self.content, None)? {
738            Cow::Borrowed(_) => self.content,
739            Cow::Owned(unescaped) => Cow::Owned(unescaped),
740        }))
741    }
742
743    /// gets escaped content
744    ///
745    /// Searches for '&' into content and try to escape the coded character if possible
746    /// returns Malformed error with index within element if '&' is not followed by ';'
747    ///
748    /// See also [`unescaped_with_custom_entities()`](#method.unescaped_with_custom_entities)
749    pub fn unescaped(&self) -> Result<Cow<[u8]>> {
750        self.make_unescaped(None)
751    }
752
753    /// gets escaped content with custom entities
754    ///
755    /// Searches for '&' into content and try to escape the coded character if possible
756    /// returns Malformed error with index within element if '&' is not followed by ';'
757    /// Additional entities can be provided in `custom_entities`.
758    ///
759    /// # Pre-condition
760    ///
761    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
762    ///
763    /// See also [`unescaped()`](#method.unescaped)
764    pub fn unescaped_with_custom_entities<'s>(
765        &'s self,
766        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
767    ) -> Result<Cow<'s, [u8]>> {
768        self.make_unescaped(Some(custom_entities))
769    }
770
771    fn make_unescaped<'s>(
772        &'s self,
773        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
774    ) -> Result<Cow<'s, [u8]>> {
775        do_unescape(self, custom_entities).map_err(Error::EscapeError)
776    }
777
778    /// helper method to unescape then decode self using the reader encoding
779    /// but without BOM (Byte order mark)
780    ///
781    /// for performance reasons (could avoid allocating a `String`),
782    /// it might be wiser to manually use
783    /// 1. BytesText::unescaped()
784    /// 2. Reader::decode(...)
785    #[cfg(feature = "encoding")]
786    pub fn unescape_and_decode_without_bom<B: BufRead>(
787        &self,
788        reader: &mut Reader<B>,
789    ) -> Result<String> {
790        self.do_unescape_and_decode_without_bom(reader, None)
791    }
792
793    /// helper method to unescape then decode self using the reader encoding
794    /// but without BOM (Byte order mark)
795    ///
796    /// for performance reasons (could avoid allocating a `String`),
797    /// it might be wiser to manually use
798    /// 1. BytesText::unescaped()
799    /// 2. Reader::decode(...)
800    #[cfg(not(feature = "encoding"))]
801    pub fn unescape_and_decode_without_bom<B: BufRead>(
802        &self,
803        reader: &Reader<B>,
804    ) -> Result<String> {
805        self.do_unescape_and_decode_without_bom(reader, None)
806    }
807
808    /// helper method to unescape then decode self using the reader encoding with custom entities
809    /// but without BOM (Byte order mark)
810    ///
811    /// for performance reasons (could avoid allocating a `String`),
812    /// it might be wiser to manually use
813    /// 1. BytesText::unescaped()
814    /// 2. Reader::decode(...)
815    ///
816    /// # Pre-condition
817    ///
818    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
819    #[cfg(feature = "encoding")]
820    pub fn unescape_and_decode_without_bom_with_custom_entities<B: BufRead>(
821        &self,
822        reader: &mut Reader<B>,
823        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
824    ) -> Result<String> {
825        self.do_unescape_and_decode_without_bom(reader, Some(custom_entities))
826    }
827
828    /// helper method to unescape then decode self using the reader encoding with custom entities
829    /// but without BOM (Byte order mark)
830    ///
831    /// for performance reasons (could avoid allocating a `String`),
832    /// it might be wiser to manually use
833    /// 1. BytesText::unescaped()
834    /// 2. Reader::decode(...)
835    ///
836    /// # Pre-condition
837    ///
838    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
839    #[cfg(not(feature = "encoding"))]
840    pub fn unescape_and_decode_without_bom_with_custom_entities<B: BufRead>(
841        &self,
842        reader: &Reader<B>,
843        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
844    ) -> Result<String> {
845        self.do_unescape_and_decode_without_bom(reader, Some(custom_entities))
846    }
847
848    #[cfg(feature = "encoding")]
849    fn do_unescape_and_decode_without_bom<B: BufRead>(
850        &self,
851        reader: &mut Reader<B>,
852        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
853    ) -> Result<String> {
854        let decoded = reader.decode_without_bom(&*self);
855        let unescaped =
856            do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?;
857        String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error()))
858    }
859
860    #[cfg(not(feature = "encoding"))]
861    fn do_unescape_and_decode_without_bom<B: BufRead>(
862        &self,
863        reader: &Reader<B>,
864        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
865    ) -> Result<String> {
866        let decoded = reader.decode_without_bom(&*self)?;
867        let unescaped =
868            do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?;
869        String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error()))
870    }
871
872    /// helper method to unescape then decode self using the reader encoding
873    ///
874    /// for performance reasons (could avoid allocating a `String`),
875    /// it might be wiser to manually use
876    /// 1. BytesText::unescaped()
877    /// 2. Reader::decode(...)
878    pub fn unescape_and_decode<B: BufRead>(&self, reader: &Reader<B>) -> Result<String> {
879        self.do_unescape_and_decode_with_custom_entities(reader, None)
880    }
881
882    /// helper method to unescape then decode self using the reader encoding with custom entities
883    ///
884    /// for performance reasons (could avoid allocating a `String`),
885    /// it might be wiser to manually use
886    /// 1. BytesText::unescaped()
887    /// 2. Reader::decode(...)
888    ///
889    /// # Pre-condition
890    ///
891    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
892    pub fn unescape_and_decode_with_custom_entities<B: BufRead>(
893        &self,
894        reader: &Reader<B>,
895        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
896    ) -> Result<String> {
897        self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities))
898    }
899
900    #[cfg(feature = "encoding")]
901    fn do_unescape_and_decode_with_custom_entities<B: BufRead>(
902        &self,
903        reader: &Reader<B>,
904        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
905    ) -> Result<String> {
906        let decoded = reader.decode(&*self);
907        let unescaped =
908            do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?;
909        String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error()))
910    }
911
912    #[cfg(not(feature = "encoding"))]
913    fn do_unescape_and_decode_with_custom_entities<B: BufRead>(
914        &self,
915        reader: &Reader<B>,
916        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
917    ) -> Result<String> {
918        let decoded = reader.decode(&*self)?;
919        let unescaped =
920            do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?;
921        String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error()))
922    }
923
924    /// Gets escaped content.
925    pub fn escaped(&self) -> &[u8] {
926        self.content.as_ref()
927    }
928}
929
930impl<'a> std::fmt::Debug for BytesText<'a> {
931    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
932        write!(f, "BytesText {{ content: ")?;
933        write_cow_string(f, &self.content)?;
934        write!(f, " }}")
935    }
936}
937
938////////////////////////////////////////////////////////////////////////////////////////////////////
939
940/// CDATA content contains unescaped data from the reader. If you want to write them as a text,
941/// [convert](Self::escape) it to [`BytesText`]
942#[derive(Clone, Eq, PartialEq)]
943pub struct BytesCData<'a> {
944    content: Cow<'a, [u8]>,
945}
946
947impl<'a> BytesCData<'a> {
948    /// Creates a new `BytesCData` from a byte sequence.
949    #[inline]
950    pub fn new<C: Into<Cow<'a, [u8]>>>(content: C) -> Self {
951        Self {
952            content: content.into(),
953        }
954    }
955
956    /// Creates a new `BytesCData` from a string
957    #[inline]
958    pub fn from_str(content: &'a str) -> Self {
959        Self::new(content.as_bytes())
960    }
961
962    /// Ensures that all data is owned to extend the object's lifetime if
963    /// necessary.
964    #[inline]
965    pub fn into_owned(self) -> BytesCData<'static> {
966        BytesCData {
967            content: self.content.into_owned().into(),
968        }
969    }
970
971    /// Extracts the inner `Cow` from the `BytesCData` event container.
972    #[inline]
973    pub fn into_inner(self) -> Cow<'a, [u8]> {
974        self.content
975    }
976
977    /// Converts this CDATA content to an escaped version, that can be written
978    /// as an usual text in XML.
979    ///
980    /// This function performs following replacements:
981    ///
982    /// | Character | Replacement
983    /// |-----------|------------
984    /// | `<`       | `&lt;`
985    /// | `>`       | `&gt;`
986    /// | `&`       | `&amp;`
987    /// | `'`       | `&apos;`
988    /// | `"`       | `&quot;`
989    pub fn escape(self) -> BytesText<'a> {
990        BytesText::from_escaped(match escape(&self.content) {
991            Cow::Borrowed(_) => self.content,
992            Cow::Owned(escaped) => Cow::Owned(escaped),
993        })
994    }
995
996    /// Converts this CDATA content to an escaped version, that can be written
997    /// as an usual text in XML.
998    ///
999    /// In XML text content, it is allowed (though not recommended) to leave
1000    /// the quote special characters `"` and `'` unescaped.
1001    ///
1002    /// This function performs following replacements:
1003    ///
1004    /// | Character | Replacement
1005    /// |-----------|------------
1006    /// | `<`       | `&lt;`
1007    /// | `>`       | `&gt;`
1008    /// | `&`       | `&amp;`
1009    pub fn partial_escape(self) -> BytesText<'a> {
1010        BytesText::from_escaped(match partial_escape(&self.content) {
1011            Cow::Borrowed(_) => self.content,
1012            Cow::Owned(escaped) => Cow::Owned(escaped),
1013        })
1014    }
1015
1016    /// Gets content of this text buffer in the specified encoding
1017    #[cfg(feature = "serialize")]
1018    pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
1019        Ok(match &self.content {
1020            Cow::Borrowed(bytes) => {
1021                #[cfg(feature = "encoding")]
1022                {
1023                    decoder.decode(bytes)
1024                }
1025                #[cfg(not(feature = "encoding"))]
1026                {
1027                    decoder.decode(bytes)?.into()
1028                }
1029            }
1030            Cow::Owned(bytes) => {
1031                #[cfg(feature = "encoding")]
1032                let decoded = decoder.decode(bytes).into_owned();
1033
1034                #[cfg(not(feature = "encoding"))]
1035                let decoded = decoder.decode(bytes)?.to_string();
1036
1037                decoded.into()
1038            }
1039        })
1040    }
1041}
1042
1043impl<'a> std::fmt::Debug for BytesCData<'a> {
1044    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1045        write!(f, "BytesCData {{ content: ")?;
1046        write_cow_string(f, &self.content)?;
1047        write!(f, " }}")
1048    }
1049}
1050
1051////////////////////////////////////////////////////////////////////////////////////////////////////
1052
1053/// Event emitted by [`Reader::read_event`].
1054///
1055/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event
1056#[derive(Clone, Debug, Eq, PartialEq)]
1057pub enum Event<'a> {
1058    /// Start tag (with attributes) `<tag attr="value">`.
1059    Start(BytesStart<'a>),
1060    /// End tag `</tag>`.
1061    End(BytesEnd<'a>),
1062    /// Empty element tag (with attributes) `<tag attr="value" />`.
1063    Empty(BytesStart<'a>),
1064    /// Character data between `Start` and `End` element.
1065    Text(BytesText<'a>),
1066    /// Comment `<!-- ... -->`.
1067    Comment(BytesText<'a>),
1068    /// CData `<![CDATA[...]]>`.
1069    CData(BytesCData<'a>),
1070    /// XML declaration `<?xml ...?>`.
1071    Decl(BytesDecl<'a>),
1072    /// Processing instruction `<?...?>`.
1073    PI(BytesText<'a>),
1074    /// Doctype `<!DOCTYPE ...>`.
1075    DocType(BytesText<'a>),
1076    /// End of XML document.
1077    Eof,
1078}
1079
1080impl<'a> Event<'a> {
1081    /// Converts the event to an owned version, untied to the lifetime of
1082    /// buffer used when reading but incurring a new, separate allocation.
1083    pub fn into_owned(self) -> Event<'static> {
1084        match self {
1085            Event::Start(e) => Event::Start(e.into_owned()),
1086            Event::End(e) => Event::End(e.into_owned()),
1087            Event::Empty(e) => Event::Empty(e.into_owned()),
1088            Event::Text(e) => Event::Text(e.into_owned()),
1089            Event::Comment(e) => Event::Comment(e.into_owned()),
1090            Event::CData(e) => Event::CData(e.into_owned()),
1091            Event::Decl(e) => Event::Decl(e.into_owned()),
1092            Event::PI(e) => Event::PI(e.into_owned()),
1093            Event::DocType(e) => Event::DocType(e.into_owned()),
1094            Event::Eof => Event::Eof,
1095        }
1096    }
1097}
1098
1099////////////////////////////////////////////////////////////////////////////////////////////////////
1100
1101impl<'a> Deref for BytesStart<'a> {
1102    type Target = [u8];
1103    fn deref(&self) -> &[u8] {
1104        &*self.buf
1105    }
1106}
1107
1108impl<'a> Deref for BytesDecl<'a> {
1109    type Target = [u8];
1110    fn deref(&self) -> &[u8] {
1111        &*self.element
1112    }
1113}
1114
1115impl<'a> Deref for BytesEnd<'a> {
1116    type Target = [u8];
1117    fn deref(&self) -> &[u8] {
1118        &*self.name
1119    }
1120}
1121
1122impl<'a> Deref for BytesText<'a> {
1123    type Target = [u8];
1124    fn deref(&self) -> &[u8] {
1125        &*self.content
1126    }
1127}
1128
1129impl<'a> Deref for BytesCData<'a> {
1130    type Target = [u8];
1131
1132    fn deref(&self) -> &[u8] {
1133        &*self.content
1134    }
1135}
1136
1137impl<'a> Deref for Event<'a> {
1138    type Target = [u8];
1139    fn deref(&self) -> &[u8] {
1140        match *self {
1141            Event::Start(ref e) | Event::Empty(ref e) => &*e,
1142            Event::End(ref e) => &*e,
1143            Event::Text(ref e) => &*e,
1144            Event::Decl(ref e) => &*e,
1145            Event::PI(ref e) => &*e,
1146            Event::CData(ref e) => &*e,
1147            Event::Comment(ref e) => &*e,
1148            Event::DocType(ref e) => &*e,
1149            Event::Eof => &[],
1150        }
1151    }
1152}
1153
/// Identity conversion: an `Event` can be borrowed as an `&Event`, so it can be
/// passed wherever an `AsRef<Event>` bound is required.
impl<'a> AsRef<Event<'a>> for Event<'a> {
    /// Returns `self` unchanged.
    fn as_ref(&self) -> &Event<'a> {
        self
    }
}
1159
1160////////////////////////////////////////////////////////////////////////////////////////////////////
1161
#[cfg(test)]
mod test {
    use super::*;
    use pretty_assertions::assert_eq;

    /// `local_name` must yield whatever follows the first `:` of a tag name,
    /// for start tags and end tags alike.
    #[test]
    fn local_name() {
        use std::str::from_utf8;
        let xml = r#"
            <foo:bus attr='bar'>foobusbar</foo:bus>
            <foo: attr='bar'>foobusbar</foo:>
            <:foo attr='bar'>foobusbar</:foo>
            <foo:bus:baz attr='bar'>foobusbar</foo:bus:baz>
            "#;
        let mut reader = Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut parsed_local_names = Vec::new();
        loop {
            let event = reader
                .read_event(&mut buf)
                .expect("unable to read xml event");
            // Only start and end tags contribute a name; anything else is skipped.
            let raw_name = match event {
                Event::Start(ref e) => e.local_name(),
                Event::End(ref e) => e.local_name(),
                Event::Eof => break,
                _ => continue,
            };
            let name = from_utf8(raw_name).expect("unable to build str from local_name");
            parsed_local_names.push(name.to_string());
        }

        // One start and one end name per element, in document order.
        let expected = ["bus", "bus", "", "", "foo", "foo", "bus:baz", "bus:baz"];
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(parsed_local_names[idx], want.to_string());
        }
    }

    /// A freshly created start tag consists of just its name.
    #[test]
    fn bytestart_create() {
        let start = BytesStart::owned_name("test");
        assert_eq!(start.len(), 4);
        assert_eq!(start.name(), b"test");
    }

    /// Renaming a start tag keeps its attributes and adjusts the total length.
    #[test]
    fn bytestart_set_name() {
        let mut start = BytesStart::owned_name("test");
        assert_eq!(start.len(), 4);
        assert_eq!(start.name(), b"test");
        assert_eq!(start.attributes_raw(), b"");

        start.push_attribute(("x", "a"));
        assert_eq!(start.len(), 10);
        assert_eq!(start.attributes_raw(), b" x=\"a\"");

        start.set_name(b"g");
        assert_eq!(start.len(), 7);
        assert_eq!(start.name(), b"g");
    }

    /// Clearing the attributes leaves only the tag name behind.
    #[test]
    fn bytestart_clear_attributes() {
        let mut start = BytesStart::owned_name("test");
        for _ in 0..2 {
            start.push_attribute(("x", "y\"z"));
        }
        start.clear_attributes();
        assert!(start.attributes().next().is_none());
        assert_eq!(start.len(), 4);
        assert_eq!(start.name(), b"test");
    }
}