quick_xml/reader/slice_reader.rs
1//! This is an implementation of [`Reader`] for reading from a `&[u8]` as
2//! underlying byte stream. This implementation supports not using an
3//! intermediate buffer as the byte slice itself can be used to borrow from.
4
5use std::io;
6
7#[cfg(feature = "encoding")]
8use crate::encoding::DetectedEncoding;
9#[cfg(feature = "encoding")]
10use crate::reader::EncodingRef;
11#[cfg(feature = "encoding")]
12use encoding_rs;
13
14use crate::errors::{Error, Result};
15use crate::events::{BytesText, Event};
16use crate::name::QName;
17use crate::parser::Parser;
18use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
19use crate::utils::is_whitespace;
20
21/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
22/// This implementation supports not using an intermediate buffer as the byte slice
23/// itself can be used to borrow from.
24impl<'a> Reader<&'a [u8]> {
25 /// Creates an XML reader from a string slice.
26 #[allow(clippy::should_implement_trait)]
27 pub fn from_str(s: &'a str) -> Self {
28 // Rust strings are guaranteed to be UTF-8, so lock the encoding
29 #[cfg(feature = "encoding")]
30 {
31 let mut reader = Self::from_reader(s.as_bytes());
32 reader.state.encoding = EncodingRef::Explicit(encoding_rs::UTF_8);
33 reader
34 }
35
36 #[cfg(not(feature = "encoding"))]
37 Self::from_reader(s.as_bytes())
38 }
39
40 /// Read an event that borrows from the input rather than a buffer.
41 ///
42 /// There is no asynchronous `read_event_async()` version of this function,
43 /// because it is not necessary -- the contents are already in memory and no IO
44 /// is needed, therefore there is no potential for blocking.
45 ///
46 /// # Examples
47 ///
48 /// ```
49 /// # use pretty_assertions::assert_eq;
50 /// use quick_xml::events::Event;
51 /// use quick_xml::reader::Reader;
52 ///
53 /// let mut reader = Reader::from_str(r#"
54 /// <tag1 att1 = "test">
55 /// <tag2><!--Test comment-->Test</tag2>
56 /// <tag2>Test 2</tag2>
57 /// </tag1>
58 /// "#);
59 /// reader.config_mut().trim_text(true);
60 ///
61 /// let mut count = 0;
62 /// let mut txt = Vec::new();
63 /// loop {
64 /// match reader.read_event().unwrap() {
65 /// Event::Start(e) => count += 1,
66 /// Event::Text(e) => txt.push(e.decode().unwrap().into_owned()),
67 /// Event::Eof => break,
68 /// _ => (),
69 /// }
70 /// }
71 /// assert_eq!(count, 3);
72 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
73 /// ```
74 #[inline]
75 pub fn read_event(&mut self) -> Result<Event<'a>> {
76 self.read_event_impl(())
77 }
78
79 /// Reads until end element is found. This function is supposed to be called
80 /// after you already read a [`Start`] event.
81 ///
82 /// Returns a span that cover content between `>` of an opening tag and `<` of
83 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
84 /// this method was called after reading expanded [`Start`] event.
85 ///
86 /// Manages nested cases where parent and child elements have the _literally_
87 /// same name.
88 ///
89 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
90 /// will be returned. In particularly, that error will be returned if you call
91 /// this method without consuming the corresponding [`Start`] event first.
92 ///
93 /// The `end` parameter should contain name of the end element _in the reader
94 /// encoding_. It is good practice to always get that parameter using
95 /// [`BytesStart::to_end()`] method.
96 ///
97 /// The correctness of the skipped events does not checked, if you disabled
98 /// the [`check_end_names`] option.
99 ///
100 /// There is no asynchronous `read_to_end_async()` version of this function,
101 /// because it is not necessary -- the contents are already in memory and no IO
102 /// is needed, therefore there is no potential for blocking.
103 ///
104 /// # Namespaces
105 ///
106 /// While the `Reader` does not support namespace resolution, namespaces
107 /// does not change the algorithm for comparing names. Although the names
108 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
109 /// same namespace, are semantically equivalent, `</b:name>` cannot close
110 /// `<a:name>`, because according to [the specification]
111 ///
112 /// > The end of every element that begins with a **start-tag** MUST be marked
113 /// > by an **end-tag** containing a name that echoes the element's type as
114 /// > given in the **start-tag**
115 ///
116 /// # Examples
117 ///
118 /// This example shows, how you can skip XML content after you read the
119 /// start event.
120 ///
121 /// ```
122 /// # use pretty_assertions::assert_eq;
123 /// use quick_xml::events::{BytesStart, Event};
124 /// use quick_xml::reader::Reader;
125 ///
126 /// let mut reader = Reader::from_str(r#"
127 /// <outer>
128 /// <inner>
129 /// <inner></inner>
130 /// <inner/>
131 /// <outer></outer>
132 /// <outer/>
133 /// </inner>
134 /// </outer>
135 /// "#);
136 /// reader.config_mut().trim_text(true);
137 ///
138 /// let start = BytesStart::new("outer");
139 /// let end = start.to_end().into_owned();
140 ///
141 /// // First, we read a start event...
142 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
143 ///
144 /// // ...then, we could skip all events to the corresponding end event.
145 /// // This call will correctly handle nested <outer> elements.
146 /// // Note, however, that this method does not handle namespaces.
147 /// reader.read_to_end(end.name()).unwrap();
148 ///
149 /// // At the end we should get an Eof event, because we ate the whole XML
150 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
151 /// ```
152 ///
153 /// [`Start`]: Event::Start
154 /// [`End`]: Event::End
155 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
156 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
157 /// [`check_end_names`]: crate::reader::Config::check_end_names
158 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
159 pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
160 Ok(read_to_end!(self, end, (), read_event_impl, {}))
161 }
162
163 /// Reads content between start and end tags, including any markup. This
164 /// function is supposed to be called after you already read a [`Start`] event.
165 ///
166 /// Manages nested cases where parent and child elements have the _literally_
167 /// same name.
168 ///
169 /// This method does not unescape read data, instead it returns content
170 /// "as is" of the XML document. This is because it has no idea what text
171 /// it reads, and if, for example, it contains CDATA section, attempt to
172 /// unescape it content will spoil data.
173 ///
174 /// Any text will be decoded using the XML current [`decoder()`].
175 ///
176 /// Actually, this method perform the following code:
177 ///
178 /// ```ignore
179 /// let span = reader.read_to_end(end)?;
180 /// let text = reader.decoder().decode(&reader.inner_slice[span]);
181 /// ```
182 ///
183 /// # Examples
184 ///
185 /// This example shows, how you can read a HTML content from your XML document.
186 ///
187 /// ```
188 /// # use pretty_assertions::assert_eq;
189 /// # use std::borrow::Cow;
190 /// use quick_xml::events::{BytesStart, Event};
191 /// use quick_xml::reader::Reader;
192 ///
193 /// let mut reader = Reader::from_str("
194 /// <html>
195 /// <title>This is a HTML text</title>
196 /// <p>Usual XML rules does not apply inside it
197 /// <p>For example, elements not needed to be "closed"
198 /// </html>
199 /// ");
200 /// reader.config_mut().trim_text(true);
201 ///
202 /// let start = BytesStart::new("html");
203 /// let end = start.to_end().into_owned();
204 ///
205 /// // First, we read a start event...
206 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
207 /// // ...and disable checking of end names because we expect HTML further...
208 /// reader.config_mut().check_end_names = false;
209 ///
210 /// // ...then, we could read text content until close tag.
211 /// // This call will correctly handle nested <html> elements.
212 /// let text = reader.read_text(end.name()).unwrap();
213 /// let text = text.decode().unwrap();
214 /// assert_eq!(text, r#"
215 /// <title>This is a HTML text</title>
216 /// <p>Usual XML rules does not apply inside it
217 /// <p>For example, elements not needed to be "closed"
218 /// "#);
219 /// assert!(matches!(text, Cow::Borrowed(_)));
220 ///
221 /// // Now we can enable checks again
222 /// reader.config_mut().check_end_names = true;
223 ///
224 /// // At the end we should get an Eof event, because we ate the whole XML
225 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
226 /// ```
227 ///
228 /// [`Start`]: Event::Start
229 /// [`decoder()`]: Self::decoder()
230 pub fn read_text(&mut self, end: QName) -> Result<BytesText<'a>> {
231 // self.reader will be changed, so store original reference
232 let buffer = self.reader;
233 let span = self.read_to_end(end)?;
234
235 let len = span.end - span.start;
236 // SAFETY: `span` can only contain indexes up to usize::MAX because it
237 // was created from offsets from a single &[u8] slice
238 Ok(BytesText::wrap(&buffer[0..len as usize], self.decoder()))
239 }
240}
241
242////////////////////////////////////////////////////////////////////////////////////////////////////
243
244/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
245/// that will be borrowed by events. This implementation provides a zero-copy deserialization
246impl<'a> XmlSource<'a, ()> for &'a [u8] {
247 #[cfg(not(feature = "encoding"))]
248 #[inline]
249 fn remove_utf8_bom(&mut self) -> io::Result<()> {
250 if self.starts_with(crate::encoding::UTF8_BOM) {
251 *self = &self[crate::encoding::UTF8_BOM.len()..];
252 }
253 Ok(())
254 }
255
256 #[cfg(feature = "encoding")]
257 #[inline]
258 fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>> {
259 if let Some(detected) = crate::encoding::detect_encoding(self) {
260 *self = &self[detected.bom_len() as usize..];
261 return Ok(Some(detected));
262 }
263 Ok(None)
264 }
265
266 #[inline]
267 fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
268 // Search for start of markup or an entity or character reference
269 match memchr::memchr2(b'<', b'&', self) {
270 Some(0) if self[0] == b'<' => ReadTextResult::Markup(()),
271 // Do not consume `&` because it may be lone and we would be need to
272 // return it as part of Text event
273 Some(0) => ReadTextResult::Ref(()),
274 Some(i) if self[i] == b'<' => {
275 let (bytes, rest) = self.split_at(i);
276 *self = rest;
277 *position += i as u64;
278 ReadTextResult::UpToMarkup(bytes)
279 }
280 Some(i) => {
281 let (bytes, rest) = self.split_at(i);
282 *self = rest;
283 *position += i as u64;
284 ReadTextResult::UpToRef(bytes)
285 }
286 None => {
287 let bytes = &self[..];
288 *self = &[];
289 *position += bytes.len() as u64;
290 ReadTextResult::UpToEof(bytes)
291 }
292 }
293 }
294
295 #[inline]
296 fn read_ref(&mut self, _buf: (), position: &mut u64) -> ReadRefResult<'a> {
297 debug_assert!(
298 self.starts_with(b"&"),
299 "`read_ref` must be called at `&`:\n{:?}",
300 crate::utils::Bytes(self)
301 );
302 // Search for the end of reference or a start of another reference or a markup
303 match memchr::memchr3(b';', b'&', b'<', &self[1..]) {
304 Some(i) if self[i + 1] == b';' => {
305 // +1 for the start `&`
306 // +1 for the end `;`
307 let end = i + 2;
308 let (bytes, rest) = self.split_at(end);
309 *self = rest;
310 *position += end as u64;
311
312 ReadRefResult::Ref(bytes)
313 }
314 // Do not consume `&` because it may be lone and we would be need to
315 // return it as part of Text event
316 Some(i) => {
317 let is_amp = self[i + 1] == b'&';
318 let (bytes, rest) = self.split_at(i + 1);
319 *self = rest;
320 *position += i as u64 + 1;
321
322 if is_amp {
323 ReadRefResult::UpToRef(bytes)
324 } else {
325 ReadRefResult::UpToMarkup(bytes)
326 }
327 }
328 None => {
329 let bytes = &self[..];
330 *self = &[];
331 *position += bytes.len() as u64;
332
333 ReadRefResult::UpToEof(bytes)
334 }
335 }
336 }
337
338 #[inline]
339 fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
340 where
341 P: Parser,
342 {
343 if let Some(i) = parser.feed(self) {
344 let used = i + 1; // +1 for `>`
345 *position += used as u64;
346 let (bytes, rest) = self.split_at(used);
347 *self = rest;
348 return Ok(bytes);
349 }
350
351 *position += self.len() as u64;
352 Err(Error::Syntax(parser.eof_error(self)))
353 }
354
355 #[inline]
356 fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
357 // Peeked one bang ('!') before being called, so it's guaranteed to
358 // start with it.
359 debug_assert!(
360 self.starts_with(b"<!"),
361 "`read_bang_element` must be called at `<!`:\n{:?}",
362 crate::utils::Bytes(self)
363 );
364
365 let mut bang_type = BangType::new(self.get(2).copied())?;
366
367 if let Some(i) = bang_type.feed(&[], self) {
368 let consumed = i + 1; // +1 for `>`
369 *position += consumed as u64;
370 let (bytes, rest) = self.split_at(consumed);
371 *self = rest;
372 return Ok((bang_type, bytes));
373 }
374
375 *position += self.len() as u64;
376 Err(Error::Syntax(bang_type.to_err()))
377 }
378
379 #[inline]
380 fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
381 let whitespaces = self
382 .iter()
383 .position(|b| !is_whitespace(*b))
384 .unwrap_or(self.len());
385 *position += whitespaces as u64;
386 *self = &self[whitespaces..];
387 Ok(())
388 }
389
390 #[inline]
391 fn peek_one(&mut self) -> io::Result<Option<u8>> {
392 debug_assert!(
393 self.starts_with(b"<"),
394 "markup must start from '<':\n{:?}",
395 crate::utils::Bytes(self)
396 );
397 Ok(self.get(1).copied())
398 }
399}
400
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Identity constructor: the byte array coming from a test is handed back
    /// unchanged
    fn identity<T>(value: T) -> T {
        value
    }

    // Expands the `check!` macro imported from `crate::reader::test` for the
    // slice source; the unit value is passed where a buffer is expected.
    // NOTE(review): the meaning of the literal `0` is defined by the macro,
    // which is not visible here -- confirm against its definition.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        0,
        ()
    );
}