Skip to main content

embedded_xml/
reader.rs

1use embedded_io::Error;
2
3use crate::Result;
4use crate::attributes::AttributeReader;
5use crate::events::Event;
6
7use core::ops::Range;
8
9#[cfg(test)]
10extern crate std;
11
// Internal diagnostics helper.
//
// Forwards its format arguments to `log::trace!` when the `log` feature is
// enabled and to `std::eprintln!` when compiled for tests. When neither
// configuration is active the invocation expands to nothing.
macro_rules! trace {
    ($($fmt_args:tt)*) => {
        #[cfg(feature = "log")]
        log::trace!($($fmt_args)*);
        #[cfg(test)]
        std::eprintln!($($fmt_args)*);
    };
}
20
/// A streaming XML reader.
///
/// Input is pulled from `reader` into a fixed-size temporary buffer, which can
/// be either owned or borrowed depending on the `Buffer` type parameter.
pub struct Reader<R, Buffer> {
    /// Source the document bytes are read from.
    reader: R,
    /// Number of document bytes that have not been read from `reader` yet.
    remaining: usize,
    /// Temporary storage holding the part of the document being parsed.
    buffer: Buffer,
    /// Start of the unparsed window inside `buffer`.
    pos: usize,
    /// End (exclusive) of the valid data inside `buffer`.
    end: usize,
    /// `true` until the first call to `next_event` has looked for the XML declaration.
    at_start: bool,
    /// Buffer range of a self-closing tag whose `EndElement` event is still pending.
    self_closing: Option<Range<usize>>,
}
32
33impl<'a, R: embedded_io::Read> Reader<R, &'a mut [u8]> {
34    /// Creates a new Reader with a borrowed buffer.
35    /// ```
36    /// # use embedded_xml as xml;
37    /// # fn main() -> Result<(), xml::Error> {
38    /// # let xml = "<?xml version=\"1.0\"?>";
39    /// # let mut reader = xml.as_bytes();
40    /// let mut buffer = [0u8; 256];
41    /// let mut parser = xml::Reader::new_borrowed(&mut reader, xml.len(), &mut buffer)?;
42    /// # Ok(())
43    /// # }
44    /// ```
45    pub fn new_borrowed(reader: R, total_size: usize, buffer: &'a mut [u8]) -> Result<Self> {
46        Self::new_with_read(reader, total_size, buffer)
47    }
48}
49
#[cfg(feature = "alloc")]
impl<R> Reader<R, alloc::vec::Vec<u8>>
where
    R: embedded_io::Read,
{
    /// Builds a `Reader` that allocates and owns a zero-initialised buffer of
    /// `buffer_size` bytes.
    ///
    /// `total_size` is the size of the document in bytes. The buffer is filled
    /// with the first chunk of input before the reader is returned, so an I/O
    /// failure of that initial read is reported here.
    /// ```
    /// # use embedded_xml as xml;
    /// # fn main() -> Result<(), xml::Error> {
    /// # let xml = "<?xml version=\"1.0\"?>";
    /// # let mut reader = xml.as_bytes();
    /// let mut parser = xml::Reader::new(&mut reader, xml.len(), 256)?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn new(reader: R, total_size: usize, buffer_size: usize) -> Result<Self> {
        let scratch: alloc::vec::Vec<u8> = alloc::vec![0; buffer_size];
        Reader::new_with_read(reader, total_size, scratch)
    }
}
67
68impl<R: embedded_io::Read, Buffer: AsRef<[u8]> + AsMut<[u8]>> Reader<R, Buffer> {
69    fn new_with_read(mut reader: R, total_size: usize, mut buffer: Buffer) -> Result<Self> {
70        let end = reader
71            .read(buffer.as_mut())
72            .map_err(|e| crate::Error::IoError(e.kind()))?;
73        let remaining = total_size - end;
74        Ok(Reader {
75            reader,
76            remaining,
77            buffer,
78            pos: 0,
79            end,
80            at_start: true,
81            self_closing: None,
82        })
83    }
84
85    /// Advances the reader to the next event and returns it.
86    /// 
87    /// # Examples
88    /// ```
89    /// # use embedded_xml as xml;
90    /// # fn main() -> Result<(), xml::Error> {
91    /// # let xml = "<?xml version=\"1.0\"?>";
92    /// # let mut reader = xml.as_bytes();
93    /// # let mut buffer = [0u8; 256];
94    /// # let mut reader = xml::Reader::new_borrowed(&mut reader, xml.len(), &mut buffer)?;
95    /// loop {
96    ///     match reader.next_event()? {
97    ///         xml::Event::Declaration { mut attrs } => {
98    ///             assert_eq!(attrs.get("version"), Some("1.0"));
99    ///         }
100    ///         xml::Event::StartElement { name: "item", mut attrs } => {
101    ///             for (name, value) in attrs {
102    ///                println!("Attribute: {} = {}", name, value);
103    ///             }
104    ///         }
105    ///         xml::Event::EndElement { name } => {
106    ///             println!("End element: {}", name);
107    ///         }
108    ///         xml::Event::EndOfFile => break,
109    ///         _ => {}
110    ///     }
111    /// }
112    /// # Ok(())
113    /// # }
114    /// ```
115    pub fn next_event(&mut self) -> Result<Event<'_>> {
116        // Ensure we have an XML declaration at the start of the document
117        // We should probably ensure version 1.0 and UTF-8 encoding.
118        if self.at_start {
119            self.at_start = false;
120            let (start, end) = self.try_find("<?xml", "?>")?;
121            let block = core::str::from_utf8(&self.buffer.as_ref()[start..end])?;
122            let attrs = AttributeReader::from_block(block);
123            self.pos = end + 2;
124            return Ok(Event::Declaration { attrs });
125        };
126
127        if self.pos == self.end && self.remaining == 0 {
128            trace!("Pos = End");
129            return Ok(Event::EndOfFile);
130        }
131
132        if let Some(range) = self.self_closing.take() {
133            let block = &self.buffer.as_ref()[range].trim_ascii();
134            let name = core::str::from_utf8(block)?
135                .split_ascii_whitespace()
136                .next()
137                .ok_or(crate::Error::InvalidState)?;
138            return Ok(Event::EndElement { name });
139        }
140
141        let curr_end = match self.try_find_start("<") {
142            Ok(pos) => pos,
143            Err(crate::Error::Eof) => return Ok(Event::EndOfFile),
144            Err(e) => return Err(e),
145        };
146
147        let curr = self.buffer()[..curr_end].trim_ascii();
148        if !curr.is_empty() {
149            let block = self.buffer.as_ref()[self.pos..self.pos + curr_end].trim_ascii();
150            let content = core::str::from_utf8(block)?;
151            self.pos += curr_end;
152            return Ok(Event::Text { content });
153        }
154
155        self.pos += curr_end;
156        match self.ensure(3) {
157            Ok(()) => {}
158            Err(crate::Error::Eof) => {
159                return Ok(Event::EndOfFile);
160            }
161            Err(e) => return Err(e),
162        };
163
164        enum BlockType {
165            Cdata,
166            Comment,
167            Dtd,
168            PI,
169            EndElement,
170            StartElement,
171        }
172
173        let b = self.buffer();
174        let (ty, n_start, n_end) = match (b[1], b[2]) {
175            (b'!', b'[') => (BlockType::Cdata, "<![CDATA[", "]]>"),
176            (b'!', b'-') => (BlockType::Comment, "<!--", "-->"),
177            (b'!', _) => (BlockType::Dtd, "<!", ">"),
178            (b'?', _) => (BlockType::PI, "<?", "?>"),
179            (b'/', _) => (BlockType::EndElement, "</", ">"),
180            (_, _) => (BlockType::StartElement, "<", ">"),
181        };
182
183        let (start, end) = self.try_find(n_start, n_end)?;
184
185        let range = if matches!(ty, BlockType::StartElement) && self.buffer()[end - 1] == b'/' {
186            let range = self.pos + start..self.pos + end - 1;
187            self.self_closing = Some(range.clone());
188            range
189        } else {
190            self.pos + start..self.pos + end
191        };
192
193        let block = &self.buffer.as_ref()[range].trim_ascii();
194
195        let event = match ty {
196            BlockType::Cdata => Event::CDATA { data: block },
197            BlockType::Comment => Event::Comment {
198                content: core::str::from_utf8(block)?,
199            },
200            BlockType::Dtd => Event::Dtd {
201                content: core::str::from_utf8(block)?,
202            },
203            BlockType::PI => {
204                let (name, attrs) = Self::name_and_attrs(block)?;
205                Event::ProcessingInstruction { name, attrs }
206            }
207            BlockType::EndElement => Event::EndElement {
208                name: core::str::from_utf8(block)?,
209            },
210            BlockType::StartElement => {
211                let (name, attrs) = Self::name_and_attrs(block)?;
212                Event::StartElement { name, attrs }
213            }
214        };
215        self.pos += end + n_end.len();
216        Ok(event)
217    }
218    pub fn name_and_attrs(block: &[u8]) -> Result<(&str, AttributeReader<'_>)> {
219        let block = core::str::from_utf8(block)?;
220        let mut split = block.split_ascii_whitespace();
221        let name = split.next().unwrap_or("");
222        Ok((name, AttributeReader::from_split(split)))
223    }
224
225    /// Moves the unparsed characters starting from offset to the beginning
226    /// of the buffer, updates positional indices and reads more data.
227    fn advance(&mut self, offset: usize) -> Result<()> {
228        trace!(
229            "Advancing by {offset} bytes (remaining: {})",
230            self.remaining
231        );
232        if self.remaining == 0 {
233            return Err(crate::Error::Eof);
234        }
235        assert!(offset <= self.end);
236        assert!(offset <= self.buffer.as_ref().len());
237        trace!("Copying {} bytes to start of buffer", self.end - offset);
238        for i in offset..self.end {
239            self.buffer.as_mut()[i - offset] = self.buffer.as_ref()[i];
240        }
241        self.pos = 0;
242        self.end -= offset;
243        let data_start = self.buffer.as_ref().len() - offset;
244        let read_bytes = self
245            .reader
246            .read(&mut self.buffer.as_mut()[data_start..])
247            .map_err(|e| crate::Error::IoError(e.kind()))?;
248        self.end += read_bytes;
249        self.remaining -= read_bytes;
250        trace!(
251            "Read {read_bytes} bytes, new buffer len: {}, remaining: {}",
252            self.buffer().len(),
253            self.remaining
254        );
255        Ok(())
256    }
257
258    /// Ensure at least `size` bytes are available in the buffer, advancing if necessary.
259    fn ensure(&mut self, size: usize) -> Result<()> {
260        trace!("Ensuring {size} bytes (remaining: {})", self.remaining);
261        let available = self.buffer().len();
262        if available >= size {
263            return Ok(());
264        }
265        if available + self.remaining < size {
266            return Err(crate::Error::Eof);
267        }
268        self.advance(self.pos)
269    }
270
271    /// Tries to find start & end needles in the buffer.
272    /// If we find the start needle but not the end, we advance to have the start at 0 and try again - once.
273    /// If we find neither, we advance to the end of the buffer and try again - once.
274    fn try_find(&mut self, n_start: &str, n_end: &str) -> Result<(usize, usize)> {
275        trace!(
276            "Trying to find '{n_start}' and '{n_end}' (remaining: {})",
277            self.remaining
278        );
279        let n_start = n_start.as_bytes();
280        let n_end = n_end.as_bytes();
281        match find_span(self.buffer(), n_start, n_end) {
282            Some((start, Some(end))) => Ok((start, end)),
283            Some((start, None)) => {
284                self.advance(self.pos + start)?;
285                let Some(end) = memchr::memmem::find(self.buffer(), n_end) else {
286                    return Err(crate::Error::Eof);
287                };
288                Ok((0, end))
289            }
290            None => {
291                self.advance(self.buffer.as_ref().len())?;
292                let Some((start, Some(end))) = find_span(self.buffer(), n_start, n_end) else {
293                    return Err(crate::Error::Eof);
294                };
295                Ok((start, end))
296            }
297        }
298    }
299
300    /// Tries to find the start needle in the buffer.
301    /// If it is not found, we advance to the end of the buffer and try again - once.
302    fn try_find_start(&mut self, n_start: &str) -> Result<usize> {
303        trace!(
304            "Trying to find start '{n_start}' (pos: {}, remaining: {})",
305            self.pos, self.remaining
306        );
307        let n_start = n_start.as_bytes();
308        match memchr::memmem::find(self.buffer(), n_start) {
309            Some(pos) => Ok(pos),
310            None => {
311                self.advance(self.pos)?;
312                let Some(pos) = memchr::memmem::find(self.buffer(), n_start) else {
313                    trace!("Needle not found!");
314                    return Err(crate::Error::Eof);
315                };
316                Ok(pos)
317            }
318        }
319    }
320
321    fn buffer(&self) -> &[u8] {
322        &self.buffer.as_ref()[self.pos..self.end]
323    }
324}
325
326fn find_span(buffer: &[u8], start: &[u8], end: &[u8]) -> Option<(usize, Option<usize>)> {
327    let start = memchr::memmem::find(buffer, start)? + start.len();
328    let end = memchr::memmem::find(&buffer[start..], end).map(|pos| pos + start);
329    Some((start, end))
330}
331
#[cfg(test)]
#[rustfmt::skip]
mod tests {
    extern crate std;

    use crate::*;
    use super::*;

    // Filler text laid out as eight rows of 64 bytes each, i.e. exactly two
    // 256-byte buffer fills.
    const LOREM: &str = "\
        Lorem ipsum dolor sit amet, consetetur sadipscing elitr,seddiam \
        nonumy eirmod tempor invidunt ut labore et dolore magna aliquya \
        erat, sed diam voluptua. At vero eos et accusam et justo duo do \
        ores et ea rebum. Stet clita kasd gubergren, no sea takimata sa \
        ctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,\
        consetetur sadipscing elitr, sed diam nonumy eirmod tempor invid\
        unt ut labore et dolore magna aliquyam erat, sed diam voluptua. \
        At vero eos et accusam et justo duo dolores et ea rebum. Stet cl";

    // Advancing by a full buffer must expose the second half of the input.
    #[test]
    #[cfg(feature = "alloc")]
    fn test_window() {
        let data = LOREM.as_bytes();
        let mut source = data;
        let mut parser = Reader::new(&mut source, data.len(), 256).unwrap();
        let (first_half, second_half) = data.split_at(256);
        assert_eq!(parser.buffer(), first_half);
        parser.advance(256).unwrap();
        assert_eq!(parser.buffer(), second_half);
    }

    // `find_span` returns the range between the needles, excluding both.
    #[test]
    fn test_needle_range() {
        let xml = "\
            <root>\
                <child>Text</child>\
                <child>More text</child>\
            </root>";
        let data = xml.as_bytes();

        let (tag_start, tag_end) = match find_span(data, b"<", b">") {
            Some((from, Some(to))) => (from, to),
            _ => panic!("Failed to find span"),
        };
        assert_eq!(&xml[tag_start..tag_end], "root");

        let (text_start, text_end) = match find_span(data, b"<child>", b"</child>") {
            Some((from, Some(to))) => (from, to),
            _ => panic!("Failed to find span"),
        };
        assert_eq!(&xml[text_start..text_end], "Text");
    }

    // `try_find` must refill the buffer when the end needle lies beyond it.
    #[test]
    #[cfg(feature = "alloc")]
    fn test_find() {
        fn find_str<'a>(
            parser: &'a mut OwnedReader<&'_ [u8]>,
            n_start: &str,
            n_end: &str,
        ) -> Result<&'a str> {
            let (from, to) = parser.try_find(n_start, n_end)?;
            let bytes = &parser.buffer[from..to];
            let text = core::str::from_utf8(bytes)?;
            Ok(text)
        }

        let data = LOREM.as_bytes();
        let source = data;
        let mut parser = Reader::new(source, data.len(), 256).unwrap();

        // Both needles are inside the first buffer fill.
        let first = find_str(&mut parser, "Lorem ", " dolor").unwrap();
        assert_eq!(first, "ipsum");

        // The end needle straddles the buffer boundary and forces a refill.
        let second = find_str(&mut parser, "no sea takimata ", " ctus est").unwrap();
        assert_eq!(second, "sa");
        assert_eq!(parser.buffer(), &data[253..509]);
    }
}