strong_xml/
xml_reader.rs

1use std::borrow::Cow;
2use std::iter::{Iterator, Peekable};
3
4use xmlparser::ElementEnd;
5use xmlparser::Error;
6use xmlparser::Token;
7use xmlparser::Tokenizer;
8
9use crate::xml_unescape::xml_unescape;
10use crate::{XmlError, XmlResult};
11
/// Xml Reader
///
/// A thin wrapper around a peekable `xmlparser` token stream. It behaves
/// almost exactly like `xmlparser::Tokenizer::from("...").peekable()`, but
/// adds some helper functions for reading text, attributes and elements.
///
/// The lifetime `'a` is the lifetime of the source text the tokenizer was
/// created from.
pub struct XmlReader<'a> {
    // Token stream over the source text, with one token of lookahead.
    tokenizer: Peekable<Tokenizer<'a>>,
}
19
20impl<'a> XmlReader<'a> {
21    #[inline]
22    pub fn new(text: &'a str) -> XmlReader<'a> {
23        XmlReader {
24            tokenizer: Tokenizer::from(text).peekable(),
25        }
26    }
27
28    #[inline]
29    pub fn next(&mut self) -> Option<Result<Token<'a>, Error>> {
30        self.tokenizer.next()
31    }
32
33    #[inline]
34    pub fn peek(&mut self) -> Option<&Result<Token<'a>, Error>> {
35        self.tokenizer.peek()
36    }
37
38    #[inline]
39    pub fn read_text(&mut self, end_tag: &str) -> XmlResult<Cow<'a, str>> {
40        let mut res = None;
41
42        while let Some(token) = self.next() {
43            match token? {
44                Token::ElementEnd {
45                    end: ElementEnd::Open,
46                    ..
47                }
48                | Token::Attribute { .. } => (),
49                Token::Text { text } => {
50                    res = Some(xml_unescape(text.as_str())?);
51                }
52                Token::Cdata { text, .. } => {
53                    res = Some(Cow::Borrowed(text.as_str()));
54                }
55                Token::ElementEnd {
56                    end: ElementEnd::Close(_, _),
57                    span,
58                } => {
59                    let span = span.as_str(); // </tag>
60                    let tag = &span[2..span.len() - 1]; // remove `</` and `>`
61                    if end_tag == tag {
62                        break;
63                    } else {
64                        return Err(XmlError::TagMismatch {
65                            expected: end_tag.to_owned(),
66                            found: tag.to_owned(),
67                        });
68                    }
69                }
70                token => {
71                    return Err(XmlError::UnexpectedToken {
72                        token: format!("{:?}", token),
73                    });
74                }
75            }
76        }
77
78        Ok(res.unwrap_or_default())
79    }
80
81    #[inline]
82    pub fn read_till_element_start(&mut self, end_tag: &str) -> XmlResult<()> {
83        while let Some(token) = self.next() {
84            match token? {
85                Token::ElementStart { span, .. } => {
86                    let tag = &span.as_str()[1..];
87                    if end_tag == tag {
88                        break;
89                    } else {
90                        self.read_to_end(tag)?;
91                    }
92                }
93                Token::ElementEnd { .. }
94                | Token::Attribute { .. }
95                | Token::Text { .. }
96                | Token::Cdata { .. } => {
97                    return Err(XmlError::UnexpectedToken {
98                        token: format!("{:?}", token),
99                    });
100                }
101                _ => (),
102            }
103        }
104        Ok(())
105    }
106
107    #[inline]
108    pub fn find_attribute(&mut self) -> XmlResult<Option<(&'a str, Cow<'a, str>)>> {
109        if let Some(token) = self.tokenizer.peek() {
110            match token {
111                Ok(Token::Attribute { span, value, .. }) => {
112                    let value = value.as_str();
113                    let span = span.as_str(); // key="value"
114                    let key = &span[0..span.len() - value.len() - 3]; // remove `="`, value and `"`
115                    let value = Cow::Borrowed(value);
116                    self.next();
117                    return Ok(Some((key, value)));
118                }
119                Ok(Token::ElementEnd {
120                    end: ElementEnd::Open,
121                    ..
122                })
123                | Ok(Token::ElementEnd {
124                    end: ElementEnd::Empty,
125                    ..
126                }) => return Ok(None),
127                Ok(token) => {
128                    return Err(XmlError::UnexpectedToken {
129                        token: format!("{:?}", token),
130                    })
131                }
132                Err(_) => {
133                    // we have call .peek() above, and it's safe to use unwrap
134                    self.next().unwrap()?;
135                }
136            }
137        }
138
139        Err(XmlError::UnexpectedEof)
140    }
141
142    #[inline]
143    pub fn find_element_start(&mut self, end_tag: Option<&str>) -> XmlResult<Option<&'a str>> {
144        while let Some(token) = self.tokenizer.peek() {
145            match token {
146                Ok(Token::ElementStart { span, .. }) => {
147                    return Ok(Some(&span.as_str()[1..]));
148                }
149                Ok(Token::ElementEnd {
150                    end: ElementEnd::Close(_, _),
151                    span,
152                }) if end_tag.is_some() => {
153                    let end_tag = end_tag.unwrap();
154                    let span = span.as_str(); // </tag>
155                    let tag = &span[2..span.len() - 1]; // remove `</` and `>`
156                    if tag == end_tag {
157                        self.next();
158                        return Ok(None);
159                    } else {
160                        return Err(XmlError::TagMismatch {
161                            expected: end_tag.to_owned(),
162                            found: tag.to_owned(),
163                        });
164                    }
165                }
166                Ok(Token::ElementEnd { .. }) | Ok(Token::Attribute { .. }) => {
167                    return Err(XmlError::UnexpectedToken {
168                        token: format!("{:?}", token),
169                    })
170                }
171                _ => {
172                    // we have call .peek() above, and it's safe to use unwrap
173                    self.next().unwrap()?;
174                }
175            }
176        }
177
178        Err(XmlError::UnexpectedEof)
179    }
180
181    #[inline]
182    pub fn read_to_end(&mut self, end_tag: &str) -> XmlResult<()> {
183        while let Some(token) = self.next() {
184            match token? {
185                // if this element is emtpy, just return
186                Token::ElementEnd {
187                    end: ElementEnd::Empty,
188                    ..
189                } => return Ok(()),
190                Token::ElementEnd {
191                    end: ElementEnd::Open,
192                    ..
193                } => break,
194                Token::Attribute { .. } => (),
195                // there shouldn't have any token but Attribute between ElementStart and ElementEnd
196                token => {
197                    return Err(XmlError::UnexpectedToken {
198                        token: format!("{:?}", token),
199                    })
200                }
201            }
202        }
203
204        let mut depth = 1;
205
206        while let Some(token) = self.next() {
207            match token? {
208                Token::ElementStart { span, .. } if end_tag == &span.as_str()[1..] => {
209                    while let Some(token) = self.next() {
210                        match token? {
211                            Token::ElementEnd {
212                                end: ElementEnd::Empty,
213                                ..
214                            } => {
215                                if depth == 0 {
216                                    return Ok(());
217                                } else {
218                                    // don't advance depth in this case
219                                    break;
220                                }
221                            }
222                            Token::ElementEnd {
223                                end: ElementEnd::Open,
224                                ..
225                            } => {
226                                depth += 1;
227                                break;
228                            }
229                            Token::Attribute { .. } => (),
230                            // there shouldn't have any token but Attribute between ElementStart and ElementEnd
231                            token => {
232                                return Err(XmlError::UnexpectedToken {
233                                    token: format!("{:?}", token),
234                                });
235                            }
236                        }
237                    }
238                }
239                Token::ElementEnd {
240                    end: ElementEnd::Close(_, _),
241                    span,
242                } if end_tag == &span.as_str()[2..span.as_str().len() - 1] => {
243                    depth -= 1;
244                    if depth == 0 {
245                        return Ok(());
246                    }
247                }
248                _ => (),
249            }
250        }
251
252        Err(XmlError::UnexpectedEof)
253    }
254}
255
#[test]
fn read_text() -> XmlResult<()> {
    // (document, expected text content of `<parent>`)
    let cases = [
        // plain text, with and without attributes
        ("<parent></parent>", ""),
        ("<parent>text</parent>", "text"),
        ("<parent attr=\"value\">text</parent>", "text"),
        // escaped text is unescaped
        (
            "<parent attr=\"value\">&quot;&apos;&lt;&gt;&amp;</parent>",
            r#""'<>&"#,
        ),
        // CDATA sections are returned verbatim
        ("<parent><![CDATA[]]></parent>", ""),
        ("<parent><![CDATA[text]]></parent>", "text"),
        ("<parent attr=\"value\"><![CDATA[text]]></parent>", "text"),
        (
            "<parent attr=\"value\"><![CDATA[<foo></foo>]]></parent>",
            "<foo></foo>",
        ),
        (
            "<parent attr=\"value\"><![CDATA[&quot;&apos;&lt;&gt;&amp;]]></parent>",
            "&quot;&apos;&lt;&gt;&amp;",
        ),
    ];

    for &(xml, expected) in cases.iter() {
        let mut reader = XmlReader::new(xml);

        assert!(reader.next().is_some()); // "<parent"
        assert_eq!(reader.read_text("parent")?, expected);
        assert!(reader.next().is_none());
    }

    Ok(())
}
315
#[test]
fn read_till_element_start() -> XmlResult<()> {
    // (document, tokens to consume before the call, tokens left after the call)
    //
    // For the nested documents the tokens consumed up front are "<parent" and
    // ">"; the tokens left afterwards are "/>" of `<tag/>` and "</parent>".
    let cases = [
        ("<tag/>", 0, 1),
        ("<parent><skip/><tag/></parent>", 2, 2),
        ("<parent><skip></skip><tag/></parent>", 2, 2),
        ("<parent><skip><skip/></skip><tag/></parent>", 2, 2),
        ("<parent><skip><skip></skip></skip><tag/></parent>", 2, 2),
    ];

    for &(xml, before, after) in cases.iter() {
        let mut reader = XmlReader::new(xml);

        for _ in 0..before {
            assert!(reader.next().is_some());
        }

        reader.read_till_element_start("tag")?;

        for _ in 0..after {
            assert!(reader.next().is_some());
        }
        assert!(reader.next().is_none());
    }

    Ok(())
}
362
#[test]
fn read_to_end() -> XmlResult<()> {
    // Each document holds one `<child>` element (self-closing, empty, or with
    // a nested `<child>`) that must be skipped as a whole.
    let documents = [
        "<parent><child/></parent>",
        "<parent><child></child></parent>",
        "<parent><child><child/></child></parent>",
        "<parent><child><child></child></child></parent>",
    ];

    for &xml in documents.iter() {
        let mut reader = XmlReader::new(xml);

        // Consume "<parent", ">" and "<child".
        for _ in 0..3 {
            assert!(reader.next().is_some());
        }

        reader.read_to_end("child")?;

        assert!(reader.next().is_some()); // "</parent>"
        assert!(reader.next().is_none());
    }

    Ok(())
}