hard_xml/
xml_reader.rs

1use std::borrow::Cow;
2use std::iter::{Iterator, Peekable};
3
4use xmlparser::ElementEnd;
5use xmlparser::Error;
6use xmlparser::Token;
7use xmlparser::Tokenizer;
8
9use crate::xml_unescape::xml_unescape;
10use crate::{XmlError, XmlResult};
11
/// Xml Reader
///
/// It behaves almost exactly like `xmlparser::Tokenizer::from("...").peekable()`
/// but with some helper functions.
///
/// The lifetime `'a` is the lifetime of the XML text being tokenized; the
/// string slices handed out by the helper methods borrow from that text.
pub struct XmlReader<'a> {
    /// The underlying `xmlparser` token stream, wrapped in `Peekable` so the
    /// helpers can look at the upcoming token without consuming it.
    tokenizer: Peekable<Tokenizer<'a>>,
}
19
20impl<'a> XmlReader<'a> {
21    #[inline]
22    pub fn new(text: &'a str) -> XmlReader<'a> {
23        XmlReader {
24            tokenizer: Tokenizer::from(text).peekable(),
25        }
26    }
27
28    #[inline]
29    pub fn next(&mut self) -> Option<Result<Token<'a>, Error>> {
30        self.tokenizer.next()
31    }
32
33    #[inline]
34    pub fn peek(&mut self) -> Option<&Result<Token<'a>, Error>> {
35        self.tokenizer.peek()
36    }
37
38    #[inline]
39    pub fn read_text(&mut self, end_tag: &str) -> XmlResult<Cow<'a, str>> {
40        let mut res = Cow::Borrowed("");
41
42        while let Some(token) = self.next() {
43            match token? {
44                Token::ElementEnd {
45                    end: ElementEnd::Open,
46                    ..
47                }
48                | Token::Attribute { .. } => (),
49                Token::Text { text } => {
50                    let text = xml_unescape(text.as_str())?;
51                    if res.is_empty() {
52                        res = text;
53                    } else {
54                        res.to_mut().push_str(&text);
55                    }
56                }
57                Token::Cdata { text, .. } => {
58                    if res.is_empty() {
59                        res = Cow::Borrowed(text.as_str());
60                    } else {
61                        res.to_mut().push_str(&text);
62                    }
63                }
64                Token::ElementEnd {
65                    end: ElementEnd::Close(_, _),
66                    span,
67                } => {
68                    let span = span.as_str(); // </tag>
69                    let tag = &span[2..span.len() - 1]; // remove `</` and `>`
70                    if end_tag == tag {
71                        break;
72                    } else {
73                        return Err(XmlError::TagMismatch {
74                            expected: end_tag.to_owned(),
75                            found: tag.to_owned(),
76                        });
77                    }
78                }
79                Token::ElementEnd {
80                    end: ElementEnd::Empty,
81                    ..
82                } => {
83                    break;
84                }
85                token => {
86                    return Err(XmlError::UnexpectedToken {
87                        token: format!("{:?}", token),
88                    });
89                }
90            }
91        }
92
93        Ok(res)
94    }
95
96    #[inline]
97    pub fn read_till_element_start(&mut self, end_tag: &str) -> XmlResult<()> {
98        while let Some(token) = self.next() {
99            match token? {
100                Token::ElementStart { span, .. } => {
101                    let tag = &span.as_str()[1..];
102                    if end_tag == tag {
103                        break;
104                    } else {
105                        self.read_to_end(tag)?;
106                    }
107                }
108                Token::ElementEnd { .. }
109                | Token::Attribute { .. }
110                | Token::Text { .. }
111                | Token::Cdata { .. } => {
112                    return Err(XmlError::UnexpectedToken {
113                        token: format!("{:?}", token),
114                    });
115                }
116                _ => (),
117            }
118        }
119        Ok(())
120    }
121
122    #[inline]
123    pub fn find_attribute(&mut self) -> XmlResult<Option<(&'a str, Cow<'a, str>)>> {
124        if let Some(token) = self.tokenizer.peek() {
125            match token {
126                Ok(Token::Attribute { span, value, .. }) => {
127                    let value = value.as_str();
128                    let span = span.as_str(); // key="value"
129                    let key = &span[0..span.len() - value.len() - 3]; // remove `="`, value and `"`
130                    let value = xml_unescape(value)?;
131                    self.next();
132                    return Ok(Some((key, value)));
133                }
134                Ok(Token::ElementEnd {
135                    end: ElementEnd::Open,
136                    ..
137                })
138                | Ok(Token::ElementEnd {
139                    end: ElementEnd::Empty,
140                    ..
141                }) => return Ok(None),
142                Ok(token) => {
143                    return Err(XmlError::UnexpectedToken {
144                        token: format!("{:?}", token),
145                    })
146                }
147                Err(_) => {
148                    // we have call .peek() above, and it's safe to use unwrap
149                    self.next().unwrap()?;
150                }
151            }
152        }
153
154        Err(XmlError::UnexpectedEof)
155    }
156
157    #[inline]
158    pub fn find_element_start(&mut self, end_tag: Option<&str>) -> XmlResult<Option<&'a str>> {
159        while let Some(token) = self.tokenizer.peek() {
160            match token {
161                Ok(Token::ElementStart { span, .. }) => {
162                    return Ok(Some(&span.as_str()[1..]));
163                }
164                Ok(Token::ElementEnd {
165                    end: ElementEnd::Close(_, _),
166                    span,
167                }) if end_tag.is_some() => {
168                    let end_tag = end_tag.unwrap();
169                    let span = span.as_str(); // </tag>
170                    let tag = &span[2..span.len() - 1]; // remove `</` and `>`
171                    if tag == end_tag {
172                        self.next();
173                        return Ok(None);
174                    } else {
175                        return Err(XmlError::TagMismatch {
176                            expected: end_tag.to_owned(),
177                            found: tag.to_owned(),
178                        });
179                    }
180                }
181                Ok(Token::ElementEnd { .. }) | Ok(Token::Attribute { .. }) => {
182                    return Err(XmlError::UnexpectedToken {
183                        token: format!("{:?}", token),
184                    })
185                }
186                _ => {
187                    // we have call .peek() above, and it's safe to use unwrap
188                    self.next().unwrap()?;
189                }
190            }
191        }
192
193        Err(XmlError::UnexpectedEof)
194    }
195
196    #[inline]
197    pub fn read_to_end(&mut self, end_tag: &str) -> XmlResult<()> {
198        while let Some(token) = self.next() {
199            match token? {
200                // if this element is emtpy, just return
201                Token::ElementEnd {
202                    end: ElementEnd::Empty,
203                    ..
204                } => return Ok(()),
205                Token::ElementEnd {
206                    end: ElementEnd::Open,
207                    ..
208                } => break,
209                Token::Attribute { .. } => (),
210                // there shouldn't have any token but Attribute between ElementStart and ElementEnd
211                token => {
212                    return Err(XmlError::UnexpectedToken {
213                        token: format!("{:?}", token),
214                    })
215                }
216            }
217        }
218
219        let mut depth = 1;
220
221        while let Some(token) = self.next() {
222            match token? {
223                Token::ElementStart { span, .. } if end_tag == &span.as_str()[1..] => {
224                    while let Some(token) = self.next() {
225                        match token? {
226                            Token::ElementEnd {
227                                end: ElementEnd::Empty,
228                                ..
229                            } => {
230                                if depth == 0 {
231                                    return Ok(());
232                                } else {
233                                    // don't advance depth in this case
234                                    break;
235                                }
236                            }
237                            Token::ElementEnd {
238                                end: ElementEnd::Open,
239                                ..
240                            } => {
241                                depth += 1;
242                                break;
243                            }
244                            Token::Attribute { .. } => (),
245                            // there shouldn't have any token but Attribute between ElementStart and ElementEnd
246                            token => {
247                                return Err(XmlError::UnexpectedToken {
248                                    token: format!("{:?}", token),
249                                });
250                            }
251                        }
252                    }
253                }
254                Token::ElementEnd {
255                    end: ElementEnd::Close(_, _),
256                    span,
257                } if end_tag == &span.as_str()[2..span.as_str().len() - 1] => {
258                    depth -= 1;
259                    if depth == 0 {
260                        return Ok(());
261                    }
262                }
263                _ => (),
264            }
265        }
266
267        Err(XmlError::UnexpectedEof)
268    }
269}
270
/// Checks `XmlReader::read_text` against plain text, escaped text, CDATA and
/// mixed text/CDATA content.
#[test]
fn read_text() -> XmlResult<()> {
    // (document, text expected inside `<parent>`)
    let cases: &[(&str, &str)] = &[
        ("<parent></parent>", ""),
        ("<parent>text</parent>", "text"),
        ("<parent attr=\"value\">text</parent>", "text"),
        (
            "<parent attr=\"value\">&quot;&apos;&lt;&gt;&amp;</parent>",
            r#""'<>&"#,
        ),
        ("<parent><![CDATA[]]></parent>", ""),
        ("<parent><![CDATA[text]]></parent>", "text"),
        ("<parent attr=\"value\"><![CDATA[text]]></parent>", "text"),
        (
            "<parent attr=\"value\"><![CDATA[<foo></foo>]]></parent>",
            "<foo></foo>",
        ),
        (
            "<parent attr=\"value\"><![CDATA[&quot;&apos;&lt;&gt;&amp;]]></parent>",
            "&quot;&apos;&lt;&gt;&amp;",
        ),
        ("<parent>\n  text\n  \n</parent>", "\n  text\n  \n"),
        (
            "<parent>\n  <![CDATA[text]]>\n  \n</parent>",
            "\n  text\n  \n",
        ),
        (
            "<parent>\n  <![CDATA[text1]]>\n  <![CDATA[text2]]>\n  \n</parent>",
            "\n  text1\n  text2\n  \n",
        ),
    ];

    for &(xml, expected) in cases {
        let mut reader = XmlReader::new(xml);

        assert!(reader.next().is_some()); // "<parent"
        assert_eq!(reader.read_text("parent")?, expected);
        assert!(reader.next().is_none());
    }

    Ok(())
}
345
/// Checks that `XmlReader::read_till_element_start` stops right after `<tag`,
/// skipping any preceding `<skip>` siblings.
#[test]
fn read_till_element_start() -> XmlResult<()> {
    // The target element is the document root.
    let mut reader = XmlReader::new("<tag/>");

    reader.read_till_element_start("tag")?;
    assert!(reader.next().is_some()); // "/>"
    assert!(reader.next().is_none());

    // The target element follows a skipped sibling inside `<parent>`.
    let nested = [
        "<parent><skip/><tag/></parent>",
        "<parent><skip></skip><tag/></parent>",
        "<parent><skip><skip/></skip><tag/></parent>",
        "<parent><skip><skip></skip></skip><tag/></parent>",
    ];

    for xml in nested.iter() {
        let mut reader = XmlReader::new(xml);

        assert!(reader.next().is_some()); // "<parent"
        assert!(reader.next().is_some()); // ">"
        reader.read_till_element_start("tag")?;
        assert!(reader.next().is_some()); // "/>"
        assert!(reader.next().is_some()); // "</parent>"
        assert!(reader.next().is_none());
    }

    Ok(())
}
392
/// Checks that `XmlReader::read_to_end` skips a whole `<child>` element,
/// whether it is empty, self-closing or contains nested `<child>` elements.
#[test]
fn read_to_end() -> XmlResult<()> {
    let documents = [
        "<parent><child/></parent>",
        "<parent><child></child></parent>",
        "<parent><child><child/></child></parent>",
        "<parent><child><child></child></child></parent>",
    ];

    for xml in documents.iter() {
        let mut reader = XmlReader::new(xml);

        assert!(reader.next().is_some()); // "<parent"
        assert!(reader.next().is_some()); // ">"
        assert!(reader.next().is_some()); // "<child"
        reader.read_to_end("child")?;
        assert!(reader.next().is_some()); // "</parent>"
        assert!(reader.next().is_none());
    }

    Ok(())
}