soupy/parser/xml/
mod.rs

1use std::{
2    collections::BTreeMap,
3    io::Read,
4    marker::PhantomData,
5};
6
7use xmltree::Namespace;
8
9use crate::{
10    parser::Parser,
11    Node,
12};
13
/// The default parser for XML input.
///
/// Parsing returns an error when the XML is malformed.
///
/// `R` names the reader type that is parsed. No reader is stored in the
/// struct; the parameter is only recorded through `PhantomData`.
#[derive(Debug, Clone)]
pub struct XMLParser<R> {
    // Zero-sized marker tying the struct to its reader type.
    _marker: PhantomData<R>,
}
21
22impl<R> Parser for XMLParser<R>
23where
24    R: Read,
25{
26    type Input = R;
27    type Node = XMLNode;
28    type Error = xmltree::ParseError;
29
30    fn parse(reader: R) -> Result<Vec<Self::Node>, Self::Error> {
31        Ok(xmltree::Element::parse_all(reader)?
32            .into_iter()
33            .map(Into::into)
34            .collect())
35    }
36}
37
38/// Represents an XML element
39#[derive(Debug, Default, Clone, PartialEq, Eq)]
40pub struct XMLElement {
41    /// This elements prefix, if any
42    pub prefix: Option<String>,
43
44    /// This elements namespace, if any
45    pub namespace: Option<String>,
46
47    /// The full list of namespaces, if any
48    pub namespaces: Option<Namespace>,
49
50    /// The name of the Element
51    pub name: String,
52
53    /// The Element attributes
54    pub attributes: BTreeMap<String, String>,
55
56    /// Children
57    pub children: Vec<XMLNode>,
58}
59
60impl From<xmltree::Element> for XMLElement {
61    fn from(value: xmltree::Element) -> Self {
62        Self {
63            prefix: value.prefix,
64            namespace: value.namespace,
65            namespaces: value.namespaces,
66            name: value.name,
67            attributes: value.attributes.into_iter().collect(),
68            children: value.children.into_iter().map(Into::into).collect(),
69        }
70    }
71}
72
73/// Represents an XML node
74#[derive(Debug, Clone, PartialEq, Eq)]
75pub enum XMLNode {
76    /// XML element which can contain children nodes
77    Element(XMLElement),
78
79    /// Comment
80    Comment(String),
81
82    /// CDATA
83    CData(String),
84
85    /// Text
86    Text(String),
87
88    /// Processing Instruction
89    ProcessingInstruction(String, Option<String>),
90}
91
92impl From<xmltree::XMLNode> for XMLNode {
93    fn from(value: xmltree::XMLNode) -> Self {
94        match value {
95            xmltree::XMLNode::Element(e) => XMLNode::Element(e.into()),
96            xmltree::XMLNode::Comment(c) => XMLNode::Comment(c),
97            xmltree::XMLNode::CData(d) => XMLNode::CData(d),
98            xmltree::XMLNode::Text(t) => XMLNode::Text(t),
99            xmltree::XMLNode::ProcessingInstruction(a, b) => XMLNode::ProcessingInstruction(a, b),
100        }
101    }
102}
103
104impl Node for XMLNode {
105    type Text = String;
106
107    fn name(&self) -> Option<&String> {
108        match self {
109            XMLNode::Element(e) => Some(&e.name),
110            _ => None,
111        }
112    }
113
114    fn text(&self) -> Option<&String> {
115        match self {
116            XMLNode::Text(t) => Some(t),
117            _ => None,
118        }
119    }
120
121    fn attrs(&self) -> Option<&BTreeMap<String, String>> {
122        match self {
123            XMLNode::Element(e) => Some(&e.attributes),
124            _ => None,
125        }
126    }
127
128    fn children(&self) -> &[Self] {
129        if let XMLNode::Element(e) = &self {
130            e.children.as_slice()
131        } else {
132            &[]
133        }
134    }
135}
136
137impl XMLNode {
138    /// Iterate over direct children
139    pub fn iter(&self) -> std::slice::Iter<Self> {
140        self.children().iter()
141    }
142}
143
144impl<'a> IntoIterator for &'a XMLNode {
145    type Item = &'a XMLNode;
146    type IntoIter = std::slice::Iter<'a, XMLNode>;
147
148    fn into_iter(self) -> Self::IntoIter {
149        self.iter()
150    }
151}
152
#[cfg(test)]
mod tests {
    use std::ops::Deref;

    use super::*;
    use crate::*;

    /// Fixture document shared by every test in this module.
    const HELLO: &str = r#"<?xml version="1.0" encoding="utf-8"?>
<root>
    <simple>Here's some text</simple>
    <complex id="hello">
        <nested>Nested text!</nested>
        <example>More text</example>

        <tree depth="1">
            <tree depth="2">
                <tree depth="3">Tree text</tree>
            </tree>
        </tree>
    </complex>

    <b>
        <a>Inner text</a>
    </b>

    <a>Outer text</a>
</root>"#;

    /// Builds the node expected for `<name>text</name>`: an element with
    /// default (empty) metadata whose only child is a single text node.
    fn text_element(name: &str, text: &str) -> XMLNode {
        XMLNode::Element(XMLElement {
            name: name.into(),
            children: vec![XMLNode::Text(text.into())],
            ..Default::default()
        })
    }

    #[test]
    fn test_text() {
        let soup = Soup::xml(HELLO.as_bytes()).expect("Failed to parse XML");

        let example_tag = soup
            .tag("example")
            .first()
            .expect("Could not find 'example' tag");

        // TODO: Fix borrow lifetime issue here
        let text_node = example_tag
            .children()
            .first()
            .expect("Could not find 'example' child node");

        assert_eq!(text_node.text(), Some(&"More text".into()));

        let root_tag = soup.tag("root").first().expect("Could not find 'root' tag");

        // The combined text of the whole document, one line per text node.
        assert_eq!(
            root_tag.all_text(),
            "Here's some text\nNested text!\nMore text\nTree text\nInner text\nOuter text"
        );
    }

    #[test]
    fn test_tree_iter() {
        let soup = Soup::xml(HELLO.as_bytes()).expect("Failed to parse XML");

        let complex = soup
            .tag("complex")
            .first()
            .expect("Could not find 'complex' tag")
            .deref()
            .clone();

        let mut walk = complex.descendants();

        // The first node yielded is the `complex` element itself.
        assert_eq!(walk.next().unwrap().name(), Some(&"complex".into()));

        // After that, each element is followed by its own text node before
        // the next sibling appears.
        let expected = [
            text_element("nested", "Nested text!"),
            XMLNode::Text("Nested text!".into()),
            text_element("example", "More text"),
            XMLNode::Text("More text".into()),
        ];

        for want in &expected {
            assert_eq!(walk.next().unwrap(), want);
        }
    }

    #[test]
    fn test_direct_iter() {
        let soup = Soup::xml(HELLO.as_bytes()).expect("Failed to parse XML");

        let complex = soup
            .tag("complex")
            .first()
            .expect("Could not find 'complex' tag")
            .deref()
            .clone();

        let mut direct = complex.into_iter();

        // Only direct children are yielded: no text node appears between the
        // two elements.
        let expected = [
            text_element("nested", "Nested text!"),
            text_element("example", "More text"),
        ];

        for want in &expected {
            assert_eq!(direct.next().unwrap(), want);
        }
    }

    #[test]
    fn test_iter_order() {
        let soup = Soup::xml(HELLO.as_bytes()).expect("Failed to parse XML");

        let root_query = soup
            .tag("root")
            .first()
            .expect("Failed to find 'root' tag")
            .query();

        // By default, the data is searched recursively, depth-first.
        assert_eq!(
            root_query.tag("a").first().map(|tag| tag.all_text()),
            Some("Inner text".into())
        );

        // Strict queries only match direct children, no recursion.
        assert_eq!(
            root_query.strict().tag("a").first().map(|tag| tag.all_text()),
            Some("Outer text".into())
        );
    }
}