// arquery/document/mod.rs

1use std::{
2    collections::HashMap,
3    fs::File,
4    io::{BufReader, Read},
5    path::Path,
6};
7
8use xml::reader::{EventReader, XmlEvent};
9
10use super::{Element, SelectError};
11
/// The various errors that can happen when creating a document.
#[derive(Clone, Debug)]
pub enum DocumentError {
    /// The file could not be opened; carries the path that was requested.
    UnableToOpenFile(String),
    /// The XML input could not be parsed; carries the parser's message.
    ParseError(String),
}

impl std::fmt::Display for DocumentError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DocumentError::UnableToOpenFile(path) => write!(f, "unable to open file: {}", path),
            DocumentError::ParseError(message) => write!(f, "parse error: {}", message),
        }
    }
}

// Lets callers propagate `DocumentError` with `?` into `Box<dyn Error>`.
impl std::error::Error for DocumentError {}
18
19/// The DOM tree representation of the parsed document.
20#[derive(Clone, Debug)]
21pub struct Document {
22    root: Element,
23}
24
25impl Document {
26    /// Creates a new document from a byte stream.
27    pub fn new_from_xml_stream<R: Read>(stream: R) -> Result<Document, DocumentError> {
28        let event_reader = EventReader::new(stream);
29
30        let mut elements: Vec<Element> = Vec::new();
31        let mut next_node_index = 1;
32
33        for event in event_reader {
34            match event {
35                Ok(XmlEvent::StartElement {
36                    ref name,
37                    ref attributes,
38                    ..
39                }) => {
40                    let attr_map =
41                        attributes
42                            .iter()
43                            .fold(HashMap::new(), |mut hash_map, attribute| {
44                                hash_map.insert(
45                                    attribute.name.local_name.clone(),
46                                    attribute.value.clone(),
47                                );
48
49                                return hash_map;
50                            });
51
52                    elements.push(Element {
53                        node_index: next_node_index,
54                        children: None,
55                        tag_name: name.local_name.clone(),
56                        attr_map: attr_map,
57                        text: String::new(),
58                    });
59                    next_node_index = next_node_index + 1;
60                }
61
62                Ok(XmlEvent::EndElement { ref name, .. })
63                    if elements.last().unwrap().tag_name() == name.local_name =>
64                {
65                    let child_node = elements.pop().unwrap();
66
67                    if let Some(mut parent) = elements.pop() {
68                        if let Some(ref mut children) = parent.children {
69                            children.push(child_node);
70                        } else {
71                            parent.children = Some(vec![child_node]);
72                        }
73
74                        elements.push(parent);
75                    } else {
76                        return Ok(Document {
77                            root: Element {
78                                node_index: 0,
79                                tag_name: "[root]".to_string(),
80                                children: Some(vec![child_node]),
81                                attr_map: HashMap::new(),
82                                text: String::new(),
83                            },
84                        });
85                    }
86                }
87
88                Ok(XmlEvent::Characters(string)) => {
89                    elements.last_mut().unwrap().text.push_str(&string);
90                }
91
92                Ok(XmlEvent::Whitespace(string)) => {
93                    elements.last_mut().unwrap().text.push_str(&string);
94                }
95
96                Err(error) => {
97                    return Err(DocumentError::ParseError(error.to_string()));
98                }
99
100                Ok(_) => {}
101            }
102        }
103
104        panic!("Root element was not properly returned!");
105    }
106
107    /// Creates a new document from a string.
108    pub fn new_from_xml_string(string: &str) -> Result<Document, DocumentError> {
109        Document::new_from_xml_stream(string.as_bytes())
110    }
111
112    /// Creates a new document from a file.
113    pub fn new_from_xml_file(filename: &str) -> Result<Document, DocumentError> {
114        let path = Path::new(filename);
115
116        if let Ok(file) = File::open(path) {
117            let reader = BufReader::new(file);
118
119            Document::new_from_xml_stream(reader)
120        } else {
121            Err(DocumentError::UnableToOpenFile(
122                path.to_str().unwrap().to_string(),
123            ))
124        }
125    }
126
127    /// Returns the total number of elements in the document.
128    pub fn number_of_elements(&self) -> usize {
129        self.root.subtree_size() - 1
130    }
131
132    /// Searches the document for elements matching the given CSS selector.
133    pub fn select_all<'a>(
134        &'a self,
135        selector: &str,
136    ) -> Result<Box<dyn Iterator<Item = &'a Element> + 'a>, SelectError> {
137        self.root.select_all(selector)
138    }
139
140    /// Just like `select_all` but only returns the first match.
141    pub fn select<'a>(&'a self, selector: &str) -> Result<&'a Element, SelectError> {
142        self.root.select(selector)
143    }
144}
145
/// Parses a nested sample document and checks that the synthetic root has
/// index 0 and that a deep traversal visits strictly increasing node indices.
#[test]
fn it_assigns_node_indices_in_monotonically_increasing_order() {
    let xml = r#"
<?xml version="1.0" encoding="UTF-8"?>
<sample type="simple">
  This is some text
  <!-- This is a comment -->
  <title>Simple Sample</title>
  <note long="false">Some unrecognisable scribbling</note>

  <related>
    <!-- This is another comment -->
    <item id="1">
      <title>Another Sample</title>
      <ref>http://path.to.somewhere</ref>
    </item>

    <item id="2">
      <title>Other Sample</title>
      <ref>http://some.other.path</ref>
    </item>
  </related>

  <!-- div soup goodness -->
  <div></div>
  <div>
    <other>
      <div></div>
    </other>
    <div>
      <div></div>
      <div>
        <div></div>
        <div></div>
      </div>
    </div>
  </div>
</sample>
"#;

    let document = Document::new_from_xml_string(xml).unwrap();

    assert_eq!(document.root.node_index, 0);

    // Each visited element must carry a larger index than the one before it,
    // starting from the root's index of 0.
    let mut previous_index = 0;
    for child in document.root.children_deep_iter() {
        assert!(previous_index < child.node_index);
        previous_index = child.node_index;
    }
}