1use std::collections::HashMap;
7use std::fs::File;
8use std::io::{BufReader, Read};
9use std::path::Path;
10
11use quick_xml::escape::unescape;
12use quick_xml::events::{BytesStart, Event};
13use quick_xml::Reader;
14
15use super::NodeFactory;
16use crate::error::{Error, Result};
17use crate::node::{NodeInner, NodeRef, XmlComment, XmlContent, XmlElement, XmlText};
18
19pub struct XmlParser<F: NodeFactory> {
21 factory: F,
22}
23
24impl<F: NodeFactory> XmlParser<F> {
25 pub fn new(factory: F) -> Self {
27 XmlParser { factory }
28 }
29
30 pub fn parse_str(&self, xml: &str) -> Result<NodeRef> {
32 let mut reader = Reader::from_str(xml);
33 reader.config_mut().trim_text_start = false;
35 reader.config_mut().trim_text_end = false;
36 self.parse_reader(&mut reader)
37 }
38
39 pub fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<NodeRef> {
41 let file = File::open(path)?;
42 let buf_reader = BufReader::new(file);
43 let mut reader = Reader::from_reader(buf_reader);
44 reader.config_mut().trim_text_start = false;
46 reader.config_mut().trim_text_end = false;
47 self.parse_reader(&mut reader)
48 }
49
50 fn parse_reader<R: Read + std::io::BufRead>(&self, reader: &mut Reader<R>) -> Result<NodeRef> {
52 let root = self.factory.make_node(XmlContent::Element(XmlElement::new(
54 "$ROOT$".to_string(),
55 HashMap::new(),
56 )));
57
58 let mut node_stack: Vec<NodeRef> = vec![root.clone()];
59 let mut current_text: Option<String> = None;
60 let mut buf = Vec::new();
61
62 loop {
63 match reader.read_event_into(&mut buf) {
64 Ok(Event::Start(ref e)) => {
65 if let Some(text) = current_text.take() {
67 let trimmed = text.trim();
68 if !trimmed.is_empty() {
69 let text_node = self
70 .factory
71 .make_node(XmlContent::Text(XmlText::new(trimmed)));
72 if let Some(parent) = node_stack.last() {
73 NodeInner::add_child_to_ref(parent, text_node);
74 }
75 }
76 }
77
78 let element = self.parse_element(e, reader)?;
80 let node = self.factory.make_node(XmlContent::Element(element));
81
82 if let Some(parent) = node_stack.last() {
84 NodeInner::add_child_to_ref(parent, node.clone());
85 }
86 node_stack.push(node);
87 }
88 Ok(Event::End(_)) => {
89 if let Some(text) = current_text.take() {
91 let trimmed = text.trim();
92 if !trimmed.is_empty() {
93 let text_node = self
94 .factory
95 .make_node(XmlContent::Text(XmlText::new(trimmed)));
96 if let Some(parent) = node_stack.last() {
97 NodeInner::add_child_to_ref(parent, text_node);
98 }
99 }
100 }
101
102 node_stack.pop();
104 }
105 Ok(Event::Empty(ref e)) => {
106 if let Some(text) = current_text.take() {
108 let trimmed = text.trim();
109 if !trimmed.is_empty() {
110 let text_node = self
111 .factory
112 .make_node(XmlContent::Text(XmlText::new(trimmed)));
113 if let Some(parent) = node_stack.last() {
114 NodeInner::add_child_to_ref(parent, text_node);
115 }
116 }
117 }
118
119 let element = self.parse_element(e, reader)?;
120 let node = self.factory.make_node(XmlContent::Element(element));
121
122 if let Some(parent) = node_stack.last() {
123 NodeInner::add_child_to_ref(parent, node);
124 }
125 }
126 Ok(Event::Text(e)) => {
127 let raw =
129 std::str::from_utf8(e.as_ref()).map_err(|e| Error::Parse(e.to_string()))?;
130 let text = unescape(raw).map_err(|e| Error::Parse(e.to_string()))?;
131 let normalized = self.normalize_whitespace(&text, current_text.as_deref());
132 if let Some(normalized) = normalized {
133 current_text = Some(match current_text {
134 Some(mut existing) => {
135 existing.push_str(&normalized);
136 existing
137 }
138 None => normalized,
139 });
140 }
141 }
142 Ok(Event::CData(ref e)) => {
143 let text = String::from_utf8_lossy(e.as_ref());
145 let normalized = self.normalize_whitespace(&text, current_text.as_deref());
146 if let Some(normalized) = normalized {
147 current_text = Some(match current_text {
148 Some(mut existing) => {
149 existing.push_str(&normalized);
150 existing
151 }
152 None => normalized,
153 });
154 }
155 }
156 Ok(Event::Eof) => break,
157 Ok(Event::Comment(ref e)) => {
158 let comment_text = String::from_utf8_lossy(e.as_ref()).to_string();
160 let comment_node = self
161 .factory
162 .make_node(XmlContent::Comment(XmlComment::new(&comment_text)));
163 if let Some(parent) = node_stack.last() {
164 NodeInner::add_child_to_ref(parent, comment_node);
165 }
166 }
167 Ok(Event::Decl(_)) | Ok(Event::PI(_)) => {
168 }
170 Ok(Event::DocType(_)) => {
171 }
173 Ok(Event::GeneralRef(_)) => {
174 }
176 Err(e) => return Err(Error::Parse(format!("XML parse error: {}", e))),
177 }
178 buf.clear();
179 }
180
181 Ok(root)
182 }
183
184 fn parse_element<R: Read + std::io::BufRead>(
186 &self,
187 e: &BytesStart,
188 reader: &Reader<R>,
189 ) -> Result<XmlElement> {
190 let name = reader
191 .decoder()
192 .decode(e.name().as_ref())
193 .map_err(|e| Error::Parse(e.to_string()))?
194 .to_string();
195
196 let mut attributes = HashMap::new();
197 for attr_result in e.attributes() {
198 let attr = attr_result.map_err(|e| Error::Parse(format!("Attribute error: {}", e)))?;
199 let key = reader
200 .decoder()
201 .decode(attr.key.as_ref())
202 .map_err(|e| Error::Parse(e.to_string()))?
203 .to_string();
204 let value = attr
205 .unescape_value()
206 .map_err(|e| Error::Parse(e.to_string()))?
207 .to_string();
208 attributes.insert(key, value);
209 }
210
211 Ok(XmlElement::new(name, attributes))
212 }
213
214 fn normalize_whitespace(&self, text: &str, previous: Option<&str>) -> Option<String> {
221 let last_is_ws = previous.is_none_or(|p| p.ends_with(' '));
222 let mut last_was_ws = last_is_ws;
223 let mut has_non_ws = false;
224 let mut result = String::new();
225
226 for c in text.chars() {
227 if c.is_whitespace() {
228 if !last_was_ws {
229 result.push(' ');
230 last_was_ws = true;
231 }
232 } else {
234 result.push(c);
235 last_was_ws = false;
236 has_non_ws = true;
237 }
238 }
239
240 if has_non_ws {
241 Some(result)
242 } else {
243 None
244 }
245 }
246}
247
248pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<NodeRef> {
250 let parser = XmlParser::new(super::BaseNodeFactory);
251 parser.parse_file(path)
252}
253
254pub fn parse_str(xml: &str) -> Result<NodeRef> {
256 let parser = XmlParser::new(super::BaseNodeFactory);
257 parser.parse_str(xml)
258}
259
#[cfg(test)]
mod tests {
    use super::*;
    use crate::xml::BaseNodeFactory;

    /// Parses `xml` with a `BaseNodeFactory` parser and returns the synthetic root node.
    fn parse(xml: &str) -> NodeRef {
        XmlParser::new(BaseNodeFactory).parse_str(xml).unwrap()
    }

    /// Returns the child of `parent` at `index`, panicking when it does not exist.
    fn child_at(parent: &NodeRef, index: usize) -> NodeRef {
        parent.borrow().children()[index].clone()
    }

    /// Returns the qualified name of `node`, panicking unless it is an element.
    fn element_name(node: &NodeRef) -> String {
        match node.borrow().content() {
            Some(XmlContent::Element(e)) => e.qname().to_string(),
            _ => panic!("Expected element"),
        }
    }

    /// Returns the characters of `node` as a string, panicking unless it is a text node.
    fn text_of(node: &NodeRef) -> String {
        match node.borrow().content() {
            Some(XmlContent::Text(t)) => t.text().iter().collect(),
            _ => panic!("Expected text node"),
        }
    }

    #[test]
    fn test_parse_simple_xml() {
        let document = parse(r#"<root><child>text</child></root>"#);

        // The synthetic root wraps exactly one document element.
        assert_eq!(document.borrow().child_count(), 1);
        assert_eq!(element_name(&document), "$ROOT$");
        assert_eq!(element_name(&child_at(&document, 0)), "root");
    }

    #[test]
    fn test_parse_with_attributes() {
        let document = parse(r#"<root id="foo" class="bar">content</root>"#);
        let element = child_at(&document, 0);
        let element_ref = element.borrow();

        let Some(XmlContent::Element(e)) = element_ref.content() else {
            panic!("Expected element");
        };
        assert_eq!(e.qname(), "root");
        assert_eq!(e.attributes().get("id"), Some(&"foo".to_string()));
        assert_eq!(e.attributes().get("class"), Some(&"bar".to_string()));
    }

    #[test]
    fn test_whitespace_normalization() {
        let document = parse(r#"<root> hello world </root>"#);
        let element = child_at(&document, 0);

        assert_eq!(element.borrow().child_count(), 1);
        assert_eq!(text_of(&child_at(&element, 0)), "hello world");
    }

    #[test]
    fn test_empty_element() {
        let document = parse(r#"<root><empty /></root>"#);
        let element = child_at(&document, 0);

        assert_eq!(element.borrow().child_count(), 1);
        let empty = child_at(&element, 0);
        assert_eq!(element_name(&empty), "empty");
        assert_eq!(empty.borrow().child_count(), 0);
    }

    #[test]
    fn test_nested_elements() {
        let document = parse(r#"<a><b><c>deep</c></b></a>"#);

        // Four descents through first children: $ROOT$ -> a -> b -> c -> text.
        let innermost = (0..4).fold(document.clone(), |node, _| child_at(&node, 0));
        assert_eq!(text_of(&innermost), "deep");
    }
}