1use std::collections::HashMap;
7use std::fs::File;
8use std::io::{BufReader, Read};
9use std::path::Path;
10
11use quick_xml::escape::unescape;
12use quick_xml::events::{BytesStart, Event};
13use quick_xml::Reader;
14
15use super::NodeFactory;
16use crate::error::{Error, Result};
17use crate::node::{
18 is_xmlns_attr, split_qname, ExpandedName, NamespaceContext, NodeInner, NodeRef, XmlComment,
19 XmlContent, XmlElement, XmlProcessingInstruction, XmlText,
20};
21
/// How character data inside an element is treated while the tree is built.
///
/// The mode of an element is taken from its `xml:space` attribute when one is
/// present and is otherwise inherited from the enclosing element.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
enum WhitespaceMode {
    /// Collapse runs of whitespace to a single space and trim the text before
    /// it is stored. This is the mode the parser starts in.
    #[default]
    Normalize,
    /// Store text exactly as it appears in the document.
    Preserve,
}
31
32pub struct XmlParser<F: NodeFactory> {
34 factory: F,
35}
36
37impl<F: NodeFactory> XmlParser<F> {
38 pub fn new(factory: F) -> Self {
40 XmlParser { factory }
41 }
42
43 pub fn parse_str(&self, xml: &str) -> Result<NodeRef> {
45 let mut reader = Reader::from_str(xml);
46 reader.config_mut().trim_text_start = false;
48 reader.config_mut().trim_text_end = false;
49 self.parse_reader(&mut reader)
50 }
51
52 pub fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<NodeRef> {
54 let file = File::open(path)?;
55 let buf_reader = BufReader::new(file);
56 let mut reader = Reader::from_reader(buf_reader);
57 reader.config_mut().trim_text_start = false;
59 reader.config_mut().trim_text_end = false;
60 self.parse_reader(&mut reader)
61 }
62
63 fn parse_reader<R: Read + std::io::BufRead>(&self, reader: &mut Reader<R>) -> Result<NodeRef> {
65 let root = self.factory.make_node(XmlContent::Element(XmlElement::new(
67 "$ROOT$".to_string(),
68 HashMap::new(),
69 )));
70
71 let mut node_stack: Vec<NodeRef> = vec![root.clone()];
72 let mut ws_mode_stack: Vec<WhitespaceMode> = vec![WhitespaceMode::Normalize];
73 let mut ns_context = NamespaceContext::new();
74 let mut current_text: Option<String> = None;
75 let mut buf = Vec::new();
76
77 loop {
78 match reader.read_event_into(&mut buf) {
79 Ok(Event::Start(ref e)) => {
80 let current_ws_mode =
82 *ws_mode_stack.last().unwrap_or(&WhitespaceMode::Normalize);
83
84 if let Some(text) = current_text.take() {
86 let text_to_store = if current_ws_mode == WhitespaceMode::Preserve {
87 text } else {
89 text.trim().to_string() };
91 if !text_to_store.is_empty() || current_ws_mode == WhitespaceMode::Preserve
92 {
93 let text_node = self
94 .factory
95 .make_node(XmlContent::Text(XmlText::new(&text_to_store)));
96 if let Some(parent) = node_stack.last() {
97 NodeInner::add_child_to_ref(parent, text_node);
98 }
99 }
100 }
101
102 ns_context.push_scope();
104
105 let (element, ws_mode_override) =
107 self.parse_element_with_ns(e, reader, &mut ns_context)?;
108 let node = self.factory.make_node(XmlContent::Element(element));
109
110 let new_ws_mode = ws_mode_override.unwrap_or(current_ws_mode);
112 ws_mode_stack.push(new_ws_mode);
113
114 if let Some(parent) = node_stack.last() {
116 NodeInner::add_child_to_ref(parent, node.clone());
117 }
118 node_stack.push(node);
119 }
120 Ok(Event::End(_)) => {
121 let current_ws_mode =
122 *ws_mode_stack.last().unwrap_or(&WhitespaceMode::Normalize);
123
124 if let Some(text) = current_text.take() {
126 let text_to_store = if current_ws_mode == WhitespaceMode::Preserve {
127 text
128 } else {
129 text.trim().to_string()
130 };
131 if !text_to_store.is_empty() || current_ws_mode == WhitespaceMode::Preserve
132 {
133 let text_node = self
134 .factory
135 .make_node(XmlContent::Text(XmlText::new(&text_to_store)));
136 if let Some(parent) = node_stack.last() {
137 NodeInner::add_child_to_ref(parent, text_node);
138 }
139 }
140 }
141
142 node_stack.pop();
144 ws_mode_stack.pop();
145 ns_context.pop_scope();
146 }
147 Ok(Event::Empty(ref e)) => {
148 let current_ws_mode =
150 *ws_mode_stack.last().unwrap_or(&WhitespaceMode::Normalize);
151
152 if let Some(text) = current_text.take() {
154 let text_to_store = if current_ws_mode == WhitespaceMode::Preserve {
155 text
156 } else {
157 text.trim().to_string()
158 };
159 if !text_to_store.is_empty() {
160 let text_node = self
161 .factory
162 .make_node(XmlContent::Text(XmlText::new(&text_to_store)));
163 if let Some(parent) = node_stack.last() {
164 NodeInner::add_child_to_ref(parent, text_node);
165 }
166 }
167 }
168
169 ns_context.push_scope();
171 let (element, _ws_mode_override) =
172 self.parse_element_with_ns(e, reader, &mut ns_context)?;
173 ns_context.pop_scope();
174 let node = self.factory.make_node(XmlContent::Element(element));
175
176 if let Some(parent) = node_stack.last() {
177 NodeInner::add_child_to_ref(parent, node);
178 }
179 }
180 Ok(Event::Text(e)) => {
181 let current_ws_mode =
182 *ws_mode_stack.last().unwrap_or(&WhitespaceMode::Normalize);
183 let raw =
184 std::str::from_utf8(e.as_ref()).map_err(|e| Error::Parse(e.to_string()))?;
185 let text = unescape(raw).map_err(|e| Error::Parse(e.to_string()))?;
186
187 if current_ws_mode == WhitespaceMode::Preserve {
188 current_text = Some(match current_text {
190 Some(mut existing) => {
191 existing.push_str(&text);
192 existing
193 }
194 None => text.to_string(),
195 });
196 } else {
197 let normalized = self.normalize_whitespace(&text, current_text.as_deref());
199 if let Some(normalized) = normalized {
200 current_text = Some(match current_text {
201 Some(mut existing) => {
202 existing.push_str(&normalized);
203 existing
204 }
205 None => normalized,
206 });
207 }
208 }
209 }
210 Ok(Event::CData(ref e)) => {
211 let text = String::from_utf8_lossy(e.as_ref());
213 current_text = Some(match current_text {
214 Some(mut existing) => {
215 existing.push_str(&text);
216 existing
217 }
218 None => text.to_string(),
219 });
220 }
221 Ok(Event::Eof) => break,
222 Ok(Event::Comment(ref e)) => {
223 let current_ws_mode =
224 *ws_mode_stack.last().unwrap_or(&WhitespaceMode::Normalize);
225
226 if let Some(text) = current_text.take() {
228 let text_to_store = if current_ws_mode == WhitespaceMode::Preserve {
229 text
230 } else {
231 text.trim().to_string()
232 };
233 if !text_to_store.is_empty() {
234 let text_node = self
235 .factory
236 .make_node(XmlContent::Text(XmlText::new(&text_to_store)));
237 if let Some(parent) = node_stack.last() {
238 NodeInner::add_child_to_ref(parent, text_node);
239 }
240 }
241 }
242
243 let comment_text = String::from_utf8_lossy(e.as_ref()).to_string();
245 let comment_node = self
246 .factory
247 .make_node(XmlContent::Comment(XmlComment::new(&comment_text)));
248 if let Some(parent) = node_stack.last() {
249 NodeInner::add_child_to_ref(parent, comment_node);
250 }
251 }
252 Ok(Event::PI(ref e)) => {
253 let current_ws_mode =
254 *ws_mode_stack.last().unwrap_or(&WhitespaceMode::Normalize);
255
256 if let Some(text) = current_text.take() {
258 let text_to_store = if current_ws_mode == WhitespaceMode::Preserve {
259 text
260 } else {
261 text.trim().to_string()
262 };
263 if !text_to_store.is_empty() || current_ws_mode == WhitespaceMode::Preserve
264 {
265 let text_node = self
266 .factory
267 .make_node(XmlContent::Text(XmlText::new(&text_to_store)));
268 if let Some(parent) = node_stack.last() {
269 NodeInner::add_child_to_ref(parent, text_node);
270 }
271 }
272 }
273
274 let pi_data = String::from_utf8_lossy(e.as_ref()).to_string();
276 let (target, content) = match pi_data
278 .char_indices()
279 .find(|(_, c)| c.is_whitespace())
280 .map(|(i, _)| i)
281 {
282 Some(pos) => (
283 pi_data[..pos].to_string(),
284 pi_data[pos..].trim().to_string(),
285 ),
286 None => (pi_data, String::new()),
287 };
288
289 let pi_node = self.factory.make_node(XmlContent::ProcessingInstruction(
290 XmlProcessingInstruction::new(&target, &content),
291 ));
292 if let Some(parent) = node_stack.last() {
293 NodeInner::add_child_to_ref(parent, pi_node);
294 }
295 }
296 Ok(Event::Decl(_)) => {
297 }
299 Ok(Event::DocType(_)) => {
300 }
302 Ok(Event::GeneralRef(_)) => {
303 }
305 Err(e) => return Err(Error::Parse(format!("XML parse error: {}", e))),
306 }
307 buf.clear();
308 }
309
310 Ok(root)
311 }
312
313 fn parse_element_with_ns<R: Read + std::io::BufRead>(
317 &self,
318 e: &BytesStart,
319 reader: &Reader<R>,
320 ns_context: &mut NamespaceContext,
321 ) -> Result<(XmlElement, Option<WhitespaceMode>)> {
322 let qname = reader
323 .decoder()
324 .decode(e.name().as_ref())
325 .map_err(|e| Error::Parse(e.to_string()))?
326 .to_string();
327
328 let mut attributes = HashMap::new();
329 let mut namespace_decls = HashMap::new();
330 let mut ws_mode_override = None;
331
332 for attr_result in e.attributes() {
333 let attr = attr_result.map_err(|e| Error::Parse(format!("Attribute error: {}", e)))?;
334 let key = reader
335 .decoder()
336 .decode(attr.key.as_ref())
337 .map_err(|e| Error::Parse(e.to_string()))?
338 .to_string();
339 let value = attr
340 .unescape_value()
341 .map_err(|e| Error::Parse(e.to_string()))?
342 .to_string();
343
344 if key == "xml:space" {
346 ws_mode_override = Some(match value.as_str() {
347 "preserve" => WhitespaceMode::Preserve,
348 _ => WhitespaceMode::Normalize,
349 });
350 }
351
352 if is_xmlns_attr(&key) {
353 let prefix = if key == "xmlns" {
355 String::new()
356 } else {
357 key[6..].to_string() };
359 ns_context.bind(&prefix, &value);
360 namespace_decls.insert(prefix, value);
361 } else {
362 attributes.insert(key, value);
363 }
364 }
365
366 let (prefix, local_name) = split_qname(&qname);
368 let expanded_name = match prefix {
369 Some(p) => ns_context
370 .resolve(p)
371 .map(|uri| ExpandedName::new(uri, local_name.to_string())),
372 None => {
373 if let Some(uri) = ns_context.default_namespace() {
374 if !uri.is_empty() {
375 Some(ExpandedName::new(uri, local_name.to_string()))
376 } else {
377 Some(ExpandedName::no_namespace(local_name.to_string()))
378 }
379 } else {
380 Some(ExpandedName::no_namespace(local_name.to_string()))
381 }
382 }
383 };
384
385 Ok((
386 XmlElement::new_with_namespace(qname, expanded_name, namespace_decls, attributes),
387 ws_mode_override,
388 ))
389 }
390
391 fn normalize_whitespace(&self, text: &str, previous: Option<&str>) -> Option<String> {
398 let last_is_ws = previous.is_none_or(|p| p.ends_with(' '));
399 let mut last_was_ws = last_is_ws;
400 let mut has_non_ws = false;
401 let mut result = String::new();
402
403 for c in text.chars() {
404 if c.is_whitespace() {
405 if !last_was_ws {
406 result.push(' ');
407 last_was_ws = true;
408 }
409 } else {
411 result.push(c);
412 last_was_ws = false;
413 has_non_ws = true;
414 }
415 }
416
417 if has_non_ws {
418 Some(result)
419 } else {
420 None
421 }
422 }
423}
424
425pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<NodeRef> {
427 let parser = XmlParser::new(super::BaseNodeFactory);
428 parser.parse_file(path)
429}
430
431pub fn parse_str(xml: &str) -> Result<NodeRef> {
433 let parser = XmlParser::new(super::BaseNodeFactory);
434 parser.parse_str(xml)
435}
436
#[cfg(test)]
mod tests {
    use super::*;
    use crate::xml::BaseNodeFactory;

    /// Parses `xml` with a `BaseNodeFactory` parser and returns the synthetic
    /// root node, panicking when parsing fails.
    fn parse(xml: &str) -> NodeRef {
        XmlParser::new(BaseNodeFactory).parse_str(xml).unwrap()
    }

    /// Returns a handle to the child of `node` at position `index`.
    fn child(node: &NodeRef, index: usize) -> NodeRef {
        node.borrow().children()[index].clone()
    }

    /// Returns the text carried by `node`, panicking when it is not a text node.
    fn text_of(node: &NodeRef) -> String {
        match node.borrow().content() {
            Some(XmlContent::Text(t)) => t.text().iter().collect(),
            _ => panic!("Expected text node"),
        }
    }

    #[test]
    fn test_parse_simple_xml() {
        let root = parse(r#"<root><child>text</child></root>"#);

        let root_ref = root.borrow();
        assert_eq!(root_ref.child_count(), 1);

        // The parser wraps the document in a synthetic element.
        let XmlContent::Element(synthetic) = root_ref.content().unwrap() else {
            panic!("Expected element");
        };
        assert_eq!(synthetic.qname(), "$ROOT$");

        let document_elem = child(&root, 0);
        let document_ref = document_elem.borrow();
        let Some(XmlContent::Element(e)) = document_ref.content() else {
            panic!("Expected element");
        };
        assert_eq!(e.qname(), "root");
    }

    #[test]
    fn test_parse_with_attributes() {
        let root = parse(r#"<root id="foo" class="bar">content</root>"#);

        let document_elem = child(&root, 0);
        let document_ref = document_elem.borrow();
        let Some(XmlContent::Element(e)) = document_ref.content() else {
            panic!("Expected element");
        };

        assert_eq!(e.qname(), "root");
        assert_eq!(e.attributes().get("id"), Some(&"foo".to_string()));
        assert_eq!(e.attributes().get("class"), Some(&"bar".to_string()));
    }

    #[test]
    fn test_whitespace_normalization() {
        let root = parse(r#"<root> hello world </root>"#);
        let document_elem = child(&root, 0);

        assert_eq!(document_elem.borrow().child_count(), 1);
        assert_eq!(text_of(&child(&document_elem, 0)), "hello world");
    }

    #[test]
    fn test_empty_element() {
        let root = parse(r#"<root><empty /></root>"#);
        let document_elem = child(&root, 0);

        assert_eq!(document_elem.borrow().child_count(), 1);

        let empty_elem = child(&document_elem, 0);
        let empty_ref = empty_elem.borrow();
        let Some(XmlContent::Element(e)) = empty_ref.content() else {
            panic!("Expected element");
        };
        assert_eq!(e.qname(), "empty");
        assert_eq!(empty_ref.child_count(), 0);
    }

    #[test]
    fn test_nested_elements() {
        let root = parse(r#"<a><b><c>deep</c></b></a>"#);

        // Walk a -> b -> c -> text, always taking the first child.
        let a = child(&root, 0);
        let b = child(&a, 0);
        let c = child(&b, 0);
        let text = child(&c, 0);

        assert_eq!(text_of(&text), "deep");
    }

    #[test]
    fn test_whitespace_preservation() {
        let root = parse(r#"<root xml:space="preserve"> hello world </root>"#);
        let document_elem = child(&root, 0);

        assert_eq!(document_elem.borrow().child_count(), 1);
        assert_eq!(text_of(&child(&document_elem, 0)), " hello world ");
    }

    #[test]
    fn test_whitespace_preservation_inheritance() {
        let root = parse(r#"<root xml:space="preserve"><child> text </child></root>"#);
        let document_elem = child(&root, 0);
        let child_elem = child(&document_elem, 0);

        assert_eq!(child_elem.borrow().child_count(), 1);
        assert_eq!(text_of(&child(&child_elem, 0)), " text ");
    }

    #[test]
    fn test_whitespace_preservation_override() {
        let root = parse(
            r#"<root xml:space="preserve"><child xml:space="default"> text </child></root>"#,
        );
        let document_elem = child(&root, 0);
        let child_elem = child(&document_elem, 0);

        assert_eq!(child_elem.borrow().child_count(), 1);
        assert_eq!(text_of(&child(&child_elem, 0)), "text");
    }

    #[test]
    fn test_namespace_parsing() {
        let root = parse(
            r#"<root xmlns="http://example.com" xmlns:ns="http://ns.example.com"><ns:child /></root>"#,
        );
        let document_elem = child(&root, 0);

        {
            let document_ref = document_elem.borrow();
            let Some(XmlContent::Element(e)) = document_ref.content() else {
                panic!("Expected element");
            };
            assert_eq!(e.qname(), "root");
            assert_eq!(
                e.namespace_decls().get(""),
                Some(&"http://example.com".to_string())
            );
            assert_eq!(
                e.namespace_decls().get("ns"),
                Some(&"http://ns.example.com".to_string())
            );
            // Namespace declarations must not leak into the attribute map.
            assert!(e.attributes().is_empty());
            let expanded = e.expanded_name().expect("should have expanded name");
            assert_eq!(expanded.namespace_uri.as_ref(), "http://example.com");
            assert_eq!(expanded.local_name, "root");
        }

        let prefixed = child(&document_elem, 0);
        let prefixed_ref = prefixed.borrow();
        let Some(XmlContent::Element(e)) = prefixed_ref.content() else {
            panic!("Expected element");
        };
        assert_eq!(e.qname(), "ns:child");
        let expanded = e.expanded_name().expect("should have expanded name");
        assert_eq!(expanded.namespace_uri.as_ref(), "http://ns.example.com");
        assert_eq!(expanded.local_name, "child");
    }

    #[test]
    fn test_comment_flushes_text() {
        let root = parse(r#"<root>hello<!-- comment -->world</root>"#);
        let document_elem = child(&root, 0);

        // Text before and after the comment must end up in separate nodes.
        assert_eq!(document_elem.borrow().child_count(), 3);

        let first = child(&document_elem, 0);
        let first_ref = first.borrow();
        match first_ref.content() {
            Some(XmlContent::Text(t)) => {
                let text: String = t.text().iter().collect();
                assert_eq!(text, "hello");
            }
            _ => panic!("Expected text node, got {:?}", first_ref.content()),
        }

        let second = child(&document_elem, 1);
        assert!(matches!(
            second.borrow().content(),
            Some(XmlContent::Comment(_))
        ));

        assert_eq!(text_of(&child(&document_elem, 2)), "world");
    }
}