Skip to main content

xml_3dm/xml/
parser.rs

1//! XML parser that builds node trees.
2//!
3//! This parser uses quick-xml's streaming API to build node trees matching
4//! the Java implementation's behavior.
5
6use std::collections::HashMap;
7use std::fs::File;
8use std::io::{BufReader, Read};
9use std::path::Path;
10
11use quick_xml::escape::unescape;
12use quick_xml::events::{BytesStart, Event};
13use quick_xml::Reader;
14
15use super::NodeFactory;
16use crate::error::{Error, Result};
17use crate::node::{NodeInner, NodeRef, XmlComment, XmlContent, XmlElement, XmlText};
18
19/// XML parser that builds node trees.
20pub struct XmlParser<F: NodeFactory> {
21    factory: F,
22}
23
24impl<F: NodeFactory> XmlParser<F> {
25    /// Creates a new parser with the given node factory.
26    pub fn new(factory: F) -> Self {
27        XmlParser { factory }
28    }
29
30    /// Parses XML from a string.
31    pub fn parse_str(&self, xml: &str) -> Result<NodeRef> {
32        let mut reader = Reader::from_str(xml);
33        // Don't trim text - we handle whitespace normalization ourselves
34        reader.config_mut().trim_text_start = false;
35        reader.config_mut().trim_text_end = false;
36        self.parse_reader(&mut reader)
37    }
38
39    /// Parses XML from a file.
40    pub fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<NodeRef> {
41        let file = File::open(path)?;
42        let buf_reader = BufReader::new(file);
43        let mut reader = Reader::from_reader(buf_reader);
44        // Don't trim text - we handle whitespace normalization ourselves
45        reader.config_mut().trim_text_start = false;
46        reader.config_mut().trim_text_end = false;
47        self.parse_reader(&mut reader)
48    }
49
50    /// Parses XML from a quick-xml Reader.
51    fn parse_reader<R: Read + std::io::BufRead>(&self, reader: &mut Reader<R>) -> Result<NodeRef> {
52        // Create the synthetic $ROOT$ element (matches Java's startDocument)
53        let root = self.factory.make_node(XmlContent::Element(XmlElement::new(
54            "$ROOT$".to_string(),
55            HashMap::new(),
56        )));
57
58        let mut node_stack: Vec<NodeRef> = vec![root.clone()];
59        let mut current_text: Option<String> = None;
60        let mut buf = Vec::new();
61
62        loop {
63            match reader.read_event_into(&mut buf) {
64                Ok(Event::Start(ref e)) => {
65                    // Flush any accumulated text
66                    if let Some(text) = current_text.take() {
67                        let trimmed = text.trim();
68                        if !trimmed.is_empty() {
69                            let text_node = self
70                                .factory
71                                .make_node(XmlContent::Text(XmlText::new(trimmed)));
72                            if let Some(parent) = node_stack.last() {
73                                NodeInner::add_child_to_ref(parent, text_node);
74                            }
75                        }
76                    }
77
78                    // Create the element node
79                    let element = self.parse_element(e, reader)?;
80                    let node = self.factory.make_node(XmlContent::Element(element));
81
82                    // Add to parent and push onto stack
83                    if let Some(parent) = node_stack.last() {
84                        NodeInner::add_child_to_ref(parent, node.clone());
85                    }
86                    node_stack.push(node);
87                }
88                Ok(Event::End(_)) => {
89                    // Flush any accumulated text
90                    if let Some(text) = current_text.take() {
91                        let trimmed = text.trim();
92                        if !trimmed.is_empty() {
93                            let text_node = self
94                                .factory
95                                .make_node(XmlContent::Text(XmlText::new(trimmed)));
96                            if let Some(parent) = node_stack.last() {
97                                NodeInner::add_child_to_ref(parent, text_node);
98                            }
99                        }
100                    }
101
102                    // Pop from stack
103                    node_stack.pop();
104                }
105                Ok(Event::Empty(ref e)) => {
106                    // Self-closing tag - handle like Start + End
107                    if let Some(text) = current_text.take() {
108                        let trimmed = text.trim();
109                        if !trimmed.is_empty() {
110                            let text_node = self
111                                .factory
112                                .make_node(XmlContent::Text(XmlText::new(trimmed)));
113                            if let Some(parent) = node_stack.last() {
114                                NodeInner::add_child_to_ref(parent, text_node);
115                            }
116                        }
117                    }
118
119                    let element = self.parse_element(e, reader)?;
120                    let node = self.factory.make_node(XmlContent::Element(element));
121
122                    if let Some(parent) = node_stack.last() {
123                        NodeInner::add_child_to_ref(parent, node);
124                    }
125                }
126                Ok(Event::Text(e)) => {
127                    // Accumulate text, normalizing whitespace as Java does
128                    let raw =
129                        std::str::from_utf8(e.as_ref()).map_err(|e| Error::Parse(e.to_string()))?;
130                    let text = unescape(raw).map_err(|e| Error::Parse(e.to_string()))?;
131                    let normalized = self.normalize_whitespace(&text, current_text.as_deref());
132                    if let Some(normalized) = normalized {
133                        current_text = Some(match current_text {
134                            Some(mut existing) => {
135                                existing.push_str(&normalized);
136                                existing
137                            }
138                            None => normalized,
139                        });
140                    }
141                }
142                Ok(Event::CData(ref e)) => {
143                    // Treat CDATA like text
144                    let text = String::from_utf8_lossy(e.as_ref());
145                    let normalized = self.normalize_whitespace(&text, current_text.as_deref());
146                    if let Some(normalized) = normalized {
147                        current_text = Some(match current_text {
148                            Some(mut existing) => {
149                                existing.push_str(&normalized);
150                                existing
151                            }
152                            None => normalized,
153                        });
154                    }
155                }
156                Ok(Event::Eof) => break,
157                Ok(Event::Comment(ref e)) => {
158                    // Capture comments as nodes
159                    let comment_text = String::from_utf8_lossy(e.as_ref()).to_string();
160                    let comment_node = self
161                        .factory
162                        .make_node(XmlContent::Comment(XmlComment::new(&comment_text)));
163                    if let Some(parent) = node_stack.last() {
164                        NodeInner::add_child_to_ref(parent, comment_node);
165                    }
166                }
167                Ok(Event::Decl(_)) | Ok(Event::PI(_)) => {
168                    // Ignore XML declaration and processing instructions
169                }
170                Ok(Event::DocType(_)) => {
171                    // Ignore DOCTYPE
172                }
173                Ok(Event::GeneralRef(_)) => {
174                    // Ignore general entity references
175                }
176                Err(e) => return Err(Error::Parse(format!("XML parse error: {}", e))),
177            }
178            buf.clear();
179        }
180
181        Ok(root)
182    }
183
184    /// Parses an element's name and attributes.
185    fn parse_element<R: Read + std::io::BufRead>(
186        &self,
187        e: &BytesStart,
188        reader: &Reader<R>,
189    ) -> Result<XmlElement> {
190        let name = reader
191            .decoder()
192            .decode(e.name().as_ref())
193            .map_err(|e| Error::Parse(e.to_string()))?
194            .to_string();
195
196        let mut attributes = HashMap::new();
197        for attr_result in e.attributes() {
198            let attr = attr_result.map_err(|e| Error::Parse(format!("Attribute error: {}", e)))?;
199            let key = reader
200                .decoder()
201                .decode(attr.key.as_ref())
202                .map_err(|e| Error::Parse(e.to_string()))?
203                .to_string();
204            let value = attr
205                .unescape_value()
206                .map_err(|e| Error::Parse(e.to_string()))?
207                .to_string();
208            attributes.insert(key, value);
209        }
210
211        Ok(XmlElement::new(name, attributes))
212    }
213
214    /// Normalizes whitespace in text content, matching Java's behavior.
215    ///
216    /// The Java implementation:
217    /// - Collapses consecutive whitespace to a single space
218    /// - Tracks whether the previous text ended with whitespace
219    /// - Only returns Some if there's non-whitespace content
220    fn normalize_whitespace(&self, text: &str, previous: Option<&str>) -> Option<String> {
221        let last_is_ws = previous.is_none_or(|p| p.ends_with(' '));
222        let mut last_was_ws = last_is_ws;
223        let mut has_non_ws = false;
224        let mut result = String::new();
225
226        for c in text.chars() {
227            if c.is_whitespace() {
228                if !last_was_ws {
229                    result.push(' ');
230                    last_was_ws = true;
231                }
232                // Skip additional whitespace
233            } else {
234                result.push(c);
235                last_was_ws = false;
236                has_non_ws = true;
237            }
238        }
239
240        if has_non_ws {
241            Some(result)
242        } else {
243            None
244        }
245    }
246}
247
248/// Parses XML from a file using a base node factory.
249pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<NodeRef> {
250    let parser = XmlParser::new(super::BaseNodeFactory);
251    parser.parse_file(path)
252}
253
254/// Parses XML from a string using a base node factory.
255pub fn parse_str(xml: &str) -> Result<NodeRef> {
256    let parser = XmlParser::new(super::BaseNodeFactory);
257    parser.parse_str(xml)
258}
259
#[cfg(test)]
mod tests {
    use super::*;
    use crate::xml::BaseNodeFactory;

    /// Parses `xml` with a base-factory parser and returns the `$ROOT$` node.
    fn parse(xml: &str) -> NodeRef {
        XmlParser::new(BaseNodeFactory).parse_str(xml).unwrap()
    }

    /// Returns a handle to the `index`-th child of `node`.
    fn child_at(node: &NodeRef, index: usize) -> NodeRef {
        let inner = node.borrow();
        let child = inner.children()[index].clone();
        child
    }

    /// Asserts that `node` holds an element whose qualified name is `expected`.
    fn assert_element_named(node: &NodeRef, expected: &str) {
        let inner = node.borrow();
        match inner.content() {
            Some(XmlContent::Element(e)) => assert_eq!(e.qname(), expected),
            _ => panic!("Expected element"),
        }
    }

    /// Returns the characters of the text node `node` as a `String`.
    fn text_of(node: &NodeRef) -> String {
        let inner = node.borrow();
        let text: String = match inner.content() {
            Some(XmlContent::Text(t)) => t.text().iter().collect(),
            _ => panic!("Expected text node"),
        };
        text
    }

    #[test]
    fn test_parse_simple_xml() {
        let root = parse(r#"<root><child>text</child></root>"#);

        // The synthetic $ROOT$ wraps exactly one child: the document element.
        assert_eq!(root.borrow().child_count(), 1);
        assert_element_named(&root, "$ROOT$");

        let document_element = child_at(&root, 0);
        assert_element_named(&document_element, "root");
    }

    #[test]
    fn test_parse_with_attributes() {
        let root = parse(r#"<root id="foo" class="bar">content</root>"#);

        let elem = child_at(&root, 0);
        let elem_inner = elem.borrow();
        let Some(XmlContent::Element(e)) = elem_inner.content() else {
            panic!("Expected element");
        };

        assert_eq!(e.qname(), "root");
        assert_eq!(e.attributes().get("id"), Some(&"foo".to_string()));
        assert_eq!(e.attributes().get("class"), Some(&"bar".to_string()));
    }

    #[test]
    fn test_whitespace_normalization() {
        let root = parse(r#"<root>  hello   world  </root>"#);
        let elem = child_at(&root, 0);

        // Exactly one text child, with whitespace collapsed and trimmed.
        assert_eq!(elem.borrow().child_count(), 1);
        let text_node = child_at(&elem, 0);
        assert_eq!(text_of(&text_node), "hello world");
    }

    #[test]
    fn test_empty_element() {
        let root = parse(r#"<root><empty /></root>"#);
        let elem = child_at(&root, 0);
        assert_eq!(elem.borrow().child_count(), 1);

        let empty = child_at(&elem, 0);
        assert_element_named(&empty, "empty");
        assert_eq!(empty.borrow().child_count(), 0);
    }

    #[test]
    fn test_nested_elements() {
        let root = parse(r#"<a><b><c>deep</c></b></a>"#);

        // Walk down: $ROOT$ -> a -> b -> c -> text
        let a = child_at(&root, 0);
        let b = child_at(&a, 0);
        let c = child_at(&b, 0);
        let text = child_at(&c, 0);

        assert_eq!(text_of(&text), "deep");
    }
}