Skip to main content

ooxml_xml/
raw_xml.rs

1//! Raw XML preservation for round-trip fidelity.
2//!
3//! This module provides types for storing unparsed XML elements,
4//! allowing documents to survive read→write cycles without losing
5//! features we don't explicitly understand.
6
7use quick_xml::events::{BytesCData, BytesEnd, BytesStart, BytesText, Event};
8use quick_xml::{Reader, Writer};
9use std::io::{BufRead, Write};
10
11use crate::{Error, FromXml, ParseError, Result};
12
13/// A raw XML node with its original position for correct round-trip ordering.
14///
15/// When unknown elements are captured during parsing, we store their position
16/// among siblings so they can be interleaved correctly during serialization.
17#[derive(Clone, Debug, PartialEq)]
18pub struct PositionedNode {
19    /// Original position among sibling elements (0-indexed).
20    pub position: usize,
21    /// The preserved XML node.
22    pub node: RawXmlNode,
23}
24
25impl PositionedNode {
26    /// Create a new positioned node.
27    pub fn new(position: usize, node: RawXmlNode) -> Self {
28        Self { position, node }
29    }
30}
31
/// An XML attribute tagged with the slot it occupied among its siblings.
///
/// Unknown attributes captured while parsing remember their index among the
/// element's attributes so they can be serialized back in the original order.
///
/// All fields are plain owned values, so the type is `Eq` as well as
/// `PartialEq`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PositionedAttr {
    /// Zero-based index of this attribute among its sibling attributes.
    pub position: usize,
    /// The attribute name, including the namespace prefix if present.
    pub name: String,
    /// The attribute value.
    pub value: String,
}

impl PositionedAttr {
    /// Build a positioned attribute.
    ///
    /// `name` and `value` accept anything convertible into `String`
    /// (`&str`, `String`, ...), so callers do not have to allocate up front.
    pub fn new(position: usize, name: impl Into<String>, value: impl Into<String>) -> Self {
        Self {
            position,
            name: name.into(),
            value: value.into(),
        }
    }
}
56
/// A raw XML node that can be preserved during round-trip.
///
/// The payload of every variant is owned data, so the type is `Eq` as well
/// as `PartialEq`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum RawXmlNode {
    /// An XML element with name, attributes, and children.
    Element(RawXmlElement),
    /// Text content.
    Text(String),
    /// CDATA content (the text between `<![CDATA[` and `]]>`).
    CData(String),
    /// A comment (the text between `<!--` and `-->`).
    Comment(String),
}

/// A raw XML element with its name, attributes, and children preserved.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RawXmlElement {
    /// The full element name (including namespace prefix if present).
    pub name: String,
    /// Element attributes as (name, value) pairs, in document order.
    pub attributes: Vec<(String, String)>,
    /// Child nodes, in document order.
    pub children: Vec<RawXmlNode>,
    /// Whether this was a self-closing element (`<name/>`).
    pub self_closing: bool,
}
82
83impl RawXmlElement {
84    /// Create a new empty element.
85    pub fn new(name: impl Into<String>) -> Self {
86        Self {
87            name: name.into(),
88            attributes: Vec::new(),
89            children: Vec::new(),
90            self_closing: false,
91        }
92    }
93
94    /// Parse a raw XML element from a reader, starting after the opening tag.
95    ///
96    /// The `start` parameter should be the BytesStart event that opened this element.
97    pub fn from_reader<R: BufRead>(reader: &mut Reader<R>, start: &BytesStart) -> Result<Self> {
98        let name = String::from_utf8_lossy(start.name().as_ref()).to_string();
99
100        let attributes = start
101            .attributes()
102            .filter_map(|a| a.ok())
103            .map(|a| {
104                (
105                    String::from_utf8_lossy(a.key.as_ref()).to_string(),
106                    String::from_utf8_lossy(&a.value).to_string(),
107                )
108            })
109            .collect();
110
111        let mut element = RawXmlElement {
112            name: name.clone(),
113            attributes,
114            children: Vec::new(),
115            self_closing: false,
116        };
117
118        let mut buf = Vec::new();
119        let target_name = start.name().as_ref().to_vec();
120
121        loop {
122            match reader.read_event_into(&mut buf) {
123                Ok(Event::Start(e)) => {
124                    let child = RawXmlElement::from_reader(reader, &e)?;
125                    element.children.push(RawXmlNode::Element(child));
126                }
127                Ok(Event::Empty(e)) => {
128                    let child = RawXmlElement::from_empty(&e);
129                    element.children.push(RawXmlNode::Element(child));
130                }
131                Ok(Event::Text(e)) => {
132                    let text = e.decode().unwrap_or_default();
133                    if !text.is_empty() {
134                        // Merge with preceding text node (e.g. after a GeneralRef)
135                        if let Some(RawXmlNode::Text(last)) = element.children.last_mut() {
136                            last.push_str(&text);
137                        } else {
138                            element.children.push(RawXmlNode::Text(text.to_string()));
139                        }
140                    }
141                }
142                Ok(Event::GeneralRef(e)) => {
143                    let entity_name = e.decode().unwrap_or_default();
144                    if let Some(resolved) = quick_xml::escape::resolve_xml_entity(&entity_name) {
145                        // Append to last text node if possible, otherwise create new one
146                        if let Some(RawXmlNode::Text(last)) = element.children.last_mut() {
147                            last.push_str(resolved);
148                        } else {
149                            element
150                                .children
151                                .push(RawXmlNode::Text(resolved.to_string()));
152                        }
153                    }
154                }
155                Ok(Event::CData(e)) => {
156                    let text = String::from_utf8_lossy(&e).to_string();
157                    element.children.push(RawXmlNode::CData(text));
158                }
159                Ok(Event::Comment(e)) => {
160                    let text = String::from_utf8_lossy(&e).to_string();
161                    element.children.push(RawXmlNode::Comment(text));
162                }
163                Ok(Event::End(e)) => {
164                    if e.name().as_ref() == target_name {
165                        break;
166                    }
167                }
168                Ok(Event::Eof) => {
169                    return Err(Error::Invalid(format!(
170                        "Unexpected EOF while parsing element '{}'",
171                        name
172                    )));
173                }
174                Err(e) => return Err(Error::Xml(e)),
175                _ => {}
176            }
177            buf.clear();
178        }
179
180        Ok(element)
181    }
182
183    /// Create from an empty/self-closing element.
184    pub fn from_empty(start: &BytesStart) -> Self {
185        let name = String::from_utf8_lossy(start.name().as_ref()).to_string();
186
187        let attributes = start
188            .attributes()
189            .filter_map(|a| a.ok())
190            .map(|a| {
191                (
192                    String::from_utf8_lossy(a.key.as_ref()).to_string(),
193                    String::from_utf8_lossy(&a.value).to_string(),
194                )
195            })
196            .collect();
197
198        RawXmlElement {
199            name,
200            attributes,
201            children: Vec::new(),
202            self_closing: true,
203        }
204    }
205
206    /// Parse this element as a typed struct using the FromXml trait.
207    ///
208    /// Uses a streaming approach that generates XML bytes lazily from the
209    /// in-memory tree structure, avoiding full upfront serialization.
210    ///
211    /// # Example
212    /// ```ignore
213    /// use ooxml_dml::types::CTTable;
214    /// if let Some(table) = raw_element.parse_as::<CTTable>() {
215    ///     // Use the parsed table
216    /// }
217    /// ```
218    pub fn parse_as<T: FromXml>(&self) -> std::result::Result<T, ParseError> {
219        let streaming_reader = RawXmlStreamReader::new(self);
220        let mut reader = Reader::from_reader(streaming_reader);
221        let mut buf = Vec::new();
222
223        loop {
224            match reader.read_event_into(&mut buf) {
225                Ok(Event::Start(e)) => {
226                    return T::from_xml(&mut reader, &e, false);
227                }
228                Ok(Event::Empty(e)) => {
229                    return T::from_xml(&mut reader, &e, true);
230                }
231                Ok(Event::Eof) => {
232                    return Err(ParseError::UnexpectedElement(
233                        "empty XML in parse_as".to_string(),
234                    ));
235                }
236                Err(e) => return Err(ParseError::Xml(e)),
237                _ => {}
238            }
239            buf.clear();
240        }
241    }
242
243    /// Write this element to an XML writer.
244    pub fn write_to<W: Write>(&self, writer: &mut Writer<W>) -> Result<()> {
245        let mut start = BytesStart::new(&self.name);
246        for (key, value) in &self.attributes {
247            start.push_attribute((key.as_str(), value.as_str()));
248        }
249
250        if self.self_closing && self.children.is_empty() {
251            writer.write_event(Event::Empty(start))?;
252        } else {
253            writer.write_event(Event::Start(start))?;
254
255            for child in &self.children {
256                child.write_to(writer)?;
257            }
258
259            writer.write_event(Event::End(BytesEnd::new(&self.name)))?;
260        }
261
262        Ok(())
263    }
264}
265
266impl RawXmlNode {
267    /// Write this node to an XML writer.
268    pub fn write_to<W: Write>(&self, writer: &mut Writer<W>) -> Result<()> {
269        match self {
270            RawXmlNode::Element(elem) => elem.write_to(writer),
271            RawXmlNode::Text(text) => {
272                writer.write_event(Event::Text(BytesText::new(text)))?;
273                Ok(())
274            }
275            RawXmlNode::CData(text) => {
276                writer.write_event(Event::CData(BytesCData::new(text)))?;
277                Ok(())
278            }
279            RawXmlNode::Comment(text) => {
280                writer.write_event(Event::Comment(BytesText::new(text)))?;
281                Ok(())
282            }
283        }
284    }
285}
286
287/// A streaming reader that produces XML bytes lazily from a RawXmlElement tree.
288///
289/// Implements `BufRead` so it can be used with `quick_xml::Reader::from_reader()`.
290/// This avoids allocating the full XML string upfront - bytes are generated
291/// on-demand as the parser reads.
292pub struct RawXmlStreamReader<'a> {
293    /// Stack of elements being processed (element, child_index, state)
294    stack: Vec<(&'a RawXmlElement, usize, ElementState)>,
295    /// Current buffered output
296    buffer: Vec<u8>,
297    /// Current read position in buffer
298    pos: usize,
299    /// Whether we've finished
300    done: bool,
301}
302
303#[derive(Clone, Copy, PartialEq)]
304enum ElementState {
305    /// About to emit start tag
306    Start,
307    /// Emitting children
308    Children,
309    /// About to emit end tag
310    End,
311}
312
313impl<'a> RawXmlStreamReader<'a> {
314    /// Create a new streaming reader for the given element.
315    pub fn new(elem: &'a RawXmlElement) -> Self {
316        Self {
317            stack: vec![(elem, 0, ElementState::Start)],
318            buffer: Vec::with_capacity(256),
319            pos: 0,
320            done: false,
321        }
322    }
323
324    /// Generate the next chunk of XML into the buffer.
325    fn generate_next(&mut self) {
326        self.buffer.clear();
327        self.pos = 0;
328
329        while self.buffer.is_empty() && !self.stack.is_empty() {
330            let (elem, child_idx, state) = self.stack.pop().unwrap();
331
332            match state {
333                ElementState::Start => {
334                    // Emit start tag or empty tag
335                    self.buffer.push(b'<');
336                    self.buffer.extend_from_slice(elem.name.as_bytes());
337
338                    for (key, value) in &elem.attributes {
339                        self.buffer.push(b' ');
340                        self.buffer.extend_from_slice(key.as_bytes());
341                        self.buffer.extend_from_slice(b"=\"");
342                        // Escape attribute value
343                        for &b in value.as_bytes() {
344                            match b {
345                                b'"' => self.buffer.extend_from_slice(b"&quot;"),
346                                b'&' => self.buffer.extend_from_slice(b"&amp;"),
347                                b'<' => self.buffer.extend_from_slice(b"&lt;"),
348                                _ => self.buffer.push(b),
349                            }
350                        }
351                        self.buffer.push(b'"');
352                    }
353
354                    if elem.self_closing && elem.children.is_empty() {
355                        self.buffer.extend_from_slice(b"/>");
356                        // Done with this element
357                    } else {
358                        self.buffer.push(b'>');
359                        // Push back to process children
360                        self.stack.push((elem, 0, ElementState::Children));
361                    }
362                }
363                ElementState::Children => {
364                    if child_idx < elem.children.len() {
365                        // Push back with next child index
366                        self.stack
367                            .push((elem, child_idx + 1, ElementState::Children));
368
369                        // Process current child
370                        match &elem.children[child_idx] {
371                            RawXmlNode::Element(child) => {
372                                self.stack.push((child, 0, ElementState::Start));
373                            }
374                            RawXmlNode::Text(text) => {
375                                // Escape text content
376                                for &b in text.as_bytes() {
377                                    match b {
378                                        b'&' => self.buffer.extend_from_slice(b"&amp;"),
379                                        b'<' => self.buffer.extend_from_slice(b"&lt;"),
380                                        b'>' => self.buffer.extend_from_slice(b"&gt;"),
381                                        _ => self.buffer.push(b),
382                                    }
383                                }
384                            }
385                            RawXmlNode::CData(text) => {
386                                self.buffer.extend_from_slice(b"<![CDATA[");
387                                self.buffer.extend_from_slice(text.as_bytes());
388                                self.buffer.extend_from_slice(b"]]>");
389                            }
390                            RawXmlNode::Comment(text) => {
391                                self.buffer.extend_from_slice(b"<!--");
392                                self.buffer.extend_from_slice(text.as_bytes());
393                                self.buffer.extend_from_slice(b"-->");
394                            }
395                        }
396                    } else {
397                        // Done with children, emit end tag
398                        self.stack.push((elem, 0, ElementState::End));
399                    }
400                }
401                ElementState::End => {
402                    self.buffer.extend_from_slice(b"</");
403                    self.buffer.extend_from_slice(elem.name.as_bytes());
404                    self.buffer.push(b'>');
405                }
406            }
407        }
408
409        if self.stack.is_empty() && self.buffer.is_empty() {
410            self.done = true;
411        }
412    }
413}
414
415impl<'a> std::io::Read for RawXmlStreamReader<'a> {
416    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
417        if self.pos >= self.buffer.len() {
418            if self.done {
419                return Ok(0);
420            }
421            self.generate_next();
422            if self.done && self.buffer.is_empty() {
423                return Ok(0);
424            }
425        }
426
427        let remaining = &self.buffer[self.pos..];
428        let to_copy = remaining.len().min(buf.len());
429        buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
430        self.pos += to_copy;
431        Ok(to_copy)
432    }
433}
434
435impl<'a> BufRead for RawXmlStreamReader<'a> {
436    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
437        if self.pos >= self.buffer.len() {
438            if self.done {
439                return Ok(&[]);
440            }
441            self.generate_next();
442        }
443        Ok(&self.buffer[self.pos..])
444    }
445
446    fn consume(&mut self, amt: usize) {
447        self.pos += amt;
448    }
449}
450
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, Read};

    /// Parse `xml` and return its root element.
    ///
    /// Panics if the first event is not a start tag, so a test can never pass
    /// vacuously just because nothing was parsed.
    fn parse_root(xml: &str) -> RawXmlElement {
        let mut reader = Reader::from_str(xml);
        let mut buf = Vec::new();

        let root = match reader.read_event_into(&mut buf) {
            Ok(Event::Start(start)) => RawXmlElement::from_reader(&mut reader, &start)
                .expect("root element should parse"),
            other => panic!("expected a start tag as the first event, got {:?}", other),
        };
        root
    }

    /// Serialize `elem` through `RawXmlStreamReader` into a `String`.
    fn stream_to_string(elem: &RawXmlElement) -> String {
        let mut output = String::new();
        RawXmlStreamReader::new(elem)
            .read_to_string(&mut output)
            .expect("streamed XML should be valid UTF-8");
        output
    }

    #[test]
    fn test_parse_simple_element() {
        let elem = parse_root(r#"<w:test attr="value">content</w:test>"#);

        assert_eq!(elem.name, "w:test");
        assert_eq!(
            elem.attributes,
            vec![("attr".to_string(), "value".to_string())]
        );
        assert_eq!(elem.children.len(), 1);
        match &elem.children[0] {
            RawXmlNode::Text(t) => assert_eq!(t, "content"),
            _ => panic!("Expected text node"),
        }
    }

    #[test]
    fn test_parse_nested_elements() {
        let elem = parse_root(r#"<parent><child1/><child2>text</child2></parent>"#);

        assert_eq!(elem.name, "parent");
        assert_eq!(elem.children.len(), 2);
    }

    #[test]
    fn test_roundtrip() {
        let xml = r#"<w:test attr="value"><w:child>text</w:child></w:test>"#;
        let elem = parse_root(xml);

        let mut output = Vec::new();
        let mut writer = Writer::new(Cursor::new(&mut output));
        elem.write_to(&mut writer).unwrap();

        let output_str = String::from_utf8(output).unwrap();
        assert_eq!(output_str, xml);
    }

    #[test]
    fn test_streaming_reader() {
        let xml = r#"<parent attr="val"><child>text</child></parent>"#;
        let elem = parse_root(xml);

        assert_eq!(stream_to_string(&elem), xml);
    }

    #[test]
    fn test_streaming_reader_escaping() {
        // Special characters in attribute values and text must be escaped.
        let mut elem = RawXmlElement::new("test");
        elem.attributes
            .push(("attr".to_string(), "val\"ue".to_string()));
        elem.children
            .push(RawXmlNode::Text("a < b & c > d".to_string()));

        assert_eq!(
            stream_to_string(&elem),
            r#"<test attr="val&quot;ue">a &lt; b &amp; c &gt; d</test>"#
        );
    }

    #[test]
    fn test_from_reader_preserves_xml_entities() {
        // XML entity references (&amp;, &lt;, etc.) must survive the
        // parse -> store -> re-serialize round trip through RawXmlElement.
        let xml = r#"<root><t>A &amp; B &lt; C &gt; D &quot;E&quot; &apos;F&apos;</t></root>"#;
        let elem = parse_root(xml);

        // The text node holds the decoded characters.
        let t_elem = match &elem.children[0] {
            RawXmlNode::Element(t_elem) => t_elem,
            _ => panic!("Expected element child"),
        };
        match t_elem.children.first() {
            Some(RawXmlNode::Text(text)) => assert_eq!(text, "A & B < C > D \"E\" 'F'"),
            _ => panic!("Expected text child in <t> element"),
        }

        // Re-serializing via the streaming reader escapes them again.
        let output = stream_to_string(&elem);
        assert!(output.contains("A &amp; B &lt; C &gt; D"));
    }
}