// ooxml_xml/raw_xml.rs

1//! Raw XML preservation for round-trip fidelity.
2//!
3//! This module provides types for storing unparsed XML elements,
4//! allowing documents to survive read→write cycles without losing
5//! features we don't explicitly understand.
6
7use quick_xml::events::{BytesCData, BytesEnd, BytesStart, BytesText, Event};
8use quick_xml::{Reader, Writer};
9use std::io::{BufRead, Write};
10
11use crate::{Error, FromXml, ParseError, Result};
12
13/// A raw XML node with its original position for correct round-trip ordering.
14///
15/// When unknown elements are captured during parsing, we store their position
16/// among siblings so they can be interleaved correctly during serialization.
17#[derive(Clone, Debug, PartialEq)]
18pub struct PositionedNode {
19    /// Original position among sibling elements (0-indexed).
20    pub position: usize,
21    /// The preserved XML node.
22    pub node: RawXmlNode,
23}
24
25impl PositionedNode {
26    /// Create a new positioned node.
27    pub fn new(position: usize, node: RawXmlNode) -> Self {
28        Self { position, node }
29    }
30}
31
/// Pairs an XML attribute with the slot it occupied on its element.
///
/// Unknown attributes captured while parsing carry their index among sibling
/// attributes, so that they can be written back in the original order.
#[derive(Debug, Clone, PartialEq)]
pub struct PositionedAttr {
    /// Zero-based index of this attribute among its sibling attributes.
    pub position: usize,
    /// Attribute name, namespace prefix included when there is one.
    pub name: String,
    /// Attribute value.
    pub value: String,
}

impl PositionedAttr {
    /// Builds a positioned attribute.
    ///
    /// `name` and `value` accept anything convertible into a `String`.
    pub fn new(position: usize, name: impl Into<String>, value: impl Into<String>) -> Self {
        let name = name.into();
        let value = value.into();
        PositionedAttr {
            position,
            name,
            value,
        }
    }
}
56
57/// A raw XML node that can be preserved during round-trip.
58#[derive(Clone, Debug, PartialEq)]
59pub enum RawXmlNode {
60    /// An XML element with name, attributes, and children.
61    Element(RawXmlElement),
62    /// Text content.
63    Text(String),
64    /// CDATA content.
65    CData(String),
66    /// A comment.
67    Comment(String),
68}
69
70/// A raw XML element with its name, attributes, and children preserved.
71#[derive(Clone, Debug, PartialEq)]
72pub struct RawXmlElement {
73    /// The full element name (including namespace prefix if present).
74    pub name: String,
75    /// Element attributes as (name, value) pairs.
76    pub attributes: Vec<(String, String)>,
77    /// Child nodes.
78    pub children: Vec<RawXmlNode>,
79    /// Whether this was a self-closing element.
80    pub self_closing: bool,
81}
82
83impl RawXmlElement {
84    /// Create a new empty element.
85    pub fn new(name: impl Into<String>) -> Self {
86        Self {
87            name: name.into(),
88            attributes: Vec::new(),
89            children: Vec::new(),
90            self_closing: false,
91        }
92    }
93
94    /// Parse a raw XML element from a reader, starting after the opening tag.
95    ///
96    /// The `start` parameter should be the BytesStart event that opened this element.
97    pub fn from_reader<R: BufRead>(reader: &mut Reader<R>, start: &BytesStart) -> Result<Self> {
98        let name = String::from_utf8_lossy(start.name().as_ref()).to_string();
99
100        let attributes = start
101            .attributes()
102            .filter_map(|a| a.ok())
103            .map(|a| {
104                (
105                    String::from_utf8_lossy(a.key.as_ref()).to_string(),
106                    String::from_utf8_lossy(&a.value).to_string(),
107                )
108            })
109            .collect();
110
111        let mut element = RawXmlElement {
112            name: name.clone(),
113            attributes,
114            children: Vec::new(),
115            self_closing: false,
116        };
117
118        let mut buf = Vec::new();
119        let target_name = start.name().as_ref().to_vec();
120
121        loop {
122            match reader.read_event_into(&mut buf) {
123                Ok(Event::Start(e)) => {
124                    let child = RawXmlElement::from_reader(reader, &e)?;
125                    element.children.push(RawXmlNode::Element(child));
126                }
127                Ok(Event::Empty(e)) => {
128                    let child = RawXmlElement::from_empty(&e);
129                    element.children.push(RawXmlNode::Element(child));
130                }
131                Ok(Event::Text(e)) => {
132                    let text = e.decode().unwrap_or_default();
133                    if !text.is_empty() {
134                        element.children.push(RawXmlNode::Text(text.to_string()));
135                    }
136                }
137                Ok(Event::CData(e)) => {
138                    let text = String::from_utf8_lossy(&e).to_string();
139                    element.children.push(RawXmlNode::CData(text));
140                }
141                Ok(Event::Comment(e)) => {
142                    let text = String::from_utf8_lossy(&e).to_string();
143                    element.children.push(RawXmlNode::Comment(text));
144                }
145                Ok(Event::End(e)) => {
146                    if e.name().as_ref() == target_name {
147                        break;
148                    }
149                }
150                Ok(Event::Eof) => {
151                    return Err(Error::Invalid(format!(
152                        "Unexpected EOF while parsing element '{}'",
153                        name
154                    )));
155                }
156                Err(e) => return Err(Error::Xml(e)),
157                _ => {}
158            }
159            buf.clear();
160        }
161
162        Ok(element)
163    }
164
165    /// Create from an empty/self-closing element.
166    pub fn from_empty(start: &BytesStart) -> Self {
167        let name = String::from_utf8_lossy(start.name().as_ref()).to_string();
168
169        let attributes = start
170            .attributes()
171            .filter_map(|a| a.ok())
172            .map(|a| {
173                (
174                    String::from_utf8_lossy(a.key.as_ref()).to_string(),
175                    String::from_utf8_lossy(&a.value).to_string(),
176                )
177            })
178            .collect();
179
180        RawXmlElement {
181            name,
182            attributes,
183            children: Vec::new(),
184            self_closing: true,
185        }
186    }
187
188    /// Parse this element as a typed struct using the FromXml trait.
189    ///
190    /// Uses a streaming approach that generates XML bytes lazily from the
191    /// in-memory tree structure, avoiding full upfront serialization.
192    ///
193    /// # Example
194    /// ```ignore
195    /// use ooxml_dml::types::CTTable;
196    /// if let Some(table) = raw_element.parse_as::<CTTable>() {
197    ///     // Use the parsed table
198    /// }
199    /// ```
200    pub fn parse_as<T: FromXml>(&self) -> std::result::Result<T, ParseError> {
201        let streaming_reader = RawXmlStreamReader::new(self);
202        let mut reader = Reader::from_reader(streaming_reader);
203        let mut buf = Vec::new();
204
205        loop {
206            match reader.read_event_into(&mut buf) {
207                Ok(Event::Start(e)) => {
208                    return T::from_xml(&mut reader, &e, false);
209                }
210                Ok(Event::Empty(e)) => {
211                    return T::from_xml(&mut reader, &e, true);
212                }
213                Ok(Event::Eof) => {
214                    return Err(ParseError::UnexpectedElement(
215                        "empty XML in parse_as".to_string(),
216                    ));
217                }
218                Err(e) => return Err(ParseError::Xml(e)),
219                _ => {}
220            }
221            buf.clear();
222        }
223    }
224
225    /// Write this element to an XML writer.
226    pub fn write_to<W: Write>(&self, writer: &mut Writer<W>) -> Result<()> {
227        let mut start = BytesStart::new(&self.name);
228        for (key, value) in &self.attributes {
229            start.push_attribute((key.as_str(), value.as_str()));
230        }
231
232        if self.self_closing && self.children.is_empty() {
233            writer.write_event(Event::Empty(start))?;
234        } else {
235            writer.write_event(Event::Start(start))?;
236
237            for child in &self.children {
238                child.write_to(writer)?;
239            }
240
241            writer.write_event(Event::End(BytesEnd::new(&self.name)))?;
242        }
243
244        Ok(())
245    }
246}
247
248impl RawXmlNode {
249    /// Write this node to an XML writer.
250    pub fn write_to<W: Write>(&self, writer: &mut Writer<W>) -> Result<()> {
251        match self {
252            RawXmlNode::Element(elem) => elem.write_to(writer),
253            RawXmlNode::Text(text) => {
254                writer.write_event(Event::Text(BytesText::new(text)))?;
255                Ok(())
256            }
257            RawXmlNode::CData(text) => {
258                writer.write_event(Event::CData(BytesCData::new(text)))?;
259                Ok(())
260            }
261            RawXmlNode::Comment(text) => {
262                writer.write_event(Event::Comment(BytesText::new(text)))?;
263                Ok(())
264            }
265        }
266    }
267}
268
269/// A streaming reader that produces XML bytes lazily from a RawXmlElement tree.
270///
271/// Implements `BufRead` so it can be used with `quick_xml::Reader::from_reader()`.
272/// This avoids allocating the full XML string upfront - bytes are generated
273/// on-demand as the parser reads.
274pub struct RawXmlStreamReader<'a> {
275    /// Stack of elements being processed (element, child_index, state)
276    stack: Vec<(&'a RawXmlElement, usize, ElementState)>,
277    /// Current buffered output
278    buffer: Vec<u8>,
279    /// Current read position in buffer
280    pos: usize,
281    /// Whether we've finished
282    done: bool,
283}
284
/// What a stack frame of [`RawXmlStreamReader`] will emit next.
#[derive(Copy, Clone, PartialEq)]
enum ElementState {
    /// The opening (or self-closing) tag has not been emitted yet.
    Start,
    /// Child nodes are being emitted.
    Children,
    /// Only the closing tag remains to be emitted.
    End,
}
294
295impl<'a> RawXmlStreamReader<'a> {
296    /// Create a new streaming reader for the given element.
297    pub fn new(elem: &'a RawXmlElement) -> Self {
298        Self {
299            stack: vec![(elem, 0, ElementState::Start)],
300            buffer: Vec::with_capacity(256),
301            pos: 0,
302            done: false,
303        }
304    }
305
306    /// Generate the next chunk of XML into the buffer.
307    fn generate_next(&mut self) {
308        self.buffer.clear();
309        self.pos = 0;
310
311        while self.buffer.is_empty() && !self.stack.is_empty() {
312            let (elem, child_idx, state) = self.stack.pop().unwrap();
313
314            match state {
315                ElementState::Start => {
316                    // Emit start tag or empty tag
317                    self.buffer.push(b'<');
318                    self.buffer.extend_from_slice(elem.name.as_bytes());
319
320                    for (key, value) in &elem.attributes {
321                        self.buffer.push(b' ');
322                        self.buffer.extend_from_slice(key.as_bytes());
323                        self.buffer.extend_from_slice(b"=\"");
324                        // Escape attribute value
325                        for &b in value.as_bytes() {
326                            match b {
327                                b'"' => self.buffer.extend_from_slice(b"&quot;"),
328                                b'&' => self.buffer.extend_from_slice(b"&amp;"),
329                                b'<' => self.buffer.extend_from_slice(b"&lt;"),
330                                _ => self.buffer.push(b),
331                            }
332                        }
333                        self.buffer.push(b'"');
334                    }
335
336                    if elem.self_closing && elem.children.is_empty() {
337                        self.buffer.extend_from_slice(b"/>");
338                        // Done with this element
339                    } else {
340                        self.buffer.push(b'>');
341                        // Push back to process children
342                        self.stack.push((elem, 0, ElementState::Children));
343                    }
344                }
345                ElementState::Children => {
346                    if child_idx < elem.children.len() {
347                        // Push back with next child index
348                        self.stack
349                            .push((elem, child_idx + 1, ElementState::Children));
350
351                        // Process current child
352                        match &elem.children[child_idx] {
353                            RawXmlNode::Element(child) => {
354                                self.stack.push((child, 0, ElementState::Start));
355                            }
356                            RawXmlNode::Text(text) => {
357                                // Escape text content
358                                for &b in text.as_bytes() {
359                                    match b {
360                                        b'&' => self.buffer.extend_from_slice(b"&amp;"),
361                                        b'<' => self.buffer.extend_from_slice(b"&lt;"),
362                                        b'>' => self.buffer.extend_from_slice(b"&gt;"),
363                                        _ => self.buffer.push(b),
364                                    }
365                                }
366                            }
367                            RawXmlNode::CData(text) => {
368                                self.buffer.extend_from_slice(b"<![CDATA[");
369                                self.buffer.extend_from_slice(text.as_bytes());
370                                self.buffer.extend_from_slice(b"]]>");
371                            }
372                            RawXmlNode::Comment(text) => {
373                                self.buffer.extend_from_slice(b"<!--");
374                                self.buffer.extend_from_slice(text.as_bytes());
375                                self.buffer.extend_from_slice(b"-->");
376                            }
377                        }
378                    } else {
379                        // Done with children, emit end tag
380                        self.stack.push((elem, 0, ElementState::End));
381                    }
382                }
383                ElementState::End => {
384                    self.buffer.extend_from_slice(b"</");
385                    self.buffer.extend_from_slice(elem.name.as_bytes());
386                    self.buffer.push(b'>');
387                }
388            }
389        }
390
391        if self.stack.is_empty() && self.buffer.is_empty() {
392            self.done = true;
393        }
394    }
395}
396
397impl<'a> std::io::Read for RawXmlStreamReader<'a> {
398    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
399        if self.pos >= self.buffer.len() {
400            if self.done {
401                return Ok(0);
402            }
403            self.generate_next();
404            if self.done && self.buffer.is_empty() {
405                return Ok(0);
406            }
407        }
408
409        let remaining = &self.buffer[self.pos..];
410        let to_copy = remaining.len().min(buf.len());
411        buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
412        self.pos += to_copy;
413        Ok(to_copy)
414    }
415}
416
417impl<'a> BufRead for RawXmlStreamReader<'a> {
418    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
419        if self.pos >= self.buffer.len() {
420            if self.done {
421                return Ok(&[]);
422            }
423            self.generate_next();
424        }
425        Ok(&self.buffer[self.pos..])
426    }
427
428    fn consume(&mut self, amt: usize) {
429        self.pos += amt;
430    }
431}
432
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Parse `xml` and return its root element.
    ///
    /// Panics when the first event is not a start tag, so that a test can
    /// never pass without having executed its assertions.
    fn parse_root(xml: &str) -> RawXmlElement {
        let mut reader = Reader::from_str(xml);
        let mut buf = Vec::new();

        let root = match reader.read_event_into(&mut buf) {
            Ok(Event::Start(start)) => RawXmlElement::from_reader(&mut reader, &start)
                .expect("root element should parse"),
            other => panic!("expected a start tag as first event, got {:?}", other),
        };
        root
    }

    #[test]
    fn test_parse_simple_element() {
        let elem = parse_root(r#"<w:test attr="value">content</w:test>"#);

        assert_eq!(elem.name, "w:test");
        assert_eq!(
            elem.attributes,
            vec![("attr".to_string(), "value".to_string())]
        );
        assert_eq!(elem.children.len(), 1);
        match &elem.children[0] {
            RawXmlNode::Text(t) => assert_eq!(t, "content"),
            other => panic!("Expected text node, got {:?}", other),
        }
    }

    #[test]
    fn test_parse_nested_elements() {
        let elem = parse_root(r#"<parent><child1/><child2>text</child2></parent>"#);

        assert_eq!(elem.name, "parent");
        assert_eq!(elem.children.len(), 2);
    }

    #[test]
    fn test_roundtrip() {
        let xml = r#"<w:test attr="value"><w:child>text</w:child></w:test>"#;
        let elem = parse_root(xml);

        let mut output = Vec::new();
        let mut writer = Writer::new(Cursor::new(&mut output));
        elem.write_to(&mut writer).unwrap();

        let output_str = String::from_utf8(output).unwrap();
        assert_eq!(output_str, xml);
    }

    #[test]
    fn test_streaming_reader() {
        use std::io::Read;

        let xml = r#"<parent attr="val"><child>text</child></parent>"#;
        let elem = parse_root(xml);

        // Read from streaming reader
        let mut stream_reader = RawXmlStreamReader::new(&elem);
        let mut output = String::new();
        stream_reader.read_to_string(&mut output).unwrap();

        assert_eq!(output, xml);
    }

    #[test]
    fn test_streaming_reader_escaping() {
        use std::io::Read;

        // Test that special characters are properly escaped
        let mut elem = RawXmlElement::new("test");
        elem.attributes
            .push(("attr".to_string(), "val\"ue".to_string()));
        elem.children
            .push(RawXmlNode::Text("a < b & c > d".to_string()));

        let mut stream_reader = RawXmlStreamReader::new(&elem);
        let mut output = String::new();
        stream_reader.read_to_string(&mut output).unwrap();

        assert_eq!(
            output,
            r#"<test attr="val&quot;ue">a &lt; b &amp; c &gt; d</test>"#
        );
    }
}
531}