Skip to main content

docx_rs/reader/
xml_parser.rs

1use std::collections::VecDeque;
2use std::io::{BufReader, Read};
3
4use quick_xml::encoding::Decoder;
5use quick_xml::events::{BytesEnd, BytesStart, Event};
6use quick_xml::Reader;
7
/// An owned XML qualified name, split into its prefix and local part.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OwnedName {
    /// The name without its prefix (the part after the first `:` when a
    /// prefix is present, otherwise the whole name).
    pub local_name: String,
    /// Namespace URI associated with the name, if any. `split_qname` in this
    /// file always leaves this as `None`.
    pub namespace: Option<String>,
    /// The prefix that preceded the first `:` in the raw name, if there was one.
    pub prefix: Option<String>,
}
14
15#[derive(Clone, Debug, PartialEq, Eq)]
16pub struct OwnedAttribute {
17    pub name: OwnedName,
18    pub value: String,
19}
20
/// An ordered list of `(String, String)` namespace mapping pairs.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Namespace {
    /// The stored pairs, in insertion order.
    mappings: Vec<(String, String)>,
}

impl Namespace {
    /// Returns a `Namespace` that holds no mappings.
    pub fn empty() -> Self {
        // The derived `Default` yields an empty `Vec`, which is exactly the
        // empty mapping set.
        Self::default()
    }
}

impl IntoIterator for Namespace {
    type Item = (String, String);
    type IntoIter = std::vec::IntoIter<(String, String)>;

    /// Consumes the namespace and yields its pairs in stored order.
    fn into_iter(self) -> Self::IntoIter {
        let Namespace { mappings } = self;
        mappings.into_iter()
    }
}
42
43#[derive(Clone, Debug, PartialEq)]
44pub enum XmlEvent {
45    StartElement {
46        name: OwnedName,
47        attributes: Vec<OwnedAttribute>,
48        namespace: Namespace,
49    },
50    EndElement {
51        name: OwnedName,
52    },
53    Characters(String),
54    Whitespace(String),
55    EndDocument,
56}
57
58pub struct EventReader<R: Read> {
59    reader: Reader<BufReader<R>>,
60    buf: Vec<u8>,
61    pending: VecDeque<XmlEvent>,
62    finished: bool,
63}
64
65impl<R: Read> EventReader<R> {
66    pub fn new(reader: R) -> Self {
67        let mut reader = Reader::from_reader(BufReader::new(reader));
68        {
69            let config = reader.config_mut();
70            config.trim_text(false);
71            config.check_end_names = true;
72            config.expand_empty_elements = false;
73        }
74        Self {
75            reader,
76            buf: Vec::new(),
77            pending: VecDeque::new(),
78            finished: false,
79        }
80    }
81
82    pub fn next(&mut self) -> Result<XmlEvent, quick_xml::Error> {
83        self.read_next()
84    }
85
86    fn read_next(&mut self) -> Result<XmlEvent, quick_xml::Error> {
87        if let Some(event) = self.pending.pop_front() {
88            return Ok(event);
89        }
90
91        loop {
92            self.buf.clear();
93            match self.reader.read_event_into(&mut self.buf)? {
94                Event::Start(element) => {
95                    let decoder = self.reader.decoder();
96                    let event = Self::build_start_event(element, decoder)?;
97                    return Ok(event);
98                }
99                Event::Empty(element) => {
100                    let decoder = self.reader.decoder();
101                    let event = Self::build_start_event(element, decoder)?;
102                    if let XmlEvent::StartElement { name, .. } = &event {
103                        self.pending
104                            .push_back(XmlEvent::EndElement { name: name.clone() });
105                    }
106                    return Ok(event);
107                }
108                Event::End(element) => {
109                    let name = build_name_from_end(&element)?;
110                    return Ok(XmlEvent::EndElement { name });
111                }
112                Event::Text(text) => {
113                    let text = text.unescape()?.into_owned();
114                    if text.chars().all(char::is_whitespace) {
115                        return Ok(XmlEvent::Whitespace(text));
116                    } else {
117                        return Ok(XmlEvent::Characters(text));
118                    }
119                }
120                Event::CData(text) => {
121                    let decoded = self.reader.decoder().decode(text.as_ref())?.into_owned();
122                    return Ok(XmlEvent::Characters(decoded));
123                }
124                Event::Eof => {
125                    self.finished = true;
126                    return Ok(XmlEvent::EndDocument);
127                }
128                Event::Decl(_) | Event::PI(_) | Event::Comment(_) | Event::DocType(_) => {
129                    // Skip non-structural events
130                }
131            }
132        }
133    }
134
135    fn build_start_event(
136        element: BytesStart<'_>,
137        decoder: Decoder,
138    ) -> Result<XmlEvent, quick_xml::Error> {
139        let name = build_name_from_start(&element)?;
140        let attributes = build_attributes(&element, decoder)?;
141        Ok(XmlEvent::StartElement {
142            name,
143            attributes,
144            namespace: Namespace::empty(),
145        })
146    }
147}
148
149impl<R: Read> Iterator for EventReader<R> {
150    type Item = Result<XmlEvent, quick_xml::Error>;
151
152    fn next(&mut self) -> Option<Self::Item> {
153        if self.finished {
154            return None;
155        }
156        match self.read_next() {
157            Ok(XmlEvent::EndDocument) => {
158                self.finished = true;
159                Some(Ok(XmlEvent::EndDocument))
160            }
161            Ok(event) => Some(Ok(event)),
162            Err(e) => {
163                self.finished = true;
164                Some(Err(e))
165            }
166        }
167    }
168}
169
170fn build_name_from_start(element: &BytesStart<'_>) -> Result<OwnedName, quick_xml::Error> {
171    let name = element.name();
172    Ok(split_qname(name.as_ref()))
173}
174
175fn build_name_from_end(element: &BytesEnd<'_>) -> Result<OwnedName, quick_xml::Error> {
176    let name = element.name();
177    Ok(split_qname(name.as_ref()))
178}
179
180fn split_qname(raw: &[u8]) -> OwnedName {
181    let text = String::from_utf8_lossy(raw).into_owned();
182    if let Some(idx) = text.find(':') {
183        let prefix = text[..idx].to_string();
184        let local = text[idx + 1..].to_string();
185        OwnedName {
186            local_name: local,
187            namespace: None,
188            prefix: Some(prefix),
189        }
190    } else {
191        OwnedName {
192            local_name: text,
193            namespace: None,
194            prefix: None,
195        }
196    }
197}
198
199fn build_attributes(
200    element: &BytesStart<'_>,
201    decoder: Decoder,
202) -> Result<Vec<OwnedAttribute>, quick_xml::Error> {
203    let mut attributes = Vec::new();
204    for attr_result in element.attributes().with_checks(false) {
205        let attr = attr_result.map_err(quick_xml::Error::from)?;
206        let value = attr.decode_and_unescape_value(decoder)?.into_owned();
207        let name = split_qname(attr.key.as_ref());
208        attributes.push(OwnedAttribute { name, value });
209    }
210    Ok(attributes)
211}