1extern crate alloc;
4
5use alloc::borrow::Cow;
6use alloc::string::String;
7use alloc::vec::Vec;
8use core::fmt;
9use std::io::Cursor;
10
11use facet_dom::{DomEvent, DomParser};
12use quick_xml::NsReader;
13use quick_xml::escape::resolve_xml_entity;
14use quick_xml::events::Event;
15use quick_xml::name::ResolveResult;
16
/// Errors reported by the XML parser in this module.
#[derive(Debug, Clone)]
pub enum XmlError {
    /// A parse failure, carrying the already-rendered error message.
    Parse(String),
    /// The XML input ended unexpectedly.
    UnexpectedEof,
    /// The XML tags are unbalanced.
    UnbalancedTags,
    /// A byte sequence in the XML was not valid UTF-8.
    InvalidUtf8(core::str::Utf8Error),
}

impl fmt::Display for XmlError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Fixed messages need no formatting machinery.
            Self::UnexpectedEof => f.write_str("Unexpected end of XML"),
            Self::UnbalancedTags => f.write_str("Unbalanced XML tags"),
            // Messages that embed a payload.
            Self::Parse(detail) => write!(f, "XML parse error: {detail}"),
            Self::InvalidUtf8(cause) => write!(f, "Invalid UTF-8 in XML: {cause}"),
        }
    }
}

impl std::error::Error for XmlError {}
42
/// Pull parser that reads XML from a byte slice and turns it into a stream
/// of [`DomEvent`]s.
pub struct XmlParser<'de> {
    /// Namespace-aware quick-xml reader over a cursor on the input bytes.
    reader: NsReader<Cursor<&'de [u8]>>,
    /// The complete input, kept so raw node text can be sliced out of it.
    input: &'de [u8],
    /// Scratch buffer handed to the reader; cleared before every read.
    buf: Vec<u8>,
    /// One event of lookahead: filled by `peek_event`, drained by `next_event`.
    peeked: Option<DomEvent<'de>>,
    /// Number of currently open elements: incremented when a start tag is
    /// read, decremented when the matching `NodeEnd` is emitted.
    depth: usize,
    /// Attributes of the most recent start tag, stored as
    /// `(namespace, local name, value)` until they are emitted.
    pending_attrs: Vec<(Option<String>, String, String)>,
    /// Index of the next entry of `pending_attrs` to emit.
    attr_idx: usize,
    /// Which step of the event sequence `read_next` performs next.
    state: ParserState,
    /// Whether the most recent start tag was an empty (self-closing) element.
    is_empty_element: bool,
    /// Reader position recorded just before the most recent start tag was read.
    node_start_pos: u64,
}
65
/// Steps of the state machine driven by `XmlParser::read_next`.
#[derive(Debug, Clone, Copy, PartialEq)]
enum ParserState {
    /// Initial state; the next call reads an event from the XML reader.
    Ready,
    /// A start tag was read and its attributes are being emitted one per call.
    EmittingAttrs,
    /// The next call emits `ChildrenStart`.
    NeedChildrenStart,
    /// Inside an element; the next call reads an event from the XML reader.
    InChildren,
    /// The next call emits `ChildrenEnd`.
    NeedChildrenEnd,
    /// The next call emits `NodeEnd` and decrements the depth.
    NeedNodeEnd,
    /// Parsing is finished; every further call yields `None`.
    Done,
}
83
84impl<'de> XmlParser<'de> {
85 pub fn new(input: &'de [u8]) -> Self {
87 trace!(input_len = input.len(), "creating XML parser");
88
89 let mut reader = NsReader::from_reader(Cursor::new(input));
90 reader.config_mut().trim_text(true);
91
92 Self {
93 reader,
94 input,
95 buf: Vec::new(),
96 peeked: None,
97 depth: 0,
98 pending_attrs: Vec::new(),
99 attr_idx: 0,
100 state: ParserState::Ready,
101 is_empty_element: false,
102 node_start_pos: 0,
103 }
104 }
105
106 fn do_capture_raw_node(&mut self) -> Result<Cow<'de, str>, XmlError> {
109 let start = self.node_start_pos as usize;
111 let start_depth = self.depth;
112
113 loop {
115 let event = if let Some(e) = self.peeked.take() {
117 Some(e)
118 } else {
119 self.read_next()?
120 };
121
122 match event {
123 Some(DomEvent::NodeEnd) if self.depth < start_depth => break,
124 None => break,
125 _ => {}
126 }
127 }
128
129 let end = self.reader.buffer_position() as usize;
130 let raw = &self.input[start..end];
131 let s = core::str::from_utf8(raw).map_err(XmlError::InvalidUtf8)?;
132 Ok(Cow::Borrowed(s))
133 }
134
135 fn read_next(&mut self) -> Result<Option<DomEvent<'de>>, XmlError> {
137 loop {
138 match self.state {
139 ParserState::Done => return Ok(None),
140
141 ParserState::EmittingAttrs => {
142 if self.attr_idx < self.pending_attrs.len() {
143 let (ns, name, value) = &self.pending_attrs[self.attr_idx];
144 let event = DomEvent::Attribute {
145 name: Cow::Owned(name.clone()),
146 value: Cow::Owned(value.clone()),
147 namespace: ns.clone().map(Cow::Owned),
148 };
149 self.attr_idx += 1;
150 return Ok(Some(event));
151 }
152 self.pending_attrs.clear();
154 self.attr_idx = 0;
155 self.state = ParserState::NeedChildrenStart;
156 }
157
158 ParserState::NeedChildrenStart => {
159 if self.is_empty_element {
160 self.state = ParserState::NeedChildrenEnd;
161 self.is_empty_element = false;
162 } else {
163 self.state = ParserState::InChildren;
164 }
165 return Ok(Some(DomEvent::ChildrenStart));
166 }
167
168 ParserState::NeedChildrenEnd => {
169 self.state = ParserState::NeedNodeEnd;
170 return Ok(Some(DomEvent::ChildrenEnd));
171 }
172
173 ParserState::NeedNodeEnd => {
174 self.depth -= 1;
175 self.state = if self.depth == 0 {
176 ParserState::Done
177 } else {
178 ParserState::InChildren
179 };
180 return Ok(Some(DomEvent::NodeEnd));
181 }
182
183 ParserState::Ready | ParserState::InChildren => {
184 let pos_before = self.reader.buffer_position();
186
187 self.buf.clear();
188 let (resolve, event) = self
189 .reader
190 .read_resolved_event_into(&mut self.buf)
191 .map_err(|e| XmlError::Parse(e.to_string()))?;
192
193 let elem_ns = resolve_namespace(resolve)?;
195
196 match event {
197 Event::Start(ref e) | Event::Empty(ref e) => {
198 let is_empty = matches!(event, Event::Empty(_));
199 self.node_start_pos = pos_before;
201
202 let local_name = e.local_name();
204 let local = core::str::from_utf8(local_name.as_ref())
205 .map_err(XmlError::InvalidUtf8)?;
206 let local_owned = local.to_string();
207
208 self.pending_attrs.clear();
210 self.attr_idx = 0;
211
212 for attr in e.attributes() {
213 let attr = attr.map_err(|e| XmlError::Parse(e.to_string()))?;
214
215 let key = attr.key;
217 if key.as_ref() == b"xmlns" {
218 continue;
219 }
220 if let Some(prefix) = key.prefix()
221 && prefix.as_ref() == b"xmlns"
222 {
223 continue;
224 }
225
226 let (attr_resolve, _) =
227 self.reader.resolver().resolve_attribute(key);
228 let attr_ns = resolve_namespace(attr_resolve)?;
229 let attr_local_name = key.local_name();
230 let attr_local = core::str::from_utf8(attr_local_name.as_ref())
231 .map_err(XmlError::InvalidUtf8)?;
232 let value = attr
233 .unescape_value()
234 .map_err(|e| XmlError::Parse(e.to_string()))?;
235
236 self.pending_attrs.push((
237 attr_ns,
238 attr_local.to_string(),
239 value.into_owned(),
240 ));
241 }
242
243 self.depth += 1;
244 self.is_empty_element = is_empty;
245
246 if self.pending_attrs.is_empty() {
247 self.state = ParserState::NeedChildrenStart;
248 } else {
249 self.state = ParserState::EmittingAttrs;
250 }
251
252 return Ok(Some(DomEvent::NodeStart {
253 tag: Cow::Owned(local_owned),
254 namespace: elem_ns.map(Cow::Owned),
255 }));
256 }
257 Event::End(_) => {
258 self.state = ParserState::NeedChildrenEnd;
259 }
260 Event::Text(e) => {
261 let text = e.decode().map_err(|e| XmlError::Parse(e.to_string()))?;
262 let trimmed = text.trim();
263 if !trimmed.is_empty() {
264 return Ok(Some(DomEvent::Text(Cow::Owned(trimmed.to_string()))));
265 }
266 }
267 Event::CData(e) => {
268 let text =
269 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
270 if !text.is_empty() {
271 return Ok(Some(DomEvent::Text(Cow::Owned(text.to_string()))));
272 }
273 }
274 Event::Comment(e) => {
275 let text =
276 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
277 return Ok(Some(DomEvent::Comment(Cow::Owned(text.to_string()))));
278 }
279 Event::PI(e) => {
280 let content =
281 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
282 let (target, data) = content
283 .split_once(char::is_whitespace)
284 .unwrap_or((content, ""));
285 return Ok(Some(DomEvent::ProcessingInstruction {
286 target: Cow::Owned(target.to_string()),
287 data: Cow::Owned(data.trim().to_string()),
288 }));
289 }
290 Event::Decl(_) => {
291 }
293 Event::DocType(e) => {
294 let text =
296 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
297 return Ok(Some(DomEvent::Doctype(Cow::Owned(text.to_string()))));
298 }
299 Event::Eof => {
300 self.state = ParserState::Done;
301 return Ok(None);
302 }
303 Event::GeneralRef(e) => {
304 let raw = e.decode().map_err(|e| XmlError::Parse(e.to_string()))?;
305 let resolved = resolve_entity(&raw)?;
306 return Ok(Some(DomEvent::Text(Cow::Owned(resolved))));
307 }
308 }
309 }
310 }
311 }
312 }
313}
314
315impl<'de> DomParser<'de> for XmlParser<'de> {
316 type Error = XmlError;
317
318 fn next_event(&mut self) -> Result<Option<DomEvent<'de>>, Self::Error> {
319 if let Some(event) = self.peeked.take() {
320 return Ok(Some(event));
321 }
322 self.read_next()
323 }
324
325 fn peek_event(&mut self) -> Result<Option<&DomEvent<'de>>, Self::Error> {
326 if self.peeked.is_none() {
327 self.peeked = self.read_next()?;
328 }
329 Ok(self.peeked.as_ref())
330 }
331
332 fn skip_node(&mut self) -> Result<(), Self::Error> {
333 let start_depth = self.depth;
334
335 loop {
336 let event = self.next_event()?;
337 match event {
338 Some(DomEvent::NodeEnd) => {
339 if self.depth < start_depth {
340 break;
341 }
342 }
343 None => break,
344 _ => {}
345 }
346 }
347
348 Ok(())
349 }
350
351 fn current_span(&self) -> Option<facet_reflect::Span> {
352 None
353 }
354
355 fn format_namespace(&self) -> Option<&'static str> {
356 Some("xml")
357 }
358
359 fn capture_raw_node(&mut self) -> Result<Option<Cow<'de, str>>, Self::Error> {
360 Ok(Some(self.do_capture_raw_node()?))
361 }
362}
363
364fn resolve_namespace(resolve: ResolveResult<'_>) -> Result<Option<String>, XmlError> {
366 match resolve {
367 ResolveResult::Bound(ns) => Ok(Some(String::from_utf8_lossy(ns.as_ref()).into_owned())),
368 ResolveResult::Unbound => Ok(None),
369 ResolveResult::Unknown(_) => Ok(None),
370 }
371}
372
373fn resolve_entity(raw: &str) -> Result<String, XmlError> {
375 if let Some(resolved) = resolve_xml_entity(raw) {
376 return Ok(resolved.into());
377 }
378
379 if let Some(rest) = raw.strip_prefix('#') {
380 let code = if let Some(hex) = rest.strip_prefix('x').or_else(|| rest.strip_prefix('X')) {
381 u32::from_str_radix(hex, 16)
382 .map_err(|_| XmlError::Parse(format!("Invalid hex entity: #{}", rest)))?
383 } else {
384 rest.parse::<u32>()
385 .map_err(|_| XmlError::Parse(format!("Invalid decimal entity: #{}", rest)))?
386 };
387
388 let ch = char::from_u32(code)
389 .ok_or_else(|| XmlError::Parse(format!("Invalid Unicode: {}", code)))?;
390 return Ok(ch.to_string());
391 }
392
393 Ok(format!("&{};", raw))
394}