1extern crate alloc;
4
5use alloc::borrow::Cow;
6use alloc::string::String;
7use alloc::vec::Vec;
8use core::fmt;
9use std::io::Cursor;
10
11use facet_dom::{DomEvent, DomParser};
12use quick_xml::NsReader;
13use quick_xml::escape::resolve_xml_entity;
14use quick_xml::events::Event;
15use quick_xml::name::ResolveResult;
16
/// Errors reported by the XML DOM parser in this file.
#[derive(Debug, Clone)]
pub enum XmlError {
    /// A parsing failure described by a human-readable message (for example
    /// the stringified error returned by the underlying XML reader).
    Parse(String),
    /// The XML input ended unexpectedly.
    UnexpectedEof,
    /// Opening and closing tags do not balance.
    UnbalancedTags,
    /// A byte sequence that had to be read as text was not valid UTF-8.
    InvalidUtf8(core::str::Utf8Error),
}
29
30impl fmt::Display for XmlError {
31 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32 match self {
33 XmlError::Parse(msg) => write!(f, "XML parse error: {}", msg),
34 XmlError::UnexpectedEof => write!(f, "Unexpected end of XML"),
35 XmlError::UnbalancedTags => write!(f, "Unbalanced XML tags"),
36 XmlError::InvalidUtf8(e) => write!(f, "Invalid UTF-8 in XML: {}", e),
37 }
38 }
39}
40
// Marks `XmlError` as a standard error type; every trait method keeps its
// default implementation, and the message comes from the `Display` impl.
impl std::error::Error for XmlError {}
42
/// Pull parser that turns an XML byte slice into a stream of [`DomEvent`]s.
///
/// Events are produced by a small state machine (see `ParserState`) driven
/// from `read_next`.
pub struct XmlParser<'de> {
    /// Namespace-aware reader positioned over a cursor on `input`.
    reader: NsReader<Cursor<&'de [u8]>>,
    /// The complete input; sliced directly when a raw node is captured.
    input: &'de [u8],
    /// Scratch buffer handed to the reader; cleared before every read.
    buf: Vec<u8>,
    /// One-event look-ahead filled by `peek_event` and drained by `next_event`.
    peeked: Option<DomEvent<'de>>,
    /// Number of elements opened whose `NodeEnd` has not been emitted yet.
    depth: usize,
    /// Attributes of the most recent start tag awaiting emission, stored as
    /// `(namespace, local name, value)`.
    pending_attrs: Vec<(Option<String>, String, String)>,
    /// Index into `pending_attrs` of the next attribute to emit.
    attr_idx: usize,
    /// Current position in the event state machine.
    state: ParserState,
    /// Whether the most recently read element was self-closing (`<a/>`).
    is_empty_element: bool,
    /// Reader position recorded just before the most recent start tag was
    /// read; used as the start offset when capturing a raw node.
    node_start_pos: u64,
}
65
/// States of the event state machine driven by `XmlParser::read_next`.
#[derive(Debug, Clone, Copy, PartialEq)]
enum ParserState {
    /// Initial state: the next call reads an event from the underlying reader.
    Ready,
    /// A `NodeStart` was emitted and its queued attributes are being emitted
    /// one per call.
    EmittingAttrs,
    /// The next event to emit is `ChildrenStart`.
    NeedChildrenStart,
    /// Inside an element: the next call reads an event from the underlying reader.
    InChildren,
    /// The next event to emit is `ChildrenEnd`.
    NeedChildrenEnd,
    /// The next event to emit is `NodeEnd`.
    NeedNodeEnd,
    /// The stream is finished; every further read yields `None`.
    Done,
}
83
84impl<'de> XmlParser<'de> {
85 pub fn new(input: &'de [u8]) -> Self {
87 trace!(input_len = input.len(), "creating XML parser");
88
89 let mut reader = NsReader::from_reader(Cursor::new(input));
90 reader.config_mut().trim_text(true);
91
92 Self {
93 reader,
94 input,
95 buf: Vec::new(),
96 peeked: None,
97 depth: 0,
98 pending_attrs: Vec::new(),
99 attr_idx: 0,
100 state: ParserState::Ready,
101 is_empty_element: false,
102 node_start_pos: 0,
103 }
104 }
105
106 fn do_capture_raw_node(&mut self) -> Result<Cow<'de, str>, XmlError> {
109 let start = self.node_start_pos as usize;
111 let start_depth = self.depth;
112
113 loop {
115 let event = if let Some(e) = self.peeked.take() {
117 Some(e)
118 } else {
119 self.read_next()?
120 };
121
122 match event {
123 Some(DomEvent::NodeEnd) if self.depth < start_depth => break,
124 None => break,
125 _ => {}
126 }
127 }
128
129 let end = self.reader.buffer_position() as usize;
130 let raw = &self.input[start..end];
131 let s = core::str::from_utf8(raw).map_err(XmlError::InvalidUtf8)?;
132 Ok(Cow::Borrowed(s))
133 }
134
135 fn read_next(&mut self) -> Result<Option<DomEvent<'de>>, XmlError> {
137 loop {
138 match self.state {
139 ParserState::Done => return Ok(None),
140
141 ParserState::EmittingAttrs => {
142 if self.attr_idx < self.pending_attrs.len() {
143 let (ns, name, value) = &self.pending_attrs[self.attr_idx];
144 let event = DomEvent::Attribute {
145 name: Cow::Owned(name.clone()),
146 value: Cow::Owned(value.clone()),
147 namespace: ns.clone().map(Cow::Owned),
148 };
149 self.attr_idx += 1;
150 return Ok(Some(event));
151 }
152 self.pending_attrs.clear();
154 self.attr_idx = 0;
155 self.state = ParserState::NeedChildrenStart;
156 }
157
158 ParserState::NeedChildrenStart => {
159 if self.is_empty_element {
160 self.state = ParserState::NeedChildrenEnd;
161 self.is_empty_element = false;
162 } else {
163 self.state = ParserState::InChildren;
164 }
165 return Ok(Some(DomEvent::ChildrenStart));
166 }
167
168 ParserState::NeedChildrenEnd => {
169 self.state = ParserState::NeedNodeEnd;
170 return Ok(Some(DomEvent::ChildrenEnd));
171 }
172
173 ParserState::NeedNodeEnd => {
174 self.depth -= 1;
175 self.state = if self.depth == 0 {
176 ParserState::Done
177 } else {
178 ParserState::InChildren
179 };
180 return Ok(Some(DomEvent::NodeEnd));
181 }
182
183 ParserState::Ready | ParserState::InChildren => {
184 let pos_before = self.reader.buffer_position();
186
187 self.buf.clear();
188 let (resolve, event) = self
189 .reader
190 .read_resolved_event_into(&mut self.buf)
191 .map_err(|e| XmlError::Parse(e.to_string()))?;
192
193 let elem_ns = resolve_namespace(resolve)?;
195
196 match event {
197 Event::Start(ref e) | Event::Empty(ref e) => {
198 let is_empty = matches!(event, Event::Empty(_));
199 self.node_start_pos = pos_before;
201
202 let local_name = e.local_name();
204 let local = core::str::from_utf8(local_name.as_ref())
205 .map_err(XmlError::InvalidUtf8)?;
206 let local_owned = local.to_string();
207
208 self.pending_attrs.clear();
210 self.attr_idx = 0;
211
212 for attr in e.attributes() {
213 let attr = attr.map_err(|e| XmlError::Parse(e.to_string()))?;
214
215 let key = attr.key;
217 if key.as_ref() == b"xmlns" {
218 continue;
219 }
220 if let Some(prefix) = key.prefix()
221 && prefix.as_ref() == b"xmlns"
222 {
223 continue;
224 }
225
226 let (attr_resolve, _) =
227 self.reader.resolver().resolve_attribute(key);
228 let attr_ns = resolve_namespace(attr_resolve)?;
229 let attr_local_name = key.local_name();
230 let attr_local = core::str::from_utf8(attr_local_name.as_ref())
231 .map_err(XmlError::InvalidUtf8)?;
232
233 let value = attr
234 .decode_and_unescape_value(self.reader.decoder())
235 .map_err(|e| XmlError::Parse(e.to_string()))?;
236
237 self.pending_attrs.push((
238 attr_ns,
239 attr_local.to_string(),
240 value.into_owned(),
241 ));
242 }
243
244 self.depth += 1;
245 self.is_empty_element = is_empty;
246
247 if self.pending_attrs.is_empty() {
248 self.state = ParserState::NeedChildrenStart;
249 } else {
250 self.state = ParserState::EmittingAttrs;
251 }
252
253 return Ok(Some(DomEvent::NodeStart {
254 tag: Cow::Owned(local_owned),
255 namespace: elem_ns.map(Cow::Owned),
256 }));
257 }
258 Event::End(_) => {
259 self.state = ParserState::NeedChildrenEnd;
260 }
261 Event::Text(e) => {
262 let text = e.decode().map_err(|e| XmlError::Parse(e.to_string()))?;
263 let trimmed = text.trim();
264 if !trimmed.is_empty() {
265 return Ok(Some(DomEvent::Text(Cow::Owned(trimmed.to_string()))));
266 }
267 }
268 Event::CData(e) => {
269 let text =
270 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
271 if !text.is_empty() {
272 return Ok(Some(DomEvent::Text(Cow::Owned(text.to_string()))));
273 }
274 }
275 Event::Comment(e) => {
276 let text =
277 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
278 return Ok(Some(DomEvent::Comment(Cow::Owned(text.to_string()))));
279 }
280 Event::PI(e) => {
281 let content =
282 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
283 let (target, data) = content
284 .split_once(char::is_whitespace)
285 .unwrap_or((content, ""));
286 return Ok(Some(DomEvent::ProcessingInstruction {
287 target: Cow::Owned(target.to_string()),
288 data: Cow::Owned(data.trim().to_string()),
289 }));
290 }
291 Event::Decl(_) => {
292 }
294 Event::DocType(e) => {
295 let text =
297 core::str::from_utf8(e.as_ref()).map_err(XmlError::InvalidUtf8)?;
298 return Ok(Some(DomEvent::Doctype(Cow::Owned(text.to_string()))));
299 }
300 Event::Eof => {
301 self.state = ParserState::Done;
302 return Ok(None);
303 }
304 Event::GeneralRef(e) => {
305 let raw = e.decode().map_err(|e| XmlError::Parse(e.to_string()))?;
306 let resolved = resolve_entity(&raw)?;
307 return Ok(Some(DomEvent::Text(Cow::Owned(resolved))));
308 }
309 }
310 }
311 }
312 }
313 }
314}
315
316impl<'de> DomParser<'de> for XmlParser<'de> {
317 type Error = XmlError;
318
319 fn next_event(&mut self) -> Result<Option<DomEvent<'de>>, Self::Error> {
320 if let Some(event) = self.peeked.take() {
321 return Ok(Some(event));
322 }
323 self.read_next()
324 }
325
326 fn peek_event(&mut self) -> Result<Option<&DomEvent<'de>>, Self::Error> {
327 if self.peeked.is_none() {
328 self.peeked = self.read_next()?;
329 }
330 Ok(self.peeked.as_ref())
331 }
332
333 fn skip_node(&mut self) -> Result<(), Self::Error> {
334 let start_depth = self.depth;
335
336 loop {
337 let event = self.next_event()?;
338 match event {
339 Some(DomEvent::NodeEnd) => {
340 if self.depth < start_depth {
341 break;
342 }
343 }
344 None => break,
345 _ => {}
346 }
347 }
348
349 Ok(())
350 }
351
352 fn current_span(&self) -> Option<facet_reflect::Span> {
353 None
354 }
355
356 fn format_namespace(&self) -> Option<&'static str> {
357 Some("xml")
358 }
359
360 fn capture_raw_node(&mut self) -> Result<Option<Cow<'de, str>>, Self::Error> {
361 Ok(Some(self.do_capture_raw_node()?))
362 }
363}
364
365fn resolve_namespace(resolve: ResolveResult<'_>) -> Result<Option<String>, XmlError> {
367 match resolve {
368 ResolveResult::Bound(ns) => Ok(Some(String::from_utf8_lossy(ns.as_ref()).into_owned())),
369 ResolveResult::Unbound => Ok(None),
370 ResolveResult::Unknown(_) => Ok(None),
371 }
372}
373
374fn resolve_entity(raw: &str) -> Result<String, XmlError> {
376 if let Some(resolved) = resolve_xml_entity(raw) {
377 return Ok(resolved.into());
378 }
379
380 if let Some(rest) = raw.strip_prefix('#') {
381 let code = if let Some(hex) = rest.strip_prefix('x').or_else(|| rest.strip_prefix('X')) {
382 u32::from_str_radix(hex, 16)
383 .map_err(|_| XmlError::Parse(format!("Invalid hex entity: #{}", rest)))?
384 } else {
385 rest.parse::<u32>()
386 .map_err(|_| XmlError::Parse(format!("Invalid decimal entity: #{}", rest)))?
387 };
388
389 let ch = char::from_u32(code)
390 .ok_or_else(|| XmlError::Parse(format!("Invalid Unicode: {}", code)))?;
391 return Ok(ch.to_string());
392 }
393
394 Ok(format!("&{};", raw))
395}