1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{DecodeError, EditXMLError, MalformedReason, Result};
4use crate::types::StandaloneValue;
5use crate::utils::HashMap;
6use crate::utils::{bytes_to_unescaped_string, XMLStringUtils};
7use encoding_rs::Decoder;
8use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
9use quick_xml::events::{BytesDecl, BytesStart, Event};
10use quick_xml::Reader;
11use std::borrow::Cow;
12use std::io::{BufRead, Read};
13use tracing::{debug, trace};
14
15pub(crate) struct DecodeReader<R: Read> {
16 decoder: Option<Decoder>,
17 inner: R,
18 undecoded: Box<[u8]>,
19 undecoded_pos: usize,
20 undecoded_cap: usize,
21 remaining: [u8; 32], decoded: Box<[u8]>,
23 decoded_pos: usize,
24 decoded_cap: usize,
25 done: bool,
26}
27
28impl<R: Read> DecodeReader<R> {
29 pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
31 DecodeReader {
32 decoder,
33 inner: reader,
34 undecoded: vec![0; 4096].into_boxed_slice(),
35 undecoded_pos: 0,
36 undecoded_cap: 0,
37 remaining: [0; 32],
38 decoded: vec![0; 12288].into_boxed_slice(),
39 decoded_pos: 0,
40 decoded_cap: 0,
41 done: false,
42 }
43 }
44
45 pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
46 self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
47 self.done = false;
48 }
49
50 fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
52 if self.decoded_pos >= self.decoded_cap {
53 debug_assert!(self.decoded_pos == self.decoded_cap);
54 if self.done {
55 return Ok(&[]);
56 }
57 let remaining = self.undecoded_cap - self.undecoded_pos;
58 if remaining <= 32 {
59 self.remaining[..remaining]
61 .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
62 self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
63 let read = self.inner.read(&mut self.undecoded[remaining..])?;
65 self.done = read == 0;
66 self.undecoded_pos = 0;
67 self.undecoded_cap = remaining + read;
68 }
69
70 let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
72 &self.undecoded[self.undecoded_pos..self.undecoded_cap],
73 &mut self.decoded,
74 self.done,
75 );
76 self.undecoded_pos += read;
77 self.decoded_cap = written;
78 self.decoded_pos = 0;
79 }
80 Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
81 }
82
83 fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
84 if self.undecoded_pos >= self.undecoded_cap {
85 debug_assert!(self.undecoded_pos == self.undecoded_cap);
86 self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
87 self.undecoded_pos = 0;
88 }
89 Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
90 }
91}
92
93impl<R: Read> Read for DecodeReader<R> {
94 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
95 (&self.decoded[..]).read(buf)
96 }
97}
98
99impl<R: Read> BufRead for DecodeReader<R> {
100 fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
102 match &self.decoder {
103 Some(_) => self.fill_buf_decode(),
104 None => self.fill_buf_without_decode(),
105 }
106 }
107 fn consume(&mut self, amt: usize) {
108 match &self.decoder {
109 Some(_) => {
110 self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
111 }
112 None => {
113 self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
114 }
115 }
116 }
117}
118
/// Options that control how a document is read.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When `true`, an element closed by an end tag without any children receives
    /// an empty text node as its only child. Self-closing elements are unaffected.
    pub empty_text_node: bool,
    /// Forwarded to the underlying reader's `trim_text` setting.
    pub trim_text: bool,
    /// When `true`, text made up solely of `\r`, `\n`, `\t` or spaces is dropped.
    pub ignore_whitespace_only: bool,
    /// When `true`, parsing fails unless the document starts with an XML declaration.
    pub require_decl: bool,
    /// Encoding label to start decoding with; takes precedence over the encoding
    /// sniffed from the first bytes. The parser still switches to the encoding
    /// named by the XML declaration when that one differs.
    pub encoding: Option<String>,
}

impl ReadOptions {
    /// Lenient preset: like the default, but whitespace-only text is ignored and
    /// the XML declaration is optional.
    pub fn relaxed() -> Self {
        Self {
            ignore_whitespace_only: true,
            require_decl: false,
            ..Self::default()
        }
    }
}

impl Default for ReadOptions {
    /// Strict preset: whitespace-only text is kept and an XML declaration is required.
    fn default() -> Self {
        Self {
            encoding: None,
            require_decl: true,
            ignore_whitespace_only: false,
            trim_text: true,
            empty_text_node: true,
        }
    }
}
165
166pub(crate) struct DocumentParser {
168 doc: Document,
169 read_opts: ReadOptions,
170 encoding: Option<&'static Encoding>,
171 element_stack: Vec<Element>,
172}
173
174impl DocumentParser {
175 pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
176 let doc = Document::new();
177 let element_stack = vec![doc.container()];
178 let mut parser = DocumentParser {
179 doc,
180 read_opts: opts,
181 encoding: None,
182 element_stack,
183 };
184 parser.parse_start(reader)?;
185 Ok(parser.doc)
186 }
187
188 fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
189 self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
190 self.encoding = match ev.encoding() {
191 Some(res) => {
192 let encoding = Encoding::for_label(&res?).ok_or(DecodeError::MissingEncoding)?;
193 if encoding == UTF_8 {
194 None
195 } else {
196 Some(encoding)
197 }
198 }
199 None => None,
200 };
201 self.doc.standalone = match ev.standalone() {
202 Some(res) => {
203 let standalone_value = res?;
204 Some(StandaloneValue::try_from(standalone_value.as_ref())?)
205 }
206 None => None,
207 };
208 Ok(())
209 }
210
211 fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
212 let full_name = ev.name().into_string()?;
213 let mut namespace_decls = HashMap::new();
214 let mut attributes = HashMap::new();
215 for attr in ev.attributes() {
216 let mut attr = attr?;
217 attr.value = Cow::Owned(normalize_space(&attr.value));
218 let key = attr.key.into_string()?;
219 let value = bytes_to_unescaped_string(&attr.value)?;
220 if key == "xmlns" {
221 namespace_decls.insert(String::new(), value);
222 continue;
223 } else if let Some(prefix) = key.strip_prefix("xmlns:") {
224 namespace_decls.insert(prefix.to_owned(), value);
225 continue;
226 }
227 attributes.insert(key, value);
228 }
229 let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
230 parent
231 .push_child(&mut self.doc, Node::Element(elem))
232 .unwrap();
233 Ok(elem)
234 }
235
236 fn handle_event(&mut self, event: Event) -> Result<bool> {
238 match event {
239 Event::Start(ref ev) => {
240 let parent = *self.element_stack.last().ok_or_else(|| {
241 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
242 })?;
243 let element = self.create_element(parent, ev)?;
244 self.element_stack.push(element);
245 Ok(false)
246 }
247 Event::End(_) => {
248 let elem = self.element_stack.pop().ok_or_else(|| {
249 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
250 })?; if self.read_opts.empty_text_node {
252 if !elem.has_children(&self.doc) {
254 elem.push_child(&mut self.doc, Node::Text(String::new()))
255 .unwrap();
256 }
257 }
258 Ok(false)
259 }
260 Event::Empty(ref ev) => {
261 let parent = *self.element_stack.last().ok_or_else(|| {
262 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
263 })?;
264 self.create_element(parent, ev)?;
265 Ok(false)
266 }
267 Event::Text(ev) => {
270 if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
271 return Ok(false);
272 }
273 if ev.is_empty() {
275 return Ok(false);
276 }
277 let content = ev.unescape_to_string()?;
279 let node = Node::Text(content);
280 let parent = *self.element_stack.last().ok_or_else(|| {
281 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
282 })?;
283 parent.push_child(&mut self.doc, node).unwrap();
284 Ok(false)
285 }
286 Event::DocType(ev) => {
287 let raw = ev.unescape_to_string()?.into_bytes();
289 let content = if !raw.is_empty() && raw[0] == b' ' {
290 String::from_utf8(raw[1..].to_vec())?
291 } else {
292 String::from_utf8(raw.to_vec())?
293 };
294 let node = Node::DocType(content);
295 let parent = *self.element_stack.last().ok_or_else(|| {
296 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
297 })?;
298 parent.push_child(&mut self.doc, node).unwrap();
299 Ok(false)
300 }
301 Event::Comment(ev) => {
302 let content = String::from_utf8(ev.escape_ascii().collect())?;
303 let node = Node::Comment(content);
304 let parent = *self.element_stack.last().ok_or_else(|| {
305 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
306 })?;
307 parent.push_child(&mut self.doc, node).unwrap();
308 Ok(false)
309 }
310 Event::CData(ev) => {
311 let content = String::from_utf8(ev.to_vec())?;
312 let node = Node::CData(content);
313 let parent = *self.element_stack.last().ok_or_else(|| {
314 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
315 })?;
316 parent.push_child(&mut self.doc, node).unwrap();
317 Ok(false)
318 }
319 Event::PI(ev) => {
320 let content = ev.into_string()?;
321 let node = Node::PI(content);
322 let parent = *self.element_stack.last().ok_or_else(|| {
323 EditXMLError::MalformedXML(MalformedReason::GenericMalformedTree)
324 })?;
325 parent.push_child(&mut self.doc, node).unwrap();
326 Ok(false)
327 }
328 Event::Decl(_) => Err(EditXMLError::MalformedXML(MalformedReason::UnexpectedItem(
329 "XML Declaration",
330 ))),
331 Event::Eof => Ok(true),
332 }
333 }
334
335 fn sniff_encoding<R: Read>(
337 &mut self,
338 decodereader: &mut DecodeReader<R>,
339 ) -> Result<Option<&'static Encoding>> {
340 let bytes = decodereader.fill_buf()?;
341 let encoding = match bytes {
342 [0x3c, 0x3f, ..] => None, [0xfe, 0xff, ..] => {
344 decodereader.consume(2);
346 Some(UTF_16BE)
347 }
348 [0xff, 0xfe, ..] => {
349 decodereader.consume(2);
351 Some(UTF_16LE)
352 }
353 [0xef, 0xbb, 0xbf, ..] => {
354 decodereader.consume(3);
356 None
357 }
358 [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
359 [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
360 _ => None, };
362 Ok(encoding)
363 }
364
365 fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
367 debug!(?self.read_opts, "Parsing Start");
368 let mut decodereader = DecodeReader::new(reader, None);
369 let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
370 if let Some(enc) = &self.read_opts.encoding {
371 init_encoding =
372 Some(Encoding::for_label(enc.as_bytes()).ok_or(DecodeError::MissingEncoding)?)
373 }
374 debug!(?init_encoding, "Initial Encoding");
375 decodereader.set_encoding(init_encoding);
376 let mut xmlreader = Reader::from_reader(decodereader);
377 xmlreader.config_mut().trim_text(self.read_opts.trim_text);
378
379 let mut buf = Vec::with_capacity(200);
380
381 let event = match xmlreader.read_event_into(&mut buf)? {
383 Event::Text(ev) => {
384 if ev.len() == 0 {
385 trace!("Skipping empty text event");
386 xmlreader.read_event_into(&mut buf)?
387 } else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
388 trace!("Skipping whitespace only text event");
389 xmlreader.read_event_into(&mut buf)?
390 } else {
391 trace!("First Event is Text");
392 Event::Text(ev)
393 }
394 }
395 ev => ev,
396 };
397 debug!(?event, "First Event");
398 if let Event::Decl(ev) = event {
399 self.handle_decl(&ev)?;
400 if self.encoding != init_encoding
402 && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
403 {
404 let mut decode_reader = xmlreader.into_inner();
405 decode_reader.set_encoding(self.encoding);
406 xmlreader = Reader::from_reader(decode_reader);
407 xmlreader.config_mut().trim_text(self.read_opts.trim_text);
408 }
409 } else if self.read_opts.require_decl {
410 debug!(?self.read_opts, ?event, "XML Declaration is required");
411 return Err(MalformedReason::MissingDeclaration.into());
412 } else if self.handle_event(event)? {
413 return Ok(());
414 }
415 self.parse_content(xmlreader)
417 }
418
419 fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
420 let mut buf = Vec::with_capacity(200); loop {
423 let ev = reader.read_event_into(&mut buf)?;
424
425 if self.handle_event(ev)? {
426 if self.element_stack.len() == 1 {
427 return Ok(());
429 } else {
430 return Err(MalformedReason::MissingClosingTag.into());
431 }
432 }
433 }
434 }
435}
436
/// Returns `true` for carriage return, line feed, tab and space.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n' | b'\r')
}
445
446fn only_has_whitespace(bytes: &[u8]) -> bool {
448 bytes.iter().all(|b| is_whitespace(*b))
449}
450
451pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
455 let mut normalized = Vec::with_capacity(bytes.len());
456 let mut char_found = false;
457 let mut last_space = false;
458 for &byte in bytes {
459 if is_whitespace(byte) {
460 if char_found && !last_space {
461 normalized.push(b' ');
462 last_space = true;
463 }
464 } else {
465 char_found = true;
466 last_space = false;
467 normalized.push(byte);
468 }
469 }
470 if normalized.last() == Some(&b' ') {
472 normalized.pop();
473 }
474 normalized
475}