1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{Error, Result};
4use encoding_rs::Decoder;
5use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
6use quick_xml::events::{BytesDecl, BytesStart, Event};
7use quick_xml::Reader;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::io::{BufRead, Read};
11
/// Prints the `Debug` representation of an expression to stdout.
///
/// Only defined in builds with debug assertions enabled, so every call site
/// must be guarded by the same `cfg`.
#[cfg(debug_assertions)]
macro_rules! debug {
    ($value:expr) => {
        println!("{:?}", $value)
    };
}
18
19pub(crate) struct DecodeReader<R: Read> {
20 decoder: Option<Decoder>,
21 inner: R,
22 undecoded: Box<[u8]>,
23 undecoded_pos: usize,
24 undecoded_cap: usize,
25 remaining: [u8; 32], decoded: Box<[u8]>,
27 decoded_pos: usize,
28 decoded_cap: usize,
29 done: bool,
30}
31
32impl<R: Read> DecodeReader<R> {
33 pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
35 DecodeReader {
36 decoder,
37 inner: reader,
38 undecoded: vec![0; 4096].into_boxed_slice(),
39 undecoded_pos: 0,
40 undecoded_cap: 0,
41 remaining: [0; 32],
42 decoded: vec![0; 12288].into_boxed_slice(),
43 decoded_pos: 0,
44 decoded_cap: 0,
45 done: false,
46 }
47 }
48
49 pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
50 self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
51 self.done = false;
52 }
53
54 fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
56 if self.decoded_pos >= self.decoded_cap {
57 debug_assert!(self.decoded_pos == self.decoded_cap);
58 if self.done {
59 return Ok(&[]);
60 }
61 let remaining = self.undecoded_cap - self.undecoded_pos;
62 if remaining <= 32 {
63 self.remaining[..remaining]
65 .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
66 self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
67 let read = self.inner.read(&mut self.undecoded[remaining..])?;
69 self.done = read == 0;
70 self.undecoded_pos = 0;
71 self.undecoded_cap = remaining + read;
72 }
73
74 let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
76 &self.undecoded[self.undecoded_pos..self.undecoded_cap],
77 &mut self.decoded,
78 self.done,
79 );
80 self.undecoded_pos += read;
81 self.decoded_cap = written;
82 self.decoded_pos = 0;
83 }
84 Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
85 }
86
87 fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
88 if self.undecoded_pos >= self.undecoded_cap {
89 debug_assert!(self.undecoded_pos == self.undecoded_cap);
90 self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
91 self.undecoded_pos = 0;
92 }
93 Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
94 }
95}
96
97impl<R: Read> Read for DecodeReader<R> {
98 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
99 (&self.decoded[..]).read(buf)
100 }
101}
102
103impl<R: Read> BufRead for DecodeReader<R> {
104 fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
106 match &self.decoder {
107 Some(_) => self.fill_buf_decode(),
108 None => self.fill_buf_without_decode(),
109 }
110 }
111 fn consume(&mut self, amt: usize) {
112 match &self.decoder {
113 Some(_) => {
114 self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
115 }
116 None => {
117 self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
118 }
119 }
120 }
121}
122
/// Options controlling how a document is read.
///
/// Obtain the defaults with `ReadOptions::default()` and override individual
/// fields as needed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When `true`, an element closed by an end tag that has no children
    /// receives a single empty text node.
    ///
    /// Default: `true`
    pub empty_text_node: bool,
    /// Forwarded to the underlying XML reader's `trim_text` setting.
    ///
    /// Default: `true`
    pub trim_text: bool,
    /// When `true`, text consisting only of spaces, tabs, carriage returns
    /// and line feeds is skipped.
    ///
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// When `true`, parsing fails unless the first event of the document is
    /// an XML declaration.
    ///
    /// Default: `true`
    pub require_decl: bool,
    /// Encoding label to start reading with, overriding the encoding sniffed
    /// from the first bytes. Parsing fails if the label is not recognised.
    /// An XML declaration naming a different encoding may still switch it.
    ///
    /// Default: `None`
    pub encoding: Option<String>,
}

impl Default for ReadOptions {
    /// Returns the default options; see each field for its default value.
    fn default() -> Self {
        ReadOptions {
            empty_text_node: true,
            trim_text: true,
            ignore_whitespace_only: false,
            require_decl: true,
            encoding: None,
        }
    }
}
159
160pub(crate) struct DocumentParser {
162 doc: Document,
163 read_opts: ReadOptions,
164 encoding: Option<&'static Encoding>,
165 element_stack: Vec<Element>,
166}
167
168impl DocumentParser {
169 pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
170 let doc = Document::new();
171 let element_stack = vec![doc.container()];
172 let mut parser = DocumentParser {
173 doc,
174 read_opts: opts,
175 encoding: None,
176 element_stack: element_stack,
177 };
178 parser.parse_start(reader)?;
179 Ok(parser.doc)
180 }
181
182 fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
183 self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
184 self.encoding = match ev.encoding() {
185 Some(res) => {
186 let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
187 if encoding == UTF_8 {
188 None
189 } else {
190 Some(encoding)
191 }
192 }
193 None => None,
194 };
195 self.doc.standalone = match ev.standalone() {
196 Some(res) => {
197 let val = std::str::from_utf8(&res?)?.to_lowercase();
198 match val.as_str() {
199 "yes" => true,
200 "no" => false,
201 _ => {
202 return Err(Error::MalformedXML(
203 "Standalone Document Declaration has non boolean value".to_string(),
204 ))
205 }
206 }
207 }
208 None => false,
209 };
210 Ok(())
211 }
212
213 fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
214 let full_name = String::from_utf8(ev.name().to_vec())?;
215 let mut namespace_decls = HashMap::new();
216 let mut attributes = HashMap::new();
217 for attr in ev.attributes() {
218 let mut attr = attr?;
219 attr.value = Cow::Owned(normalize_space(&attr.value));
220 let key = String::from_utf8(attr.key.to_vec())?;
221 let value = String::from_utf8(attr.unescaped_value()?.to_vec())?;
222 if key == "xmlns" {
223 namespace_decls.insert(String::new(), value);
224 continue;
225 } else if let Some(prefix) = key.strip_prefix("xmlns:") {
226 namespace_decls.insert(prefix.to_owned(), value);
227 continue;
228 }
229 attributes.insert(key, value);
230 }
231
232 let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
233 parent
234 .push_child(&mut self.doc, Node::Element(elem))
235 .unwrap();
236 Ok(elem)
237 }
238
239 fn handle_event(&mut self, event: Event) -> Result<bool> {
241 match event {
242 Event::Start(ref ev) => {
243 let parent = *self
244 .element_stack
245 .last()
246 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
247 let element = self.create_element(parent, ev)?;
248 self.element_stack.push(element);
249 Ok(false)
250 }
251 Event::End(_) => {
252 let elem = self
253 .element_stack
254 .pop()
255 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; if self.read_opts.empty_text_node {
257 if !elem.has_children(&self.doc) {
259 elem.push_child(&mut self.doc, Node::Text(String::new()))
260 .unwrap();
261 }
262 }
263 Ok(false)
264 }
265 Event::Empty(ref ev) => {
266 let parent = *self
267 .element_stack
268 .last()
269 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
270 self.create_element(parent, ev)?;
271 Ok(false)
272 }
273 Event::Text(ev) => {
276 if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
277 return Ok(false);
278 }
279 if ev.is_empty() {
281 return Ok(false);
282 }
283 let content = String::from_utf8(ev.unescaped()?.to_vec())?;
284 let node = Node::Text(content);
285 let parent = *self
286 .element_stack
287 .last()
288 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
289 parent.push_child(&mut self.doc, node).unwrap();
290 Ok(false)
291 }
292 Event::DocType(ev) => {
293 let raw = ev.unescaped()?;
295 let content = if !raw.is_empty() && raw[0] == b' ' {
296 String::from_utf8(raw[1..].to_vec())?
297 } else {
298 String::from_utf8(raw.to_vec())?
299 };
300 let node = Node::DocType(content);
301 let parent = *self
302 .element_stack
303 .last()
304 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
305 parent.push_child(&mut self.doc, node).unwrap();
306 Ok(false)
307 }
308 Event::Comment(ev) => {
309 let content = String::from_utf8(ev.escaped().to_vec())?;
310 let node = Node::Comment(content);
311 let parent = *self
312 .element_stack
313 .last()
314 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
315 parent.push_child(&mut self.doc, node).unwrap();
316 Ok(false)
317 }
318 Event::CData(ev) => {
319 let content = String::from_utf8(ev.unescaped()?.to_vec())?;
320 let node = Node::CData(content);
321 let parent = *self
322 .element_stack
323 .last()
324 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
325 parent.push_child(&mut self.doc, node).unwrap();
326 Ok(false)
327 }
328 Event::PI(ev) => {
329 let content = String::from_utf8(ev.escaped().to_vec())?;
330 let node = Node::PI(content);
331 let parent = *self
332 .element_stack
333 .last()
334 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
335 parent.push_child(&mut self.doc, node).unwrap();
336 Ok(false)
337 }
338 Event::Decl(_) => Err(Error::MalformedXML(
339 "XML declaration found in the middle of the document".to_string(),
340 )),
341 Event::Eof => Ok(true),
342 }
343 }
344
345 fn sniff_encoding<R: Read>(
347 &mut self,
348 decodereader: &mut DecodeReader<R>,
349 ) -> Result<Option<&'static Encoding>> {
350 let bytes = decodereader.fill_buf()?;
351 let encoding = match bytes {
352 [0x3c, 0x3f, ..] => None, [0xfe, 0xff, ..] => {
354 decodereader.consume(2);
356 Some(UTF_16BE)
357 }
358 [0xff, 0xfe, ..] => {
359 decodereader.consume(2);
361 Some(UTF_16LE)
362 }
363 [0xef, 0xbb, 0xbf, ..] => {
364 decodereader.consume(3);
366 None
367 }
368 [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
369 [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
370 _ => None, };
372 Ok(encoding)
373 }
374
375 fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
377 let mut decodereader = DecodeReader::new(reader, None);
378 let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
379 if let Some(enc) = &self.read_opts.encoding {
380 init_encoding = Some(Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode)?)
381 }
382 decodereader.set_encoding(init_encoding);
383 let mut xmlreader = Reader::from_reader(decodereader);
384 xmlreader.trim_text(self.read_opts.trim_text);
385
386 let mut buf = Vec::with_capacity(200);
387
388 let event = match xmlreader.read_event(&mut buf)? {
390 Event::Text(ev) => {
391 if ev.len() == 0 {
392 xmlreader.read_event(&mut buf)?
393 } else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
394 xmlreader.read_event(&mut buf)?
395 } else {
396 Event::Text(ev)
397 }
398 }
399 ev => ev,
400 };
401 #[cfg(debug_assertions)]
402 debug!(event);
403 if let Event::Decl(ev) = event {
404 self.handle_decl(&ev)?;
405 if self.encoding != init_encoding
407 && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
408 {
409 let mut decode_reader = xmlreader.into_underlying_reader();
410 decode_reader.set_encoding(self.encoding);
411 xmlreader = Reader::from_reader(decode_reader);
412 xmlreader.trim_text(self.read_opts.trim_text);
413 }
414 } else if self.read_opts.require_decl {
415 return Err(Error::MalformedXML(
416 "Didn't find XML Declaration at the start of file".to_string(),
417 ));
418 } else if self.handle_event(event)? {
419 return Ok(());
420 }
421 self.parse_content(xmlreader)
423 }
424
425 fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
426 let mut buf = Vec::with_capacity(200); loop {
429 let ev = reader.read_event(&mut buf)?;
430 #[cfg(debug_assertions)]
431 debug!(ev);
432 if self.handle_event(ev)? {
433 if self.element_stack.len() == 1 {
434 return Ok(());
436 } else {
437 return Err(Error::MalformedXML("Closing tag not found.".to_string()));
438 }
439 }
440 }
441 }
442}
443
/// Returns `true` for carriage return, line feed, tab and space.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b'\r' | b'\n' | b'\t' | b' ')
}
451
452fn only_has_whitespace(bytes: &[u8]) -> bool {
454 bytes.iter().all(|b| is_whitespace(*b))
455}
456
457pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
461 let mut normalized = Vec::with_capacity(bytes.len());
462 let mut char_found = false;
463 let mut last_space = false;
464 for &byte in bytes {
465 if is_whitespace(byte) {
466 if char_found && !last_space {
467 normalized.push(b' ');
468 last_space = true;
469 }
470 } else {
471 char_found = true;
472 last_space = false;
473 normalized.push(byte);
474 }
475 }
476 if normalized.last() == Some(&b' ') {
478 normalized.pop();
479 }
480 normalized
481}