1use crate::document::{Document, Node};
2use crate::element::Element;
3use crate::error::{Error, Result};
4use encoding_rs::Decoder;
5use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
6use quick_xml::events::{BytesDecl, BytesStart, Event};
7use quick_xml::Reader;
8use std::borrow::Cow;
9use std::collections::HashMap;
10use std::io::{BufRead, Read};
11use log4rs_macros::debug;
12
13pub(crate) struct DecodeReader<R: Read> {
14 decoder: Option<Decoder>,
15 inner: R,
16 undecoded: Box<[u8]>,
17 undecoded_pos: usize,
18 undecoded_cap: usize,
19 remaining: [u8; 32], decoded: Box<[u8]>,
21 decoded_pos: usize,
22 decoded_cap: usize,
23 done: bool,
24}
25
26impl<R: Read> DecodeReader<R> {
27 pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
29 DecodeReader {
30 decoder,
31 inner: reader,
32 undecoded: vec![0; 4096].into_boxed_slice(),
33 undecoded_pos: 0,
34 undecoded_cap: 0,
35 remaining: [0; 32],
36 decoded: vec![0; 12288].into_boxed_slice(),
37 decoded_pos: 0,
38 decoded_cap: 0,
39 done: false,
40 }
41 }
42
43 pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
44 self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
45 self.done = false;
46 }
47
48 fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
50 if self.decoded_pos >= self.decoded_cap {
51 debug_assert!(self.decoded_pos == self.decoded_cap);
52 if self.done {
53 return Ok(&[]);
54 }
55 let remaining = self.undecoded_cap - self.undecoded_pos;
56 if remaining <= 32 {
57 self.remaining[..remaining]
59 .copy_from_slice(&self.undecoded[self.undecoded_pos..self.undecoded_cap]);
60 self.undecoded[..remaining].copy_from_slice(&self.remaining[..remaining]);
61 let read = self.inner.read(&mut self.undecoded[remaining..])?;
63 self.done = read == 0;
64 self.undecoded_pos = 0;
65 self.undecoded_cap = remaining + read;
66 }
67
68 let (_res, read, written, _replaced) = self.decoder.as_mut().unwrap().decode_to_utf8(
70 &self.undecoded[self.undecoded_pos..self.undecoded_cap],
71 &mut self.decoded,
72 self.done,
73 );
74 self.undecoded_pos += read;
75 self.decoded_cap = written;
76 self.decoded_pos = 0;
77 }
78 Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
79 }
80
81 fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
82 if self.undecoded_pos >= self.undecoded_cap {
83 debug_assert!(self.undecoded_pos == self.undecoded_cap);
84 self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
85 self.undecoded_pos = 0;
86 }
87 Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
88 }
89}
90
91impl<R: Read> Read for DecodeReader<R> {
92 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
93 (&self.decoded[..]).read(buf)
94 }
95}
96
97impl<R: Read> BufRead for DecodeReader<R> {
98 fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
100 match &self.decoder {
101 Some(_) => self.fill_buf_decode(),
102 None => self.fill_buf_without_decode(),
103 }
104 }
105 fn consume(&mut self, amt: usize) {
106 match &self.decoder {
107 Some(_) => {
108 self.decoded_pos = std::cmp::min(self.decoded_pos + amt, self.decoded_cap);
109 }
110 None => {
111 self.undecoded_pos = std::cmp::min(self.undecoded_pos + amt, self.undecoded_cap);
112 }
113 }
114 }
115}
116
/// Options that control how a document is read.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When `true`, an element that has no children by the time its end tag is
    /// read receives a single empty text node. Self-closing tags are not affected.
    ///
    /// Default: `true`
    pub empty_text_node: bool,
    /// Passed on to the underlying quick-xml reader's `trim_text` setting.
    ///
    /// Default: `true`
    pub trim_text: bool,
    /// When `true`, text that consists only of spaces, tabs, carriage returns and
    /// line feeds is dropped instead of becoming a text node.
    ///
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// When `true`, parsing fails unless the document starts with an XML
    /// declaration.
    ///
    /// Default: `true`
    pub require_decl: bool,
    /// Encoding label used for the initial decoder instead of the encoding
    /// sniffed from the first bytes. An encoding named in the XML declaration may
    /// still replace it afterwards.
    ///
    /// Default: `None`
    pub encoding: Option<String>,
}

impl Default for ReadOptions {
    /// Returns the default options; see the individual fields for their values.
    fn default() -> ReadOptions {
        ReadOptions {
            empty_text_node: true,
            trim_text: true,
            ignore_whitespace_only: false,
            require_decl: true,
            encoding: None,
        }
    }
}
153
154pub(crate) struct DocumentParser {
156 doc: Document,
157 read_opts: ReadOptions,
158 encoding: Option<&'static Encoding>,
159 element_stack: Vec<Element>,
160}
161
162impl DocumentParser {
163 pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
164 let doc = Document::new();
165 let element_stack = vec![doc.container()];
166 let mut parser = DocumentParser {
167 doc,
168 read_opts: opts,
169 encoding: None,
170 element_stack: element_stack,
171 };
172 parser.parse_start(reader)?;
173 Ok(parser.doc)
174 }
175
176 fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
177 self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
178 self.encoding = match ev.encoding() {
179 Some(res) => {
180 let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
181 if encoding == UTF_8 {
182 None
183 } else {
184 Some(encoding)
185 }
186 }
187 None => None,
188 };
189 self.doc.standalone = match ev.standalone() {
190 Some(res) => {
191 let val = std::str::from_utf8(&res?)?.to_lowercase();
192 match val.as_str() {
193 "yes" => true,
194 "no" => false,
195 _ => {
196 return Err(Error::MalformedXML(
197 "Standalone Document Declaration has non boolean value".to_string(),
198 ))
199 }
200 }
201 }
202 None => false,
203 };
204 Ok(())
205 }
206
207 fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
208 let full_name = String::from_utf8(ev.name().to_vec())?;
209 let mut namespace_decls = HashMap::new();
210 let mut attributes = HashMap::new();
211 for attr in ev.attributes() {
212 let mut attr = attr?;
213 attr.value = Cow::Owned(normalize_space(&attr.value));
214 let key = String::from_utf8(attr.key.to_vec())?;
215 let value = String::from_utf8(attr.unescaped_value()?.to_vec())?;
216 if key == "xmlns" {
217 namespace_decls.insert(String::new(), value);
218 continue;
219 } else if let Some(prefix) = key.strip_prefix("xmlns:") {
220 namespace_decls.insert(prefix.to_owned(), value);
221 continue;
222 }
223 attributes.insert(key, value);
224 }
225
226 let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
227 parent
228 .push_child(&mut self.doc, Node::Element(elem))
229 .unwrap();
230 Ok(elem)
231 }
232
233 fn handle_event(&mut self, event: Event) -> Result<bool> {
235 match event {
236 Event::Start(ref ev) => {
237 let parent = *self
238 .element_stack
239 .last()
240 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
241 let element = self.create_element(parent, ev)?;
242 self.element_stack.push(element);
243 Ok(false)
244 }
245 Event::End(_) => {
246 let elem = self
247 .element_stack
248 .pop()
249 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; if self.read_opts.empty_text_node {
251 if !elem.has_children(&self.doc) {
253 elem.push_child(&mut self.doc, Node::Text(String::new()))
254 .unwrap();
255 }
256 }
257 Ok(false)
258 }
259 Event::Empty(ref ev) => {
260 let parent = *self
261 .element_stack
262 .last()
263 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
264 self.create_element(parent, ev)?;
265 Ok(false)
266 }
267 Event::Text(ev) => {
270 if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
271 return Ok(false);
272 }
273 if ev.is_empty() {
275 return Ok(false);
276 }
277 let content = String::from_utf8(ev.unescaped()?.to_vec())?;
278 let node = Node::Text(content);
279 let parent = *self
280 .element_stack
281 .last()
282 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
283 parent.push_child(&mut self.doc, node).unwrap();
284 Ok(false)
285 }
286 Event::DocType(ev) => {
287 let raw = ev.unescaped()?;
289 let content = if !raw.is_empty() && raw[0] == b' ' {
290 String::from_utf8(raw[1..].to_vec())?
291 } else {
292 String::from_utf8(raw.to_vec())?
293 };
294 let node = Node::DocType(content);
295 let parent = *self
296 .element_stack
297 .last()
298 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
299 parent.push_child(&mut self.doc, node).unwrap();
300 Ok(false)
301 }
302 Event::Comment(ev) => {
303 let content = String::from_utf8(ev.escaped().to_vec())?;
304 let node = Node::Comment(content);
305 let parent = *self
306 .element_stack
307 .last()
308 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
309 parent.push_child(&mut self.doc, node).unwrap();
310 Ok(false)
311 }
312 Event::CData(ev) => {
313 let content = String::from_utf8(ev.unescaped()?.to_vec())?;
314 let node = Node::CData(content);
315 let parent = *self
316 .element_stack
317 .last()
318 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
319 parent.push_child(&mut self.doc, node).unwrap();
320 Ok(false)
321 }
322 Event::PI(ev) => {
323 let content = String::from_utf8(ev.escaped().to_vec())?;
324 let node = Node::PI(content);
325 let parent = *self
326 .element_stack
327 .last()
328 .ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
329 parent.push_child(&mut self.doc, node).unwrap();
330 Ok(false)
331 }
332 Event::Decl(_) => Err(Error::MalformedXML(
333 "XML declaration found in the middle of the document".to_string(),
334 )),
335 Event::Eof => Ok(true),
336 }
337 }
338
339 fn sniff_encoding<R: Read>(
341 &mut self,
342 decodereader: &mut DecodeReader<R>,
343 ) -> Result<Option<&'static Encoding>> {
344 let bytes = decodereader.fill_buf()?;
345 let encoding = match bytes {
346 [0x3c, 0x3f, ..] => None, [0xfe, 0xff, ..] => {
348 decodereader.consume(2);
350 Some(UTF_16BE)
351 }
352 [0xff, 0xfe, ..] => {
353 decodereader.consume(2);
355 Some(UTF_16LE)
356 }
357 [0xef, 0xbb, 0xbf, ..] => {
358 decodereader.consume(3);
360 None
361 }
362 [0x00, 0x3c, 0x00, 0x3f, ..] => Some(UTF_16BE),
363 [0x3c, 0x00, 0x3f, 0x00, ..] => Some(UTF_16LE),
364 _ => None, };
366 Ok(encoding)
367 }
368
369 fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
371 let mut decodereader = DecodeReader::new(reader, None);
372 let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
373 if let Some(enc) = &self.read_opts.encoding {
374 init_encoding = Some(Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode)?)
375 }
376 decodereader.set_encoding(init_encoding);
377 let mut xmlreader = Reader::from_reader(decodereader);
378 xmlreader.trim_text(self.read_opts.trim_text);
379
380 let mut buf = Vec::with_capacity(200);
381
382 let event = match xmlreader.read_event(&mut buf)? {
384 Event::Text(ev) => {
385 if ev.len() == 0 {
386 xmlreader.read_event(&mut buf)?
387 } else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
388 xmlreader.read_event(&mut buf)?
389 } else {
390 Event::Text(ev)
391 }
392 }
393 ev => ev,
394 };
395 #[cfg(debug_assertions)]
396 debug!(event);
397 if let Event::Decl(ev) = event {
398 self.handle_decl(&ev)?;
399 if self.encoding != init_encoding
401 && !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
402 {
403 let mut decode_reader = xmlreader.into_underlying_reader();
404 decode_reader.set_encoding(self.encoding);
405 xmlreader = Reader::from_reader(decode_reader);
406 xmlreader.trim_text(self.read_opts.trim_text);
407 }
408 } else if self.read_opts.require_decl {
409 return Err(Error::MalformedXML(
410 "Didn't find XML Declaration at the start of file".to_string(),
411 ));
412 } else if self.handle_event(event)? {
413 return Ok(());
414 }
415 self.parse_content(xmlreader)
417 }
418
419 fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
420 let mut buf = Vec::with_capacity(200); loop {
423 let ev = reader.read_event(&mut buf)?;
424 #[cfg(debug_assertions)]
425 debug!(ev);
426 if self.handle_event(ev)? {
427 if self.element_stack.len() == 1 {
428 return Ok(());
430 } else {
431 return Err(Error::MalformedXML("Closing tag not found.".to_string()));
432 }
433 }
434 }
435 }
436}
437
/// Returns `true` for carriage return, line feed, tab and space.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b'\r' | b'\n' | b'\t' | b' ')
}
445
446fn only_has_whitespace(bytes: &[u8]) -> bool {
448 bytes.iter().all(|b| is_whitespace(*b))
449}
450
451pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
455 let mut normalized = Vec::with_capacity(bytes.len());
456 let mut char_found = false;
457 let mut last_space = false;
458 for &byte in bytes {
459 if is_whitespace(byte) {
460 if char_found && !last_space {
461 normalized.push(b' ');
462 last_space = true;
463 }
464 } else {
465 char_found = true;
466 last_space = false;
467 normalized.push(byte);
468 }
469 }
470 if normalized.last() == Some(&b' ') {
472 normalized.pop();
473 }
474 normalized
475}