use crate::document::{Document, Node};
use crate::element::Element;
use crate::error::{Error, Result};
use encoding_rs::Decoder;
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
use quick_xml::events::{BytesDecl, BytesStart, Event};
use quick_xml::Reader;
use std::borrow::Cow;
use std::collections::HashMap;
use std::io::{BufRead, Read};
// Debug-only tracing helper: prints the `Debug` representation of an
// expression to stdout. The macro is only defined when `debug_assertions`
// is enabled, so every call site must carry the same `#[cfg]` attribute.
#[cfg(debug_assertions)]
macro_rules! debug {
// Accepts a single expression and formats it with `{:?}`.
($x:expr) => {
println!("{:?}", $x)
};
}
/// Buffered reader that optionally transcodes the bytes of an inner reader
/// to UTF-8 before handing them out through `BufRead`.
///
/// With a decoder installed, data flows `inner` -> `undecoded` -> decoder ->
/// `decoded`; without one, `undecoded` is served directly.
pub(crate) struct DecodeReader<R: Read> {
// Decoder applied to the raw input; `None` selects the pass-through path.
decoder: Option<Decoder>,
// Underlying byte source.
inner: R,
// Raw bytes read from `inner`. The unread part is
// `undecoded[undecoded_pos..undecoded_cap]`.
undecoded: Box<[u8]>,
undecoded_pos: usize,
undecoded_cap: usize,
// `remaining`: scratch space used to move a short unread tail of `undecoded`
// to the front of that buffer before it is refilled.
// `decoded`: UTF-8 output of the decoder. The unread part is
// `decoded[decoded_pos..decoded_cap]`.
remaining: [u8; 32], decoded: Box<[u8]>,
decoded_pos: usize,
decoded_cap: usize,
// Set on the decoding path once `inner` has returned zero bytes; reset by
// `set_encoding`.
done: bool,
}
impl<R: Read> DecodeReader<R> {
    /// Wraps `reader`, optionally transcoding its bytes with `decoder`.
    ///
    /// The raw buffer holds 4096 bytes and the decoded buffer three times
    /// that (presumably sized for the worst-case growth when transcoding to
    /// UTF-8 — confirm against `Decoder::max_utf8_buffer_length`).
    pub(crate) fn new(reader: R, decoder: Option<Decoder>) -> DecodeReader<R> {
        DecodeReader {
            decoder,
            inner: reader,
            undecoded: vec![0; 4096].into_boxed_slice(),
            undecoded_pos: 0,
            undecoded_cap: 0,
            remaining: [0; 32],
            decoded: vec![0; 12288].into_boxed_slice(),
            decoded_pos: 0,
            decoded_cap: 0,
            done: false,
        }
    }

    /// Replaces the active decoder; `None` switches to the pass-through path.
    ///
    /// Only the decoder and the end-of-input flag are touched. Raw bytes that
    /// were not yet handed to a decoder are processed by the new one. Bytes
    /// that were already decoded stay in the decoded buffer, which is only
    /// served while a decoder is installed.
    pub(crate) fn set_encoding(&mut self, encoding: Option<&'static Encoding>) {
        self.decoder = encoding.map(|e| e.new_decoder_without_bom_handling());
        self.done = false;
    }

    /// `fill_buf` implementation used while a decoder is installed.
    ///
    /// Returns the unread part of the decoded buffer, refilling it first when
    /// it is exhausted. An empty slice is returned only once the inner reader
    /// is exhausted and everything has been decoded.
    ///
    /// # Errors
    /// Propagates I/O errors from the inner reader.
    ///
    /// # Panics
    /// Panics if no decoder is installed.
    fn fill_buf_decode(&mut self) -> std::io::Result<&[u8]> {
        if self.decoded_pos >= self.decoded_cap {
            debug_assert!(self.decoded_pos == self.decoded_cap);
            self.decoded_pos = 0;
            self.decoded_cap = 0;
            // A single read may deliver only part of a multi-byte unit, in
            // which case the decoder produces no output yet. Keep going until
            // there is output or the input is exhausted, because an empty
            // slice tells `BufRead` consumers that the stream has ended.
            while self.decoded_cap == 0 && !self.done {
                let (start, end) = (self.undecoded_pos, self.undecoded_cap);
                let pending = end - start;
                if pending <= self.remaining.len() {
                    // Move the short unread tail to the front (through the
                    // scratch array, since source and target may overlap).
                    self.remaining[..pending].copy_from_slice(&self.undecoded[start..end]);
                    self.undecoded[..pending].copy_from_slice(&self.remaining[..pending]);
                    // Update the bookkeeping before reading so that a failed
                    // read leaves the buffer in a consistent state.
                    self.undecoded_pos = 0;
                    self.undecoded_cap = pending;
                    let read = self.inner.read(&mut self.undecoded[pending..])?;
                    self.undecoded_cap += read;
                    self.done = read == 0;
                }
                let decoder = self
                    .decoder
                    .as_mut()
                    .expect("fill_buf_decode is only called while a decoder is installed");
                let (_result, consumed, written, _had_errors) = decoder.decode_to_utf8(
                    &self.undecoded[self.undecoded_pos..self.undecoded_cap],
                    &mut self.decoded,
                    self.done,
                );
                self.undecoded_pos += consumed;
                self.decoded_cap = written;
                if consumed == 0 && written == 0 {
                    // No progress at all: stop instead of spinning.
                    break;
                }
            }
        }
        Ok(&self.decoded[self.decoded_pos..self.decoded_cap])
    }

    /// `fill_buf` implementation used without a decoder: serves the raw
    /// buffer directly and refills it from the inner reader when exhausted.
    ///
    /// # Errors
    /// Propagates I/O errors from the inner reader.
    fn fill_buf_without_decode(&mut self) -> std::io::Result<&[u8]> {
        if self.undecoded_pos >= self.undecoded_cap {
            debug_assert!(self.undecoded_pos == self.undecoded_cap);
            self.undecoded_cap = self.inner.read(&mut self.undecoded)?;
            self.undecoded_pos = 0;
        }
        Ok(&self.undecoded[self.undecoded_pos..self.undecoded_cap])
    }
}
impl<R: Read> Read for DecodeReader<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
(&self.decoded[..]).read(buf)
}
}
impl<R: Read> BufRead for DecodeReader<R> {
    /// Exposes the unread bytes: decoded output when a decoder is installed,
    /// the raw input otherwise.
    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
        if self.decoder.is_some() {
            self.fill_buf_decode()
        } else {
            self.fill_buf_without_decode()
        }
    }

    /// Advances the read position of the buffer currently being served by
    /// `amt`, clamped to the end of its valid data.
    fn consume(&mut self, amt: usize) {
        let (position, limit) = if self.decoder.is_some() {
            (&mut self.decoded_pos, self.decoded_cap)
        } else {
            (&mut self.undecoded_pos, self.undecoded_cap)
        };
        *position = std::cmp::min(*position + amt, limit);
    }
}
/// Options that control how a document is read.
///
/// `ReadOptions::default()` yields the settings listed on each field, and
/// individual fields can be overridden with struct-update syntax:
/// `ReadOptions { trim_text: false, ..Default::default() }`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadOptions {
    /// When set, an element closed by an end tag that has no children gets a
    /// single empty text node. Self-closing elements are not affected.
    ///
    /// Default: `true`
    pub empty_text_node: bool,
    /// Forwarded to the XML reader's `trim_text` setting.
    ///
    /// Default: `true`
    pub trim_text: bool,
    /// When set, text that consists only of spaces, tabs, carriage returns
    /// and line feeds is skipped instead of becoming a text node.
    ///
    /// Default: `false`
    pub ignore_whitespace_only: bool,
    /// When set, parsing fails unless the document starts with an XML
    /// declaration.
    ///
    /// Default: `true`
    pub require_decl: bool,
    /// Encoding label to start reading with, overriding the encoding sniffed
    /// from the first bytes. An unknown label makes parsing fail.
    ///
    /// Default: `None`
    pub encoding: Option<String>,
}

impl Default for ReadOptions {
    /// Returns the default options documented on each field.
    fn default() -> Self {
        ReadOptions {
            empty_text_node: true,
            trim_text: true,
            ignore_whitespace_only: false,
            require_decl: true,
            encoding: None,
        }
    }
}
/// State of one parsing run that turns an XML byte stream into a `Document`.
pub(crate) struct DocumentParser {
// Document being built; returned to the caller when parsing succeeds.
doc: Document,
// Options supplied by the caller.
read_opts: ReadOptions,
// Encoding named by the XML declaration; `None` when the declaration names
// UTF-8 or no encoding at all.
encoding: Option<&'static Encoding>,
// Elements that are currently open, innermost last. It starts with the
// document's container element, so a length of one means no element is open.
element_stack: Vec<Element>,
}
impl DocumentParser {
/// Parses the XML read from `reader` into a new `Document`.
///
/// # Errors
/// Returns whatever error parsing reports (malformed XML, undecodable
/// input, I/O failures, ...).
pub(crate) fn parse_reader<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
    let doc = Document::new();
    // The container element is the bottom of the stack and the parent of
    // every top-level node.
    let container = doc.container();
    let mut parser = DocumentParser {
        doc,
        read_opts: opts,
        encoding: None,
        element_stack: vec![container],
    };
    parser.parse_start(reader)?;
    Ok(parser.doc)
}
fn handle_decl(&mut self, ev: &BytesDecl) -> Result<()> {
self.doc.version = String::from_utf8(ev.version()?.to_vec())?;
self.encoding = match ev.encoding() {
Some(res) => {
let encoding = Encoding::for_label(&res?).ok_or(Error::CannotDecode)?;
if encoding == UTF_8 {
None
} else {
Some(encoding)
}
}
None => None,
};
self.doc.standalone = match ev.standalone() {
Some(res) => {
let val = std::str::from_utf8(&res?)?.to_lowercase();
match val.as_str() {
"yes" => true,
"no" => false,
_ => {
return Err(Error::MalformedXML(
"Standalone Document Declaration has non boolean value".to_string(),
))
}
}
}
None => false,
};
Ok(())
}
/// Builds an element from a start (or empty) tag and appends it to `parent`.
///
/// Attribute values are whitespace-normalized and then unescaped.
/// `xmlns` and `xmlns:prefix` attributes are stored as namespace
/// declarations (the default namespace under the empty prefix); everything
/// else becomes a regular attribute.
///
/// # Errors
/// Fails on malformed attributes, invalid escapes or invalid UTF-8.
///
/// # Panics
/// Panics if attaching the new element to `parent` fails.
fn create_element(&mut self, parent: Element, ev: &BytesStart) -> Result<Element> {
    let full_name = String::from_utf8(ev.name().to_vec())?;
    let mut attributes = HashMap::new();
    let mut namespace_decls = HashMap::new();

    for attr in ev.attributes() {
        let mut attr = attr?;
        attr.value = Cow::Owned(normalize_space(&attr.value));
        let key = String::from_utf8(attr.key.to_vec())?;
        let value = String::from_utf8(attr.unescaped_value()?.to_vec())?;

        // `Some(prefix)` marks a namespace declaration; the bare `xmlns`
        // attribute declares the default namespace (empty prefix).
        let ns_prefix = if key == "xmlns" {
            Some("")
        } else {
            key.strip_prefix("xmlns:")
        };
        if let Some(prefix) = ns_prefix {
            namespace_decls.insert(prefix.to_owned(), value);
        } else {
            attributes.insert(key, value);
        }
    }

    let elem = Element::with_data(&mut self.doc, full_name, attributes, namespace_decls);
    parent
        .push_child(&mut self.doc, Node::Element(elem))
        .unwrap();
    Ok(elem)
}
fn handle_event(&mut self, event: Event) -> Result<bool> {
match event {
Event::Start(ref ev) => {
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
let element = self.create_element(parent, ev)?;
self.element_stack.push(element);
Ok(false)
}
Event::End(_) => {
let elem = self
.element_stack
.pop()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?; if self.read_opts.empty_text_node {
if !elem.has_children(&self.doc) {
elem.push_child(&mut self.doc, Node::Text(String::new()))
.unwrap();
}
}
Ok(false)
}
Event::Empty(ref ev) => {
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
self.create_element(parent, ev)?;
Ok(false)
}
Event::Text(ev) => {
if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
return Ok(false);
}
if ev.is_empty() {
return Ok(false);
}
let content = String::from_utf8(ev.unescaped()?.to_vec())?;
let node = Node::Text(content);
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
parent.push_child(&mut self.doc, node).unwrap();
Ok(false)
}
Event::DocType(ev) => {
let raw = ev.unescaped()?;
let content = if !raw.is_empty() && raw[0] == b' ' {
String::from_utf8(raw[1..].to_vec())?
} else {
String::from_utf8(raw.to_vec())?
};
let node = Node::DocType(content);
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
parent.push_child(&mut self.doc, node).unwrap();
Ok(false)
}
Event::Comment(ev) => {
let content = String::from_utf8(ev.escaped().to_vec())?;
let node = Node::Comment(content);
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
parent.push_child(&mut self.doc, node).unwrap();
Ok(false)
}
Event::CData(ev) => {
let content = String::from_utf8(ev.unescaped()?.to_vec())?;
let node = Node::CData(content);
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
parent.push_child(&mut self.doc, node).unwrap();
Ok(false)
}
Event::PI(ev) => {
let content = String::from_utf8(ev.escaped().to_vec())?;
let node = Node::PI(content);
let parent = *self
.element_stack
.last()
.ok_or_else(|| Error::MalformedXML("Malformed Element Tree".to_string()))?;
parent.push_child(&mut self.doc, node).unwrap();
Ok(false)
}
Event::Decl(_) => Err(Error::MalformedXML(
"XML declaration found in the middle of the document".to_string(),
)),
Event::Eof => Ok(true),
}
}
/// Guesses the encoding from the first bytes of the input.
///
/// Byte-order marks are consumed so the XML reader never sees them.
/// Returns `None` when no transcoding is needed or nothing was recognized.
///
/// # Errors
/// Propagates I/O errors from filling the reader's buffer.
fn sniff_encoding<R: Read>(
    &mut self,
    decodereader: &mut DecodeReader<R>,
) -> Result<Option<&'static Encoding>> {
    let head = decodereader.fill_buf()?;
    // Each entry is (detected encoding, length of the byte-order mark).
    let (detected, bom_len) = match head {
        // "<?" in an ASCII-compatible encoding.
        [0x3c, 0x3f, ..] => (None, 0),
        // UTF-16 big-endian byte-order mark.
        [0xfe, 0xff, ..] => (Some(UTF_16BE), 2),
        // UTF-16 little-endian byte-order mark.
        [0xff, 0xfe, ..] => (Some(UTF_16LE), 2),
        // UTF-8 byte-order mark.
        [0xef, 0xbb, 0xbf, ..] => (None, 3),
        // "<?" in UTF-16 without a byte-order mark.
        [0x00, 0x3c, 0x00, 0x3f, ..] => (Some(UTF_16BE), 0),
        [0x3c, 0x00, 0x3f, 0x00, ..] => (Some(UTF_16LE), 0),
        _ => (None, 0),
    };
    if bom_len > 0 {
        decodereader.consume(bom_len);
    }
    Ok(detected)
}
fn parse_start<R: Read>(&mut self, reader: R) -> Result<()> {
let mut decodereader = DecodeReader::new(reader, None);
let mut init_encoding = self.sniff_encoding(&mut decodereader)?;
if let Some(enc) = &self.read_opts.encoding {
init_encoding = Some(Encoding::for_label(enc.as_bytes()).ok_or(Error::CannotDecode)?)
}
decodereader.set_encoding(init_encoding);
let mut xmlreader = Reader::from_reader(decodereader);
xmlreader.trim_text(self.read_opts.trim_text);
let mut buf = Vec::with_capacity(200);
let event = match xmlreader.read_event(&mut buf)? {
Event::Text(ev) => {
if ev.len() == 0 {
xmlreader.read_event(&mut buf)?
} else if self.read_opts.ignore_whitespace_only && only_has_whitespace(&ev) {
xmlreader.read_event(&mut buf)?
} else {
Event::Text(ev)
}
}
ev => ev,
};
#[cfg(debug_assertions)]
debug!(event);
if let Event::Decl(ev) = event {
self.handle_decl(&ev)?;
if self.encoding != init_encoding
&& !(self.encoding == Some(UTF_16LE) && init_encoding == Some(UTF_16BE))
{
let mut decode_reader = xmlreader.into_underlying_reader();
decode_reader.set_encoding(self.encoding);
xmlreader = Reader::from_reader(decode_reader);
xmlreader.trim_text(self.read_opts.trim_text);
}
} else if self.read_opts.require_decl {
return Err(Error::MalformedXML(
"Didn't find XML Declaration at the start of file".to_string(),
));
} else if self.handle_event(event)? {
return Ok(());
}
self.parse_content(xmlreader)
}
fn parse_content<B: BufRead>(&mut self, mut reader: Reader<B>) -> Result<()> {
let mut buf = Vec::with_capacity(200);
loop {
let ev = reader.read_event(&mut buf)?;
#[cfg(debug_assertions)]
debug!(ev);
if self.handle_event(ev)? {
if self.element_stack.len() == 1 {
return Ok(());
} else {
return Err(Error::MalformedXML("Closing tag not found.".to_string()));
}
}
}
}
}
/// Returns `true` for the four XML whitespace bytes: space, tab, line feed
/// and carriage return.
fn is_whitespace(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n' | b'\r')
}
fn only_has_whitespace(bytes: &[u8]) -> bool {
bytes.iter().all(|b| is_whitespace(*b))
}
/// Collapses every run of XML whitespace into a single space and removes
/// leading and trailing whitespace.
///
/// Input that is empty or whitespace-only yields an empty vector.
pub fn normalize_space(bytes: &[u8]) -> Vec<u8> {
    let mut normalized = Vec::with_capacity(bytes.len());
    // Splitting on whitespace leaves empty pieces between adjacent
    // separators; dropping them keeps only the actual words.
    let words = bytes
        .split(|&b| is_whitespace(b))
        .filter(|word| !word.is_empty());
    for word in words {
        if !normalized.is_empty() {
            normalized.push(b' ');
        }
        normalized.extend_from_slice(word);
    }
    normalized
}