#[macro_use]
extern crate log;
pub mod error;
pub mod attributes;
#[cfg(test)]
mod test;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::iter::Iterator;
use std::path::Path;
use std::fmt;
use error::{Error, Result};
use attributes::Attributes;
/// Where the reader currently stands relative to the tag delimiters.
///
/// The iterator flips this state on every read, alternating between reading
/// text up to the next `<` and reading a tag body up to the next `>`.
enum TagState {
/// The previous read stopped at a `<`; the next read runs up to `>`.
Opened,
/// The reader is outside of a tag; the next read runs up to `<`.
Closed,
}
/// Event-based XML reader over any `BufRead` source.
///
/// Events are produced through the `Iterator` implementation, one per call
/// to `next`.
pub struct XmlReader<B: BufRead> {
// Underlying byte source.
reader: B,
// Set once an error has been returned; `next` yields `None` afterwards.
exit: bool,
// Set after a self-closing tag so the following `next` call emits the
// matching `End` event without reading any input.
next_close: bool,
// Stack of started elements, used to validate end tags and to hold the
// element of a pending self-closing `End` event.
opened: Vec<Element>,
// Whether the next read is a tag body (`Opened`) or text (`Closed`).
tag_state: TagState,
// When true, text events are stripped of surrounding whitespace and
// whitespace-only text is skipped.
trim_text: bool,
// When true, every end tag is compared with the last opened element.
with_check: bool,
}
impl<B: BufRead> XmlReader<B> {
    /// Builds a reader on top of an arbitrary `BufRead` source.
    ///
    /// The reader starts outside of any tag, with text trimming disabled and
    /// end-tag checking enabled.
    pub fn from_reader(reader: B) -> XmlReader<B> {
        let opened = Vec::new();
        XmlReader {
            tag_state: TagState::Closed,
            opened: opened,
            reader: reader,
            trim_text: false,
            with_check: true,
            next_close: false,
            exit: false,
        }
    }

    /// Builder-style setter: when `val` is true, text events are trimmed of
    /// surrounding whitespace and whitespace-only text is skipped.
    pub fn trim_text(self, val: bool) -> XmlReader<B> {
        let mut configured = self;
        configured.trim_text = val;
        configured
    }

    /// Builder-style setter: when `val` is true, each end tag is compared
    /// with the most recently opened element.
    pub fn with_check(self, val: bool) -> XmlReader<B> {
        let mut configured = self;
        configured.with_check = val;
        configured
    }
}
impl XmlReader<BufReader<File>> {
    /// Opens the file at `path` and wraps it in a buffered `XmlReader`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error raised by `File::open`, converted into this
    /// crate's `Error` type.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<XmlReader<BufReader<File>>> {
        // `?` performs the same `From` conversion the deprecated `try!` macro
        // did, and keeps compiling on editions where `try` is reserved.
        let file = File::open(path)?;
        Ok(XmlReader::from_reader(BufReader::new(file)))
    }
}
impl<'a> XmlReader<&'a [u8]> {
    /// Builds a reader that parses directly from the bytes of `s`, without
    /// copying the string.
    pub fn from_str(s: &'a str) -> XmlReader<&'a [u8]> {
        let bytes: &'a [u8] = s.as_bytes();
        XmlReader::from_reader(bytes)
    }
}
impl<B: BufRead> Iterator for XmlReader<B> {
type Item = Result<Event>;
fn next(&mut self) -> Option<Result<Event>> {
if self.exit { return None; }
if self.next_close {
self.next_close = false;
let e = self.opened.pop().unwrap();
return Some(Ok(Event::End(e)));
}
let mut buf = Vec::new();
match self.tag_state {
TagState::Opened => {
self.tag_state = TagState::Closed;
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => None,
Ok(_n) => {
let len = buf.len();
match buf[0] {
b'/' => {
if self.with_check {
let e = self.opened.pop().unwrap();
if &buf[1..] != e.as_bytes() {
self.exit = true;
return Some(Err(Error::Malformed(format!(
"End event {:?} doesn't match last opened element {:?}, opened: {:?}",
Element::new(buf, 1, len, len), e, self.opened))));
}
}
return Some(Ok(Event::End(Element::new(buf, 1, len, len))))
},
b'?' => {
if len > 1 && buf[len - 1] == b'?' {
return Some(Ok(Event::Header(Element::new(buf, 1, len - 1, len - 1))));
} else {
self.exit = true;
return Some(Err(Error::Malformed("Unescaped Header event".to_owned())));
}
},
b'!' => {
if len >= 3 && &buf[1..3] == b"--" {
loop {
let len = buf.len();
if len >= 5 && &buf[(len - 2)..] == b"--" {
return Some(Ok(Event::Comment(Element::new(buf, 3, len - 2, len - 2))));
}
buf.push(b'>');
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => {
self.exit = true;
return Some(Err(Error::Malformed("Unescaped Comment event".to_owned())));
},
Err(e) => {
self.exit = true;
return Some(Err(Error::from(e)));
},
_ => (),
}
}
} else if len >= 8 && &buf[1..8] == b"[CDATA[" {
loop {
let len = buf.len();
if len >= 10 && &buf[(len - 2)..] == b"]]" {
return Some(Ok(Event::CData(Element::new(buf, 8, len - 2, len - 2))));
}
buf.push(b'>');
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => {
self.exit = true;
return Some(Err(Error::Malformed("Unescaped CDATA event".to_owned())));
},
Err(e) => {
self.exit = true;
return Some(Err(Error::from(e)));
},
_ => (),
}
}
}
},
_ => (),
}
let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len);
if buf[len - 1] == b'/' {
self.next_close = true;
let element = Element::new(buf, 0, len - 1,
if name_end < len { name_end } else { len - 1 });
self.opened.push(element.clone());
Some(Ok(Event::Start(element)))
} else {
let element = Element::new(buf, 0, len, name_end);
if self.with_check {
self.opened.push(element.clone());
}
Some(Ok(Event::Start(element)))
}
},
Err(e) => {
self.exit = true;
Some(Err(Error::from(e)))
},
}
},
TagState::Closed => {
self.tag_state = TagState::Opened;
match read_until(&mut self.reader, b'<', &mut buf) {
Ok(0) => None,
Ok(_n) => {
let (start, len) = if self.trim_text {
match buf.iter().position(|&b| !is_whitespace(b)) {
Some(start) => (start, buf.len() - buf.iter().rev()
.position(|&b| !is_whitespace(b)).unwrap_or(0)),
None => return self.next()
}
} else {
(0, buf.len())
};
Some(Ok(Event::Text(Element::new(buf, start, len, len))))
},
Err(e) => {
self.exit = true;
Some(Err(Error::from(e)))
},
}
}
}
}
}
/// Owned bytes of a single event together with the offsets that delimit its
/// content and its name inside those bytes.
#[derive(Clone)]
pub struct Element {
// Raw bytes read for this event.
buf: Vec<u8>,
// Start of the content (and of the name) inside `buf`.
start: usize,
// End (exclusive) of the content inside `buf`.
end: usize,
// End (exclusive) of the name inside `buf`.
name_end: usize,
}
impl Element {
fn new(buf: Vec<u8>, start: usize, end: usize, name_end: usize) -> Element {
Element {
buf: buf,
start: start,
end: end,
name_end: name_end,
}
}
pub fn as_bytes(&self) -> &[u8] {
&self.buf[self.start..self.name_end]
}
pub fn as_str(&self) -> Result<&str> {
::std::str::from_utf8(self.as_bytes()).map_err(|e| Error::Utf8(e))
}
pub fn attributes<'a>(&'a self) -> Attributes<'a> {
Attributes::new(&self.buf[self.start..self.end], self.name_end)
}
pub fn into_string(self) -> Result<String> {
::std::string::String::from_utf8(self.buf).map_err(|e| Error::Utf8(e.utf8_error()))
}
}
impl fmt::Debug for Element {
    /// Prints the decoded name (as the `Result` returned by `as_str`) along
    /// with the `name_end` and `end` offsets.
    fn fmt(&self, f: &mut fmt::Formatter) -> ::std::result::Result<(), fmt::Error> {
        let name = self.as_str();
        f.write_fmt(format_args!(
            "Element {{ buf: {:?}, name_end: {}, end: {} }}",
            name, self.name_end, self.end
        ))
    }
}
/// Events produced by iterating over an `XmlReader`.
#[derive(Debug)]
pub enum Event {
/// A tag that is not an end tag, header, comment or CDATA section.
Start(Element),
/// An end tag (`</...>`), or the end synthesized after a self-closing tag.
End(Element),
/// Bytes found between two tags.
Text(Element),
/// Content of a `<!-- ... -->` comment.
Comment(Element),
/// Content of a `<![CDATA[ ... ]]>` section.
CData(Element),
/// Content of a `<? ... ?>` header.
Header(Element),
}
impl Event {
    /// Borrows the `Element` carried by the event, whatever its variant.
    pub fn element(&self) -> &Element {
        match *self {
            Event::Start(ref inner)
            | Event::End(ref inner)
            | Event::Text(ref inner)
            | Event::Comment(ref inner)
            | Event::CData(ref inner)
            | Event::Header(ref inner) => inner,
        }
    }
}
/// Tells whether `b` is one of the four whitespace bytes recognized by the
/// reader: space, carriage return, line feed or tab.
#[inline(always)]
fn is_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\r' || b == b'\n' || b == b'\t'
}
/// Appends bytes from `r` to `buf` until `byte` is met or the input ends.
///
/// The delimiter is consumed from the reader but not copied into `buf`.
/// Returns the number of bytes consumed from the reader, delimiter included;
/// `Ok(0)` therefore means the reader was already at end of input.
/// Interrupted reads are retried; any other I/O error is converted and
/// returned.
#[inline(always)]
fn read_until<R: BufRead>(r: &mut R, byte: u8, buf: &mut Vec<u8>) -> Result<usize> {
    let mut total = 0;
    loop {
        // The chunk borrowed from the reader must be released before
        // `consume` can be called, hence the inner scope.
        let (found, consumed) = {
            let chunk = match r.fill_buf() {
                Ok(chunk) if chunk.is_empty() => return Ok(total),
                Ok(chunk) => chunk,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(Error::from(e)),
            };
            match chunk.iter().position(|&b| b == byte) {
                Some(i) => {
                    buf.extend_from_slice(&chunk[..i]);
                    (true, i + 1)
                },
                None => {
                    buf.extend_from_slice(chunk);
                    (false, chunk.len())
                },
            }
        };
        r.consume(consumed);
        total += consumed;
        if found {
            return Ok(total);
        }
    }
}