use std::fmt;
use std::str;
use {Tokenize, Stream, TextFrame, ElementId, AttributeId, Error};
/// A single token produced by `Tokenizer`.
///
/// String slices and text frames carry the lifetime `'a` of the tokenized input.
#[derive(PartialEq)]
pub enum Token<'a> {
/// Start tag of an element whose name `ElementId::from_name` does not
/// recognize; holds the raw tag name.
XmlElementStart(&'a str),
/// Start tag of an element whose name `ElementId::from_name` recognizes.
SvgElementStart(ElementId),
/// End of a start tag (`>`), an empty-element end (`/>`) or a closing tag (`</...>`).
ElementEnd(ElementEnd<'a>),
/// Attribute `(name, value)` that was not resolved to an `AttributeId`:
/// either the current element is not a known SVG element or
/// `AttributeId::from_name` did not recognize the name.
XmlAttribute(&'a str, &'a str),
/// Attribute of a known SVG element whose name `AttributeId::from_name`
/// recognizes; the frame holds the text between the quotes.
SvgAttribute(AttributeId, TextFrame<'a>),
/// Text found inside an element, up to the next `<`.
Text(TextFrame<'a>),
/// Content of a CDATA section, i.e. the text before the terminating `]]>`.
Cdata(TextFrame<'a>),
/// Whitespace-only run found inside an element, directly followed by `<`.
Whitespace(&'a str),
/// Comment body: the text between `<!--` and `-->`.
Comment(&'a str),
/// DOCTYPE without an internal subset: the text between `<!DOCTYPE ` and `>`.
DtdEmpty(&'a str),
/// DOCTYPE that opens an internal subset: the text between `<!DOCTYPE ` and `[`.
DtdStart(&'a str),
/// `<!ENTITY name "value">` declaration as `(name, value)`.
Entity(&'a str, TextFrame<'a>),
/// The `]>` that closes a DOCTYPE internal subset.
DtdEnd,
/// XML declaration: the text between `<?xml ` and `?>`.
Declaration(&'a str),
/// Returned once the stream is exhausted, and on every later call.
EndOfStream,
}
/// Compact, human-readable rendering of a token.
///
/// Plain string payloads are printed verbatim, frames and ids through their
/// own `Debug` implementation. For `ElementEnd` only the kind of terminator
/// (`>`, `</` or `/>`) is shown, not the tag name.
impl<'a> fmt::Debug for Token<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            // Payload-free variants.
            Token::DtdEnd => f.write_str("DtdEnd"),
            Token::EndOfStream => f.write_str("EndOfStream"),

            // Elements.
            Token::XmlElementStart(name) => write!(f, "XmlElementStart({})", name),
            Token::SvgElementStart(id) => write!(f, "SvgElementStart({:?})", id),
            Token::ElementEnd(ref end) => {
                let marker = match *end {
                    ElementEnd::Open => ">",
                    ElementEnd::Empty => "/>",
                    ElementEnd::CloseXml(_) | ElementEnd::CloseSvg(_) => "</",
                };
                write!(f, "ElementEnd({})", marker)
            }

            // Attributes.
            Token::XmlAttribute(name, ref value) => {
                write!(f, "XmlAttribute({}, {:?})", name, value)
            }
            Token::SvgAttribute(id, ref value) => {
                write!(f, "SvgAttribute({:?}, {:?})", id, value)
            }

            // Character data.
            Token::Text(ref frame) => write!(f, "Text({:?})", frame),
            Token::Cdata(ref frame) => write!(f, "CDATA({:?})", frame),
            Token::Whitespace(text) => write!(f, "Whitespace({})", text),
            Token::Comment(text) => write!(f, "Comment({})", text),

            // Prolog and DTD.
            Token::Declaration(text) => write!(f, "Declaration({})", text),
            Token::DtdEmpty(text) => write!(f, "DtdEmpty({})", text),
            Token::DtdStart(text) => write!(f, "DtdStart({})", text),
            Token::Entity(name, ref value) => write!(f, "ENTITY({}, {:?})", name, value),
        }
    }
}
/// The ways an element tag can end, carried by `Token::ElementEnd`.
#[derive(Debug,PartialEq,Clone)]
pub enum ElementEnd<'a> {
/// `>` terminating a start tag; the element stays open.
Open,
/// Closing tag `</name>` whose name `ElementId::from_name` does not
/// recognize; holds the text between `</` and `>`.
CloseXml(&'a str),
/// Closing tag `</name>` whose name `ElementId::from_name` recognizes.
CloseSvg(ElementId),
/// `/>` terminating an empty element.
Empty,
}
/// Internal state of `Tokenizer`; selects what `parse_next` does on the next call.
enum State {
/// Initial state: a leading 0xEF byte is treated as a 3-byte BOM and
/// skipped, then the tokenizer switches to `Unknown`.
AtStart,
/// General state: the next token kind is decided by looking at the
/// upcoming bytes of the stream.
Unknown,
/// Inside a DOCTYPE internal subset (after `[`); left when `]>` is read.
Dtd,
/// Inside a start tag, after the element name; left on `>` or `/>`.
Attributes,
/// End of the stream was reached; only `Token::EndOfStream` is returned.
Finished,
}
/// Pull tokenizer that turns SVG/XML text into a sequence of `Token`s,
/// one per `parse_next` call.
pub struct Tokenizer<'a> {
// Cursor over the text being tokenized.
stream: Stream<'a>,
// Selects the parsing routine used by the next `parse_next` call.
state: State,
// Number of currently open elements: incremented on every start tag,
// decremented on `/>` and on closing tags.
depth: u32,
// `Some(id)` while the attributes of a known SVG element are being read;
// reset to `None` when the tag ends. Attributes are resolved to
// `AttributeId`s only while this is `Some`.
curr_elem: Option<ElementId>,
}
impl<'a> Tokenize<'a> for Tokenizer<'a> {
    type Token = Token<'a>;

    /// Creates a tokenizer positioned at the start of `text`.
    fn from_str(text: &str) -> Tokenizer {
        Tokenizer {
            stream: Stream::from_str(text),
            state: State::AtStart,
            depth: 0,
            curr_elem: None,
        }
    }

    /// Creates a tokenizer positioned at the start of the frame `text`.
    fn from_frame(text: TextFrame<'a>) -> Tokenizer {
        Tokenizer {
            stream: Stream::from_frame(text),
            state: State::AtStart,
            depth: 0,
            curr_elem: None,
        }
    }

    /// Extracts the next token from the stream.
    ///
    /// Returns `Token::EndOfStream` once the input is exhausted and on every
    /// call after that.
    ///
    /// # Errors
    ///
    /// - `Error::UnexpectedClosingTag` when a closing tag is found while no
    ///   element is open.
    /// - `Error::InvalidSvgToken` when the upcoming data does not start any
    ///   supported construct.
    /// - Any error reported by the underlying stream or by the construct
    ///   specific parsers.
    fn parse_next(&mut self) -> Result<Token<'a>, Error> {
        match self.state {
            State::Unknown => {
                if self.stream.at_end() {
                    self.state = State::Finished;
                    return Ok(Token::EndOfStream);
                }

                // Order matters: the more specific `<?`, `<!--`, `<![`,
                // `<!DOCTYPE` and `</` prefixes must be tested before the
                // generic `<` that starts an element.
                if self.stream.starts_with(b"<?") {
                    self.parse_declaration()
                } else if self.stream.starts_with(b"<!--") {
                    self.parse_comment()
                } else if self.stream.starts_with(b"<![") {
                    self.parse_cdata()
                } else if self.stream.starts_with(b"<!DOCTYPE") {
                    self.parse_dtd()
                } else if self.stream.starts_with(b"</") {
                    // Closing tag: `</name>`.
                    self.stream.advance(2)?;
                    let tag_name = self.stream.read_to(b'>')?;
                    self.stream.advance(1)?;

                    if self.depth == 0 {
                        return Err(Error::UnexpectedClosingTag(self.stream.gen_error_pos()));
                    }
                    self.depth -= 1;
                    self.curr_elem = None;

                    let end = match ElementId::from_name(tag_name) {
                        Some(eid) => ElementEnd::CloseSvg(eid),
                        None => ElementEnd::CloseXml(tag_name),
                    };
                    Ok(Token::ElementEnd(end))
                } else if self.stream.is_char_eq_raw(b'<') {
                    self.depth += 1;
                    self.parse_element()
                } else if self.depth > 0 {
                    // Character data inside an element.
                    let start = self.stream.pos();
                    self.stream.skip_spaces();

                    if self.stream.is_char_eq(b'<')? {
                        // Nothing but whitespace before the next tag.
                        let text = self.stream.slice_region_raw(start, self.stream.pos());
                        Ok(Token::Whitespace(text))
                    } else {
                        // Real text: rewind so that the leading whitespace
                        // stays part of the text frame.
                        let b = self.stream.pos() - start;
                        self.stream.back(b)?;

                        let end = self.stream.pos() + self.stream.len_to(b'<')?;
                        let text_frame = self.stream.slice_frame_raw(self.stream.pos(), end);
                        self.stream.advance_raw(text_frame.len());
                        Ok(Token::Text(text_frame))
                    }
                } else if self.stream.is_space()? {
                    // Whitespace outside of any element is not reported.
                    // `depth` is unsigned and the `depth > 0` case was handled
                    // above, so this can only be a sanity check.
                    debug_assert_eq!(self.depth, 0);
                    self.stream.skip_spaces();
                    self.parse_next()
                } else {
                    Err(Error::InvalidSvgToken(self.stream.gen_error_pos()))
                }
            }
            State::Dtd => {
                self.parse_entity()
            }
            State::Attributes => {
                self.parse_attribute()
            }
            State::AtStart => {
                if self.stream.at_end() {
                    self.state = State::Finished;
                    return Ok(Token::EndOfStream);
                }

                // Skip the UTF-8 byte order mark. The whole EF BB BF sequence
                // is checked: 0xEF alone is also the lead byte of every
                // character in U+F000..=U+FFFF, which must not be dropped.
                if self.stream.starts_with(b"\xEF\xBB\xBF") {
                    self.stream.advance(3)?;
                }

                self.state = State::Unknown;
                self.parse_next()
            }
            State::Finished => {
                Ok(Token::EndOfStream)
            }
        }
    }
}
impl<'a> Tokenizer<'a> {
fn parse_declaration(&mut self) -> Result<Token<'a>, Error> {
debug_assert!(self.stream.starts_with(b"<?"));
if !self.stream.starts_with(b"<?xml ") {
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
self.stream.advance_raw(6);
let l = self.stream.len_to(b'?')?;
let s = self.stream.read_raw(l);
self.stream.consume_char(b'?')?;
self.stream.consume_char(b'>')?;
Ok(Token::Declaration(s))
}
fn parse_comment(&mut self) -> Result<Token<'a>, Error> {
self.stream.advance(4)?; let start_pos = self.stream.pos();
loop {
let len = self.stream.len_to(b'>')?;
if len < 2 {
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
self.stream.advance_raw(len);
if self.stream.char_at(-1)? == b'-' && self.stream.char_at(-2)? == b'-' {
break;
}
self.stream.advance(1)?;
}
let end_pos = self.stream.pos() - 2;
let s = self.stream.slice_region_raw(start_pos, end_pos);
self.stream.advance(1)?;
Ok(Token::Comment(s))
}
fn parse_cdata(&mut self) -> Result<Token<'a>, Error> {
self.stream.advance(9)?; let start_pos = self.stream.pos();
loop {
self.stream.jump_to(b']')?;
if self.stream.starts_with(b"]]>") {
break;
}
self.stream.advance(1)?;
}
let end = self.stream.pos();
self.stream.set_pos_raw(start_pos);
let text_frame = self.stream.slice_frame_raw(self.stream.pos(), end);
self.stream.set_pos_raw(end);
self.stream.advance(3)?;
Ok(Token::Cdata(text_frame))
}
fn parse_dtd(&mut self) -> Result<Token<'a>, Error> {
debug_assert!(self.stream.starts_with(b"<!DOCTYPE"));
self.stream.advance_raw(9); self.stream.consume_char(b' ')?;
let start = self.stream.pos();
let l = self.stream.slice_tail()
.as_bytes()
.into_iter()
.position(|x| *x == b'[' || *x == b'>');
match l {
Some(l) => self.stream.advance(l)?,
None => return Err(self.stream.gen_end_of_stream_error()),
}
if start == self.stream.pos() {
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
if self.stream.is_char_eq(b'>')? {
let text = self.stream.slice_region_raw(start, self.stream.pos());
self.stream.advance(1)?;
Ok(Token::DtdEmpty(text))
} else {
self.state = State::Dtd;
let text = self.stream.slice_region_raw(start, self.stream.pos() - 1);
self.stream.advance(1)?; self.stream.skip_spaces();
Ok(Token::DtdStart(text))
}
}
fn parse_entity(&mut self) -> Result<Token<'a>, Error> {
if self.stream.starts_with(b"<!ENTITY") {
self.stream.advance(9)?;
let key = self.stream.read_to(b' ')?;
self.stream.skip_spaces();
self.stream.consume_char(b'"')?;
let value_len = self.stream.len_to(b'"')?;
let text_frame = self.stream.slice_frame_raw(self.stream.pos(),
self.stream.pos() + value_len);
self.stream.advance_raw(value_len);
self.stream.consume_char(b'"')?;
self.stream.skip_spaces();
self.stream.consume_char(b'>')?;
self.stream.skip_spaces();
Ok(Token::Entity(key, text_frame))
} else if self.stream.starts_with(b"]>") {
self.stream.advance(2)?; self.state = State::Unknown;
Ok(Token::DtdEnd)
} else {
let l = self.stream.len_to(b'>')? + 1;
warnln!("Unsupported DOCTYPE object: '{}'.",
self.stream.slice_next_raw(l));
self.stream.advance_raw(l);
self.stream.skip_spaces();
self.parse_next()
}
}
fn parse_element(&mut self) -> Result<Token<'a>, Error> {
debug_assert!(self.stream.is_char_eq_raw(b'<'));
self.stream.advance(1)?;
let start_pos = self.stream.pos();
while !self.stream.at_end() && self.stream.is_ident_raw() {
self.stream.advance(1)?;
}
if !self.stream.at_end() {
if !self.stream.is_space_raw()
&& !self.stream.is_char_eq_raw(b'/')
&& !self.stream.is_char_eq_raw(b'>')
{
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
} else {
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
if start_pos == self.stream.pos() {
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
let tag_name = self.stream.slice_region_raw(start_pos, self.stream.pos());
self.stream.skip_spaces();
self.state = State::Attributes;
let token = match ElementId::from_name(tag_name) {
Some(eid) => {
self.curr_elem = Some(eid);
Token::SvgElementStart(eid)
}
None => {
self.curr_elem = None;
Token::XmlElementStart(tag_name)
}
};
Ok(token)
}
fn parse_attribute(&mut self) -> Result<Token<'a>, Error> {
if self.stream.is_char_eq(b'/')? {
self.depth -= 1;
self.stream.advance(2)?;
self.state = State::Unknown;
self.curr_elem = None;
return Ok(Token::ElementEnd(ElementEnd::Empty));
}
if self.stream.is_char_eq(b'>')? {
self.stream.advance_raw(1);
self.state = State::Unknown;
self.curr_elem = None;
return Ok(Token::ElementEnd(ElementEnd::Open));
}
self.stream.skip_spaces();
let name = {
let start = self.stream.pos();
while !self.stream.at_end() && self.stream.is_ident_raw() {
self.stream.advance(1)?;
}
let len = self.stream.pos() - start;
if len == 0 {
return Err(Error::InvalidSvgToken(self.stream.gen_error_pos()));
}
self.stream.slice_region_raw(start, start + len)
};
self.stream.skip_spaces();
self.stream.consume_char(b'=')?;
self.stream.skip_spaces();
if !(self.stream.is_char_eq(b'"')? || self.stream.is_char_eq(b'\'')?) {
return Err(Error::InvalidChar {
current: self.stream.curr_char_raw() as char,
expected: '"',
pos: self.stream.gen_error_pos(),
});
}
let quote = self.stream.curr_char()?;
self.stream.advance(1)?;
let end = self.stream.pos() + self.stream.len_to(quote)?;
let text_frame = self.stream.slice_frame_raw(self.stream.pos(), end);
self.stream.advance_raw(text_frame.len());
self.stream.advance(1)?;
self.stream.skip_spaces();
if let Some(_) = self.curr_elem {
if let Some(aid) = AttributeId::from_name(name) {
return Ok(Token::SvgAttribute(aid, text_frame));
}
}
Ok(Token::XmlAttribute(name, text_frame.slice()))
}
}