use super::{unescape, StartTag, EndTag};
use std::collections::{HashMap, VecDeque};
use std::error::Error;
use std::fmt;
use std::iter::Iterator;
use std::mem;
/// A single item produced by iterating over a `Parser`.
#[derive(PartialEq, Eq, Debug)]
pub enum Event {
/// Text of a processing instruction: everything between `<?` and `?>`.
PI(String),
/// A start tag. A self-closing tag (`<a/>`) yields this event first and an
/// `ElementEnd` for the same element right after.
ElementStart(StartTag),
/// An end tag, or the closing half of a self-closing tag.
ElementEnd(EndTag),
/// Character data found between tags, with entities already unescaped.
Characters(String),
/// Raw content of a `<![CDATA[ ... ]]>` section (not unescaped).
CDATA(String),
/// Text of a comment: everything between `<!--` and `-->`.
Comment(String)
}
/// Describes why and where parsing failed.
///
/// The position is the parser's line/column counter at the moment the
/// offending character was processed.
#[derive(PartialEq, Debug, Clone)]
#[allow(missing_copy_implementations)]
pub struct ParserError {
    /// Line on which the error was detected (the parser starts counting at 1).
    pub line: u32,
    /// Column on which the error was detected (reset to 0 after each newline).
    pub col: u32,
    /// Static, human-readable reason for the failure.
    pub msg: &'static str
}

impl Error for ParserError {
    /// Returns the bare reason, without position information.
    fn description(&self) -> &str {
        let reason: &'static str = self.msg;
        reason
    }
}

impl fmt::Display for ParserError {
    /// Renders the error as `Parse error; Line: L, Column: C, Reason: MSG`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let ParserError { line, col, msg } = *self;
        f.write_fmt(format_args!(
            "Parse error; Line: {}, Column: {}, Reason: {}",
            line, col, msg
        ))
    }
}
/// States of the character-driven parsing state machine.
///
/// `Parser::parse_character` dispatches every input character to the handler
/// that belongs to the current state.
enum State {
/// Between markup; characters are accumulated as text.
OutsideTag,
/// A `<` has just been read; the next character decides what kind of markup follows.
TagOpened,
/// Inside `<? ... ?>`.
InProcessingInstructions,
/// Reading the name of a start tag.
InTagName,
/// Reading the name of an end tag (after `</`).
InCloseTagName,
/// Inside a start tag after its name, between attributes.
InTag,
/// Reading an attribute name (up to `=`).
InAttrName,
/// Inside a quoted attribute value.
InAttrValue,
/// After `=`; waiting for the opening quote of the attribute value.
ExpectDelimiter,
/// After the `/` of a self-closing tag; only `>` is accepted.
ExpectClose,
/// After an end-tag name followed by whitespace; more whitespace or `>` is accepted.
ExpectSpaceOrClose,
/// After `<!`; the next character selects comment, CDATA or DOCTYPE.
InExclamationMark,
/// After `<![`; matching the remaining `CDATA[` characters.
InCDATAOpening,
/// Inside the content of a CDATA section.
InCDATA,
/// After `<!-`; waiting for the second `-`.
InCommentOpening,
/// Inside the body of a comment.
InComment1,
/// Two consecutive `-` were seen inside a comment; only `>` is accepted.
InComment2,
/// After `<!D`; matching `OCTYPE` and then skipping up to the next `>`.
InDoctype
}
/// Incremental XML parser.
///
/// Input is supplied with `feed_str`; events are obtained by iterating over
/// the parser (see the `Iterator` implementation).
pub struct Parser {
// Current position in the input, reported in `ParserError`.
line: u32,
col: u32,
// Set once an error has been returned; afterwards the iterator yields nothing.
has_error: bool,
// Characters that were fed but not parsed yet.
data: VecDeque<char>,
// Accumulates the text of the token currently being read.
buf: String,
// Stack of prefix -> namespace URI maps; the innermost scope is last.
namespaces: Vec<HashMap<String, String>>,
// Attributes collected for the current start tag as (local name, prefix, value).
attributes: Vec<(String, Option<String>, String)>,
// Current state of the state machine.
st: State,
// (prefix, local name) of the element whose start tag is being parsed.
name: Option<(Option<String>, String)>,
// (prefix, local name) of the attribute whose value is still pending.
attr: Option<(Option<String>, String)>,
// Quote character that opened the attribute value currently being read.
delim: Option<char>,
// Scratch counter shared by several states (e.g. position in the `CDATA[` /
// `OCTYPE` patterns, number of consecutive `]` or `-`, "seen `?`" flag).
level: u8
}
impl Parser {
    /// Creates a parser with no pending input.
    ///
    /// The position starts at line 1, column 0, and the outermost namespace
    /// scope already binds the reserved `xml` and `xmlns` prefixes.
    pub fn new() -> Parser {
        let reserved = [
            ("xml", "http://www.w3.org/XML/1998/namespace"),
            ("xmlns", "http://www.w3.org/2000/xmlns/"),
        ];
        let mut root_scope = HashMap::with_capacity(2);
        for &(prefix, uri) in reserved.iter() {
            root_scope.insert(prefix.to_string(), uri.to_string());
        }

        Parser {
            st: State::OutsideTag,
            line: 1,
            col: 0,
            level: 0,
            has_error: false,
            data: VecDeque::with_capacity(4096),
            buf: String::new(),
            namespaces: vec![root_scope],
            attributes: Vec::new(),
            name: None,
            attr: None,
            delim: None
        }
    }

    /// Queues the characters of `data` for parsing.
    ///
    /// Nothing is parsed here; the queued input is consumed when the parser
    /// is iterated.
    pub fn feed_str(&mut self, data: &str) {
        for ch in data.chars() {
            self.data.push_back(ch);
        }
    }
}
impl Iterator for Parser {
    type Item = Result<Event, ParserError>;

    /// Consumes queued characters until one of them completes an event.
    ///
    /// Returns `None` when the queue runs dry before an event is complete, or
    /// on every call made after an error has been reported. The first error is
    /// returned once as `Some(Err(..))`.
    fn next(&mut self) -> Option<Result<Event, ParserError>> {
        if self.has_error {
            return None;
        }

        while let Some(c) = self.data.pop_front() {
            // Track the position before handing the character to the state machine.
            match c {
                '\n' => {
                    self.line += 1;
                    self.col = 0;
                }
                _ => self.col += 1,
            }

            match self.parse_character(c) {
                Ok(Some(event)) => return Some(Ok(event)),
                Ok(None) => {}
                Err(e) => {
                    self.has_error = true;
                    return Some(Err(e));
                }
            }
        }

        None
    }
}
/// Splits a qualified name at its first `:` into `(prefix, local name)`.
///
/// A name without a colon has no prefix. Only the first colon separates, so
/// `"a:b:c"` yields prefix `"a"` and local name `"b:c"`.
#[inline]
fn parse_qname(qname: &str) -> (Option<String>, String) {
    let mut pieces = qname.splitn(2, ':');
    // `splitn` always yields at least one piece, even for an empty string.
    let head = pieces.next().unwrap_or(qname);
    match pieces.next() {
        Some(local) => (Some(head.to_string()), local.to_string()),
        None => (None, head.to_string()),
    }
}
impl Parser {
/// Looks up the namespace URI bound to `prefix`, innermost scope first.
///
/// Returns `None` when no scope binds the prefix, or when the innermost
/// binding is the empty string.
fn namespace_for_prefix(&self, prefix: &str) -> Option<String> {
    let innermost = self.namespaces
        .iter()
        .rev()
        .filter_map(|scope| scope.get(prefix))
        .next();
    match innermost {
        Some(uri) if !uri.is_empty() => Some(uri.clone()),
        _ => None,
    }
}
/// Builds an `Err` carrying `msg` and the parser's current line and column.
fn error(&self, msg: &'static str) -> Result<Option<Event>, ParserError> {
    let failure = ParserError {
        msg: msg,
        line: self.line,
        col: self.col
    };
    Err(failure)
}
fn parse_character(&mut self, c: char) -> Result<Option<Event>, ParserError> {
match self.st {
State::OutsideTag => self.outside_tag(c),
State::TagOpened => self.tag_opened(c),
State::InProcessingInstructions => self.in_processing_instructions(c),
State::InTagName => self.in_tag_name(c),
State::InCloseTagName => self.in_close_tag_name(c),
State::InTag => self.in_tag(c),
State::InAttrName => self.in_attr_name(c),
State::InAttrValue => self.in_attr_value(c),
State::ExpectDelimiter => self.expect_delimiter(c),
State::ExpectClose => self.expect_close(c),
State::ExpectSpaceOrClose => self.expect_space_or_close(c),
State::InExclamationMark => self.in_exclamation_mark(c),
State::InCDATAOpening => self.in_cdata_opening(c),
State::InCDATA => self.in_cdata(c),
State::InCommentOpening => self.in_comment_opening(c),
State::InComment1 => self.in_comment1(c),
State::InComment2 => self.in_comment2(c),
State::InDoctype => self.in_doctype(c),
}
}
/// Handles a character found between tags.
///
/// Anything but `<` is appended to the text buffer. A `<` switches to
/// `TagOpened`; if text was buffered, it is unescaped and emitted as a
/// `Characters` event, or reported as an error if unescaping fails.
fn outside_tag(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c != '<' {
        self.buf.push(c);
        return Ok(None);
    }

    self.st = State::TagOpened;
    if self.buf.is_empty() {
        return Ok(None);
    }

    match unescape(&self.buf) {
        Ok(text) => {
            self.buf.clear();
            Ok(Some(Event::Characters(text)))
        }
        Err(_) => self.error("Found invalid entity"),
    }
}
/// Decides what kind of markup follows a `<`.
///
/// `?` starts a processing instruction, `!` a comment/CDATA/DOCTYPE and `/`
/// an end tag. Any other character is taken as the first character of a
/// start-tag name.
fn tag_opened(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    let next = if c == '?' {
        State::InProcessingInstructions
    } else if c == '!' {
        State::InExclamationMark
    } else if c == '/' {
        State::InCloseTagName
    } else {
        self.buf.push(c);
        State::InTagName
    };
    self.st = next;
    Ok(None)
}
/// Handles a character inside a processing instruction (`<? ... ?>`).
///
/// Characters are buffered until the terminating `?>` is seen, at which point
/// the buffered text (without the trailing `?`) is emitted as `Event::PI`.
///
/// `self.level` serves as a "previous character was `?`" flag. It is cleared
/// by every other character, so only a `>` that directly follows a `?` ends
/// the instruction; a lone `?` earlier in the content no longer makes a later
/// `>` terminate it.
fn in_processing_instructions(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    match c {
        '?' => {
            self.level = 1;
            self.buf.push(c);
        }
        '>' if self.level == 1 => {
            self.level = 0;
            self.st = State::OutsideTag;
            // Drop the `?` of the closing `?>`; it is not part of the content.
            let _ = self.buf.pop();
            let buf = mem::replace(&mut self.buf, String::new());
            return Ok(Some(Event::PI(buf)));
        }
        _ => {
            // Any other character breaks a potential `?>` sequence.
            self.level = 0;
            self.buf.push(c);
        }
    }
    Ok(None)
}
/// Handles a character while the name of a start tag is being read.
///
/// * Whitespace ends the name: a new namespace scope is opened, the name is
///   stored and parsing continues in `InTag` (attributes may follow).
/// * `/` or `>` ends an attribute-less tag: the namespace is resolved, a new
///   scope is opened and `ElementStart` is emitted. After `/` the name is kept
///   so that `expect_close` can emit the matching `ElementEnd`.
/// * Anything else is appended to the name.
///
/// Fails if the tag uses a prefix that is not bound to a namespace.
fn in_tag_name(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    let is_space = c == ' ' || c == '\t' || c == '\r' || c == '\n';
    if is_space {
        self.namespaces.push(HashMap::new());
        self.name = Some(parse_qname(&self.buf));
        self.buf.clear();
        self.st = State::InTag;
        return Ok(None);
    }

    if c != '/' && c != '>' {
        self.buf.push(c);
        return Ok(None);
    }

    let (prefix, name) = parse_qname(&self.buf);
    self.buf.clear();

    let ns = match prefix {
        Some(ref pre) => match self.namespace_for_prefix(pre) {
            Some(uri) => Some(uri),
            None => return self.error("Unbound namespace prefix in tag name"),
        },
        None => self.namespace_for_prefix(""),
    };

    self.namespaces.push(HashMap::new());
    if c == '/' {
        self.name = Some((prefix.clone(), name.clone()));
        self.st = State::ExpectClose;
    } else {
        self.st = State::OutsideTag;
    }

    Ok(Some(Event::ElementStart(StartTag {
        name: name,
        ns: ns,
        prefix: prefix,
        attributes: HashMap::new()
    })))
}
/// Handles a character while the name of an end tag (`</name`) is being read.
///
/// Whitespace or `>` ends the name: the namespace is resolved against the
/// scopes still open, the element's scope is closed and `ElementEnd` is
/// emitted. After whitespace the parser waits for the final `>` in
/// `ExpectSpaceOrClose`. Any other character is appended to the name.
///
/// The outermost namespace scope (which binds the reserved `xml` and `xmlns`
/// prefixes) is never popped, so an unmatched end tag cannot leave those
/// prefixes unbound for the rest of the stream.
///
/// Fails if the tag uses a prefix that is not bound to a namespace.
fn in_close_tag_name(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    match c {
        ' '
        | '\t'
        | '\r'
        | '\n'
        | '>' => {
            let (prefix, name) = parse_qname(&self.buf);
            self.buf.truncate(0);
            // Resolve before closing the scope: the element's own xmlns
            // declarations live in the scope that is about to be popped.
            let ns = match prefix {
                None => self.namespace_for_prefix(""),
                Some(ref pre) => match self.namespace_for_prefix(&pre) {
                    None => return self.error("Unbound namespace prefix in tag name"),
                    ns => ns
                }
            };
            // Keep the root scope even when this end tag has no matching start tag.
            if self.namespaces.len() > 1 {
                self.namespaces.pop();
            }
            self.st = if c == '>' {
                State::OutsideTag
            } else {
                State::ExpectSpaceOrClose
            };
            Ok(Some(Event::ElementEnd(EndTag { name: name, ns: ns, prefix: prefix })))
        }
        _ => {
            self.buf.push(c);
            Ok(None)
        }
    }
}
/// Handles a character inside a start tag, after its name.
///
/// * Whitespace is skipped.
/// * `/` or `>` finishes the tag: the element and attribute prefixes are
///   resolved to namespaces and `ElementStart` is emitted. After `/` the name
///   is kept so that `expect_close` can emit the matching `ElementEnd`.
/// * Anything else starts an attribute name.
///
/// Fails on an unbound element or attribute prefix and on duplicate
/// attributes (same local name and namespace).
///
/// # Panics
///
/// Panics if no element name has been stored, which indicates an internal error.
fn in_tag(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
        return Ok(None);
    }
    if c != '/' && c != '>' {
        self.buf.push(c);
        self.st = State::InAttrName;
        return Ok(None);
    }

    let pending = mem::replace(&mut self.attributes, Vec::new());
    let (prefix, name) = self.name.take().expect("Internal error: No element name set");

    let ns = match prefix {
        Some(ref pre) => match self.namespace_for_prefix(pre) {
            Some(uri) => Some(uri),
            None => return self.error("Unbound namespace prefix in tag name"),
        },
        None => self.namespace_for_prefix(""),
    };

    // Attributes without a prefix are in no namespace; prefixed ones must resolve.
    let mut resolved: HashMap<(String, Option<String>), String> = HashMap::new();
    for (attr_name, attr_prefix, attr_value) in pending {
        let attr_ns = match attr_prefix {
            Some(ref pre) => match self.namespace_for_prefix(pre) {
                Some(uri) => Some(uri),
                None => return self.error("Unbound namespace prefix in attribute name"),
            },
            None => None,
        };
        let previous = resolved.insert((attr_name, attr_ns), attr_value);
        if previous.is_some() {
            return self.error("Duplicate attribute");
        }
    }

    if c == '/' {
        self.name = Some((prefix.clone(), name.clone()));
        self.st = State::ExpectClose;
    } else {
        self.st = State::OutsideTag;
    }

    Ok(Some(Event::ElementStart(StartTag {
        name: name,
        ns: ns,
        prefix: prefix,
        attributes: resolved
    })))
}
/// Handles a character while an attribute name is being read.
///
/// `=` ends the name and moves on to `ExpectDelimiter`. Whitespace after the
/// name is tolerated (remembered via `self.level`), but any further name
/// character after such whitespace is an error.
fn in_attr_name(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c == '=' {
        self.level = 0;
        self.attr = Some(parse_qname(&self.buf));
        self.buf.clear();
        self.st = State::ExpectDelimiter;
    } else if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
        self.level = 1;
    } else if self.level == 0 {
        self.buf.push(c);
    } else {
        return self.error("Space occured in attribute name");
    }
    Ok(None)
}
/// Handles a character inside a quoted attribute value.
///
/// Characters are buffered until the quote that opened the value is seen
/// again. The value is then unescaped and recorded as an attribute of the
/// current tag. `xmlns` and `xmlns:prefix` attributes additionally register
/// a namespace binding in the innermost scope.
///
/// Fails if the value contains an invalid entity.
///
/// # Panics
///
/// Panics if the delimiter, the attribute name or the namespace stack is
/// missing, which indicates an internal error.
fn in_attr_value(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    let closing = self.delim.expect("Internal error: In attribute value, but no delimiter set");
    if c != closing {
        self.buf.push(c);
        return Ok(None);
    }

    self.delim = None;
    self.st = State::InTag;
    let (prefix, name) = self.attr
        .take()
        .expect("Internal error: In attribute value, but no attribute name set");

    let value = match unescape(&self.buf) {
        Err(_) => return self.error("Found invalid entity"),
        Ok(text) => text,
    };
    self.buf.clear();

    {
        let scope = self.namespaces.last_mut().expect("Internal error: Empty namespace stack");
        // `xmlns="..."` binds the default (empty) prefix, `xmlns:p="..."` binds `p`.
        let declared = match prefix {
            None if name == "xmlns" => Some(String::new()),
            Some(ref pre) if *pre == "xmlns" => Some(name.clone()),
            _ => None,
        };
        if let Some(bound_prefix) = declared {
            scope.insert(bound_prefix, value.clone());
        }
    }

    self.attributes.push((name, prefix, value));
    Ok(None)
}
/// Handles a character after the `=` of an attribute.
///
/// A single or double quote is remembered as the delimiter and starts the
/// value. Whitespace is skipped. Anything else is an error.
fn expect_delimiter(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c == '"' || c == '\'' {
        self.delim = Some(c);
        self.st = State::InAttrValue;
        Ok(None)
    } else if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
        Ok(None)
    } else {
        self.error("Attribute value not enclosed in ' or \"")
    }
}
/// Handles the character after the `/` of a self-closing tag.
///
/// Only `>` is accepted. It emits the `ElementEnd` for the element whose
/// `ElementStart` was emitted at the `/`, and closes that element's
/// namespace scope.
///
/// # Panics
///
/// Panics if no element name has been stored, which indicates an internal error.
fn expect_close(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c != '>' {
        return self.error("Expected '>' to close tag");
    }

    self.st = State::OutsideTag;
    let (prefix, name) = self.name.take().expect("Internal error: No element name set");

    let ns = match prefix {
        Some(ref pre) => match self.namespace_for_prefix(pre) {
            Some(uri) => Some(uri),
            None => return self.error("Unbound namespace prefix in tag name"),
        },
        None => self.namespace_for_prefix(""),
    };

    self.namespaces.pop();
    let closing = EndTag { name: name, ns: ns, prefix: prefix };
    Ok(Some(Event::ElementEnd(closing)))
}
/// Handles a character after an end-tag name that was followed by whitespace.
///
/// More whitespace is skipped and `>` returns to `OutsideTag`. Anything else
/// is an error. No event is produced here; `ElementEnd` was already emitted
/// when the name ended.
fn expect_space_or_close(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c == '>' {
        self.st = State::OutsideTag;
        return Ok(None);
    }
    if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
        return Ok(None);
    }
    self.error("Expected '>' to close tag, or LWS")
}
/// Handles the character after `<!`.
///
/// `-` begins a comment, `[` a CDATA section and `D` a DOCTYPE declaration.
/// Anything else is an error and leaves the state unchanged.
fn in_exclamation_mark(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c == '-' {
        self.st = State::InCommentOpening;
    } else if c == '[' {
        self.st = State::InCDATAOpening;
    } else if c == 'D' {
        self.st = State::InDoctype;
    } else {
        return self.error("Malformed XML");
    }
    Ok(None)
}
/// Matches the `CDATA[` that must follow `<![`.
///
/// `self.level` is the index of the next expected pattern character. Once
/// the whole pattern has matched, the counter is reset and parsing continues
/// in `InCDATA`. A mismatching character is an error.
fn in_cdata_opening(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    static CDATA_PATTERN: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];

    let expected = CDATA_PATTERN[self.level as usize];
    if c != expected {
        return self.error("Invalid CDATA opening sequence");
    }

    self.level += 1;
    if self.level as usize == CDATA_PATTERN.len() {
        self.level = 0;
        self.st = State::InCDATA;
    }
    Ok(None)
}
/// Handles a character inside a CDATA section.
///
/// Everything is buffered verbatim until the terminating `]]>` is seen; the
/// buffered text without the trailing `]]` is then emitted as `Event::CDATA`.
///
/// `self.level` counts the `]` characters that directly precede the current
/// position. Only "at least two" matters, so the count is capped at 2: an
/// unbounded increment would overflow the `u8` counter on a run of 256 or
/// more `]` (a panic in debug builds, a wrap-around that misses the
/// terminator in release builds).
fn in_cdata(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    match c {
        ']' => {
            self.buf.push(c);
            if self.level < 2 {
                self.level += 1;
            }
        }
        '>' if self.level >= 2 => {
            self.st = State::OutsideTag;
            self.level = 0;
            // The buffer ends with the `]]` of the terminator; strip it.
            let len = self.buf.len();
            self.buf.truncate(len - 2);
            let buf = mem::replace(&mut self.buf, String::new());
            return Ok(Some(Event::CDATA(buf)))
        }
        _ => {
            self.buf.push(c);
            self.level = 0;
        }
    }
    Ok(None)
}
/// Handles the character after `<!-`, which must be the second `-`.
///
/// On success the dash counter is reset and parsing continues in `InComment1`.
fn in_comment_opening(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c != '-' {
        return self.error("Expected 2nd '-' to start comment");
    }
    self.st = State::InComment1;
    self.level = 0;
    Ok(None)
}
/// Handles a character inside the body of a comment.
///
/// Every character is buffered. `self.level` counts consecutive `-`; once
/// two have been seen in a row the counter is reset and parsing continues in
/// `InComment2`, which expects the closing `>`.
fn in_comment1(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    self.buf.push(c);

    let dashes = if c == '-' { self.level + 1 } else { 0 };
    if dashes == 2 {
        self.level = 0;
        self.st = State::InComment2;
    } else {
        self.level = dashes;
    }
    Ok(None)
}
/// Handles the character following `--` inside a comment.
///
/// Only `>` is accepted. It ends the comment and emits `Event::Comment` with
/// the buffered text minus the trailing `--`. Anything else is an error.
fn in_comment2(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    if c == '>' {
        self.st = State::OutsideTag;
        let mut text = mem::replace(&mut self.buf, String::new());
        // Strip the `--` of the terminator that was buffered by `in_comment1`.
        let keep = text.len() - 2;
        text.truncate(keep);
        Ok(Some(Event::Comment(text)))
    } else {
        self.error("No more than one adjacent '-' allowed in a comment")
    }
}
/// Handles a character inside a DOCTYPE declaration (after `<!D`).
///
/// `self.level` tracks progress:
///
/// * below the pattern length: the character must be the next one of `OCTYPE`;
/// * equal to the pattern length: one whitespace character is required;
/// * above: everything is skipped until the next `>`, which resets the
///   counter and returns to `OutsideTag`.
///
/// No event is produced for a DOCTYPE. The comparisons against the pattern
/// length stand in for the deprecated `0...5` range pattern, which is a hard
/// error in the 2021 edition.
fn in_doctype(&mut self, c: char) -> Result<Option<Event>, ParserError> {
    static DOCTYPE_PATTERN: [char; 6] = ['O', 'C', 'T', 'Y', 'P', 'E'];

    let matched = self.level as usize;
    if matched < DOCTYPE_PATTERN.len() {
        if c != DOCTYPE_PATTERN[matched] {
            return self.error("Invalid DOCTYPE");
        }
        self.level += 1;
    } else if matched == DOCTYPE_PATTERN.len() {
        match c {
            ' '
            | '\t'
            | '\r'
            | '\n' => self.level += 1,
            _ => return self.error("Invalid DOCTYPE")
        }
    } else if c == '>' {
        self.level = 0;
        self.st = State::OutsideTag;
    }
    Ok(None)
}
}
#[cfg(test)]
mod parser_tests {
    use std::collections::HashMap;
    use super::Parser;
    use super::super::{Event, ParserError, StartTag, EndTag};

    type Attributes = HashMap<(String, Option<String>), String>;

    /// Feeds `input` to a fresh parser and gathers everything it yields.
    fn parse_all(input: &str) -> Vec<Result<Event, ParserError>> {
        let mut parser = Parser::new();
        parser.feed_str(input);
        parser.collect()
    }

    /// Builds the expected `ElementStart` result.
    fn start(name: &str, ns: Option<&str>, prefix: Option<&str>, attributes: Attributes)
             -> Result<Event, ParserError> {
        Ok(Event::ElementStart(StartTag {
            name: name.to_string(),
            ns: ns.map(|s| s.to_string()),
            prefix: prefix.map(|s| s.to_string()),
            attributes: attributes
        }))
    }

    /// Builds the expected `ElementEnd` result.
    fn end(name: &str, ns: Option<&str>, prefix: Option<&str>) -> Result<Event, ParserError> {
        Ok(Event::ElementEnd(EndTag {
            name: name.to_string(),
            ns: ns.map(|s| s.to_string()),
            prefix: prefix.map(|s| s.to_string())
        }))
    }

    #[test]
    fn test_start_tag() {
        assert_eq!(parse_all("<a>"), vec![start("a", None, None, HashMap::new())]);
    }

    #[test]
    fn test_end_tag() {
        assert_eq!(parse_all("</a>"), vec![end("a", None, None)]);
    }

    #[test]
    fn test_self_closing_with_space() {
        let expected = vec![
            start("register", None, None, HashMap::new()),
            end("register", None, None),
        ];
        assert_eq!(parse_all("<register />"), expected);
    }

    #[test]
    fn test_self_closing_without_space() {
        let expected = vec![
            start("register", None, None, HashMap::new()),
            end("register", None, None),
        ];
        assert_eq!(parse_all("<register/>"), expected);
    }

    #[test]
    fn test_self_closing_namespace() {
        // The xmlns:foo declaration itself shows up as an attribute in the xmlns namespace.
        let mut attr: Attributes = HashMap::new();
        attr.insert(
            ("foo".to_string(), Some("http://www.w3.org/2000/xmlns/".to_string())),
            "urn:foo".to_string(),
        );
        let expected = vec![
            start("a", Some("urn:foo"), Some("foo"), attr),
            end("a", Some("urn:foo"), Some("foo")),
        ];
        assert_eq!(parse_all("<foo:a xmlns:foo='urn:foo'/>"), expected);
    }

    #[test]
    fn test_pi() {
        let expected = vec![Ok(Event::PI("xml version='1.0' encoding='utf-8'".to_string()))];
        assert_eq!(parse_all("<?xml version='1.0' encoding='utf-8'?>"), expected);
    }

    #[test]
    fn test_comment() {
        let expected = vec![Ok(Event::Comment("Nothing to see".to_string()))];
        assert_eq!(parse_all("<!--Nothing to see-->"), expected);
    }

    #[test]
    fn test_cdata() {
        let expected = vec![
            Ok(Event::CDATA("<html><head><title>x</title></head><body/></html>".to_string())),
        ];
        assert_eq!(
            parse_all("<![CDATA[<html><head><title>x</title></head><body/></html>]]>"),
            expected
        );
    }

    #[test]
    fn test_characters() {
        let events = parse_all("<text>Hello World, it's a nice day</text>");
        assert_eq!(events.len(), 3);
        assert_eq!(events[1], Ok(Event::Characters("Hello World, it's a nice day".to_string())));
    }

    #[test]
    fn test_doctype() {
        // A DOCTYPE declaration is consumed silently.
        assert_eq!(parse_all("<!DOCTYPE html>").len(), 0);
    }
}