use super::{XmlTokenizer, TokenSink};
use util::{is_ascii_alnum};
use tendril::StrTendril;
use std::char::from_u32;
use std::borrow::Cow::Borrowed;
pub use self::Status::*;
use self::State::*;
mod data;
/// The outcome of tokenizing one character reference.
///
/// `chars` always has two slots, but only the first `num_chars` of them are
/// meaningful; the tokenizer fills unused slots with `'\0'`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s).
    pub chars: [char; 2],
    /// How many leading slots of `chars` are valid (0, 1 or 2).
    pub num_chars: u8,
}
/// What a single tokenizer step achieved.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// No progress could be made; every `unwrap_or_return!(…, Stuck)` site
    /// yields this, presumably when no input character is available yet.
    Stuck,
    /// The tokenizer advanced but has not produced a result yet.
    Progress,
    /// A result has been stored and can be fetched with `get_result`.
    Done,
}
/// Internal state machine of the character reference tokenizer.
#[derive(Debug)]
enum State {
    /// Nothing after the `&` has been examined yet.
    Begin,
    /// A `#` has been consumed; the next character picks the numeric base.
    Octothorpe,
    /// Reading digits in the given base (10 or 16).
    Numeric(u32),
    /// Digits are finished; an optional `;` terminator is expected.
    NumericSemicolon,
    /// Accumulating a candidate name and matching it against the entity table.
    Named,
    /// The name matched nothing; consuming the rest of the alphanumeric run.
    BogusName,
}
/// Incremental tokenizer for a single character reference.
///
/// It is driven by repeated calls to `step` (and `end_of_file` when input
/// runs out); once a result exists it is retrieved with `get_result`.
pub struct CharRefTokenizer {
    /// Current position in the state machine.
    state: State,
    /// Extra character that, when seen first, ends the reference with no
    /// result. When it is `Some`, `finish_named` also applies its stricter
    /// handling of a following `=` or alphanumeric character.
    addnl_allowed: Option<char>,
    /// The finished reference, set exactly when tokenizing is complete.
    result: Option<CharRef>,

    // --- numeric references ---
    /// Value accumulated from the digits read so far (wrapping arithmetic).
    num: u32,
    /// Set once the accumulated value has exceeded 0x10FFFF.
    num_too_big: bool,
    /// Whether at least one digit has been consumed.
    seen_digit: bool,
    /// The `x` / `X` that introduced a hexadecimal reference, if any; kept so
    /// it can be handed back if no digits follow.
    hex_marker: Option<char>,

    // --- named references ---
    /// Characters consumed so far while matching a name.
    name_buf_opt: Option<StrTendril>,
    /// Code point pair of the longest complete name matched so far.
    name_match: Option<(u32, u32)>,
    /// Length of `name_buf_opt`, in bytes, at the time of that match.
    name_len: usize,
}
impl CharRefTokenizer {
    /// Creates a tokenizer in its initial (`Begin`) state.
    ///
    /// `addnl_allowed` is an extra character that, when it is the first
    /// character seen, finishes the reference immediately with no result
    /// characters.
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        CharRefTokenizer {
            // General state.
            addnl_allowed: addnl_allowed,
            state: Begin,
            result: None,
            // Numeric reference bookkeeping.
            hex_marker: None,
            seen_digit: false,
            num_too_big: false,
            num: 0,
            // Named reference bookkeeping.
            name_len: 0,
            name_match: None,
            name_buf_opt: None,
        }
    }

    /// Consumes the tokenizer and returns the finished reference.
    ///
    /// # Panics
    ///
    /// Panics if no result has been produced yet.
    pub fn get_result(self) -> CharRef {
        let CharRefTokenizer { result, .. } = self;
        result.expect("get_result called before done")
    }

    /// Shared access to the name buffer.
    ///
    /// Panics if the buffer is absent, i.e. outside of named-reference
    /// processing or after it was handed back by `unconsume_name`.
    fn name_buf<'t>(&'t self) -> &'t StrTendril {
        let slot = self.name_buf_opt.as_ref();
        slot.expect("name_buf missing in named character reference")
    }

    /// Mutable access to the name buffer; panics like `name_buf` when absent.
    fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril {
        let slot = self.name_buf_opt.as_mut();
        slot.expect("name_buf missing in named character reference")
    }

    /// Records an empty result (no characters) and reports `Done`.
    fn finish_none(&mut self) -> Status {
        let empty = CharRef {
            num_chars: 0,
            chars: ['\0', '\0'],
        };
        self.result = Some(empty);
        Done
    }

    /// Records a single-character result and reports `Done`.
    fn finish_one(&mut self, c: char) -> Status {
        let single = CharRef {
            num_chars: 1,
            chars: [c, '\0'],
        };
        self.result = Some(single);
        Done
    }
}
impl CharRefTokenizer {
/// Advances the state machine by one step.
///
/// Returns `Done` straight away when a result already exists; otherwise
/// dispatches to the handler for the current state and returns its status.
pub fn step<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    // Once finished, stay finished: never run a handler again.
    match self.result {
        Some(_) => return Done,
        None => {}
    }

    debug!("char ref tokenizer stepping in state {:?}", self.state);

    match self.state {
        Named => self.do_named(tokenizer),
        BogusName => self.do_bogus_name(tokenizer),
        Numeric(base) => self.do_numeric(tokenizer, base),
        NumericSemicolon => self.do_numeric_semicolon(tokenizer),
        Octothorpe => self.do_octothorpe(tokenizer),
        Begin => self.do_begin(tokenizer),
    }
}
/// Handles the `Begin` state by peeking at the first character.
///
/// Whitespace, `<`, `&` and the additional allowed character finish with an
/// empty result without consuming anything. `#` is consumed and starts a
/// numeric reference. Anything else starts a named reference with a fresh
/// name buffer, leaving the character for `do_named` to read.
fn do_begin<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    let next = unwrap_or_return!(tokenizer.peek(), Stuck);

    // Checked up front; it yields the same empty result as the terminator
    // characters below, so the relative order does not matter.
    if self.addnl_allowed == Some(next) {
        return self.finish_none();
    }

    match next {
        '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
        '#' => {
            tokenizer.discard_char();
            self.state = Octothorpe;
            Progress
        }
        _ => {
            self.name_buf_opt = Some(StrTendril::new());
            self.state = Named;
            Progress
        }
    }
}
/// Handles the state right after `#`: selects the numeric base.
///
/// An `x` or `X` is consumed and remembered in `hex_marker`, so it can be
/// handed back later, and selects base 16. Any other character is left in
/// place and selects base 10.
fn do_octothorpe<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    let c = unwrap_or_return!(tokenizer.peek(), Stuck);

    if c == 'x' || c == 'X' {
        tokenizer.discard_char();
        self.hex_marker = Some(c);
        self.state = Numeric(16);
    } else {
        self.hex_marker = None;
        self.state = Numeric(10);
    }

    Progress
}
/// Handles the `Numeric` state: consumes one digit of the given `base`.
///
/// A digit is folded into `num`. A non-digit is left in the input: if no
/// digit has been seen the reference is abandoned through
/// `unconsume_numeric`, otherwise the state moves to `NumericSemicolon`.
fn do_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, base: u32) -> Status {
    let c = unwrap_or_return!(tokenizer.peek(), Stuck);

    let digit = match c.to_digit(base) {
        Some(d) => d,
        None => {
            if self.seen_digit {
                self.state = NumericSemicolon;
                return Progress;
            }
            return self.unconsume_numeric(tokenizer);
        }
    };

    tokenizer.discard_char();

    // Arithmetic wraps on purpose; `num_too_big` remembers that the value
    // left the valid range, so a later wrap cannot hide it.
    let scaled = self.num.wrapping_mul(base);
    if scaled > 0x10FFFF {
        self.num_too_big = true;
    }
    self.num = scaled.wrapping_add(digit);
    self.seen_digit = true;
    Progress
}
/// Handles the state after the digits of a numeric reference.
///
/// A `;` is consumed; any other character is left in place and reported as
/// an error. Either way the numeric reference is then finished.
fn do_numeric_semicolon<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    let c = unwrap_or_return!(tokenizer.peek(), Stuck);

    if c == ';' {
        tokenizer.discard_char();
    } else {
        tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference"));
    }

    self.finish_numeric(tokenizer)
}
/// Abandons a numeric reference that has no digits.
///
/// Hands the already-consumed `#` (plus the `x`/`X` marker, if one was
/// read) back to the tokenizer, reports an error and finishes with an
/// empty result.
fn unconsume_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    let mut pushed_back = StrTendril::from_char('#');
    if let Some(marker) = self.hex_marker {
        pushed_back.push_char(marker);
    }

    tokenizer.unconsume(pushed_back);
    tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
    self.finish_none()
}
fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
fn conv(n: u32) -> char {
from_u32(n).expect("invalid char missed by error handling cases")
}
let (c, error) = match self.num {
n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
0x00 | 0xD800...0xDFFF => ('\u{fffd}', true),
0x80...0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
Some(c) => (c, true),
None => (conv(self.num), true),
},
0x01...0x08 | 0x0B | 0x0D...0x1F | 0x7F | 0xFDD0...0xFDEF
=> (conv(self.num), true),
n if (n & 0xFFFE) == 0xFFFE
=> (conv(n), true),
n => (conv(n), false),
};
if error {
let msg = format_if!(tokenizer.opts.exact_errors,
"Invalid numeric character reference",
"Invalid numeric character reference value 0x{:06X}", self.num);
tokenizer.emit_error(msg);
}
self.finish_one(c)
}
/// Handles the `Named` state: consumes one character of a candidate name.
///
/// The character is appended to the name buffer and the buffer is looked up
/// in `data::NAMED_ENTITIES`. While the buffer is still a key of the table,
/// matching continues; an entry whose first code point is non-zero is
/// remembered as the best match so far (a zero first code point presumably
/// marks a mere prefix of a longer name - TODO confirm against `data`).
/// Once the buffer is no longer a key, the name is finished.
fn do_named<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    let c = unwrap_or_return!(tokenizer.get_char(), Stuck);
    self.name_buf_mut().push_char(c);

    let found = data::NAMED_ENTITIES.get(&self.name_buf()[..]).map(|&m| m);
    let entry = match found {
        Some(entry) => entry,
        None => return self.finish_named(tokenizer, Some(c)),
    };

    if entry.0 != 0 {
        self.name_len = self.name_buf().len();
        self.name_match = Some(entry);
    }
    Progress
}
/// Reports an invalid named character reference.
///
/// The detailed variant of the message includes the current name buffer,
/// so the buffer must still be present when this is called.
fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
    let exact = tokenizer.opts.exact_errors;
    let message = format_if!(exact,
        "Invalid character reference",
        "Invalid character reference &{}", self.name_buf());
    tokenizer.emit_error(message);
}
/// Hands the whole name buffer back to the tokenizer.
///
/// The buffer is taken out of `self`, so `name_buf` / `name_buf_mut` must
/// not be used afterwards. Panics if the buffer is already gone.
fn unconsume_name<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
    let pending = self.name_buf_opt.take().unwrap();
    tokenizer.unconsume(pending);
}
/// Finishes a named reference once the name buffer stops matching.
///
/// `end_char` is the character that ended the match, or `None` at end of
/// input.
///
/// Without any recorded match: an alphanumeric `end_char` switches to
/// `BogusName` to swallow the rest of the run; otherwise the buffer is
/// handed back (after an error if the run ended in `;` and is longer than
/// one byte) and the result is empty.
///
/// With a recorded match: the characters read beyond the match are handed
/// back and the matched code points become the result, unless
/// `addnl_allowed` is set and the match, not ending in `;`, is directly
/// followed by `=` or an alphanumeric character. In that case everything
/// is handed back and the result is empty.
fn finish_named<Sink: TokenSink>(&mut self,
                                 tokenizer: &mut XmlTokenizer<Sink>,
                                 end_char: Option<char>) -> Status {
    let (c1, c2) = match self.name_match {
        Some(pair) => pair,
        None => {
            if let Some(c) = end_char {
                if is_ascii_alnum(c) {
                    self.state = BogusName;
                    return Progress;
                }
                if c == ';' && self.name_buf().len() > 1 {
                    self.emit_name_error(tokenizer);
                }
            }
            self.unconsume_name(tokenizer);
            return self.finish_none();
        }
    };

    let name_len = self.name_len;
    assert!(name_len > 0);

    // Last character of the matched name, and the character (if any) that
    // was read directly after it.
    let (last_matched, next_after) = {
        let buf = self.name_buf();
        let last = buf[name_len-1..].chars().next().unwrap();
        let next = buf[name_len..].chars().next();
        (last, next)
    };

    let extra_char_mode = self.addnl_allowed.is_some();
    let unconsume_all = if last_matched == ';' {
        false
    } else if extra_char_mode && next_after == Some('=') {
        tokenizer.emit_error(Borrowed("Equals sign after character reference in attribute"));
        true
    } else if extra_char_mode && next_after.map_or(false, |c| is_ascii_alnum(c)) {
        true
    } else {
        tokenizer.emit_error(Borrowed("Character reference does not end with semicolon"));
        false
    };

    if unconsume_all {
        self.unconsume_name(tokenizer);
        return self.finish_none();
    }

    // Keep the match; give back only what was read past it.
    tokenizer.unconsume(StrTendril::from_slice(&self.name_buf()[name_len..]));
    self.result = Some(CharRef {
        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
        num_chars: if c2 == 0 { 1 } else { 2 },
    });
    Done
}
/// Handles the `BogusName` state: swallows the rest of an unmatched name.
///
/// Alphanumeric characters keep accumulating in the name buffer. The first
/// other character ends the run: a `;` is reported as an invalid reference.
/// The whole buffer is then handed back and the result is empty.
fn do_bogus_name<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
    let c = unwrap_or_return!(tokenizer.get_char(), Stuck);
    self.name_buf_mut().push_char(c);

    if is_ascii_alnum(c) {
        return Progress;
    }
    if c == ';' {
        self.emit_name_error(tokenizer);
    }

    self.unconsume_name(tokenizer);
    self.finish_none()
}
/// Forces the tokenizer to a result when the input has ended.
///
/// Each state is wrapped up in the way that fits it: text consumed for an
/// incomplete reference is handed back, a numeric reference that already
/// has digits is finished (with an EOF error), and a named reference is
/// finished as if no further character followed.
pub fn end_of_file<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
    while self.result.is_none() {
        match self.state {
            Begin => {
                let _ = self.finish_none();
            }

            Octothorpe => {
                tokenizer.unconsume(StrTendril::from_slice("#"));
                tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                let _ = self.finish_none();
            }

            // No digits at all: nothing to convert, give the prefix back.
            Numeric(_) if !self.seen_digit => {
                let _ = self.unconsume_numeric(tokenizer);
            }

            Numeric(_) | NumericSemicolon => {
                tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                let _ = self.finish_numeric(tokenizer);
            }

            Named => {
                let _ = self.finish_named(tokenizer, None);
            }

            BogusName => {
                self.unconsume_name(tokenizer);
                let _ = self.finish_none();
            }
        }
    }
}
}