use std::io::{self, Write, BufRead, Cursor};
use std::char;
use self::DecodeState::*;
use self::DecodeErrKind::*;
use io_support::{self, write_char, CharsError};
use entities::*;
/// The reason an HTML-entity decoding operation failed.
#[derive(Debug)]
pub enum DecodeErrKind {
    /// A named entity was not found in the entity table; also reported for
    /// the empty entity `&;`.
    UnknownEntity,

    /// A numeric escape was syntactically invalid, or its digits could not be
    /// parsed into a `u32`.
    MalformedNumEscape,

    /// A numeric escape parsed to a number that is not a valid `char`.
    InvalidCharacter,

    /// The input ended while an entity was still being read.
    PrematureEnd,

    /// Reading the input or writing the output failed with an I/O error.
    IoError(io::Error),

    /// The input was not valid UTF-8.
    EncodingError,
}
impl PartialEq for DecodeErrKind {
fn eq(&self, other: &DecodeErrKind) -> bool {
match (self, other) {
(&UnknownEntity, &UnknownEntity) => true,
(&MalformedNumEscape, &MalformedNumEscape) => true,
(&InvalidCharacter, &InvalidCharacter) => true,
(&PrematureEnd, &PrematureEnd) => true,
(&IoError(_), &IoError(_)) => true,
(&EncodingError, &EncodingError) => true,
_ => false
}
}
}
impl Eq for DecodeErrKind {}
/// An error produced while decoding HTML entities.
#[derive(Debug, Eq, PartialEq)]
pub struct DecodeErr {
    /// Position, counted in characters from the start of the input, at which
    /// the error was detected. For entity-related errors this is the position
    /// of the `&` that started the offending entity.
    pub position: usize,

    /// What went wrong.
    pub kind: DecodeErrKind,
}
/// States of the entity-decoding state machine.
#[derive(PartialEq, Eq)]
enum DecodeState {
    /// Outside any entity; characters are copied through.
    Normal,
    /// Just after `&`; the kind of entity is not yet known.
    Entity,
    /// Inside a named entity such as `&amp;`.
    Named,
    /// Just after `&#`; decimal or hexadecimal is not yet known.
    Numeric,
    /// Inside the digits of a hexadecimal escape.
    Hex,
    /// Inside the digits of a decimal escape.
    Dec,
}
/// Unwraps the `Ok` value of a parse result, or returns early from the
/// enclosing function with a `DecodeErr` built from the error kind and the
/// given position.
macro_rules! try_parse {
    ($outcome:expr, $at:expr) => {
        match $outcome {
            Ok(value) => value,
            Err(reason) => {
                return Err(DecodeErr {
                    position: $at,
                    kind: reason,
                })
            }
        }
    };
}
/// Unwraps the `Ok` value of an I/O result, or returns early from the
/// enclosing function with a `DecodeErr` whose kind is `IoError` wrapping the
/// underlying error, at the given position.
macro_rules! try_dec_io {
    ($outcome:expr, $at:expr) => {
        match $outcome {
            Ok(value) => value,
            Err(cause) => {
                return Err(DecodeErr {
                    position: $at,
                    kind: IoError(cause),
                })
            }
        }
    };
}
/// Decodes HTML entities in a UTF-8 stream, writing the decoded text to
/// `writer`.
///
/// Three entity forms are recognised, each of which must be terminated by `;`:
///
/// - named entities, e.g. `&amp;`
/// - decimal escapes, e.g. `&#65;`
/// - hexadecimal escapes, e.g. `&#x41;` or `&#X41;`
///
/// Every character outside an entity is written to `writer` unchanged.
///
/// # Arguments
/// - `reader`: source of UTF-8 encoded input.
/// - `writer`: destination for the decoded output.
///
/// # Errors
/// Decoding stops at the first problem and returns a `DecodeErr`:
///
/// - `UnknownEntity`: the entity name is empty (`&;`) or not in the table.
/// - `MalformedNumEscape`: a numeric escape has no digits, contains a
///   non-digit character, or its value could not be parsed.
/// - `InvalidCharacter`: a numeric escape does not denote a valid `char`.
/// - `PrematureEnd`: the input ended in the middle of an entity.
/// - `EncodingError`: the input is not valid UTF-8.
/// - `IoError`: reading from `reader` or writing to `writer` failed.
///
/// `DecodeErr::position` is counted in characters, not bytes. For errors
/// inside an entity it is the position of the `&` that started the entity.
/// Output written before the error is not rolled back.
pub fn decode_html_rw<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
    let mut state: DecodeState = Normal;
    // Number of characters consumed so far.
    let mut pos = 0;
    // Value of `pos` the last time the machine was in the `Normal` state;
    // while inside an entity this is the position of its leading `&`.
    let mut good_pos = 0;
    // Accumulates the entity name or the digits of a numeric escape.
    let mut buf = String::with_capacity(8);

    for c in io_support::chars(reader) {
        let c = match c {
            Ok(c) => c,
            Err(e) => {
                let kind = match e {
                    CharsError::NotUtf8 => EncodingError,
                    CharsError::Other(err) => IoError(err),
                };
                return Err(DecodeErr { position: pos, kind: kind });
            }
        };

        // Arm order matters: guarded arms must precede the catch-all arm for
        // the same state.
        match state {
            Normal if c == '&' => state = Entity,
            Normal => try_dec_io!(write_char(writer, c), good_pos),

            Entity if c == '#' => state = Numeric,
            // `&;` has no name to look up.
            Entity if c == ';' => {
                return Err(DecodeErr { position: good_pos, kind: UnknownEntity })
            }
            Entity => {
                state = Named;
                buf.push(c);
            }

            Named if c == ';' => {
                state = Normal;
                let ch = try_parse!(decode_named_entity(&buf), good_pos);
                try_dec_io!(write_char(writer, ch), good_pos);
                buf.clear();
            }
            Named => buf.push(c),

            Numeric if is_digit(c) => {
                state = Dec;
                buf.push(c);
            }
            // HTML allows the hexadecimal marker in either case: `&#x` or `&#X`.
            Numeric if c == 'x' || c == 'X' => state = Hex,

            Dec if c == ';' => {
                state = Normal;
                let ch = try_parse!(decode_numeric(&buf, 10), good_pos);
                try_dec_io!(write_char(writer, ch), good_pos);
                buf.clear();
            }
            Hex if c == ';' => {
                state = Normal;
                // An empty `buf` (`&#x;`) is rejected by `decode_numeric`.
                let ch = try_parse!(decode_numeric(&buf, 16), good_pos);
                try_dec_io!(write_char(writer, ch), good_pos);
                buf.clear();
            }
            Hex if is_hex_digit(c) => buf.push(c),
            Dec if is_digit(c) => buf.push(c),

            // Anything else inside a numeric escape is a syntax error.
            Numeric | Hex | Dec => {
                return Err(DecodeErr { position: good_pos, kind: MalformedNumEscape })
            }
        }

        pos += 1;
        if state == Normal {
            good_pos = pos;
        }
    }

    // Input that stops inside an entity is an error.
    if state != Normal {
        Err(DecodeErr { position: good_pos, kind: PrematureEnd })
    } else {
        Ok(())
    }
}
/// Decodes an entity-encoded string in memory.
///
/// This is a convenience wrapper that runs `decode_html_rw` over the bytes of
/// `s` and collects the output into a `String`.
///
/// # Errors
/// Returns the `DecodeErr` reported by `decode_html_rw`, if any.
///
/// # Panics
/// Panics if the bytes produced by `decode_html_rw` are not valid UTF-8; this
/// is not expected, since the output is written one `char` at a time.
pub fn decode_html(s: &str) -> Result<String, DecodeErr> {
    let mut decoded = Vec::with_capacity(s.len());
    let mut input = Cursor::new(s.as_bytes());
    decode_html_rw(&mut input, &mut decoded)?;
    Ok(String::from_utf8(decoded).unwrap())
}
/// Returns `true` if `c` is an ASCII decimal digit (`0`-`9`).
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}

/// Returns `true` if `c` is an ASCII hexadecimal digit (`0`-`9`, `a`-`f` or
/// `A`-`F`).
fn is_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
/// Looks up the character for a named entity.
///
/// `entity` is the name without the surrounding `&` and `;`. The lookup is a
/// binary search over `NAMED_ENTITIES`, so that table must be sorted by name.
///
/// Returns `Err(UnknownEntity)` when the name is not in the table.
fn decode_named_entity(entity: &str) -> Result<char, DecodeErrKind> {
    NAMED_ENTITIES
        .binary_search_by(|&(name, _)| name.cmp(entity))
        .map(|found| NAMED_ENTITIES[found].1)
        .map_err(|_| UnknownEntity)
}
/// Converts the digits of a numeric escape into a character.
///
/// `esc` holds only the digits (no `&#`, `x` or `;`) and `radix` is the base
/// they are written in.
///
/// Returns `Err(MalformedNumEscape)` when the digits cannot be parsed as a
/// `u32` (for example when empty or too large), and `Err(InvalidCharacter)`
/// when the parsed number is not a valid `char`.
fn decode_numeric(esc: &str, radix: u32) -> Result<char, DecodeErrKind> {
    u32::from_str_radix(esc, radix)
        .map_err(|_| MalformedNumEscape)
        .and_then(|code_point| char::from_u32(code_point).ok_or(InvalidCharacter))
}