1#[macro_use]
2extern crate lazy_static;
3
4use std::char::from_u32;
5use std::collections::BTreeMap;
6use std::iter::Peekable;
7use std::str::Chars;
8
9mod entities;
10
/// Table of named HTML entities plus a bound on how long an entity can be.
struct Entities {
    /// Maps an entity exactly as it is looked up by the decoder (leading `&`
    /// and trailing `;` included) to its replacement text.
    entities: BTreeMap<&'static str, &'static str>,
    /// Byte length of the longest key stored in `entities`.
    max_html_length: usize,
}
15
16impl Entities {
17 fn new() -> Self {
18 let entities = entities::entities();
19 let max_html_length = entities.keys().map(|x| x.len()).max().unwrap();
20
21 Entities {
22 entities: entities,
23 max_html_length: max_html_length
24 }
25 }
26}
27
28lazy_static! {
29 static ref ENTITIES: Entities = Entities::new();
30}
31
/// Line number within the decoded input; the decoder starts counting at 1.
type Line = usize;
/// Column position within the current line, as tracked by the decoder.
type Col = usize;

/// Reasons why [`decode_html_entities`] can fail.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum DecodeError {
    /// An entity was syntactically invalid: too long, not a number where a
    /// number was required, or not a valid Unicode scalar value.
    IllFormedEntity(Line, Col),
    /// A well-formed named entity that is not present in the entity table.
    /// The string is the entity as written, including `&` and `;`.
    UnknownEntity(Line, Col, String),
    /// The input ended before the entity's terminating `;` was found.
    EOF,
}

impl ::std::fmt::Display for DecodeError {
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
        match *self {
            DecodeError::IllFormedEntity(line, col) => {
                write!(f, "ill-formed entity at line {}, column {}", line, col)
            }
            DecodeError::UnknownEntity(line, col, ref name) => {
                write!(f, "unknown entity `{}` at line {}, column {}", name, line, col)
            }
            DecodeError::EOF => write!(f, "unexpected end of input inside an entity"),
        }
    }
}

impl ::std::error::Error for DecodeError {}
41
42pub fn decode_html_entities(html: &str) -> Result<String, DecodeError> {
43 let mut chars = html.chars().peekable(); let mut decoded = String::new(); let mut line = 1; let mut col = 1; while let Some(c) = chars.next() {
49 if c == '&' {
50 let unicode_entity = if let Some(&'#') = chars.peek() {
51 chars.next();
52 try!(parse_entity_numeric(&mut chars, line, &mut col))
53 } else {
54 let entity = try!(parse_entity_name(&mut chars, line, &mut col));
55 let unicode = try!(ENTITIES.entities.get(entity.as_str()).ok_or(DecodeError::UnknownEntity(line, col, entity)));
56 (*unicode).to_owned()
57 };
58
59 decoded += &unicode_entity;
60 } else {
61 if c == '\n' {
62 line += 1;
63 col = 1;
64 }
65
66 col += 1;
67 decoded.push(c);
68 }
69 }
70
71 Ok(decoded)
72}
73
74fn parse_entity_numeric(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
75 let c = match chars.peek() {
76 Some(&'x') | Some(&'X') => {
77 chars.next();
78 try!(parse_entity_hex(chars, line, col))
79 },
80 _ => try!(parse_entity_dec(chars, line, col))
81 };
82
83 let mut s = String::new();
84 s.push(c);
85 Ok(s)
86}
87
88fn parse_entity_hex(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<char, DecodeError> {
89 let num = try!(parse_number(chars, line, col));
90 let dec = try!(u32::from_str_radix(&num, 16).map_err(|_| DecodeError::IllFormedEntity(line, *col)));
91
92 from_u32(dec).ok_or(DecodeError::IllFormedEntity(line, *col))
93}
94
95fn parse_entity_dec(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<char, DecodeError> {
96 let num = try!(try!(parse_number(chars, line, col)).parse().map_err(|_| DecodeError::IllFormedEntity(line, *col)));
97 from_u32(num).ok_or(DecodeError::IllFormedEntity(line, *col))
98}
99
100fn parse_number(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
101 let mut hex = String::new();
102 let mut l = 0;
103
104 loop {
105 if let Some(c) = chars.next() {
106 *col += 1;
107
108 if c == ';' {
109 break;
110 }
111
112 l += 1;
113
114 if l >= 16 {
116 return Err(DecodeError::IllFormedEntity(line, *col));
117 }
118
119 hex.push(c);
120 } else {
121 return Err(DecodeError::EOF)
122 }
123 }
124
125 Ok(hex)
126}
127
128fn parse_entity_name(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
129 let mut entity = String::with_capacity(ENTITIES.max_html_length);
130 let mut l = 0;
131
132 entity.push('&');
133 while l < ENTITIES.max_html_length {
134 match chars.next() {
135 Some(c) => {
136 entity.push(c);
137
138 if c == ';' {
139 break;
140 }
141
142 l += 1;
143 *col += 1;
144 },
145 None => return Err(DecodeError::EOF)
146 }
147 }
148
149 if l == ENTITIES.max_html_length {
150 Err(DecodeError::IllFormedEntity(line, *col))
151 } else {
152 Ok(entity)
153 }
154}