1use std::io::{self, Write, BufRead, Cursor};
2use std::char;
3use self::DecodeState::*;
4use self::DecodeErrKind::*;
5use io_support::{self, write_char, CharsError};
6use entities::*;
7
/// The reason a decoding operation failed.
#[derive(Debug)]
pub enum DecodeErrKind {
    /// A named entity was referenced that is not present in the table of
    /// known entities, or the entity name was empty (`&;`).
    UnknownEntity,

    /// A numeric escape sequence contained an unexpected character or could
    /// not be parsed as a number.
    MalformedNumEscape,

    /// A numeric escape sequence was parsed successfully, but the resulting
    /// number is not a valid `char`.
    InvalidCharacter,

    /// The input ended while an entity was still being read.
    PrematureEnd,

    /// An I/O error was reported while reading the input or writing the output.
    IoError(io::Error),

    /// The input was not valid UTF-8.
    EncodingError,
}
31
32impl PartialEq for DecodeErrKind {
33 fn eq(&self, other: &DecodeErrKind) -> bool {
34 match (self, other) {
35 (&UnknownEntity, &UnknownEntity) => true,
36 (&MalformedNumEscape, &MalformedNumEscape) => true,
37 (&InvalidCharacter, &InvalidCharacter) => true,
38 (&PrematureEnd, &PrematureEnd) => true,
39 (&IoError(_), &IoError(_)) => true,
40 (&EncodingError, &EncodingError) => true,
41 _ => false
42 }
43 }
44}
45
46impl Eq for DecodeErrKind {}
47
48#[derive(Debug, Eq, PartialEq)]
50pub struct DecodeErr {
51 pub position: usize,
53 pub kind: DecodeErrKind
55}
56
/// States of the decoder's character-by-character state machine.
#[derive(PartialEq, Eq)]
enum DecodeState {
    /// Outside of any entity; characters are copied to the output.
    Normal,
    /// An `&` has just been read.
    Entity,
    /// Reading the name of a named entity.
    Named,
    /// `&#` has been read; the next character selects decimal or hexadecimal.
    Numeric,
    /// Reading the digits of a hexadecimal escape.
    Hex,
    /// Reading the digits of a decimal escape.
    Dec,
}
66
/// Evaluates to the `Ok` value of `$parse`. On `Err(kind)` it returns from the
/// enclosing function with a `DecodeErr` built from `kind` and `$pos`.
macro_rules! try_parse {
    ($parse:expr, $pos:expr) => {
        match $parse {
            Ok(value) => value,
            Err(kind) => return Err(DecodeErr { position: $pos, kind: kind }),
        }
    };
}
74
/// Evaluates to the `Ok` value of the I/O result `$io`. On `Err(e)` it returns
/// from the enclosing function with a `DecodeErr` of kind `IoError(e)` at `$pos`.
macro_rules! try_dec_io {
    ($io:expr, $pos:expr) => {
        match $io {
            Ok(value) => value,
            Err(cause) => return Err(DecodeErr { position: $pos, kind: IoError(cause) }),
        }
    };
}
82
83pub fn decode_html_rw<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
95 let mut state: DecodeState = Normal;
96 let mut pos = 0;
97 let mut good_pos = 0;
98 let mut buf = String::with_capacity(8);
99 for c in io_support::chars(reader) {
100 let c = match c {
101 Err(e) => {
102 let kind = match e {
103 CharsError::NotUtf8 => EncodingError,
104 CharsError::Other(io) => IoError(io)
105 };
106 return Err(DecodeErr{ position: pos, kind: kind });
107 }
108 Ok(c) => c
109 };
110 match state {
111 Normal if c == '&' => state = Entity,
112 Normal => try_dec_io!(write_char(writer, c), good_pos),
113 Entity if c == '#' => state = Numeric,
114 Entity if c == ';' => return Err(DecodeErr{ position: good_pos, kind: UnknownEntity }),
115 Entity => {
116 state = Named;
117 buf.push(c);
118 }
119 Named if c == ';' => {
120 state = Normal;
121 let ch = try_parse!(decode_named_entity(&buf), good_pos);
122 try_dec_io!(write_char(writer, ch), good_pos);
123 buf.clear();
124 }
125 Named => buf.push(c),
126 Numeric if is_digit(c) => {
127 state = Dec;
128 buf.push(c);
129 }
130 Numeric if c == 'x' => state = Hex,
131 Dec if c == ';' => {
132 state = Normal;
133 let ch = try_parse!(decode_numeric(&buf, 10), good_pos);
134 try_dec_io!(write_char(writer, ch), good_pos);
135 buf.clear();
136 }
137 Hex if c == ';' => {
138 state = Normal;
139 let ch = try_parse!(decode_numeric(&buf, 16), good_pos);
140 try_dec_io!(write_char(writer, ch), good_pos);
141 buf.clear();
142 }
143 Hex if is_hex_digit(c) => buf.push(c),
144 Dec if is_digit(c) => buf.push(c),
145 Numeric | Hex | Dec => return Err(DecodeErr{ position: good_pos, kind: MalformedNumEscape}),
146 }
147 pos += 1;
148 if state == Normal {
149 good_pos = pos;
150 }
151 }
152 if state != Normal {
153 Err(DecodeErr{ position: good_pos, kind: PrematureEnd})
154 } else {
155 Ok(())
156 }
157}
158
159
160pub fn decode_html_rw_ignoring_errors<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
174 let mut state: DecodeState = Normal;
175 let mut pos = 0;
176 let mut good_pos = 0;
177 let mut buf = String::with_capacity(8);
178 let mut buf_since_good_pos = String::with_capacity(20);
179 for c in io_support::chars(reader) {
180 let c = match c {
181 Err(e) => {
182 let kind = match e {
183 CharsError::NotUtf8 => EncodingError,
184 CharsError::Other(io) => IoError(io)
185 };
186 return Err(DecodeErr{ position: pos, kind: kind });
187 }
188 Ok(c) => c
189 };
190 match state {
191 Normal if c == '&' => { buf_since_good_pos.push(c); state = Entity},
192 Normal => try_dec_io!(write_char(writer, c), good_pos),
193 Entity if c == '#' => { buf_since_good_pos.push(c); state = Numeric},
194 Entity if c == ';' => {
195 buf_since_good_pos.push(c);
196 try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
197 buf_since_good_pos.clear();
198 state = Normal
199 }
200 Entity => {
201 state = Named;
202 buf.push(c);
203 buf_since_good_pos.push(c);
204 }
205 Named if c == ';' => {
206 state = Normal;
207 match decode_named_entity(&buf) {
208 Ok(ch) => {
209 try_dec_io!(write_char(writer, ch), good_pos);
210 buf.clear();
211 buf_since_good_pos.clear();
212 },
213 Err(_) => {
214 buf_since_good_pos.push(c);
215 try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
216 buf_since_good_pos.clear();
217 buf.clear();
218 }
219 }
220 }
221 Named => {
222 buf.push(c);
223 buf_since_good_pos.push(c);
224 },
225 Numeric if is_digit(c) => {
226 state = Dec;
227 buf.push(c);
228 buf_since_good_pos.push(c);
229 }
230 Numeric if c == 'x' => {
231 buf_since_good_pos.push(c);
232 state = Hex
233 },
234 Dec if c == ';' => {
235 state = Normal;
236 match decode_numeric(&buf, 10) {
237 Ok(ch) => {
238 try_dec_io!(write_char(writer, ch), good_pos);
239 buf.clear();
240 buf_since_good_pos.clear();
241 },
242 Err(_) => {
243 buf_since_good_pos.push(c);
244 try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
245 buf_since_good_pos.clear();
246 buf.clear();
247 }
248 }
249 }
250 Hex if c == ';' => {
251 state = Normal;
252 match decode_numeric(&buf, 16) {
253 Ok(ch) => {
254 try_dec_io!(write_char(writer, ch), good_pos);
255 buf.clear();
256 buf_since_good_pos.clear();
257 },
258 Err(_) => {
259 buf_since_good_pos.push(c);
260 try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
261 buf_since_good_pos.clear();
262 buf.clear();
263 }
264 }
265 }
266 Hex if is_hex_digit(c) => { buf.push(c); buf_since_good_pos.push(c) },
267 Dec if is_digit(c) => { buf.push(c); buf_since_good_pos.push(c) },
268 Numeric | Hex | Dec => {
269 buf_since_good_pos.push(c);
270 try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
271 buf_since_good_pos.clear();
272 buf.clear();
273 state = Normal
274 }
275 }
276 pos += 1;
277 if state == Normal {
278 good_pos = pos;
279 }
280 }
281
282 if state != Normal {
283 try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
285 }
286
287 Ok(())
288}
289
290
291pub fn decode_html(s: &str) -> Result<String, DecodeErr> {
307 let mut writer = Vec::with_capacity(s.len());
308 let bytes = s.as_bytes();
309 let mut reader = Cursor::new(bytes);
310 let res = decode_html_rw(&mut reader, &mut writer);
311 match res {
312 Ok(_) => Ok(String::from_utf8(writer).unwrap()),
313 Err(err) => Err(err)
314 }
315}
316
317pub fn decode_html_ignoring_errors(s: &str) -> Result<String, DecodeErr> {
331 let mut writer = Vec::with_capacity(s.len());
332 let bytes = s.as_bytes();
333 let mut reader = Cursor::new(bytes);
334 let res = decode_html_rw_ignoring_errors(&mut reader, &mut writer);
335 match res {
336 Ok(_) => Ok(String::from_utf8(writer).unwrap()),
337 Err(err) => Err(err)
338 }
339}
340
/// Returns `true` if `c` is one of the ASCII decimal digits `0` to `9`.
fn is_digit(c: char) -> bool {
    c.is_digit(10)
}
342
/// Returns `true` if `c` is an ASCII hexadecimal digit: `0`-`9`, `a`-`f` or
/// `A`-`F`.
fn is_hex_digit(c: char) -> bool {
    c.is_digit(16)
}
346
347fn decode_named_entity(entity: &str) -> Result<char, DecodeErrKind> {
348 match NAMED_ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) {
349 Err(..) => Err(UnknownEntity),
350 Ok(idx) => {
351 let (_, c) = NAMED_ENTITIES[idx];
352 Ok(c)
353 }
354 }
355}
356
357fn decode_numeric(esc: &str, radix: u32) -> Result<char, DecodeErrKind> {
358 match u32::from_str_radix(esc, radix) {
359 Ok(n) => match char::from_u32(n) {
360 Some(c) => Ok(c),
361 None => Err(InvalidCharacter)
362 },
363 Err(..) => Err(MalformedNumEscape)
364 }
365}
366