htmlescape/
decode.rs

1use std::io::{self, Write, BufRead, Cursor};
2use std::char;
3use self::DecodeState::*;
4use self::DecodeErrKind::*;
5use io_support::{self, write_char, CharsError};
6use entities::*;
7
/// The reason a decoding operation failed.
#[derive(Debug)]
pub enum DecodeErrKind {
    /// The input referenced a named entity that does not exist.
    /// Example: `&thisentitydoesnotexist`
    UnknownEntity,

    /// A numerical escape sequence (`&#` or `&#x`) contained an invalid character.
    /// Examples: `&#32a`, `&#xfoo`
    MalformedNumEscape,

    /// A numerical escape sequence (`&#` or `&#x`) resolved to a value that is
    /// not a valid unicode code point.
    /// Example: `&#xffffff`
    InvalidCharacter,

    /// The input ended prematurely (ie. inside an unterminated named entity sequence).
    PrematureEnd,

    /// An IO error occurred; the underlying error is carried along.
    IoError(io::Error),

    /// The supplied reader produced invalid UTF-8.
    EncodingError,
}
31
32impl PartialEq for DecodeErrKind {
33    fn eq(&self, other: &DecodeErrKind) -> bool {
34        match (self, other) {
35            (&UnknownEntity, &UnknownEntity) => true,
36            (&MalformedNumEscape, &MalformedNumEscape) => true,
37            (&InvalidCharacter, &InvalidCharacter) => true,
38            (&PrematureEnd, &PrematureEnd) => true,
39            (&IoError(_), &IoError(_)) => true,
40            (&EncodingError, &EncodingError) => true,
41            _ => false
42        }
43    }
44}
45
46impl Eq for DecodeErrKind {}
47
48/// Error from decoding a entity-encoded string.
49#[derive(Debug, Eq, PartialEq)]
50pub struct DecodeErr {
51    /// Number of characters read from the input before encountering an error
52    pub position: usize,
53    /// Type of error
54    pub kind: DecodeErrKind
55}
56
/// States of the entity-decoding state machine.
#[derive(Eq, PartialEq)]
enum DecodeState {
    /// Outside of any entity; characters are copied through.
    Normal,
    /// Just read `&`; the kind of entity is not known yet.
    Entity,
    /// Reading the name of a named entity (`&name;`).
    Named,
    /// Just read `&#`; waiting for a decimal digit or `x`.
    Numeric,
    /// Reading the digits of a hexadecimal escape (`&#x...;`).
    Hex,
    /// Reading the digits of a decimal escape (`&#...;`).
    Dec,
}
66
// Evaluates to the `Ok` value of `$parse`. On `Err(kind)` it returns from the
// enclosing function with a `DecodeErr` of that kind located at `$pos`.
macro_rules! try_parse {
    ($parse:expr, $pos:expr) => {
        match $parse {
            Ok(value) => value,
            Err(kind) => return Err(DecodeErr { position: $pos, kind: kind }),
        }
    };
}
74
// Evaluates to the `Ok` value of the I/O operation `$io`. On `Err(e)` it
// returns from the enclosing function with a `DecodeErr` whose kind is
// `IoError(e)` and whose position is `$pos`.
macro_rules! try_dec_io {
    ($io:expr, $pos:expr) => {
        match $io {
            Ok(value) => value,
            Err(cause) => return Err(DecodeErr { position: $pos, kind: IoError(cause) }),
        }
    };
}
82
83/// Decodes an entity-encoded string from a reader to a writer.
84///
85/// Similar to `decode_html`, except reading from a reader rather than a string, and
86/// writing to a writer rather than returning a `String`.
87///
88/// # Arguments
89/// - `reader` - UTF-8 encoded data is read from here.
90/// - `writer` - UTF8- decoded data is written to here.
91///
92/// # Errors
93/// Errors can be caused by IO errors, `reader` producing invalid UTF-8, or by syntax errors.
94pub fn decode_html_rw<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
95    let mut state: DecodeState = Normal;
96    let mut pos = 0;
97    let mut good_pos = 0;
98    let mut buf = String::with_capacity(8);
99    for c in io_support::chars(reader) {
100        let c = match c {
101            Err(e) => {
102                let kind = match e {
103                    CharsError::NotUtf8   => EncodingError,
104                    CharsError::Other(io) => IoError(io)
105                };
106                return Err(DecodeErr{ position: pos, kind: kind });
107            }
108            Ok(c) => c
109        };
110        match state {
111            Normal if c == '&' => state = Entity,
112            Normal => try_dec_io!(write_char(writer, c), good_pos),
113            Entity if c == '#' => state = Numeric,
114            Entity if c == ';' => return Err(DecodeErr{ position: good_pos, kind: UnknownEntity }),
115            Entity => {
116                state = Named;
117                buf.push(c);
118            }
119            Named if c == ';' => {
120                state = Normal;
121                let ch = try_parse!(decode_named_entity(&buf), good_pos);
122                try_dec_io!(write_char(writer, ch), good_pos);
123                buf.clear();
124            }
125            Named => buf.push(c),
126            Numeric if is_digit(c) => {
127                state = Dec;
128                buf.push(c);
129            }
130            Numeric if c == 'x' => state = Hex,
131            Dec if c == ';' => {
132                state = Normal;
133                let ch = try_parse!(decode_numeric(&buf, 10), good_pos);
134                try_dec_io!(write_char(writer, ch), good_pos);
135                buf.clear();
136            }
137            Hex if c == ';' => {
138                state = Normal;
139                let ch = try_parse!(decode_numeric(&buf, 16), good_pos);
140                try_dec_io!(write_char(writer, ch), good_pos);
141                buf.clear();
142            }
143            Hex if is_hex_digit(c) => buf.push(c),
144            Dec if is_digit(c) => buf.push(c),
145            Numeric | Hex | Dec => return Err(DecodeErr{ position: good_pos, kind: MalformedNumEscape}),
146        }
147        pos += 1;
148        if state == Normal {
149            good_pos = pos;
150        }
151    }
152    if state != Normal {
153        Err(DecodeErr{ position: good_pos, kind: PrematureEnd})
154    } else {
155        Ok(())
156    }
157}
158
159
160/// Decodes an entity-encoded string from a reader to a writer ignoring errors.
161/// Properly written and recognised entities will be decoded,
162/// any partial or unknown ones will be left intact.
163///
164/// Similar to `decode_html`, except reading from a reader rather than a string, and
165/// writing to a writer rather than returning a `String`.
166///
167/// # Arguments
168/// - `reader` - UTF-8 encoded data is read from here.
169/// - `writer` - UTF8- decoded data is written to here.
170///
171/// # Errors
172/// Errors can be caused by IO errors, `reader` producing invalid UTF-8.
173pub fn decode_html_rw_ignoring_errors<R: BufRead, W: Write>(reader: R, writer: &mut W) -> Result<(), DecodeErr> {
174    let mut state: DecodeState = Normal;
175    let mut pos = 0;
176    let mut good_pos = 0;
177    let mut buf = String::with_capacity(8);
178    let mut buf_since_good_pos = String::with_capacity(20);
179    for c in io_support::chars(reader) {
180        let c = match c {
181            Err(e) => {
182                let kind = match e {
183                    CharsError::NotUtf8   => EncodingError,
184                    CharsError::Other(io) => IoError(io)
185                };
186                return Err(DecodeErr{ position: pos, kind: kind });
187            }
188            Ok(c) => c
189        };
190        match state {
191            Normal if c == '&' => { buf_since_good_pos.push(c); state = Entity},
192            Normal => try_dec_io!(write_char(writer, c), good_pos),
193            Entity if c == '#' => { buf_since_good_pos.push(c); state = Numeric},
194            Entity if c == ';' => {
195                buf_since_good_pos.push(c);
196                try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
197                buf_since_good_pos.clear();
198                state = Normal
199            }
200            Entity => {
201                state = Named;
202                buf.push(c);
203                buf_since_good_pos.push(c);
204            }
205            Named if c == ';' => {
206                state = Normal;
207                match  decode_named_entity(&buf) {
208                    Ok(ch) => {
209                        try_dec_io!(write_char(writer, ch), good_pos);
210                        buf.clear();
211                        buf_since_good_pos.clear();
212                    },
213                    Err(_) => {
214                        buf_since_good_pos.push(c);
215                        try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
216                        buf_since_good_pos.clear();
217                        buf.clear();
218                    }
219                }
220            }
221            Named => {
222                buf.push(c);
223                buf_since_good_pos.push(c);
224            },
225            Numeric if is_digit(c) => {
226                state = Dec;
227                buf.push(c);
228                buf_since_good_pos.push(c);
229            }
230            Numeric if c == 'x' => {
231                buf_since_good_pos.push(c);
232                state = Hex
233            },
234            Dec if c == ';' => {
235                state = Normal;
236                match  decode_numeric(&buf, 10) {
237                    Ok(ch) => {
238                        try_dec_io!(write_char(writer, ch), good_pos);
239                        buf.clear();
240                        buf_since_good_pos.clear();
241                    },
242                    Err(_) => {
243                        buf_since_good_pos.push(c);
244                        try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
245                        buf_since_good_pos.clear();
246                        buf.clear();
247                    }
248                }
249            }
250            Hex if c == ';' => {
251                state = Normal;
252                match  decode_numeric(&buf, 16) {
253                    Ok(ch) => {
254                        try_dec_io!(write_char(writer, ch), good_pos);
255                        buf.clear();
256                        buf_since_good_pos.clear();
257                    },
258                    Err(_) => {
259                        buf_since_good_pos.push(c);
260                        try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
261                        buf_since_good_pos.clear();
262                        buf.clear();
263                    }
264                }
265            }
266            Hex if is_hex_digit(c) => { buf.push(c); buf_since_good_pos.push(c) },
267            Dec if is_digit(c) => { buf.push(c); buf_since_good_pos.push(c) },
268            Numeric | Hex | Dec => {
269                buf_since_good_pos.push(c);
270                try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos);
271                buf_since_good_pos.clear();
272                buf.clear();
273                state = Normal
274            }
275        }
276        pos += 1;
277        if state == Normal {
278            good_pos = pos;
279        }
280    }
281
282    if state != Normal {
283        //let slice = reader;
284        try_dec_io!(writer.write(buf_since_good_pos.as_bytes()), good_pos); 
285    }
286    
287    Ok(())
288}
289
290
291/// Decodes an entity-encoded string.
292///
293/// Decodes an entity encoded string, replacing HTML entities (`&amp;`, `&#20;` ...) with the
294/// the corresponding character. Case matters for named entities, ie. `&Amp;` is invalid.
295/// Case does not matter for hex entities, so `&#x2E;` and `&#x2e;` are treated the same.
296///
297/// # Arguments
298/// - `s` - Entity-encoded string to decode.
299///
300/// # Failure
301/// The function will fail if input string contains invalid named entities (eg. `&nosuchentity;`),
302/// invalid hex entities (eg. `&#xRT;`), invalid decimal entities (eg. `&#-1;), unclosed entities
303/// (`s == "&amp hej och hå"`) or otherwise malformed entities.
304///
305/// This function will never return errors with `kind` set to `IoError` or `EncodingError`.
306pub fn decode_html(s: &str) -> Result<String, DecodeErr> {
307    let mut writer = Vec::with_capacity(s.len());
308    let bytes = s.as_bytes();
309    let mut reader = Cursor::new(bytes);
310    let res = decode_html_rw(&mut reader, &mut writer);
311    match res {
312        Ok(_) => Ok(String::from_utf8(writer).unwrap()),
313        Err(err) => Err(err)
314    }
315}
316
317/// Decodes an entity-encoded string.
318///
319/// Decodes an entity encoded string, replacing HTML entities (`&amp;`, `&#20;` ...) with the
320/// the corresponding character. Case matters for named entities, ie. `&Amp;` is invalid.
321/// Case does not matter for hex entities, so `&#x2E;` and `&#x2e;` are treated the same.
322///
323/// # Arguments
324/// - `s` - Entity-encoded string to decode.
325///
326/// # Failure
327/// Any invalid, unrecognised or malformed entities will be ignored and left intact.
328///
329/// This function will never return errors with `kind` set to `IoError` or `EncodingError`.
330pub fn decode_html_ignoring_errors(s: &str) -> Result<String, DecodeErr> {
331    let mut writer = Vec::with_capacity(s.len());
332    let bytes = s.as_bytes();
333    let mut reader = Cursor::new(bytes);
334    let res = decode_html_rw_ignoring_errors(&mut reader, &mut writer);
335    match res {
336        Ok(_) => Ok(String::from_utf8(writer).unwrap()),
337        Err(err) => Err(err)
338    }
339}
340
/// Returns `true` for the ASCII decimal digits `0` through `9` only.
fn is_digit(c: char) -> bool {
    c.is_digit(10)
}
342
/// Returns `true` for the ASCII hexadecimal digits `0`-`9`, `a`-`f` and `A`-`F` only.
fn is_hex_digit(c: char) -> bool {
    c.is_digit(16)
}
346
347fn decode_named_entity(entity: &str) -> Result<char, DecodeErrKind> {
348    match NAMED_ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) {
349        Err(..) => Err(UnknownEntity),
350        Ok(idx) => {
351            let (_, c) = NAMED_ENTITIES[idx];
352            Ok(c)
353        }
354    }
355}
356
357fn decode_numeric(esc: &str, radix: u32) -> Result<char, DecodeErrKind> {
358    match u32::from_str_radix(esc, radix) {
359        Ok(n) => match char::from_u32(n) {
360            Some(c) => Ok(c),
361            None => Err(InvalidCharacter)
362        },
363        Err(..) => Err(MalformedNumEscape)
364    }
365}
366