html_entities/
lib.rs

1#[macro_use]
2extern crate lazy_static;
3
4use std::char::from_u32;
5use std::collections::BTreeMap;
6use std::iter::Peekable;
7use std::str::Chars;
8
9mod entities;
10
/// Table of named HTML entities plus the size of its longest key.
///
/// Lookups elsewhere in this file use the complete entity text as the key
/// (leading `&`, name, trailing `;`), and append the mapped value to the
/// decoded output.
struct Entities {
  /// Entity text mapped to the text that replaces it.
  entities: BTreeMap<&'static str, &'static str>,
  /// `str::len` of the longest key in `entities`; bounds how many characters
  /// the named-entity parser reads before giving up.
  max_html_length: usize,
}
15
16impl Entities {
17  fn new() -> Self {
18    let entities = entities::entities();
19    let max_html_length = entities.keys().map(|x| x.len()).max().unwrap();
20
21    Entities {
22      entities: entities,
23      max_html_length: max_html_length
24    }
25  }
26}
27
28lazy_static! {
29  static ref ENTITIES: Entities = Entities::new();
30}
31
/// Reason why decoding HTML entities failed.
///
/// The `Line` and `Col` payloads are the position counters kept by
/// `decode_html_entities` (both start at 1) at the moment the problem was
/// detected.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum DecodeError {
  /// An entity was syntactically invalid: too long, not a number where a
  /// number was required, or not a valid Unicode scalar value.
  IllFormedEntity(Line, Col),
  /// A named entity that is not in the entity table; carries the entity text
  /// as it was read from the input.
  UnknownEntity(Line, Col, String),
  /// The input ended in the middle of an entity.
  EOF
}

impl ::std::fmt::Display for DecodeError {
  fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
    match *self {
      DecodeError::IllFormedEntity(line, col) => {
        write!(f, "ill-formed entity at line {}, column {}", line, col)
      },
      DecodeError::UnknownEntity(line, col, ref entity) => {
        write!(f, "unknown entity `{}` at line {}, column {}", entity, line, col)
      },
      DecodeError::EOF => write!(f, "unexpected end of input inside an entity")
    }
  }
}

// Lets callers propagate `DecodeError` through `Box<Error>`-style signatures.
impl ::std::error::Error for DecodeError {}

/// Line counter used in error positions.
type Line = usize;
/// Column counter used in error positions.
type Col = usize;
41
42pub fn decode_html_entities(html: &str) -> Result<String, DecodeError> {
43  let mut chars = html.chars().peekable(); // iterator over the HTML to decode
44  let mut decoded = String::new(); // decoded string
45  let mut line = 1; // current line
46  let mut col = 1; // current column
47
48  while let Some(c) = chars.next() {
49    if c == '&' {
50      let unicode_entity = if let Some(&'#') = chars.peek() {
51        chars.next();
52        try!(parse_entity_numeric(&mut chars, line, &mut col))
53      } else {
54        let entity = try!(parse_entity_name(&mut chars, line, &mut col));
55        let unicode = try!(ENTITIES.entities.get(entity.as_str()).ok_or(DecodeError::UnknownEntity(line, col, entity)));
56        (*unicode).to_owned()
57      };
58      
59      decoded += &unicode_entity;
60    } else {
61      if c == '\n' {
62        line += 1;
63        col = 1;
64      }
65
66      col += 1;
67      decoded.push(c);
68    }
69  }
70
71  Ok(decoded)
72}
73
74fn parse_entity_numeric(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
75  let c = match chars.peek() {
76    Some(&'x') | Some(&'X') => {
77      chars.next();
78      try!(parse_entity_hex(chars, line, col))
79    },
80    _ => try!(parse_entity_dec(chars, line, col))
81  };
82
83  let mut s = String::new();
84  s.push(c);
85  Ok(s)
86}
87
88fn parse_entity_hex(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<char, DecodeError> {
89  let num = try!(parse_number(chars, line, col));
90  let dec = try!(u32::from_str_radix(&num, 16).map_err(|_| DecodeError::IllFormedEntity(line, *col)));
91
92  from_u32(dec).ok_or(DecodeError::IllFormedEntity(line, *col))
93}
94
95fn parse_entity_dec(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<char, DecodeError> {
96  let num = try!(try!(parse_number(chars, line, col)).parse().map_err(|_| DecodeError::IllFormedEntity(line, *col)));
97  from_u32(num).ok_or(DecodeError::IllFormedEntity(line, *col))
98}
99
100fn parse_number(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
101  let mut hex = String::new();
102  let mut l = 0;
103
104  loop {
105    if let Some(c) = chars.next() {
106      *col += 1;
107
108      if c == ';' {
109        break;
110      }
111
112      l += 1;
113
114      // abort on long numbers people would try to make to break our code
115      if l >= 16 {
116        return Err(DecodeError::IllFormedEntity(line, *col));
117      }
118
119      hex.push(c);
120    } else {
121      return Err(DecodeError::EOF)
122    }
123  }
124
125  Ok(hex)
126}
127
128fn parse_entity_name(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
129  let mut entity = String::with_capacity(ENTITIES.max_html_length);
130  let mut l = 0;
131
132  entity.push('&');
133  while l < ENTITIES.max_html_length {
134    match chars.next() {
135      Some(c) => {
136        entity.push(c);
137
138        if c == ';' {
139          break;
140        }
141
142        l += 1;
143        *col += 1;
144      },
145      None => return Err(DecodeError::EOF)
146    }
147  }
148
149  if l == ENTITIES.max_html_length {
150    Err(DecodeError::IllFormedEntity(line, *col))
151  } else {
152    Ok(entity)
153  }
154}