1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#[macro_use]
extern crate lazy_static;

use std::char::from_u32;
use std::collections::BTreeMap;
use std::iter::Peekable;
use std::str::Chars;

mod entities;

/// Named-entity lookup table bundled with the length of its longest key.
struct Entities {
  /// Maps an entity to its replacement text. `decode_html_entities` looks entries up
  /// using the text built by `parse_entity_name`, which starts with `&` and ends with `;`.
  entities: BTreeMap<&'static str, &'static str>,
  /// Largest `len()` (in bytes) among the keys of `entities`; `parse_entity_name` uses it
  /// as the cap on how many characters it will read while looking for the closing `;`.
  max_html_length: usize
}

impl Entities {
  /// Loads the entity table from the `entities` module and records the length of its
  /// longest key.
  ///
  /// # Panics
  ///
  /// Panics if the table returned by `entities::entities()` is empty, because there is
  /// then no longest key to measure.
  fn new() -> Self {
    let table = entities::entities();

    // Pick the longest key first, then measure it; the table must not be empty.
    let longest = table
      .keys()
      .max_by_key(|name| name.len())
      .map(|name| name.len())
      .unwrap();

    Entities {
      max_html_length: longest,
      entities: table
    }
  }
}

// Shared entity table used by `decode_html_entities` and `parse_entity_name`.
// `lazy_static!` defers the call to `Entities::new()` until the static is first accessed.
lazy_static! {
  static ref ENTITIES: Entities = Entities::new();
}

/// Reasons `decode_html_entities` can fail.
#[derive(Clone, Debug)]
pub enum DecodeError {
  /// An entity could not be parsed: its numeric payload was not a valid number or not a
  /// valid Unicode scalar value, or no `;` was found within the allowed length.
  /// Carries the line and column counters at the point of failure.
  IllFormedEntity(Line, Col),
  /// A named entity was read in full but is not present in the entity table.
  /// Carries the line and column counters plus the entity text that was looked up.
  UnknownEntity(Line, Col, String),
  /// The input ended while an entity was still being read (no terminating `;`).
  EOF
}

/// Line counter reported in `DecodeError`; `decode_html_entities` starts it at 1.
type Line = usize;
/// Column counter reported in `DecodeError`; `decode_html_entities` starts it at 1.
type Col = usize;

pub fn decode_html_entities(html: &str) -> Result<String, DecodeError> {
  let mut chars = html.chars().peekable(); // iterator over the HTML to decode
  let mut decoded = String::new(); // decoded string
  let mut line = 1; // current line
  let mut col = 1; // current column

  while let Some(c) = chars.next() {
    if c == '&' {
      let unicode_entity = if let Some(&'#') = chars.peek() {
        chars.next();
        try!(parse_entity_numeric(&mut chars, line, &mut col))
      } else {
        let entity = try!(parse_entity_name(&mut chars, line, &mut col));
        let unicode = try!(ENTITIES.entities.get(entity.as_str()).ok_or(DecodeError::UnknownEntity(line, col, entity)));
        (*unicode).to_owned()
      };
      
      decoded += &unicode_entity;
    } else {
      if c == '\n' {
        line += 1;
        col = 1;
      }

      col += 1;
      decoded.push(c);
    }
  }

  Ok(decoded)
}

fn parse_entity_numeric(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
  let c = match chars.peek() {
    Some(&'x') | Some(&'X') => {
      chars.next();
      try!(parse_entity_hex(chars, line, col))
    },
    _ => try!(parse_entity_dec(chars, line, col))
  };

  let mut s = String::new();
  s.push(c);
  Ok(s)
}

fn parse_entity_hex(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<char, DecodeError> {
  let num = try!(parse_number(chars, line, col));
  let dec = try!(u32::from_str_radix(&num, 16).map_err(|_| DecodeError::IllFormedEntity(line, *col)));

  from_u32(dec).ok_or(DecodeError::IllFormedEntity(line, *col))
}

fn parse_entity_dec(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<char, DecodeError> {
  let num = try!(try!(parse_number(chars, line, col)).parse().map_err(|_| DecodeError::IllFormedEntity(line, *col)));
  from_u32(num).ok_or(DecodeError::IllFormedEntity(line, *col))
}

/// Collects the characters of a numeric reference's payload up to the closing `;`.
///
/// The `;` is consumed but not included in the returned string. No check is made that the
/// collected characters are digits; callers convert and validate the text themselves.
/// `col` is incremented once for every character taken from `chars`, the `;` included.
///
/// # Errors
///
/// * `DecodeError::IllFormedEntity` as soon as a sixteenth payload character is read.
/// * `DecodeError::EOF` when the input ends before a `;` is seen.
fn parse_number(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
  let mut payload = String::new();
  // How many more payload characters may be accepted; the cap keeps hostile,
  // endlessly long "numbers" from being buffered.
  let mut budget = 15;

  while let Some(c) = chars.next() {
    *col += 1;

    match c {
      ';' => return Ok(payload),
      _ if budget == 0 => return Err(DecodeError::IllFormedEntity(line, *col)),
      _ => {
        budget -= 1;
        payload.push(c);
      }
    }
  }

  // Ran out of input without ever meeting the terminator.
  Err(DecodeError::EOF)
}

/// Reads a named character reference and returns it in the form used as a key in the
/// entity table: the leading `&`, the name, and the closing `;`.
///
/// `decode_html_entities` calls this after it has consumed the `&`. At most
/// `ENTITIES.max_html_length` characters are read while looking for the `;`.
/// `col` is incremented once for every character taken from `chars`, the `;` included,
/// which matches how `parse_number` counts.
///
/// # Errors
///
/// * `DecodeError::IllFormedEntity` when no `;` turns up within the length limit.
/// * `DecodeError::EOF` when the input ends before a `;` is seen.
fn parse_entity_name(chars: &mut Peekable<Chars>, line: usize, col: &mut usize) -> Result<String, DecodeError> {
  let limit = ENTITIES.max_html_length;
  // One extra slot for the '&' that is prepended below.
  let mut entity = String::with_capacity(limit + 1);

  entity.push('&');

  for _ in 0..limit {
    match chars.next() {
      Some(c) => {
        *col += 1;
        entity.push(c);

        if c == ';' {
          return Ok(entity);
        }
      },
      None => return Err(DecodeError::EOF)
    }
  }

  // `limit` characters went by without a terminator: longer than any known entity.
  Err(DecodeError::IllFormedEntity(line, *col))
}