use std::borrow::{Cow, IntoCow, ToOwned};
use std::default::Default;
use std::fmt;
use std::str::CharRange;
use html5ever::tokenizer::{Attribute, Tag, TokenSink, Token};
use html5ever::tokenizer::{CharacterTokens, CommentToken, NullCharacterToken,
ParseError, TagToken};
use html5ever::tokenizer::TagKind::{StartTag, EndTag};
use html5ever::driver::{tokenize_to, one_input};
use regex;
use url::{Url, UrlParser};
/// Wraps `html` in a [`CleanHtml`] display adapter.
///
/// No work happens here; the markup is tokenized each time the returned
/// value is formatted (e.g. via `format!("{}", ...)`).
pub fn clean_html<'a>(html: &'a str) -> CleanHtml<'a> {
    let adapter = CleanHtml(html);
    adapter
}
/// Wraps `html` in a [`SanitizeHtml`] display adapter.
///
/// `base_uri`, when given, is parsed with `Url::parse`; a value that fails to
/// parse is silently treated the same as `None`.  The markup itself is only
/// processed when the returned value is formatted.
pub fn sanitize_html<'a>(html: &'a str, base_uri: Option<&str>) ->
    SanitizeHtml<'a>
{
    let parsed_base = match base_uri {
        Some(raw) => Url::parse(raw).ok(),
        None => None,
    };
    SanitizeHtml(html, parsed_base)
}
/// Display adapter over a borrowed HTML string.
///
/// Field `0` is the raw markup.  Formatting the value runs the markup through
/// the tokenizer and writes out only its character data (see the `Display`
/// impl and `MarkupTagCleaner`).
#[unstable]
pub struct CleanHtml<'a>(pub &'a str);
impl<'a> fmt::Display for CleanHtml<'a> {
    /// Tokenizes the wrapped markup and streams it through a
    /// `MarkupTagCleaner` that writes into `f`.
    ///
    /// Always returns `Ok(())`: the sink unwraps its own write results, so a
    /// failing formatter surfaces as a panic rather than as an `Err` here.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The tokenizer wants owned input, hence the copy of the borrowed text.
        let input = one_input(self.0.to_owned());
        let cleaner = MarkupTagCleaner { w: f };
        tokenize_to(cleaner, input, Default::default());
        Ok(())
    }
}
/// Token sink used by `CleanHtml`: forwards character data to a formatter.
struct MarkupTagCleaner<'a, 'b: 'a> {
    /// Destination formatter that receives the extracted text.
    w: &'a mut fmt::Formatter<'b>,
}
impl<'a, 'b> TokenSink for MarkupTagCleaner<'a, 'b> {
    /// Writes character data (and NUL characters) to the formatter and drops
    /// every other kind of token: tags, comments, parse errors, and so on.
    ///
    /// Panics if the underlying formatter reports a write error.
    fn process_token(&mut self, token: Token) {
        match token {
            CharacterTokens(text) => self.w.write_str(&text).unwrap(),
            NullCharacterToken => self.w.write_str("\0").unwrap(),
            // Anything that is not text is markup and is discarded.
            _ => {}
        }
    }
}
/// Display adapter that sanitizes a borrowed HTML string when formatted.
///
/// Field `0` is the raw markup; field `1` is an optional base URL which the
/// sanitizer uses to resolve `href` attributes on `<a>` and `<link>` tags.
#[unstable]
pub struct SanitizeHtml<'a>(pub &'a str, pub Option<Url>);
impl<'a> fmt::Display for SanitizeHtml<'a> {
    /// Tokenizes the wrapped markup and streams it through an `HtmlSanitizer`
    /// that writes the sanitized result into `f`.
    ///
    /// Always returns `Ok(())`: the sanitizer unwraps its own write results,
    /// so a failing formatter surfaces as a panic rather than as an `Err`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The tokenizer wants owned input, hence the copy of the borrowed text.
        let input = one_input(self.0.to_owned());
        let sanitizer = HtmlSanitizer {
            w: f,
            base_uri: &self.1,
            // Start outside of any <script> element.
            ignore: false,
        };
        tokenize_to(sanitizer, input, Default::default());
        Ok(())
    }
}
/// Builds the regex matching a single `display: <keyword>` declaration inside
/// an inline `style` attribute value.
///
/// Capture group 1 is the delimiter that precedes the declaration (start of
/// string or `;`) so that callers can keep it when deleting the match.
///
/// The regex is compiled on every call.  The `unwrap` can only fire if the
/// fixed pattern literal below is itself invalid, which would be a
/// programming error.
#[inline]
fn disallowed_style_pattern() -> regex::Regex {
    // Only the `regex` crate name is imported at the top of the file, so the
    // constructor has to be reached through its full path.
    regex::Regex::new(r#"(^|;)\s*display\s*:\s*[a-z-]+\s*(?:;\s*|$)"#).unwrap()
}
/// URL scheme prefixes (lower-case, including the trailing `:`) that
/// `disallowed_scheme` rejects in `href` attribute values.
static DISALLOWED_SCHEMES: &'static [&'static str] = &[
    "javascript:",
    "jscript:",
    "livescript:",
    "vbscript:",
    "data:",
    "about:",
    "mocha:",
];
/// Token sink used by `SanitizeHtml`: re-serializes tokens into a formatter
/// while filtering out disallowed content.
struct HtmlSanitizer<'a, 'b: 'a> {
    /// Optional base URL used to resolve `href` values on `<a>`/`<link>`.
    base_uri: &'a Option<Url>,
    /// Destination formatter that receives the sanitized markup.
    w: &'a mut fmt::Formatter<'b>,
    /// `true` while inside a `<script>` element whose content is being dropped.
    ignore: bool,
}
impl<'a, 'b> HtmlSanitizer<'a, 'b> {
    /// Forwards pre-built format arguments to the wrapped formatter.
    ///
    /// Having an inherent `write_fmt` is what lets `write!(self, ...)` be used
    /// on the sanitizer.  Panics if the formatter reports an error.
    #[inline]
    fn write_fmt(&mut self, args: fmt::Arguments) {
        let outcome = self.w.write_fmt(args);
        outcome.unwrap()
    }

    /// Writes `data` verbatim to the wrapped formatter.
    ///
    /// Panics if the formatter reports an error.
    #[inline]
    fn write_str(&mut self, data: &str) {
        let outcome = self.w.write_str(data);
        outcome.unwrap()
    }
}
fn remove_css(value: &str) -> String {
disallowed_style_pattern().replace(value, "$1")
}
/// Returns `true` when `value` starts with one of `DISALLOWED_SCHEMES`.
///
/// The comparison is done on a normalised view of `value`, mirroring what a
/// URL parser does before it looks at the scheme, so that trivially obfuscated
/// values such as `JavaScript:...`, `  javascript:...` or `java\tscript:...`
/// are still recognised:
///
/// * ASCII tab, LF and CR are ignored wherever they appear;
/// * leading spaces and control characters are skipped;
/// * ASCII letters are compared case-insensitively.
fn disallowed_scheme(value: &str) -> bool {
    DISALLOWED_SCHEMES.iter().any(|scheme| {
        let mut input = value.bytes()
            .filter(|&b| b != b'\t' && b != b'\n' && b != b'\r')
            .skip_while(|&b| b <= b' ')
            // Entries in the list are lower-case, so fold the input down.
            .map(|b| if b'A' <= b && b <= b'Z' { b + 32 } else { b });
        // Prefix test: every byte of the scheme must be matched, in order, by
        // the normalised input (running out of input counts as a mismatch).
        scheme.bytes().all(|expected| input.next() == Some(expected))
    })
}
impl<'a, 'b> TokenSink for HtmlSanitizer<'a, 'b> {
fn process_token(&mut self, token: Token) {
match (self.ignore, token) {
(_, TagToken(Tag { kind: EndTag, name: atom!(script), .. })) => {
self.ignore = false;
}
(_, TagToken(Tag { kind: EndTag, name, .. })) => {
write!(self, "</{}>", name.as_slice());
}
(true, _) => { }
(false, TagToken(Tag { kind: StartTag, name: atom!(script), .. })) => {
self.ignore = true;
}
(false, TagToken(Tag { kind: StartTag, name, mut attrs, .. })) => {
write!(self, "<{}", name.as_slice());
if let Some(base_uri) = self.base_uri.as_ref() {
if name == atom!(a) || name == atom!(link) {
let mut url_parser = UrlParser::new();
let base_uri = url_parser.base_url(base_uri);
for &mut Attribute { ref name,
ref mut value } in attrs.iter_mut() {
if name.local == atom!(href) {
match base_uri.parse(&value) {
Ok(u) => { *value = u.serialize(); }
Err(_) => { } }
}
}
}
}
for Attribute { name, value } in attrs.into_iter() {
write!(self, " {}", name.local.as_slice());
if !value.is_empty() {
let value = match name.local {
atom!(href) if disallowed_scheme(&value) => {
"".into_cow()
}
atom!(style) => {
remove_css(&value).into_cow()
}
_ => value.into_cow()
};
write!(self, "=\"{}\"", value);
}
}
write!(self, ">");
}
(false, CommentToken(c)) => write!(self, "<!--{}-->", c),
(false, CharacterTokens(b)) => self.write_str(&b),
(false, NullCharacterToken) => self.write_str("\0"),
(_, ParseError(_)) => { } _ => { }
}
}
}