mod handlers_dispatcher;
mod rewrite_controller;
#[macro_use]
mod settings;
use self::handlers_dispatcher::ContentHandlersDispatcher;
use self::rewrite_controller::*;
use crate::base::SharedEncoding;
use crate::memory::MemoryLimitExceededError;
use crate::memory::MemoryLimiter;
use crate::parser::ParsingAmbiguityError;
use crate::selectors_vm::{self, SelectorMatchingVm};
use crate::transform_stream::*;
use encoding_rs::Encoding;
use mime::Mime;
use std::borrow::Cow;
use std::error::Error as StdError;
use std::fmt::{self, Debug};
use std::rc::Rc;
use thiserror::Error;
pub use self::settings::*;
/// A wrapper around an [`Encoding`] that can only be built (via [`AsciiCompatibleEncoding::new`]
/// or [`AsciiCompatibleEncoding::utf_8`]) for encodings reporting themselves as ASCII-compatible.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct AsciiCompatibleEncoding(&'static Encoding);

impl AsciiCompatibleEncoding {
    /// Wraps `encoding`, or returns `None` when `encoding.is_ascii_compatible()` is `false`.
    pub fn new(encoding: &'static Encoding) -> Option<Self> {
        Some(encoding)
            .filter(|candidate| candidate.is_ascii_compatible())
            .map(AsciiCompatibleEncoding)
    }

    /// Resolves the `charset` parameter of `mime` to an ASCII-compatible encoding.
    ///
    /// Returns `None` when the parameter is absent, when its value is not a label accepted by
    /// `Encoding::for_label_no_replacement`, or when the encoding is not ASCII-compatible.
    fn from_mimetype(mime: &Mime) -> Option<AsciiCompatibleEncoding> {
        let charset = mime.get_param("charset")?;
        let encoding = Encoding::for_label_no_replacement(charset.as_str().as_bytes())?;

        AsciiCompatibleEncoding::new(encoding)
    }

    /// Returns the wrapper for `encoding_rs::UTF_8`, skipping the compatibility check.
    pub fn utf_8() -> AsciiCompatibleEncoding {
        AsciiCompatibleEncoding(encoding_rs::UTF_8)
    }
}
impl From<AsciiCompatibleEncoding> for &'static Encoding {
fn from(ascii_enc: AsciiCompatibleEncoding) -> &'static Encoding {
ascii_enc.0
}
}
impl std::convert::TryFrom<&'static Encoding> for AsciiCompatibleEncoding {
type Error = ();
fn try_from(enc: &'static Encoding) -> Result<Self, ()> {
Self::new(enc).ok_or(())
}
}
/// Error type returned by `HtmlRewriter::write`, `HtmlRewriter::end` and `rewrite_str`.
///
/// Every variant wraps an underlying error and, per its `#[error("{0}")]` attribute, displays
/// as that wrapped error's own message.
#[derive(Error, Debug)]
pub enum RewritingError {
/// Wraps a [`MemoryLimitExceededError`].
#[error("{0}")]
MemoryLimitExceeded(MemoryLimitExceededError),
/// Wraps a [`ParsingAmbiguityError`].
#[error("{0}")]
ParsingAmbiguity(ParsingAmbiguityError),
/// Wraps an arbitrary boxed error.
// NOTE(review): presumably the error returned by a user-supplied content handler — the
// conversion into this variant happens outside this file; confirm.
#[error("{0}")]
ContentHandlerError(Box<dyn StdError + Send + Sync>),
}
/// A rewriter that is fed input through `write`, finalized with `end`, and emits its output
/// into the output sink `O`.
///
/// `'h` is the lifetime of the content handlers held by the rewrite controller.
pub struct HtmlRewriter<'h, O: OutputSink> {
// The transform stream all `write`/`end` calls are forwarded to.
stream: TransformStream<HtmlRewriteController<'h>, O>,
// Set once `write` or `end` has returned an error; any later call panics.
poisoned: bool,
}
// Runs `$expr` on behalf of `$self`, enforcing the poisoning protocol:
// - panics up front if `$self.poisoned` is already set;
// - marks `$self` as poisoned when `$expr` evaluates to an `Err`;
// - yields the result of `$expr` unchanged.
macro_rules! guarded {
    ($self:ident, $expr:expr) => {{
        assert!(
            !$self.poisoned,
            "Attempt to use the HtmlRewriter after a fatal error."
        );

        let outcome = $expr;

        // `poisoned` is known to be `false` here, so OR-ing is the same as a conditional set.
        $self.poisoned |= outcome.is_err();

        outcome
    }};
}
impl<'h, O: OutputSink> HtmlRewriter<'h, O> {
pub fn new<'s>(settings: Settings<'h, 's>, output_sink: O) -> Self {
let encoding = SharedEncoding::new(settings.encoding);
let mut selectors_ast = selectors_vm::Ast::default();
let mut dispatcher = ContentHandlersDispatcher::default();
let has_selectors =
!settings.element_content_handlers.is_empty() || settings.adjust_charset_on_meta_tag;
let charset_adjust_handler = if settings.adjust_charset_on_meta_tag {
let encoding = SharedEncoding::clone(&encoding);
Some(handler_adjust_charset_on_meta_tag(encoding))
} else {
None
};
let element_content_handlers = charset_adjust_handler
.into_iter()
.chain(settings.element_content_handlers);
for (selector, handlers) in element_content_handlers {
let locator = dispatcher.add_selector_associated_handlers(handlers);
selectors_ast.add_selector(&selector, locator);
}
for handlers in settings.document_content_handlers {
dispatcher.add_document_content_handlers(handlers);
}
let memory_limiter =
MemoryLimiter::new_shared(settings.memory_settings.max_allowed_memory_usage);
let selector_matching_vm = if has_selectors {
Some(SelectorMatchingVm::new(
selectors_ast,
settings.encoding.into(),
Rc::clone(&memory_limiter),
settings.enable_esi_tags,
))
} else {
None
};
let controller = HtmlRewriteController::new(dispatcher, selector_matching_vm);
let stream = TransformStream::new(TransformStreamSettings {
transform_controller: controller,
output_sink,
preallocated_parsing_buffer_size: settings
.memory_settings
.preallocated_parsing_buffer_size,
memory_limiter,
encoding,
strict: settings.strict,
});
HtmlRewriter {
stream,
poisoned: false,
}
}
#[inline]
pub fn write(&mut self, data: &[u8]) -> Result<(), RewritingError> {
guarded!(self, self.stream.write(data))
}
#[inline]
pub fn end(mut self) -> Result<(), RewritingError> {
guarded!(self, self.stream.end())
}
}
/// Opaque `Debug` output: only the type name is printed, none of the internal state.
impl<O: OutputSink> Debug for HtmlRewriter<'_, O> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("HtmlRewriter")
    }
}
/// Builds the `meta` element handler that updates `encoding` from the document itself.
///
/// For each `<meta>` element the handler looks for an encoding in this order:
///
/// 1. the `charset` attribute, interpreted as an encoding label;
/// 2. the `charset` parameter of the `content` attribute, but only when `http-equiv` equals
///    `Content-Type` (ASCII case-insensitively) and `content` parses as a MIME type.
///
/// Only ASCII-compatible encodings are accepted. When neither source yields one, `encoding`
/// is left untouched. The handler itself never returns an error.
fn handler_adjust_charset_on_meta_tag(
    encoding: SharedEncoding,
) -> (
    Cow<'static, crate::Selector>,
    ElementContentHandlers<'static>,
) {
    element!("meta", move |el| {
        let from_charset_attr = el
            .get_attribute("charset")
            .and_then(|label| Encoding::for_label_no_replacement(label.as_bytes()))
            .and_then(AsciiCompatibleEncoding::new);

        // Kept as a closure so the extra attribute lookups and the MIME parse only happen when
        // the `charset` attribute did not already provide a usable encoding.
        let from_http_equiv = || {
            let http_equiv = el.get_attribute("http-equiv")?;
            if !http_equiv.eq_ignore_ascii_case("Content-Type") {
                return None;
            }

            let content = el.get_attribute("content")?;
            let mime = content.parse::<Mime>().ok()?;

            AsciiCompatibleEncoding::from_mimetype(&mime)
        };

        if let Some(charset) = from_charset_attr.or_else(from_http_equiv) {
            encoding.set(charset);
        }

        Ok(())
    })
}
/// Rewrites the whole of `html` in one go and returns the collected output as a `String`.
///
/// A temporary [`HtmlRewriter`] is built from `settings`, fed `html` as a single chunk and
/// then ended; any error from either step is returned.
///
/// # Panics
///
/// Panics if the collected output is not valid UTF-8.
// NOTE(review): presumably this can only happen when `settings` selects a non-UTF-8 encoding;
// confirm against the conversions into `Settings`.
pub fn rewrite_str<'h, 's>(
    html: &str,
    settings: impl Into<Settings<'h, 's>>,
) -> Result<String, RewritingError> {
    let mut rewritten = Vec::new();
    let sink = |chunk: &[u8]| rewritten.extend_from_slice(chunk);

    let mut rewriter = HtmlRewriter::new(settings.into(), sink);
    rewriter.write(html.as_bytes())?;
    rewriter.end()?;

    Ok(String::from_utf8(rewritten).unwrap())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::html_content::ContentType;
use crate::test_utils::{Output, ASCII_COMPATIBLE_ENCODINGS, NON_ASCII_COMPATIBLE_ENCODINGS};
use encoding_rs::Encoding;
use itertools::Itertools;
use std::cell::RefCell;
use std::convert::TryInto;
use std::rc::Rc;
// Encodes each of `chunks` with `encoding`, writes it to `rewriter`, then ends the rewriter.
// Any rewriting error fails the calling test via `unwrap`.
fn write_chunks<O: OutputSink>(
    mut rewriter: HtmlRewriter<O>,
    encoding: &'static Encoding,
    chunks: &[&str],
) {
    chunks.iter().for_each(|text| {
        let (encoded, ..) = encoding.encode(text);
        rewriter.write(&encoded).unwrap();
    });

    rewriter.end().unwrap();
}
// Runs `html` through a rewriter configured with `settings` as a single chunk and returns
// the raw output bytes.
fn rewrite_html_bytes(html: &[u8], settings: Settings) -> Vec<u8> {
    let mut rewritten: Vec<u8> = Vec::with_capacity(html.len());
    let sink = |chunk: &[u8]| rewritten.extend_from_slice(chunk);

    let mut rewriter = HtmlRewriter::new(settings, sink);
    rewriter.write(html).unwrap();
    rewriter.end().unwrap();

    rewritten
}
// `rewrite_str` applies both element and comment handlers bound to the `div` selector.
#[test]
fn rewrite_html_str() {
    let settings = RewriteStrSettings {
        element_content_handlers: vec![
            element!("div", |div| {
                div.set_tag_name("span").unwrap();
                Ok(())
            }),
            comments!("div", |comment| {
                comment.set_text("hello").unwrap();
                Ok(())
            }),
        ],
        ..RewriteStrSettings::default()
    };

    let rewritten = rewrite_str("<!-- 42 --><div><!--hi--></div>", settings).unwrap();

    assert_eq!(rewritten, "<!-- 42 --><span><!--hello--></span>");
}
// `rewrite_str` also accepts a full `Settings` value; with defaults the input is unchanged.
#[test]
fn rewrite_arbitrary_settings() {
    let settings = Settings::default();

    let rewritten = rewrite_str("<span>Some text</span>", settings).unwrap();

    assert_eq!(rewritten, "<span>Some text</span>");
}
// None of the known non-ASCII-compatible encodings may be wrapped.
#[test]
fn non_ascii_compatible_encoding() {
    for &rejected in NON_ASCII_COMPATIBLE_ENCODINGS.iter() {
        let wrapped = AsciiCompatibleEncoding::new(rejected);
        assert_eq!(wrapped, None);
    }
}
// The doctype handler sees name, public id and system id of every doctype in the input,
// including one split across chunks and an unterminated one at the very end.
#[test]
fn doctype_info() {
    let chunks = [
        "<!doctype html1>",
        "<!-- test --><div>",
        r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "#,
        r#""http://www.w3.org/TR/html4/strict.dtd">"#,
        "</div><!DoCtYPe ",
    ];

    for &enc in ASCII_COMPATIBLE_ENCODINGS.iter() {
        let mut seen = Vec::default();

        let settings = Settings {
            document_content_handlers: vec![doctype!(|d| {
                seen.push((d.name(), d.public_id(), d.system_id()));
                Ok(())
            })],
            encoding: enc.try_into().unwrap(),
            ..Settings::default()
        };

        // The rewriter (and with it the handler borrowing `seen`) is consumed here.
        write_chunks(HtmlRewriter::new(settings, |_: &[u8]| {}), enc, &chunks);

        assert_eq!(
            seen,
            &[
                (Some("html1".into()), None, None),
                (
                    Some("html".into()),
                    Some("-//W3C//DTD HTML 4.01//EN".into()),
                    Some("http://www.w3.org/TR/html4/strict.dtd".into())
                ),
                (None, None, None),
            ]
        );
    }
}
// A `*` element handler gets to modify the start tag of every element, in every
// ASCII-compatible encoding.
#[test]
fn rewrite_start_tags() {
    let input = [
        "<!doctype html>\n",
        "<html>\n",
        "  <head></head>\n",
        "  <body>\n",
        "    <div>Test</div>\n",
        "  </body>\n",
        "</html>",
    ];

    let expected = concat!(
        "<!doctype html>\n",
        "<html foo=\"bar\"><test></test>\n",
        "  <head foo=\"bar\"><test></test></head>\n",
        "  <body foo=\"bar\"><test></test>\n",
        "    <div foo=\"bar\"><test></test>Test</div>\n",
        "  </body>\n",
        "</html>",
    );

    for &enc in ASCII_COMPATIBLE_ENCODINGS.iter() {
        let mut output = Output::new(enc);

        let settings = Settings {
            element_content_handlers: vec![element!("*", |el| {
                el.set_attribute("foo", "bar").unwrap();
                el.prepend("<test></test>", ContentType::Html);
                Ok(())
            })],
            encoding: enc.try_into().unwrap(),
            ..Settings::default()
        };

        let rewriter = HtmlRewriter::new(settings, |c: &[u8]| output.push(c));
        write_chunks(rewriter, enc, &input);

        let actual: String = output.into();
        assert_eq!(actual, expected);
    }
}
// Document-level comment and text handlers are applied everywhere in the document, in every
// ASCII-compatible encoding.
#[test]
fn rewrite_document_content() {
    let input = [
        "<!doctype html>\n",
        "<!-- hey -->\n",
        "<html>\n",
        "  <head><!-- aloha --></head>\n",
        "  <body>\n",
        "    <div>Test</div>\n",
        "  </body>\n",
        "  <!-- bonjour -->\n",
        "</html>Pshhh",
    ];

    let expected = concat!(
        "<!doctype html>\nBAZ",
        "<!-- hey 1337-->\nBAZ",
        "<html>\n",
        "  BAZ<head><!-- aloha 1337--></head>\n",
        "  BAZ<body>\n",
        "    BAZ<div>TestBAZ</div>\n",
        "  BAZ</body>\n",
        "  BAZ<!-- bonjour 1337-->\nBAZ",
        "</html>PshhhBAZ",
    );

    for &enc in ASCII_COMPATIBLE_ENCODINGS.iter() {
        let mut output = Output::new(enc);

        let settings = Settings {
            element_content_handlers: vec![],
            document_content_handlers: vec![
                doc_comments!(|c| {
                    c.set_text(&(c.text() + "1337")).unwrap();
                    Ok(())
                }),
                doc_text!(|c| {
                    if c.last_in_text_node() {
                        c.after("BAZ", ContentType::Text);
                    }
                    Ok(())
                }),
            ],
            encoding: enc.try_into().unwrap(),
            ..Settings::default()
        };

        let rewriter = HtmlRewriter::new(settings, |c: &[u8]| output.push(c));
        write_chunks(rewriter, enc, &input);

        let actual: String = output.into();
        assert_eq!(actual, expected);
    }
}
// Handlers whose selectors all match the same element run in registration order.
#[test]
fn handler_invocation_order() {
    let call_log = Rc::new(RefCell::new(Vec::default()));

    // Produces an element handler for `$sel` that appends `$idx` to `call_log` when invoked.
    macro_rules! recording_handler {
        ($sel:expr, $idx:expr) => {
            element!($sel, {
                let call_log = Rc::clone(&call_log);

                move |_| {
                    call_log.borrow_mut().push($idx);
                    Ok(())
                }
            })
        };
    }

    let settings = RewriteStrSettings {
        element_content_handlers: vec![
            recording_handler!("div span", 0),
            recording_handler!("div > span", 1),
            recording_handler!("span", 2),
            recording_handler!("[foo]", 3),
            recording_handler!("div span[foo]", 4),
        ],
        ..RewriteStrSettings::default()
    };

    let _rewritten = rewrite_str("<div><span foo></span></div>", settings).unwrap();

    assert_eq!(*call_log.borrow(), vec![0, 1, 2, 3, 4]);
}
// With `enable_esi_tags` set, an `esi:include` element can be selected and replaced.
#[test]
fn write_esi_tags() {
    let settings = RewriteStrSettings {
        element_content_handlers: vec![element!("esi\\:include", |include| {
            include.replace("?", ContentType::Text);
            Ok(())
        })],
        enable_esi_tags: true,
        ..RewriteStrSettings::default()
    };

    let rewritten = rewrite_str("<span><esi:include src=a></span>", settings).unwrap();

    assert_eq!(rewritten, "<span>?</span>");
}
// A `<meta charset>` tag switches the encoding only when `adjust_charset_on_meta_tag` is set:
// the windows-1251 bytes after it survive a text rewrite only in that case.
#[test]
fn test_rewrite_adjust_charset_on_meta_tag_attribute_charset() {
    use crate::html_content::{ContentType, TextChunk};

    // Document text handler that triples every exclamation mark.
    let make_text_handler = || {
        doc_text!(move |text: &mut TextChunk| {
            let louder = text.as_str().replace('!', "!!!");
            text.replace(&louder, ContentType::Text);
            Ok(())
        })
    };

    let html: Vec<u8> = [
        r#"<meta charset="windows-1251"><html><head></head><body>I love "#
            .as_bytes()
            .to_vec(),
        vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc],
        r#"!</body></html>"#.as_bytes().to_vec(),
    ]
    .into_iter()
    .concat();

    // The input with every `!` byte tripled and all other bytes untouched.
    let expected: Vec<u8> = html
        .iter()
        .flat_map(|&byte| {
            if byte == b'!' {
                vec![b'!'; 3]
            } else {
                vec![byte]
            }
        })
        .collect();

    let without_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            ..Settings::default()
        },
    );
    assert_ne!(without_adjustment, expected);

    let with_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            adjust_charset_on_meta_tag: true,
            ..Settings::default()
        },
    );
    assert_eq!(with_adjustment, expected);
}
// Same as the `charset` attribute test, but the encoding is announced through
// `<meta http-equiv="content-type" content="...; charset=...">`.
#[test]
fn test_rewrite_adjust_charset_on_meta_tag_attribute_content_type() {
    use crate::html_content::{ContentType, TextChunk};

    // Document text handler that triples every exclamation mark.
    let make_text_handler = || {
        doc_text!(move |text: &mut TextChunk| {
            let louder = text.as_str().replace('!', "!!!");
            text.replace(&louder, ContentType::Text);
            Ok(())
        })
    };

    let html: Vec<u8> = [
        r#"<meta http-equiv="content-type" content="text/html; charset=windows-1251"><html><head></head><body>I love "#
            .as_bytes()
            .to_vec(),
        vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc],
        r#"!</body></html>"#.as_bytes().to_vec(),
    ]
    .into_iter()
    .concat();

    // The input with every `!` byte tripled and all other bytes untouched.
    let expected: Vec<u8> = html
        .iter()
        .flat_map(|&byte| {
            if byte == b'!' {
                vec![b'!'; 3]
            } else {
                vec![byte]
            }
        })
        .collect();

    let without_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            ..Settings::default()
        },
    );
    assert_ne!(without_adjustment, expected);

    let with_adjustment: Vec<u8> = rewrite_html_bytes(
        &html,
        Settings {
            document_content_handlers: vec![make_text_handler()],
            adjust_charset_on_meta_tag: true,
            ..Settings::default()
        },
    );
    assert_eq!(with_adjustment, expected);
}
mod fatal_errors {
use super::*;
use crate::errors::MemoryLimitExceededError;
// Builds a rewriter with a no-op `*` element handler, no preallocated parsing buffer and the
// given memory cap.
fn create_rewriter<O: OutputSink>(
    max_allowed_memory_usage: usize,
    output_sink: O,
) -> HtmlRewriter<'static, O> {
    let memory_settings = MemorySettings {
        max_allowed_memory_usage,
        preallocated_parsing_buffer_size: 0,
    };

    let settings = Settings {
        element_content_handlers: vec![element!("*", |_| Ok(()))],
        memory_settings,
        ..Settings::default()
    };

    HtmlRewriter::new(settings, output_sink)
}
// A tag that spans two chunks and does not fit into the memory cap makes the second write
// fail with `MemoryLimitExceeded`.
#[test]
fn buffer_capacity_limit() {
    const MAX: usize = 100;

    let mut rewriter = create_rewriter(MAX, |_: &[u8]| {});

    let half = MAX / 2;
    let chunk_1 = format!("<img alt=\"{}", "l".repeat(half));
    let chunk_2 = format!("{}\" />", "r".repeat(half));

    rewriter.write(chunk_1.as_bytes()).unwrap();

    let write_err = rewriter.write(chunk_2.as_bytes()).unwrap_err();

    if let RewritingError::MemoryLimitExceeded(ref e) = write_err {
        assert_eq!(*e, MemoryLimitExceededError);
    } else {
        panic!("{}", write_err);
    }
}
// Once a write has failed, any further use of the rewriter must panic.
#[test]
#[should_panic(expected = "Attempt to use the HtmlRewriter after a fatal error.")]
fn poisoning_after_fatal_error() {
    const MAX: usize = 10;

    let mut rewriter = create_rewriter(MAX, |_: &[u8]| {});
    let oversized_tag = format!("<img alt=\"{}", "l".repeat(MAX));

    // The first failure poisons the rewriter...
    let _first_failure = rewriter.write(oversized_tag.as_bytes()).unwrap_err();

    // ...so this call is expected to panic before returning anything.
    let _unreachable = rewriter.end().unwrap_err();
}
// An error returned by any kind of content handler must surface from `write`/`end` with the
// handler's own message.
#[test]
fn content_handler_error_propagation() {
    // Rewrites a fixed document with the given handlers and asserts that rewriting fails with
    // exactly `expected_err` as the error message.
    fn assert_err(
        element_handlers: ElementContentHandlers,
        document_handlers: DocumentContentHandlers,
        expected_err: &'static str,
    ) {
        use std::borrow::Cow;

        let mut rewriter = HtmlRewriter::new(
            Settings {
                element_content_handlers: vec![(
                    Cow::Owned("*".parse().unwrap()),
                    element_handlers,
                )],
                document_content_handlers: vec![document_handlers],
                ..Settings::default()
            },
            |_: &[u8]| {},
        );

        // The leading doctype gives the doctype handler something to fire on; it precedes
        // everything else so it cannot mask the errors of the other handler kinds.
        let chunks = [
            "<!doctype html>",
            "<!--doc comment--> Doc text",
            "<div><!--el comment-->El text</div>",
        ];

        // Stop at the first failing write (the rewriter is poisoned afterwards), and only
        // call `end` when every write succeeded.
        let err = chunks
            .iter()
            .try_for_each(|chunk| rewriter.write(chunk.as_bytes()))
            .and_then(|_| rewriter.end())
            .err()
            .expect("Error expected");

        assert_eq!(err.to_string(), expected_err);
    }

    assert_err(
        ElementContentHandlers::default(),
        doc_comments!(|_| Err("Error in doc comment handler".into())),
        "Error in doc comment handler",
    );

    assert_err(
        ElementContentHandlers::default(),
        doc_text!(|_| Err("Error in doc text handler".into())),
        "Error in doc text handler",
    );

    assert_err(
        ElementContentHandlers::default(),
        doctype!(|_| Err("Error in doctype handler".into())),
        "Error in doctype handler",
    );

    assert_err(
        ElementContentHandlers::default().element(|_| Err("Error in element handler".into())),
        DocumentContentHandlers::default(),
        "Error in element handler",
    );

    assert_err(
        ElementContentHandlers::default()
            .comments(|_| Err("Error in element comment handler".into())),
        DocumentContentHandlers::default(),
        "Error in element comment handler",
    );

    assert_err(
        ElementContentHandlers::default().text(|_| Err("Error in element text handler".into())),
        DocumentContentHandlers::default(),
        "Error in element text handler",
    );
}
}
}