use crate::filter::ImageFilter;
use crate::Highlighter;
use gh_emoji::Replacer;
use html5ever::driver::{parse_fragment, ParseOpts};
use html5ever::interface::Attribute;
use html5ever::serialize::{serialize as serialize5, Serialize, SerializeOpts, TraversalScope};
use html5ever::tokenizer::TokenizerOpts;
use html5ever::tree_builder::{QuirksMode, TreeBuilderOpts, TreeSink};
use html5ever::QualName;
use markup5ever::tendril::TendrilSink;
use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
use std::borrow::Cow;
use std::io;
use std::rc::Rc;
#[cfg(test)]
use std::time::Duration;
use std::time::Instant;
/// Internal helper (originally described as an "ugly hack") that owns a parsed
/// HTML fragment and rewrites it in place.
///
/// `DomHighlighter::new` parses the markup and runs the filtering pass right
/// away; the result is then read back through `filtered()` or `filtered_node()`.
pub struct DomHighlighter<'a> {
/// The parsed tree. The filtering pass mutates it in place.
dom: RcDom,
/// Highlighter used for the text inside `<code>` elements. When `None`,
/// `<code>` elements are walked like any other element.
hilite: Option<&'a Highlighter>,
/// Asked for a replacement URL and dimensions for every `<img>` with a `src`.
image_filter: &'a dyn ImageFilter,
/// Emoji replacer applied to text nodes that are visited outside `<pre>`.
github_emoji: Replacer,
/// Warnings returned by the highlighter while filtering.
pub warnings: Vec<String>,
/// Passed unchanged to the image filter on every `filter_url` call.
pub deadline: Instant,
/// Passed unchanged to the highlighter on every `highlight_as_node` call.
rustdoc_extensions: bool,
}
/// What filtering one node reports back to the loop that visited it.
#[derive(Copy, Clone)]
struct FilterResult {
    /// The visited node should be detached from its parent.
    remove_element: bool,
    /// The visited subtree contained highlighted code spanning exactly one line.
    is_code_one_liner: bool,
}

impl FilterResult {
    /// Neutral result: keep the node, nothing to report.
    pub fn nop() -> Self {
        Self::with_removal(false)
    }

    /// Result asking the caller to detach the node.
    pub fn remove() -> Self {
        Self::with_removal(true)
    }

    /// Shared constructor; the one-liner flag always starts out cleared.
    fn with_removal(remove_element: bool) -> Self {
        Self {
            remove_element,
            is_code_one_liner: false,
        }
    }
}
impl<'a> DomHighlighter<'a> {
pub fn new(hilite: Option<&'a Highlighter>, markup: &str, image_filter: &'a dyn ImageFilter, rustdoc_extensions: bool, deadline: Instant) -> Self {
let dom = parse(markup);
let mut hi = Self {
dom,
hilite,
image_filter,
github_emoji: Replacer::new(),
warnings: Vec::new(),
deadline,
rustdoc_extensions,
};
let doc = Rc::clone(&hi.dom.document);
hi.filter_node(&doc, None, false);
hi
}
pub fn filtered(&self) -> String {
serialize_doc(&self.dom).expect("serialize")
}
pub fn filtered_node(&self) -> Handle {
Rc::clone(&self.dom.document)
}
// true if needs to remove the node
#[inline(never)]
fn filter_node(&mut self, node: &Handle, lang: Option<&str>, mut in_code: bool) -> FilterResult {
let mut new_lang = None;
match node.data {
NodeData::Text { ref contents } if !in_code => {
let mut c = contents.borrow_mut();
if let Cow::Owned(s) = self.github_emoji.replace_all(&c) {
*c = s.into();
}
return FilterResult::nop(); // no children
},
NodeData::Element { ref name, ref attrs, .. } if &*name.local == "a" => {
if let Some(id) = get_attr("id", &attrs.borrow()) {
let id = id.value.to_string();
if let Some(parent) = node.parent.take().and_then(|p| p.upgrade()) {
if let NodeData::Element { ref attrs, .. } = parent.data {
attrs.borrow_mut().push(Attribute {
name: QualName::new(None, ns!(), local_name!("id")),
value: id.into(),
});
}
// resets to the exact same node, because Cell+Weak is weird like that
node.parent.set(Some(Rc::downgrade(&parent)));
return FilterResult::remove();
}
}
},
NodeData::Element { ref name, ref attrs, .. } if &*name.local == "pre" => {
in_code = true;
if let Some(attr) = get_attr("lang", &attrs.borrow()) {
new_lang = Some(attr.value.to_string());
}
},
NodeData::Element { ref name, .. } if &*name.local == "code" => {
if let Some(hilite) = self.hilite {
let mut filter_result = FilterResult::nop();
let mut lines_of_code = 0;
let mut nodes = Vec::with_capacity(node.children.borrow().len());
for child in node.children.borrow().iter() {
match child.data {
NodeData::Text { ref contents } => {
let text = &**contents.borrow();
lines_of_code += text.trim().lines().take(2).count();
let (syntax, warning) = hilite.highlight_as_node(&mut self.dom, text, lang, self.rustdoc_extensions);
nodes.push(syntax);
if let Some(w) = warning {
self.warnings.push(w);
}
},
NodeData::Comment { .. } => {},
_ => return FilterResult::nop(), // skips links in code?
}
}
let tmp = node.children.borrow().iter().rev().map(Rc::downgrade).collect::<Vec<_>>();
for t in tmp.into_iter().filter_map(|c| c.upgrade()) {
self.dom.remove_from_parent(&t);
}
for new_node in nodes {
let reparsed = get_body(new_node);
self.dom.reparent_children(&reparsed, node);
}
if lines_of_code == 1 {
filter_result.is_code_one_liner = true;
}
return filter_result; // important! otherwise may loop forever
}
},
NodeData::Element { ref name, ref attrs, .. } if &*name.local == "img" => {
let mut attrs = attrs.borrow_mut();
let Some(src) = get_attr("src", &attrs).map(|s| s.value.trim().to_string()) else {
// drop image nodes without a src
return FilterResult::remove();
};
// Perf
attrs.push(Attribute {
name: QualName::new(None, ns!(), "decoding".into()),
value: "async".into(),
});
let wanted_width = get_attr("width", &attrs).and_then(|w| parse_html_size(&w.value, 800));
let wanted_height = get_attr("height", &attrs).and_then(|h| parse_html_size(&h.value, 800));
let container_width = if get_attr("align", &attrs).is_some() { 400 } else { 800 };
let f_img = self.image_filter.filter_url(&src, wanted_width, wanted_height, container_width, self.deadline);
if f_img.src != src {
// rewritten domain supports CORS, other may not :(
remove_attr("crossorigin", &mut attrs);
attrs.push(Attribute {
name: QualName::new(None, ns!(), local_name!("crossorigin")),
value: "anonymous".into(),
});
remove_attr("src", &mut attrs);
attrs.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: (*f_img.src).into(),
});
remove_attr("srcset", &mut attrs);
if let Some(srcset) = f_img.srcset.as_deref() {
attrs.push(Attribute {
name: QualName::new(None, ns!(), local_name!("srcset")),
value: srcset.into(),
});
}
}
if f_img.width != wanted_width {
remove_attr("width", &mut attrs);
if let Some(width) = f_img.width {
attrs.push(Attribute {
name: QualName::new(None, ns!(), local_name!("width")),
value: format!("{width}").into(),
});
}
}
if f_img.height != wanted_height {
remove_attr("height", &mut attrs);
if let Some(height) = f_img.height {
if height < 800 {
attrs.push(Attribute {
name: QualName::new(None, ns!(), local_name!("height")),
value: format!("{height}").into(),
});
}
}
}
},
_ => {},
}
let mut combined_result = FilterResult::nop();
let lang = new_lang.as_deref().or(lang);
let mut to_remove = Vec::new();
for ch in node.children.borrow().iter() {
let res = self.filter_node(ch, lang, in_code);
if res.remove_element {
to_remove.push(Rc::downgrade(ch));
} else if res.is_code_one_liner {
combined_result.is_code_one_liner = true;
if let NodeData::Element { ref name, ref attrs, .. } = &node.data {
if &*name.local == "pre" {
let mut attrs = attrs.borrow_mut();
attrs.push(Attribute {
name: QualName::new(None, ns!(), local_name!("class")),
value: "one-liner".into(),
});
combined_result.is_code_one_liner = false; // stop bubbling up
}
}
}
}
for to_remove in to_remove.into_iter().rev() {
if let Some(node) = to_remove.upgrade() {
self.dom.remove_from_parent(&node);
}
}
combined_result
}
}
/// Parses the value of an HTML `width`/`height` attribute into a pixel count.
///
/// Accepted forms, with surrounding whitespace ignored:
/// * a bare unsigned integer (`"120"`),
/// * an unsigned integer followed by `px` (`"120px"`),
/// * an unsigned integer percentage (`"50%"`), resolved against `max_size`
///   and rounded down.
///
/// Returns `None` for anything else, and for percentages whose result does
/// not fit in a `u32`.
fn parse_html_size(s: &str, max_size: u32) -> Option<u32> {
    let s = s.trim();
    if let Some(percent) = s.strip_suffix('%') {
        let percent: u32 = percent.trim_end().parse().ok()?;
        // The attribute comes from untrusted markup: multiply in 64 bits so a
        // huge percentage cannot overflow (panic in debug, wrap in release).
        let scaled = u64::from(percent) * u64::from(max_size) / 100;
        if scaled > u64::from(u32::MAX) {
            None
        } else {
            Some(scaled as u32) // lossless: range checked just above
        }
    } else {
        s.strip_suffix("px").map_or(s, str::trim_end).parse().ok()
    }
}
fn get_body(node: Handle) -> Handle {
match node.data {
NodeData::Element { ref name, .. } if &*name.local == "body" || &*name.local == "html" => {
let child_nodes = node.children.borrow();
if child_nodes.len() == 1 {
return get_body(child_nodes[0].clone());
}
},
NodeData::Document => {
if let Some(ch) = node.children.borrow().first() {
return get_body(ch.clone());
}
},
_ => {},
}
node
}
/// Returns the first attribute in `att` whose local name equals `name`.
fn get_attr<'a>(name: &str, att: &'a [Attribute]) -> Option<&'a Attribute> {
    for candidate in att {
        if &*candidate.name.local == name {
            return Some(candidate);
        }
    }
    None
}
fn remove_attr(name: &str, att: &mut Vec<Attribute>) {
att.retain(|a| &*a.name.local != name);
}
/// Parses `src` as an HTML fragment, using a `<div>` as the context element,
/// and returns the resulting DOM.
pub fn parse(src: &str) -> RcDom {
    let tree_builder = TreeBuilderOpts {
        exact_errors: false,
        scripting_enabled: false,
        iframe_srcdoc: false,
        drop_doctype: false,
        ignore_missing_rules: true,
        quirks_mode: QuirksMode::NoQuirks,
    };
    let opts = ParseOpts {
        tokenizer: TokenizerOpts::default(),
        tree_builder,
    };
    let context = QualName::new(None, ns!(html), local_name!("div"));
    parse_fragment(RcDom::default(), opts, context, Vec::new()).one(src)
}
pub fn serialize_doc(doc: &RcDom) -> io::Result<String> {
let children = doc.document.children.borrow();
assert_eq!(1, children.len());
let handle: SerializableHandle = children[0].clone().into();
serialize_node(&handle)
}
/// Serializes the children of `node` to an HTML string.
///
/// # Errors
///
/// Returns the serializer's I/O error, or an `InvalidData` error when the
/// produced bytes are not valid UTF-8.
pub fn serialize_node<S: Serialize>(node: &S) -> io::Result<String> {
    let opts = SerializeOpts {
        scripting_enabled: false,
        traversal_scope: TraversalScope::ChildrenOnly(Some(QualName::new(None, ns!(html), local_name!("div")))),
        create_missing_parent: true,
    };
    let mut buf = Vec::new();
    serialize5(&mut buf, node, opts)?;
    String::from_utf8(buf).map_err(|_| io::Error::from(io::ErrorKind::InvalidData))
}
/// Parsing followed by serializing normalizes the markup (quoted attributes,
/// closed `<p>`) and leaves emoji codes alone, since no filtering is involved.
#[test]
fn roundtrip() {
    let input = "<h1 lang=pl>hell:cat:<pre>:cat:</pre><noscript>o</noscript><!-- --></h1><p>hi:notmoji:";
    let expected = r#"<h1 lang="pl">hell:cat:<pre>:cat:</pre><noscript>o</noscript><!-- --></h1><p>hi:notmoji:</p>"#;
    let unfiltered = serialize_doc(&parse(input)).unwrap();
    assert_eq!(unfiltered, expected);
}
/// A single-line code block is highlighted and its `<pre>` is tagged `one-liner`.
#[test]
fn code_highlight() {
    let highlighter = Highlighter::new();
    let deadline = Instant::now() + Duration::from_secs(4);
    let markup = "<pre><code>fn main(){}</code></pre>";
    let expected = r#"<pre class="one-liner"><code><tt class="src-rs"><tt class="m-fn m-fn-rs"><tt class="stor-ty stor-ty-fn">fn</tt> <tt class="ent-n ent-n-fn">main</tt></tt><tt class="m-fn m-fn-rs"><tt class="m-fn m-fn-parms"><tt class="pun-sec pun-sec-parms">(</tt></tt><tt class="m-fn m-fn-parms"><tt class="pun-sec pun-sec-parms">)</tt></tt></tt><tt class="m-fn m-fn-rs"></tt><tt class="m-fn m-fn-rs"><tt class="m-bl m-bl-rs"><tt class="pun-sec pun-sec-bl">{</tt></tt><tt class="m-bl m-bl-rs"><tt class="pun-sec pun-sec-bl">}</tt></tt></tt></tt></code></pre>"#;
    let d = DomHighlighter::new(Some(&highlighter), markup, &(), true, deadline);
    assert_eq!(d.filtered(), expected);
}
/// Emoji codes are replaced in ordinary text but left alone inside `<pre>`
/// and highlighted `<code>`; unknown codes stay as written.
#[test]
fn emoji() {
    let highlighter = Highlighter::new();
    let deadline = Instant::now() + Duration::from_secs(4);
    let markup = r#"<div>:cat:<pre lang="plain">:cat:<code>:cat:</code></pre>:dog::nothotdog:</div>"#;
    let expected = r#"<div>🐱<pre lang="plain" class="one-liner">:cat:<code><tt class="txt-plain">:cat:</tt></code></pre>🐶:nothotdog:</div>"#;
    let d = DomHighlighter::new(Some(&highlighter), markup, &(), true, deadline);
    assert_eq!(d.filtered(), expected);
}