render_readme 0.11.0

Render Markdown or reStructuredText with syntax highlighting and image filtering similar to GitHub's
Documentation
use crate::filter::ImageFilter;
use crate::Highlighter;
use gh_emoji::Replacer;
use html5ever::driver::{parse_fragment, ParseOpts};
use html5ever::interface::Attribute;
use html5ever::serialize::{serialize as serialize5, Serialize, SerializeOpts, TraversalScope};
use html5ever::tokenizer::TokenizerOpts;
use html5ever::tree_builder::{QuirksMode, TreeBuilderOpts, TreeSink};
use html5ever::QualName;
use markup5ever::tendril::TendrilSink;
use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
use std::borrow::Cow;
use std::io;
use std::rc::Rc;
#[cfg(test)]
use std::time::Duration;
use std::time::Instant;

/// Internal ugly hack
///
/// Parses an HTML fragment into an `RcDom`, rewrites it in place (emoji
/// shortcodes, anchor ids, code highlighting, image attributes) and exposes
/// the filtered tree. All filtering happens eagerly inside
/// [`DomHighlighter::new`].
pub struct DomHighlighter<'a> {
    /// The parsed document that is rewritten in place.
    dom: RcDom,
    /// Syntax highlighter applied to the text of `<code>` elements; when
    /// `None`, `<code>` contents are not highlighted.
    hilite: Option<&'a Highlighter>,
    /// Consulted for every `<img>` that has a `src` attribute.
    image_filter: &'a dyn ImageFilter,
    /// Replaces emoji shortcodes in text nodes that are not inside `<pre>`.
    github_emoji: Replacer,
    /// Warnings returned by the highlighter while filtering.
    pub warnings: Vec<String>,
    /// Forwarded to the image filter on every `filter_url` call.
    pub deadline: Instant,
    /// Forwarded to the highlighter on every `highlight_as_node` call.
    rustdoc_extensions: bool,
}

/// Outcome of filtering a single DOM node, reported back to its parent.
#[derive(Copy, Clone)]
struct FilterResult {
    /// The parent should detach this node from the tree.
    remove_element: bool,
    /// The node is (or contains) highlighted code that counted as exactly one line.
    is_code_one_liner: bool,
}

impl FilterResult {
    /// Result that asks for nothing: keep the node, no one-liner flag.
    pub fn nop() -> Self {
        let (remove_element, is_code_one_liner) = (false, false);
        Self { remove_element, is_code_one_liner }
    }

    /// Result that asks the parent to remove the node.
    pub fn remove() -> Self {
        Self { remove_element: true, ..Self::nop() }
    }
}

impl<'a> DomHighlighter<'a> {
    pub fn new(hilite: Option<&'a Highlighter>, markup: &str, image_filter: &'a dyn ImageFilter, rustdoc_extensions: bool, deadline: Instant) -> Self {
        let dom = parse(markup);
        let mut hi = Self {
            dom,
            hilite,
            image_filter,
            github_emoji: Replacer::new(),
            warnings: Vec::new(),
            deadline,
            rustdoc_extensions,
        };
        let doc = Rc::clone(&hi.dom.document);
        hi.filter_node(&doc, None, false);
        hi
    }

    pub fn filtered(&self) -> String {
        serialize_doc(&self.dom).expect("serialize")
    }

    pub fn filtered_node(&self) -> Handle {
        Rc::clone(&self.dom.document)
    }

    // true if needs to remove the node
    #[inline(never)]
    fn filter_node(&mut self, node: &Handle, lang: Option<&str>, mut in_code: bool) -> FilterResult {
        let mut new_lang = None;
        match node.data {
            NodeData::Text { ref contents } if !in_code => {
                let mut c = contents.borrow_mut();
                if let Cow::Owned(s) = self.github_emoji.replace_all(&c) {
                    *c = s.into();
                }
                return FilterResult::nop(); // no children
            },
            NodeData::Element { ref name, ref attrs, .. } if &*name.local == "a" => {
                if let Some(id) = get_attr("id", &attrs.borrow()) {
                    let id = id.value.to_string();
                    if let Some(parent) = node.parent.take().and_then(|p| p.upgrade()) {
                        if let NodeData::Element { ref attrs, .. } = parent.data {
                            attrs.borrow_mut().push(Attribute {
                                name: QualName::new(None, ns!(), local_name!("id")),
                                value: id.into(),
                            });
                        }
                        // resets to the exact same node, because Cell+Weak is weird like that
                        node.parent.set(Some(Rc::downgrade(&parent)));
                        return FilterResult::remove();
                    }
                }
            },
            NodeData::Element { ref name, ref attrs, .. } if &*name.local == "pre" => {
                in_code = true;
                if let Some(attr) = get_attr("lang", &attrs.borrow()) {
                    new_lang = Some(attr.value.to_string());
                }
            },
            NodeData::Element { ref name, .. } if &*name.local == "code" => {
                if let Some(hilite) = self.hilite {
                    let mut filter_result = FilterResult::nop();
                    let mut lines_of_code = 0;
                    let mut nodes = Vec::with_capacity(node.children.borrow().len());
                    for child in node.children.borrow().iter() {
                        match child.data {
                            NodeData::Text { ref contents } => {
                                let text = &**contents.borrow();
                                lines_of_code += text.trim().lines().take(2).count();
                                let (syntax, warning) = hilite.highlight_as_node(&mut self.dom, text, lang, self.rustdoc_extensions);
                                nodes.push(syntax);
                                if let Some(w) = warning {
                                    self.warnings.push(w);
                                }
                            },
                            NodeData::Comment { .. } => {},
                            _ => return FilterResult::nop(), // skips links in code?
                        }
                    }
                    let tmp = node.children.borrow().iter().rev().map(Rc::downgrade).collect::<Vec<_>>();
                    for t in tmp.into_iter().filter_map(|c| c.upgrade()) {
                        self.dom.remove_from_parent(&t);
                    }
                    for new_node in nodes {
                        let reparsed = get_body(new_node);
                        self.dom.reparent_children(&reparsed, node);
                    }
                    if lines_of_code == 1 {
                        filter_result.is_code_one_liner = true;
                    }
                    return filter_result; // important! otherwise may loop forever
                }
            },
            NodeData::Element { ref name, ref attrs, .. } if &*name.local == "img" => {
                let mut attrs = attrs.borrow_mut();

                let Some(src) = get_attr("src", &attrs).map(|s| s.value.trim().to_string()) else {
                    // drop image nodes without a src
                    return FilterResult::remove();
                };

                // Perf
                attrs.push(Attribute {
                    name: QualName::new(None, ns!(), "decoding".into()),
                    value: "async".into(),
                });

                let wanted_width = get_attr("width", &attrs).and_then(|w| parse_html_size(&w.value, 800));
                let wanted_height = get_attr("height", &attrs).and_then(|h| parse_html_size(&h.value, 800));
                let container_width = if get_attr("align", &attrs).is_some() { 400 } else { 800 };
                let f_img = self.image_filter.filter_url(&src, wanted_width, wanted_height, container_width, self.deadline);
                if f_img.src != src {
                    // rewritten domain supports CORS, other may not :(
                    remove_attr("crossorigin", &mut attrs);
                    attrs.push(Attribute {
                        name: QualName::new(None, ns!(), local_name!("crossorigin")),
                        value: "anonymous".into(),
                    });
                    remove_attr("src", &mut attrs);
                    attrs.push(Attribute {
                        name: QualName::new(None, ns!(), local_name!("src")),
                        value: (*f_img.src).into(),
                    });
                    remove_attr("srcset", &mut attrs);
                    if let Some(srcset) = f_img.srcset.as_deref() {
                        attrs.push(Attribute {
                            name: QualName::new(None, ns!(), local_name!("srcset")),
                            value: srcset.into(),
                        });
                    }
                }
                if f_img.width != wanted_width {
                    remove_attr("width", &mut attrs);
                    if let Some(width) = f_img.width {
                        attrs.push(Attribute {
                            name: QualName::new(None, ns!(), local_name!("width")),
                            value: format!("{width}").into(),
                        });
                    }
                }
                if f_img.height != wanted_height {
                    remove_attr("height", &mut attrs);
                    if let Some(height) = f_img.height {
                        if height < 800 {
                            attrs.push(Attribute {
                                name: QualName::new(None, ns!(), local_name!("height")),
                                value: format!("{height}").into(),
                            });
                        }
                    }
                }
            },
            _ => {},
        }

        let mut combined_result = FilterResult::nop();

        let lang = new_lang.as_deref().or(lang);
        let mut to_remove = Vec::new();
        for ch in node.children.borrow().iter() {
            let res = self.filter_node(ch, lang, in_code);
            if res.remove_element {
                to_remove.push(Rc::downgrade(ch));
            } else if res.is_code_one_liner {
                combined_result.is_code_one_liner = true;
                if let NodeData::Element { ref name, ref attrs, .. } = &node.data {
                    if &*name.local == "pre" {
                        let mut attrs = attrs.borrow_mut();
                        attrs.push(Attribute {
                            name: QualName::new(None, ns!(), local_name!("class")),
                            value: "one-liner".into(),
                        });
                        combined_result.is_code_one_liner = false; // stop bubbling up
                    }
                }
            }
        }
        for to_remove in to_remove.into_iter().rev() {
            if let Some(node) = to_remove.upgrade() {
                self.dom.remove_from_parent(&node);
            }
        }
        combined_result
    }
}

/// Parses an HTML `width`/`height` attribute value into a pixel count.
///
/// Accepted forms (surrounding whitespace is ignored):
/// * `"N%"` - a percentage of `max_size`, rounded down;
/// * `"Npx"` or `"N"` - an absolute pixel count.
///
/// Returns `None` when the number cannot be parsed as `u32` or when the
/// scaled percentage does not fit in `u32`.
fn parse_html_size(s: &str, max_size: u32) -> Option<u32> {
    let s = s.trim();
    if let Some(percent) = s.strip_suffix('%') {
        let percent: u32 = percent.trim_end().parse().ok()?;
        // Multiply in u64: attribute values are untrusted, and `percent * max_size`
        // can exceed u32 (a panic in debug builds, a wrapped value in release).
        let scaled = u64::from(percent) * u64::from(max_size) / 100;
        if scaled <= u64::from(u32::MAX) {
            // Lossless: the range was checked on the line above.
            Some(scaled as u32)
        } else {
            None
        }
    } else {
        s.strip_suffix("px").map_or(s, |px| px.trim_end()).parse::<u32>().ok()
    }
}

/// Unwraps document / `<html>` / `<body>` wrappers around a node.
///
/// A document node descends into its first child; an `<html>` or `<body>`
/// element descends into its child only when it has exactly one. Any other
/// node, or a wrapper that cannot be descended into, is returned as-is.
fn get_body(node: Handle) -> Handle {
    let wrapped_child = match node.data {
        NodeData::Element { ref name, .. } if matches!(&*name.local, "body" | "html") => {
            let kids = node.children.borrow();
            if kids.len() == 1 {
                kids.first().cloned()
            } else {
                None
            }
        },
        NodeData::Document => node.children.borrow().first().cloned(),
        _ => None,
    };
    // Recurse (rather than loop) so `node` stays alive until the inner call returns.
    match wrapped_child {
        Some(child) => get_body(child),
        None => node,
    }
}

/// Returns the first attribute in `att` whose local name equals `name`, if any.
fn get_attr<'a>(name: &str, att: &'a [Attribute]) -> Option<&'a Attribute> {
    for attribute in att {
        if &*attribute.name.local == name {
            return Some(attribute);
        }
    }
    None
}

/// Removes every attribute in `att` whose local name equals `name`.
fn remove_attr(name: &str, att: &mut Vec<Attribute>) {
    att.retain(|candidate| {
        let is_target = &*candidate.name.local == name;
        !is_target
    });
}

/// Parses `src` as an HTML fragment in the context of a `<div>` element.
///
/// Uses default tokenizer options, scripting disabled and no-quirks mode.
pub fn parse(src: &str) -> RcDom {
    let tree_builder = TreeBuilderOpts {
        exact_errors: false,
        scripting_enabled: false,
        iframe_srcdoc: false,
        drop_doctype: false,
        ignore_missing_rules: true,
        quirks_mode: QuirksMode::NoQuirks,
    };
    let opts = ParseOpts {
        tokenizer: TokenizerOpts::default(),
        tree_builder,
    };
    let context_element = QualName::new(None, ns!(html), local_name!("div"));
    let context_attrs = Vec::new();

    parse_fragment(RcDom::default(), opts, context_element, context_attrs).one(src)
}

pub fn serialize_doc(doc: &RcDom) -> io::Result<String> {
    let children = doc.document.children.borrow();
    assert_eq!(1, children.len());
    let handle: SerializableHandle = children[0].clone().into();
    serialize_node(&handle)
}

/// Serializes the children of `node` to an HTML string.
///
/// # Errors
///
/// Returns any error reported by the serializer, or `InvalidData` if the
/// serialized bytes are not valid UTF-8.
pub fn serialize_node<S: Serialize>(node: &S) -> io::Result<String> {
    let opts = SerializeOpts {
        scripting_enabled: false,
        traversal_scope: TraversalScope::ChildrenOnly(Some(QualName::new(None, ns!(html), local_name!("div")))),
        create_missing_parent: true,
    };

    let mut bytes = Vec::new();
    serialize5(&mut bytes, node, opts)?;

    String::from_utf8(bytes).map_err(|_| io::Error::from(io::ErrorKind::InvalidData))
}

/// Parsing and re-serializing without filtering normalizes the markup
/// (quoted attributes, closed `<p>`) and leaves emoji shortcodes alone.
#[test]
fn roundtrip() {
    let markup = "<h1 lang=pl>hell:cat:<pre>:cat:</pre><noscript>o</noscript><!-- --></h1><p>hi:notmoji:";
    let expected = r#"<h1 lang="pl">hell:cat:<pre>:cat:</pre><noscript>o</noscript><!-- --></h1><p>hi:notmoji:</p>"#;

    let dom = parse(markup);
    let unfiltered = serialize_doc(&dom).unwrap();

    assert_eq!(unfiltered, expected);
}

/// A one-line `<pre><code>` block is highlighted and the `<pre>` is tagged `one-liner`.
#[test]
fn code_highlight() {
    let highlighter = Highlighter::new();
    let deadline = Instant::now() + Duration::from_secs(4);
    let markup = "<pre><code>fn main(){}</code></pre>";
    let expected = r#"<pre class="one-liner"><code><tt class="src-rs"><tt class="m-fn m-fn-rs"><tt class="stor-ty stor-ty-fn">fn</tt> <tt class="ent-n ent-n-fn">main</tt></tt><tt class="m-fn m-fn-rs"><tt class="m-fn m-fn-parms"><tt class="pun-sec pun-sec-parms">(</tt></tt><tt class="m-fn m-fn-parms"><tt class="pun-sec pun-sec-parms">)</tt></tt></tt><tt class="m-fn m-fn-rs"></tt><tt class="m-fn m-fn-rs"><tt class="m-bl m-bl-rs"><tt class="pun-sec pun-sec-bl">{</tt></tt><tt class="m-bl m-bl-rs"><tt class="pun-sec pun-sec-bl">}</tt></tt></tt></tt></code></pre>"#;

    let filtered = DomHighlighter::new(Some(&highlighter), markup, &(), true, deadline).filtered();

    assert_eq!(filtered, expected);
}

/// Known emoji shortcodes are replaced outside `<pre>`, but left untouched
/// inside it; unknown shortcodes are never replaced.
#[test]
fn emoji() {
    let highlighter = Highlighter::new();
    let deadline = Instant::now() + Duration::from_secs(4);
    let markup = r#"<div>:cat:<pre lang="plain">:cat:<code>:cat:</code></pre>:dog::nothotdog:</div>"#;
    let expected = r#"<div>🐱<pre lang="plain" class="one-liner">:cat:<code><tt class="txt-plain">:cat:</tt></code></pre>🐶:nothotdog:</div>"#;

    let filtered = DomHighlighter::new(Some(&highlighter), markup, &(), true, deadline).filtered();

    assert_eq!(filtered, expected);
}