// render_readme 0.11.0
//
// Render Markdown or reStructuredText with syntax highlighting and image filtering, similar to GitHub's rendering.
// Documentation
use once_cell::sync::Lazy;
use regex::Regex;
use syntect::parsing::{ClearAmount, ParseState, ScopeStackOp, SyntaxReference, SyntaxSet, SCOPE_REPO};

/// Wrapper for a collection of Syntect syntaxes
///
/// [`Highlighter::new`] loads the syntax set bundled with the crate. The derived
/// `Default` instead builds the inner `SyntaxSet` through its own `Default` impl,
/// i.e. without reading the bundled dump.
#[derive(Default)]
pub struct Highlighter {
    // All syntax definitions that code blocks can be matched against
    syntax: SyntaxSet,
}

impl Highlighter {
    /// Creates a highlighter using the syntax set embedded at compile time from `../syntax.dump`.
    #[must_use]
    pub fn new() -> Self {
        Self {
            // if the file is missing, run the dump tool
            syntax: syntect::dumps::from_binary(include_bytes!("../syntax.dump")),
        }
    }

    /// Borrows the underlying syntax set.
    fn get_syntax(&self) -> &SyntaxSet {
        &self.syntax
    }

    /// Highlights `code` and returns `(node, warning)`.
    ///
    /// The syntax is chosen in this order:
    /// 1. `language`, if given: first by syntax name (after mapping through `name_for_language`),
    ///    then by token (extension or case-insensitive name),
    /// 2. a guess based on the content of `code`,
    /// 3. plain text.
    ///
    /// `warning` is `Some` when `language` was given but did not match any syntax;
    /// the code is still highlighted using steps 2 and 3.
    /// `rustdoc_extensions` is forwarded to the renderer, which uses it to drop rustdoc's hidden lines.
    pub fn highlight_as_node(&self, dom: &mut RcDom, code: &str, language: Option<&str>, rustdoc_extensions: bool) -> (Handle, Option<String>) {
        let syntaxes = self.get_syntax();
        let mut warning = None;

        let requested = language.and_then(|l| {
            // A code block tag may carry attributes after a comma ("python,ignore").
            // Only the first item names the language, so look up just that part by token,
            // the same way `name_for_language` only looks at the first item.
            let token = l.split(',').next().unwrap_or(l).trim();

            syntaxes
                .find_syntax_by_name(name_for_language(l))
                // same as extension but case insensitive
                .or_else(|| syntaxes.find_syntax_by_token(token))
                .or_else(|| {
                    // NOTE(review): the leading `ΒΆ` looks like a mis-decoded `¶`
                    // (UTF-8 bytes read as ISO-8859-7); left untouched, confirm against the real file.
                    warning = Some(format!("ΒΆ unknown language: {l}"));
                    None
                })
        });

        let lang = requested
            .or_else(|| find_syntax_by_first_line(syntaxes, code.trim_start()))
            .unwrap_or_else(|| syntaxes.find_syntax_plain_text());

        (node_for_string(code, dom, lang, syntaxes, rustdoc_extensions), warning)
    }
}

/// Guesses a syntax for an untagged code snippet from what its text looks like.
///
/// The checks run in a fixed order and the first hit wins:
/// a lone macro name (`foo!`), things that should stay plain text (licenses, URLs, paths,
/// hyphenated words, command output), TOML section headers, batch-file commands,
/// shell commands, a single XML/HTML tag, then syntect's own first-line detection,
/// and finally Rust as the default.
///
/// The regexes are compiled without the multi-line flag, so `^` and `$` refer to the
/// start and end of the whole `code` string, not of individual lines.
///
/// Returns `None` only when the syntax selected by a heuristic is not present in `syntaxes`.
fn find_syntax_by_first_line<'a>(syntaxes: &'a SyntaxSet, code: &str) -> Option<&'a SyntaxReference> {
    static PATH_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^(?:\.\.?|~|\*|\*\*)?/\.?[a-zA-Z]\S+$|^[CD]:[/\\]|\.\*$"#).expect("regex"));
    static TOML_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^\[[a-z][a-z._-]+\]"#).expect("regex"));
    static SH_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^(?:export [A-Z]|RUSTFLAGS|(?:cargo|make|git|cc|gcc|curl|tar|cd|brew|xcode-select|rustup|rustc|sudo|apt-get|yum) [a-z+-]|--[a-z]|\$[A-Z]|\$ |\.\.?/[a-z])"#).expect("regex"));
    static BAT_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^(?:set [A-Z]|VCPKG|(?:vcpkg|pacman) [a-z+-]|--[a-z])"#).expect("regex"));
    static HYPHENATED_WORD: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^[a-z][a-z0-9_]*-[a-zA-Z0-9][a-zA-Z0-9_-]*$"#).expect("regex"));
    static CMD_HELP: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^USAGE:|^README|^test result:|\.\.\. bench:"#).expect("regex"));
    static MACRO: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^[a-z_]+!$"#).expect("regex"));
    static XML_HTML: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^</?[a-z-]+(?: [a-z-]+=[^/>]*)?/?>$"#).expect("regex"));

    if MACRO.is_match(code) {
        // Like every other branch, report a missing syntax as `None` instead of panicking,
        // so the caller can fall back to plain text.
        return syntaxes.find_syntax_by_name("RustMacroName");
    }

    let trimmed_code = code.trim_start();
    let has_plain_text_prefix = ["MIT/Apache", "Copyright", "https:", "http:"]
        .iter()
        .any(|prefix| trimmed_code.starts_with(prefix));
    if has_plain_text_prefix
        || PATH_LIKE.is_match(code)
        || HYPHENATED_WORD.is_match(code)
        || CMD_HELP.is_match(trimmed_code)
    {
        return syntaxes.find_syntax_by_name("Plain Text");
    }

    if TOML_LIKE.is_match(code) {
        return syntaxes.find_syntax_by_name("TOML");
    }
    // Checked before the shell pattern, so input matched by both (e.g. a leading `--flag`)
    // is treated as a batch file.
    if BAT_LIKE.is_match(code) {
        return syntaxes.find_syntax_by_name("Batch File");
    }
    if SH_LIKE.is_match(code) {
        return syntaxes.find_syntax_by_extension("sh");
    }
    if XML_HTML.is_match(code) {
        return syntaxes.find_syntax_by_extension("html");
    }

    // Nothing recognisable: let syntect look at the first line (shebangs etc.), else assume Rust
    syntaxes
        .find_syntax_by_first_line(code)
        .or_else(|| syntaxes.find_syntax_by_name("Rust"))
}

/// Maps whatever name happens to be in code block tag to one of supported syntaxes
///
/// Only the first comma-separated item of `l` is considered (so `rust,ignore` is `rust`).
/// It is trimmed, stripped of leading `$` characters and compared case-insensitively
/// against the known aliases.
///
/// Returns the syntax name for a known alias. For anything else it returns the cleaned-up
/// first item of `l` (original case preserved), so that a tag such as `Python,no_run`
/// can still be looked up by its syntax name.
fn name_for_language(l: &str) -> &str {
    // `split` always yields at least one item; the fallback only avoids an `unwrap`
    let tag = l.split(',').next().unwrap_or(l).trim().trim_start_matches('$');

    match tag.to_ascii_lowercase().as_str() {
        "rust" | "{.rust}" | "notest" | "no_run" | "norun" | "should_panic" | "compile_fail" => "Rust",
        "c" | "c99" => "C",
        "csharp" => "C#",
        "objective_c" | "objc" => "Objective-C",
        "toml" | "ini" | "m3u" | "cargo" | "cargo.toml" | "systemd" => "TOML",
        "html" => "HTML",
        "diff" => "Diff",
        "json" => "JSON",
        "js" | "json5" => "JavaScript",
        "nginx" | "typescript" => "JavaScript",
        "glsl" => "GLSL",
        // Lookup by name is an exact match, so this has to be the syntax's real name
        "golang" => "Go",
        "xslt" => "XML",
        "hbs" | "handlebars" | "twig" | "liquid" | "jinja" | "tpl" | "tmpl" => "Jinja2",
        "markdown" | "md" => "Markdown",
        "ignore" | "text" | "plain" | "plaintext" | "notrust" | "none" | "bob" => "Plain Text",
        "tex" | "latex" | "math" => "LaTeX",
        "sh" | "bash" | "shell" | "gnuplot" | "pam" => "Bourne Again Shell (bash)",
        "elisp" | "emacs-lisp" => "Lisp",
        "batchfile" | "batch" | "bat" | "dos" => "Batch File",
        "shell-session" | "shellsession" => "Batch File",
        "dockerfile" | "docker" => "Dockerfile",
        "powershell" => "PowerShell",
        "console" | "terminal" => "Bourne Again Shell (bash)",
        "wasm" | "webassembly" => "WAST (WebAssembly Text)",
        "yaml" => "YAML",
        "f#" => "F#",
        // Unknown alias: hand back the tag itself, without attributes or surrounding noise
        _ => tag,
    }
}

use html5ever::interface::QualName;
use html5ever::tree_builder::{create_element, Attribute, NodeOrText, TreeSink};
use markup5ever_rcdom::{Handle, RcDom};
use std::rc::Rc;

/// Test helper: highlights `s` with rustdoc extensions enabled and serializes the resulting node to HTML.
#[cfg(test)]
fn html_for_string(s: &str, syntax: &SyntaxReference, syntaxes: &SyntaxSet) -> String {
    use markup5ever_rcdom::SerializableHandle;

    let mut scratch_dom = RcDom::default();
    let highlighted = node_for_string(s, &mut scratch_dom, syntax, syntaxes, true);
    let serializable = SerializableHandle::from(highlighted);
    crate::parser::serialize_node(&serializable).unwrap()
}

fn node_for_string(s: &str, dom: &mut RcDom, syntax: &SyntaxReference, syntaxes: &SyntaxSet, rustdoc_extensions: bool) -> Handle {
    let line_comment_scope = SCOPE_REPO.lock().unwrap().build("comment.line").unwrap();
    let mut parser = ParseState::new(syntax);

    let mut open_scopes: Vec<Option<String>> = Vec::with_capacity(12);

    let root_element = create_element(dom, QualName::new(None, ns!(html), local_name!("tt")), vec![]);
    let mut current_node = Rc::clone(&root_element);

    let is_rust_syntax = syntax.name == "Rust";
    let mut iter = s.split('\n').peekable();
    let mut line_comment_opened = None;
    while let Some(line) = iter.next() {
        // Rustdoc hides these lines
        if is_rust_syntax && rustdoc_extensions && line.starts_with("# ") {
            continue;
        }

        // It seems that commnent.line has special behavior of implicit reset at the end of line
        if let Some(prev_len) = line_comment_opened {
            while open_scopes.len() > prev_len {
                if let Some(Some(_)) = open_scopes.pop() {
                    let cell_tmp = current_node.parent.take(); // `Cell` is silly
                    let parent = cell_tmp.as_ref().expect("scope must have parent").upgrade().expect("scope's node should exist");
                    current_node.parent.set(cell_tmp);
                    current_node = parent;
                }
            }
            line_comment_opened = None;
        }

        let mut prev_off = 0;
        let parsed = match parser.parse_line(line, syntaxes) {
            Ok(p) => p,
            Err(_) => continue,
        };
        for (off, state) in parsed {
            if off > prev_off {
                dom.append(&current_node, NodeOrText::AppendText(line[prev_off..off].into()));
                prev_off = off;
            }
            match state {
                ScopeStackOp::Push(scope) => {
                    if line_comment_scope.is_prefix_of(scope) {
                        line_comment_opened = Some(open_scopes.len());
                    }
                    let name = scope.build_string();
                    let mut short_name2 = String::with_capacity(name.len());
                    let mut short_name3 = String::with_capacity(name.len());
                    let mut p = name.split('.').enumerate().take(3).peekable();
                    while let Some((n, part)) = p.next() {
                        let part = abbreviate_scope_name(part);
                        short_name3.push_str(part);
                        if n < 2 {
                            short_name2.push_str(part);
                        }
                        if p.peek().is_some() {
                            short_name3.push('-');
                            if n < 1 {
                                short_name2.push('-');
                            }
                        }
                    }

                    let dupe = open_scopes.iter().filter_map(|s| s.as_ref()).any(|s| *s == short_name3);
                    let useless = dupe || short_name2.as_str() == "m-grp";
                    if !useless {
                        let mut classes = String::with_capacity(short_name2.len() + short_name3.len() + 1);
                        classes.push_str(&short_name2);
                        if short_name2 != short_name3 {
                            classes.push(' ');
                            classes.push_str(&short_name3);
                        }
                        let new_node = create_element(dom, QualName::new(None, ns!(html), local_name!("tt")),vec![
                            Attribute {
                                name: QualName::new(None, ns!(), local_name!("class")),
                                value: classes.into()
                            }
                        ]);
                        dom.append(&current_node, NodeOrText::AppendNode(Rc::clone(&new_node)));
                        current_node = new_node;
                        open_scopes.push(Some(short_name3));
                    } else {
                        open_scopes.push(None);
                    }
                },
                ScopeStackOp::Pop(n) |
                ScopeStackOp::Clear(ClearAmount::TopN(n)) => {
                    for _ in 0..n {
                        if let Some(Some(_)) = open_scopes.pop() {
                            let cell_tmp = current_node.parent.take(); // `Cell` is silly
                            let parent = cell_tmp.as_ref().expect("scope must have parent").upgrade().expect("scope's node should exist");
                            current_node.parent.set(cell_tmp);
                            current_node = parent;
                        }
                    }
                },
                ScopeStackOp::Restore | ScopeStackOp::Clear(ClearAmount::All) => {
                    while let Some(Some(_)) = open_scopes.pop() {
                        let cell_tmp = current_node.parent.take(); // `Cell` is silly
                        let parent = cell_tmp.as_ref().expect("scope must have parent").upgrade().expect("scope's node should exist");
                        current_node.parent.set(cell_tmp);
                        current_node = parent;
                    }
                },
                ScopeStackOp::Noop => {},
            }
        }
        dom.append(&current_node, NodeOrText::AppendText(line[prev_off..].into()));
        if iter.peek().is_some() {
            dom.append(&current_node, NodeOrText::AppendText("\n".into()));
        }
    }
    root_element
}

/// Shortens one segment of a scope name for use in a CSS class.
///
/// Segments listed in the table get a fixed abbreviation (some map to themselves so that
/// they are not truncated). Any other segment is cut to at most its first 3 bytes; the cut
/// is moved back to a character boundary if needed, so this never panics on non-ASCII input.
fn abbreviate_scope_name(part: &str) -> &str {
    match part {
        "arguments" => "args",
        "arithmetic" => "arith",
        "assignment" => "asgn",
        "attribute-name" => "attr-n",
        "attribute-with-value" => "attr-val",
        "bash" => "sh",
        "block" => "bl",
        "boolean" => "bool",
        "cast" => "cast",
        "character" => "chr",
        "class" => "cls",
        "closure" => "closure",
        "comma" => "comma",
        "command" => "cmd",
        "comparison" => "comp",
        "compound" => "comp",
        "constant" => "const",
        "constructor" => "ctor",
        "continuation" => "cont",
        "control" => "ctrl",
        "declaration" => "decl",
        "deprecated" => "depr",
        "destructor" => "dtor",
        "dictionary" => "dict",
        "double" => "dbl",
        "double-brace" => "dbl-bl",
        "double-slash" => "dbl-sl",
        "enum" => "enum",
        "exit" => "exit",
        "expression" => "expr",
        "float" => "flt",
        "flow" => "flow",
        "function" => "fn",
        "function-call" => "fn-call",
        "generic-name" => "gen-n",
        "group" => "grp",
        "help" => "hlp",
        "html" => "html",
        "impl" => "impl",
        "instance" => "inst",
        "json" => "json",
        "key-value" => "kv",
        "keyword" => "k",
        "label" => "lbl",
        "language" => "lang",
        "lifetime" => "lf",
        "line" => "ln",
        "logical" => "logic",
        "markup" => "mk",
        "member" => "memb",
        "meta" => "m",
        "name" => "n",
        "namespace" => "ns",
        "node" => "node",
        "number-sign" => "num-sign",
        "operator" => "op",
        "other" => "ot",
        "parameter" => "parm",
        "parameters" => "parms",
        "path" => "path",
        "placeholder" => "phold",
        "plain" => "plain",
        "post-cmd" => "post-cmd",
        "preprocessor" => "prep",
        "property" => "prop",
        "python" => "py",
        "qualified-name" => "qn",
        "quoted" => "q",
        "readwrite" => "rw",
        "regexp" => "regexp",
        "return-type" => "ret-ty",
        "rust" => "rs",
        "shell" => "sh",
        "single" => "sgl",
        "source" => "src",
        "statement" => "stmt",
        "storage" => "stor",
        "struct" => "struct",
        "structure" => "struct",
        "table" => "tbl",
        "template" => "tpl",
        "template-expression" => "tpl-expr",
        "terminator" => "term",
        "text" => "txt",
        "this" => "this",
        "toml" => "toml",
        "trait" => "tr",
        "type" => "ty",
        "yaml" => "yaml",
        x => {
            // Slicing at a fixed byte offset panics in the middle of a multi-byte character,
            // so back off to the nearest boundary (offset 0 always is one).
            let mut end = x.len().min(3);
            while !x.is_char_boundary(end) {
                end -= 1;
            }
            &x[..end]
        },
    }
}

/// A short Rust expression is rendered as nested `<tt>` elements with abbreviated scope classes.
#[test]
fn parse_test() {
    let syntax_set: SyntaxSet = syntect::dumps::from_binary(include_bytes!("../syntax.dump"));

    let rust = syntax_set.find_syntax_by_name("Rust").expect("wat");
    assert_eq!("Rust", &rust.name);

    let expected = "<tt class=\"src-rs\">wat<tt class=\"pun-acc pun-acc-dot\">.</tt><tt class=\"sup-fn sup-fn-rs\">i</tt><tt class=\"pun-sec pun-sec-grp\">(</tt><tt class=\"pun-sec pun-sec-grp\">)</tt></tt>";
    let rendered = html_for_string("wat.i()", rust, &syntax_set);
    assert_eq!(expected, &rendered);
}

/// A shell line comment must end at the end of its line and not swallow the commands after it.
#[test]
fn comment_bugs() {
    let syntax_set: SyntaxSet = syntect::dumps::from_binary(include_bytes!("../syntax.dump"));
    let shell = syntax_set.find_syntax_by_extension("sh").expect("sh");

    let script = "# Install Rust
git clone https://github.com/test/test.git
cd test/
cargo build


";
    let expected = "<tt class=\"src-sh src-sh-sh\"><tt class=\"com-ln com-ln-num-sign\"><tt class=\"pun-def pun-def-com\">#</tt></tt><tt class=\"com-ln com-ln-num-sign\"> Install Rust\n</tt>git clone https://github.com/test/test.git\ncd test/\ncargo build\n\n\n</tt>";

    let html = html_for_string(script, shell, &syntax_set);
    assert_eq!(expected, &html);
}

/// The bundled dump must contain the syntaxes that the language mapping and heuristics rely on.
#[test]
fn syntaxes() {
    let highlighter = Highlighter::new();
    let set = highlighter.get_syntax();

    // Lookups by exact syntax name
    for name in ["TOML", "Lisp", "Jinja2", "Batch File"] {
        let found = set.find_syntax_by_name(name).expect(name);
        assert_eq!(name, &found.name);
    }

    // Lookups by file extension
    for ext in ["sh", "html", "rs", "json", "js", "py", "jinja", "bat", "lisp", "po"] {
        assert!(set.find_syntax_by_extension(ext).is_some(), "no syntax for extension {ext}");
    }
}