use once_cell::sync::Lazy;
use regex::Regex;
use syntect::parsing::{ClearAmount, ParseState, ScopeStackOp, SyntaxReference, SyntaxSet, SCOPE_REPO};
/// Syntax highlighter that turns source code into DOM nodes.
///
/// Note that the derived `Default` builds the value from `SyntaxSet::default()`,
/// whereas `Highlighter::new()` loads the syntax dump bundled with the crate.
#[derive(Default)]
pub struct Highlighter {
/// Syntax definitions used for every language lookup and for parsing.
syntax: SyntaxSet,
}
impl Highlighter {
    /// Creates a highlighter backed by the syntax set embedded at compile time
    /// from `../syntax.dump`.
    #[must_use]
    pub fn new() -> Self {
        Self {
            syntax: syntect::dumps::from_binary(include_bytes!("../syntax.dump")),
        }
    }

    /// Borrows the syntax set used by this highlighter.
    fn get_syntax(&self) -> &SyntaxSet {
        &self.syntax
    }

    /// Highlights `code` and returns the root node of the generated markup.
    ///
    /// The syntax is resolved in this order:
    /// 1. if `language` is given: by the name `name_for_language` maps it to,
    ///    then by treating the raw tag as a syntect token;
    /// 2. by guessing from the (left-trimmed) code itself;
    /// 3. plain text.
    ///
    /// The second tuple element is a warning message, set only when `language`
    /// was given but matched no known syntax. Highlighting still proceeds via
    /// steps 2 and 3 in that case.
    ///
    /// `rustdoc_extensions` is forwarded to the node builder unchanged.
    pub fn highlight_as_node(&self, dom: &mut RcDom, code: &str, language: Option<&str>, rustdoc_extensions: bool) -> (Handle, Option<String>) {
        let syntaxes = self.get_syntax();
        let mut warning = None;
        let lang = language
            .and_then(|l| {
                syntaxes
                    .find_syntax_by_name(name_for_language(l))
                    .or_else(|| syntaxes.find_syntax_by_token(l))
                    .or_else(|| {
                        // Record the problem but keep going: returning `None` lets the
                        // content-based detection below still pick a syntax.
                        warning = Some(format!("¶ unknown language: {l}"));
                        None
                    })
            })
            .or_else(|| find_syntax_by_first_line(syntaxes, code.trim_start()))
            .unwrap_or_else(|| syntaxes.find_syntax_plain_text());
        (node_for_string(code, dom, lang, syntaxes, rustdoc_extensions), warning)
    }
}
/// Guesses the syntax of an unlabelled code block from its content.
///
/// The checks run in a fixed order and the first match wins:
/// a lone macro name (`foo!`), plain-text-looking snippets (licences, URLs,
/// paths, hyphenated words, command output), TOML section headers, batch-like
/// commands, shell-like commands, and a single XML/HTML tag. If none match,
/// syntect's own first-line detection is tried, and finally "Rust".
///
/// Returns `None` only when the chosen syntax is missing from `syntaxes`.
fn find_syntax_by_first_line<'a>(syntaxes: &'a SyntaxSet, code: &str) -> Option<&'a SyntaxReference> {
    static PATH_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^(?:\.\.?|~|\*|\*\*)?/\.?[a-zA-Z]\S+$|^[CD]:[/\\]|\.\*$"#).expect("regex"));
    static TOML_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^\[[a-z][a-z._-]+\]"#).expect("regex"));
    static SH_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^(?:export [A-Z]|RUSTFLAGS|(?:cargo|make|git|cc|gcc|curl|tar|cd|brew|xcode-select|rustup|rustc|sudo|apt-get|yum) [a-z+-]|--[a-z]|\$[A-Z]|\$ |\.\.?/[a-z])"#).expect("regex"));
    static BAT_LIKE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^(?:set [A-Z]|VCPKG|(?:vcpkg|pacman) [a-z+-]|--[a-z])"#).expect("regex"));
    static HYPHENATED_WORD: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^[a-z][a-z0-9_]*-[a-zA-Z0-9][a-zA-Z0-9_-]*$"#).expect("regex"));
    static CMD_HELP: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^USAGE:|^README|^test result:|\.\.\. bench:"#).expect("regex"));
    static MACRO: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^[a-z_]+!$"#).expect("regex"));
    static XML_HTML: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^</?[a-z-]+(?: [a-z-]+=[^/>]*)?/?>$"#).expect("regex"));

    if MACRO.is_match(code) {
        // A missing "RustMacroName" syntax must not panic: a macro name is Rust
        // code, so fall back to the regular Rust syntax instead.
        return syntaxes
            .find_syntax_by_name("RustMacroName")
            .or_else(|| syntaxes.find_syntax_by_name("Rust"));
    }

    let trimmed_code = code.trim_start();
    let looks_like_plain_text = trimmed_code.starts_with("MIT/Apache")
        || trimmed_code.starts_with("Copyright")
        || trimmed_code.starts_with("https:")
        || trimmed_code.starts_with("http:")
        || PATH_LIKE.is_match(code)
        || HYPHENATED_WORD.is_match(code)
        || CMD_HELP.is_match(trimmed_code);
    if looks_like_plain_text {
        return syntaxes.find_syntax_by_name("Plain Text");
    }
    if TOML_LIKE.is_match(code) {
        return syntaxes.find_syntax_by_name("TOML");
    }
    // NOTE(review): BAT_LIKE is tested before SH_LIKE and both contain `--[a-z]`,
    // so a snippet starting with `--flag` is always classified as a batch file.
    if BAT_LIKE.is_match(code) {
        return syntaxes.find_syntax_by_name("Batch File");
    }
    if SH_LIKE.is_match(code) {
        return syntaxes.find_syntax_by_extension("sh");
    }
    if XML_HTML.is_match(code) {
        return syntaxes.find_syntax_by_extension("html");
    }
    syntaxes
        .find_syntax_by_first_line(code)
        .or_else(|| syntaxes.find_syntax_by_name("Rust"))
}
/// Maps a code-fence language tag to the name of a syntax definition.
///
/// Only the part before the first comma is considered (so `rust,no_run` is
/// `rust`). It is trimmed, stripped of leading `$` characters and compared
/// case-insensitively (ASCII) against a table of known aliases.
///
/// Tags that are not in the table are returned in that same normalised form
/// (first comma-separated part, trimmed, without leading `$`) but with their
/// original letter case, so that trailing attributes such as `,ignore` do not
/// end up in the syntax name being looked up.
fn name_for_language(l: &str) -> &str {
    let tag = l.split(',').next().unwrap_or(l).trim().trim_start_matches('$');
    match tag.to_ascii_lowercase().as_str() {
        "rust" | "{.rust}" | "notest" | "no_run" | "norun" | "should_panic" | "compile_fail" => "Rust",
        "c" | "c99" => "C",
        "csharp" => "C#",
        "objective_c" | "objc" => "Objective-C",
        "toml" | "ini" | "m3u" | "cargo" | "cargo.toml" | "systemd" => "TOML",
        "html" => "HTML",
        "diff" => "Diff",
        "json" => "JSON",
        "js" | "json5" => "JavaScript",
        "nginx" | "typescript" => "JavaScript",
        "glsl" => "GLSL",
        "golang" => "go",
        "xslt" => "XML",
        "hbs" | "handlebars" | "twig" | "liquid" | "jinja" | "tpl" | "tmpl" => "Jinja2",
        "markdown" | "md" => "Markdown",
        "ignore" | "text" | "plain" | "plaintext" | "notrust" | "none" | "bob" => "Plain Text",
        "tex" | "latex" | "math" => "LaTeX",
        "sh" | "bash" | "shell" | "gnuplot" | "pam" => "Bourne Again Shell (bash)",
        "elisp" | "emacs-lisp" => "Lisp",
        "batchfile" | "batch" | "bat" | "dos" => "Batch File",
        "shell-session" | "shellsession" => "Batch File",
        "dockerfile" | "docker" => "Dockerfile",
        "powershell" => "PowerShell",
        "console" | "terminal" => "Bourne Again Shell (bash)",
        "wasm" | "webassembly" => "WAST (WebAssembly Text)",
        "yaml" => "YAML",
        "f#" => "F#",
        // Unknown alias: hand back the cleaned-up tag rather than the raw input.
        _ => tag,
    }
}
use html5ever::interface::QualName;
use html5ever::tree_builder::{create_element, Attribute, NodeOrText, TreeSink};
use markup5ever_rcdom::{Handle, RcDom};
use std::rc::Rc;
/// Test-only helper: highlights `s` with rustdoc extensions enabled and
/// returns the resulting markup serialized to a string.
///
/// Panics if serialization fails.
#[cfg(test)]
fn html_for_string(s: &str, syntax: &SyntaxReference, syntaxes: &SyntaxSet) -> String {
    use markup5ever_rcdom::SerializableHandle;

    let mut document = RcDom::default();
    let highlighted = node_for_string(s, &mut document, syntax, syntaxes, true);
    let serializable = SerializableHandle::from(highlighted);
    crate::parser::serialize_node(&serializable).unwrap()
}
/// Returns the parent of `node`.
///
/// The parent link is taken out of its cell, upgraded to a strong handle and
/// then put back, so the tree itself is left untouched.
///
/// # Panics
/// Panics if `node` has no parent or the parent has already been dropped.
fn parent_element(node: &Handle) -> Handle {
    let weak_parent = node.parent.take();
    let parent = weak_parent
        .as_ref()
        .expect("scope must have parent")
        .upgrade()
        .expect("scope's node should exist");
    node.parent.set(weak_parent);
    parent
}

/// Highlights `s` with `syntax` and returns the root `<tt>` element.
///
/// Every scope pushed by the parser becomes a nested `<tt class="…">` element
/// whose classes are abbreviated forms of the first two and first three
/// components of the scope name. Scopes whose three-part class is already open,
/// and scopes abbreviating to `m-grp`, produce no element.
///
/// When `rustdoc_extensions` is set and the syntax is named "Rust", lines
/// starting with `# ` are omitted from the output entirely.
///
/// A line the parser fails on is emitted as unhighlighted text.
///
/// # Panics
/// Panics if the scope repository lock is poisoned, if the `comment.line`
/// scope cannot be built, or if the element tree is inconsistent with the
/// scope bookkeeping (see `parent_element`).
fn node_for_string(s: &str, dom: &mut RcDom, syntax: &SyntaxReference, syntaxes: &SyntaxSet, rustdoc_extensions: bool) -> Handle {
    let line_comment_scope = SCOPE_REPO.lock().unwrap().build("comment.line").unwrap();
    let mut parser = ParseState::new(syntax);
    // One entry per scope pushed by the parser: `Some(class)` if an element was
    // created for it, `None` if it was suppressed. Keeping the `None`s makes
    // pop counts from the parser line up with this stack.
    let mut open_scopes: Vec<Option<String>> = Vec::with_capacity(12);
    let root_element = create_element(dom, QualName::new(None, ns!(html), local_name!("tt")), vec![]);
    let mut current_node = Rc::clone(&root_element);
    let is_rust_syntax = syntax.name == "Rust";
    let mut iter = s.split('\n').peekable();
    // Depth of `open_scopes` at the moment a line comment was opened on the
    // previous line, if any.
    let mut line_comment_opened = None;
    while let Some(line) = iter.next() {
        if is_rust_syntax && rustdoc_extensions && line.starts_with("# ") {
            continue;
        }
        // Close a line comment (and anything opened inside it) left over from
        // the previous line. Presumably needed because lines are fed to the
        // parser without their trailing newline — TODO confirm.
        if let Some(prev_len) = line_comment_opened {
            while open_scopes.len() > prev_len {
                if let Some(Some(_)) = open_scopes.pop() {
                    current_node = parent_element(&current_node);
                }
            }
            line_comment_opened = None;
        }
        let mut prev_off = 0;
        // On a parse error keep the text: with no scope operations the whole
        // line is appended below as plain text instead of being dropped.
        let parsed = parser.parse_line(line, syntaxes).unwrap_or_default();
        for (off, state) in parsed {
            // Flush the text between the previous operation and this one.
            if off > prev_off {
                dom.append(&current_node, NodeOrText::AppendText(line[prev_off..off].into()));
                prev_off = off;
            }
            match state {
                ScopeStackOp::Push(scope) => {
                    if line_comment_scope.is_prefix_of(scope) {
                        line_comment_opened = Some(open_scopes.len());
                    }
                    // Build two class names from the abbreviated scope parts:
                    // one from the first two parts, one from the first three.
                    let name = scope.build_string();
                    let mut short_name2 = String::with_capacity(name.len());
                    let mut short_name3 = String::with_capacity(name.len());
                    let mut p = name.split('.').enumerate().take(3).peekable();
                    while let Some((n, part)) = p.next() {
                        let part = abbreviate_scope_name(part);
                        short_name3.push_str(part);
                        if n < 2 {
                            short_name2.push_str(part);
                        }
                        if p.peek().is_some() {
                            short_name3.push('-');
                            if n < 1 {
                                short_name2.push('-');
                            }
                        }
                    }
                    let dupe = open_scopes.iter().filter_map(|s| s.as_ref()).any(|s| *s == short_name3);
                    let useless = dupe || short_name2.as_str() == "m-grp";
                    if !useless {
                        let mut classes = String::with_capacity(short_name2.len() + short_name3.len() + 1);
                        classes.push_str(&short_name2);
                        if short_name2 != short_name3 {
                            classes.push(' ');
                            classes.push_str(&short_name3);
                        }
                        let new_node = create_element(dom, QualName::new(None, ns!(html), local_name!("tt")), vec![
                            Attribute {
                                name: QualName::new(None, ns!(), local_name!("class")),
                                value: classes.into(),
                            },
                        ]);
                        dom.append(&current_node, NodeOrText::AppendNode(Rc::clone(&new_node)));
                        current_node = new_node;
                        open_scopes.push(Some(short_name3));
                    } else {
                        open_scopes.push(None);
                    }
                },
                ScopeStackOp::Pop(n) |
                ScopeStackOp::Clear(ClearAmount::TopN(n)) => {
                    for _ in 0..n {
                        if let Some(Some(_)) = open_scopes.pop() {
                            current_node = parent_element(&current_node);
                        }
                    }
                },
                // NOTE(review): `Restore` is handled the same way as `Clear(All)`,
                // and this loop stops at the first suppressed (`None`) entry, so
                // scopes below it stay open — confirm this is intended.
                ScopeStackOp::Restore | ScopeStackOp::Clear(ClearAmount::All) => {
                    while let Some(Some(_)) = open_scopes.pop() {
                        current_node = parent_element(&current_node);
                    }
                },
                ScopeStackOp::Noop => {},
            }
        }
        // Remainder of the line after the last scope operation.
        dom.append(&current_node, NodeOrText::AppendText(line[prev_off..].into()));
        // Re-insert the separator consumed by `split`, except after the last line.
        if iter.peek().is_some() {
            dom.append(&current_node, NodeOrText::AppendText("\n".into()));
        }
    }
    root_element
}
/// Shortens one dot-separated component of a scope name for use in a CSS class.
///
/// Well-known components map to fixed abbreviations (some of them to
/// themselves, which keeps names longer than three bytes intact). Any other
/// component is cut to at most its first three bytes; the cut is moved back to
/// the nearest character boundary so that non-ASCII input cannot cause a panic.
fn abbreviate_scope_name(part: &str) -> &str {
    match part {
        "arguments" => "args",
        "arithmetic" => "arith",
        "assignment" => "asgn",
        "attribute-name" => "attr-n",
        "attribute-with-value" => "attr-val",
        "bash" => "sh",
        "block" => "bl",
        "boolean" => "bool",
        "cast" => "cast",
        "character" => "chr",
        "class" => "cls",
        "closure" => "closure",
        "comma" => "comma",
        "command" => "cmd",
        "comparison" => "comp",
        "compound" => "comp",
        "constant" => "const",
        "constructor" => "ctor",
        "continuation" => "cont",
        "control" => "ctrl",
        "declaration" => "decl",
        "deprecated" => "depr",
        "destructor" => "dtor",
        "dictionary" => "dict",
        "double" => "dbl",
        "double-brace" => "dbl-bl",
        "double-slash" => "dbl-sl",
        "enum" => "enum",
        "exit" => "exit",
        "expression" => "expr",
        "float" => "flt",
        "flow" => "flow",
        "function" => "fn",
        "function-call" => "fn-call",
        "generic-name" => "gen-n",
        "group" => "grp",
        "help" => "hlp",
        "html" => "html",
        "impl" => "impl",
        "instance" => "inst",
        "json" => "json",
        "key-value" => "kv",
        "keyword" => "k",
        "label" => "lbl",
        "language" => "lang",
        "lifetime" => "lf",
        "line" => "ln",
        "logical" => "logic",
        "markup" => "mk",
        "member" => "memb",
        "meta" => "m",
        "name" => "n",
        "namespace" => "ns",
        "node" => "node",
        "number-sign" => "num-sign",
        "operator" => "op",
        "other" => "ot",
        "parameter" => "parm",
        "parameters" => "parms",
        "path" => "path",
        "placeholder" => "phold",
        "plain" => "plain",
        "post-cmd" => "post-cmd",
        "preprocessor" => "prep",
        "property" => "prop",
        "python" => "py",
        "qualified-name" => "qn",
        "quoted" => "q",
        "readwrite" => "rw",
        "regexp" => "regexp",
        "return-type" => "ret-ty",
        "rust" => "rs",
        "shell" => "sh",
        "single" => "sgl",
        "source" => "src",
        "statement" => "stmt",
        "storage" => "stor",
        "struct" => "struct",
        "structure" => "struct",
        "table" => "tbl",
        "template" => "tpl",
        "template-expression" => "tpl-expr",
        "terminator" => "term",
        "text" => "txt",
        "this" => "this",
        "toml" => "toml",
        "trait" => "tr",
        "type" => "ty",
        "yaml" => "yaml",
        x => {
            // Byte index 3 may fall inside a multi-byte character; slicing there
            // would panic, so step back to a boundary (index 0 always is one).
            let mut end = x.len().min(3);
            while !x.is_char_boundary(end) {
                end -= 1;
            }
            &x[..end]
        }
    }
}
/// Checks the markup generated for a short Rust expression.
#[test]
fn parse_test() {
    let set: SyntaxSet = syntect::dumps::from_binary(include_bytes!("../syntax.dump"));
    let rust = set.find_syntax_by_name("Rust").expect("wat");
    assert_eq!("Rust", &rust.name);

    let expected = "<tt class=\"src-rs\">wat<tt class=\"pun-acc pun-acc-dot\">.</tt><tt class=\"sup-fn sup-fn-rs\">i</tt><tt class=\"pun-sec pun-sec-grp\">(</tt><tt class=\"pun-sec pun-sec-grp\">)</tt></tt>";
    let rendered = html_for_string("wat.i()", rust, &set);
    assert_eq!(expected, &rendered);
}
// Regression test for line-comment handling: a shell snippet whose first line
// is a `#` comment is highlighted, and the expected markup requires the comment
// element to be closed at the end of that line so the following commands are
// rendered outside of it.
#[test]
fn comment_bugs() {
let s: SyntaxSet = syntect::dumps::from_binary(include_bytes!("../syntax.dump"));
let syn = s.find_syntax_by_extension("sh").expect("sh");
// The literal below spans several lines; its continuation lines are part of
// the input and must stay unindented.
let html = html_for_string("# Install Rust
git clone https://github.com/test/test.git
cd test/
cargo build
", syn, &s);
assert_eq!("<tt class=\"src-sh src-sh-sh\"><tt class=\"com-ln com-ln-num-sign\"><tt class=\"pun-def pun-def-com\">#</tt></tt><tt class=\"com-ln com-ln-num-sign\"> Install Rust\n</tt>git clone https://github.com/test/test.git\ncd test/\ncargo build\n\n\n</tt>", &html);
}
/// Verifies that the bundled syntax dump contains the syntaxes this module
/// looks up by name and by file extension.
#[test]
fn syntaxes() {
    let highlighter = Highlighter::new();
    let set = highlighter.get_syntax();

    // Lookups by exact syntax name.
    for name in ["TOML", "Lisp", "Jinja2", "Batch File"] {
        let found = set.find_syntax_by_name(name).expect(name);
        assert_eq!(name, &found.name);
    }

    // Lookups by file extension.
    for ext in ["sh", "html", "rs", "json", "js", "py", "jinja", "bat", "lisp", "po"] {
        assert!(set.find_syntax_by_extension(ext).is_some());
    }
}