use std::collections::{HashMap, HashSet};
use std::path::Path;
use serde::{Deserialize, Serialize};
use crate::error::{Error, ZaloResult};
use crate::grammars::{
BASE_GLOBAL_RULE_REF, CompiledGrammar, GlobalRuleRef, GrammarId, InjectionPrecedence, Match,
NO_OP_GLOBAL_RULE_REF, ROOT_RULE_ID, RawGrammar, Rule,
};
use crate::highlight::{HighlightedText, Highlighter, MergingOptions};
use crate::scope::Scope;
#[cfg(feature = "dump")]
use crate::scope::ScopeRepository;
use crate::themes::{CompiledTheme, RawTheme, ThemeVariant};
use crate::tokenizer::{Token, Tokenizer};
/// Serialized bundle written by `Registry::dump_to_file`: the registry plus
/// the global scope repository it references, so that interned scope ids
/// resolve identically when the dump is loaded back.
#[cfg(feature = "dump")]
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
struct Dump {
    registry: Registry,
    // Snapshot of the process-global scope repository; restored on load
    // via `replace_global_scope_repo`.
    scope_repo: ScopeRepository,
}
/// Pre-built registry dump embedded at compile time. The bytes are
/// gzip-compressed MessagePack, the same format `dump_to_file` writes.
#[cfg(feature = "dump")]
const BUILTIN_DATA: &[u8] = include_bytes!("../builtin.msgpack");
/// Name (and scope name) of the rule-less grammar registered by
/// `Registry::add_plain_grammar`; also the fallback language name.
pub const PLAIN_GRAMMAR_NAME: &str = "plain";
/// Configuration for a single `Registry::highlight` call.
///
/// Build with `HighlightOptions::new`, then adjust via the chainable
/// setters. Language and theme names are stored lowercased.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct HighlightOptions {
    // Grammar name (lowercased) to highlight with.
    pub(crate) lang: String,
    // One theme, or a light/dark pair (names lowercased).
    pub(crate) theme: ThemeVariant<String>,
    // Merge whitespace-only tokens (see `MergingOptions`). Defaults to true.
    pub(crate) merge_whitespaces: bool,
    // Merge adjacent tokens whose resolved styles are identical.
    pub(crate) merge_same_style_tokens: bool,
    // Fall back to the plain grammar when `lang` is not registered.
    pub(crate) fallback_to_plain: bool,
}
impl HighlightOptions {
    /// Creates options for highlighting `lang` with `theme`.
    ///
    /// Both names are normalized to lowercase. Same-style token merging
    /// defaults to on only for a single theme: with a dual theme, tokens
    /// that look identical under one theme may differ under the other.
    pub fn new(lang: impl AsRef<str>, theme: ThemeVariant<&str>) -> Self {
        let single = matches!(theme, ThemeVariant::Single(_));
        let lowered = match theme {
            ThemeVariant::Single(name) => ThemeVariant::Single(name.to_lowercase()),
            ThemeVariant::Dual { light, dark } => ThemeVariant::Dual {
                light: light.to_lowercase(),
                dark: dark.to_lowercase(),
            },
        };
        Self {
            lang: lang.as_ref().to_lowercase(),
            theme: lowered,
            merge_same_style_tokens: single,
            merge_whitespaces: true,
            fallback_to_plain: false,
        }
    }

    /// Sets whether whitespace-only tokens are merged into neighbors.
    pub fn merge_whitespace(self, value: bool) -> Self {
        Self {
            merge_whitespaces: value,
            ..self
        }
    }

    /// Sets whether adjacent same-style tokens are merged.
    pub fn merge_same_style_tokens(self, value: bool) -> Self {
        Self {
            merge_same_style_tokens: value,
            ..self
        }
    }

    /// Sets whether an unknown language falls back to the plain grammar.
    pub fn fallback_to_plain(self, value: bool) -> Self {
        Self {
            fallback_to_plain: value,
            ..self
        }
    }
}
/// Result of `Registry::highlight`: styled tokens grouped by line.
#[derive(Debug, Clone)]
pub struct HighlightedCode<'a> {
    /// Canonical name of the grammar used (not the alias that was queried).
    pub language: &'a str,
    /// The compiled theme(s) the tokens were styled with.
    pub theme: ThemeVariant<&'a CompiledTheme>,
    /// One inner `Vec` of styled spans per input line.
    pub tokens: Vec<Vec<HighlightedText>>,
}
/// Normalizes line endings: `"\r\n"` pairs and bare `'\r'` both become `'\n'`.
///
/// Runs in a single pass, avoiding the double scan and double allocation of
/// chaining two `replace` calls; input without any `'\r'` (the common case)
/// is copied directly.
#[inline]
pub(crate) fn normalize_string(s: &str) -> String {
    if !s.contains('\r') {
        return s.to_owned();
    }
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // Swallow the '\n' of a "\r\n" pair so it is not doubled.
            if chars.peek() == Some(&'\n') {
                chars.next();
            }
            out.push('\n');
        } else {
            out.push(c);
        }
    }
    out
}
/// Central store of compiled grammars and themes.
///
/// Grammars and themes are registered individually; `link_grammars` then
/// resolves cross-grammar references. Highlighting is only possible after
/// linking.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct Registry {
    // Compiled grammars, indexed by `GrammarId`.
    pub(crate) grammars: Vec<CompiledGrammar>,
    // Lookup by TextMate scope name.
    grammar_id_by_scope_name: HashMap<String, GrammarId>,
    // Lookup by lowercased grammar name; also holds aliases.
    pub(crate) grammar_id_by_name: HashMap<String, GrammarId>,
    // Compiled themes keyed by lowercased theme name.
    themes: HashMap<String, CompiledTheme>,
    // For each grammar (by index): the set of grammars that inject into it.
    injections_by_grammar: Vec<HashSet<GrammarId>>,
    // Set by `link_grammars`; `highlight` refuses to run until it is true.
    linked: bool,
}
impl Registry {
fn add_grammar_from_raw(&mut self, raw_grammar: RawGrammar) -> ZaloResult<()> {
if self.linked && self.grammar_id_by_name.contains_key(&raw_grammar.name) {
return Err(Error::ReplacingGrammarPostLinking(
raw_grammar.name.to_owned(),
));
}
let grammar_id = GrammarId(self.grammars.len() as u16);
let grammar = CompiledGrammar::from_raw_grammar(raw_grammar, grammar_id);
let grammar_name = grammar.name.to_lowercase();
let grammar_scope_name = grammar.scope_name.clone();
self.grammars.push(grammar);
self.grammar_id_by_scope_name
.insert(grammar_scope_name, grammar_id);
self.grammar_id_by_name.insert(grammar_name, grammar_id);
self.injections_by_grammar.push(HashSet::new());
Ok(())
}
/// Loads a raw grammar file from `path` and registers it.
pub fn add_grammar_from_path(&mut self, path: impl AsRef<Path>) -> ZaloResult<()> {
    self.add_grammar_from_raw(RawGrammar::load_from_file(path)?)
}
/// Registers the rule-less "plain" grammar plus any extra `aliases` for it.
///
/// Useful as a no-highlighting fallback (see
/// `HighlightOptions::fallback_to_plain`).
pub fn add_plain_grammar(&mut self, aliases: &[&str]) -> ZaloResult<()> {
    // Default `RawGrammar` fields mean no rules: text passes through untouched.
    self.add_grammar_from_raw(RawGrammar {
        name: PLAIN_GRAMMAR_NAME.to_owned(),
        scope_name: PLAIN_GRAMMAR_NAME.to_owned(),
        ..Default::default()
    })?;
    for &alias in aliases {
        self.add_alias(PLAIN_GRAMMAR_NAME, alias);
    }
    Ok(())
}
/// Makes `alias` resolve to the same grammar as `grammar_name`.
///
/// Both names are lowercased. Silently a no-op when `grammar_name` is not
/// registered.
pub fn add_alias(&mut self, grammar_name: &str, alias: &str) {
    let target = self
        .grammar_id_by_name
        .get(grammar_name.to_lowercase().as_str())
        .copied();
    if let Some(grammar_id) = target {
        self.grammar_id_by_name
            .insert(alias.to_lowercase(), grammar_id);
    }
}
/// Loads, compiles, and registers a theme file; keyed by lowercased name.
pub fn add_theme_from_path(&mut self, path: impl AsRef<Path>) -> ZaloResult<()> {
    let compiled = RawTheme::load_from_file(path)?.compile()?;
    let key = compiled.name.to_lowercase();
    self.themes.insert(key, compiled);
    Ok(())
}
pub fn generate_css(&self, theme_name: &str, prefix: &str) -> ZaloResult<String> {
let theme = self
.themes
.get(theme_name.to_lowercase().as_str())
.ok_or_else(|| Error::ThemeNotFound(theme_name.to_string()))?;
Ok(crate::themes::css::generate_css(theme, prefix))
}
/// Renders every registered theme as CSS, keyed by theme name.
///
/// Infallible today; the `Result` return is kept for API stability.
pub fn generate_all_css(&self, prefix: &str) -> ZaloResult<HashMap<String, String>> {
    let css_map = self
        .themes
        .iter()
        .map(|(name, theme)| (name.clone(), crate::themes::css::generate_css(theme, prefix)))
        .collect();
    Ok(css_map)
}
/// Returns the (lowercased) names of all registered themes, in map order.
///
/// Infallible today; the `Result` return is kept for API stability.
pub fn get_theme_names(&self) -> ZaloResult<Vec<String>> {
    let names = self.themes.keys().map(|name| name.to_owned()).collect();
    Ok(names)
}
pub(crate) fn tokenize(
&self,
grammar_id: GrammarId,
content: &str,
) -> ZaloResult<Vec<Vec<Token>>> {
let mut tokenizer = Tokenizer::new(grammar_id, self);
let tokens = tokenizer
.tokenize_string(content)
.map_err(Error::TokenizeRegex)?;
Ok(tokens)
}
/// True when a grammar (or alias) with this name is registered.
/// Matching is case-insensitive.
pub fn contains_grammar(&self, name: &str) -> bool {
    let key = name.to_lowercase();
    self.grammar_id_by_name.contains_key(key.as_str())
}
/// True when a theme with this name is registered. Case-insensitive.
pub fn contains_theme(&self, name: &str) -> bool {
    let key = name.to_lowercase();
    self.themes.contains_key(key.as_str())
}
pub fn highlight(
&self,
content: &str,
options: &HighlightOptions,
) -> ZaloResult<HighlightedCode<'_>> {
if !self.linked {
return Err(Error::UnlinkedGrammars);
}
let grammar_id = *self
.grammar_id_by_name
.get(&options.lang)
.or_else(|| {
if options.fallback_to_plain {
self.grammar_id_by_name.get(PLAIN_GRAMMAR_NAME)
} else {
None
}
})
.ok_or_else(|| Error::GrammarNotFound(options.lang.clone()))?;
let normalized_content = normalize_string(content);
let tokens = self.tokenize(grammar_id, &normalized_content)?;
let merging_options = MergingOptions {
merge_whitespaces: options.merge_whitespaces,
merge_same_style_tokens: options.merge_same_style_tokens,
};
match &options.theme {
ThemeVariant::Single(theme_name) => {
let theme = self
.themes
.get(theme_name)
.ok_or_else(|| Error::ThemeNotFound(theme_name.clone()))?;
let mut highlighter = Highlighter::new(theme);
let highlighted_tokens =
highlighter.highlight_tokens(&normalized_content, tokens, merging_options);
Ok(HighlightedCode {
language: &self.grammars[grammar_id].name,
theme: ThemeVariant::Single(theme),
tokens: highlighted_tokens,
})
}
ThemeVariant::Dual { light, dark } => {
let light_theme = self
.themes
.get(light)
.ok_or_else(|| Error::ThemeNotFound(light.clone()))?;
let dark_theme = self
.themes
.get(dark)
.ok_or_else(|| Error::ThemeNotFound(dark.clone()))?;
let mut highlighter = Highlighter::new_dual(light_theme, dark_theme);
let highlighted_tokens =
highlighter.highlight_tokens(&normalized_content, tokens, merging_options);
Ok(HighlightedCode {
language: &self.grammars[grammar_id].name,
theme: ThemeVariant::Dual {
light: light_theme,
dark: dark_theme,
},
tokens: highlighted_tokens,
})
}
}
}
/// Resolves cross-grammar references and records injection targets.
///
/// Must be called after all grammars are added and before highlighting.
/// Sets `linked`, after which grammars can no longer be replaced.
pub fn link_grammars(&mut self) {
    let grammar_names = &self.grammar_id_by_scope_name;
    // NOTE(review): this re-creates a shared reference to `self.grammars`
    // from a raw pointer while `iter_mut` holds a mutable borrow of the
    // same Vec. `resolve_external_references` presumably reads only *other*
    // grammars, but overlapping `&`/`&mut` to the same Vec is undefined
    // behavior under Rust's aliasing rules regardless of runtime behavior —
    // consider an id-collection pass followed by a write pass. TODO confirm.
    let grammars_ptr = &self.grammars as *const Vec<CompiledGrammar>;
    for grammar in self.grammars.iter_mut() {
        // SAFETY(claimed): the Vec is never resized inside this loop, so
        // the pointer stays valid; aliasing concern flagged above.
        let grammars_ptr = unsafe { &*grammars_ptr };
        grammar.resolve_external_references(grammar_names, grammars_ptr);
        // Record the reverse edge: `grammar` asks to be injected into each
        // grammar named in its `inject_to` list.
        for inject_to in &grammar.inject_to {
            if let Some(g_id) = self.grammar_id_by_name.get(inject_to) {
                self.injections_by_grammar[g_id.as_index()].insert(grammar.id);
            }
        }
    }
    self.linked = true;
}
/// Collects the leading regex pattern(s) reachable from `rule_ref`.
///
/// The BASE sentinel is rewritten to the root rule of `base_grammar_id`
/// before resolution. `visited` suppresses rules already expanded, which
/// both de-duplicates output and terminates recursive includes.
fn get_rule_patterns(
    &self,
    base_grammar_id: GrammarId,
    mut rule_ref: GlobalRuleRef,
    visited: &mut HashSet<GlobalRuleRef>,
) -> Vec<(GlobalRuleRef, &str)> {
    let mut out = vec![];
    // Already-expanded rules would recurse forever; no-op refs contribute
    // nothing.
    if visited.contains(&rule_ref) || rule_ref == NO_OP_GLOBAL_RULE_REF {
        return out;
    }
    // BASE means "the root rule of the grammar currently being tokenized".
    if rule_ref == BASE_GLOBAL_RULE_REF {
        rule_ref = GlobalRuleRef {
            grammar: base_grammar_id,
            rule: ROOT_RULE_ID,
        };
    }
    visited.insert(rule_ref);
    let grammar = &self.grammars[rule_ref.grammar];
    let rule = &grammar.rules[rule_ref.rule];
    match rule {
        // A match rule contributes its own regex, when it has one.
        Rule::Match(Match { regex_id, .. }) => {
            if let Some(regex_id) = regex_id {
                let re = &grammar.regexes[*regex_id];
                out.push((rule_ref, re.pattern()));
            }
        }
        // Include-only rules are transparent: expand their pattern list.
        Rule::IncludeOnly(i) => {
            out.extend(self.get_pattern_set_data(base_grammar_id, &i.patterns, visited));
        }
        // Begin/end and begin/while rules start matching at their `begin`
        // regex.
        Rule::BeginEnd(b) => out.push((rule_ref, grammar.regexes[b.begin].pattern())),
        Rule::BeginWhile(b) => out.push((rule_ref, grammar.regexes[b.begin].pattern())),
        Rule::Noop => {}
    }
    out
}
/// Flattens the patterns of every rule in `rule_refs`, in order.
///
/// `visited` is threaded through the recursion in `get_rule_patterns` to
/// de-duplicate and guard against include cycles.
fn get_pattern_set_data(
    &self,
    base_grammar_id: GrammarId,
    rule_refs: &[GlobalRuleRef],
    visited: &mut HashSet<GlobalRuleRef>,
) -> Vec<(GlobalRuleRef, &str)> {
    rule_refs
        .iter()
        .flat_map(|rule_ref| self.get_rule_patterns(base_grammar_id, *rule_ref, visited))
        .collect()
}
/// Returns `(rule, regex pattern)` pairs for every rule directly includable
/// from `rule_ref`'s pattern list.
///
/// Match and no-op rules have no child pattern list, so they yield an
/// empty set.
pub(crate) fn collect_patterns(
    &self,
    base_grammar_id: GrammarId,
    rule_ref: GlobalRuleRef,
) -> Vec<(GlobalRuleRef, &str)> {
    let grammar = &self.grammars[rule_ref.grammar];
    let base_patterns: &[GlobalRuleRef] = match &grammar.rules[rule_ref.rule] {
        Rule::IncludeOnly(a) => &a.patterns,
        Rule::BeginEnd(a) => &a.patterns,
        Rule::BeginWhile(a) => &a.patterns,
        Rule::Match(_) | Rule::Noop => &[],
    };
    // Fresh `visited` set per call: de-duplication is scoped to one query.
    let mut visited = HashSet::new();
    self.get_pattern_set_data(base_grammar_id, base_patterns, &mut visited)
}
/// Finds injection rules applicable to `scope_stack` when tokenizing inside
/// `target_grammar_id`.
///
/// Considers both the target grammar's own `injections` table and other
/// grammars registered as injectors during linking. The result is ordered
/// left-precedence first; `sort_by_key` is stable, so entries with equal
/// precedence keep their discovery order.
pub(crate) fn collect_injection_patterns(
    &self,
    target_grammar_id: GrammarId,
    scope_stack: &[Scope],
) -> Vec<(InjectionPrecedence, GlobalRuleRef)> {
    let mut result = Vec::new();
    // Injections declared by the target grammar itself.
    for (matchers, rule) in &self.grammars[target_grammar_id].injections {
        for matcher in matchers {
            if matcher.matches(scope_stack) {
                if cfg!(feature = "debug") {
                    eprintln!(
                        "Scope stack {scope_stack:?} matched injection selector {matcher:?}"
                    );
                }
                result.push((matcher.precedence(), *rule));
            }
        }
    }
    // Cross-grammar injections recorded by `link_grammars`: an injector
    // contributes its root rule when any one of its selectors matches.
    for &injector_id in &self.injections_by_grammar[target_grammar_id.as_index()] {
        let injector = &self.grammars[injector_id];
        if let Some(matcher) = injector
            .injection_selector
            .iter()
            .find(|matcher| matcher.matches(scope_stack))
        {
            result.push((
                matcher.precedence(),
                GlobalRuleRef {
                    grammar: injector_id,
                    rule: ROOT_RULE_ID,
                },
            ));
        }
    }
    // Left-precedence injections sort before right-precedence ones.
    result.sort_by_key(|(precedence, _)| match precedence {
        InjectionPrecedence::Left => -1,
        InjectionPrecedence::Right => 1,
    });
    result
}
#[cfg(feature = "dump")]
pub fn dump_to_file(&self, path: impl AsRef<Path>) -> ZaloResult<()> {
use crate::scope::lock_global_scope_repo;
use flate2::{Compression, write::GzEncoder};
use std::io::Write;
let scope_repo = lock_global_scope_repo().clone();
let dump = Dump {
registry: self.clone(),
scope_repo,
};
let msgpack_data = rmp_serde::to_vec(&dump)?;
let file = std::fs::File::create(path)?;
let mut encoder = GzEncoder::new(file, Compression::default());
encoder.write_all(&msgpack_data)?;
encoder.finish()?;
Ok(())
}
#[cfg(feature = "dump")]
fn load_from_bytes(compressed_data: &[u8]) -> ZaloResult<Self> {
use crate::scope::replace_global_scope_repo;
use flate2::read::GzDecoder;
use std::io::Read;
let mut decoder = GzDecoder::new(compressed_data);
let mut msgpack_data = Vec::new();
decoder.read_to_end(&mut msgpack_data)?;
let dump: Dump = rmp_serde::from_slice(&msgpack_data)?;
replace_global_scope_repo(dump.scope_repo);
Ok(dump.registry)
}
#[cfg(feature = "dump")]
pub fn load_from_file(path: impl AsRef<Path>) -> ZaloResult<Self> {
let compressed_data = std::fs::read(path)?;
Self::load_from_bytes(&compressed_data)
}
#[cfg(feature = "dump")]
pub fn builtin() -> ZaloResult<Self> {
Self::load_from_bytes(BUILTIN_DATA)
}
}
#[cfg(test)]
mod tests {
use std::fs::{self, create_dir_all};
use super::*;
use crate::highlight::HighlightedText;
use crate::test_utils::get_registry;
use crate::themes::font_style::FontStyle;
/// Renders highlighted tokens as `<hex color><style abbr><text>` lines for
/// snapshot/fixture comparison. Only single-theme styles are supported.
fn format_highlighted_tokens(
    highlighted_tokens: &[Vec<HighlightedText>],
    content: &str,
) -> String {
    let lines: Vec<&str> = content.lines().collect();
    let mut result = String::new();
    for (line_idx, line_tokens) in highlighted_tokens.iter().enumerate() {
        // Ignore any token lines beyond what `content.lines()` yields.
        if line_idx >= lines.len() {
            break;
        }
        for token in line_tokens {
            // Tests always highlight with a single theme.
            let ThemeVariant::Single(style) = &token.style else {
                unreachable!()
            };
            let hex_color = style.foreground.as_hex();
            // Abbreviate font styles as "[bius]" (bold/italic/underline/
            // strikethrough), padded so the text column lines up.
            let font_style_abbr = if style.font_style.is_empty() {
                "      ".to_string()
            } else {
                let mut abbr = String::from("[");
                if style.font_style.contains(FontStyle::BOLD) {
                    abbr.push('b');
                }
                if style.font_style.contains(FontStyle::ITALIC) {
                    abbr.push('i');
                }
                if style.font_style.contains(FontStyle::UNDERLINE) {
                    abbr.push('u');
                }
                if style.font_style.contains(FontStyle::STRIKETHROUGH) {
                    abbr.push('s');
                }
                abbr.push(']');
                format!("{:<6}", abbr)
            };
            result.push_str(&format!(
                "{:<10}{}{}\n",
                hex_color, font_style_abbr, token.text
            ));
        }
    }
    result
}
/// Renders raw tokens (text + scope stack) into the fixture text format.
fn format_tokens(input: &str, lines_tokens: Vec<Vec<Token>>) -> String {
    use std::fmt::Write;
    // Normalize line endings so token byte spans index the right line.
    let normalized = input.replace("\r\n", "\n").replace('\r', "\n");
    let source_lines: Vec<&str> = normalized.split('\n').collect();
    let mut rendered = String::new();
    for (line_idx, tokens) in lines_tokens.iter().enumerate() {
        // Token lines past the end of the input render against "".
        let line = source_lines.get(line_idx).copied().unwrap_or("");
        for (token_idx, token) in tokens.iter().enumerate() {
            let text = &line[token.span.start..token.span.end];
            // Writing into a String cannot fail; ignore the Ok(()).
            let _ = writeln!(rendered, "{}: '{}' (line {})", token_idx, text, line_idx);
            for scope in &token.scopes {
                let _ = writeln!(rendered, " - {scope}");
            }
            rendered.push('\n');
        }
    }
    rendered
}
/// Reads every file in `path`, returning `(file stem, contents)` pairs in
/// directory-iteration order (panics on any I/O failure — test helper).
fn get_output_folder_content(path: impl AsRef<Path>) -> Vec<(String, String)> {
    fs::read_dir(path)
        .unwrap()
        .map(|entry| {
            let file_path = entry.unwrap().path();
            let stem = file_path.file_stem().unwrap().to_str().unwrap().to_owned();
            let content = fs::read_to_string(&file_path).unwrap();
            (stem, content)
        })
        .collect()
}
#[test]
fn cannot_replace_grammar_after_linking() {
    // Re-registering a grammar name is allowed before linking, but must be
    // rejected once `link_grammars` has resolved cross-references.
    let mut registry = Registry::default();
    registry
        .add_grammar_from_path("grammars-themes/packages/tm-grammars/grammars/json.json")
        .unwrap();
    registry.link_grammars();
    let result = registry
        .add_grammar_from_path("grammars-themes/packages/tm-grammars/grammars/json.json");
    assert!(result.is_err());
}
#[test]
fn can_tokenize_like_vscode_textmate() {
    // Compares tokenization output against pre-generated fixture files
    // (one per grammar) in src/fixtures/tokens; an empty folder means the
    // loop body never runs, so this is a no-op until fixtures exist.
    let registry = get_registry();
    create_dir_all("src/fixtures/tokens").unwrap();
    let expected_tokens = get_output_folder_content("src/fixtures/tokens");
    for (grammar, expected) in expected_tokens {
        let sample_path = format!("grammars-themes/samples/{grammar}.sample");
        println!("Checking {sample_path}");
        let sample_content = normalize_string(&fs::read_to_string(sample_path).unwrap());
        let tokens = registry
            .tokenize(registry.grammar_id_by_name[&grammar], &sample_content)
            .unwrap();
        let out = format_tokens(&sample_content, tokens);
        assert_eq!(expected.trim(), out.trim());
    }
}
#[test]
fn can_highlight_plain_grammar() {
    // The plain grammar must highlight arbitrary text, and the
    // fallback_to_plain path for an unknown language must produce exactly
    // the same output as asking for "plain" directly.
    let mut registry = Registry::default();
    registry.add_plain_grammar(&[]).unwrap();
    registry
        .add_theme_from_path("grammars-themes/packages/tm-themes/themes/vitesse-black.json")
        .unwrap();
    registry.link_grammars();
    let sample_content = normalize_string(
        &fs::read_to_string("grammars-themes/samples/javascript.sample").unwrap(),
    );
    let highlighted = registry
        .highlight(
            &sample_content,
            &HighlightOptions::new(PLAIN_GRAMMAR_NAME, ThemeVariant::Single("vitesse-black"))
                .merge_whitespace(false)
                .merge_same_style_tokens(false),
        )
        .unwrap();
    let out = format_highlighted_tokens(&highlighted.tokens, &sample_content);
    insta::assert_snapshot!(out);
    // Unknown language + fallback_to_plain must match the direct result.
    let highlighted2 = registry
        .highlight(
            &sample_content,
            &HighlightOptions::new("unknown", ThemeVariant::Single("vitesse-black"))
                .merge_whitespace(false)
                .merge_same_style_tokens(false)
                .fallback_to_plain(true),
        )
        .unwrap();
    let out2 = format_highlighted_tokens(&highlighted2.tokens, &sample_content);
    assert_eq!(out, out2);
}
#[test]
fn can_highlight_like_vscode_textmate() {
    // Compares styled output against pre-generated fixture snapshots (one
    // per grammar) in src/fixtures/snapshots; an empty folder means the
    // loop body never runs, so this is a no-op until fixtures exist.
    let registry = get_registry();
    create_dir_all("src/fixtures/snapshots").unwrap();
    let expected_snapshots = get_output_folder_content("src/fixtures/snapshots");
    for (grammar, expected) in expected_snapshots {
        let sample_path = format!("grammars-themes/samples/{grammar}.sample");
        println!("Checking {sample_path}");
        let sample_content = normalize_string(&fs::read_to_string(sample_path).unwrap());
        let highlighted = registry
            .highlight(
                &sample_content,
                &HighlightOptions::new(&grammar, ThemeVariant::Single("vitesse-black"))
                    .merge_whitespace(false)
                    .merge_same_style_tokens(false),
            )
            .unwrap();
        let out = format_highlighted_tokens(&highlighted.tokens, &sample_content);
        assert_eq!(expected.trim(), out.trim());
    }
}
}