Skip to main content

rustik_highlight/
grammar.rs

//! Grammar compilation and line tokenization.
//!
//! TextMate grammars are parsed into immutable pattern trees with interned
//! scope names and compiled Oniguruma regex sets. The public tokenizer APIs use
//! caller-owned [`LineState`] values so multi-line constructs can continue
//! across line boundaries without forcing whole-file parsing.
7
8use std::collections::{BTreeMap, HashSet};
9use std::path::Path;
10use std::str::FromStr;
11
12use onig::{RegSet, RegSetLead, Regex, Region, SearchOptions};
13
14use crate::json;
15use crate::raw::{RawCapture, RawGrammar, RawPattern, first_line_patterns};
16use crate::util::{key, next_char_boundary, path_keys, trim_line_end};
17use crate::{Error, MAX_INCLUDE_DEPTH};
18
19mod end;
20mod pattern;
21mod tokenize;
22
23pub use tokenize::LineTokenizer;
24
25use pattern::{OpenRule, PatternSet, ScopeInterner};
26
/// Display name for the built-in plain-text fallback grammar.
///
/// Used as the `name` of [`Grammar::plain_text`], and therefore matched
/// case-insensitively by [`Grammar::matches_name`].
pub(crate) const PLAIN_TEXT_NAME: &str = "Plain Text";
29
/// Interned scope identifier.
///
/// A cheap, copyable handle naming one entry in a grammar's interned
/// scope-name table ([`Grammar::scopes`]).
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ScopeId(
    /// Index into a grammar's interned scope-name table.
    usize,
);

impl ScopeId {
    /// Returns this id's index into its grammar's interned scope-name table.
    pub const fn index(self) -> usize {
        let ScopeId(index) = self;
        index
    }

    /// Creates a scope id from an interned scope-name table index.
    pub(crate) const fn new(index: usize) -> Self {
        ScopeId(index)
    }
}
48
/// Scope span produced by tokenization.
///
/// Offsets are byte positions within the single line passed to
/// [`Grammar::tokenize_line`]. Resolve `scope` to a name by indexing
/// [`Grammar::scopes`] with [`ScopeId::index`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ScopeSpan {
    /// Start byte in the line.
    pub start: usize,
    /// End byte in the line (presumably exclusive, `start..end` — confirm in tokenizer).
    pub end: usize,
    /// Interned scope id.
    pub scope: ScopeId,
}
59
/// Per-line parser state.
///
/// Owned by the caller and threaded through [`Grammar::tokenize_line`] so
/// multi-line begin/end constructs can continue across line boundaries
/// without whole-file parsing. `Default` yields the empty (top-level) state.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct LineState {
    /// Open begin/end rules carried across line boundaries.
    stack: Vec<OpenRule>,
}
66
67impl LineState {
68    /// Returns true when there are no open multi-line rules.
69    pub fn is_empty(&self) -> bool {
70        self.stack.is_empty()
71    }
72
73    /// Returns the number of open multi-line rules.
74    pub fn depth(&self) -> usize {
75        self.stack.len()
76    }
77}
78
/// Immutable compiled grammar data.
///
/// Built by [`Grammar::compile`] from a [`RawGrammar`], or by the
/// [`Grammar::plain_text`] / [`Grammar::json`] constructors.
#[derive(Debug)]
pub struct Grammar {
    /// Human-readable grammar name.
    pub name: String,
    /// Root scope name (e.g. `"source.json"` for the built-in JSON grammar).
    pub scope_name: String,
    /// Interned scope names used by this grammar, indexed by [`ScopeId::index`].
    pub scopes: Vec<String>,
    /// Tokenization strategy used by this grammar.
    pub kind: GrammarKind,
    /// Lowercase file extensions, file names, and aliases matched by this grammar.
    file_types: HashSet<String>,
    /// Regexes used to match a source file's first line.
    first_line_match: Vec<Regex>,
    /// Root pattern set for TextMate tokenization.
    patterns: PatternSet,
    /// Path of pattern indices into `patterns`, indexed by rule id.
    rule_paths: Vec<Box<[usize]>>,
}
99
/// Tokenization strategy used by a grammar.
///
/// Chosen by [`Grammar`]'s constructors and stored in [`Grammar::kind`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GrammarKind {
    /// TextMate grammar compiled to Oniguruma regex sets.
    TextMate,
    /// Dedicated line-oriented JSON scanner (see [`Grammar::json`]).
    Json,
}
108
109impl Grammar {
110    /// Compiles a raw grammar.
111    pub fn compile(raw: &RawGrammar) -> Result<Self, Error> {
112        let mut next_rule = 0;
113        let mut interner = ScopeInterner::default();
114        let patterns = PatternSet::compile(raw, &raw.patterns, &mut next_rule, 0, &mut interner)?;
115        let first_line_match = first_line_patterns(raw.first_line_match.as_ref())
116            .iter()
117            .map(|pattern| Regex::new(pattern).map_err(|_| Error::InvalidRegex(pattern.clone())))
118            .collect::<Result<Vec<_>, _>>()?;
119        let rule_paths = patterns.collect_rule_paths(next_rule);
120
121        Ok(Self {
122            name: raw.display_name.clone().unwrap_or_else(|| raw.name.clone()),
123            scope_name: raw.scope_name.clone(),
124            file_types: raw
125                .file_types
126                .as_deref()
127                .unwrap_or_default()
128                .iter()
129                .chain(std::iter::once(&raw.name))
130                .map(|item| key(item))
131                .collect(),
132            first_line_match,
133            patterns,
134            rule_paths,
135            scopes: interner.scopes,
136            kind: GrammarKind::TextMate,
137        })
138    }
139
140    /// Builds a grammar with no patterns for plain text fallback.
141    pub fn plain_text() -> Self {
142        Self {
143            name: PLAIN_TEXT_NAME.to_owned(),
144            scope_name: "text.plain".to_owned(),
145            file_types: ["txt".to_owned(), "text".to_owned()].into_iter().collect(),
146            first_line_match: Vec::new(),
147            patterns: PatternSet::empty(),
148            rule_paths: Vec::new(),
149            scopes: Vec::new(),
150            kind: GrammarKind::TextMate,
151        }
152    }
153
154    /// Builds the dedicated fast JSON grammar.
155    ///
156    /// This does not use the recursive TextMate tokenizer. It emits the same
157    /// kind of [`ScopeSpan`] values as compiled grammars while scanning each
158    /// line directly.
159    pub fn json() -> Self {
160        Self {
161            name: "JSON".to_owned(),
162            scope_name: "source.json".to_owned(),
163            file_types: ["json".to_owned()].into_iter().collect(),
164            first_line_match: Vec::new(),
165            patterns: PatternSet::empty(),
166            rule_paths: Vec::new(),
167            scopes: json::SCOPES
168                .iter()
169                .map(|scope| (*scope).to_owned())
170                .collect(),
171            kind: GrammarKind::Json,
172        }
173    }
174
175    /// Returns true when this grammar matches a syntax name or alias.
176    pub fn matches_name(&self, name: &str) -> bool {
177        name.eq_ignore_ascii_case(&self.name)
178            || name.eq_ignore_ascii_case(&self.scope_name)
179            || self.file_types.contains(&key(name))
180    }
181
182    /// Returns true when this grammar matches a file path.
183    pub fn matches_path(&self, path: &Path) -> bool {
184        path_keys(path).any(|key| self.file_types.contains(&key))
185    }
186
187    /// Returns true when this grammar matches the first source line.
188    pub fn matches_first_line(&self, line: &str) -> bool {
189        self.first_line_match
190            .iter()
191            .any(|regex| regex.find(line).is_some())
192    }
193
194    /// Tokenizes one line, mutating the caller-owned line state.
195    pub fn tokenize_line(&self, state: &mut LineState, line: &str) -> Vec<ScopeSpan> {
196        let mut spans = Vec::new();
197        self.tokenize_line_into(state, line, &mut spans);
198        spans
199    }
200
201    /// Tokenizes one line into a caller-owned buffer.
202    pub fn tokenize_line_into(
203        &self,
204        state: &mut LineState,
205        line: &str,
206        spans: &mut Vec<ScopeSpan>,
207    ) {
208        LineTokenizer::new(self).tokenize_line_into(state, line, spans);
209    }
210
211    /// Returns the compiled pattern for a stored rule id.
212    fn pattern_by_rule(&self, rule_id: usize) -> Option<&pattern::Pattern> {
213        let path = self.rule_paths.get(rule_id)?;
214        self.patterns.pattern_at(path)
215    }
216}
217
218impl FromStr for Grammar {
219    type Err = Error;
220
221    /// Parses and compiles a grammar from JSON.
222    fn from_str(input: &str) -> Result<Self, Self::Err> {
223        let raw = RawGrammar::from_str(input)?;
224        Self::compile(&raw)
225    }
226}