// File: rustik_highlight/grammar/tokenize.rs

1//! Runtime line tokenization for compiled grammars.
2//!
3//! Each step picks the next event in the line — a fresh root-pattern match, a
4//! nested match inside the currently open begin/end rule, or that rule's end —
5//! emits the spans it produces, and updates the parser state accordingly.
6
7use super::end::{EndRegex, EndRegexCache};
8use super::pattern::{Match, OpenRule, Pattern, RegexMatch};
9use super::*;
10
11/// Reusable line tokenizer for one compiled grammar.
12///
13/// The tokenizer keeps runtime scratch that is not part of immutable grammar
14/// data, such as dynamically resolved end regexes. Reuse one tokenizer across
15/// a blob or visible range to avoid recompiling those regexes unnecessarily.
16#[derive(Debug)]
17pub struct LineTokenizer<'grammar> {
18    /// Grammar this tokenizer emits scopes for.
19    pub(crate) grammar: &'grammar Grammar,
20    /// Cache of compiled regexes for resolved dynamic end patterns.
21    end_regex_cache: EndRegexCache,
22}
23
24impl<'grammar> LineTokenizer<'grammar> {
25    /// Creates a tokenizer for a compiled grammar.
26    pub fn new(grammar: &'grammar Grammar) -> Self {
27        Self {
28            grammar,
29            end_regex_cache: EndRegexCache::new(),
30        }
31    }
32
33    /// Clears cached dynamic end regexes while retaining allocations.
34    pub fn clear_caches(&mut self) {
35        self.end_regex_cache.clear();
36    }
37
38    /// Tokenizes one line, mutating the caller-owned line state.
39    pub fn tokenize_line(&mut self, state: &mut LineState, line: &str) -> Vec<ScopeSpan> {
40        let mut spans = Vec::new();
41        self.tokenize_line_into(state, line, &mut spans);
42        spans
43    }
44
45    /// Tokenizes one line into a caller-owned buffer.
46    pub fn tokenize_line_into(
47        &mut self,
48        state: &mut LineState,
49        line: &str,
50        spans: &mut Vec<ScopeSpan>,
51    ) {
52        spans.clear();
53
54        if matches!(self.grammar.kind, GrammarKind::Json) {
55            json::tokenize_line_into(line, spans);
56            return;
57        }
58        let line = trim_line_end(line);
59        let mut pos = 0;
60
61        while pos < line.len() {
62            pos = self.advance(state, line, pos, spans);
63        }
64    }
65
66    /// Performs one tokenization step and returns the next byte position.
67    fn advance(
68        &mut self,
69        state: &mut LineState,
70        line: &str,
71        pos: usize,
72        spans: &mut Vec<ScopeSpan>,
73    ) -> usize {
74        if let Some(open) = state.stack.last().cloned() {
75            self.advance_inside_open_rule(state, line, pos, spans, open)
76        } else {
77            self.advance_at_root(state, line, pos, spans)
78        }
79    }
80
81    /// Advances past one root-level pattern match, or to end of line if there is none.
82    fn advance_at_root(
83        &mut self,
84        state: &mut LineState,
85        line: &str,
86        pos: usize,
87        spans: &mut Vec<ScopeSpan>,
88    ) -> usize {
89        let Some(found) = self
90            .grammar
91            .patterns
92            .find_next(line, pos, &mut self.end_regex_cache)
93        else {
94            return line.len();
95        };
96        let next_pos = found.next_pos(line);
97        let opened = found.open_rule();
98        spans.extend(found.spans);
99        if let Some(opened) = opened {
100            state.stack.push(opened);
101        }
102        next_pos
103    }
104
105    /// Advances past one event inside the currently open begin/end rule.
106    fn advance_inside_open_rule(
107        &mut self,
108        state: &mut LineState,
109        line: &str,
110        pos: usize,
111        spans: &mut Vec<ScopeSpan>,
112        open: OpenRule,
113    ) -> usize {
114        let Some(pattern) = self.grammar.pattern_by_rule(open.rule_id) else {
115            state.stack.pop();
116            return pos;
117        };
118        let Some(end) = pattern.resume_end(open.dynamic_end.as_deref(), &mut self.end_regex_cache)
119        else {
120            state.stack.pop();
121            return pos;
122        };
123        let event = self.next_event_inside(pattern, &end, line, pos);
124        event.apply(state, line, pos, spans, pattern)
125    }
126
127    /// Finds the next event (nested match, end match, or end of line) inside an open rule.
128    fn next_event_inside(
129        &mut self,
130        pattern: &Pattern,
131        end: &EndRegex<'_>,
132        line: &str,
133        pos: usize,
134    ) -> InsideEvent {
135        let nested = pattern
136            .nested
137            .find_next(line, pos, &mut self.end_regex_cache);
138        let close = RegexMatch::find(end.regex(), line, pos, !pattern.captures.end.is_empty());
139
140        match (nested, close) {
141            (Some(nested), Some(close)) if nested.start < close.start => {
142                InsideEvent::Nested(nested)
143            }
144            (_, Some(close)) => InsideEvent::Close(close),
145            (Some(nested), None) => InsideEvent::Nested(nested),
146            (None, None) => InsideEvent::None,
147        }
148    }
149
150    /// Returns the number of cached dynamic end regexes (test helper).
151    #[cfg(test)]
152    pub(super) fn end_regex_cache_len(&self) -> usize {
153        self.end_regex_cache.len()
154    }
155}
156
157/// One event found inside an open begin/end rule.
158enum InsideEvent {
159    /// Nested pattern matched before the end pattern.
160    Nested(Match),
161    /// End pattern closed the open rule.
162    Close(RegexMatch),
163    /// Neither nested nor end matched on the rest of this line.
164    None,
165}
166
167impl InsideEvent {
168    /// Emits the event's spans, updates the parser stack, and returns the next byte position.
169    fn apply(
170        self,
171        state: &mut LineState,
172        line: &str,
173        pos: usize,
174        spans: &mut Vec<ScopeSpan>,
175        pattern: &Pattern,
176    ) -> usize {
177        match self {
178            Self::Close(close) => {
179                pattern.scope.push_visible(spans, pos, close.start);
180                pattern.append_match_spans(&close, &pattern.captures.end, spans);
181                state.stack.pop();
182                close.next_pos(line)
183            }
184            Self::Nested(nested) => {
185                pattern.scope.push_visible(spans, pos, nested.start);
186
187                let next_pos = nested.next_pos(line);
188                let opened = nested.open_rule();
189
190                spans.extend(nested.spans);
191
192                if let Some(opened) = opened {
193                    state.stack.push(opened);
194                }
195                next_pos
196            }
197            Self::None => {
198                pattern.scope.push_visible(spans, pos, line.len());
199                line.len()
200            }
201        }
202    }
203}