// rustik-highlight 0.1.0
//
// Rustik code highlighter.
// Documentation
//! Runtime line tokenization for compiled grammars.
//!
//! Each step picks the next event in the line — a fresh root-pattern match, a
//! nested match inside the currently open begin/end rule, or that rule's end —
//! emits the spans it produces, and updates the parser state accordingly.

use super::end::{EndRegex, EndRegexCache};
use super::pattern::{Match, OpenRule, Pattern, RegexMatch};
use super::*;

/// Reusable line tokenizer for one compiled grammar.
///
/// The tokenizer keeps runtime scratch that is not part of immutable grammar
/// data, such as dynamically resolved end regexes. Reuse one tokenizer across
/// a blob or visible range to avoid recompiling those regexes unnecessarily.
///
/// The grammar is only borrowed (for `'grammar`); per-line parser state lives
/// in a caller-owned `LineState` that is passed to each tokenize call rather
/// than stored here.
#[derive(Debug)]
pub struct LineTokenizer<'grammar> {
    /// Grammar this tokenizer emits scopes for.
    ///
    /// Its `kind` selects the JSON fast path, `patterns` drives root-level
    /// matching, and `pattern_by_rule` resolves rules left open on the stack.
    pub(crate) grammar: &'grammar Grammar,
    /// Cache of compiled regexes for resolved dynamic end patterns.
    ///
    /// Handed mutably to every pattern search and end-regex resolution;
    /// emptied by `clear_caches`.
    end_regex_cache: EndRegexCache,
}

impl<'grammar> LineTokenizer<'grammar> {
    /// Builds a tokenizer bound to `grammar`, starting with an empty end-regex cache.
    pub fn new(grammar: &'grammar Grammar) -> Self {
        let end_regex_cache = EndRegexCache::new();
        Self {
            grammar,
            end_regex_cache,
        }
    }

    /// Empties the dynamic end-regex cache by delegating to its `clear`.
    ///
    /// Intended to keep the cache's allocations around for reuse; whether it
    /// does is decided by `EndRegexCache::clear`.
    pub fn clear_caches(&mut self) {
        self.end_regex_cache.clear();
    }

    /// Tokenizes one line and returns its spans in a freshly allocated vector.
    ///
    /// `state` is the caller-owned parser state carried from line to line and
    /// is updated in place. Prefer [`Self::tokenize_line_into`] when a span
    /// buffer can be reused.
    pub fn tokenize_line(&mut self, state: &mut LineState, line: &str) -> Vec<ScopeSpan> {
        let mut collected = Vec::new();
        self.tokenize_line_into(state, line, &mut collected);
        collected
    }

    /// Tokenizes one line, writing its spans into the caller-owned `spans` buffer.
    ///
    /// `spans` is cleared first, so it only ever holds the spans of `line`.
    /// JSON grammars are delegated to the dedicated JSON tokenizer, which
    /// receives the untrimmed line and does not touch `state`. Every other
    /// grammar is tokenized step by step over the line with its line ending
    /// trimmed, updating `state` as rules open and close.
    ///
    /// NOTE(review): steps only run while unconsumed bytes remain, so an open
    /// rule is not offered a chance to end once the position reaches the end
    /// of the line (including on an empty line) — confirm that end patterns
    /// matching only at end of line are handled elsewhere.
    pub fn tokenize_line_into(
        &mut self,
        state: &mut LineState,
        line: &str,
        spans: &mut Vec<ScopeSpan>,
    ) {
        spans.clear();

        if matches!(self.grammar.kind, GrammarKind::Json) {
            json::tokenize_line_into(line, spans);
        } else {
            let text = trim_line_end(line);
            let limit = text.len();
            let mut cursor = 0;

            while cursor < limit {
                cursor = self.advance(state, text, cursor, spans);
            }
        }
    }

    /// Runs a single tokenization step from `pos` and returns the byte position to resume at.
    ///
    /// Dispatches on the parser stack: with a rule open on top, the step
    /// happens inside that rule; with an empty stack it happens at root level.
    fn advance(
        &mut self,
        state: &mut LineState,
        line: &str,
        pos: usize,
        spans: &mut Vec<ScopeSpan>,
    ) -> usize {
        // The top entry is cloned so `state` can be mutated while the step runs.
        match state.stack.last().cloned() {
            Some(open) => self.advance_inside_open_rule(state, line, pos, spans, open),
            None => self.advance_at_root(state, line, pos, spans),
        }
    }

    /// Consumes the next root-level match at or after `pos`.
    ///
    /// Appends the match's spans, pushes the rule it opens (if any) onto the
    /// stack, and returns the position after the match. When no root pattern
    /// matches, nothing is emitted and the end of the line is returned.
    fn advance_at_root(
        &mut self,
        state: &mut LineState,
        line: &str,
        pos: usize,
        spans: &mut Vec<ScopeSpan>,
    ) -> usize {
        let hit = self
            .grammar
            .patterns
            .find_next(line, pos, &mut self.end_regex_cache);

        match hit {
            None => line.len(),
            Some(hit) => {
                // Read everything needed from the match before its spans are moved out.
                let resume_at = hit.next_pos(line);
                let newly_opened = hit.open_rule();
                spans.extend(hit.spans);
                if let Some(rule) = newly_opened {
                    state.stack.push(rule);
                }
                resume_at
            }
        }
    }

    /// Consumes the next event inside `open`, the rule currently on top of the stack.
    ///
    /// If the rule can no longer be resolved to a pattern, or its end regex
    /// cannot be resumed, the stale entry is popped and `pos` is returned
    /// unchanged so the next step retries against the shrunken stack.
    fn advance_inside_open_rule(
        &mut self,
        state: &mut LineState,
        line: &str,
        pos: usize,
        spans: &mut Vec<ScopeSpan>,
        open: OpenRule,
    ) -> usize {
        if let Some(pattern) = self.grammar.pattern_by_rule(open.rule_id) {
            let resumed =
                pattern.resume_end(open.dynamic_end.as_deref(), &mut self.end_regex_cache);
            if let Some(end) = resumed {
                let event = self.next_event_inside(pattern, &end, line, pos);
                return event.apply(state, line, pos, spans, pattern);
            }
        }

        // Unresolvable rule or end regex: discard the stack entry without consuming input.
        state.stack.pop();
        pos
    }

    /// Chooses what happens next inside an open rule, searching from `pos`.
    ///
    /// Both the rule's nested patterns and its end regex are searched. A
    /// nested match wins only when it starts strictly before the end match;
    /// on a tie, or when only the end matches, the rule closes. With neither
    /// match the result is `InsideEvent::None`.
    fn next_event_inside(
        &mut self,
        pattern: &Pattern,
        end: &EndRegex<'_>,
        line: &str,
        pos: usize,
    ) -> InsideEvent {
        let inner = pattern
            .nested
            .find_next(line, pos, &mut self.end_regex_cache);
        let has_end_captures = !pattern.captures.end.is_empty();
        let closing = RegexMatch::find(end.regex(), line, pos, has_end_captures);

        match closing {
            Some(closing) => match inner {
                Some(inner) if inner.start < closing.start => InsideEvent::Nested(inner),
                _ => InsideEvent::Close(closing),
            },
            None => match inner {
                Some(inner) => InsideEvent::Nested(inner),
                None => InsideEvent::None,
            },
        }
    }

    /// Reports how many dynamic end regexes are currently cached (test-only helper).
    #[cfg(test)]
    pub(super) fn end_regex_cache_len(&self) -> usize {
        self.end_regex_cache.len()
    }
}

/// One event found inside an open begin/end rule.
///
/// Produced by `LineTokenizer::next_event_inside` and consumed by
/// [`InsideEvent::apply`], which emits spans and updates the parser stack.
enum InsideEvent {
    /// Nested pattern matched before the end pattern.
    ///
    /// Chosen when the nested match starts strictly before the end match, or
    /// when the end pattern does not match at all.
    Nested(Match),
    /// End pattern closed the open rule.
    ///
    /// Chosen whenever the end pattern matches and no nested match starts
    /// strictly earlier (ties go to the end pattern).
    Close(RegexMatch),
    /// Neither nested nor end matched on the rest of this line.
    None,
}

impl InsideEvent {
    /// Applies this event to the tokenizer output and parser state.
    ///
    /// The open rule's scope is first pushed over the text between `pos` and
    /// the start of the event (or the end of the line when nothing matched).
    /// Then:
    ///
    /// * `Close` appends the end match's spans using the rule's end captures
    ///   and pops the rule off the stack.
    /// * `Nested` appends the nested match's spans and pushes the rule it
    ///   opens, if any.
    /// * `None` emits nothing further.
    ///
    /// Returns the byte position at which tokenization should resume.
    fn apply(
        self,
        state: &mut LineState,
        line: &str,
        pos: usize,
        spans: &mut Vec<ScopeSpan>,
        pattern: &Pattern,
    ) -> usize {
        // Every variant starts by covering the rule's own content up to the event.
        let content_end = match &self {
            Self::Close(closing) => closing.start,
            Self::Nested(inner) => inner.start,
            Self::None => line.len(),
        };
        pattern.scope.push_visible(spans, pos, content_end);

        match self {
            Self::Close(closing) => {
                pattern.append_match_spans(&closing, &pattern.captures.end, spans);
                state.stack.pop();
                closing.next_pos(line)
            }
            Self::Nested(inner) => {
                // Read everything needed from the match before its spans are moved out.
                let resume_at = inner.next_pos(line);
                let newly_opened = inner.open_rule();
                spans.extend(inner.spans);
                if let Some(rule) = newly_opened {
                    state.stack.push(rule);
                }
                resume_at
            }
            Self::None => line.len(),
        }
    }
}