// rustik-highlight 0.1.0 — Rustik code highlighter.
// (Crate metadata header; module documentation follows.)
//! Grammar compilation and line tokenization.
//!
//! TextMate grammars are parsed into immutable pattern trees with interned
//! scope names and compiled Oniguruma regex sets. The public tokenizer APIs use
//! caller-owned [`LineState`] values so multi-line constructs can continue
//! across line boundaries without forcing whole-file parsing.

use std::collections::{BTreeMap, HashSet};
use std::path::Path;
use std::str::FromStr;

use onig::{RegSet, RegSetLead, Regex, Region, SearchOptions};

use crate::json;
use crate::raw::{RawCapture, RawGrammar, RawPattern, first_line_patterns};
use crate::util::{key, next_char_boundary, path_keys, trim_line_end};
use crate::{Error, MAX_INCLUDE_DEPTH};

mod end;
mod pattern;
mod tokenize;

pub use tokenize::LineTokenizer;

use pattern::{OpenRule, PatternSet, ScopeInterner};

/// Display name for the built-in plain-text fallback grammar.
///
/// Used by [`Grammar::plain_text`] and matched case-insensitively by
/// [`Grammar::matches_name`].
pub(crate) const PLAIN_TEXT_NAME: &str = "Plain Text";

/// Interned scope identifier.
///
/// Cheap to copy and compare. Resolve to a scope name by indexing the owning
/// grammar's `scopes` table with [`ScopeId::index`]; ids are only meaningful
/// relative to the grammar that produced them.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct ScopeId(
    /// Index into a grammar's interned scope-name table.
    usize,
);

impl ScopeId {
    /// Returns this id's index into its grammar's interned scope-name table.
    pub const fn index(self) -> usize {
        self.0
    }

    /// Creates a scope id from an interned scope-name table index.
    pub(crate) const fn new(index: usize) -> Self {
        Self(index)
    }
}

/// Scope span produced by tokenization.
///
/// Offsets are byte positions within a single tokenized line, not within the
/// whole source file.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ScopeSpan {
    /// Start byte in the line.
    pub start: usize,
    /// End byte in the line (exclusive is the usual convention — TODO confirm
    /// against the tokenizer).
    pub end: usize,
    /// Interned scope id, resolvable via the producing grammar's `scopes` table.
    pub scope: ScopeId,
}

/// Per-line parser state.
///
/// Callers own a `LineState` and feed it back into the tokenizer line after
/// line so multi-line constructs survive line boundaries. `Default` yields an
/// empty stack, i.e. the state to use for the first line of a file.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct LineState {
    /// Open begin/end rules carried across line boundaries.
    stack: Vec<OpenRule>,
}

impl LineState {
    /// Reports whether no multi-line rules are currently open.
    pub fn is_empty(&self) -> bool {
        self.depth() == 0
    }

    /// Reports how many multi-line rules are currently open.
    pub fn depth(&self) -> usize {
        self.stack.len()
    }
}

/// Immutable compiled grammar data.
///
/// Built once via [`Grammar::compile`] (or the [`Grammar::plain_text`] /
/// [`Grammar::json`] constructors) and then queried read-only by the
/// tokenizer and the grammar-matching helpers.
#[derive(Debug)]
pub struct Grammar {
    /// Human-readable grammar name.
    pub name: String,
    /// Root scope name.
    pub scope_name: String,
    /// Interned scope names used by this grammar; indexed by [`ScopeId`].
    pub scopes: Vec<String>,
    /// Tokenization strategy used by this grammar.
    pub kind: GrammarKind,
    /// Lowercase file extensions, file names, and aliases matched by this grammar.
    /// Entries are normalized through `key` before insertion and lookup.
    file_types: HashSet<String>,
    /// Regexes used to match a source file's first line.
    first_line_match: Vec<Regex>,
    /// Root pattern set for TextMate tokenization.
    patterns: PatternSet,
    /// Path of pattern indices into `patterns`, indexed by rule id.
    rule_paths: Vec<Box<[usize]>>,
}

/// Tokenization strategy used by a grammar.
///
/// Most grammars are [`GrammarKind::TextMate`]; JSON gets a dedicated scanner
/// that bypasses the recursive TextMate machinery (see [`Grammar::json`]).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GrammarKind {
    /// TextMate grammar compiled to Oniguruma regex sets.
    TextMate,
    /// Dedicated line-oriented JSON scanner.
    Json,
}

impl Grammar {
    /// Compiles a raw grammar.
    pub fn compile(raw: &RawGrammar) -> Result<Self, Error> {
        let mut next_rule = 0;
        let mut interner = ScopeInterner::default();
        let patterns = PatternSet::compile(raw, &raw.patterns, &mut next_rule, 0, &mut interner)?;
        let first_line_match = first_line_patterns(raw.first_line_match.as_ref())
            .iter()
            .map(|pattern| Regex::new(pattern).map_err(|_| Error::InvalidRegex(pattern.clone())))
            .collect::<Result<Vec<_>, _>>()?;
        let rule_paths = patterns.collect_rule_paths(next_rule);

        Ok(Self {
            name: raw.display_name.clone().unwrap_or_else(|| raw.name.clone()),
            scope_name: raw.scope_name.clone(),
            file_types: raw
                .file_types
                .as_deref()
                .unwrap_or_default()
                .iter()
                .chain(std::iter::once(&raw.name))
                .map(|item| key(item))
                .collect(),
            first_line_match,
            patterns,
            rule_paths,
            scopes: interner.scopes,
            kind: GrammarKind::TextMate,
        })
    }

    /// Builds a grammar with no patterns for plain text fallback.
    pub fn plain_text() -> Self {
        Self {
            name: PLAIN_TEXT_NAME.to_owned(),
            scope_name: "text.plain".to_owned(),
            file_types: ["txt".to_owned(), "text".to_owned()].into_iter().collect(),
            first_line_match: Vec::new(),
            patterns: PatternSet::empty(),
            rule_paths: Vec::new(),
            scopes: Vec::new(),
            kind: GrammarKind::TextMate,
        }
    }

    /// Builds the dedicated fast JSON grammar.
    ///
    /// This does not use the recursive TextMate tokenizer. It emits the same
    /// kind of [`ScopeSpan`] values as compiled grammars while scanning each
    /// line directly.
    pub fn json() -> Self {
        Self {
            name: "JSON".to_owned(),
            scope_name: "source.json".to_owned(),
            file_types: ["json".to_owned()].into_iter().collect(),
            first_line_match: Vec::new(),
            patterns: PatternSet::empty(),
            rule_paths: Vec::new(),
            scopes: json::SCOPES
                .iter()
                .map(|scope| (*scope).to_owned())
                .collect(),
            kind: GrammarKind::Json,
        }
    }

    /// Returns true when this grammar matches a syntax name or alias.
    pub fn matches_name(&self, name: &str) -> bool {
        name.eq_ignore_ascii_case(&self.name)
            || name.eq_ignore_ascii_case(&self.scope_name)
            || self.file_types.contains(&key(name))
    }

    /// Returns true when this grammar matches a file path.
    pub fn matches_path(&self, path: &Path) -> bool {
        path_keys(path).any(|key| self.file_types.contains(&key))
    }

    /// Returns true when this grammar matches the first source line.
    pub fn matches_first_line(&self, line: &str) -> bool {
        self.first_line_match
            .iter()
            .any(|regex| regex.find(line).is_some())
    }

    /// Tokenizes one line, mutating the caller-owned line state.
    pub fn tokenize_line(&self, state: &mut LineState, line: &str) -> Vec<ScopeSpan> {
        let mut spans = Vec::new();
        self.tokenize_line_into(state, line, &mut spans);
        spans
    }

    /// Tokenizes one line into a caller-owned buffer.
    pub fn tokenize_line_into(
        &self,
        state: &mut LineState,
        line: &str,
        spans: &mut Vec<ScopeSpan>,
    ) {
        LineTokenizer::new(self).tokenize_line_into(state, line, spans);
    }

    /// Returns the compiled pattern for a stored rule id.
    fn pattern_by_rule(&self, rule_id: usize) -> Option<&pattern::Pattern> {
        let path = self.rule_paths.get(rule_id)?;
        self.patterns.pattern_at(path)
    }
}

impl FromStr for Grammar {
    type Err = Error;

    /// Parses grammar JSON and compiles the resulting raw grammar in one step.
    fn from_str(input: &str) -> Result<Self, Self::Err> {
        Self::compile(&RawGrammar::from_str(input)?)
    }
}