use super::end::{EndPattern, EndRegex, EndRegexCache};
use super::*;
/// Maps a regex capture-group index to the scope assigned to that group.
pub(super) type Captures = BTreeMap<usize, ScopeId>;
/// Scope attached to a pattern, classified by the prefix of its name.
///
/// Names starting with `meta.structure.` become [`Scope::Structural`]; every
/// other name becomes [`Scope::Visible`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(super) enum Scope {
/// The pattern declares no scope name.
None,
/// A scope that `push_visible` emits as a span.
Visible(ScopeId),
/// A `meta.structure.*` scope; `push_visible` skips it, but `id` still reports it.
Structural(ScopeId),
}
/// Capture-group scopes of one pattern, one table per kind of regex match.
#[derive(Debug, Default)]
pub(super) struct PatternCaptures {
/// Scopes applied to the groups of a single `match` regex.
pub(super) matched: Captures,
/// Scopes applied to the groups of the `begin` regex.
pub(super) begin: Captures,
/// Scopes applied to the groups of the `end` regex.
pub(super) end: Captures,
}
/// The matching part of a compiled pattern.
#[derive(Debug)]
enum PatternBody {
/// A pattern that is matched by a single regex.
Match {
/// Source text of the regex.
source: String,
},
/// A pattern delimited by a begin regex and an end pattern.
BeginEnd {
/// Source text of the regex that opens the rule.
begin_source: String,
/// Compiled description of how the rule is closed.
end: EndPattern,
},
}
/// An ordered list of compiled patterns plus a regex set that searches them together.
#[derive(Debug)]
pub(super) struct PatternSet {
/// Patterns in declaration order; `regexes` entries share the same indices.
patterns: Vec<Pattern>,
/// Regex set built from every pattern's search source; `None` when the list
/// is empty or the set failed to compile.
regexes: Option<RegSet>,
}
/// Outcome of compiling one raw pattern.
enum Compiled {
/// The raw pattern produced a rule of its own.
Rule(Pattern),
/// The raw pattern has no usable regex body; its nested patterns are
/// spliced into the parent list instead.
Inline(Vec<Pattern>),
/// The raw pattern is dropped because one of its regexes did not compile.
Skip,
}
/// A compiled grammar rule.
#[derive(Debug)]
pub(super) struct Pattern {
/// Rule id taken from the shared `next_rule` counter at compile time.
id: usize,
/// Scope declared by the rule's `name`, if any.
pub(super) scope: Scope,
/// Capture-group scopes for the rule's regexes.
pub(super) captures: PatternCaptures,
/// The regex (or begin/end pair) that drives matching.
body: PatternBody,
/// Patterns declared inside this rule.
pub(super) nested: PatternSet,
}
/// Identifies a begin/end rule whose end was not found on the line where it began.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct OpenRule {
/// Id of the rule that was opened.
pub(super) rule_id: usize,
/// Value reported by `EndRegex::dynamic_source` when the rule was opened.
/// NOTE(review): presumably the end regex source after back-reference
/// substitution — confirm in the `end` module.
pub(super) dynamic_end: Option<String>,
}
/// Result of matching one pattern against a line.
pub(super) struct Match {
/// Byte offset where the match starts.
pub(super) start: usize,
/// Byte offset where the match ends: the end of the closing match when a
/// begin/end rule closes on the same line, otherwise the end of the
/// (begin) match itself.
end: usize,
/// Scope spans produced by the match.
pub(super) spans: Vec<ScopeSpan>,
/// Set when a begin/end rule was opened but not closed on this line.
open_rule: Option<OpenRule>,
}
/// Byte offsets of a single regex match within a line.
pub(super) struct RegexMatch {
/// Byte offset where the whole match starts.
pub(super) start: usize,
/// Byte offset where the whole match ends.
end: usize,
/// Byte range of every capture group, indexed by group number (index 0 is
/// the whole match). Empty when captures were not requested; an entry is
/// `None` when no usable position is available for that group.
captures: Vec<Option<(usize, usize)>>,
}
/// Deduplicates scope names and hands out one `ScopeId` per distinct name.
#[derive(Default)]
pub(super) struct ScopeInterner {
/// Interned names; a name's position is the value its `ScopeId` was built from.
pub(super) scopes: Vec<String>,
/// Lookup from scope name to the id already assigned to it.
ids: BTreeMap<String, ScopeId>,
}
/// An end match found on the same line as its begin match.
struct EndOnLine {
/// The match of the end regex.
matched: RegexMatch,
/// Spans of nested matches collected before the end match.
spans: Vec<ScopeSpan>,
}
impl Scope {
    /// Builds a scope from an optional raw scope name.
    ///
    /// A missing name yields [`Scope::None`]. Otherwise the name is interned
    /// and classified: names beginning with `meta.structure.` are structural,
    /// all others are visible.
    fn from_raw(name: Option<&str>, interner: &mut ScopeInterner) -> Self {
        match name {
            None => Self::None,
            Some(raw) => {
                let scope_id = interner.intern(raw);
                if raw.starts_with("meta.structure.") {
                    Self::Structural(scope_id)
                } else {
                    Self::Visible(scope_id)
                }
            }
        }
    }

    /// Returns the interned id for visible and structural scopes alike.
    pub(super) fn id(self) -> Option<ScopeId> {
        match self {
            Self::Visible(scope_id) | Self::Structural(scope_id) => Some(scope_id),
            Self::None => None,
        }
    }

    /// Appends a span covering `start..end` when this scope is visible and the
    /// range is non-empty; does nothing otherwise.
    pub(super) fn push_visible(self, spans: &mut Vec<ScopeSpan>, start: usize, end: usize) {
        let Self::Visible(scope) = self else {
            return;
        };
        if start >= end {
            return;
        }
        spans.push(ScopeSpan { start, end, scope });
    }
}
impl ScopeInterner {
fn intern(&mut self, scope: &str) -> ScopeId {
if let Some(id) = self.ids.get(scope) {
return *id;
}
let id = ScopeId::new(self.scopes.len());
self.scopes.push(scope.to_owned());
self.ids.insert(scope.to_owned(), id);
id
}
fn captures(&mut self, raw: Option<&BTreeMap<String, RawCapture>>) -> Captures {
raw.into_iter()
.flat_map(BTreeMap::iter)
.filter_map(|(index, capture)| {
let index = index.parse().ok()?;
let name = capture.name.as_ref()?;
let scope = self.intern(name);
Some((index, scope))
})
.collect()
}
}
impl PatternSet {
    /// Creates a set with no patterns and no compiled regex set.
    pub(super) fn empty() -> Self {
        Self {
            patterns: Vec::new(),
            regexes: None,
        }
    }

    /// Compiles a list of raw patterns into a [`PatternSet`].
    ///
    /// `include` entries are expanded in place, rule patterns are appended,
    /// patterns without a regex body contribute their nested patterns
    /// directly, and patterns whose regexes fail to compile are skipped.
    /// `next_rule` is the shared counter that rule ids are drawn from.
    ///
    /// Once `depth` exceeds `MAX_INCLUDE_DEPTH` an empty set is returned, which
    /// bounds recursion through self-referencing includes.
    ///
    /// # Errors
    ///
    /// Propagates any error returned while compiling includes or patterns.
    pub(super) fn compile(
        raw: &RawGrammar,
        patterns: &[RawPattern],
        next_rule: &mut usize,
        depth: usize,
        interner: &mut ScopeInterner,
    ) -> Result<Self, Error> {
        if depth > MAX_INCLUDE_DEPTH {
            return Ok(Self::empty());
        }
        let mut compiled = Vec::new();
        for pattern in patterns {
            // An `include` takes precedence over any other field of the pattern.
            if let Some(include) = &pattern.include {
                let expanded = expand_include(raw, include, next_rule, depth, interner)?;
                compiled.extend(expanded);
                continue;
            }
            match Pattern::compile(raw, pattern, next_rule, depth, interner)? {
                Compiled::Rule(pattern) => compiled.push(pattern),
                Compiled::Inline(patterns) => compiled.extend(patterns),
                Compiled::Skip => {}
            }
        }
        Ok(Self::new(compiled))
    }

    /// Wraps `patterns` and builds the regex set used to search them together.
    ///
    /// The regex set is `None` when there are no patterns or when the set
    /// fails to compile; such a set never matches anything.
    fn new(patterns: Vec<Pattern>) -> Self {
        let sources: Vec<&str> = patterns.iter().map(Pattern::search_source).collect();
        let regexes = (!sources.is_empty())
            .then(|| RegSet::new(&sources).ok())
            .flatten();
        Self { patterns, regexes }
    }

    /// Builds a table indexed by rule id that holds the index path from this
    /// set down to the pattern with that id.
    ///
    /// Ids that are not found (or are `>= total_rules`) keep an empty path.
    pub(super) fn collect_rule_paths(&self, total_rules: usize) -> Vec<Box<[usize]>> {
        let mut paths = vec![Box::default(); total_rules];
        let mut current = Vec::new();
        self.fill_rule_paths(&mut current, &mut paths);
        paths
    }

    /// Depth-first walk that records, for every pattern, the path of indices
    /// leading to it. `current` is the path of the set being visited.
    fn fill_rule_paths(&self, current: &mut Vec<usize>, paths: &mut [Box<[usize]>]) {
        for (index, pattern) in self.patterns.iter().enumerate() {
            current.push(index);
            if let Some(slot) = paths.get_mut(pattern.id) {
                *slot = current.as_slice().into();
            }
            pattern.nested.fill_rule_paths(current, paths);
            current.pop();
        }
    }

    /// Resolves an index path (as produced by [`PatternSet::collect_rule_paths`])
    /// to the pattern it points at.
    ///
    /// Returns `None` for an empty path or when any index is out of range.
    pub(super) fn pattern_at(&self, path: &[usize]) -> Option<&Pattern> {
        let (last, rest) = path.split_last()?;
        let mut current = self;
        // Every index but the last descends into a pattern's nested set.
        for &index in rest {
            current = &current.patterns.get(index)?.nested;
        }
        current.patterns.get(*last)
    }

    /// Searches `line` from byte offset `pos` for the next match of any
    /// pattern in the set.
    ///
    /// Returns `None` when `pos` is past the end of the line or not on a
    /// character boundary, when the set has no regex set, when nothing
    /// matches, or when the winning pattern cannot produce a match result.
    pub(super) fn find_next(
        &self,
        line: &str,
        pos: usize,
        cache: &mut EndRegexCache,
    ) -> Option<Match> {
        if pos > line.len() || !line.is_char_boundary(pos) {
            return None;
        }
        let regexes = self.regexes.as_ref()?;
        let (entry, captures) = regexes.captures_with_options(
            line,
            pos,
            line.len(),
            RegSetLead::Position,
            SearchOptions::SEARCH_OPTION_NONE,
        )?;
        // `entry` indexes the regex set, which was built in pattern order.
        let pattern = self.patterns.get(entry)?;
        let regex_match = RegexMatch::from_captures(captures, pattern.needs_match_captures())?;
        pattern.match_result(regex_match, line, cache)
    }
}
/// Expands an `include` reference into the compiled patterns it stands for.
///
/// `$self` and `$base` both expand to the grammar's top-level patterns. A
/// `#name` reference expands to the repository entry called `name`. Any other
/// form, or a name missing from the repository, expands to nothing.
/// Compilation always happens one level deeper than `depth`.
fn expand_include(
    raw: &RawGrammar,
    include: &str,
    next_rule: &mut usize,
    depth: usize,
    interner: &mut ScopeInterner,
) -> Result<Vec<Pattern>, Error> {
    if matches!(include, "$self" | "$base") {
        return PatternSet::compile(raw, &raw.patterns, next_rule, depth + 1, interner)
            .map(|set| set.patterns);
    }
    let target = include
        .strip_prefix('#')
        .and_then(|name| raw.repository.as_ref()?.get(name));
    match target {
        Some(pattern) => PatternSet::compile(
            raw,
            std::slice::from_ref(pattern),
            next_rule,
            depth + 1,
            interner,
        )
        .map(|set| set.patterns),
        None => Ok(Vec::new()),
    }
}
impl Pattern {
    /// Compiles one raw pattern (that is not an `include`).
    ///
    /// The scope name is interned and the nested patterns are compiled before
    /// the body is inspected, so nested rules draw their ids from `next_rule`
    /// before this rule does.
    ///
    /// * A `match` regex produces a match rule; `begin`/`end` are ignored then.
    /// * `begin` together with `end` produces a begin/end rule.
    /// * Anything else yields [`Compiled::Inline`] with the nested patterns.
    /// * A regex that fails to compile yields [`Compiled::Skip`].
    ///
    /// # Errors
    ///
    /// Propagates errors from compiling the nested patterns.
    fn compile(
        raw: &RawGrammar,
        pattern: &RawPattern,
        next_rule: &mut usize,
        depth: usize,
        interner: &mut ScopeInterner,
    ) -> Result<Compiled, Error> {
        let scope = Scope::from_raw(pattern.name.as_deref(), interner);
        let nested = PatternSet::compile(
            raw,
            pattern.patterns.as_deref().unwrap_or_default(),
            next_rule,
            depth + 1,
            interner,
        )?;
        let body = match (&pattern.match_rule, &pattern.begin, &pattern.end) {
            (Some(source), _, _) => {
                // Validate up front so the regex set never sees a bad source.
                if Regex::new(source).is_err() {
                    return Ok(Compiled::Skip);
                }
                PatternBody::Match {
                    source: source.clone(),
                }
            }
            (None, Some(begin), Some(end)) => {
                if Regex::new(begin).is_err() {
                    return Ok(Compiled::Skip);
                }
                let Some(end) = EndPattern::compile(end) else {
                    return Ok(Compiled::Skip);
                };
                PatternBody::BeginEnd {
                    begin_source: begin.clone(),
                    end,
                }
            }
            _ => return Ok(Compiled::Inline(nested.patterns)),
        };
        let id = *next_rule;
        *next_rule += 1;
        Ok(Compiled::Rule(Self {
            id,
            scope,
            captures: PatternCaptures {
                matched: interner.captures(pattern.captures.as_ref()),
                begin: interner.captures(pattern.begin_captures.as_ref()),
                end: interner.captures(pattern.end_captures.as_ref()),
            },
            body,
            nested,
        }))
    }

    /// Returns the end regex used to close this rule when it is resumed.
    ///
    /// Delegates to `EndPattern::resume`; returns `None` for match rules,
    /// which have no end pattern.
    pub(super) fn resume_end<'pat>(
        &'pat self,
        dynamic_end: Option<&str>,
        cache: &mut EndRegexCache,
    ) -> Option<EndRegex<'pat>> {
        let PatternBody::BeginEnd { end, .. } = &self.body else {
            return None;
        };
        end.resume(dynamic_end, cache)
    }

    /// Appends the spans produced by `matched` to `spans`.
    ///
    /// One span is added per entry of `captures` whose group matched a
    /// non-empty range, followed by a span for the whole match when the rule
    /// has a scope (visible or structural). The entire `spans` vector is then
    /// sorted by `(start, end)`.
    pub(super) fn append_match_spans(
        &self,
        matched: &RegexMatch,
        captures: &Captures,
        spans: &mut Vec<ScopeSpan>,
    ) {
        for (index, scope) in captures {
            if let Some(Some((start, end))) = matched.captures.get(*index)
                && start < end
            {
                spans.push(ScopeSpan {
                    start: *start,
                    end: *end,
                    scope: *scope,
                });
            }
        }
        if let Some(scope) = self.scope.id() {
            spans.push(ScopeSpan {
                start: matched.start,
                end: matched.end,
                scope,
            });
        }
        spans.sort_by_key(|span| (span.start, span.end));
    }

    /// Reports whether capture positions must be kept for this rule's first
    /// regex: a match rule needs them when it scopes any group, a begin/end
    /// rule when it scopes begin groups or its end pattern is dynamic.
    fn needs_match_captures(&self) -> bool {
        match &self.body {
            PatternBody::Match { .. } => !self.captures.matched.is_empty(),
            PatternBody::BeginEnd { end, .. } => {
                !self.captures.begin.is_empty() || end.is_dynamic()
            }
        }
    }

    /// Returns the regex source that locates this rule: the `match` regex or
    /// the `begin` regex.
    fn search_source(&self) -> &str {
        match &self.body {
            PatternBody::Match { source } => source,
            PatternBody::BeginEnd { begin_source, .. } => begin_source,
        }
    }

    /// Turns a raw regex match of this rule's search regex into a [`Match`].
    fn match_result(
        &self,
        regex_match: RegexMatch,
        line: &str,
        cache: &mut EndRegexCache,
    ) -> Option<Match> {
        match &self.body {
            PatternBody::Match { .. } => Some(self.simple_match(regex_match)),
            PatternBody::BeginEnd { end, .. } => self.begin_match(end, regex_match, line, cache),
        }
    }

    /// Builds the result for a match rule: its spans and no open rule.
    fn simple_match(&self, regex_match: RegexMatch) -> Match {
        let mut spans = Vec::new();
        self.append_match_spans(&regex_match, &self.captures.matched, &mut spans);
        Match {
            start: regex_match.start,
            end: regex_match.end,
            spans,
            open_rule: None,
        }
    }

    /// Builds the result for a begin/end rule whose begin regex matched.
    ///
    /// When the end is found on the same line the match covers begin through
    /// end, and carries the begin spans, the visible scope over the content
    /// between them, nested spans and end spans. Otherwise the match covers
    /// only the begin match and reports the rule as open.
    ///
    /// Returns `None` when no end regex can be resolved for this begin match.
    fn begin_match(
        &self,
        end_pattern: &EndPattern,
        begin: RegexMatch,
        line: &str,
        cache: &mut EndRegexCache,
    ) -> Option<Match> {
        let end_regex = end_pattern.resolve_for_begin(&begin, line, cache)?;
        let dynamic_end = end_regex.dynamic_source();
        let same_line_end = self.find_same_line_end(&end_regex, line, begin.end, cache);
        let mut spans = Vec::new();
        self.append_match_spans(&begin, &self.captures.begin, &mut spans);
        let (end_byte, open_rule) = if let Some(close) = &same_line_end {
            self.scope
                .push_visible(&mut spans, begin.end, close.matched.start);
            spans.extend(close.spans.iter().copied());
            self.append_match_spans(&close.matched, &self.captures.end, &mut spans);
            (close.matched.end, None)
        } else {
            (
                begin.end,
                Some(OpenRule {
                    rule_id: self.id,
                    dynamic_end,
                }),
            )
        };
        Some(Match {
            start: begin.start,
            end: end_byte,
            spans,
            open_rule,
        })
    }

    /// Looks for the end of this rule on the rest of `line`, starting at `pos`.
    ///
    /// Each round finds the next end match. If the end regex is a bare quote
    /// and that quote is backslash-escaped, the search resumes after it.
    /// Otherwise the nested patterns are searched from the same position: a
    /// nested match that starts before the end match is consumed (its spans
    /// are kept and the search resumes after it); if there is none, the end
    /// match is returned together with the nested spans collected so far.
    ///
    /// Returns `None` as soon as no further end match exists on the line.
    fn find_same_line_end(
        &self,
        end: &EndRegex<'_>,
        line: &str,
        mut pos: usize,
        cache: &mut EndRegexCache,
    ) -> Option<EndOnLine> {
        let end_regex = end.regex();
        let end_source = end.source();
        let capture_end = !self.captures.end.is_empty();
        let mut spans = Vec::new();
        loop {
            let matched = RegexMatch::find(end_regex, line, pos, capture_end)?;
            if is_simple_quote(end_source) && is_escaped_at(line, matched.start) {
                pos = matched.next_pos(line);
                continue;
            }
            let nested_match = self.nested.find_next(line, pos, cache);
            let Some(nested_match) = nested_match else {
                return Some(EndOnLine { matched, spans });
            };
            // A nested match at or after the end match loses to the end match.
            if nested_match.start >= matched.start {
                return Some(EndOnLine { matched, spans });
            }
            pos = nested_match.next_pos(line);
            spans.extend(nested_match.spans);
        }
    }
}
/// Reports whether `source` is exactly one double-quote or one single-quote
/// character.
fn is_simple_quote(source: &str) -> bool {
    source == "\"" || source == "'"
}
/// Reports whether the character at byte offset `pos` is preceded by an odd
/// number of consecutive backslashes.
///
/// Offsets past the end of `line` or inside a multi-byte character are
/// treated as not escaped.
fn is_escaped_at(line: &str, pos: usize) -> bool {
    // `str::get` rejects out-of-range and non-boundary offsets in one step.
    let Some(prefix) = line.get(..pos) else {
        return false;
    };
    let backslashes = prefix
        .bytes()
        .rev()
        .take_while(|&byte| byte == b'\\')
        .count();
    backslashes % 2 == 1
}
impl Match {
    /// Returns the byte offset at which scanning should continue after this
    /// match, as computed by `next_pos_after` from the match bounds.
    pub(super) fn next_pos(&self, line: &str) -> usize {
        let Self { start, end, .. } = self;
        next_pos_after(line, *start, *end)
    }

    /// Returns a copy of the rule this match left open, if any.
    pub(super) fn open_rule(&self) -> Option<OpenRule> {
        Option::clone(&self.open_rule)
    }
}
impl RegexMatch {
    /// Searches `line` with `regex`, starting at byte offset `pos`.
    ///
    /// Returns `None` when `pos` is past the end of the line or not on a
    /// character boundary, when nothing matches, or when the reported match
    /// range is inverted or extends past the line. With `keep_captures` set,
    /// every group's range is recorded (groups with an unusable range become
    /// `None`); otherwise the capture list is left empty.
    pub(super) fn find(regex: &Regex, line: &str, pos: usize, keep_captures: bool) -> Option<Self> {
        let line_len = line.len();
        // `is_char_boundary` is already false for offsets beyond the line.
        if !line.is_char_boundary(pos) {
            return None;
        }
        let in_bounds = |(from, to): (usize, usize)| (from <= to && to <= line_len).then_some((from, to));
        let mut region = Region::new();
        let _ = regex.search_with_options(
            line,
            pos,
            line_len,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        )?;
        let (start, end) = region.pos(0).and_then(|range| in_bounds(range))?;
        let groups = keep_captures
            .then(|| {
                (0..region.len())
                    .map(|group| region.pos(group).and_then(|range| in_bounds(range)))
                    .collect::<Vec<_>>()
            })
            .unwrap_or_default();
        Some(Self {
            start,
            end,
            captures: groups,
        })
    }

    /// Builds a match from captures that a search has already produced.
    ///
    /// Returns `None` when group 0 has no position. Group ranges are copied
    /// only when `keep_captures` is set.
    fn from_captures(captures: onig::Captures<'_>, keep_captures: bool) -> Option<Self> {
        let (start, end) = captures.pos(0)?;
        let groups = keep_captures
            .then(|| {
                (0..captures.len())
                    .map(|group| captures.pos(group))
                    .collect::<Vec<_>>()
            })
            .unwrap_or_default();
        Some(Self {
            start,
            end,
            captures: groups,
        })
    }

    /// Returns the byte offset at which scanning should continue after this
    /// match, as computed by `next_pos_after` from the match bounds.
    pub(super) fn next_pos(&self, line: &str) -> usize {
        let Self { start, end, .. } = self;
        next_pos_after(line, *start, *end)
    }

    /// Returns the text of capture group `index` within `line`, or `None` when
    /// the group was not recorded, did not match, or its range does not slice
    /// `line` cleanly.
    pub(super) fn capture_text<'a>(&self, index: usize, line: &'a str) -> Option<&'a str> {
        match self.captures.get(index) {
            Some(&Some((from, to))) => line.get(from..to),
            _ => None,
        }
    }

    /// Substitutes `\N` (a backslash followed by one decimal digit) in
    /// `template` with the regex-escaped text of capture group `N`.
    ///
    /// A backslash pair that does not name an available capture is copied
    /// through unchanged, as is a trailing lone backslash.
    pub(super) fn expand_backrefs(&self, template: &str, line: &str) -> String {
        let mut out = String::new();
        let mut rest = template.chars();
        loop {
            match rest.next() {
                None => break,
                Some('\\') => {
                    let Some(follow) = rest.next() else {
                        out.push('\\');
                        break;
                    };
                    let replacement = follow
                        .to_digit(10)
                        .and_then(|digit| self.capture_text(digit as usize, line));
                    match replacement {
                        Some(text) => push_regex_escaped(&mut out, text),
                        None => {
                            out.push('\\');
                            out.push(follow);
                        }
                    }
                }
                Some(plain) => out.push(plain),
            }
        }
        out
    }
}
/// Appends `input` to `out`, putting a backslash in front of every character
/// from the set `\ . + * ? ( ) | [ ] { } ^ $`.
fn push_regex_escaped(out: &mut String, input: &str) {
    for ch in input.chars() {
        let is_meta = matches!(
            ch,
            '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
        );
        if is_meta {
            out.push('\\');
        }
        out.push(ch);
    }
}
/// Returns where scanning continues after a match spanning `start..end`.
///
/// A non-empty match continues at its end. An empty match continues at the
/// offset `next_char_boundary` reports for `start + 1`, so the scan always
/// moves forward.
fn next_pos_after(line: &str, start: usize, end: usize) -> usize {
    if start < end {
        return end;
    }
    next_char_boundary(line, start.saturating_add(1))
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use std::str::FromStr;

    use super::*;

    /// Start offsets beyond the line or inside a multi-byte character must
    /// yield no match instead of panicking.
    #[test]
    fn invalid_regex_offsets_are_ignored() {
        let regex = Regex::new("\"").unwrap();
        assert!(RegexMatch::find(&regex, r#" "patterns": ["#, usize::MAX, true).is_none());
        assert!(RegexMatch::find(&regex, "é", 1, true).is_none());
    }

    /// Match and capture offsets are relative to the whole line, not to the
    /// position the search started from.
    #[test]
    fn regex_match_offsets_are_absolute_after_full_line_search() {
        let regex = Regex::new("(bar)").unwrap();
        let matched = RegexMatch::find(&regex, "foobar", 3, true).unwrap();
        assert_eq!(matched.start, 3);
        assert_eq!(matched.end, 6);
        assert_eq!(matched.captures[1], Some((3, 6)));
    }

    /// A back-referencing end regex is cached once and reused on the lines
    /// that follow, including the line that closes the rule.
    #[test]
    fn line_tokenizer_caches_dynamic_end_regexes() {
        let grammar = Grammar::from_str(
            r#"{
"name":"Demo",
"scopeName":"source.demo",
"patterns":[
{
"begin":"<([A-Za-z.]+)>",
"end":"</\\1>",
"name":"meta.tag.demo"
}
],
"repository":{}
}"#,
        )
        .unwrap();
        let mut tokenizer = LineTokenizer::new(&grammar);
        let mut state = LineState::default();
        let mut spans = Vec::new();
        tokenizer.tokenize_line_into(&mut state, "<foo.bar>", &mut spans);
        assert_eq!(tokenizer.end_regex_cache_len(), 1);
        tokenizer.tokenize_line_into(&mut state, "body", &mut spans);
        assert_eq!(tokenizer.end_regex_cache_len(), 1);
        tokenizer.tokenize_line_into(&mut state, "</foo.bar>", &mut spans);
        assert!(state.is_empty());
        assert_eq!(tokenizer.end_regex_cache_len(), 1);
    }
}