syntect_no_panic/parsing/
yaml_load.rs

1use super::regex::{Regex, Region};
2use super::scope::*;
3use super::syntax_definition::*;
4use std::collections::HashMap;
5use std::error::Error;
6use std::ops::DerefMut;
7use std::path::Path;
8use yaml_rust::yaml::Hash;
9use yaml_rust::{ScanError, Yaml, YamlLoader};
10
/// Errors that can be returned while turning the YAML text of a syntax definition into a
/// `SyntaxDefinition`.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum ParseSyntaxError {
    /// Invalid YAML file syntax, or at least something yaml_rust can't handle
    #[error("Invalid YAML file syntax: {0}")]
    InvalidYaml(#[from] ScanError),
    /// The file must contain at least one YAML document
    #[error("The file must contain at least one YAML document")]
    EmptyFile,
    /// Some keys are required for something to be a valid `.sublime-syntax`;
    /// the payload is the name of the key that was not found.
    #[error("Missing mandatory key in YAML file: {0}")]
    MissingMandatoryKey(&'static str),
    /// Invalid regex; carries the regex text that failed to compile and the underlying error.
    #[error("Error while compiling regex '{0}': {1}")]
    RegexCompileError(String, #[source] Box<dyn Error + Send + Sync + 'static>),
    /// A scope that syntect's scope implementation can't handle
    #[error("Invalid scope: {0}")]
    InvalidScope(ParseScopeError),
    /// A reference to another file that is invalid
    #[error("Invalid file reference")]
    BadFileRef,
    /// Syntaxes must have a context named "main"
    #[error("Context 'main' is missing")]
    MainMissing,
    /// Some part of the YAML file is the wrong type (e.g. a string where a list is expected).
    /// Sorry, this doesn't give you any way to narrow down where the problem is.
    /// Maybe use Sublime Text to figure it out.
    #[error("Type mismatch")]
    TypeMismatch,
}
41
42fn get_key<'a, R, F: FnOnce(&'a Yaml) -> Option<R>>(
43    map: &'a Hash,
44    key: &'static str,
45    f: F,
46) -> Result<R, ParseSyntaxError> {
47    map.get(&Yaml::String(key.to_owned()))
48        .ok_or(ParseSyntaxError::MissingMandatoryKey(key))
49        .and_then(|x| f(x).ok_or(ParseSyntaxError::TypeMismatch))
50}
51
52fn str_to_scopes(s: &str, repo: &mut ScopeRepository) -> Result<Vec<Scope>, ParseSyntaxError> {
53    s.split_whitespace()
54        .map(|scope| repo.build(scope).map_err(ParseSyntaxError::InvalidScope))
55        .collect()
56}
57
/// State threaded through all the parsing functions while one syntax definition is loaded.
struct ParserState<'a> {
    /// Repository used to build every `Scope` mentioned in the definition.
    scope_repo: &'a mut ScopeRepository,
    /// String-valued entries of the top-level `variables` key, by name.
    variables: HashMap<String, String>,
    /// Matches a `{{name}}` variable reference; group 1 is the variable name.
    variable_regex: Regex,
    /// Matches a backslash followed by a digit; used to detect backreferences in `pop` patterns.
    backref_regex: Regex,
    /// Selects which newline-related regex rewrite `parse_regex` applies.
    lines_include_newline: bool,
}
65
// YAML source of the two synthetic contexts (`__start` and `__main`) that
// `add_initial_contexts` parses and adds next to the contexts of the syntax itself.
//
// `__start` must not include prototypes from the actual syntax definition,
// otherwise it's possible that a prototype makes us pop out of `__start`.
static START_CONTEXT: &str = "
__start:
    - meta_include_prototype: false
    - match: ''
      push: __main
__main:
    - include: main
";
76
77impl SyntaxDefinition {
78    /// In case you want to create your own SyntaxDefinition's in memory from strings.
79    ///
80    /// Generally you should use a [`SyntaxSet`].
81    ///
82    /// `fallback_name` is an optional name to use when the YAML doesn't provide a `name` key.
83    ///
84    /// [`SyntaxSet`]: ../struct.SyntaxSet.html
85    pub fn load_from_str(
86        s: &str,
87        lines_include_newline: bool,
88        fallback_name: Option<&str>,
89    ) -> Result<SyntaxDefinition, ParseSyntaxError> {
90        let docs = match YamlLoader::load_from_str(s) {
91            Ok(x) => x,
92            Err(e) => return Err(ParseSyntaxError::InvalidYaml(e)),
93        };
94        if docs.is_empty() {
95            return Err(ParseSyntaxError::EmptyFile);
96        }
97        let doc = &docs[0];
98        let mut scope_repo = SCOPE_REPO.lock().unwrap();
99        SyntaxDefinition::parse_top_level(
100            doc,
101            scope_repo.deref_mut(),
102            lines_include_newline,
103            fallback_name,
104        )
105    }
106
107    fn parse_top_level(
108        doc: &Yaml,
109        scope_repo: &mut ScopeRepository,
110        lines_include_newline: bool,
111        fallback_name: Option<&str>,
112    ) -> Result<SyntaxDefinition, ParseSyntaxError> {
113        let h = doc.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
114
115        let mut variables = HashMap::new();
116        if let Ok(map) = get_key(h, "variables", |x| x.as_hash()) {
117            for (key, value) in map.iter() {
118                if let (Some(key_str), Some(val_str)) = (key.as_str(), value.as_str()) {
119                    variables.insert(key_str.to_owned(), val_str.to_owned());
120                }
121            }
122        }
123        let contexts_hash = get_key(h, "contexts", |x| x.as_hash())?;
124        let top_level_scope = scope_repo
125            .build(get_key(h, "scope", |x| x.as_str())?)
126            .map_err(ParseSyntaxError::InvalidScope)?;
127        let mut state = ParserState {
128            scope_repo,
129            variables,
130            variable_regex: Regex::new(r"\{\{([A-Za-z0-9_]+)\}\}".into()),
131            backref_regex: Regex::new(r"\\\d".into()),
132            lines_include_newline,
133        };
134
135        let mut contexts = SyntaxDefinition::parse_contexts(contexts_hash, &mut state)?;
136        if !contexts.contains_key("main") {
137            return Err(ParseSyntaxError::MainMissing);
138        }
139
140        SyntaxDefinition::add_initial_contexts(&mut contexts, &mut state, top_level_scope);
141
142        let mut file_extensions = Vec::new();
143        for extension_key in &["file_extensions", "hidden_file_extensions"] {
144            if let Ok(v) = get_key(h, extension_key, |x| x.as_vec()) {
145                file_extensions.extend(v.iter().filter_map(|y| y.as_str().map(|s| s.to_owned())))
146            }
147        }
148
149        let defn = SyntaxDefinition {
150            name: get_key(h, "name", |x| x.as_str())
151                .unwrap_or_else(|_| fallback_name.unwrap_or("Unnamed"))
152                .to_owned(),
153            scope: top_level_scope,
154            file_extensions,
155            // TODO maybe cache a compiled version of this Regex
156            first_line_match: get_key(h, "first_line_match", |x| x.as_str())
157                .ok()
158                .map(|s| s.to_owned()),
159            hidden: get_key(h, "hidden", |x| x.as_bool()).unwrap_or(false),
160
161            variables: state.variables,
162            contexts,
163        };
164        Ok(defn)
165    }
166
167    fn parse_contexts(
168        map: &Hash,
169        state: &mut ParserState<'_>,
170    ) -> Result<HashMap<String, Context>, ParseSyntaxError> {
171        let mut contexts = HashMap::new();
172        for (key, value) in map.iter() {
173            if let (Some(name), Some(val_vec)) = (key.as_str(), value.as_vec()) {
174                let is_prototype = name == "prototype";
175                let mut namer = ContextNamer::new(name);
176                SyntaxDefinition::parse_context(
177                    val_vec,
178                    state,
179                    &mut contexts,
180                    is_prototype,
181                    &mut namer,
182                )?;
183            }
184        }
185
186        Ok(contexts)
187    }
188
189    fn parse_context(
190        vec: &[Yaml],
191        // TODO: Maybe just pass the scope repo if that's all that's needed?
192        state: &mut ParserState<'_>,
193        contexts: &mut HashMap<String, Context>,
194        is_prototype: bool,
195        namer: &mut ContextNamer,
196    ) -> Result<String, ParseSyntaxError> {
197        let mut context = Context::new(!is_prototype);
198        let name = namer.next();
199
200        for y in vec.iter() {
201            let map = y.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
202
203            let mut is_special = false;
204            if let Ok(x) = get_key(map, "meta_scope", |x| x.as_str()) {
205                context.meta_scope = str_to_scopes(x, state.scope_repo)?;
206                is_special = true;
207            }
208            if let Ok(x) = get_key(map, "meta_content_scope", |x| x.as_str()) {
209                context.meta_content_scope = str_to_scopes(x, state.scope_repo)?;
210                is_special = true;
211            }
212            if let Ok(x) = get_key(map, "meta_include_prototype", |x| x.as_bool()) {
213                context.meta_include_prototype = x;
214                is_special = true;
215            }
216            if let Ok(true) = get_key(map, "clear_scopes", |x| x.as_bool()) {
217                context.clear_scopes = Some(ClearAmount::All);
218                is_special = true;
219            }
220            if let Ok(x) = get_key(map, "clear_scopes", |x| x.as_i64()) {
221                context.clear_scopes = Some(ClearAmount::TopN(x as usize));
222                is_special = true;
223            }
224            if !is_special {
225                if let Ok(x) = get_key(map, "include", Some) {
226                    let reference =
227                        SyntaxDefinition::parse_reference(x, state, contexts, namer, false)?;
228                    context.patterns.push(Pattern::Include(reference));
229                } else {
230                    let pattern =
231                        SyntaxDefinition::parse_match_pattern(map, state, contexts, namer)?;
232                    if pattern.has_captures {
233                        context.uses_backrefs = true;
234                    }
235                    context.patterns.push(Pattern::Match(pattern));
236                }
237            }
238        }
239
240        contexts.insert(name.clone(), context);
241        Ok(name)
242    }
243
244    fn parse_reference(
245        y: &Yaml,
246        state: &mut ParserState<'_>,
247        contexts: &mut HashMap<String, Context>,
248        namer: &mut ContextNamer,
249        with_escape: bool,
250    ) -> Result<ContextReference, ParseSyntaxError> {
251        if let Some(s) = y.as_str() {
252            let parts: Vec<&str> = s.split('#').collect();
253            let sub_context = if parts.len() > 1 {
254                Some(parts[1].to_owned())
255            } else {
256                None
257            };
258            if parts[0].starts_with("scope:") {
259                Ok(ContextReference::ByScope {
260                    scope: state
261                        .scope_repo
262                        .build(&parts[0][6..])
263                        .map_err(ParseSyntaxError::InvalidScope)?,
264                    sub_context,
265                    with_escape,
266                })
267            } else if parts[0].ends_with(".sublime-syntax") {
268                let stem = Path::new(parts[0])
269                    .file_stem()
270                    .and_then(|x| x.to_str())
271                    .ok_or(ParseSyntaxError::BadFileRef)?;
272                Ok(ContextReference::File {
273                    name: stem.to_owned(),
274                    sub_context,
275                    with_escape,
276                })
277            } else {
278                Ok(ContextReference::Named(parts[0].to_owned()))
279            }
280        } else if let Some(v) = y.as_vec() {
281            let subname = SyntaxDefinition::parse_context(v, state, contexts, false, namer)?;
282            Ok(ContextReference::Inline(subname))
283        } else {
284            Err(ParseSyntaxError::TypeMismatch)
285        }
286    }
287
288    fn parse_match_pattern(
289        map: &Hash,
290        state: &mut ParserState<'_>,
291        contexts: &mut HashMap<String, Context>,
292        namer: &mut ContextNamer,
293    ) -> Result<MatchPattern, ParseSyntaxError> {
294        let raw_regex = get_key(map, "match", |x| x.as_str())?;
295        let regex_str = Self::parse_regex(raw_regex, state)?;
296        // println!("{:?}", regex_str);
297
298        let scope = get_key(map, "scope", |x| x.as_str())
299            .ok()
300            .map(|s| str_to_scopes(s, state.scope_repo))
301            .unwrap_or_else(|| Ok(vec![]))?;
302
303        let captures = if let Ok(map) = get_key(map, "captures", |x| x.as_hash()) {
304            Some(Self::parse_captures(map, &regex_str, state)?)
305        } else {
306            None
307        };
308
309        let mut has_captures = false;
310        let operation = if get_key(map, "pop", Some).is_ok() {
311            // Thanks @wbond for letting me know this is the correct way to check for captures
312            has_captures =
313                state
314                    .backref_regex
315                    .search(&regex_str, 0, regex_str.len(), None, false)?;
316            MatchOperation::Pop
317        } else if let Ok(y) = get_key(map, "push", Some) {
318            MatchOperation::Push(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
319        } else if let Ok(y) = get_key(map, "set", Some) {
320            MatchOperation::Set(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
321        } else if let Ok(y) = get_key(map, "embed", Some) {
322            // Same as push so we translate it to what it would be
323            let mut embed_escape_context_yaml = vec![];
324            let mut commands = Hash::new();
325            commands.insert(
326                Yaml::String("meta_include_prototype".to_string()),
327                Yaml::Boolean(false),
328            );
329            embed_escape_context_yaml.push(Yaml::Hash(commands));
330            if let Ok(s) = get_key(map, "embed_scope", Some) {
331                commands = Hash::new();
332                commands.insert(Yaml::String("meta_content_scope".to_string()), s.clone());
333                embed_escape_context_yaml.push(Yaml::Hash(commands));
334            }
335            if let Ok(v) = get_key(map, "escape", Some) {
336                let mut match_map = Hash::new();
337                match_map.insert(Yaml::String("match".to_string()), v.clone());
338                match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
339                if let Ok(y) = get_key(map, "escape_captures", Some) {
340                    match_map.insert(Yaml::String("captures".to_string()), y.clone());
341                }
342                embed_escape_context_yaml.push(Yaml::Hash(match_map));
343                let escape_context = SyntaxDefinition::parse_context(
344                    &embed_escape_context_yaml,
345                    state,
346                    contexts,
347                    false,
348                    namer,
349                )?;
350                MatchOperation::Push(vec![
351                    ContextReference::Inline(escape_context),
352                    SyntaxDefinition::parse_reference(y, state, contexts, namer, true)?,
353                ])
354            } else {
355                return Err(ParseSyntaxError::MissingMandatoryKey("escape"));
356            }
357        } else {
358            MatchOperation::None
359        };
360
361        let with_prototype = if let Ok(v) = get_key(map, "with_prototype", |x| x.as_vec()) {
362            // should a with_prototype include the prototype? I don't think so.
363            let subname = Self::parse_context(v, state, contexts, true, namer)?;
364            Some(ContextReference::Inline(subname))
365        } else if let Ok(v) = get_key(map, "escape", Some) {
366            let subname = namer.next();
367
368            let mut context = Context::new(false);
369            let mut match_map = Hash::new();
370            match_map.insert(
371                Yaml::String("match".to_string()),
372                Yaml::String(format!("(?={})", v.as_str().unwrap())),
373            );
374            match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
375            let pattern =
376                SyntaxDefinition::parse_match_pattern(&match_map, state, contexts, namer)?;
377            if pattern.has_captures {
378                context.uses_backrefs = true;
379            }
380            context.patterns.push(Pattern::Match(pattern));
381
382            contexts.insert(subname.clone(), context);
383            Some(ContextReference::Inline(subname))
384        } else {
385            None
386        };
387
388        let pattern = MatchPattern::new(
389            has_captures,
390            regex_str,
391            scope,
392            captures,
393            operation,
394            with_prototype,
395        );
396
397        Ok(pattern)
398    }
399
400    fn parse_pushargs(
401        y: &Yaml,
402        state: &mut ParserState<'_>,
403        contexts: &mut HashMap<String, Context>,
404        namer: &mut ContextNamer,
405    ) -> Result<Vec<ContextReference>, ParseSyntaxError> {
406        // check for a push of multiple items
407        if y.as_vec().map_or(false, |v| {
408            !v.is_empty()
409                && (v[0].as_str().is_some()
410                    || (v[0].as_vec().is_some() && v[0].as_vec().unwrap()[0].as_hash().is_some()))
411        }) {
412            // this works because Result implements FromIterator to handle the errors
413            y.as_vec()
414                .unwrap()
415                .iter()
416                .map(|x| SyntaxDefinition::parse_reference(x, state, contexts, namer, false))
417                .collect()
418        } else {
419            let reference = SyntaxDefinition::parse_reference(y, state, contexts, namer, false)?;
420            Ok(vec![reference])
421        }
422    }
423
424    fn parse_regex(raw_regex: &str, state: &ParserState<'_>) -> Result<String, ParseSyntaxError> {
425        let regex = Self::resolve_variables(raw_regex, state)?;
426        let regex = replace_posix_char_classes(regex);
427        let regex = if state.lines_include_newline {
428            regex_for_newlines(regex)
429        } else {
430            // If the passed in strings don't include newlines (unlike Sublime) we can't match on
431            // them using the original regex. So this tries to rewrite the regex in a way that
432            // allows matching against lines without newlines (essentially replacing `\n` with `$`).
433            regex_for_no_newlines(regex)
434        };
435        Self::try_compile_regex(&regex)?;
436        Ok(regex)
437    }
438
439    fn resolve_variables(
440        raw_regex: &str,
441        state: &ParserState<'_>,
442    ) -> Result<String, ParseSyntaxError> {
443        let mut result = String::new();
444        let mut index = 0;
445        let mut region = Region::new();
446        while state.variable_regex.search(
447            raw_regex,
448            index,
449            raw_regex.len(),
450            Some(&mut region),
451            false,
452        )? {
453            let (begin, end) = region.pos(0).unwrap();
454
455            result.push_str(&raw_regex[index..begin]);
456
457            let var_pos = region.pos(1).unwrap();
458            let var_name = &raw_regex[var_pos.0..var_pos.1];
459            let var_raw = state
460                .variables
461                .get(var_name)
462                .map(String::as_ref)
463                .unwrap_or("");
464            let var_resolved = Self::resolve_variables(var_raw, state)?;
465            result.push_str(&var_resolved);
466
467            index = end;
468        }
469        if index < raw_regex.len() {
470            result.push_str(&raw_regex[index..]);
471        }
472        Ok(result)
473    }
474
475    fn try_compile_regex(regex_str: &str) -> Result<(), ParseSyntaxError> {
476        // Replace backreferences with a placeholder value that will also appear in errors
477        let regex_str =
478            substitute_backrefs_in_regex(regex_str, |i| Some(format!("<placeholder_{}>", i)));
479
480        if let Some(error) = Regex::try_compile(&regex_str) {
481            Err(ParseSyntaxError::RegexCompileError(regex_str, error))
482        } else {
483            Ok(())
484        }
485    }
486
487    fn parse_captures(
488        map: &Hash,
489        regex_str: &str,
490        state: &mut ParserState<'_>,
491    ) -> Result<CaptureMapping, ParseSyntaxError> {
492        let valid_indexes = get_consuming_capture_indexes(regex_str);
493        let mut captures = Vec::new();
494        for (key, value) in map.iter() {
495            if let (Some(key_int), Some(val_str)) = (key.as_i64(), value.as_str()) {
496                if valid_indexes.contains(&(key_int as usize)) {
497                    captures.push((key_int as usize, str_to_scopes(val_str, state.scope_repo)?));
498                }
499            }
500        }
501        Ok(captures)
502    }
503
504    /// Sublime treats the top level context slightly differently from
505    /// including the main context from other syntaxes. When main is popped
506    /// it is immediately re-added and when it is `set` over the file level
507    /// scope remains. This behaviour is emulated through some added contexts
508    /// that are the actual top level contexts used in parsing.
509    /// See <https://github.com/trishume/syntect/issues/58> for more.
510    fn add_initial_contexts(
511        contexts: &mut HashMap<String, Context>,
512        state: &mut ParserState<'_>,
513        top_level_scope: Scope,
514    ) {
515        let yaml_docs = YamlLoader::load_from_str(START_CONTEXT).unwrap();
516        let yaml = &yaml_docs[0];
517
518        let start_yaml: &[Yaml] = yaml["__start"].as_vec().unwrap();
519        SyntaxDefinition::parse_context(
520            start_yaml,
521            state,
522            contexts,
523            false,
524            &mut ContextNamer::new("__start"),
525        )
526        .unwrap();
527        if let Some(start) = contexts.get_mut("__start") {
528            start.meta_content_scope = vec![top_level_scope];
529        }
530
531        let main_yaml: &[Yaml] = yaml["__main"].as_vec().unwrap();
532        SyntaxDefinition::parse_context(
533            main_yaml,
534            state,
535            contexts,
536            false,
537            &mut ContextNamer::new("__main"),
538        )
539        .unwrap();
540
541        let meta_include_prototype = contexts["main"].meta_include_prototype;
542        let meta_scope = contexts["main"].meta_scope.clone();
543        let meta_content_scope = contexts["main"].meta_content_scope.clone();
544
545        if let Some(outer_main) = contexts.get_mut("__main") {
546            outer_main.meta_include_prototype = meta_include_prototype;
547            outer_main.meta_scope = meta_scope;
548            outer_main.meta_content_scope = meta_content_scope;
549        }
550
551        // add the top_level_scope as a meta_content_scope to main so
552        // pushes from other syntaxes add the file scope
553        // TODO: this order is not quite correct if main also has a meta_scope
554        if let Some(main) = contexts.get_mut("main") {
555            main.meta_content_scope.insert(0, top_level_scope);
556        }
557    }
558}
559
/// Hands out names for a named context and the anonymous contexts nested inside it.
struct ContextNamer {
    /// Name of the enclosing named context.
    name: String,
    /// Index of the next anonymous name; `None` until the plain name has been handed out.
    anonymous_index: Option<usize>,
}

impl ContextNamer {
    /// Creates a namer whose first name is `name` itself.
    fn new(name: &str) -> ContextNamer {
        ContextNamer {
            name: name.to_string(),
            anonymous_index: None,
        }
    }

    /// Returns the next name: first the plain name, then `#anon_<name>_0`,
    /// `#anon_<name>_1`, and so on.
    fn next(&mut self) -> String {
        match self.anonymous_index {
            None => {
                self.anonymous_index = Some(0);
                self.name.clone()
            }
            Some(index) => {
                self.anonymous_index = Some(index + 1);
                format!("#anon_{}_{}", self.name, index)
            }
        }
    }
}
584
/// Rewrites POSIX character classes into Unicode character classes.
///
/// In fancy-regex, POSIX character classes only match ASCII characters, whereas Sublime's
/// syntaxes expect them to match Unicode characters as well. Only `alpha`, `alnum`,
/// `lower`, `upper` and `digit` are rewritten; everything else is left untouched.
fn replace_posix_char_classes(regex: String) -> String {
    // Applied in this order, one full pass over the string per entry.
    const REPLACEMENTS: [(&str, &str); 5] = [
        ("[:alpha:]", r"\p{L}"),
        ("[:alnum:]", r"\p{L}\p{N}"),
        ("[:lower:]", r"\p{Ll}"),
        ("[:upper:]", r"\p{Lu}"),
        ("[:digit:]", r"\p{Nd}"),
    ];

    let mut rewritten = regex;
    for &(posix, unicode) in REPLACEMENTS.iter() {
        rewritten = rewritten.replace(posix, unicode);
    }
    rewritten
}
597
598/// Some of the regexes include `$` and expect it to match end of line,
599/// e.g. *before* the `\n` in `test\n`.
600///
601/// In fancy-regex, `$` means end of text by default, so that would
602/// match *after* `\n`. Using `(?m:$)` instead means it matches end of line.
603///
604/// Note that we don't want to add a `(?m)` in the beginning to change the
605/// whole regex because that would also change the meaning of `^`. In
606/// fancy-regex, that also matches at the end of e.g. `test\n` which is
607/// different from onig. It would also change `.` to match more.
608fn regex_for_newlines(regex: String) -> String {
609    if !regex.contains('$') {
610        return regex;
611    }
612
613    let rewriter = RegexRewriterForNewlines {
614        parser: Parser::new(regex.as_bytes()),
615    };
616    rewriter.rewrite()
617}
618
619struct RegexRewriterForNewlines<'a> {
620    parser: Parser<'a>,
621}
622
623impl<'a> RegexRewriterForNewlines<'a> {
624    fn rewrite(mut self) -> String {
625        let mut result = Vec::new();
626
627        while let Some(c) = self.parser.peek() {
628            match c {
629                b'$' => {
630                    self.parser.next();
631                    result.extend_from_slice(br"(?m:$)");
632                }
633                b'\\' => {
634                    self.parser.next();
635                    result.push(c);
636                    if let Some(c2) = self.parser.peek() {
637                        self.parser.next();
638                        result.push(c2);
639                    }
640                }
641                b'[' => {
642                    let (mut content, _) = self.parser.parse_character_class();
643                    result.append(&mut content);
644                }
645                _ => {
646                    self.parser.next();
647                    result.push(c);
648                }
649            }
650        }
651        String::from_utf8(result).unwrap()
652    }
653}
654
655/// Rewrite a regex that matches `\n` to one that matches `$` (end of line) instead.
656/// That allows the regex to be used to match lines that don't include a trailing newline character.
657///
658/// The reason we're doing this is because the regexes in the syntax definitions assume that the
659/// lines that are being matched on include a trailing newline.
660///
661/// Note that the rewrite is just an approximation and there's a couple of cases it can not handle,
662/// due to `$` being an anchor whereas `\n` matches a character.
663fn regex_for_no_newlines(regex: String) -> String {
664    if !regex.contains(r"\n") {
665        return regex;
666    }
667
668    // A special fix to rewrite a pattern from the `Rd` syntax that the RegexRewriter can not
669    // handle properly.
670    let regex = regex.replace("(?:\\n)?", "(?:$|)");
671
672    let rewriter = RegexRewriterForNoNewlines {
673        parser: Parser::new(regex.as_bytes()),
674    };
675    rewriter.rewrite()
676}
677
678struct RegexRewriterForNoNewlines<'a> {
679    parser: Parser<'a>,
680}
681
682impl<'a> RegexRewriterForNoNewlines<'a> {
683    fn rewrite(mut self) -> String {
684        let mut result = Vec::new();
685        while let Some(c) = self.parser.peek() {
686            match c {
687                b'\\' => {
688                    self.parser.next();
689                    if let Some(c2) = self.parser.peek() {
690                        self.parser.next();
691                        // Replacing `\n` with `$` in `\n?` or `\n+` would make parsing later fail
692                        // with "target of repeat operator is invalid"
693                        let c3 = self.parser.peek();
694                        if c2 == b'n' && c3 != Some(b'?') && c3 != Some(b'+') && c3 != Some(b'*') {
695                            result.extend_from_slice(b"$");
696                        } else {
697                            result.push(c);
698                            result.push(c2);
699                        }
700                    } else {
701                        result.push(c);
702                    }
703                }
704                b'[' => {
705                    let (mut content, matches_newline) = self.parser.parse_character_class();
706                    if matches_newline && self.parser.peek() != Some(b'?') {
707                        result.extend_from_slice(b"(?:");
708                        result.append(&mut content);
709                        result.extend_from_slice(br"|$)");
710                    } else {
711                        result.append(&mut content);
712                    }
713                }
714                _ => {
715                    self.parser.next();
716                    result.push(c);
717                }
718            }
719        }
720        String::from_utf8(result).unwrap()
721    }
722}
723
724fn get_consuming_capture_indexes(regex: &str) -> Vec<usize> {
725    let parser = ConsumingCaptureIndexParser {
726        parser: Parser::new(regex.as_bytes()),
727    };
728    parser.get_consuming_capture_indexes()
729}
730
731struct ConsumingCaptureIndexParser<'a> {
732    parser: Parser<'a>,
733}
734
735impl<'a> ConsumingCaptureIndexParser<'a> {
736    /// Find capture groups which are not inside lookarounds.
737    ///
738    /// If, in a YAML syntax definition, a scope stack is applied to a capture group inside a
739    /// lookaround, (i.e. "captures:\n x: scope.stack goes.here", where "x" is the number of a
740    /// capture group in a lookahead/behind), those those scopes are not applied, so no need to
741    /// even parse them.
742    fn get_consuming_capture_indexes(mut self) -> Vec<usize> {
743        let mut result = Vec::new();
744        let mut stack = Vec::new();
745        let mut cap_num = 0;
746        let mut in_lookaround = false;
747        stack.push(in_lookaround);
748        result.push(cap_num);
749
750        while let Some(c) = self.parser.peek() {
751            match c {
752                b'\\' => {
753                    self.parser.next();
754                    self.parser.next();
755                }
756                b'[' => {
757                    self.parser.parse_character_class();
758                }
759                b'(' => {
760                    self.parser.next();
761                    // add the current lookaround state to the stack so we can just pop at a closing paren
762                    stack.push(in_lookaround);
763                    if let Some(c2) = self.parser.peek() {
764                        if c2 != b'?' {
765                            // simple numbered capture group
766                            cap_num += 1;
767                            // if we are not currently in a lookaround,
768                            // add this capture group number to the valid ones
769                            if !in_lookaround {
770                                result.push(cap_num);
771                            }
772                        } else {
773                            self.parser.next();
774                            if let Some(c3) = self.parser.peek() {
775                                self.parser.next();
776                                if c3 == b'=' || c3 == b'!' {
777                                    // lookahead
778                                    in_lookaround = true;
779                                } else if c3 == b'<' {
780                                    if let Some(c4) = self.parser.peek() {
781                                        if c4 == b'=' || c4 == b'!' {
782                                            self.parser.next();
783                                            // lookbehind
784                                            in_lookaround = true;
785                                        }
786                                    }
787                                } else if c3 == b'P' {
788                                    if let Some(c4) = self.parser.peek() {
789                                        if c4 == b'<' {
790                                            // named capture group
791                                            cap_num += 1;
792                                            // if we are not currently in a lookaround,
793                                            // add this capture group number to the valid ones
794                                            if !in_lookaround {
795                                                result.push(cap_num);
796                                            }
797                                        }
798                                    }
799                                }
800                            }
801                        }
802                    }
803                }
804                b')' => {
805                    if let Some(value) = stack.pop() {
806                        in_lookaround = value;
807                    }
808                    self.parser.next();
809                }
810                _ => {
811                    self.parser.next();
812                }
813            }
814        }
815        result
816    }
817}
818
/// Minimal byte-level cursor over a regex pattern.
///
/// It only knows how to look at the current byte, step forward, and skip over
/// a bracketed character class; everything else is left to its users.
struct Parser<'a> {
    /// Raw bytes of the pattern being scanned.
    bytes: &'a [u8],
    /// Offset into `bytes` of the byte that `peek` returns next.
    index: usize,
}

impl<'a> Parser<'a> {
    /// Creates a cursor positioned on the first byte of `bytes`.
    ///
    /// The returned parser borrows `bytes` for `'a`; spelling the lifetime out
    /// (rather than returning a bare `Parser`) keeps the borrow visible in the
    /// signature.
    fn new(bytes: &'a [u8]) -> Self {
        Self { bytes, index: 0 }
    }

    /// Returns the byte under the cursor without consuming it, or `None` once
    /// the cursor is past the end of the input.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.index).copied()
    }

    /// Moves the cursor forward by one byte.
    ///
    /// No bounds check is performed here; `peek` simply yields `None` when the
    /// cursor has moved beyond the last byte.
    fn next(&mut self) {
        self.index += 1;
    }

    /// Consumes a character class starting at the cursor and returns its bytes.
    ///
    /// The byte under the cursor is consumed unconditionally and recorded as
    /// the opening `[`. Scanning stops after the `]` that closes the outermost
    /// class, or at the end of the input if the class is unterminated.
    ///
    /// Returns `(content, matches_newline)` where `content` is the class text
    /// including its brackets, and `matches_newline` is `true` only when an
    /// escaped `\n` appears directly in a non-negated, non-nested class.
    fn parse_character_class(&mut self) -> (Vec<u8>, bool) {
        let mut content = Vec::new();
        let mut negated = false;
        let mut nesting = 0;
        let mut matches_newline = false;

        self.next();
        content.push(b'[');
        if let Some(b'^') = self.peek() {
            self.next();
            content.push(b'^');
            negated = true;
        }

        // An unescaped `]` is allowed after `[` or `[^` and doesn't mean the end of the class.
        if let Some(b']') = self.peek() {
            self.next();
            content.push(b']');
        }

        while let Some(c) = self.peek() {
            match c {
                b'\\' => {
                    // Copy the backslash and, if present, the byte it escapes.
                    self.next();
                    content.push(c);
                    if let Some(c2) = self.peek() {
                        self.next();
                        // Nested or negated classes are deliberately not analysed.
                        if c2 == b'n' && !negated && nesting == 0 {
                            matches_newline = true;
                        }
                        content.push(c2);
                    }
                }
                b'[' => {
                    self.next();
                    content.push(b'[');
                    nesting += 1;
                }
                b']' => {
                    self.next();
                    content.push(b']');
                    // A `]` at depth zero closes the class we were asked to parse.
                    if nesting == 0 {
                        break;
                    }
                    nesting -= 1;
                }
                _ => {
                    self.next();
                    content.push(c);
                }
            }
        }

        (content, matches_newline)
    }
}
893
894#[cfg(test)]
895mod tests {
896    use super::*;
897    use crate::parsing::Scope;
898
899    #[test]
900    fn can_parse() {
901        let defn: SyntaxDefinition = SyntaxDefinition::load_from_str(
902            "name: C\nscope: source.c\ncontexts: {main: []}",
903            false,
904            None,
905        )
906        .unwrap();
907        assert_eq!(defn.name, "C");
908        assert_eq!(defn.scope, Scope::new("source.c").unwrap());
909        let exts_empty: Vec<String> = Vec::new();
910        assert_eq!(defn.file_extensions, exts_empty);
911        assert!(!defn.hidden);
912        assert!(defn.variables.is_empty());
913        let defn2: SyntaxDefinition = SyntaxDefinition::load_from_str(
914            "
915        name: C
916        scope: source.c
917        file_extensions: [c, h]
918        hidden_file_extensions: [k, l]
919        hidden: true
920        variables:
921          ident: '[QY]+'
922        contexts:
923          prototype:
924            - match: lol
925              scope: source.php
926          main:
927            - match: \\b(if|else|for|while|{{ident}})\\b
928              scope: keyword.control.c keyword.looping.c
929              captures:
930                  1: meta.preprocessor.c++
931                  2: keyword.control.include.c++
932              push: [string, 'scope:source.c#main', 'CSS.sublime-syntax#rule-list-body']
933              with_prototype:
934                - match: wow
935                  pop: true
936            - match: '\"'
937              push: string
938          string:
939            - meta_scope: string.quoted.double.c
940            - meta_include_prototype: false
941            - match: \\\\.
942              scope: constant.character.escape.c
943            - match: '\"'
944              pop: true
945        ",
946            false,
947            None,
948        )
949        .unwrap();
950        assert_eq!(defn2.name, "C");
951        let top_level_scope = Scope::new("source.c").unwrap();
952        assert_eq!(defn2.scope, top_level_scope);
953        let exts: Vec<String> = vec!["c", "h", "k", "l"]
954            .into_iter()
955            .map(String::from)
956            .collect();
957        assert_eq!(defn2.file_extensions, exts);
958        assert!(defn2.hidden);
959        assert_eq!(defn2.variables.get("ident").unwrap(), "[QY]+");
960
961        let n: Vec<Scope> = Vec::new();
962        println!("{:?}", defn2);
963        // unreachable!();
964        let main = &defn2.contexts["main"];
965        assert_eq!(main.meta_content_scope, vec![top_level_scope]);
966        assert_eq!(main.meta_scope, n);
967        assert!(main.meta_include_prototype);
968
969        assert_eq!(defn2.contexts["__main"].meta_content_scope, n);
970        assert_eq!(
971            defn2.contexts["__start"].meta_content_scope,
972            vec![top_level_scope]
973        );
974
975        assert_eq!(
976            defn2.contexts["string"].meta_scope,
977            vec![Scope::new("string.quoted.double.c").unwrap()]
978        );
979        let first_pattern: &Pattern = &main.patterns[0];
980        match *first_pattern {
981            Pattern::Match(ref match_pat) => {
982                let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
983                assert_eq!(
984                    &m[0],
985                    &(1, vec![Scope::new("meta.preprocessor.c++").unwrap()])
986                );
987                use crate::parsing::syntax_definition::ContextReference::*;
988
989                // this is sadly necessary because Context is not Eq because of the Regex
990                let expected = MatchOperation::Push(vec![
991                    Named("string".to_owned()),
992                    ByScope {
993                        scope: Scope::new("source.c").unwrap(),
994                        sub_context: Some("main".to_owned()),
995                        with_escape: false,
996                    },
997                    File {
998                        name: "CSS".to_owned(),
999                        sub_context: Some("rule-list-body".to_owned()),
1000                        with_escape: false,
1001                    },
1002                ]);
1003                assert_eq!(
1004                    format!("{:?}", match_pat.operation),
1005                    format!("{:?}", expected)
1006                );
1007
1008                assert_eq!(
1009                    match_pat.scope,
1010                    vec![
1011                        Scope::new("keyword.control.c").unwrap(),
1012                        Scope::new("keyword.looping.c").unwrap()
1013                    ]
1014                );
1015
1016                assert!(match_pat.with_prototype.is_some());
1017            }
1018            _ => unreachable!(),
1019        }
1020    }
1021
1022    #[test]
1023    fn can_parse_embed_as_with_prototypes() {
1024        let old_def = SyntaxDefinition::load_from_str(r#"
1025        name: C
1026        scope: source.c
1027        file_extensions: [c, h]
1028        variables:
1029          ident: '[QY]+'
1030        contexts:
1031          main:
1032            - match: '(>)\s*'
1033              captures:
1034                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1035              push:
1036                - [{ meta_include_prototype: false }, { meta_content_scope: 'source.css.embedded.html' }, { match: '(?i)(?=</style)', pop: true }]
1037                - scope:source.css
1038              with_prototype:
1039                - match: (?=(?i)(?=</style))
1040                  pop: true
1041        "#,false, None).unwrap();
1042
1043        let mut def_with_embed = SyntaxDefinition::load_from_str(
1044            r#"
1045        name: C
1046        scope: source.c
1047        file_extensions: [c, h]
1048        variables:
1049          ident: '[QY]+'
1050        contexts:
1051          main:
1052            - match: '(>)\s*'
1053              captures:
1054                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1055              embed: scope:source.css
1056              embed_scope: source.css.embedded.html
1057              escape: (?i)(?=</style)
1058        "#,
1059            false,
1060            None,
1061        )
1062        .unwrap();
1063
1064        // We will soon do an `assert_eq!()`. But there is one difference we must expect, namely
1065        // that for `def_with_embed`, the value of `ContextReference::ByScope::with_escape` will be
1066        // `true`, whereas for `old_def` it will be `false`. So manually adjust `with_escape` to
1067        // `false` so that `assert_eq!()` will work.
1068        let def_with_embed_context = def_with_embed.contexts.get_mut("main").unwrap();
1069        if let Pattern::Match(ref mut match_pattern) = def_with_embed_context.patterns[0] {
1070            if let MatchOperation::Push(ref mut context_references) = match_pattern.operation {
1071                if let ContextReference::ByScope {
1072                    ref mut with_escape,
1073                    ..
1074                } = context_references[1]
1075                {
1076                    *with_escape = false;
1077                }
1078            }
1079        }
1080
1081        assert_eq!(old_def.contexts["main"], def_with_embed.contexts["main"]);
1082    }
1083
1084    #[test]
1085    fn errors_on_embed_without_escape() {
1086        let def = SyntaxDefinition::load_from_str(
1087            r#"
1088        name: C
1089        scope: source.c
1090        file_extensions: [c, h]
1091        variables:
1092          ident: '[QY]+'
1093        contexts:
1094          main:
1095            - match: '(>)\s*'
1096              captures:
1097                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1098              embed: scope:source.css
1099              embed_scope: source.css.embedded.html
1100        "#,
1101            false,
1102            None,
1103        );
1104        assert!(def.is_err());
1105        match def.unwrap_err() {
1106            ParseSyntaxError::MissingMandatoryKey(key) => assert_eq!(key, "escape"),
1107            _ => unreachable!("Got unexpected ParseSyntaxError"),
1108        }
1109    }
1110
1111    #[test]
1112    fn errors_on_regex_compile_error() {
1113        let def = SyntaxDefinition::load_from_str(
1114            r#"
1115        name: C
1116        scope: source.c
1117        file_extensions: [test]
1118        contexts:
1119          main:
1120            - match: '[a'
1121              scope: keyword.name
1122        "#,
1123            false,
1124            None,
1125        );
1126        assert!(def.is_err());
1127        match def.unwrap_err() {
1128            ParseSyntaxError::RegexCompileError(ref regex, _) => assert_eq!("[a", regex),
1129            _ => unreachable!("Got unexpected ParseSyntaxError"),
1130        }
1131    }
1132
1133    #[test]
1134    fn can_parse_ugly_yaml() {
1135        let defn: SyntaxDefinition = SyntaxDefinition::load_from_str(
1136            "
1137        name: LaTeX
1138        scope: text.tex.latex
1139        contexts:
1140          main:
1141            - match: '((\\\\)(?:framebox|makebox))\\b'
1142              captures:
1143                1: support.function.box.latex
1144                2: punctuation.definition.backslash.latex
1145              push:
1146                - [{meta_scope: meta.function.box.latex}, {match: '', pop: true}]
1147                - argument
1148                - optional-arguments
1149          argument:
1150            - match: '\\{'
1151              scope: punctuation.definition.group.brace.begin.latex
1152            - match: '(?=\\S)'
1153              pop: true
1154          optional-arguments:
1155            - match: '(?=\\S)'
1156              pop: true
1157        ",
1158            false,
1159            None,
1160        )
1161        .unwrap();
1162        assert_eq!(defn.name, "LaTeX");
1163        let top_level_scope = Scope::new("text.tex.latex").unwrap();
1164        assert_eq!(defn.scope, top_level_scope);
1165
1166        let first_pattern: &Pattern = &defn.contexts["main"].patterns[0];
1167        match *first_pattern {
1168            Pattern::Match(ref match_pat) => {
1169                let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
1170                assert_eq!(
1171                    &m[0],
1172                    &(1, vec![Scope::new("support.function.box.latex").unwrap()])
1173                );
1174
1175                //use parsing::syntax_definition::ContextReference::*;
1176                // TODO: check the first pushed reference is Inline(...) and has a meta_scope of meta.function.box.latex
1177                // TODO: check the second pushed reference is Named("argument".to_owned())
1178                // TODO: check the third pushed reference is Named("optional-arguments".to_owned())
1179
1180                assert!(match_pat.with_prototype.is_none());
1181            }
1182            _ => unreachable!(),
1183        }
1184    }
1185
1186    #[test]
1187    fn names_anonymous_contexts() {
1188        let def = SyntaxDefinition::load_from_str(
1189            r#"
1190            scope: source.c
1191            contexts:
1192              main:
1193                - match: a
1194                  push: a
1195              a:
1196                - meta_scope: a
1197                - match: x
1198                  push:
1199                    - meta_scope: anonymous_x
1200                    - match: anything
1201                      push:
1202                        - meta_scope: anonymous_x_2
1203                - match: y
1204                  push:
1205                    - meta_scope: anonymous_y
1206                - match: z
1207                  escape: 'test'
1208            "#,
1209            false,
1210            None,
1211        )
1212        .unwrap();
1213
1214        assert_eq!(def.contexts["a"].meta_scope, vec![Scope::new("a").unwrap()]);
1215        assert_eq!(
1216            def.contexts["#anon_a_0"].meta_scope,
1217            vec![Scope::new("anonymous_x").unwrap()]
1218        );
1219        assert_eq!(
1220            def.contexts["#anon_a_1"].meta_scope,
1221            vec![Scope::new("anonymous_x_2").unwrap()]
1222        );
1223        assert_eq!(
1224            def.contexts["#anon_a_2"].meta_scope,
1225            vec![Scope::new("anonymous_y").unwrap()]
1226        );
1227        assert_eq!(def.contexts["#anon_a_3"].patterns.len(), 1); // escape
1228    }
1229
1230    #[test]
1231    fn can_use_fallback_name() {
1232        let def = SyntaxDefinition::load_from_str(
1233            r#"
1234        scope: source.c
1235        contexts:
1236          main:
1237            - match: ''
1238        "#,
1239            false,
1240            Some("C"),
1241        );
1242        assert_eq!(def.unwrap().name, "C");
1243    }
1244
1245    #[test]
1246    fn can_rewrite_regex_for_newlines() {
1247        fn rewrite(s: &str) -> String {
1248            regex_for_newlines(s.to_string())
1249        }
1250
1251        assert_eq!(&rewrite(r"a"), r"a");
1252        assert_eq!(&rewrite(r"\b"), r"\b");
1253        assert_eq!(&rewrite(r"(a)"), r"(a)");
1254        assert_eq!(&rewrite(r"[a]"), r"[a]");
1255        assert_eq!(&rewrite(r"[^a]"), r"[^a]");
1256        assert_eq!(&rewrite(r"[]a]"), r"[]a]");
1257        assert_eq!(&rewrite(r"[[a]]"), r"[[a]]");
1258
1259        assert_eq!(&rewrite(r"^"), r"^");
1260        assert_eq!(&rewrite(r"$"), r"(?m:$)");
1261        assert_eq!(&rewrite(r"^ab$"), r"^ab(?m:$)");
1262        assert_eq!(&rewrite(r"\^ab\$"), r"\^ab\$");
1263        assert_eq!(&rewrite(r"(//).*$"), r"(//).*(?m:$)");
1264
1265        // Do not rewrite this `$` because it's in a char class and doesn't mean end of line
1266        assert_eq!(&rewrite(r"[a$]"), r"[a$]");
1267    }
1268
1269    #[test]
1270    fn can_rewrite_regex_for_no_newlines() {
1271        fn rewrite(s: &str) -> String {
1272            regex_for_no_newlines(s.to_string())
1273        }
1274
1275        assert_eq!(&rewrite(r"a"), r"a");
1276        assert_eq!(&rewrite(r"\b"), r"\b");
1277        assert_eq!(&rewrite(r"(a)"), r"(a)");
1278        assert_eq!(&rewrite(r"[a]"), r"[a]");
1279        assert_eq!(&rewrite(r"[^a]"), r"[^a]");
1280        assert_eq!(&rewrite(r"[]a]"), r"[]a]");
1281        assert_eq!(&rewrite(r"[[a]]"), r"[[a]]");
1282
1283        assert_eq!(&rewrite(r"\n"), r"$");
1284        assert_eq!(&rewrite(r"\[\n"), r"\[$");
1285        assert_eq!(&rewrite(r"a\n?"), r"a\n?");
1286        assert_eq!(&rewrite(r"a\n+"), r"a\n+");
1287        assert_eq!(&rewrite(r"a\n*"), r"a\n*");
1288        assert_eq!(&rewrite(r"[abc\n]"), r"(?:[abc\n]|$)");
1289        assert_eq!(&rewrite(r"[^\n]"), r"[^\n]");
1290        assert_eq!(&rewrite(r"[^]\n]"), r"[^]\n]");
1291        assert_eq!(&rewrite(r"[\n]?"), r"[\n]?");
1292        // Removing the `\n` might result in an empty character class, so we should leave it.
1293        assert_eq!(&rewrite(r"[\n]"), r"(?:[\n]|$)");
1294        assert_eq!(&rewrite(r"[]\n]"), r"(?:[]\n]|$)");
1295        // In order to properly understand nesting, we'd have to have a full parser, so ignore it.
1296        assert_eq!(&rewrite(r"[[a]&&[\n]]"), r"[[a]&&[\n]]");
1297
1298        assert_eq!(&rewrite(r"ab(?:\n)?"), r"ab(?:$|)");
1299        assert_eq!(&rewrite(r"(?<!\n)ab"), r"(?<!$)ab");
1300        assert_eq!(&rewrite(r"(?<=\n)ab"), r"(?<=$)ab");
1301    }
1302
1303    #[test]
1304    fn can_get_valid_captures_from_regex() {
1305        let regex = "hello(test)(?=(world))(foo(?P<named>bar))";
1306        println!("{:?}", regex);
1307        let valid_indexes = get_consuming_capture_indexes(regex);
1308        println!("{:?}", valid_indexes);
1309        assert_eq!(valid_indexes, [0, 1, 3, 4]);
1310    }
1311
1312    #[test]
1313    fn can_get_valid_captures_from_regex2() {
1314        let regex = "hello(test)[(?=tricked](foo(bar))";
1315        println!("{:?}", regex);
1316        let valid_indexes = get_consuming_capture_indexes(regex);
1317        println!("{:?}", valid_indexes);
1318        assert_eq!(valid_indexes, [0, 1, 2, 3]);
1319    }
1320
1321    #[test]
1322    fn can_get_valid_captures_from_nested_regex() {
1323        let regex = "hello(test)(?=(world(?!(te(?<=(st))))))(foo(bar))";
1324        println!("{:?}", regex);
1325        let valid_indexes = get_consuming_capture_indexes(regex);
1326        println!("{:?}", valid_indexes);
1327        assert_eq!(valid_indexes, [0, 1, 5, 6]);
1328    }
1329}