Skip to main content

tree_sitter_generate/
parse_grammar.rs

1use std::collections::HashSet;
2
3use log::warn;
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use serde_json::{Map, Value};
7use thiserror::Error;
8
9use crate::{
10    grammars::{InputGrammar, PrecedenceEntry, ReservedWordContext, Variable, VariableType},
11    rules::{Precedence, Rule},
12};
13
14#[derive(Deserialize)]
15#[serde(tag = "type")]
16#[allow(non_camel_case_types)]
17#[allow(clippy::upper_case_acronyms)]
18enum RuleJSON {
19    ALIAS {
20        content: Box<Self>,
21        named: bool,
22        value: String,
23    },
24    BLANK,
25    STRING {
26        value: String,
27    },
28    PATTERN {
29        value: String,
30        flags: Option<String>,
31    },
32    SYMBOL {
33        name: String,
34    },
35    CHOICE {
36        members: Vec<Self>,
37    },
38    FIELD {
39        name: String,
40        content: Box<Self>,
41    },
42    SEQ {
43        members: Vec<Self>,
44    },
45    REPEAT {
46        content: Box<Self>,
47    },
48    REPEAT1 {
49        content: Box<Self>,
50    },
51    PREC_DYNAMIC {
52        value: i32,
53        content: Box<Self>,
54    },
55    PREC_LEFT {
56        value: PrecedenceValueJSON,
57        content: Box<Self>,
58    },
59    PREC_RIGHT {
60        value: PrecedenceValueJSON,
61        content: Box<Self>,
62    },
63    PREC {
64        value: PrecedenceValueJSON,
65        content: Box<Self>,
66    },
67    TOKEN {
68        content: Box<Self>,
69    },
70    IMMEDIATE_TOKEN {
71        content: Box<Self>,
72    },
73    RESERVED {
74        context_name: String,
75        content: Box<Self>,
76    },
77}
78
79#[derive(Deserialize)]
80#[serde(untagged)]
81enum PrecedenceValueJSON {
82    Integer(i32),
83    Name(String),
84}
85
86#[derive(Deserialize)]
87pub struct GrammarJSON {
88    pub name: String,
89    rules: Map<String, Value>,
90    #[serde(default)]
91    precedences: Vec<Vec<RuleJSON>>,
92    #[serde(default)]
93    conflicts: Vec<Vec<String>>,
94    #[serde(default)]
95    externals: Vec<RuleJSON>,
96    #[serde(default)]
97    extras: Vec<RuleJSON>,
98    #[serde(default)]
99    inline: Vec<String>,
100    #[serde(default)]
101    supertypes: Vec<String>,
102    #[serde(default)]
103    word: Option<String>,
104    #[serde(default)]
105    reserved: Map<String, Value>,
106}
107
108pub type ParseGrammarResult<T> = Result<T, ParseGrammarError>;
109
110#[derive(Debug, Error, Serialize)]
111pub enum ParseGrammarError {
112    #[error("{0}")]
113    Serialization(String),
114    #[error("Rules in the `extras` array must not contain empty strings")]
115    InvalidExtra,
116    #[error("Invalid rule in precedences array. Only strings and symbols are allowed")]
117    Unexpected,
118    #[error("Reserved word sets must be arrays")]
119    InvalidReservedWordSet,
120    #[error("Grammar Error: Unexpected rule `{0}` in `token()` call")]
121    UnexpectedRule(String),
122}
123
124impl From<serde_json::Error> for ParseGrammarError {
125    fn from(value: serde_json::Error) -> Self {
126        Self::Serialization(value.to_string())
127    }
128}
129
130/// Check if a rule is referenced by another rule.
131///
132/// This function is used to determine if a variable is used in a given rule,
133/// and `is_other` indicates if the rule is an external, and if it is,
134/// to not assume that a named symbol that is equal to itself means it's being referenced.
135///
136/// For example, if we have an external rule **and** a normal rule both called `foo`,
137/// `foo` should not be thought of as directly used unless it's used within another rule.
138fn rule_is_referenced(rule: &Rule, target: &str, is_external: bool) -> bool {
139    match rule {
140        Rule::NamedSymbol(name) => name == target && !is_external,
141        Rule::Choice(rules) | Rule::Seq(rules) => {
142            rules.iter().any(|r| rule_is_referenced(r, target, false))
143        }
144        Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
145            rule_is_referenced(rule, target, is_external)
146        }
147        Rule::Repeat(inner) => rule_is_referenced(inner, target, false),
148        Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
149    }
150}
151
152fn variable_is_used(
153    grammar_rules: &[(String, Rule)],
154    extras: &[Rule],
155    externals: &[Rule],
156    target_name: &str,
157    in_progress: &mut HashSet<String>,
158) -> bool {
159    let root = &grammar_rules.first().unwrap().0;
160    if target_name == root {
161        return true;
162    }
163
164    if extras
165        .iter()
166        .any(|rule| rule_is_referenced(rule, target_name, false))
167    {
168        return true;
169    }
170
171    if externals
172        .iter()
173        .any(|rule| rule_is_referenced(rule, target_name, true))
174    {
175        return true;
176    }
177
178    in_progress.insert(target_name.to_string());
179    let result = grammar_rules
180        .iter()
181        .filter(|(key, _)| *key != target_name)
182        .any(|(name, rule)| {
183            if !rule_is_referenced(rule, target_name, false) || in_progress.contains(name) {
184                return false;
185            }
186            variable_is_used(grammar_rules, extras, externals, name, in_progress)
187        });
188    in_progress.remove(target_name);
189
190    result
191}
192
193pub(crate) fn parse_grammar(input: &str) -> ParseGrammarResult<InputGrammar> {
194    let mut grammar_json = serde_json::from_str::<GrammarJSON>(input)?;
195
196    let mut extra_symbols =
197        grammar_json
198            .extras
199            .into_iter()
200            .try_fold(Vec::<Rule>::new(), |mut acc, item| {
201                let rule = parse_rule(item, false)?;
202                if let Rule::String(ref value) = rule {
203                    if value.is_empty() {
204                        Err(ParseGrammarError::InvalidExtra)?;
205                    }
206                }
207                acc.push(rule);
208                ParseGrammarResult::Ok(acc)
209            })?;
210
211    let mut external_tokens = grammar_json
212        .externals
213        .into_iter()
214        .map(|e| parse_rule(e, false))
215        .collect::<ParseGrammarResult<Vec<_>>>()?;
216
217    let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
218    for list in grammar_json.precedences {
219        let mut ordering = Vec::with_capacity(list.len());
220        for entry in list {
221            ordering.push(match entry {
222                RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
223                RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
224                _ => Err(ParseGrammarError::Unexpected)?,
225            });
226        }
227        precedence_orderings.push(ordering);
228    }
229
230    let mut variables = Vec::with_capacity(grammar_json.rules.len());
231
232    let rules = grammar_json
233        .rules
234        .into_iter()
235        .map(|(n, r)| Ok((n, parse_rule(serde_json::from_value(r)?, false)?)))
236        .collect::<ParseGrammarResult<Vec<_>>>()?;
237
238    let mut in_progress = HashSet::new();
239
240    for (name, rule) in &rules {
241        if grammar_json.word.as_ref().is_none_or(|w| w != name)
242            && !variable_is_used(
243                &rules,
244                &extra_symbols,
245                &external_tokens,
246                name,
247                &mut in_progress,
248            )
249        {
250            grammar_json.conflicts.retain(|r| !r.contains(name));
251            grammar_json.supertypes.retain(|r| r != name);
252            grammar_json.inline.retain(|r| r != name);
253            extra_symbols.retain(|r| !rule_is_referenced(r, name, true));
254            external_tokens.retain(|r| !rule_is_referenced(r, name, true));
255            precedence_orderings.retain(|r| {
256                !r.iter().any(|e| {
257                    let PrecedenceEntry::Symbol(s) = e else {
258                        return false;
259                    };
260                    s == name
261                })
262            });
263            continue;
264        }
265
266        if extra_symbols
267            .iter()
268            .any(|r| rule_is_referenced(r, name, false))
269        {
270            let inner_rule = if let Rule::Metadata { rule, .. } = rule {
271                rule
272            } else {
273                rule
274            };
275            let matches_empty = match inner_rule {
276                Rule::String(rule_str) => rule_str.is_empty(),
277                Rule::Pattern(ref value, _) => Regex::new(value).is_ok_and(|reg| reg.is_match("")),
278                _ => false,
279            };
280            if matches_empty {
281                warn!(
282                    concat!(
283                        "Named extra rule `{}` matches the empty string. ",
284                        "Inline this to avoid infinite loops while parsing."
285                    ),
286                    name
287                );
288            }
289        }
290        variables.push(Variable {
291            name: name.clone(),
292            kind: VariableType::Named,
293            rule: rule.clone(),
294        });
295    }
296
297    let reserved_words = grammar_json
298        .reserved
299        .into_iter()
300        .map(|(name, rule_values)| {
301            let Value::Array(rule_values) = rule_values else {
302                Err(ParseGrammarError::InvalidReservedWordSet)?
303            };
304
305            let mut reserved_words = Vec::with_capacity(rule_values.len());
306            for value in rule_values {
307                reserved_words.push(parse_rule(serde_json::from_value(value)?, false)?);
308            }
309            Ok(ReservedWordContext {
310                name,
311                reserved_words,
312            })
313        })
314        .collect::<ParseGrammarResult<Vec<_>>>()?;
315
316    Ok(InputGrammar {
317        name: grammar_json.name,
318        word_token: grammar_json.word,
319        expected_conflicts: grammar_json.conflicts,
320        supertype_symbols: grammar_json.supertypes,
321        variables_to_inline: grammar_json.inline,
322        precedence_orderings,
323        variables,
324        extra_symbols,
325        external_tokens,
326        reserved_words,
327    })
328}
329
330fn parse_rule(json: RuleJSON, is_token: bool) -> ParseGrammarResult<Rule> {
331    match json {
332        RuleJSON::ALIAS {
333            content,
334            value,
335            named,
336        } => parse_rule(*content, is_token).map(|r| Rule::alias(r, value, named)),
337        RuleJSON::BLANK => Ok(Rule::Blank),
338        RuleJSON::STRING { value } => Ok(Rule::String(value)),
339        RuleJSON::PATTERN { value, flags } => Ok(Rule::Pattern(
340            value,
341            flags.map_or(String::new(), |f| {
342                f.matches(|c| {
343                    if c == 'i' {
344                        true
345                    } else {
346                        // silently ignore unicode flags
347                        if c != 'u' && c != 'v' {
348                            warn!("unsupported flag {c}");
349                        }
350                        false
351                    }
352                })
353                .collect()
354            }),
355        )),
356        RuleJSON::SYMBOL { name } => {
357            if is_token {
358                Err(ParseGrammarError::UnexpectedRule(name))?
359            } else {
360                Ok(Rule::NamedSymbol(name))
361            }
362        }
363        RuleJSON::CHOICE { members } => members
364            .into_iter()
365            .map(|m| parse_rule(m, is_token))
366            .collect::<ParseGrammarResult<Vec<_>>>()
367            .map(Rule::choice),
368        RuleJSON::FIELD { content, name } => {
369            parse_rule(*content, is_token).map(|r| Rule::field(name, r))
370        }
371        RuleJSON::SEQ { members } => members
372            .into_iter()
373            .map(|m| parse_rule(m, is_token))
374            .collect::<ParseGrammarResult<Vec<_>>>()
375            .map(Rule::seq),
376        RuleJSON::REPEAT1 { content } => parse_rule(*content, is_token).map(Rule::repeat),
377        RuleJSON::REPEAT { content } => {
378            parse_rule(*content, is_token).map(|m| Rule::choice(vec![Rule::repeat(m), Rule::Blank]))
379        }
380        RuleJSON::PREC { value, content } => {
381            parse_rule(*content, is_token).map(|r| Rule::prec(value.into(), r))
382        }
383        RuleJSON::PREC_LEFT { value, content } => {
384            parse_rule(*content, is_token).map(|r| Rule::prec_left(value.into(), r))
385        }
386        RuleJSON::PREC_RIGHT { value, content } => {
387            parse_rule(*content, is_token).map(|r| Rule::prec_right(value.into(), r))
388        }
389        RuleJSON::PREC_DYNAMIC { value, content } => {
390            parse_rule(*content, is_token).map(|r| Rule::prec_dynamic(value, r))
391        }
392        RuleJSON::RESERVED {
393            content,
394            context_name,
395        } => parse_rule(*content, is_token).map(|r| Rule::Reserved {
396            rule: Box::new(r),
397            context_name,
398        }),
399        RuleJSON::TOKEN { content } => parse_rule(*content, true).map(Rule::token),
400        RuleJSON::IMMEDIATE_TOKEN { content } => {
401            parse_rule(*content, is_token).map(Rule::immediate_token)
402        }
403    }
404}
405
406impl From<PrecedenceValueJSON> for Precedence {
407    fn from(val: PrecedenceValueJSON) -> Self {
408        match val {
409            PrecedenceValueJSON::Integer(i) => Self::Integer(i),
410            PrecedenceValueJSON::Name(i) => Self::Name(i),
411        }
412    }
413}
414
415#[cfg(test)]
416mod tests {
417    use super::*;
418
419    #[test]
420    fn test_parse_grammar() {
421        let grammar = parse_grammar(
422            r#"{
423            "name": "my_lang",
424            "rules": {
425                "file": {
426                    "type": "REPEAT1",
427                    "content": {
428                        "type": "SYMBOL",
429                        "name": "statement"
430                    }
431                },
432                "statement": {
433                    "type": "STRING",
434                    "value": "foo"
435                }
436            }
437        }"#,
438        )
439        .unwrap();
440
441        assert_eq!(grammar.name, "my_lang");
442        assert_eq!(
443            grammar.variables,
444            vec![
445                Variable {
446                    name: "file".to_string(),
447                    kind: VariableType::Named,
448                    rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
449                },
450                Variable {
451                    name: "statement".to_string(),
452                    kind: VariableType::Named,
453                    rule: Rule::String("foo".to_string())
454                },
455            ]
456        );
457    }
458}