Skip to main content

tree_sitter_generate/
prepare_grammar.rs

1mod expand_repeats;
2mod expand_tokens;
3mod extract_default_aliases;
4mod extract_tokens;
5mod flatten_grammar;
6mod intern_symbols;
7mod process_inlines;
8
9use std::{
10    cmp::Ordering,
11    collections::{hash_map, BTreeSet, HashMap, HashSet},
12    mem,
13};
14
15pub use expand_tokens::ExpandTokensError;
16pub use extract_tokens::ExtractTokensError;
17pub use flatten_grammar::FlattenGrammarError;
18use indexmap::IndexMap;
19pub use intern_symbols::InternSymbolsError;
20pub use process_inlines::ProcessInlinesError;
21use serde::Serialize;
22use thiserror::Error;
23
24pub use self::expand_tokens::expand_tokens;
25use self::{
26    expand_repeats::expand_repeats, extract_default_aliases::extract_default_aliases,
27    extract_tokens::extract_tokens, flatten_grammar::flatten_grammar,
28    intern_symbols::intern_symbols, process_inlines::process_inlines,
29};
30use super::{
31    grammars::{
32        ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, PrecedenceEntry,
33        SyntaxGrammar, Variable,
34    },
35    rules::{AliasMap, Precedence, Rule, Symbol},
36};
37use crate::grammars::ReservedWordContext;
38
39pub struct IntermediateGrammar<T, U> {
40    variables: Vec<Variable>,
41    extra_symbols: Vec<T>,
42    expected_conflicts: Vec<Vec<Symbol>>,
43    precedence_orderings: Vec<Vec<PrecedenceEntry>>,
44    external_tokens: Vec<U>,
45    variables_to_inline: Vec<Symbol>,
46    supertype_symbols: Vec<Symbol>,
47    word_token: Option<Symbol>,
48    reserved_word_sets: Vec<ReservedWordContext<T>>,
49}
50
51pub type InternedGrammar = IntermediateGrammar<Rule, Variable>;
52
53pub type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
54
55#[derive(Debug, PartialEq, Eq)]
56pub struct ExtractedLexicalGrammar {
57    pub variables: Vec<Variable>,
58    pub separators: Vec<Rule>,
59}
60
61impl<T, U> Default for IntermediateGrammar<T, U> {
62    fn default() -> Self {
63        Self {
64            variables: Vec::default(),
65            extra_symbols: Vec::default(),
66            expected_conflicts: Vec::default(),
67            precedence_orderings: Vec::default(),
68            external_tokens: Vec::default(),
69            variables_to_inline: Vec::default(),
70            supertype_symbols: Vec::default(),
71            word_token: Option::default(),
72            reserved_word_sets: Vec::default(),
73        }
74    }
75}
76
77pub type PrepareGrammarResult<T> = Result<T, PrepareGrammarError>;
78
79#[derive(Debug, Error, Serialize)]
80#[error(transparent)]
81pub enum PrepareGrammarError {
82    ValidatePrecedences(#[from] ValidatePrecedenceError),
83    ValidateIndirectRecursion(#[from] IndirectRecursionError),
84    InternSymbols(#[from] InternSymbolsError),
85    ExtractTokens(#[from] ExtractTokensError),
86    FlattenGrammar(#[from] FlattenGrammarError),
87    ExpandTokens(#[from] ExpandTokensError),
88    ProcessInlines(#[from] ProcessInlinesError),
89}
90
91pub type ValidatePrecedenceResult<T> = Result<T, ValidatePrecedenceError>;
92
93#[derive(Debug, Error, Serialize)]
94#[error(transparent)]
95pub enum ValidatePrecedenceError {
96    Undeclared(#[from] UndeclaredPrecedenceError),
97    Ordering(#[from] ConflictingPrecedenceOrderingError),
98}
99
100#[derive(Debug, Error, Serialize)]
101pub struct IndirectRecursionError(pub Vec<String>);
102
103impl std::fmt::Display for IndirectRecursionError {
104    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
105        write!(f, "Grammar contains an indirectly recursive rule: ")?;
106        for (i, symbol) in self.0.iter().enumerate() {
107            if i > 0 {
108                write!(f, " -> ")?;
109            }
110            write!(f, "{symbol}")?;
111        }
112        Ok(())
113    }
114}
115
116#[derive(Debug, Error, Serialize)]
117pub struct UndeclaredPrecedenceError {
118    pub precedence: String,
119    pub rule: String,
120}
121
122impl std::fmt::Display for UndeclaredPrecedenceError {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        write!(
125            f,
126            "Undeclared precedence '{}' in rule '{}'",
127            self.precedence, self.rule
128        )?;
129        Ok(())
130    }
131}
132
133#[derive(Debug, Error, Serialize)]
134pub struct ConflictingPrecedenceOrderingError {
135    pub precedence_1: String,
136    pub precedence_2: String,
137}
138
139impl std::fmt::Display for ConflictingPrecedenceOrderingError {
140    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
141        write!(
142            f,
143            "Conflicting orderings for precedences {} and {}",
144            self.precedence_1, self.precedence_2
145        )?;
146        Ok(())
147    }
148}
149
150/// Transform an input grammar into separate components that are ready
151/// for parse table construction.
152pub fn prepare_grammar(
153    input_grammar: &InputGrammar,
154) -> PrepareGrammarResult<(
155    SyntaxGrammar,
156    LexicalGrammar,
157    InlinedProductionMap,
158    AliasMap,
159)> {
160    validate_precedences(input_grammar)?;
161    validate_indirect_recursion(input_grammar)?;
162
163    let interned_grammar = intern_symbols(input_grammar)?;
164    let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
165    let syntax_grammar = expand_repeats(syntax_grammar);
166    let mut syntax_grammar = flatten_grammar(syntax_grammar)?;
167    let lexical_grammar = expand_tokens(lexical_grammar)?;
168    let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);
169    let inlines = process_inlines(&syntax_grammar, &lexical_grammar)?;
170    Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
171}
172
173/// Check for indirect recursion cycles in the grammar that can cause infinite loops while
174/// parsing. An indirect recursion cycle occurs when a non-terminal can derive itself through
175/// a chain of single-symbol productions (e.g., A -> B, B -> A).
176fn validate_indirect_recursion(grammar: &InputGrammar) -> Result<(), IndirectRecursionError> {
177    let mut epsilon_transitions: IndexMap<&str, BTreeSet<String>> = IndexMap::new();
178
179    for variable in &grammar.variables {
180        let productions = get_single_symbol_productions(&variable.rule);
181        // Filter out rules that *directly* reference themselves, as this doesn't
182        // cause a parsing loop.
183        let filtered: BTreeSet<String> = productions
184            .into_iter()
185            .filter(|s| s != &variable.name)
186            .collect();
187        epsilon_transitions.insert(variable.name.as_str(), filtered);
188    }
189
190    for start_symbol in epsilon_transitions.keys() {
191        let mut visited = BTreeSet::new();
192        let mut path = Vec::new();
193        if let Some((start_idx, end_idx)) =
194            get_cycle(start_symbol, &epsilon_transitions, &mut visited, &mut path)
195        {
196            let cycle_symbols = path[start_idx..=end_idx]
197                .iter()
198                .map(|s| (*s).to_string())
199                .collect();
200            return Err(IndirectRecursionError(cycle_symbols));
201        }
202    }
203
204    Ok(())
205}
206
207fn get_single_symbol_productions(rule: &Rule) -> BTreeSet<String> {
208    match rule {
209        Rule::NamedSymbol(name) => BTreeSet::from([name.clone()]),
210        Rule::Choice(choices) => choices
211            .iter()
212            .flat_map(get_single_symbol_productions)
213            .collect(),
214        Rule::Metadata { rule, .. } => get_single_symbol_productions(rule),
215        _ => BTreeSet::new(),
216    }
217}
218
219/// Perform a depth-first search to detect cycles in single state transitions.
220fn get_cycle<'a>(
221    current: &'a str,
222    transitions: &'a IndexMap<&'a str, BTreeSet<String>>,
223    visited: &mut BTreeSet<&'a str>,
224    path: &mut Vec<&'a str>,
225) -> Option<(usize, usize)> {
226    if let Some(first_idx) = path.iter().position(|s| *s == current) {
227        path.push(current);
228        return Some((first_idx, path.len() - 1));
229    }
230
231    if visited.contains(current) {
232        return None;
233    }
234
235    path.push(current);
236    visited.insert(current);
237
238    if let Some(next_symbols) = transitions.get(current) {
239        for next in next_symbols {
240            if let Some(cycle) = get_cycle(next, transitions, visited, path) {
241                return Some(cycle);
242            }
243        }
244    }
245
246    path.pop();
247    None
248}
249
250/// Check that all of the named precedences used in the grammar are declared
251/// within the `precedences` lists, and also that there are no conflicting
252/// precedence orderings declared in those lists.
253fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()> {
254    // Check that no rule contains a named precedence that is not present in
255    // any of the `precedences` lists.
256    fn validate(
257        rule_name: &str,
258        rule: &Rule,
259        names: &HashSet<&String>,
260    ) -> ValidatePrecedenceResult<()> {
261        match rule {
262            Rule::Repeat(rule) => validate(rule_name, rule, names),
263            Rule::Seq(elements) | Rule::Choice(elements) => elements
264                .iter()
265                .try_for_each(|e| validate(rule_name, e, names)),
266            Rule::Metadata { rule, params } => {
267                if let Precedence::Name(n) = &params.precedence {
268                    if !names.contains(n) {
269                        Err(UndeclaredPrecedenceError {
270                            precedence: n.clone(),
271                            rule: rule_name.to_string(),
272                        })?;
273                    }
274                }
275                validate(rule_name, rule, names)?;
276                Ok(())
277            }
278            _ => Ok(()),
279        }
280    }
281
282    // For any two precedence names `a` and `b`, if `a` comes before `b`
283    // in some list, then it cannot come *after* `b` in any list.
284    let mut pairs = HashMap::new();
285    for list in &grammar.precedence_orderings {
286        for (i, mut entry1) in list.iter().enumerate() {
287            for mut entry2 in list.iter().skip(i + 1) {
288                if entry2 == entry1 {
289                    continue;
290                }
291                let mut ordering = Ordering::Greater;
292                if entry1 > entry2 {
293                    ordering = Ordering::Less;
294                    mem::swap(&mut entry1, &mut entry2);
295                }
296                match pairs.entry((entry1, entry2)) {
297                    hash_map::Entry::Vacant(e) => {
298                        e.insert(ordering);
299                    }
300                    hash_map::Entry::Occupied(e) => {
301                        if e.get() != &ordering {
302                            Err(ConflictingPrecedenceOrderingError {
303                                precedence_1: entry1.to_string(),
304                                precedence_2: entry2.to_string(),
305                            })?;
306                        }
307                    }
308                }
309            }
310        }
311    }
312
313    let precedence_names = grammar
314        .precedence_orderings
315        .iter()
316        .flat_map(|l| l.iter())
317        .filter_map(|p| {
318            if let PrecedenceEntry::Name(n) = p {
319                Some(n)
320            } else {
321                None
322            }
323        })
324        .collect::<HashSet<&String>>();
325    for variable in &grammar.variables {
326        validate(&variable.name, &variable.rule, &precedence_names)?;
327    }
328
329    Ok(())
330}
331
332#[cfg(test)]
333mod tests {
334    use super::*;
335    use crate::grammars::VariableType;
336
337    #[test]
338    fn test_validate_precedences_with_undeclared_precedence() {
339        let grammar = InputGrammar {
340            precedence_orderings: vec![
341                vec![
342                    PrecedenceEntry::Name("a".to_string()),
343                    PrecedenceEntry::Name("b".to_string()),
344                ],
345                vec![
346                    PrecedenceEntry::Name("b".to_string()),
347                    PrecedenceEntry::Name("c".to_string()),
348                    PrecedenceEntry::Name("d".to_string()),
349                ],
350            ],
351            variables: vec![
352                Variable {
353                    name: "v1".to_string(),
354                    kind: VariableType::Named,
355                    rule: Rule::Seq(vec![
356                        Rule::prec_left(Precedence::Name("b".to_string()), Rule::string("w")),
357                        Rule::prec(Precedence::Name("c".to_string()), Rule::string("x")),
358                    ]),
359                },
360                Variable {
361                    name: "v2".to_string(),
362                    kind: VariableType::Named,
363                    rule: Rule::repeat(Rule::Choice(vec![
364                        Rule::prec_left(Precedence::Name("omg".to_string()), Rule::string("y")),
365                        Rule::prec(Precedence::Name("c".to_string()), Rule::string("z")),
366                    ])),
367                },
368            ],
369            ..Default::default()
370        };
371
372        let result = validate_precedences(&grammar);
373        assert_eq!(
374            result.unwrap_err().to_string(),
375            "Undeclared precedence 'omg' in rule 'v2'",
376        );
377    }
378
379    #[test]
380    fn test_validate_precedences_with_conflicting_order() {
381        let grammar = InputGrammar {
382            precedence_orderings: vec![
383                vec![
384                    PrecedenceEntry::Name("a".to_string()),
385                    PrecedenceEntry::Name("b".to_string()),
386                ],
387                vec![
388                    PrecedenceEntry::Name("b".to_string()),
389                    PrecedenceEntry::Name("c".to_string()),
390                    PrecedenceEntry::Name("a".to_string()),
391                ],
392            ],
393            variables: vec![
394                Variable {
395                    name: "v1".to_string(),
396                    kind: VariableType::Named,
397                    rule: Rule::Seq(vec![
398                        Rule::prec_left(Precedence::Name("b".to_string()), Rule::string("w")),
399                        Rule::prec(Precedence::Name("c".to_string()), Rule::string("x")),
400                    ]),
401                },
402                Variable {
403                    name: "v2".to_string(),
404                    kind: VariableType::Named,
405                    rule: Rule::repeat(Rule::Choice(vec![
406                        Rule::prec_left(Precedence::Name("a".to_string()), Rule::string("y")),
407                        Rule::prec(Precedence::Name("c".to_string()), Rule::string("z")),
408                    ])),
409                },
410            ],
411            ..Default::default()
412        };
413
414        let result = validate_precedences(&grammar);
415        assert_eq!(
416            result.unwrap_err().to_string(),
417            "Conflicting orderings for precedences 'a' and 'b'",
418        );
419    }
420}