1use std::collections::HashSet;
2
3use log::warn;
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use serde_json::{Map, Value};
7use thiserror::Error;
8
9use crate::{
10 grammars::{InputGrammar, PrecedenceEntry, ReservedWordContext, Variable, VariableType},
11 rules::{Precedence, Rule},
12};
13
14#[derive(Deserialize)]
15#[serde(tag = "type")]
16#[allow(non_camel_case_types)]
17#[allow(clippy::upper_case_acronyms)]
18enum RuleJSON {
19 ALIAS {
20 content: Box<Self>,
21 named: bool,
22 value: String,
23 },
24 BLANK,
25 STRING {
26 value: String,
27 },
28 PATTERN {
29 value: String,
30 flags: Option<String>,
31 },
32 SYMBOL {
33 name: String,
34 },
35 CHOICE {
36 members: Vec<Self>,
37 },
38 FIELD {
39 name: String,
40 content: Box<Self>,
41 },
42 SEQ {
43 members: Vec<Self>,
44 },
45 REPEAT {
46 content: Box<Self>,
47 },
48 REPEAT1 {
49 content: Box<Self>,
50 },
51 PREC_DYNAMIC {
52 value: i32,
53 content: Box<Self>,
54 },
55 PREC_LEFT {
56 value: PrecedenceValueJSON,
57 content: Box<Self>,
58 },
59 PREC_RIGHT {
60 value: PrecedenceValueJSON,
61 content: Box<Self>,
62 },
63 PREC {
64 value: PrecedenceValueJSON,
65 content: Box<Self>,
66 },
67 TOKEN {
68 content: Box<Self>,
69 },
70 IMMEDIATE_TOKEN {
71 content: Box<Self>,
72 },
73 RESERVED {
74 context_name: String,
75 content: Box<Self>,
76 },
77}
78
79#[derive(Deserialize)]
80#[serde(untagged)]
81enum PrecedenceValueJSON {
82 Integer(i32),
83 Name(String),
84}
85
86#[derive(Deserialize)]
87pub struct GrammarJSON {
88 pub name: String,
89 rules: Map<String, Value>,
90 #[serde(default)]
91 precedences: Vec<Vec<RuleJSON>>,
92 #[serde(default)]
93 conflicts: Vec<Vec<String>>,
94 #[serde(default)]
95 externals: Vec<RuleJSON>,
96 #[serde(default)]
97 extras: Vec<RuleJSON>,
98 #[serde(default)]
99 inline: Vec<String>,
100 #[serde(default)]
101 supertypes: Vec<String>,
102 #[serde(default)]
103 word: Option<String>,
104 #[serde(default)]
105 reserved: Map<String, Value>,
106}
107
108pub type ParseGrammarResult<T> = Result<T, ParseGrammarError>;
109
110#[derive(Debug, Error, Serialize)]
111pub enum ParseGrammarError {
112 #[error("{0}")]
113 Serialization(String),
114 #[error("Rules in the `extras` array must not contain empty strings")]
115 InvalidExtra,
116 #[error("Invalid rule in precedences array. Only strings and symbols are allowed")]
117 Unexpected,
118 #[error("Reserved word sets must be arrays")]
119 InvalidReservedWordSet,
120 #[error("Grammar Error: Unexpected rule `{0}` in `token()` call")]
121 UnexpectedRule(String),
122}
123
124impl From<serde_json::Error> for ParseGrammarError {
125 fn from(value: serde_json::Error) -> Self {
126 Self::Serialization(value.to_string())
127 }
128}
129
130fn rule_is_referenced(rule: &Rule, target: &str, is_external: bool) -> bool {
139 match rule {
140 Rule::NamedSymbol(name) => name == target && !is_external,
141 Rule::Choice(rules) | Rule::Seq(rules) => {
142 rules.iter().any(|r| rule_is_referenced(r, target, false))
143 }
144 Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
145 rule_is_referenced(rule, target, is_external)
146 }
147 Rule::Repeat(inner) => rule_is_referenced(inner, target, false),
148 Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
149 }
150}
151
152fn variable_is_used(
153 grammar_rules: &[(String, Rule)],
154 extras: &[Rule],
155 externals: &[Rule],
156 target_name: &str,
157 in_progress: &mut HashSet<String>,
158) -> bool {
159 let root = &grammar_rules.first().unwrap().0;
160 if target_name == root {
161 return true;
162 }
163
164 if extras
165 .iter()
166 .any(|rule| rule_is_referenced(rule, target_name, false))
167 {
168 return true;
169 }
170
171 if externals
172 .iter()
173 .any(|rule| rule_is_referenced(rule, target_name, true))
174 {
175 return true;
176 }
177
178 in_progress.insert(target_name.to_string());
179 let result = grammar_rules
180 .iter()
181 .filter(|(key, _)| *key != target_name)
182 .any(|(name, rule)| {
183 if !rule_is_referenced(rule, target_name, false) || in_progress.contains(name) {
184 return false;
185 }
186 variable_is_used(grammar_rules, extras, externals, name, in_progress)
187 });
188 in_progress.remove(target_name);
189
190 result
191}
192
193pub(crate) fn parse_grammar(input: &str) -> ParseGrammarResult<InputGrammar> {
194 let mut grammar_json = serde_json::from_str::<GrammarJSON>(input)?;
195
196 let mut extra_symbols =
197 grammar_json
198 .extras
199 .into_iter()
200 .try_fold(Vec::<Rule>::new(), |mut acc, item| {
201 let rule = parse_rule(item, false)?;
202 if let Rule::String(ref value) = rule {
203 if value.is_empty() {
204 Err(ParseGrammarError::InvalidExtra)?;
205 }
206 }
207 acc.push(rule);
208 ParseGrammarResult::Ok(acc)
209 })?;
210
211 let mut external_tokens = grammar_json
212 .externals
213 .into_iter()
214 .map(|e| parse_rule(e, false))
215 .collect::<ParseGrammarResult<Vec<_>>>()?;
216
217 let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
218 for list in grammar_json.precedences {
219 let mut ordering = Vec::with_capacity(list.len());
220 for entry in list {
221 ordering.push(match entry {
222 RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
223 RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
224 _ => Err(ParseGrammarError::Unexpected)?,
225 });
226 }
227 precedence_orderings.push(ordering);
228 }
229
230 let mut variables = Vec::with_capacity(grammar_json.rules.len());
231
232 let rules = grammar_json
233 .rules
234 .into_iter()
235 .map(|(n, r)| Ok((n, parse_rule(serde_json::from_value(r)?, false)?)))
236 .collect::<ParseGrammarResult<Vec<_>>>()?;
237
238 let mut in_progress = HashSet::new();
239
240 for (name, rule) in &rules {
241 if grammar_json.word.as_ref().is_none_or(|w| w != name)
242 && !variable_is_used(
243 &rules,
244 &extra_symbols,
245 &external_tokens,
246 name,
247 &mut in_progress,
248 )
249 {
250 grammar_json.conflicts.retain(|r| !r.contains(name));
251 grammar_json.supertypes.retain(|r| r != name);
252 grammar_json.inline.retain(|r| r != name);
253 extra_symbols.retain(|r| !rule_is_referenced(r, name, true));
254 external_tokens.retain(|r| !rule_is_referenced(r, name, true));
255 precedence_orderings.retain(|r| {
256 !r.iter().any(|e| {
257 let PrecedenceEntry::Symbol(s) = e else {
258 return false;
259 };
260 s == name
261 })
262 });
263 continue;
264 }
265
266 if extra_symbols
267 .iter()
268 .any(|r| rule_is_referenced(r, name, false))
269 {
270 let inner_rule = if let Rule::Metadata { rule, .. } = rule {
271 rule
272 } else {
273 rule
274 };
275 let matches_empty = match inner_rule {
276 Rule::String(rule_str) => rule_str.is_empty(),
277 Rule::Pattern(ref value, _) => Regex::new(value).is_ok_and(|reg| reg.is_match("")),
278 _ => false,
279 };
280 if matches_empty {
281 warn!(
282 concat!(
283 "Named extra rule `{}` matches the empty string. ",
284 "Inline this to avoid infinite loops while parsing."
285 ),
286 name
287 );
288 }
289 }
290 variables.push(Variable {
291 name: name.clone(),
292 kind: VariableType::Named,
293 rule: rule.clone(),
294 });
295 }
296
297 let reserved_words = grammar_json
298 .reserved
299 .into_iter()
300 .map(|(name, rule_values)| {
301 let Value::Array(rule_values) = rule_values else {
302 Err(ParseGrammarError::InvalidReservedWordSet)?
303 };
304
305 let mut reserved_words = Vec::with_capacity(rule_values.len());
306 for value in rule_values {
307 reserved_words.push(parse_rule(serde_json::from_value(value)?, false)?);
308 }
309 Ok(ReservedWordContext {
310 name,
311 reserved_words,
312 })
313 })
314 .collect::<ParseGrammarResult<Vec<_>>>()?;
315
316 Ok(InputGrammar {
317 name: grammar_json.name,
318 word_token: grammar_json.word,
319 expected_conflicts: grammar_json.conflicts,
320 supertype_symbols: grammar_json.supertypes,
321 variables_to_inline: grammar_json.inline,
322 precedence_orderings,
323 variables,
324 extra_symbols,
325 external_tokens,
326 reserved_words,
327 })
328}
329
330fn parse_rule(json: RuleJSON, is_token: bool) -> ParseGrammarResult<Rule> {
331 match json {
332 RuleJSON::ALIAS {
333 content,
334 value,
335 named,
336 } => parse_rule(*content, is_token).map(|r| Rule::alias(r, value, named)),
337 RuleJSON::BLANK => Ok(Rule::Blank),
338 RuleJSON::STRING { value } => Ok(Rule::String(value)),
339 RuleJSON::PATTERN { value, flags } => Ok(Rule::Pattern(
340 value,
341 flags.map_or(String::new(), |f| {
342 f.matches(|c| {
343 if c == 'i' {
344 true
345 } else {
346 if c != 'u' && c != 'v' {
348 warn!("unsupported flag {c}");
349 }
350 false
351 }
352 })
353 .collect()
354 }),
355 )),
356 RuleJSON::SYMBOL { name } => {
357 if is_token {
358 Err(ParseGrammarError::UnexpectedRule(name))?
359 } else {
360 Ok(Rule::NamedSymbol(name))
361 }
362 }
363 RuleJSON::CHOICE { members } => members
364 .into_iter()
365 .map(|m| parse_rule(m, is_token))
366 .collect::<ParseGrammarResult<Vec<_>>>()
367 .map(Rule::choice),
368 RuleJSON::FIELD { content, name } => {
369 parse_rule(*content, is_token).map(|r| Rule::field(name, r))
370 }
371 RuleJSON::SEQ { members } => members
372 .into_iter()
373 .map(|m| parse_rule(m, is_token))
374 .collect::<ParseGrammarResult<Vec<_>>>()
375 .map(Rule::seq),
376 RuleJSON::REPEAT1 { content } => parse_rule(*content, is_token).map(Rule::repeat),
377 RuleJSON::REPEAT { content } => {
378 parse_rule(*content, is_token).map(|m| Rule::choice(vec![Rule::repeat(m), Rule::Blank]))
379 }
380 RuleJSON::PREC { value, content } => {
381 parse_rule(*content, is_token).map(|r| Rule::prec(value.into(), r))
382 }
383 RuleJSON::PREC_LEFT { value, content } => {
384 parse_rule(*content, is_token).map(|r| Rule::prec_left(value.into(), r))
385 }
386 RuleJSON::PREC_RIGHT { value, content } => {
387 parse_rule(*content, is_token).map(|r| Rule::prec_right(value.into(), r))
388 }
389 RuleJSON::PREC_DYNAMIC { value, content } => {
390 parse_rule(*content, is_token).map(|r| Rule::prec_dynamic(value, r))
391 }
392 RuleJSON::RESERVED {
393 content,
394 context_name,
395 } => parse_rule(*content, is_token).map(|r| Rule::Reserved {
396 rule: Box::new(r),
397 context_name,
398 }),
399 RuleJSON::TOKEN { content } => parse_rule(*content, true).map(Rule::token),
400 RuleJSON::IMMEDIATE_TOKEN { content } => {
401 parse_rule(*content, is_token).map(Rule::immediate_token)
402 }
403 }
404}
405
406impl From<PrecedenceValueJSON> for Precedence {
407 fn from(val: PrecedenceValueJSON) -> Self {
408 match val {
409 PrecedenceValueJSON::Integer(i) => Self::Integer(i),
410 PrecedenceValueJSON::Name(i) => Self::Name(i),
411 }
412 }
413}
414
415#[cfg(test)]
416mod tests {
417 use super::*;
418
419 #[test]
420 fn test_parse_grammar() {
421 let grammar = parse_grammar(
422 r#"{
423 "name": "my_lang",
424 "rules": {
425 "file": {
426 "type": "REPEAT1",
427 "content": {
428 "type": "SYMBOL",
429 "name": "statement"
430 }
431 },
432 "statement": {
433 "type": "STRING",
434 "value": "foo"
435 }
436 }
437 }"#,
438 )
439 .unwrap();
440
441 assert_eq!(grammar.name, "my_lang");
442 assert_eq!(
443 grammar.variables,
444 vec![
445 Variable {
446 name: "file".to_string(),
447 kind: VariableType::Named,
448 rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
449 },
450 Variable {
451 name: "statement".to_string(),
452 kind: VariableType::Named,
453 rule: Rule::String("foo".to_string())
454 },
455 ]
456 );
457 }
458}