treebender/
parse_grammar.rs

1use regex::Regex;
2/// Simple recursive-descent parsing of grammar files
3use std::str::FromStr;
4
5use crate::featurestructure::{Feature, NodeRef};
6use crate::rules::{Grammar, Production, Rule};
7use crate::utils::Err;
8
9pub const TOP_STR: &str = "**top**";
10
11/// Parses a str into a tuple of (rules, nonterminals)
12/// Errors if the grammar doesn't parse or is malformed
13impl FromStr for Grammar {
14  type Err = Err;
15
16  /// Parses a grammar from a string. Assumes the first rule's symbol
17  /// is the start symbol.
18  fn from_str(s: &str) -> Result<Self, Self::Err> {
19    let (rules, s) = parse_rules(s)?;
20    assert!(s.is_empty());
21
22    if rules.is_empty() {
23      Err("empty ruleset".into())
24    } else {
25      Self::new(rules)
26    }
27  }
28}
29
30type Infallible<'a, T> = (T, &'a str);
31type ParseResult<'a, T> = Result<(T, &'a str), Err>;
32
33/// helper macro for initializing a regex with lazy_static!
34macro_rules! regex_static {
35  ($name:ident, $pattern:expr) => {
36    lazy_static! {
37      static ref $name: Regex = Regex::new($pattern).unwrap();
38    }
39  };
40}
41
42/// Try to consume a regex, returning None if it doesn't match
43fn optional_re<'a>(re: &'static Regex, s: &'a str) -> Infallible<'a, Option<&'a str>> {
44  if let Some(caps) = re.captures(s) {
45    let m = caps.get(0).unwrap();
46    if m.start() > 0 {
47      return (None, s);
48    }
49    let (_, rest) = s.split_at(m.end());
50    (Some(m.as_str()), rest)
51  } else {
52    (None, s)
53  }
54}
55
56/// Try to consume a regex, failing if it doesn't match
57fn needed_re<'a>(re: &'static Regex, s: &'a str) -> ParseResult<'a, &'a str> {
58  if let (Some(c), rest) = optional_re(re, s) {
59    Ok((c, rest))
60  } else {
61    Err(format!("couldn't match {} at {}", re, s).into())
62  }
63}
64
65/// Try to consume a char, returning None if it doesn't match
66fn optional_char(c: char, s: &str) -> Infallible<Option<char>> {
67  let mut iter = s.char_indices().peekable();
68  if let Some((_, c1)) = iter.next() {
69    if c == c1 {
70      let rest = if let Some((idx, _)) = iter.peek() {
71        s.split_at(*idx).1
72      } else {
73        ""
74      };
75      return (Some(c), rest);
76    }
77  }
78  (None, s)
79}
80
81/// Try to consume a char, failing if it doesn't match
82fn needed_char(c: char, s: &str) -> ParseResult<char> {
83  if let (Some(c), rest) = optional_char(c, s) {
84    Ok((c, rest))
85  } else {
86    Err(format!("couldn't match {} at {}", c, s).into())
87  }
88}
89
90/// Tries to skip 1 or more \s characters and comments
91fn skip_whitespace(s: &str) -> &str {
92  regex_static!(WHITESPACE_OR_COMMENT, r"\s*(//.*?\n\s*)*");
93  optional_re(&*WHITESPACE_OR_COMMENT, s).1
94}
95
96// Tries to skip 1 or more non-newline whitespace characters
97fn skip_whitespace_nonnewline(s: &str) -> &str {
98  regex_static!(WHITESPACE_NONNEWLINE, r"[\s&&[^\n]]*");
99  optional_re(&*WHITESPACE_NONNEWLINE, s).1
100}
101
102/// Tries to parse a name made of letters, numbers, - and _
103fn parse_name(s: &str) -> ParseResult<&str> {
104  regex_static!(NAME, r"[a-zA-Z0-9\-_]+");
105  needed_re(&*NAME, s).map_err(|err| format!("name: {}", err).into())
106}
107
108/// Tries to parse a name made of dotted segments (foo.bar.c.d)
109fn parse_dotted(s: &str) -> ParseResult<&str> {
110  regex_static!(DOTTED, r"[a-zA-Z0-9\-_]+(\.[a-zA-Z0-9\-_]+)*");
111  needed_re(&*DOTTED, s).map_err(|e| format!("dotted name: {}", e).into())
112}
113
114/// Parses an optional #tag
115fn parse_tag(s: &str) -> ParseResult<Option<String>> {
116  let (hash, s) = optional_char('#', s);
117  if hash.is_none() {
118    Ok((None, s))
119  } else {
120    let s = skip_whitespace(s);
121    let (name, s) = parse_name(s).map_err(|e| -> Err { format!("tag: {}", e).into() })?;
122    Ok((Some(name.to_string()), s))
123  }
124}
125
126/// Parses a value with an optional tag: #tag value
127fn parse_feature_value(s: &str) -> ParseResult<(Option<String>, NodeRef)> {
128  regex_static!(VALUE, r"[a-zA-Z0-9\-_\*]+");
129  let (tag, s) = parse_tag(s)?;
130  let s = skip_whitespace(s);
131  let (name, s) = optional_re(&*VALUE, s);
132  let value = if let Some(name) = name {
133    if name == TOP_STR {
134      NodeRef::new_top()
135    } else {
136      NodeRef::new_str(name.to_string())
137    }
138  } else if tag.is_some() {
139    NodeRef::new_top()
140  } else {
141    return Err(format!("feature needs tag or value at {}", s).into());
142  };
143  Ok(((tag, value), s))
144}
145
146fn parse_feature(s: &str) -> ParseResult<Feature> {
147  let (name, s) = parse_dotted(s).map_err(|e| format!("feature name: {}", e))?;
148  let s = skip_whitespace(s);
149  let (_, s) = needed_char(':', s)?;
150  let s = skip_whitespace(s);
151  let (value, s) = parse_feature_value(s).map_err(|e| format!("feature value: {}", e))?;
152  let s = skip_whitespace(s);
153  let (_, s) = optional_char(',', s);
154
155  Ok((
156    Feature {
157      path: name.to_string(),
158      tag: value.0,
159      value: value.1,
160    },
161    s,
162  ))
163}
164
165fn parse_featurestructure(s: &str) -> ParseResult<Vec<Feature>> {
166  let mut pairs = Vec::new();
167  let mut rem = needed_char('[', s)?.1;
168  loop {
169    rem = skip_whitespace(rem);
170    if let (Some(_), rem) = optional_char(']', rem) {
171      return Ok((pairs, rem));
172    }
173    let (feature, s) = parse_feature(rem)?;
174    pairs.push(feature);
175    rem = s;
176  }
177}
178
179fn parse_production(s: &str) -> ParseResult<(Production, Vec<Feature>)> {
180  let (name, s) = parse_name(s).map_err(|e| -> Err { format!("symbol: {}", e).into() })?;
181  let s = skip_whitespace_nonnewline(s);
182  let (features, s) = if s.starts_with('[') {
183    parse_featurestructure(s)?
184  } else {
185    (Vec::new(), s)
186  };
187
188  if name.chars().next().unwrap().is_uppercase() {
189    Ok(((Production::new_nonterminal(name.to_string()), features), s))
190  } else {
191    if !features.is_empty() {
192      Err(format!("terminal (lower-case) cannot have features: {} {}", name, s).into())
193    } else {
194      // annotate terminals with their matching string
195      Ok((
196        (
197          Production::new_terminal(name.to_string()),
198          vec![Feature {
199            path: "word".to_string(),
200            tag: None,
201            value: NodeRef::new_str(name.to_string()),
202          }],
203        ),
204        s,
205      ))
206    }
207  }
208}
209
210fn parse_nonterminal(s: &str) -> ParseResult<(String, Vec<Feature>)> {
211  let ((prod, features), s) = parse_production(s)?;
212  if prod.is_nonterminal() {
213    Ok(((prod.symbol, features), s))
214  } else {
215    Err(format!("expected nonterminal, got terminal {}: {}", prod.symbol, s).into())
216  }
217}
218
219/// Symbol, productions, terminated by final newline
220fn parse_rule(s: &str) -> ParseResult<Rule> {
221  #![allow(clippy::trivial_regex)]
222  regex_static!(ARROW, "->");
223
224  let ((symbol, features), s) =
225    parse_nonterminal(s).map_err(|e| -> Err { format!("rule symbol: {}", e).into() })?;
226  let s = skip_whitespace(s);
227  let (_, s) = needed_re(&*ARROW, s).map_err(|e| -> Err { format!("rule arrow: {}", e).into() })?;
228
229  let mut prods_features = Vec::new();
230  let mut rem = s;
231  loop {
232    rem = skip_whitespace_nonnewline(rem);
233
234    let try_newline = skip_whitespace(rem);
235    if rem.is_empty() || try_newline != rem {
236      // end of line, exit loop
237      rem = try_newline;
238      break;
239    }
240
241    let (prod, s) =
242      parse_production(rem).map_err(|e| -> Err { format!("rule production: {}", e).into() })?;
243    prods_features.push(prod);
244    rem = s;
245  }
246
247  let (features, productions) = adopt_child_features(features, prods_features);
248  let features = NodeRef::new_from_paths(features)?;
249
250  Ok((
251    Rule {
252      symbol,
253      features,
254      productions,
255    },
256    rem,
257  ))
258}
259
260/// We want rules to be able to access their child features, and to be able to
261/// unify between them
262/// So we have the rule symbol "adopt" the features of its children, copying the
263/// child features into child-0.(...), child-1.(...), etc.
264///
265/// We could try to implement this when constructing the rule, but it's easier
266/// to do as a simple AST transform.
267fn adopt_child_features(
268  mut rule_features: Vec<Feature>,
269  prods_features: Vec<(Production, Vec<Feature>)>,
270) -> (Vec<Feature>, Vec<Production>) {
271  let mut productions = Vec::with_capacity(prods_features.len());
272
273  for (idx, (prod, features)) in prods_features.into_iter().enumerate() {
274    productions.push(prod);
275    let prefix = format!("child-{}.", idx);
276    for feature in features.into_iter() {
277      rule_features.push(Feature {
278        path: prefix.clone() + &feature.path,
279        tag: feature.tag,
280        value: feature.value,
281      });
282    }
283  }
284
285  (rule_features, productions)
286}
287
288fn parse_rules(s: &str) -> ParseResult<Vec<Rule>> {
289  let mut rules = Vec::new();
290  let mut rem = s;
291  loop {
292    rem = skip_whitespace(rem);
293    if rem.is_empty() {
294      return Ok((rules, rem));
295    }
296    let (rule, s) = parse_rule(rem)?;
297    rules.push(rule);
298    rem = s;
299  }
300}