1use regex::Regex;
2use std::str::FromStr;
4
5use crate::featurestructure::{Feature, NodeRef};
6use crate::rules::{Grammar, Production, Rule};
7use crate::utils::Err;
8
9pub const TOP_STR: &str = "**top**";
10
11impl FromStr for Grammar {
14 type Err = Err;
15
16 fn from_str(s: &str) -> Result<Self, Self::Err> {
19 let (rules, s) = parse_rules(s)?;
20 assert!(s.is_empty());
21
22 if rules.is_empty() {
23 Err("empty ruleset".into())
24 } else {
25 Self::new(rules)
26 }
27 }
28}
29
30type Infallible<'a, T> = (T, &'a str);
31type ParseResult<'a, T> = Result<(T, &'a str), Err>;
32
33macro_rules! regex_static {
35 ($name:ident, $pattern:expr) => {
36 lazy_static! {
37 static ref $name: Regex = Regex::new($pattern).unwrap();
38 }
39 };
40}
41
42fn optional_re<'a>(re: &'static Regex, s: &'a str) -> Infallible<'a, Option<&'a str>> {
44 if let Some(caps) = re.captures(s) {
45 let m = caps.get(0).unwrap();
46 if m.start() > 0 {
47 return (None, s);
48 }
49 let (_, rest) = s.split_at(m.end());
50 (Some(m.as_str()), rest)
51 } else {
52 (None, s)
53 }
54}
55
56fn needed_re<'a>(re: &'static Regex, s: &'a str) -> ParseResult<'a, &'a str> {
58 if let (Some(c), rest) = optional_re(re, s) {
59 Ok((c, rest))
60 } else {
61 Err(format!("couldn't match {} at {}", re, s).into())
62 }
63}
64
65fn optional_char(c: char, s: &str) -> Infallible<Option<char>> {
67 let mut iter = s.char_indices().peekable();
68 if let Some((_, c1)) = iter.next() {
69 if c == c1 {
70 let rest = if let Some((idx, _)) = iter.peek() {
71 s.split_at(*idx).1
72 } else {
73 ""
74 };
75 return (Some(c), rest);
76 }
77 }
78 (None, s)
79}
80
81fn needed_char(c: char, s: &str) -> ParseResult<char> {
83 if let (Some(c), rest) = optional_char(c, s) {
84 Ok((c, rest))
85 } else {
86 Err(format!("couldn't match {} at {}", c, s).into())
87 }
88}
89
90fn skip_whitespace(s: &str) -> &str {
92 regex_static!(WHITESPACE_OR_COMMENT, r"\s*(//.*?\n\s*)*");
93 optional_re(&*WHITESPACE_OR_COMMENT, s).1
94}
95
96fn skip_whitespace_nonnewline(s: &str) -> &str {
98 regex_static!(WHITESPACE_NONNEWLINE, r"[\s&&[^\n]]*");
99 optional_re(&*WHITESPACE_NONNEWLINE, s).1
100}
101
102fn parse_name(s: &str) -> ParseResult<&str> {
104 regex_static!(NAME, r"[a-zA-Z0-9\-_]+");
105 needed_re(&*NAME, s).map_err(|err| format!("name: {}", err).into())
106}
107
108fn parse_dotted(s: &str) -> ParseResult<&str> {
110 regex_static!(DOTTED, r"[a-zA-Z0-9\-_]+(\.[a-zA-Z0-9\-_]+)*");
111 needed_re(&*DOTTED, s).map_err(|e| format!("dotted name: {}", e).into())
112}
113
114fn parse_tag(s: &str) -> ParseResult<Option<String>> {
116 let (hash, s) = optional_char('#', s);
117 if hash.is_none() {
118 Ok((None, s))
119 } else {
120 let s = skip_whitespace(s);
121 let (name, s) = parse_name(s).map_err(|e| -> Err { format!("tag: {}", e).into() })?;
122 Ok((Some(name.to_string()), s))
123 }
124}
125
126fn parse_feature_value(s: &str) -> ParseResult<(Option<String>, NodeRef)> {
128 regex_static!(VALUE, r"[a-zA-Z0-9\-_\*]+");
129 let (tag, s) = parse_tag(s)?;
130 let s = skip_whitespace(s);
131 let (name, s) = optional_re(&*VALUE, s);
132 let value = if let Some(name) = name {
133 if name == TOP_STR {
134 NodeRef::new_top()
135 } else {
136 NodeRef::new_str(name.to_string())
137 }
138 } else if tag.is_some() {
139 NodeRef::new_top()
140 } else {
141 return Err(format!("feature needs tag or value at {}", s).into());
142 };
143 Ok(((tag, value), s))
144}
145
146fn parse_feature(s: &str) -> ParseResult<Feature> {
147 let (name, s) = parse_dotted(s).map_err(|e| format!("feature name: {}", e))?;
148 let s = skip_whitespace(s);
149 let (_, s) = needed_char(':', s)?;
150 let s = skip_whitespace(s);
151 let (value, s) = parse_feature_value(s).map_err(|e| format!("feature value: {}", e))?;
152 let s = skip_whitespace(s);
153 let (_, s) = optional_char(',', s);
154
155 Ok((
156 Feature {
157 path: name.to_string(),
158 tag: value.0,
159 value: value.1,
160 },
161 s,
162 ))
163}
164
165fn parse_featurestructure(s: &str) -> ParseResult<Vec<Feature>> {
166 let mut pairs = Vec::new();
167 let mut rem = needed_char('[', s)?.1;
168 loop {
169 rem = skip_whitespace(rem);
170 if let (Some(_), rem) = optional_char(']', rem) {
171 return Ok((pairs, rem));
172 }
173 let (feature, s) = parse_feature(rem)?;
174 pairs.push(feature);
175 rem = s;
176 }
177}
178
179fn parse_production(s: &str) -> ParseResult<(Production, Vec<Feature>)> {
180 let (name, s) = parse_name(s).map_err(|e| -> Err { format!("symbol: {}", e).into() })?;
181 let s = skip_whitespace_nonnewline(s);
182 let (features, s) = if s.starts_with('[') {
183 parse_featurestructure(s)?
184 } else {
185 (Vec::new(), s)
186 };
187
188 if name.chars().next().unwrap().is_uppercase() {
189 Ok(((Production::new_nonterminal(name.to_string()), features), s))
190 } else {
191 if !features.is_empty() {
192 Err(format!("terminal (lower-case) cannot have features: {} {}", name, s).into())
193 } else {
194 Ok((
196 (
197 Production::new_terminal(name.to_string()),
198 vec![Feature {
199 path: "word".to_string(),
200 tag: None,
201 value: NodeRef::new_str(name.to_string()),
202 }],
203 ),
204 s,
205 ))
206 }
207 }
208}
209
210fn parse_nonterminal(s: &str) -> ParseResult<(String, Vec<Feature>)> {
211 let ((prod, features), s) = parse_production(s)?;
212 if prod.is_nonterminal() {
213 Ok(((prod.symbol, features), s))
214 } else {
215 Err(format!("expected nonterminal, got terminal {}: {}", prod.symbol, s).into())
216 }
217}
218
219fn parse_rule(s: &str) -> ParseResult<Rule> {
221 #![allow(clippy::trivial_regex)]
222 regex_static!(ARROW, "->");
223
224 let ((symbol, features), s) =
225 parse_nonterminal(s).map_err(|e| -> Err { format!("rule symbol: {}", e).into() })?;
226 let s = skip_whitespace(s);
227 let (_, s) = needed_re(&*ARROW, s).map_err(|e| -> Err { format!("rule arrow: {}", e).into() })?;
228
229 let mut prods_features = Vec::new();
230 let mut rem = s;
231 loop {
232 rem = skip_whitespace_nonnewline(rem);
233
234 let try_newline = skip_whitespace(rem);
235 if rem.is_empty() || try_newline != rem {
236 rem = try_newline;
238 break;
239 }
240
241 let (prod, s) =
242 parse_production(rem).map_err(|e| -> Err { format!("rule production: {}", e).into() })?;
243 prods_features.push(prod);
244 rem = s;
245 }
246
247 let (features, productions) = adopt_child_features(features, prods_features);
248 let features = NodeRef::new_from_paths(features)?;
249
250 Ok((
251 Rule {
252 symbol,
253 features,
254 productions,
255 },
256 rem,
257 ))
258}
259
260fn adopt_child_features(
268 mut rule_features: Vec<Feature>,
269 prods_features: Vec<(Production, Vec<Feature>)>,
270) -> (Vec<Feature>, Vec<Production>) {
271 let mut productions = Vec::with_capacity(prods_features.len());
272
273 for (idx, (prod, features)) in prods_features.into_iter().enumerate() {
274 productions.push(prod);
275 let prefix = format!("child-{}.", idx);
276 for feature in features.into_iter() {
277 rule_features.push(Feature {
278 path: prefix.clone() + &feature.path,
279 tag: feature.tag,
280 value: feature.value,
281 });
282 }
283 }
284
285 (rule_features, productions)
286}
287
288fn parse_rules(s: &str) -> ParseResult<Vec<Rule>> {
289 let mut rules = Vec::new();
290 let mut rem = s;
291 loop {
292 rem = skip_whitespace(rem);
293 if rem.is_empty() {
294 return Ok((rules, rem));
295 }
296 let (rule, s) = parse_rule(rem)?;
297 rules.push(rule);
298 rem = s;
299 }
300}