use regex::Regex;
use std::str::FromStr;
use crate::featurestructure::{Feature, NodeRef};
use crate::rules::{Grammar, Production, Rule};
use crate::utils::Err;
pub const TOP_STR: &str = "**top**";
impl FromStr for Grammar {
type Err = Err;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let (rules, s) = parse_rules(s)?;
assert!(s.is_empty());
if rules.is_empty() {
Err("empty ruleset".into())
} else {
Self::new(rules)
}
}
}
type Infallible<'a, T> = (T, &'a str);
type ParseResult<'a, T> = Result<(T, &'a str), Err>;
macro_rules! regex_static {
($name:ident, $pattern:expr) => {
lazy_static! {
static ref $name: Regex = Regex::new($pattern).unwrap();
}
};
}
fn optional_re<'a>(re: &'static Regex, s: &'a str) -> Infallible<'a, Option<&'a str>> {
if let Some(caps) = re.captures(s) {
let m = caps.get(0).unwrap();
if m.start() > 0 {
return (None, s);
}
let (_, rest) = s.split_at(m.end());
(Some(m.as_str()), rest)
} else {
(None, s)
}
}
fn needed_re<'a>(re: &'static Regex, s: &'a str) -> ParseResult<'a, &'a str> {
if let (Some(c), rest) = optional_re(re, s) {
Ok((c, rest))
} else {
Err(format!("couldn't match {} at {}", re, s).into())
}
}
fn optional_char(c: char, s: &str) -> Infallible<Option<char>> {
let mut iter = s.char_indices().peekable();
if let Some((_, c1)) = iter.next() {
if c == c1 {
let rest = if let Some((idx, _)) = iter.peek() {
s.split_at(*idx).1
} else {
""
};
return (Some(c), rest);
}
}
(None, s)
}
fn needed_char(c: char, s: &str) -> ParseResult<char> {
if let (Some(c), rest) = optional_char(c, s) {
Ok((c, rest))
} else {
Err(format!("couldn't match {} at {}", c, s).into())
}
}
fn skip_whitespace(s: &str) -> &str {
regex_static!(WHITESPACE_OR_COMMENT, r"\s*(//.*?\n\s*)*");
optional_re(&*WHITESPACE_OR_COMMENT, s).1
}
fn skip_whitespace_nonnewline(s: &str) -> &str {
regex_static!(WHITESPACE_NONNEWLINE, r"[\s&&[^\n]]*");
optional_re(&*WHITESPACE_NONNEWLINE, s).1
}
fn parse_name(s: &str) -> ParseResult<&str> {
regex_static!(NAME, r"[a-zA-Z0-9\-_]+");
needed_re(&*NAME, s).map_err(|err| format!("name: {}", err).into())
}
fn parse_dotted(s: &str) -> ParseResult<&str> {
regex_static!(DOTTED, r"[a-zA-Z0-9\-_]+(\.[a-zA-Z0-9\-_]+)*");
needed_re(&*DOTTED, s).map_err(|e| format!("dotted name: {}", e).into())
}
fn parse_tag(s: &str) -> ParseResult<Option<String>> {
let (hash, s) = optional_char('#', s);
if hash.is_none() {
Ok((None, s))
} else {
let s = skip_whitespace(s);
let (name, s) = parse_name(s).map_err(|e| -> Err { format!("tag: {}", e).into() })?;
Ok((Some(name.to_string()), s))
}
}
fn parse_feature_value(s: &str) -> ParseResult<(Option<String>, NodeRef)> {
regex_static!(VALUE, r"[a-zA-Z0-9\-_\*]+");
let (tag, s) = parse_tag(s)?;
let s = skip_whitespace(s);
let (name, s) = optional_re(&*VALUE, s);
let value = if let Some(name) = name {
if name == TOP_STR {
NodeRef::new_top()
} else {
NodeRef::new_str(name.to_string())
}
} else if tag.is_some() {
NodeRef::new_top()
} else {
return Err(format!("feature needs tag or value at {}", s).into());
};
Ok(((tag, value), s))
}
fn parse_feature(s: &str) -> ParseResult<Feature> {
let (name, s) = parse_dotted(s).map_err(|e| format!("feature name: {}", e))?;
let s = skip_whitespace(s);
let (_, s) = needed_char(':', s)?;
let s = skip_whitespace(s);
let (value, s) = parse_feature_value(s).map_err(|e| format!("feature value: {}", e))?;
let s = skip_whitespace(s);
let (_, s) = optional_char(',', s);
Ok((
Feature {
path: name.to_string(),
tag: value.0,
value: value.1,
},
s,
))
}
fn parse_featurestructure(s: &str) -> ParseResult<Vec<Feature>> {
let mut pairs = Vec::new();
let mut rem = needed_char('[', s)?.1;
loop {
rem = skip_whitespace(rem);
if let (Some(_), rem) = optional_char(']', rem) {
return Ok((pairs, rem));
}
let (feature, s) = parse_feature(rem)?;
pairs.push(feature);
rem = s;
}
}
fn parse_production(s: &str) -> ParseResult<(Production, Vec<Feature>)> {
let (name, s) = parse_name(s).map_err(|e| -> Err { format!("symbol: {}", e).into() })?;
let s = skip_whitespace_nonnewline(s);
let (features, s) = if s.starts_with('[') {
parse_featurestructure(s)?
} else {
(Vec::new(), s)
};
if name.chars().next().unwrap().is_uppercase() {
Ok(((Production::new_nonterminal(name.to_string()), features), s))
} else {
if !features.is_empty() {
Err(format!("terminal (lower-case) cannot have features: {} {}", name, s).into())
} else {
Ok((
(
Production::new_terminal(name.to_string()),
vec![Feature {
path: "word".to_string(),
tag: None,
value: NodeRef::new_str(name.to_string()),
}],
),
s,
))
}
}
}
fn parse_nonterminal(s: &str) -> ParseResult<(String, Vec<Feature>)> {
let ((prod, features), s) = parse_production(s)?;
if prod.is_nonterminal() {
Ok(((prod.symbol, features), s))
} else {
Err(format!("expected nonterminal, got terminal {}: {}", prod.symbol, s).into())
}
}
fn parse_rule(s: &str) -> ParseResult<Rule> {
#![allow(clippy::trivial_regex)]
regex_static!(ARROW, "->");
let ((symbol, features), s) =
parse_nonterminal(s).map_err(|e| -> Err { format!("rule symbol: {}", e).into() })?;
let s = skip_whitespace(s);
let (_, s) = needed_re(&*ARROW, s).map_err(|e| -> Err { format!("rule arrow: {}", e).into() })?;
let mut prods_features = Vec::new();
let mut rem = s;
loop {
rem = skip_whitespace_nonnewline(rem);
let try_newline = skip_whitespace(rem);
if rem.is_empty() || try_newline != rem {
rem = try_newline;
break;
}
let (prod, s) =
parse_production(rem).map_err(|e| -> Err { format!("rule production: {}", e).into() })?;
prods_features.push(prod);
rem = s;
}
let (features, productions) = adopt_child_features(features, prods_features);
let features = NodeRef::new_from_paths(features)?;
Ok((
Rule {
symbol,
features,
productions,
},
rem,
))
}
fn adopt_child_features(
mut rule_features: Vec<Feature>,
prods_features: Vec<(Production, Vec<Feature>)>,
) -> (Vec<Feature>, Vec<Production>) {
let mut productions = Vec::with_capacity(prods_features.len());
for (idx, (prod, features)) in prods_features.into_iter().enumerate() {
productions.push(prod);
let prefix = format!("child-{}.", idx);
for feature in features.into_iter() {
rule_features.push(Feature {
path: prefix.clone() + &feature.path,
tag: feature.tag,
value: feature.value,
});
}
}
(rule_features, productions)
}
fn parse_rules(s: &str) -> ParseResult<Vec<Rule>> {
let mut rules = Vec::new();
let mut rem = s;
loop {
rem = skip_whitespace(rem);
if rem.is_empty() {
return Ok((rules, rem));
}
let (rule, s) = parse_rule(rem)?;
rules.push(rule);
rem = s;
}
}