spindle_lib/
grammar.rs

1use crate::error::ErrorRepr;
2use crate::Error;
3use crate::{ir, regex::Regex, reserved::*, Visitor};
4
5use arbitrary::{Arbitrary, Unstructured};
6use std::{collections::HashSet, fmt, str::FromStr};
7
8/// A state machine that produces `arbitrary` matching expressions or byte sequences from `Unstructured`.
9///
10/// # Implementation
11/// ## Construction
12/// `Grammar` is constructed using `from_str` of a string that specifies the syntax of expressions:
13/// - A peg parser converts the string into an "intermediary representation" AST (in ir.rs).
14/// - The IR is validated for duplicate and conflicting rule definitions
15/// - The IR is converted to a state machine in which regex is parsed and rule definitions are indexed.
16///
17/// ## Expression Generation
18/// `Unstructured` is used to generate `arbitrary` choices and loops to traverse the state machine.
19/// When a "leaf" state is reached, e.g. a pre-defined rule, regex, or string literal, `Unstructured`
20/// is used to write arbitrary data (except for string literals) to an output buffer.
21#[derive(Debug)]
22pub struct Grammar(Vec<Expr>);
23
24impl Grammar {
25    /// Returns a resulting `Visitor` after an arbitirary state machine traversal.
26    ///
27    /// The state machine traversal starts at the first rule in the grammar.
28    pub fn expression<V: Visitor>(
29        &self,
30        u: &mut Unstructured<'_>,
31        max_depth: Option<usize>,
32    ) -> arbitrary::Result<V> {
33        // Maybe there's a smarter way to pre-allocate the buffer,
34        // like predicting or pre-computing arbitrary lengths (but tracking
35        // that requires another data structure).
36        //
37        // It's simple and efficient enough to let 2x Vec growth do it's thing.
38        let mut visitor = V::new();
39        let mut to_write = vec![(&self.0[0], 0)]; // always start at first rule
40
41        while let Some((expr, depth)) = to_write.pop() {
42            if depth > max_depth.unwrap_or(usize::MAX) {
43                return Err(arbitrary::Error::IncorrectFormat);
44            }
45
46            match expr {
47                Expr::Or(v) => {
48                    let arb_index = u.int_in_range(0..=(v.len() - 1))?;
49                    to_write.push((&v[arb_index], depth + 1));
50                    visitor.visit_or(arb_index);
51                }
52                Expr::Concat(v) => {
53                    to_write.extend(v.iter().map(|x| (x, depth + 1)));
54                    visitor.visit_concat();
55                }
56                Expr::Optional(x) => {
57                    let b = bool::arbitrary(u)?;
58                    if b {
59                        to_write.push((x, depth + 1));
60                    }
61                    visitor.visit_optional(b);
62                }
63                Expr::Repetition(x, min_rep) => {
64                    let mut reps = 0;
65                    u.arbitrary_loop(Some(*min_rep), Some(10), |_| {
66                        to_write.push((x, depth + 1));
67                        reps += 1;
68                        Ok(std::ops::ControlFlow::Continue(()))
69                    })?;
70                    visitor.visit_repetition(reps);
71                }
72                Expr::Reference(index) => {
73                    to_write.push((&self.0[*index], depth + 1));
74                    visitor.visit_reference(*index);
75                }
76                Expr::Literal(s) => visitor.visit_literal(s.as_str()),
77                Expr::Bytes(b) => visitor.visit_bytes(b),
78                Expr::Regex(regex) => regex.visit(&mut visitor, u)?,
79                Expr::Group(x) => to_write.push((x, depth + 1)),
80                Expr::Predefined(v) => v.visit(&mut visitor, u)?,
81            }
82        }
83        Ok(visitor)
84    }
85}
86
87/// Pretty prints the state machine.
88///
89/// It's helpful to check if the compiled state machine matches
90/// what is expected from the the un-parsed grammar
91/// (the printed rules are more verbose and order of operations is clearer)
92impl fmt::Display for Grammar {
93    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
94        for (i, rule) in self.0.iter().enumerate() {
95            writeln!(f, "{}: {}", i, rule)?;
96        }
97        Ok(())
98    }
99}
100
101impl FromStr for Grammar {
102    type Err = Error;
103
104    fn from_str(s: &str) -> Result<Self, Self::Err> {
105        let parsed_ir = ir::bnf::expr(s).map_err(|e| Error(ErrorRepr::Grammar(e)))?;
106        Self::try_from(parsed_ir)
107    }
108}
109
110impl TryFrom<Vec<(String, ir::Expr)>> for Grammar {
111    type Error = Error;
112
113    fn try_from(value: Vec<(String, ir::Expr)>) -> Result<Self, Self::Error> {
114        let (mut names, ir_exprs): (Vec<String>, Vec<ir::Expr>) = value.into_iter().unzip();
115
116        if let Some(dups) = find_duplicates(&names) {
117            return Err(Error(ErrorRepr::DuplicateVars(dups)));
118        }
119
120        if let Some(res) = filter_reserved_keywords(&names) {
121            return Err(Error(ErrorRepr::Reserved(res)));
122        }
123
124        names.extend(Predefined::all().map(|s| s.as_str().to_string()));
125
126        let mut exprs = ir_exprs
127            .into_iter()
128            .map(|ir_expr| Expr::try_new(ir_expr, &names))
129            .collect::<Result<Vec<Expr>, _>>()?;
130
131        exprs.extend(Predefined::all().map(Expr::Predefined));
132
133        assert_eq!(names.len(), exprs.len());
134        Ok(Self(exprs))
135    }
136}
137
138fn find_duplicates(names: &[String]) -> Option<HashSet<String>> {
139    let mut set: HashSet<String> = names.iter().cloned().collect();
140    let dups: HashSet<String> = names.iter().filter(|&n| !set.remove(n)).cloned().collect();
141    (!dups.is_empty()).then_some(dups)
142}
143
144#[derive(Debug)]
145enum Expr {
146    Or(Vec<Expr>),
147    Concat(Vec<Expr>),
148    Optional(Box<Expr>),
149    Repetition(Box<Expr>, u32),
150    Reference(usize),
151    Literal(String),
152    Regex(Regex),
153    Bytes(Vec<u8>),
154    Group(Box<Expr>),
155    Predefined(Predefined),
156}
157
158fn fmt_w_name<'a>(
159    name: &'static str,
160    x: impl Iterator<Item = &'a Expr>,
161    f: &mut fmt::Formatter<'_>,
162) -> Result<(), fmt::Error> {
163    write!(
164        f,
165        "{}({})",
166        name,
167        x.into_iter()
168            .map(|x| x.to_string())
169            .collect::<Vec<String>>()
170            .join(", ")
171    )
172}
173
174impl fmt::Display for Expr {
175    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
176        match self {
177            Self::Or(x) => fmt_w_name("or", x.iter(), f)?,
178            Self::Concat(x) => fmt_w_name("concat", x.iter().rev(), f)?,
179            Self::Optional(x) => write!(f, "option({})", x)?,
180            Self::Repetition(x, _) => write!(f, "repeat({})", x)?,
181            Self::Reference(index) => write!(f, "{}", index)?,
182            Self::Literal(l) => write!(f, "{:?}", l)?,
183            Self::Regex(_) => write!(f, "regex")?, // TODO: no way to pretty print regex
184            Self::Bytes(b) => write!(f, "{:?}", b)?,
185            Self::Group(x) => write!(f, "({})", x)?,
186            Self::Predefined(p) => write!(f, "{}", p.as_str())?,
187        }
188        Ok(())
189    }
190}
191
192impl Expr {
193    /// Converts the intermediary representation into a "compiled" state machine.
194    ///
195    /// Checks done:
196    /// - parses regex
197    /// - Converts rule references to rule indexes
198    fn try_new(ir_expr: ir::Expr, names: &[String]) -> Result<Self, Error> {
199        Ok(match ir_expr {
200            ir::Expr::Or(x) => Self::Or(
201                x.into_iter()
202                    .map(|e| Self::try_new(e, names))
203                    .collect::<Result<Vec<_>, _>>()?,
204            ),
205            ir::Expr::Concat(x) => {
206                let mut y = x
207                    .into_iter()
208                    .map(|e| Self::try_new(e, names))
209                    .collect::<Result<Vec<_>, _>>()?;
210                y.reverse(); // reverse so that `expression` can use a stack
211                Self::Concat(y)
212            }
213            ir::Expr::Optional(x) => Self::Optional(Box::new(Self::try_new(*x, names)?)),
214            ir::Expr::Repetition(x, min) => {
215                Self::Repetition(Box::new(Self::try_new(*x, names)?), min)
216            }
217            ir::Expr::Group(x) => Self::Group(Box::new(Self::try_new(*x, names)?)),
218            ir::Expr::Reference(name) => match names.iter().position(|n| *n == name) {
219                Some(i) => Self::Reference(i),
220                None => return Err(Error(ErrorRepr::UnkownVar(name))),
221            },
222            ir::Expr::Literal(x) => Self::Literal(x),
223            ir::Expr::Regex(r) => {
224                Self::Regex(Regex::compile(&r, 10).map_err(|e| Error(ErrorRepr::Regex(e)))?)
225            }
226            ir::Expr::Bytes(x) => Self::Bytes(x),
227        })
228    }
229}
230
231#[cfg(test)]
232mod tests {
233    use super::*;
234
235    #[test]
236    fn catches_duplicates() {
237        for x in [
238            r#"x: y; x: z;"#,
239            r#"x: y; x: x;"#,
240            r#"ok: x; x: y; y: x; x: x;"#,
241        ] {
242            let result: Error = x.parse::<Grammar>().unwrap_err();
243            assert_eq!(
244                result,
245                Error(ErrorRepr::DuplicateVars(["x".into()].into_iter().collect()))
246            )
247        }
248
249        for x in [
250            r#"x: y; x: z; y: x; y: y;"#,
251            r#"x: x; y: x; y: y; x: y; x: z;"#,
252        ] {
253            let result: Error = x.parse::<Grammar>().unwrap_err();
254            assert_eq!(
255                result,
256                Error(ErrorRepr::DuplicateVars(
257                    ["x".into(), "y".into()].into_iter().collect()
258                ))
259            )
260        }
261    }
262
263    #[test]
264    fn rejects_reserved() {
265        for x in [r#"u16: u16 ;"#, r#"u16: [8, 8]  ;"#] {
266            let result: Error = x.parse::<Grammar>().unwrap_err();
267            assert_eq!(
268                result,
269                Error(ErrorRepr::Reserved(["u16".into()].into_iter().collect()))
270            )
271        }
272
273        for x in [r#"String: u16 ;"#, r#"String: [8, 8]  ;"#] {
274            let result: Error = x.parse::<Grammar>().unwrap_err();
275            assert_eq!(
276                result,
277                Error(ErrorRepr::Reserved(["String".into()].into_iter().collect()))
278            )
279        }
280    }
281}