regex_dfa_gen/
ast.rs

1//! Abstract Syntax Tree for Regex.
2//! 
3//! get AstNode by:
4//! 
5//! ```
6//! use regex_dfa_gen::ast::AstNode;
7//! let ast : AstNode = r"12".parse::<AstNode>().unwrap();
8//! ```
9//! 
10
11use std::str::FromStr;
12use crate::set::*;
13#[derive(Debug, Clone, Eq, PartialEq)]
14pub enum AstNode {
15    Char(CharRange),
16    Options(Vec<AstNode>),
17    Multiple(Box<AstNode>),
18    EmptyOr(Box<AstNode>),
19    MultipleNonGreedy(Box<AstNode>),
20    Concat(Vec<AstNode>),
21    // NamedConcat(Vec<AstNode>, String),
22}
23use thiserror::Error as ThisError;
24
25#[derive(Debug, ThisError)]
26pub enum Error {
27    #[error("missing expression at {0}")]
28    MissingExpresion(usize),
29    #[error("missing first expr at {0}")]
30    MissingFirstExpr(usize),
31    #[error("'^' is unusable at {0}")]
32    ExceptNotUsable(usize),
33    #[error("unmatched char '{1}' at {0}")]
34    UnmatchedChar(usize, char),
35    #[error("unexpect end at {0}")]
36    UnexpectedEnd(usize),
37    #[error("unexpected char '{1}' at {0}")]
38    UnexpectedChar(usize, char),
39    #[error("found an empty string")]
40    EmptyString,
41}
42
43// impl std::error::Error for Error {}
44
45// impl std::fmt::Display for Error {
46//     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
47//         match self {
48//             Error::MissingExpresion(pos) => { write!(f, , pos) }
49//             Error::MissingFirstExpr(pos) => { write!(f, "missing first expr at {}", pos) }
50//             Error::ExceptNotUsable(pos) => { write!(f, "'^' is unusable at {}", pos) }
51//             Error::UnmatchedChar(pos, ch) => { write!(f, "unmatched char '{}' at {}", ch, pos) }
52//             Error::UnexpectedEnd(pos) => { write!(f, "unexpect end at {}", pos) }
53//             Error::UnexpectedChar(pos, ch) => { write!(f, "unexpected char '{}' at {}", ch, pos) }
54//             Error::EmptyString => { write!(f, "found an empty string") }
55//         }
56//     }
57// }
58
59pub type Result<T> = std::result::Result<T,Error>;
60pub trait CharStream: Iterator<Item=char> {}
61
62
63/// LL1 Parser for Regex
64/// 
65/// ```c
66/// Tree -> Option '|' ... '|' Option
67/// Option -> Element ... Element
68/// Element -> '(' Tree ')' | char | [char*] | '^'Element | Element'*'
69/// ```
70struct Parser<Iter : CharStream> {
71    first : char,
72    iter : Iter,
73    pos: usize,
74}
75
76
77impl<Iter : CharStream> Parser<Iter> {
78    /// create a new ll1 parser.
79    pub fn new(mut iter: Iter) -> Result<Self> {
80        Ok(Self {
81            first: iter.next().ok_or(Error::EmptyString)?,
82            iter,
83            pos: 0,
84        })
85    }
86
87    fn next(&mut self) -> char {
88        self.first = self.iter.next().unwrap_or('\0');
89        self.pos += 1;
90        self.first
91    }
92
93    fn next_matches(&mut self, _c : char) {
94        self.first = self.iter.next().unwrap_or('\0');
95        self.pos += 1;
96    }
97    /// read a tree in parser.(`Tree -> Option '|' ... '|' Option`)
98    /// 
99    /// `inside` mark whether it's inside a parentheses.
100    pub fn parse_tree(&mut self, inside : bool) -> Result<AstNode> {
101        let mut ret = Vec::<AstNode>::new();
102        loop{
103            let op = self.parse_option()?;
104            ret.push(op);
105
106            // FOLLOW(Tree) = '(' '\0'
107            if self.first == ')' || self.first == '\0' {
108                if self.first == ')' && !inside {
109                    return Err(Error::UnmatchedChar(self.pos, ')'));
110                }
111                break;
112            }
113
114            self.next_matches('|');
115        }
116
117        if ret.len() == 0 {
118            return Err(Error::MissingExpresion(self.pos));
119        }
120
121        if ret.len() == 1 {
122            Ok(ret.pop().unwrap())
123        } else {
124            Ok(AstNode::Options(ret))
125        }
126    }
127
128    /// read an option in parser.(`Option -> Element ... Element`)
129    pub fn parse_option(&mut self) -> Result<AstNode> {
130
131        let mut ret = Vec::<AstNode>::new();
132        loop {
133            let element = self.parse_element()?;
134            ret.push(element);
135            // FOLLOW(Option) = '|' '\0'
136            if self.first == '|' || self.first == '\0' || self.first == ')' {
137                break;
138            }
139
140            // no self.next()
141        }
142
143        if ret.len() == 0 {
144            return Err(Error::MissingExpresion(self.pos));
145        }
146
147        if ret.len() == 1 {
148            Ok(ret.pop().unwrap())
149        } else  {
150            Ok(AstNode::Concat(ret))
151        }
152    }
153
154    /// read an elemnt in parser.(`Element -> '(' Tree ')' | char | [char*] | '^'Element | Element'*'`)
155    pub fn parse_element(&mut self) -> Result<AstNode> {
156
157        let mut is_except = false;
158        if self.first == '^'{
159            is_except = true;
160            self.next_matches('^');
161        }
162
163        let mut ret = match self.first {
164            '(' => {
165                if is_except {
166                    return Err(Error::ExceptNotUsable(self.pos));
167                }
168                self.next_matches('('); // parse_tree known nothings about this '(' ')'
169                let ret = self.parse_tree(true)?;
170                self.next_matches(')');
171                ret
172            },
173            '[' => {
174                // parse_charset know about these '[' ']'
175                let ret = self.parse_charset(is_except)?;
176                ret
177            },
178            '.' => {
179                if is_except {
180                    return Err(Error::ExceptNotUsable(self.pos));
181                }
182                self.next_matches('.');
183                AstNode::Char(CHAR_MIN..CHAR_MAX)
184            }
185            '\0' => { return Err(Error::UnexpectedEnd(self.pos));}
186            ')' | ']' | '|' | '*' | '+' | '?'  => {
187                return Err(Error::UnexpectedChar(self.pos, self.first));
188            }
189            '\\' => {
190                let c = self.next();
191                self.next();
192                let c = match c {
193                    'n' => '\n',
194                    't' => '\t',
195                    'r' => '\r',
196                    _ => c
197                };
198                AstNode::Char(c..add1(c))
199            }
200            c => {
201                self.next();
202                AstNode::Char(c..add1(c))
203            }
204        };
205
206        if self.first == '*' {
207            self.next_matches('*');
208            if self.first == '?' {
209                self.next_matches('?');
210                ret = AstNode::MultipleNonGreedy(Box::new(ret));
211            }else {
212                ret = AstNode::Multiple(Box::new(ret));
213            }
214        }
215        if self.first == '+' {
216            self.next_matches('+');
217            ret = AstNode::Concat(vec![
218                ret.clone(),
219                AstNode::Multiple(Box::new(ret))
220            ])
221        }
222        if self.first == '?' {
223            self.next_matches('?');
224            ret = AstNode::EmptyOr(Box::new(ret));
225        }
226        Ok(ret)
227    }
228
229    fn parse_charset(&mut self, is_except: bool) -> Result<AstNode> {
230        self.next_matches('[');
231        let mut ret = Vec::<CharRange>::new();
232        loop {
233            match self.first {
234                ']' => { 
235                    self.next_matches(']');
236                    break;
237                },
238                '-' => {
239                    if let Some(range) = ret.pop() {
240                        if add1(range.start) == range.end {
241                            self.next();
242                            ret.push(range.start..add1(self.first));
243                        } else {
244                            return Err(Error::MissingFirstExpr(self.pos));
245                        }
246                    } else {
247                        return Err(Error::MissingFirstExpr(self.pos));
248                    }
249                }
250                c => { ret.push(c..add1(c)) },
251            }
252            self.next();
253        }
254        
255        if ret.len() == 0{
256            return Err(Error::MissingExpresion(self.pos));
257        }
258
259        if is_except {
260            if ret.len() == 1 {
261                let s = ret.pop().unwrap();
262                return Ok(AstNode::Options(vec![
263                    AstNode::Char(
264                        CHAR_MIN..s.start
265                    ),
266                    AstNode::Char(
267                        s.end..CHAR_MAX
268                    ),
269                ]))
270            }
271            panic!("not implemented!");
272        }
273
274        if ret.len() == 1 {
275            Ok(AstNode::Char(ret.pop().unwrap()))
276        } else {
277            Ok(AstNode::Options(
278                ret.iter().map(|c| AstNode::Char(c.clone())).collect()
279            ))
280        }
281    }
282}
283
284impl CharStream for core::str::Chars<'_> {}
285
286impl FromStr for AstNode {
287    type Err = Error;
288    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
289        let iter = s.chars();
290        Parser::new(iter)?.parse_tree(false)
291    }
292}
293
294
295#[cfg(test)]
296mod test {
297    use super::*;
298    use std::assert;
299    use AstNode::*;
300
301    fn charnode(c : char) -> AstNode { Char(c..add1(c)) }
302    fn charrange(c : char, d : char) -> AstNode {
303        Char(c..add1(d))
304    }
305
306    fn multi(n : AstNode) -> AstNode { Multiple(Box::new(n)) }
307    fn multi_non_greedy(n : AstNode) -> AstNode { MultipleNonGreedy(Box::new(n)) }
308
309
310
311    #[test]
312    fn basics() {
313        let ast : AstNode = r"12".parse::<AstNode>().unwrap();
314        assert_eq!(
315            ast, Concat(vec![
316                charnode('1'),
317                charnode('2'),
318            ])
319        );
320
321        let ast : AstNode = r"1|2".parse::<AstNode>().unwrap();
322        assert_eq!(
323            ast , Options(vec![
324                charnode('1'),
325                charnode('2'),
326            ])
327        );
328
329
330        let ast : AstNode = r"1|2*3(5|4)*".parse::<AstNode>().unwrap();
331        assert_eq!(
332            ast , Options(vec![
333                charnode('1'),
334                Concat(vec![
335                    multi(charnode('2')),
336                    charnode('3'),
337                    multi(
338                        Options(vec![
339                            charnode('5'),
340                            charnode('4'),
341                        ])
342                    ),
343                ])
344            ])
345        );
346
347        let ast = r"1([1-9][1-9])*?".parse::<AstNode>().unwrap();
348        assert_eq!(
349            ast , Concat(vec![
350                charnode('1'),
351                multi_non_greedy(
352                    Concat(vec![
353                        charrange('1', '9'),
354                        charrange('1', '9'),
355                    ])
356                )
357            ])
358        );
359
360        let ast2 = r"1(([1-9]([1-9])))*?".parse::<AstNode>().unwrap();
361        assert!(ast2 == ast);
362    }
363}