Skip to main content

rawk_core/
parser.rs

1use crate::{
2    Lexer, Program,
3    ast::{Action, Expression, Rule, Statement},
4    token::{Token, TokenKind},
5};
6
7#[derive(Debug)]
8pub struct Parser<'a> {
9    lexer: Lexer<'a>,
10    current_token: Token<'a>,
11}
12
13impl<'a> Parser<'a> {
14    pub fn new(mut lexer: Lexer<'a>) -> Self {
15        // Enable regex parsing for the first token since it could be a pattern
16        lexer.set_allow_regex(true);
17        let current_token = lexer.next_token();
18        lexer.set_allow_regex(false);
19
20        Parser {
21            lexer,
22            current_token,
23        }
24    }
25
26    fn next_token(&mut self) {
27        self.next_token_with_regex(false);
28    }
29
30    fn next_token_with_regex(&mut self, allow_regex: bool) {
31        self.lexer.set_allow_regex(allow_regex);
32        self.current_token = self.lexer.next_token();
33        self.lexer.set_allow_regex(false);
34    }
35
36    fn is_eof(&self) -> bool {
37        self.current_token.kind == TokenKind::Eof
38    }
39
40    fn parse_next_rule(&mut self) -> Option<Rule<'a>> {
41        match &self.current_token.kind {
42            TokenKind::Begin => {
43                self.next_token();
44                match self.parse_action() {
45                    Rule::Action(action) => Some(Rule::Begin(action)),
46                    _ => panic!("Expected action after BEGIN"),
47                }
48            }
49            TokenKind::NewLine => {
50                self.next_token_with_regex(true);
51                self.parse_next_rule()
52            }
53            TokenKind::Eof => None,
54            TokenKind::LeftCurlyBrace => Some(self.parse_action()),
55            TokenKind::End => {
56                self.next_token();
57                match self.parse_action() {
58                    Rule::Action(action) => Some(Rule::End(action)),
59                    _ => panic!("Expected action after END"),
60                }
61            }
62            TokenKind::Regex => {
63                let pattern = Some(Expression::Regex(self.current_token.literal));
64                self.next_token();
65                if self.current_token.kind == TokenKind::LeftCurlyBrace {
66                    match self.parse_action() {
67                        Rule::Action(action) => Some(Rule::PatternAction {
68                            pattern,
69                            action: Some(action),
70                        }),
71                        _ => panic!("Expected action after regex pattern"),
72                    }
73                } else {
74                    Some(Rule::PatternAction {
75                        pattern,
76                        action: None,
77                    })
78                }
79            }
80            _ => panic!(
81                "parse_next_rule not yet implemented, found token: {:?}",
82                self.current_token
83            ),
84        }
85    }
86
87    fn parse_action(&mut self) -> Rule<'a> {
88        self.next_token(); // consume '{'
89
90        let pattern = None;
91
92        let mut statements = Vec::new();
93        while self.current_token.kind == TokenKind::NewLine {
94            self.next_token();
95        }
96
97        if self.current_token.kind == TokenKind::Print {
98            let print_statement = self.parse_print_function();
99            statements.push(print_statement);
100        }
101
102        while self.current_token.kind != TokenKind::RightCurlyBrace
103            && self.current_token.kind != TokenKind::Eof
104        {
105            self.next_token();
106        }
107
108        if pattern.is_some() {
109            Rule::PatternAction {
110                pattern,
111                action: Some(Action { statements }),
112            }
113        } else {
114            Rule::Action(Action { statements })
115        }
116    }
117
118    fn parse_print_function(&mut self) -> Statement<'a> {
119        let mut expressions = Vec::new();
120        self.next_token();
121
122        while self.current_token.kind != TokenKind::RightCurlyBrace
123            && self.current_token.kind != TokenKind::Eof
124        {
125            let expression = self.parse_expression();
126            expressions.push(expression);
127        }
128
129        Statement::Print(expressions)
130    }
131
132    fn parse_expression(&mut self) -> Expression<'a> {
133        self.parse_expression_with_min_precedence(0)
134    }
135
136    fn parse_expression_with_min_precedence(&mut self, min_precedence: u8) -> Expression<'a> {
137        let mut left = self.parse_primary_expression();
138
139        loop {
140            let (left_precedence, right_precedence) =
141                match infix_operator_precedence(&self.current_token.kind) {
142                    Some(value) => value,
143                    None => break,
144                };
145
146            if left_precedence < min_precedence {
147                break;
148            }
149
150            let operator = self.current_token.clone();
151            self.next_token();
152            let right = self.parse_expression_with_min_precedence(right_precedence);
153
154            left = Expression::Infix {
155                left: Box::new(left),
156                operator,
157                right: Box::new(right),
158            };
159        }
160
161        left
162    }
163
164    fn parse_primary_expression(&mut self) -> Expression<'a> {
165        match self.current_token.kind {
166            TokenKind::String => {
167                let expression = Expression::String(self.current_token.literal);
168                self.next_token();
169                expression
170            }
171            TokenKind::Number => {
172                let expression = if let Ok(value) = self.current_token.literal.parse::<f64>() {
173                    Expression::Number(value)
174                } else {
175                    todo!()
176                };
177                self.next_token();
178                expression
179            }
180            TokenKind::DollarSign => {
181                self.next_token();
182                let expression = self.parse_primary_expression();
183                Expression::Field(Box::new(expression))
184            }
185            TokenKind::LeftParen => {
186                self.next_token();
187                let expression = self.parse_expression();
188                if self.current_token.kind == TokenKind::RightParen {
189                    self.next_token();
190                }
191                expression
192            }
193            _ => {
194                todo!()
195            }
196        }
197    }
198
199    pub fn parse_program(&mut self) -> Program<'_> {
200        let mut program = Program::new();
201
202        while !self.is_eof() {
203            match self.parse_next_rule() {
204                Some(Rule::Begin(action)) => program.add_begin_block(Rule::Begin(action)),
205                Some(Rule::End(action)) => program.add_end_block(Rule::End(action)),
206                Some(rule) => program.add_rule(rule),
207                None => {}
208            }
209            self.next_token_with_regex(true);
210        }
211
212        program
213    }
214}
215
216fn infix_operator_precedence(kind: &TokenKind) -> Option<(u8, u8)> {
217    match kind {
218        TokenKind::Plus | TokenKind::Minus => Some((1, 2)),
219        TokenKind::Asterisk | TokenKind::Division | TokenKind::Percent => Some((3, 4)),
220        TokenKind::Caret => Some((7, 6)),
221        _ => None,
222    }
223}
224
225#[cfg(test)]
226mod tests {
227    use super::*;
228
229    #[test]
230    fn create_parser() {
231        let mut parser = Parser::new(Lexer::new("42 == 42"));
232
233        assert_eq!(parser.current_token.literal, "42");
234        parser.next_token();
235        assert_eq!(parser.current_token.literal, "==");
236    }
237
238    #[test]
239    fn parse_empty_program() {
240        let mut parser = Parser::new(Lexer::new(""));
241
242        let program = parser.parse_program();
243
244        assert_eq!(program.len(), 0);
245    }
246
247    #[test]
248    fn parse_action_without_pattern() {
249        let mut parser = Parser::new(Lexer::new("{ print }"));
250
251        let program = parser.parse_program();
252
253        assert_eq!(program.len(), 1);
254        assert_eq!("{ print }", program.to_string());
255    }
256
257    #[test]
258    fn parse_action_with_leading_newlines() {
259        let mut parser = Parser::new(Lexer::new("\n\n{ print }"));
260
261        let program = parser.parse_program();
262
263        assert_eq!(program.len(), 1);
264        assert_eq!("{ print }", program.to_string());
265    }
266
267    #[test]
268    fn parse_begin_block() {
269        let mut parser = Parser::new(Lexer::new("BEGIN { print }"));
270
271        let program = parser.parse_program();
272
273        assert_eq!(program.len(), 1);
274        assert_eq!("BEGIN { print }", program.to_string());
275    }
276
277    #[test]
278    fn parse_end_block() {
279        let mut parser = Parser::new(Lexer::new("END { print 42 }"));
280
281        let program = parser.parse_program();
282
283        assert_eq!(program.len(), 1);
284        assert_eq!("END { print 42 }", program.to_string());
285    }
286
287    #[test]
288    fn parse_regex_pattern_action() {
289        let mut parser = Parser::new(Lexer::new("/foo/ { print }"));
290
291        let program = parser.parse_program();
292
293        assert_eq!(program.len(), 1);
294        assert_eq!("/foo/ { print }", program.to_string());
295    }
296
297    #[test]
298    fn parse_print_infix_expression() {
299        let mut parser = Parser::new(Lexer::new("BEGIN { print 1 + 2 }"));
300
301        let program = parser.parse_program();
302        let mut begin_blocks = program.begin_blocks_iter();
303        let rule = begin_blocks.next().expect("expected begin block");
304
305        let statements = match rule {
306            Rule::Begin(Action { statements }) => statements,
307            _ => panic!("expected begin rule"),
308        };
309
310        let exprs = match &statements[0] {
311            Statement::Print(expressions) => expressions,
312        };
313
314        match &exprs[0] {
315            Expression::Infix {
316                left,
317                operator,
318                right,
319            } => {
320                assert!(matches!(**left, Expression::Number(1.0)));
321                assert_eq!(operator.kind, TokenKind::Plus);
322                assert!(matches!(**right, Expression::Number(2.0)));
323            }
324            _ => panic!("expected infix expression"),
325        }
326    }
327
328    #[test]
329    fn parse_print_parenthesized_expression() {
330        let mut parser = Parser::new(Lexer::new("BEGIN { print (1 + 2) * 3 }"));
331
332        let program = parser.parse_program();
333        let mut begin_blocks = program.begin_blocks_iter();
334        let rule = begin_blocks.next().expect("expected begin block");
335
336        let statements = match rule {
337            Rule::Begin(Action { statements }) => statements,
338            _ => panic!("expected begin rule"),
339        };
340
341        let exprs = match &statements[0] {
342            Statement::Print(expressions) => expressions,
343        };
344
345        match &exprs[0] {
346            Expression::Infix {
347                left,
348                operator,
349                right,
350            } => {
351                assert_eq!(operator.kind, TokenKind::Asterisk);
352                assert!(matches!(**right, Expression::Number(3.0)));
353                assert!(matches!(**left, Expression::Infix { .. }));
354            }
355            _ => panic!("expected infix expression"),
356        }
357    }
358
359    #[test]
360    fn parse_print_multiplication_has_higher_precedence_than_addition() {
361        let mut parser = Parser::new(Lexer::new("BEGIN { print 1 + 2 * 3 }"));
362
363        let program = parser.parse_program();
364        let mut begin_blocks = program.begin_blocks_iter();
365        let rule = begin_blocks.next().expect("expected begin block");
366
367        let statements = match rule {
368            Rule::Begin(Action { statements }) => statements,
369            _ => panic!("expected begin rule"),
370        };
371
372        let exprs = match &statements[0] {
373            Statement::Print(expressions) => expressions,
374        };
375
376        match &exprs[0] {
377            Expression::Infix {
378                left,
379                operator,
380                right,
381            } => {
382                assert_eq!(operator.kind, TokenKind::Plus);
383                assert!(matches!(**left, Expression::Number(1.0)));
384                match &**right {
385                    Expression::Infix {
386                        operator: right_op, ..
387                    } => assert_eq!(right_op.kind, TokenKind::Asterisk),
388                    _ => panic!("expected nested infix expression"),
389                }
390            }
391            _ => panic!("expected infix expression"),
392        }
393    }
394
395    #[test]
396    fn parse_print_power_is_right_associative() {
397        let mut parser = Parser::new(Lexer::new("BEGIN { print 2 ^ 3 ^ 2 }"));
398
399        let program = parser.parse_program();
400        let mut begin_blocks = program.begin_blocks_iter();
401        let rule = begin_blocks.next().expect("expected begin block");
402
403        let statements = match rule {
404            Rule::Begin(Action { statements }) => statements,
405            _ => panic!("expected begin rule"),
406        };
407
408        let exprs = match &statements[0] {
409            Statement::Print(expressions) => expressions,
410        };
411
412        match &exprs[0] {
413            Expression::Infix {
414                left,
415                operator,
416                right,
417            } => {
418                assert_eq!(operator.kind, TokenKind::Caret);
419                assert!(matches!(**left, Expression::Number(2.0)));
420                match &**right {
421                    Expression::Infix {
422                        operator: right_op, ..
423                    } => assert_eq!(right_op.kind, TokenKind::Caret),
424                    _ => panic!("expected nested infix expression"),
425                }
426            }
427            _ => panic!("expected infix expression"),
428        }
429    }
430
431    #[test]
432    fn parse_print_minus_is_left_associative() {
433        let mut parser = Parser::new(Lexer::new("BEGIN { print 5 - 3 - 1 }"));
434
435        let program = parser.parse_program();
436        let mut begin_blocks = program.begin_blocks_iter();
437        let rule = begin_blocks.next().expect("expected begin block");
438
439        let statements = match rule {
440            Rule::Begin(Action { statements }) => statements,
441            _ => panic!("expected begin rule"),
442        };
443
444        let exprs = match &statements[0] {
445            Statement::Print(expressions) => expressions,
446        };
447
448        match &exprs[0] {
449            Expression::Infix {
450                left,
451                operator,
452                right,
453            } => {
454                assert_eq!(operator.kind, TokenKind::Minus);
455                match &**left {
456                    Expression::Infix {
457                        operator: left_op, ..
458                    } => assert_eq!(left_op.kind, TokenKind::Minus),
459                    _ => panic!("expected nested infix expression"),
460                }
461                assert!(matches!(**right, Expression::Number(1.0)));
462            }
463            _ => panic!("expected infix expression"),
464        }
465    }
466
467    #[test]
468    fn parse_print_concatenation() {
469        let mut parser = Parser::new(Lexer::new(r#"BEGIN { print "Value:" 42 }"#));
470
471        let program = parser.parse_program();
472        let mut begin_blocks = program.begin_blocks_iter();
473        let rule = begin_blocks.next().expect("expected begin block");
474
475        let statements = match rule {
476            Rule::Begin(Action { statements }) => statements,
477            _ => panic!("expected begin rule"),
478        };
479
480        let exprs = match &statements[0] {
481            Statement::Print(expressions) => expressions,
482        };
483
484        assert_eq!(exprs.len(), 2);
485        assert!(matches!(exprs[0], Expression::String("Value:")));
486        assert!(matches!(exprs[1], Expression::Number(42.0)));
487    }
488
489    #[test]
490    fn parse_print_field_expression() {
491        let mut parser = Parser::new(Lexer::new("{ print $1 }"));
492
493        let program = parser.parse_program();
494        let mut rules = program.rules_iter();
495        let rule = rules.next().expect("expected rule");
496
497        let statements = match rule {
498            Rule::Action(Action { statements }) => statements,
499            _ => panic!("expected action rule"),
500        };
501
502        let exprs = match &statements[0] {
503            Statement::Print(expressions) => expressions,
504        };
505
506        match &exprs[0] {
507            Expression::Field(inner) => assert!(matches!(**inner, Expression::Number(1.0))),
508            _ => panic!("expected field expression"),
509        }
510    }
511}