Skip to main content

rawk_core/
parser.rs

1use crate::{
2    Lexer, Program,
3    ast::{Action, Expression, Rule, Statement},
4    token::{Token, TokenKind},
5};
6
7#[derive(Debug)]
8pub struct Parser<'a> {
9    lexer: Lexer<'a>,
10    current_token: Token<'a>,
11}
12
13enum CommaBehavior {
14    InsertSpaceExpression,
15    SkipComma,
16}
17
18enum StatementEnd {
19    RightCurlyBrace,
20    NewLine,
21    Semicolon,
22    Eof,
23}
24
25impl<'a> Parser<'a> {
26    pub fn new(mut lexer: Lexer<'a>) -> Self {
27        // Enable regex parsing for the first token since it could be a pattern
28        lexer.set_allow_regex(true);
29        let current_token = lexer.next_token();
30        lexer.set_allow_regex(false);
31
32        Parser {
33            lexer,
34            current_token,
35        }
36    }
37
38    fn next_token(&mut self) {
39        self.next_token_with_regex(false);
40    }
41
42    fn next_token_with_regex(&mut self, allow_regex: bool) {
43        self.lexer.set_allow_regex(allow_regex);
44        self.current_token = self.lexer.next_token();
45        self.lexer.set_allow_regex(false);
46    }
47
48    fn is_eof(&self) -> bool {
49        self.current_token.kind == TokenKind::Eof
50    }
51
52    fn parse_next_rule(&mut self) -> Option<Rule<'a>> {
53        match &self.current_token.kind {
54            TokenKind::Begin => {
55                self.next_token();
56                match self.parse_action() {
57                    Rule::Action(action) => Some(Rule::Begin(action)),
58                    _ => panic!("Expected action after BEGIN"),
59                }
60            }
61            TokenKind::NewLine => {
62                self.next_token_with_regex(true);
63                self.parse_next_rule()
64            }
65            TokenKind::Eof => None,
66            TokenKind::LeftCurlyBrace => Some(self.parse_action()),
67            TokenKind::End => {
68                self.next_token();
69                match self.parse_action() {
70                    Rule::Action(action) => Some(Rule::End(action)),
71                    _ => panic!("Expected action after END"),
72                }
73            }
74            TokenKind::Regex
75            | TokenKind::String
76            | TokenKind::Number
77            | TokenKind::DollarSign
78            | TokenKind::LeftParen
79            | TokenKind::Identifier => self.parse_pattern_rule(),
80            _ => panic!(
81                "parse_next_rule not yet implemented, found token: {:?}",
82                self.current_token
83            ),
84        }
85    }
86
87    fn parse_pattern_rule(&mut self) -> Option<Rule<'a>> {
88        let pattern = Some(self.parse_expression());
89
90        if self.current_token.kind == TokenKind::LeftCurlyBrace {
91            match self.parse_action() {
92                Rule::Action(action) => Some(Rule::PatternAction {
93                    pattern,
94                    action: Some(action),
95                }),
96                _ => panic!("Expected action after pattern"),
97            }
98        } else {
99            Some(Rule::PatternAction {
100                pattern,
101                action: None,
102            })
103        }
104    }
105
106    fn parse_action(&mut self) -> Rule<'a> {
107        self.next_token(); // consume '{'
108
109        let pattern = None;
110
111        let mut statements = Vec::new();
112        while self.current_token.kind != TokenKind::RightCurlyBrace
113            && self.current_token.kind != TokenKind::Eof
114        {
115            while self.current_token.kind == TokenKind::NewLine
116                || self.current_token.kind == TokenKind::Semicolon
117            {
118                self.next_token();
119            }
120
121            if self.current_token.kind == TokenKind::RightCurlyBrace
122                || self.current_token.kind == TokenKind::Eof
123            {
124                break;
125            }
126
127            statements.push(self.parse_statement());
128        }
129
130        if pattern.is_some() {
131            Rule::PatternAction {
132                pattern,
133                action: Some(Action { statements }),
134            }
135        } else {
136            Rule::Action(Action { statements })
137        }
138    }
139
140    fn parse_statement(&mut self) -> Statement<'a> {
141        match self.current_token.kind {
142            TokenKind::Print => self.parse_print_function(),
143            TokenKind::Printf => self.parse_printf_function(),
144            TokenKind::Identifier => self.parse_assignment_statement(),
145            _ => todo!(),
146        }
147    }
148
149    fn parse_assignment_statement(&mut self) -> Statement<'a> {
150        let identifier = self.current_token.literal;
151        self.next_token();
152        if self.current_token.kind != TokenKind::Assign {
153            todo!()
154        }
155        self.next_token();
156        let value = self.parse_expression();
157
158        Statement::Assignment { identifier, value }
159    }
160
161    fn parse_print_function(&mut self) -> Statement<'a> {
162        let expressions = self.parse_expression_list_until_action_end(CommaBehavior::InsertSpaceExpression);
163
164        Statement::Print(expressions)
165    }
166
167    fn parse_printf_function(&mut self) -> Statement<'a> {
168        let expressions = self.parse_expression_list_until_action_end(CommaBehavior::SkipComma);
169
170        Statement::Printf(expressions)
171    }
172
173    fn parse_expression_list_until_action_end(
174        &mut self,
175        comma_behavior: CommaBehavior,
176    ) -> Vec<Expression<'a>> {
177        let mut expressions = Vec::new();
178        self.next_token();
179
180        while !self.is_statement_end() {
181            if self.current_token.kind == TokenKind::Comma {
182                self.next_token();
183                if let CommaBehavior::InsertSpaceExpression = comma_behavior {
184                    expressions.push(Expression::String(" "));
185                }
186            } else {
187                let expression = self.parse_expression();
188                expressions.push(expression);
189            }
190        }
191
192        expressions
193    }
194
195    fn is_statement_end(&self) -> bool {
196        matches!(
197            statement_end_kind(&self.current_token.kind),
198            Some(StatementEnd::RightCurlyBrace)
199                | Some(StatementEnd::NewLine)
200                | Some(StatementEnd::Semicolon)
201                | Some(StatementEnd::Eof)
202        )
203    }
204
205    fn parse_expression(&mut self) -> Expression<'a> {
206        self.parse_expression_with_min_precedence(0)
207    }
208
209    fn parse_expression_with_min_precedence(&mut self, min_precedence: u8) -> Expression<'a> {
210        let mut left = self.parse_primary_expression();
211
212        loop {
213            let (left_precedence, right_precedence) =
214                match infix_operator_precedence(&self.current_token.kind) {
215                    Some(value) => value,
216                    None => break,
217                };
218
219            if left_precedence < min_precedence {
220                break;
221            }
222
223            let operator = self.current_token.clone();
224            if matches!(operator.kind, TokenKind::Tilde | TokenKind::NoMatch) {
225                self.next_token_with_regex(true);
226            } else {
227                self.next_token();
228            }
229            let right = self.parse_expression_with_min_precedence(right_precedence);
230
231            left = Expression::Infix {
232                left: Box::new(left),
233                operator,
234                right: Box::new(right),
235            };
236        }
237
238        left
239    }
240
241    fn parse_primary_expression(&mut self) -> Expression<'a> {
242        match self.current_token.kind {
243            TokenKind::String => {
244                let expression = Expression::String(self.current_token.literal);
245                self.next_token();
246                expression
247            }
248            TokenKind::Regex => {
249                let expression = Expression::Regex(self.current_token.literal);
250                self.next_token();
251                expression
252            }
253            TokenKind::Number => {
254                let expression = if let Ok(value) = self.current_token.literal.parse::<f64>() {
255                    Expression::Number(value)
256                } else {
257                    todo!()
258                };
259                self.next_token();
260                expression
261            }
262            TokenKind::DollarSign => {
263                self.next_token();
264                let expression = self.parse_primary_expression();
265                Expression::Field(Box::new(expression))
266            }
267            TokenKind::LeftParen => {
268                self.next_token();
269                let expression = self.parse_expression();
270                if self.current_token.kind == TokenKind::RightParen {
271                    self.next_token();
272                }
273                expression
274            }
275            TokenKind::Identifier => {
276                let identifier = self.current_token.literal;
277                self.next_token();
278                Expression::Identifier(identifier)
279            }
280            _ => {
281                todo!()
282            }
283        }
284    }
285
286    pub fn parse_program(&mut self) -> Program<'_> {
287        let mut program = Program::new();
288
289        while !self.is_eof() {
290            match self.parse_next_rule() {
291                Some(Rule::Begin(action)) => program.add_begin_block(Rule::Begin(action)),
292                Some(Rule::End(action)) => program.add_end_block(Rule::End(action)),
293                Some(rule) => program.add_rule(rule),
294                None => {}
295            }
296            self.next_token_with_regex(true);
297        }
298
299        program
300    }
301}
302
303fn infix_operator_precedence(kind: &TokenKind) -> Option<(u8, u8)> {
304    match kind {
305        TokenKind::Equal
306        | TokenKind::NotEqual
307        | TokenKind::GreaterThan
308        | TokenKind::GreaterThanOrEqual
309        | TokenKind::LessThan
310        | TokenKind::LessThanOrEqual
311        | TokenKind::Tilde
312        | TokenKind::NoMatch => Some((1, 2)),
313        TokenKind::Plus | TokenKind::Minus => Some((3, 4)),
314        TokenKind::Asterisk | TokenKind::Division | TokenKind::Percent => Some((5, 6)),
315        TokenKind::Caret => Some((9, 8)),
316        _ => None,
317    }
318}
319
320fn statement_end_kind(kind: &TokenKind) -> Option<StatementEnd> {
321    match kind {
322        TokenKind::RightCurlyBrace => Some(StatementEnd::RightCurlyBrace),
323        TokenKind::NewLine => Some(StatementEnd::NewLine),
324        TokenKind::Semicolon => Some(StatementEnd::Semicolon),
325        TokenKind::Eof => Some(StatementEnd::Eof),
326        _ => None,
327    }
328}
329
330#[cfg(test)]
331mod tests {
332    use super::*;
333
334    #[test]
335    fn create_parser() {
336        let mut parser = Parser::new(Lexer::new("42 == 42"));
337
338        assert_eq!(parser.current_token.literal, "42");
339        parser.next_token();
340        assert_eq!(parser.current_token.literal, "==");
341    }
342
343    #[test]
344    fn parse_empty_program() {
345        let mut parser = Parser::new(Lexer::new(""));
346
347        let program = parser.parse_program();
348
349        assert_eq!(program.len(), 0);
350    }
351
352    #[test]
353    fn parse_action_without_pattern() {
354        let mut parser = Parser::new(Lexer::new("{ print }"));
355
356        let program = parser.parse_program();
357
358        assert_eq!(program.len(), 1);
359        assert_eq!("{ print }", program.to_string());
360    }
361
362    #[test]
363    fn parse_action_with_leading_newlines() {
364        let mut parser = Parser::new(Lexer::new("\n\n{ print }"));
365
366        let program = parser.parse_program();
367
368        assert_eq!(program.len(), 1);
369        assert_eq!("{ print }", program.to_string());
370    }
371
372    #[test]
373    fn parse_begin_block() {
374        let mut parser = Parser::new(Lexer::new("BEGIN { print }"));
375
376        let program = parser.parse_program();
377
378        assert_eq!(program.len(), 1);
379        assert_eq!("BEGIN { print }", program.to_string());
380    }
381
382    #[test]
383    fn parse_end_block() {
384        let mut parser = Parser::new(Lexer::new("END { print 42 }"));
385
386        let program = parser.parse_program();
387
388        assert_eq!(program.len(), 1);
389        assert_eq!("END { print 42 }", program.to_string());
390    }
391
392    #[test]
393    fn parse_regex_pattern_action() {
394        let mut parser = Parser::new(Lexer::new("/foo/ { print }"));
395
396        let program = parser.parse_program();
397
398        assert_eq!(program.len(), 1);
399        assert_eq!("/foo/ { print }", program.to_string());
400    }
401
402    #[test]
403    fn parse_print_infix_expression() {
404        let mut parser = Parser::new(Lexer::new("BEGIN { print 1 + 2 }"));
405
406        let program = parser.parse_program();
407        let mut begin_blocks = program.begin_blocks_iter();
408        let rule = begin_blocks.next().expect("expected begin block");
409
410        let statements = match rule {
411            Rule::Begin(Action { statements }) => statements,
412            _ => panic!("expected begin rule"),
413        };
414
415        let exprs = match &statements[0] {
416            Statement::Print(expressions) => expressions,
417            _ => panic!("expected print statement"),
418        };
419
420        match &exprs[0] {
421            Expression::Infix {
422                left,
423                operator,
424                right,
425            } => {
426                assert!(matches!(**left, Expression::Number(1.0)));
427                assert_eq!(operator.kind, TokenKind::Plus);
428                assert!(matches!(**right, Expression::Number(2.0)));
429            }
430            _ => panic!("expected infix expression"),
431        }
432    }
433
434    #[test]
435    fn parse_print_parenthesized_expression() {
436        let mut parser = Parser::new(Lexer::new("BEGIN { print (1 + 2) * 3 }"));
437
438        let program = parser.parse_program();
439        let mut begin_blocks = program.begin_blocks_iter();
440        let rule = begin_blocks.next().expect("expected begin block");
441
442        let statements = match rule {
443            Rule::Begin(Action { statements }) => statements,
444            _ => panic!("expected begin rule"),
445        };
446
447        let exprs = match &statements[0] {
448            Statement::Print(expressions) => expressions,
449            _ => panic!("expected print statement"),
450        };
451
452        match &exprs[0] {
453            Expression::Infix {
454                left,
455                operator,
456                right,
457            } => {
458                assert_eq!(operator.kind, TokenKind::Asterisk);
459                assert!(matches!(**right, Expression::Number(3.0)));
460                assert!(matches!(**left, Expression::Infix { .. }));
461            }
462            _ => panic!("expected infix expression"),
463        }
464    }
465
466    #[test]
467    fn parse_print_multiplication_has_higher_precedence_than_addition() {
468        let mut parser = Parser::new(Lexer::new("BEGIN { print 1 + 2 * 3 }"));
469
470        let program = parser.parse_program();
471        let mut begin_blocks = program.begin_blocks_iter();
472        let rule = begin_blocks.next().expect("expected begin block");
473
474        let statements = match rule {
475            Rule::Begin(Action { statements }) => statements,
476            _ => panic!("expected begin rule"),
477        };
478
479        let exprs = match &statements[0] {
480            Statement::Print(expressions) => expressions,
481            _ => panic!("expected print statement"),
482        };
483
484        match &exprs[0] {
485            Expression::Infix {
486                left,
487                operator,
488                right,
489            } => {
490                assert_eq!(operator.kind, TokenKind::Plus);
491                assert!(matches!(**left, Expression::Number(1.0)));
492                match &**right {
493                    Expression::Infix {
494                        operator: right_op, ..
495                    } => assert_eq!(right_op.kind, TokenKind::Asterisk),
496                    _ => panic!("expected nested infix expression"),
497                }
498            }
499            _ => panic!("expected infix expression"),
500        }
501    }
502
503    #[test]
504    fn parse_print_power_is_right_associative() {
505        let mut parser = Parser::new(Lexer::new("BEGIN { print 2 ^ 3 ^ 2 }"));
506
507        let program = parser.parse_program();
508        let mut begin_blocks = program.begin_blocks_iter();
509        let rule = begin_blocks.next().expect("expected begin block");
510
511        let statements = match rule {
512            Rule::Begin(Action { statements }) => statements,
513            _ => panic!("expected begin rule"),
514        };
515
516        let exprs = match &statements[0] {
517            Statement::Print(expressions) => expressions,
518            _ => panic!("expected print statement"),
519        };
520
521        match &exprs[0] {
522            Expression::Infix {
523                left,
524                operator,
525                right,
526            } => {
527                assert_eq!(operator.kind, TokenKind::Caret);
528                assert!(matches!(**left, Expression::Number(2.0)));
529                match &**right {
530                    Expression::Infix {
531                        operator: right_op, ..
532                    } => assert_eq!(right_op.kind, TokenKind::Caret),
533                    _ => panic!("expected nested infix expression"),
534                }
535            }
536            _ => panic!("expected infix expression"),
537        }
538    }
539
540    #[test]
541    fn parse_print_minus_is_left_associative() {
542        let mut parser = Parser::new(Lexer::new("BEGIN { print 5 - 3 - 1 }"));
543
544        let program = parser.parse_program();
545        let mut begin_blocks = program.begin_blocks_iter();
546        let rule = begin_blocks.next().expect("expected begin block");
547
548        let statements = match rule {
549            Rule::Begin(Action { statements }) => statements,
550            _ => panic!("expected begin rule"),
551        };
552
553        let exprs = match &statements[0] {
554            Statement::Print(expressions) => expressions,
555            _ => panic!("expected print statement"),
556        };
557
558        match &exprs[0] {
559            Expression::Infix {
560                left,
561                operator,
562                right,
563            } => {
564                assert_eq!(operator.kind, TokenKind::Minus);
565                match &**left {
566                    Expression::Infix {
567                        operator: left_op, ..
568                    } => assert_eq!(left_op.kind, TokenKind::Minus),
569                    _ => panic!("expected nested infix expression"),
570                }
571                assert!(matches!(**right, Expression::Number(1.0)));
572            }
573            _ => panic!("expected infix expression"),
574        }
575    }
576
577    #[test]
578    fn parse_print_concatenation() {
579        let mut parser = Parser::new(Lexer::new(r#"BEGIN { print "Value:" 42 }"#));
580
581        let program = parser.parse_program();
582        let mut begin_blocks = program.begin_blocks_iter();
583        let rule = begin_blocks.next().expect("expected begin block");
584
585        let statements = match rule {
586            Rule::Begin(Action { statements }) => statements,
587            _ => panic!("expected begin rule"),
588        };
589
590        let exprs = match &statements[0] {
591            Statement::Print(expressions) => expressions,
592            _ => panic!("expected print statement"),
593        };
594
595        assert_eq!(exprs.len(), 2);
596        assert!(matches!(exprs[0], Expression::String("Value:")));
597        assert!(matches!(exprs[1], Expression::Number(42.0)));
598    }
599
600    #[test]
601    fn parse_print_field_expression() {
602        let mut parser = Parser::new(Lexer::new("{ print $1 }"));
603
604        let program = parser.parse_program();
605        let mut rules = program.rules_iter();
606        let rule = rules.next().expect("expected rule");
607
608        let statements = match rule {
609            Rule::Action(Action { statements }) => statements,
610            _ => panic!("expected action rule"),
611        };
612
613        let exprs = match &statements[0] {
614            Statement::Print(expressions) => expressions,
615            _ => panic!("expected print statement"),
616        };
617
618        match &exprs[0] {
619            Expression::Field(inner) => assert!(matches!(**inner, Expression::Number(1.0))),
620            _ => panic!("expected field expression"),
621        }
622    }
623
624    #[test]
625    fn parse_print_with_commas() {
626        let mut parser = Parser::new(Lexer::new(r#"BEGIN { print "Value:", 42, $1 }"#));
627
628        let program = parser.parse_program();
629
630        assert_eq!(r#"BEGIN { print "Value:", 42, $1 }"#, program.to_string());
631    }
632
633    #[test]
634    fn parse_number_of_fields_identifier() {
635        let mut parser = Parser::new(Lexer::new(r#"BEGIN { print NF }"#));
636
637        let program = parser.parse_program();
638
639        assert_eq!(r#"BEGIN { print NF }"#, program.to_string());
640    }
641
642    #[test]
643    fn parse_printf_with_format_and_arguments() {
644        let mut parser = Parser::new(Lexer::new(r#"{ printf "[%10s] [%-16d]\n", $1, $3 }"#));
645
646        let program = parser.parse_program();
647
648        assert_eq!(
649            r#"{ printf "[%10s] [%-16d]\n", $1, $3 }"#,
650            program.to_string()
651        );
652    }
653}