Skip to main content

rawk_core/
parser.rs

1use crate::{
2    Lexer, Program,
3    ast::{Action, Expression, Rule, Statement},
4    token::{Token, TokenKind},
5};
6
7#[derive(Debug)]
8pub struct Parser<'a> {
9    lexer: Lexer<'a>,
10    current_token: Token<'a>,
11}
12
13impl<'a> Parser<'a> {
14    pub fn new(mut lexer: Lexer<'a>) -> Self {
15        // Enable regex parsing for the first token since it could be a pattern
16        lexer.set_allow_regex(true);
17        let current_token = lexer.next_token();
18        lexer.set_allow_regex(false);
19
20        Parser {
21            lexer,
22            current_token,
23        }
24    }
25
26    fn next_token(&mut self) {
27        self.next_token_with_regex(false);
28    }
29
30    fn next_token_with_regex(&mut self, allow_regex: bool) {
31        self.lexer.set_allow_regex(allow_regex);
32        self.current_token = self.lexer.next_token();
33        self.lexer.set_allow_regex(false);
34    }
35
36    fn is_eof(&self) -> bool {
37        self.current_token.kind == TokenKind::Eof
38    }
39
40    fn parse_next_rule(&mut self) -> Option<Rule<'a>> {
41        match &self.current_token.kind {
42            TokenKind::Begin => {
43                self.next_token();
44                match self.parse_action() {
45                    Rule::Action(action) => Some(Rule::Begin(action)),
46                    _ => panic!("Expected action after BEGIN"),
47                }
48            }
49            TokenKind::NewLine => {
50                self.next_token_with_regex(true);
51                self.parse_next_rule()
52            }
53            TokenKind::Eof => None,
54            TokenKind::LeftCurlyBrace => Some(self.parse_action()),
55            TokenKind::Function => {
56                self.parse_function_definition();
57                None
58            }
59            TokenKind::End => {
60                self.next_token();
61                match self.parse_action() {
62                    Rule::Action(action) => Some(Rule::End(action)),
63                    _ => panic!("Expected action after END"),
64                }
65            }
66            TokenKind::Regex
67            | TokenKind::String
68            | TokenKind::Number
69            | TokenKind::DollarSign
70            | TokenKind::LeftParen
71            | TokenKind::Identifier
72            | TokenKind::Length
73            | TokenKind::Rand => self.parse_pattern_rule(),
74            _ => panic!(
75                "parse_next_rule not yet implemented, found token: {:?}",
76                self.current_token
77            ),
78        }
79    }
80
81    fn parse_pattern_rule(&mut self) -> Option<Rule<'a>> {
82        let mut pattern = self.parse_expression();
83        if self.current_token.kind == TokenKind::Comma {
84            let operator = self.current_token.clone();
85            self.next_token_with_regex(true);
86            let right = self.parse_expression();
87            pattern = Expression::Infix {
88                left: Box::new(pattern),
89                operator,
90                right: Box::new(right),
91            };
92        }
93        let pattern = Some(pattern);
94
95        if self.current_token.kind == TokenKind::LeftCurlyBrace {
96            match self.parse_action() {
97                Rule::Action(action) => Some(Rule::PatternAction {
98                    pattern,
99                    action: Some(action),
100                }),
101                _ => panic!("Expected action after pattern"),
102            }
103        } else {
104            Some(Rule::PatternAction {
105                pattern,
106                action: None,
107            })
108        }
109    }
110
111    fn parse_action(&mut self) -> Rule<'a> {
112        self.next_token(); // consume '{'
113
114        let pattern = None;
115
116        let mut statements = Vec::new();
117        while self.current_token.kind != TokenKind::RightCurlyBrace
118            && self.current_token.kind != TokenKind::Eof
119        {
120            while self.current_token.kind == TokenKind::NewLine
121                || self.current_token.kind == TokenKind::Semicolon
122            {
123                self.next_token();
124            }
125
126            if self.current_token.kind == TokenKind::RightCurlyBrace
127                || self.current_token.kind == TokenKind::Eof
128            {
129                break;
130            }
131
132            statements.push(self.parse_statement());
133        }
134
135        if pattern.is_some() {
136            Rule::PatternAction {
137                pattern,
138                action: Some(Action { statements }),
139            }
140        } else {
141            Rule::Action(Action { statements })
142        }
143    }
144
145    fn parse_statement(&mut self) -> Statement<'a> {
146        match self.current_token.kind {
147            TokenKind::Print => self.parse_print_function(),
148            TokenKind::Printf => self.parse_printf_function(),
149            TokenKind::System => self.parse_system_function(),
150            TokenKind::Gsub => self.parse_gsub_function(),
151            TokenKind::If => self.parse_if_statement(),
152            TokenKind::While => self.parse_while_statement(),
153            TokenKind::For => self.parse_for_statement(),
154            TokenKind::Exit => self.parse_exit_statement(),
155            TokenKind::Identifier => self.parse_assignment_statement(),
156            TokenKind::DollarSign => self.parse_field_assignment_statement(),
157            TokenKind::Increment => self.parse_pre_increment_statement(),
158            TokenKind::Decrement => self.parse_pre_decrement_statement(),
159            _ => todo!(),
160        }
161    }
162
163    fn parse_function_definition(&mut self) {
164        let mut brace_depth = 0usize;
165        loop {
166            if self.current_token.kind == TokenKind::Eof {
167                break;
168            }
169            if self.current_token.kind == TokenKind::LeftCurlyBrace {
170                brace_depth += 1;
171            } else if self.current_token.kind == TokenKind::RightCurlyBrace {
172                if brace_depth == 0 {
173                    break;
174                }
175                brace_depth -= 1;
176                if brace_depth == 0 {
177                    break;
178                }
179            }
180            self.next_token_with_regex(true);
181        }
182    }
183
184    fn parse_assignment_statement(&mut self) -> Statement<'a> {
185        let identifier = self.current_token.literal;
186        self.next_token();
187        self.parse_assignment_statement_with_identifier(identifier)
188    }
189
190    fn parse_assignment_statement_with_identifier(&mut self, identifier: &'a str) -> Statement<'a> {
191        if self.current_token.kind == TokenKind::LeftSquareBracket {
192            self.next_token_with_regex(true);
193            let index = self.parse_expression();
194            if self.current_token.kind != TokenKind::RightSquareBracket {
195                todo!()
196            }
197            self.next_token();
198            if self.current_token.kind == TokenKind::Assign {
199                self.next_token();
200                let value = self.parse_expression();
201                return Statement::ArrayAssignment {
202                    identifier,
203                    index,
204                    value,
205                };
206            }
207            if self.current_token.kind == TokenKind::AddAssign {
208                self.next_token();
209                let value = self.parse_expression();
210                return Statement::ArrayAddAssignment {
211                    identifier,
212                    index,
213                    value,
214                };
215            }
216            todo!()
217        }
218        if self.current_token.kind == TokenKind::Assign {
219            self.next_token();
220            if self.current_token.kind == TokenKind::Split {
221                return self.parse_split_assignment_statement(identifier);
222            }
223            let value = self.parse_expression();
224            Statement::Assignment { identifier, value }
225        } else if self.current_token.kind == TokenKind::Increment {
226            self.next_token();
227            Statement::PostIncrement { identifier }
228        } else if self.current_token.kind == TokenKind::Decrement {
229            self.next_token();
230            Statement::PostDecrement { identifier }
231        } else if self.current_token.kind == TokenKind::AddAssign {
232            self.next_token();
233            let value = self.parse_expression();
234            Statement::AddAssignment { identifier, value }
235        } else {
236            todo!()
237        }
238    }
239
240    fn parse_pre_increment_statement(&mut self) -> Statement<'a> {
241        self.next_token();
242        if self.current_token.kind != TokenKind::Identifier {
243            todo!()
244        }
245        let identifier = self.current_token.literal;
246        self.next_token();
247        Statement::PreIncrement { identifier }
248    }
249
250    fn parse_pre_decrement_statement(&mut self) -> Statement<'a> {
251        self.next_token();
252        if self.current_token.kind != TokenKind::Identifier {
253            todo!()
254        }
255        let identifier = self.current_token.literal;
256        self.next_token();
257        Statement::PreDecrement { identifier }
258    }
259
260    fn parse_split_assignment_statement(&mut self, identifier: &'a str) -> Statement<'a> {
261        self.next_token();
262        if self.current_token.kind != TokenKind::LeftParen {
263            todo!()
264        }
265        self.next_token_with_regex(true);
266        let string = self.parse_expression();
267        if self.current_token.kind != TokenKind::Comma {
268            todo!()
269        }
270        self.next_token();
271        if self.current_token.kind != TokenKind::Identifier {
272            todo!()
273        }
274        let array = self.current_token.literal;
275        self.next_token();
276        if self.current_token.kind != TokenKind::RightParen {
277            todo!()
278        }
279        self.next_token();
280        Statement::SplitAssignment {
281            identifier,
282            string,
283            array,
284        }
285    }
286
287    fn parse_field_assignment_statement(&mut self) -> Statement<'a> {
288        self.next_token();
289        let field = self.parse_primary_expression();
290        let assign_token = self.current_token.clone();
291        self.next_token();
292        let right_value = self.parse_expression();
293
294        let value = if assign_token.kind == TokenKind::Assign {
295            right_value
296        } else {
297            let operator = compound_assign_operator(&assign_token);
298            Expression::Infix {
299                left: Box::new(Expression::Field(Box::new(field.clone()))),
300                operator,
301                right: Box::new(right_value),
302            }
303        };
304        Statement::FieldAssignment { field, value }
305    }
306
307    fn parse_if_statement(&mut self) -> Statement<'a> {
308        self.next_token();
309        if self.current_token.kind != TokenKind::LeftParen {
310            todo!()
311        }
312        self.next_token_with_regex(true);
313        let condition = self.parse_expression();
314        if self.current_token.kind != TokenKind::RightParen {
315            todo!()
316        }
317        self.next_token();
318        while self.current_token.kind == TokenKind::NewLine
319            || self.current_token.kind == TokenKind::Semicolon
320        {
321            self.next_token();
322        }
323        let then_statements = if self.current_token.kind == TokenKind::LeftCurlyBrace {
324            self.parse_statement_block()
325        } else {
326            vec![self.parse_statement()]
327        };
328
329        while self.current_token.kind == TokenKind::NewLine
330            || self.current_token.kind == TokenKind::Semicolon
331        {
332            self.next_token();
333        }
334
335        if self.current_token.kind == TokenKind::Else {
336            self.next_token();
337            while self.current_token.kind == TokenKind::NewLine
338                || self.current_token.kind == TokenKind::Semicolon
339            {
340                self.next_token();
341            }
342            let else_statements = if self.current_token.kind == TokenKind::LeftCurlyBrace {
343                self.parse_statement_block()
344            } else {
345                vec![self.parse_statement()]
346            };
347            return Statement::IfElse {
348                condition,
349                then_statements,
350                else_statements,
351            };
352        }
353
354        Statement::If {
355            condition,
356            then_statements,
357        }
358    }
359
360    fn parse_exit_statement(&mut self) -> Statement<'a> {
361        self.next_token();
362        Statement::Exit
363    }
364
365    fn parse_statement_block(&mut self) -> Vec<Statement<'a>> {
366        self.next_token(); // consume '{'
367        let mut statements = Vec::new();
368        while self.current_token.kind != TokenKind::RightCurlyBrace
369            && self.current_token.kind != TokenKind::Eof
370        {
371            while self.current_token.kind == TokenKind::NewLine
372                || self.current_token.kind == TokenKind::Semicolon
373            {
374                self.next_token();
375            }
376
377            if self.current_token.kind == TokenKind::RightCurlyBrace
378                || self.current_token.kind == TokenKind::Eof
379            {
380                break;
381            }
382            statements.push(self.parse_statement());
383        }
384        if self.current_token.kind == TokenKind::RightCurlyBrace {
385            self.next_token();
386        }
387        statements
388    }
389
390    fn parse_while_statement(&mut self) -> Statement<'a> {
391        self.next_token();
392        if self.current_token.kind != TokenKind::LeftParen {
393            todo!()
394        }
395        self.next_token_with_regex(true);
396        let condition = self.parse_expression();
397        if self.current_token.kind != TokenKind::RightParen {
398            todo!()
399        }
400        self.next_token();
401        while self.current_token.kind == TokenKind::NewLine
402            || self.current_token.kind == TokenKind::Semicolon
403        {
404            self.next_token();
405        }
406        if self.current_token.kind != TokenKind::LeftCurlyBrace {
407            todo!()
408        }
409
410        let statements = self.parse_statement_block();
411        Statement::While {
412            condition,
413            statements,
414        }
415    }
416
417    fn parse_for_statement(&mut self) -> Statement<'a> {
418        self.next_token();
419        if self.current_token.kind != TokenKind::LeftParen {
420            todo!()
421        }
422        self.next_token();
423
424        let init = if self.current_token.kind == TokenKind::Identifier {
425            let variable = self.current_token.literal;
426            self.next_token();
427            if self.current_token.kind == TokenKind::In {
428                self.next_token();
429                if self.current_token.kind != TokenKind::Identifier {
430                    todo!()
431                }
432                let array = self.current_token.literal;
433                self.next_token();
434                if self.current_token.kind != TokenKind::RightParen {
435                    todo!()
436                }
437                self.next_token();
438                while self.current_token.kind == TokenKind::NewLine
439                    || self.current_token.kind == TokenKind::Semicolon
440                {
441                    self.next_token();
442                }
443                let statements = if self.current_token.kind == TokenKind::LeftCurlyBrace {
444                    self.parse_statement_block()
445                } else {
446                    vec![self.parse_statement()]
447                };
448                return Statement::ForIn {
449                    variable,
450                    array,
451                    statements,
452                };
453            }
454            self.parse_assignment_statement_with_identifier(variable)
455        } else {
456            self.parse_statement()
457        };
458        if self.current_token.kind != TokenKind::Semicolon {
459            todo!()
460        }
461        self.next_token_with_regex(true);
462
463        let condition = self.parse_expression();
464        if self.current_token.kind != TokenKind::Semicolon {
465            todo!()
466        }
467        self.next_token();
468
469        let update = self.parse_statement();
470        if self.current_token.kind != TokenKind::RightParen {
471            todo!()
472        }
473        self.next_token();
474
475        while self.current_token.kind == TokenKind::NewLine
476            || self.current_token.kind == TokenKind::Semicolon
477        {
478            self.next_token();
479        }
480
481        let statements = if self.current_token.kind == TokenKind::LeftCurlyBrace {
482            self.parse_statement_block()
483        } else {
484            vec![self.parse_statement()]
485        };
486
487        Statement::For {
488            init: Box::new(init),
489            condition,
490            update: Box::new(update),
491            statements,
492        }
493    }
494
495    fn parse_print_function(&mut self) -> Statement<'a> {
496        let mut expressions = Vec::new();
497        let mut expect_more = false;
498        self.next_token();
499
500        loop {
501            if self.current_token.kind == TokenKind::RightCurlyBrace
502                || self.current_token.kind == TokenKind::Eof
503                || self.current_token.kind == TokenKind::GreaterThan
504                || self.current_token.kind == TokenKind::Append
505                || self.current_token.kind == TokenKind::Pipe
506            {
507                break;
508            }
509
510            if self.current_token.kind == TokenKind::NewLine
511                || self.current_token.kind == TokenKind::Semicolon
512            {
513                if expect_more {
514                    self.next_token();
515                    continue;
516                }
517                break;
518            }
519
520            if self.current_token.kind == TokenKind::Comma {
521                self.next_token();
522                expect_more = true;
523                continue;
524            }
525
526            let expression = self.parse_expression();
527            expressions.push(expression);
528            expect_more = false;
529        }
530
531        if self.current_token.kind == TokenKind::GreaterThan
532            || self.current_token.kind == TokenKind::Append
533        {
534            let append = self.current_token.kind == TokenKind::Append;
535            self.next_token();
536            let target = self.parse_expression();
537            return Statement::PrintRedirect {
538                expressions,
539                target,
540                append,
541            };
542        }
543        if self.current_token.kind == TokenKind::Pipe {
544            self.next_token();
545            let target = self.parse_expression();
546            return Statement::PrintPipe { expressions, target };
547        }
548
549        Statement::Print(expressions)
550    }
551
552    fn parse_printf_function(&mut self) -> Statement<'a> {
553        self.next_token();
554        let expressions = if self.current_token.kind == TokenKind::LeftParen {
555            self.next_token_with_regex(true);
556            let mut expressions = Vec::new();
557            while self.current_token.kind != TokenKind::RightParen
558                && self.current_token.kind != TokenKind::Eof
559            {
560                if self.current_token.kind == TokenKind::Comma {
561                    self.next_token();
562                    continue;
563                }
564                expressions.push(self.parse_expression());
565            }
566            if self.current_token.kind == TokenKind::RightParen {
567                self.next_token();
568            }
569            expressions
570        } else {
571            self.parse_expression_list_until_action_end_from_current()
572        };
573
574        Statement::Printf(expressions)
575    }
576
577    fn parse_gsub_function(&mut self) -> Statement<'a> {
578        self.next_token();
579        if self.current_token.kind != TokenKind::LeftParen {
580            todo!()
581        }
582
583        self.next_token_with_regex(true);
584        let pattern = self.parse_expression();
585
586        if self.current_token.kind != TokenKind::Comma {
587            todo!()
588        }
589        self.next_token();
590        let replacement = self.parse_expression();
591
592        if self.current_token.kind == TokenKind::Comma {
593            todo!()
594        }
595
596        if self.current_token.kind != TokenKind::RightParen {
597            todo!()
598        }
599        self.next_token();
600
601        Statement::Gsub {
602            pattern,
603            replacement,
604        }
605    }
606
607    fn parse_system_function(&mut self) -> Statement<'a> {
608        self.next_token();
609        if self.current_token.kind != TokenKind::LeftParen {
610            todo!()
611        }
612        self.next_token();
613        let command = self.parse_expression();
614        if self.current_token.kind != TokenKind::RightParen {
615            todo!()
616        }
617        self.next_token();
618        Statement::System(command)
619    }
620
621    fn parse_expression_list_until_action_end_from_current(&mut self) -> Vec<Expression<'a>> {
622        let mut expressions = Vec::new();
623        let mut expect_more = false;
624
625        loop {
626            if self.current_token.kind == TokenKind::RightCurlyBrace
627                || self.current_token.kind == TokenKind::Eof
628            {
629                break;
630            }
631
632            if self.current_token.kind == TokenKind::NewLine
633                || self.current_token.kind == TokenKind::Semicolon
634            {
635                if expect_more {
636                    self.next_token();
637                    continue;
638                }
639                break;
640            }
641
642            if self.current_token.kind == TokenKind::Comma {
643                self.next_token();
644                expect_more = true;
645                continue;
646            }
647
648            let expression = self.parse_expression();
649            expressions.push(expression);
650            expect_more = false;
651        }
652
653        expressions
654    }
655
656    fn parse_expression(&mut self) -> Expression<'a> {
657        self.parse_expression_with_min_precedence(0)
658    }
659
660    fn parse_expression_with_min_precedence(&mut self, min_precedence: u8) -> Expression<'a> {
661        const CONCAT_LEFT_PRECEDENCE: u8 = 6;
662        const CONCAT_RIGHT_PRECEDENCE: u8 = 7;
663        let mut left = self.parse_primary_expression();
664
665        loop {
666            if self.current_token.kind == TokenKind::QuestionMark {
667                if min_precedence > 0 {
668                    break;
669                }
670                self.next_token_with_regex(true);
671                let then_expr = self.parse_expression_with_min_precedence(0);
672                if self.current_token.kind != TokenKind::Colon {
673                    todo!()
674                }
675                self.next_token_with_regex(true);
676                let else_expr = self.parse_expression_with_min_precedence(0);
677                left = Expression::Ternary {
678                    condition: Box::new(left),
679                    then_expr: Box::new(then_expr),
680                    else_expr: Box::new(else_expr),
681                };
682                continue;
683            }
684
685            if infix_operator_precedence(&self.current_token.kind).is_none()
686                && is_expression_start(&self.current_token.kind)
687            {
688                if CONCAT_LEFT_PRECEDENCE < min_precedence {
689                    break;
690                }
691
692                let right = self.parse_expression_with_min_precedence(CONCAT_RIGHT_PRECEDENCE);
693                left = Expression::Concatenation {
694                    left: Box::new(left),
695                    right: Box::new(right),
696                };
697                continue;
698            }
699
700            let (left_precedence, right_precedence) =
701                match infix_operator_precedence(&self.current_token.kind) {
702                    Some(value) => value,
703                    None => break,
704                };
705
706            if left_precedence < min_precedence {
707                break;
708            }
709
710            let operator = self.current_token.clone();
711            if matches!(
712                operator.kind,
713                TokenKind::Tilde | TokenKind::NoMatch | TokenKind::And | TokenKind::Or
714            ) {
715                self.next_token_with_regex(true);
716            } else {
717                self.next_token();
718            }
719            let right = self.parse_expression_with_min_precedence(right_precedence);
720
721            left = Expression::Infix {
722                left: Box::new(left),
723                operator,
724                right: Box::new(right),
725            };
726        }
727
728        left
729    }
730
731    fn parse_primary_expression(&mut self) -> Expression<'a> {
732        match self.current_token.kind {
733            TokenKind::String => {
734                let expression = Expression::String(self.current_token.literal);
735                self.next_token();
736                expression
737            }
738            TokenKind::Regex => {
739                let expression = Expression::Regex(self.current_token.literal);
740                self.next_token();
741                expression
742            }
743            TokenKind::Number => {
744                let expression = if let Ok(value) = self.current_token.literal.parse::<f64>() {
745                    Expression::Number(value)
746                } else {
747                    todo!()
748                };
749                self.next_token();
750                expression
751            }
752            TokenKind::DollarSign => {
753                self.next_token();
754                let expression = self.parse_primary_expression();
755                Expression::Field(Box::new(expression))
756            }
757            TokenKind::LeftParen => {
758                self.next_token();
759                let expression = self.parse_expression();
760                if self.current_token.kind == TokenKind::RightParen {
761                    self.next_token();
762                }
763                expression
764            }
765            TokenKind::Identifier => {
766                let identifier = self.current_token.literal;
767                self.next_token();
768                if self.current_token.kind == TokenKind::LeftParen {
769                    let args = self.parse_call_arguments();
770                    return Expression::FunctionCall {
771                        name: identifier,
772                        args,
773                    };
774                }
775                if self.current_token.kind == TokenKind::LeftSquareBracket {
776                    self.next_token_with_regex(true);
777                    let index = self.parse_expression();
778                    if self.current_token.kind != TokenKind::RightSquareBracket {
779                        todo!()
780                    }
781                    self.next_token();
782                    Expression::ArrayAccess {
783                        identifier,
784                        index: Box::new(index),
785                    }
786                } else {
787                    Expression::Identifier(identifier)
788                }
789            }
790            TokenKind::Length => {
791                self.next_token();
792                if self.current_token.kind == TokenKind::LeftParen {
793                    self.next_token();
794                    if self.current_token.kind == TokenKind::RightParen {
795                        self.next_token();
796                        Expression::Length(None)
797                    } else {
798                        let expression = self.parse_expression();
799                        if self.current_token.kind != TokenKind::RightParen {
800                            todo!()
801                        }
802                        self.next_token();
803                        Expression::Length(Some(Box::new(expression)))
804                    }
805                } else {
806                    Expression::Length(None)
807                }
808            }
809            TokenKind::Substr => {
810                self.next_token();
811                if self.current_token.kind != TokenKind::LeftParen {
812                    todo!()
813                }
814                self.next_token();
815                let string = self.parse_expression();
816                if self.current_token.kind != TokenKind::Comma {
817                    todo!()
818                }
819                self.next_token();
820                let start = self.parse_expression();
821                let mut length = None;
822                if self.current_token.kind == TokenKind::Comma {
823                    self.next_token();
824                    length = Some(Box::new(self.parse_expression()));
825                }
826                if self.current_token.kind != TokenKind::RightParen {
827                    todo!()
828                }
829                self.next_token();
830                Expression::Substr {
831                    string: Box::new(string),
832                    start: Box::new(start),
833                    length,
834                }
835            }
836            TokenKind::Rand => {
837                self.next_token();
838                if self.current_token.kind == TokenKind::LeftParen {
839                    self.next_token();
840                    if self.current_token.kind != TokenKind::RightParen {
841                        todo!()
842                    }
843                    self.next_token();
844                }
845                Expression::Rand
846            }
847            TokenKind::Sprintf | TokenKind::Split => {
848                let name = self.current_token.literal;
849                self.next_token();
850                if self.current_token.kind == TokenKind::LeftParen {
851                    let args = self.parse_call_arguments();
852                    return Expression::FunctionCall { name, args };
853                }
854                Expression::Number(0.0)
855            }
856            _ => {
857                panic!(
858                    "parse_primary_expression not yet implemented, found token: {:?}",
859                    self.current_token
860                )
861            }
862        }
863    }
864
865    pub fn parse_program(&mut self) -> Program<'_> {
866        let mut program = Program::new();
867
868        while !self.is_eof() {
869            match self.parse_next_rule() {
870                Some(Rule::Begin(action)) => program.add_begin_block(Rule::Begin(action)),
871                Some(Rule::End(action)) => program.add_end_block(Rule::End(action)),
872                Some(rule) => program.add_rule(rule),
873                None => {}
874            }
875            self.next_token_with_regex(true);
876        }
877
878        program
879    }
880
881    fn parse_call_arguments(&mut self) -> Vec<Expression<'a>> {
882        if self.current_token.kind != TokenKind::LeftParen {
883            return vec![];
884        }
885        self.next_token_with_regex(true);
886        let mut args = Vec::new();
887        while self.current_token.kind != TokenKind::RightParen
888            && self.current_token.kind != TokenKind::Eof
889        {
890            if self.current_token.kind == TokenKind::Comma {
891                self.next_token();
892                continue;
893            }
894            args.push(self.parse_expression());
895        }
896        if self.current_token.kind == TokenKind::RightParen {
897            self.next_token();
898        }
899        args
900    }
901}
902
903fn infix_operator_precedence(kind: &TokenKind) -> Option<(u8, u8)> {
904    match kind {
905        TokenKind::Assign => Some((0, 0)),
906        TokenKind::Or => Some((1, 2)),
907        TokenKind::And => Some((3, 4)),
908        TokenKind::Equal
909        | TokenKind::NotEqual
910        | TokenKind::GreaterThan
911        | TokenKind::GreaterThanOrEqual
912        | TokenKind::LessThan
913        | TokenKind::LessThanOrEqual
914        | TokenKind::Tilde
915        | TokenKind::NoMatch => Some((5, 6)),
916        TokenKind::Plus | TokenKind::Minus => Some((7, 8)),
917        TokenKind::Asterisk | TokenKind::Division | TokenKind::Percent => Some((9, 10)),
918        TokenKind::Caret => Some((13, 12)),
919        _ => None,
920    }
921}
922
923fn is_expression_start(kind: &TokenKind) -> bool {
924    matches!(
925        kind,
926        TokenKind::String
927            | TokenKind::Regex
928            | TokenKind::Number
929            | TokenKind::DollarSign
930            | TokenKind::LeftParen
931            | TokenKind::Identifier
932            | TokenKind::Length
933            | TokenKind::Rand
934            | TokenKind::Sprintf
935            | TokenKind::Split
936            | TokenKind::Substr
937    )
938}
939
940fn compound_assign_operator(token: &Token<'_>) -> Token<'static> {
941    let (kind, literal) = match token.kind {
942        TokenKind::AddAssign => (TokenKind::Plus, "+"),
943        TokenKind::SubtractAssign => (TokenKind::Minus, "-"),
944        TokenKind::MultiplyAssign => (TokenKind::Asterisk, "*"),
945        TokenKind::DivideAssign => (TokenKind::Division, "/"),
946        TokenKind::ModuloAssign => (TokenKind::Percent, "%"),
947        TokenKind::PowerAssign => (TokenKind::Caret, "^"),
948        _ => todo!(),
949    };
950
951    Token::new(kind, literal, token.span.start)
952}
953
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `source` and asserts that the program renders as `expected`.
    #[track_caller]
    fn assert_renders(source: &str, expected: &str) {
        let mut parser = Parser::new(Lexer::new(source));
        let program = parser.parse_program();

        assert_eq!(expected, program.to_string());
    }

    /// Parses `source` and asserts that it renders back to exactly itself.
    #[track_caller]
    fn assert_round_trip(source: &str) {
        assert_renders(source, source);
    }

    /// Parses `source`, asserts that the program has length one, and asserts
    /// that it renders as `expected`.
    #[track_caller]
    fn assert_single_rule(source: &str, expected: &str) {
        let mut parser = Parser::new(Lexer::new(source));
        let program = parser.parse_program();

        assert_eq!(program.len(), 1);
        assert_eq!(expected, program.to_string());
    }

    /// Parses `source` and hands the expressions of the first statement of the
    /// first `BEGIN` block to `inspect`.
    ///
    /// Panics when there is no `BEGIN` block or when that first statement is
    /// not a `print`.
    #[track_caller]
    fn with_begin_print_exprs<F>(source: &str, inspect: F)
    where
        F: FnOnce(&[Expression<'_>]),
    {
        let mut parser = Parser::new(Lexer::new(source));
        let program = parser.parse_program();
        let mut begin_rules = program.begin_blocks_iter();
        let first_rule = begin_rules.next().expect("expected begin block");

        let statements = match first_rule {
            Rule::Begin(Action { statements }) => statements,
            _ => panic!("expected begin rule"),
        };

        let exprs = match &statements[0] {
            Statement::Print(expressions) => expressions,
            _ => panic!("expected print statement"),
        };

        inspect(&exprs[..]);
    }

    #[test]
    fn create_parser() {
        let mut parser = Parser::new(Lexer::new("42 == 42"));

        // The constructor already holds the first token.
        assert_eq!(parser.current_token.literal, "42");

        parser.next_token();
        assert_eq!(parser.current_token.literal, "==");
    }

    #[test]
    fn parse_empty_program() {
        let mut parser = Parser::new(Lexer::new(""));

        assert_eq!(parser.parse_program().len(), 0);
    }

    #[test]
    fn parse_action_without_pattern() {
        assert_single_rule("{ print }", "{ print }");
    }

    #[test]
    fn parse_action_with_leading_newlines() {
        assert_single_rule("\n\n{ print }", "{ print }");
    }

    #[test]
    fn parse_begin_block() {
        assert_single_rule("BEGIN { print }", "BEGIN { print }");
    }

    #[test]
    fn parse_end_block() {
        assert_single_rule("END { print 42 }", "END { print 42 }");
    }

    #[test]
    fn parse_regex_pattern_action() {
        assert_single_rule("/foo/ { print }", "/foo/ { print }");
    }

    #[test]
    fn parse_print_infix_expression() {
        with_begin_print_exprs("BEGIN { print 1 + 2 }", |exprs| match &exprs[0] {
            Expression::Infix {
                left,
                operator,
                right,
            } => {
                assert!(matches!(**left, Expression::Number(1.0)));
                assert_eq!(operator.kind, TokenKind::Plus);
                assert!(matches!(**right, Expression::Number(2.0)));
            }
            _ => panic!("expected infix expression"),
        });
    }

    #[test]
    fn parse_print_parenthesized_expression() {
        with_begin_print_exprs("BEGIN { print (1 + 2) * 3 }", |exprs| match &exprs[0] {
            Expression::Infix {
                left,
                operator,
                right,
            } => {
                assert_eq!(operator.kind, TokenKind::Asterisk);
                assert!(matches!(**right, Expression::Number(3.0)));
                assert!(matches!(**left, Expression::Infix { .. }));
            }
            _ => panic!("expected infix expression"),
        });
    }

    #[test]
    fn parse_print_multiplication_has_higher_precedence_than_addition() {
        with_begin_print_exprs("BEGIN { print 1 + 2 * 3 }", |exprs| match &exprs[0] {
            Expression::Infix {
                left,
                operator,
                right,
            } => {
                assert_eq!(operator.kind, TokenKind::Plus);
                assert!(matches!(**left, Expression::Number(1.0)));
                match &**right {
                    Expression::Infix {
                        operator: nested_op,
                        ..
                    } => assert_eq!(nested_op.kind, TokenKind::Asterisk),
                    _ => panic!("expected nested infix expression"),
                }
            }
            _ => panic!("expected infix expression"),
        });
    }

    #[test]
    fn parse_print_power_is_right_associative() {
        with_begin_print_exprs("BEGIN { print 2 ^ 3 ^ 2 }", |exprs| match &exprs[0] {
            Expression::Infix {
                left,
                operator,
                right,
            } => {
                assert_eq!(operator.kind, TokenKind::Caret);
                assert!(matches!(**left, Expression::Number(2.0)));
                match &**right {
                    Expression::Infix {
                        operator: nested_op,
                        ..
                    } => assert_eq!(nested_op.kind, TokenKind::Caret),
                    _ => panic!("expected nested infix expression"),
                }
            }
            _ => panic!("expected infix expression"),
        });
    }

    #[test]
    fn parse_print_minus_is_left_associative() {
        with_begin_print_exprs("BEGIN { print 5 - 3 - 1 }", |exprs| match &exprs[0] {
            Expression::Infix {
                left,
                operator,
                right,
            } => {
                assert_eq!(operator.kind, TokenKind::Minus);
                match &**left {
                    Expression::Infix {
                        operator: nested_op,
                        ..
                    } => assert_eq!(nested_op.kind, TokenKind::Minus),
                    _ => panic!("expected nested infix expression"),
                }
                assert!(matches!(**right, Expression::Number(1.0)));
            }
            _ => panic!("expected infix expression"),
        });
    }

    #[test]
    fn parse_print_concatenation() {
        with_begin_print_exprs(r#"BEGIN { print "Value:" 42 }"#, |exprs| {
            // Juxtaposed operands form one expression, not two print arguments.
            assert_eq!(exprs.len(), 1);
            match &exprs[0] {
                Expression::Concatenation { left, right } => {
                    assert!(matches!(**left, Expression::String("Value:")));
                    assert!(matches!(**right, Expression::Number(42.0)));
                }
                _ => panic!("expected concatenation expression"),
            }
        });
    }

    #[test]
    fn parse_print_field_expression() {
        let mut parser = Parser::new(Lexer::new("{ print $1 }"));

        let program = parser.parse_program();
        let mut main_rules = program.rules_iter();
        let first_rule = main_rules.next().expect("expected rule");

        let statements = if let Rule::Action(Action { statements }) = first_rule {
            statements
        } else {
            panic!("expected action rule")
        };

        let exprs = if let Statement::Print(expressions) = &statements[0] {
            expressions
        } else {
            panic!("expected print statement")
        };

        if let Expression::Field(inner) = &exprs[0] {
            assert!(matches!(**inner, Expression::Number(1.0)));
        } else {
            panic!("expected field expression");
        }
    }

    #[test]
    fn parse_print_with_commas() {
        assert_round_trip(r#"BEGIN { print "Value:", 42, $1 }"#);
    }

    #[test]
    fn parse_number_of_fields_identifier() {
        assert_round_trip(r#"BEGIN { print NF }"#);
    }

    #[test]
    fn parse_printf_with_format_and_arguments() {
        assert_round_trip(r#"{ printf "[%10s] [%-16d]\n", $1, $3 }"#);
    }

    #[test]
    fn parse_add_assignment_and_pre_increment() {
        assert_round_trip(r#"/Asia/ { pop += $3; ++n }"#);
    }

    #[test]
    fn parse_regex_match_pattern_action() {
        assert_round_trip(r#"$4 ~ /Asia/ { print $1 }"#);
    }

    #[test]
    fn parse_print_with_line_continuation_after_comma() {
        assert_renders(
            "END { print \"population of\", n,\\\n\"Asian countries in millions is\", pop }",
            "END { print \"population of\", n, \"Asian countries in millions is\", pop }",
        );
    }

    #[test]
    fn parse_gsub_statement() {
        assert_round_trip(r#"{ gsub(/USA/, "United States"); print }"#);
    }

    #[test]
    fn parse_system_statement() {
        assert_round_trip(r#"{ system("cat " $2) }"#);
    }

    #[test]
    fn parse_print_length_builtin_expression() {
        assert_round_trip(r#"{ print length, $0 }"#);
    }

    #[test]
    fn parse_length_expression_as_rule_pattern() {
        assert_round_trip(
            r#"length($1) > max { max = length($1); name = $1 } END { print name }"#,
        );
    }

    #[test]
    fn parse_field_assignment_with_substr() {
        assert_round_trip(r#"{ $1 = substr($1, 1, 3); print }"#);
    }

    #[test]
    fn parse_assignment_with_concatenation_and_substr() {
        assert_round_trip(r#"{ s = s " " substr($1, 1, 3) }"#);
    }

    #[test]
    fn parse_field_divide_assignment() {
        // The compound assignment is rendered in its expanded form.
        assert_renders(r#"{ $2 /= 1000; print }"#, r#"{ $2 = $2 / 1000; print }"#);
    }

    #[test]
    fn parse_chained_assignment() {
        assert_round_trip(r#"BEGIN { FS = OFS = "\t" }"#);
    }

    #[test]
    fn parse_if_statement_with_block() {
        assert_round_trip(r#"{ if (maxpop < $3) { maxpop = $3; country = $1 } }"#);
    }

    #[test]
    fn parse_while_with_post_increment() {
        assert_round_trip(r#"{ i = 1; while (i <= NF) { print $i; i++ } }"#);
    }

    #[test]
    fn parse_post_decrement_statement() {
        assert_renders(r#"{ k-- ; n-- }"#, r#"{ k--; n-- }"#);
    }

    #[test]
    fn parse_rand_expression() {
        assert_round_trip(r#"BEGIN { print rand() }"#);
    }

    #[test]
    fn parse_for_loop_with_single_body_statement() {
        // A single-statement body is rendered wrapped in braces.
        assert_renders(
            r#"{ for (i = 1; i <= NF; i++) print $i }"#,
            r#"{ for (i = 1; i <= NF; i++) { print $i } }"#,
        );
    }

    #[test]
    fn parse_if_with_single_statement_body() {
        assert_renders(
            r#"END { if (NR < 10) print FILENAME " has only " NR " lines" }"#,
            r#"END { if (NR < 10) { print FILENAME " has only " NR " lines" } }"#,
        );
    }

    #[test]
    fn parse_exit_statement() {
        assert_round_trip(r#"NR >= 10 { exit }"#);
    }

    #[test]
    fn parse_array_add_assignment_and_access() {
        assert_round_trip(r#"/Asia/ { pop["Asia"] += $3 } END { print pop["Asia"] }"#);
    }

    #[test]
    fn parse_for_in_loop() {
        assert_renders(
            r#"END { for (name in area) print name ":" area[name] }"#,
            r#"END { for (name in area) { print name ":" area[name] } }"#,
        );
    }

    #[test]
    fn parse_print_redirection() {
        assert_renders(r#"{ print >"tempbig" }"#, r#"{ print > "tempbig" }"#);
    }

    #[test]
    fn parse_print_pipe() {
        assert_round_trip(r#"{ print c ":" pop[c] | "sort" }"#);
    }
}