1use crate::{
2 Lexer, Program,
3 ast::{Action, Expression, Rule, Statement},
4 token::{Token, TokenKind},
5};
6
7#[derive(Debug)]
8pub struct Parser<'a> {
9 lexer: Lexer<'a>,
10 current_token: Token<'a>,
11}
12
13enum CommaBehavior {
14 InsertSpaceExpression,
15 SkipComma,
16}
17
18enum StatementEnd {
19 RightCurlyBrace,
20 NewLine,
21 Semicolon,
22 Eof,
23}
24
25impl<'a> Parser<'a> {
26 pub fn new(mut lexer: Lexer<'a>) -> Self {
27 lexer.set_allow_regex(true);
29 let current_token = lexer.next_token();
30 lexer.set_allow_regex(false);
31
32 Parser {
33 lexer,
34 current_token,
35 }
36 }
37
38 fn next_token(&mut self) {
39 self.next_token_with_regex(false);
40 }
41
42 fn next_token_with_regex(&mut self, allow_regex: bool) {
43 self.lexer.set_allow_regex(allow_regex);
44 self.current_token = self.lexer.next_token();
45 self.lexer.set_allow_regex(false);
46 }
47
48 fn is_eof(&self) -> bool {
49 self.current_token.kind == TokenKind::Eof
50 }
51
52 fn parse_next_rule(&mut self) -> Option<Rule<'a>> {
53 match &self.current_token.kind {
54 TokenKind::Begin => {
55 self.next_token();
56 match self.parse_action() {
57 Rule::Action(action) => Some(Rule::Begin(action)),
58 _ => panic!("Expected action after BEGIN"),
59 }
60 }
61 TokenKind::NewLine => {
62 self.next_token_with_regex(true);
63 self.parse_next_rule()
64 }
65 TokenKind::Eof => None,
66 TokenKind::LeftCurlyBrace => Some(self.parse_action()),
67 TokenKind::End => {
68 self.next_token();
69 match self.parse_action() {
70 Rule::Action(action) => Some(Rule::End(action)),
71 _ => panic!("Expected action after END"),
72 }
73 }
74 TokenKind::Regex
75 | TokenKind::String
76 | TokenKind::Number
77 | TokenKind::DollarSign
78 | TokenKind::LeftParen
79 | TokenKind::Identifier => self.parse_pattern_rule(),
80 _ => panic!(
81 "parse_next_rule not yet implemented, found token: {:?}",
82 self.current_token
83 ),
84 }
85 }
86
87 fn parse_pattern_rule(&mut self) -> Option<Rule<'a>> {
88 let pattern = Some(self.parse_expression());
89
90 if self.current_token.kind == TokenKind::LeftCurlyBrace {
91 match self.parse_action() {
92 Rule::Action(action) => Some(Rule::PatternAction {
93 pattern,
94 action: Some(action),
95 }),
96 _ => panic!("Expected action after pattern"),
97 }
98 } else {
99 Some(Rule::PatternAction {
100 pattern,
101 action: None,
102 })
103 }
104 }
105
106 fn parse_action(&mut self) -> Rule<'a> {
107 self.next_token(); let pattern = None;
110
111 let mut statements = Vec::new();
112 while self.current_token.kind != TokenKind::RightCurlyBrace
113 && self.current_token.kind != TokenKind::Eof
114 {
115 while self.current_token.kind == TokenKind::NewLine
116 || self.current_token.kind == TokenKind::Semicolon
117 {
118 self.next_token();
119 }
120
121 if self.current_token.kind == TokenKind::RightCurlyBrace
122 || self.current_token.kind == TokenKind::Eof
123 {
124 break;
125 }
126
127 statements.push(self.parse_statement());
128 }
129
130 if pattern.is_some() {
131 Rule::PatternAction {
132 pattern,
133 action: Some(Action { statements }),
134 }
135 } else {
136 Rule::Action(Action { statements })
137 }
138 }
139
140 fn parse_statement(&mut self) -> Statement<'a> {
141 match self.current_token.kind {
142 TokenKind::Print => self.parse_print_function(),
143 TokenKind::Printf => self.parse_printf_function(),
144 TokenKind::Identifier => self.parse_assignment_statement(),
145 _ => todo!(),
146 }
147 }
148
149 fn parse_assignment_statement(&mut self) -> Statement<'a> {
150 let identifier = self.current_token.literal;
151 self.next_token();
152 if self.current_token.kind != TokenKind::Assign {
153 todo!()
154 }
155 self.next_token();
156 let value = self.parse_expression();
157
158 Statement::Assignment { identifier, value }
159 }
160
161 fn parse_print_function(&mut self) -> Statement<'a> {
162 let expressions = self.parse_expression_list_until_action_end(CommaBehavior::InsertSpaceExpression);
163
164 Statement::Print(expressions)
165 }
166
167 fn parse_printf_function(&mut self) -> Statement<'a> {
168 let expressions = self.parse_expression_list_until_action_end(CommaBehavior::SkipComma);
169
170 Statement::Printf(expressions)
171 }
172
173 fn parse_expression_list_until_action_end(
174 &mut self,
175 comma_behavior: CommaBehavior,
176 ) -> Vec<Expression<'a>> {
177 let mut expressions = Vec::new();
178 self.next_token();
179
180 while !self.is_statement_end() {
181 if self.current_token.kind == TokenKind::Comma {
182 self.next_token();
183 if let CommaBehavior::InsertSpaceExpression = comma_behavior {
184 expressions.push(Expression::String(" "));
185 }
186 } else {
187 let expression = self.parse_expression();
188 expressions.push(expression);
189 }
190 }
191
192 expressions
193 }
194
195 fn is_statement_end(&self) -> bool {
196 matches!(
197 statement_end_kind(&self.current_token.kind),
198 Some(StatementEnd::RightCurlyBrace)
199 | Some(StatementEnd::NewLine)
200 | Some(StatementEnd::Semicolon)
201 | Some(StatementEnd::Eof)
202 )
203 }
204
205 fn parse_expression(&mut self) -> Expression<'a> {
206 self.parse_expression_with_min_precedence(0)
207 }
208
209 fn parse_expression_with_min_precedence(&mut self, min_precedence: u8) -> Expression<'a> {
210 let mut left = self.parse_primary_expression();
211
212 loop {
213 let (left_precedence, right_precedence) =
214 match infix_operator_precedence(&self.current_token.kind) {
215 Some(value) => value,
216 None => break,
217 };
218
219 if left_precedence < min_precedence {
220 break;
221 }
222
223 let operator = self.current_token.clone();
224 if matches!(operator.kind, TokenKind::Tilde | TokenKind::NoMatch) {
225 self.next_token_with_regex(true);
226 } else {
227 self.next_token();
228 }
229 let right = self.parse_expression_with_min_precedence(right_precedence);
230
231 left = Expression::Infix {
232 left: Box::new(left),
233 operator,
234 right: Box::new(right),
235 };
236 }
237
238 left
239 }
240
241 fn parse_primary_expression(&mut self) -> Expression<'a> {
242 match self.current_token.kind {
243 TokenKind::String => {
244 let expression = Expression::String(self.current_token.literal);
245 self.next_token();
246 expression
247 }
248 TokenKind::Regex => {
249 let expression = Expression::Regex(self.current_token.literal);
250 self.next_token();
251 expression
252 }
253 TokenKind::Number => {
254 let expression = if let Ok(value) = self.current_token.literal.parse::<f64>() {
255 Expression::Number(value)
256 } else {
257 todo!()
258 };
259 self.next_token();
260 expression
261 }
262 TokenKind::DollarSign => {
263 self.next_token();
264 let expression = self.parse_primary_expression();
265 Expression::Field(Box::new(expression))
266 }
267 TokenKind::LeftParen => {
268 self.next_token();
269 let expression = self.parse_expression();
270 if self.current_token.kind == TokenKind::RightParen {
271 self.next_token();
272 }
273 expression
274 }
275 TokenKind::Identifier => {
276 let identifier = self.current_token.literal;
277 self.next_token();
278 Expression::Identifier(identifier)
279 }
280 _ => {
281 todo!()
282 }
283 }
284 }
285
286 pub fn parse_program(&mut self) -> Program<'_> {
287 let mut program = Program::new();
288
289 while !self.is_eof() {
290 match self.parse_next_rule() {
291 Some(Rule::Begin(action)) => program.add_begin_block(Rule::Begin(action)),
292 Some(Rule::End(action)) => program.add_end_block(Rule::End(action)),
293 Some(rule) => program.add_rule(rule),
294 None => {}
295 }
296 self.next_token_with_regex(true);
297 }
298
299 program
300 }
301}
302
303fn infix_operator_precedence(kind: &TokenKind) -> Option<(u8, u8)> {
304 match kind {
305 TokenKind::Equal
306 | TokenKind::NotEqual
307 | TokenKind::GreaterThan
308 | TokenKind::GreaterThanOrEqual
309 | TokenKind::LessThan
310 | TokenKind::LessThanOrEqual
311 | TokenKind::Tilde
312 | TokenKind::NoMatch => Some((1, 2)),
313 TokenKind::Plus | TokenKind::Minus => Some((3, 4)),
314 TokenKind::Asterisk | TokenKind::Division | TokenKind::Percent => Some((5, 6)),
315 TokenKind::Caret => Some((9, 8)),
316 _ => None,
317 }
318}
319
320fn statement_end_kind(kind: &TokenKind) -> Option<StatementEnd> {
321 match kind {
322 TokenKind::RightCurlyBrace => Some(StatementEnd::RightCurlyBrace),
323 TokenKind::NewLine => Some(StatementEnd::NewLine),
324 TokenKind::Semicolon => Some(StatementEnd::Semicolon),
325 TokenKind::Eof => Some(StatementEnd::Eof),
326 _ => None,
327 }
328}
329
330#[cfg(test)]
331mod tests {
332 use super::*;
333
334 #[test]
335 fn create_parser() {
336 let mut parser = Parser::new(Lexer::new("42 == 42"));
337
338 assert_eq!(parser.current_token.literal, "42");
339 parser.next_token();
340 assert_eq!(parser.current_token.literal, "==");
341 }
342
343 #[test]
344 fn parse_empty_program() {
345 let mut parser = Parser::new(Lexer::new(""));
346
347 let program = parser.parse_program();
348
349 assert_eq!(program.len(), 0);
350 }
351
352 #[test]
353 fn parse_action_without_pattern() {
354 let mut parser = Parser::new(Lexer::new("{ print }"));
355
356 let program = parser.parse_program();
357
358 assert_eq!(program.len(), 1);
359 assert_eq!("{ print }", program.to_string());
360 }
361
362 #[test]
363 fn parse_action_with_leading_newlines() {
364 let mut parser = Parser::new(Lexer::new("\n\n{ print }"));
365
366 let program = parser.parse_program();
367
368 assert_eq!(program.len(), 1);
369 assert_eq!("{ print }", program.to_string());
370 }
371
372 #[test]
373 fn parse_begin_block() {
374 let mut parser = Parser::new(Lexer::new("BEGIN { print }"));
375
376 let program = parser.parse_program();
377
378 assert_eq!(program.len(), 1);
379 assert_eq!("BEGIN { print }", program.to_string());
380 }
381
382 #[test]
383 fn parse_end_block() {
384 let mut parser = Parser::new(Lexer::new("END { print 42 }"));
385
386 let program = parser.parse_program();
387
388 assert_eq!(program.len(), 1);
389 assert_eq!("END { print 42 }", program.to_string());
390 }
391
392 #[test]
393 fn parse_regex_pattern_action() {
394 let mut parser = Parser::new(Lexer::new("/foo/ { print }"));
395
396 let program = parser.parse_program();
397
398 assert_eq!(program.len(), 1);
399 assert_eq!("/foo/ { print }", program.to_string());
400 }
401
402 #[test]
403 fn parse_print_infix_expression() {
404 let mut parser = Parser::new(Lexer::new("BEGIN { print 1 + 2 }"));
405
406 let program = parser.parse_program();
407 let mut begin_blocks = program.begin_blocks_iter();
408 let rule = begin_blocks.next().expect("expected begin block");
409
410 let statements = match rule {
411 Rule::Begin(Action { statements }) => statements,
412 _ => panic!("expected begin rule"),
413 };
414
415 let exprs = match &statements[0] {
416 Statement::Print(expressions) => expressions,
417 _ => panic!("expected print statement"),
418 };
419
420 match &exprs[0] {
421 Expression::Infix {
422 left,
423 operator,
424 right,
425 } => {
426 assert!(matches!(**left, Expression::Number(1.0)));
427 assert_eq!(operator.kind, TokenKind::Plus);
428 assert!(matches!(**right, Expression::Number(2.0)));
429 }
430 _ => panic!("expected infix expression"),
431 }
432 }
433
434 #[test]
435 fn parse_print_parenthesized_expression() {
436 let mut parser = Parser::new(Lexer::new("BEGIN { print (1 + 2) * 3 }"));
437
438 let program = parser.parse_program();
439 let mut begin_blocks = program.begin_blocks_iter();
440 let rule = begin_blocks.next().expect("expected begin block");
441
442 let statements = match rule {
443 Rule::Begin(Action { statements }) => statements,
444 _ => panic!("expected begin rule"),
445 };
446
447 let exprs = match &statements[0] {
448 Statement::Print(expressions) => expressions,
449 _ => panic!("expected print statement"),
450 };
451
452 match &exprs[0] {
453 Expression::Infix {
454 left,
455 operator,
456 right,
457 } => {
458 assert_eq!(operator.kind, TokenKind::Asterisk);
459 assert!(matches!(**right, Expression::Number(3.0)));
460 assert!(matches!(**left, Expression::Infix { .. }));
461 }
462 _ => panic!("expected infix expression"),
463 }
464 }
465
466 #[test]
467 fn parse_print_multiplication_has_higher_precedence_than_addition() {
468 let mut parser = Parser::new(Lexer::new("BEGIN { print 1 + 2 * 3 }"));
469
470 let program = parser.parse_program();
471 let mut begin_blocks = program.begin_blocks_iter();
472 let rule = begin_blocks.next().expect("expected begin block");
473
474 let statements = match rule {
475 Rule::Begin(Action { statements }) => statements,
476 _ => panic!("expected begin rule"),
477 };
478
479 let exprs = match &statements[0] {
480 Statement::Print(expressions) => expressions,
481 _ => panic!("expected print statement"),
482 };
483
484 match &exprs[0] {
485 Expression::Infix {
486 left,
487 operator,
488 right,
489 } => {
490 assert_eq!(operator.kind, TokenKind::Plus);
491 assert!(matches!(**left, Expression::Number(1.0)));
492 match &**right {
493 Expression::Infix {
494 operator: right_op, ..
495 } => assert_eq!(right_op.kind, TokenKind::Asterisk),
496 _ => panic!("expected nested infix expression"),
497 }
498 }
499 _ => panic!("expected infix expression"),
500 }
501 }
502
503 #[test]
504 fn parse_print_power_is_right_associative() {
505 let mut parser = Parser::new(Lexer::new("BEGIN { print 2 ^ 3 ^ 2 }"));
506
507 let program = parser.parse_program();
508 let mut begin_blocks = program.begin_blocks_iter();
509 let rule = begin_blocks.next().expect("expected begin block");
510
511 let statements = match rule {
512 Rule::Begin(Action { statements }) => statements,
513 _ => panic!("expected begin rule"),
514 };
515
516 let exprs = match &statements[0] {
517 Statement::Print(expressions) => expressions,
518 _ => panic!("expected print statement"),
519 };
520
521 match &exprs[0] {
522 Expression::Infix {
523 left,
524 operator,
525 right,
526 } => {
527 assert_eq!(operator.kind, TokenKind::Caret);
528 assert!(matches!(**left, Expression::Number(2.0)));
529 match &**right {
530 Expression::Infix {
531 operator: right_op, ..
532 } => assert_eq!(right_op.kind, TokenKind::Caret),
533 _ => panic!("expected nested infix expression"),
534 }
535 }
536 _ => panic!("expected infix expression"),
537 }
538 }
539
540 #[test]
541 fn parse_print_minus_is_left_associative() {
542 let mut parser = Parser::new(Lexer::new("BEGIN { print 5 - 3 - 1 }"));
543
544 let program = parser.parse_program();
545 let mut begin_blocks = program.begin_blocks_iter();
546 let rule = begin_blocks.next().expect("expected begin block");
547
548 let statements = match rule {
549 Rule::Begin(Action { statements }) => statements,
550 _ => panic!("expected begin rule"),
551 };
552
553 let exprs = match &statements[0] {
554 Statement::Print(expressions) => expressions,
555 _ => panic!("expected print statement"),
556 };
557
558 match &exprs[0] {
559 Expression::Infix {
560 left,
561 operator,
562 right,
563 } => {
564 assert_eq!(operator.kind, TokenKind::Minus);
565 match &**left {
566 Expression::Infix {
567 operator: left_op, ..
568 } => assert_eq!(left_op.kind, TokenKind::Minus),
569 _ => panic!("expected nested infix expression"),
570 }
571 assert!(matches!(**right, Expression::Number(1.0)));
572 }
573 _ => panic!("expected infix expression"),
574 }
575 }
576
577 #[test]
578 fn parse_print_concatenation() {
579 let mut parser = Parser::new(Lexer::new(r#"BEGIN { print "Value:" 42 }"#));
580
581 let program = parser.parse_program();
582 let mut begin_blocks = program.begin_blocks_iter();
583 let rule = begin_blocks.next().expect("expected begin block");
584
585 let statements = match rule {
586 Rule::Begin(Action { statements }) => statements,
587 _ => panic!("expected begin rule"),
588 };
589
590 let exprs = match &statements[0] {
591 Statement::Print(expressions) => expressions,
592 _ => panic!("expected print statement"),
593 };
594
595 assert_eq!(exprs.len(), 2);
596 assert!(matches!(exprs[0], Expression::String("Value:")));
597 assert!(matches!(exprs[1], Expression::Number(42.0)));
598 }
599
600 #[test]
601 fn parse_print_field_expression() {
602 let mut parser = Parser::new(Lexer::new("{ print $1 }"));
603
604 let program = parser.parse_program();
605 let mut rules = program.rules_iter();
606 let rule = rules.next().expect("expected rule");
607
608 let statements = match rule {
609 Rule::Action(Action { statements }) => statements,
610 _ => panic!("expected action rule"),
611 };
612
613 let exprs = match &statements[0] {
614 Statement::Print(expressions) => expressions,
615 _ => panic!("expected print statement"),
616 };
617
618 match &exprs[0] {
619 Expression::Field(inner) => assert!(matches!(**inner, Expression::Number(1.0))),
620 _ => panic!("expected field expression"),
621 }
622 }
623
624 #[test]
625 fn parse_print_with_commas() {
626 let mut parser = Parser::new(Lexer::new(r#"BEGIN { print "Value:", 42, $1 }"#));
627
628 let program = parser.parse_program();
629
630 assert_eq!(r#"BEGIN { print "Value:", 42, $1 }"#, program.to_string());
631 }
632
633 #[test]
634 fn parse_number_of_fields_identifier() {
635 let mut parser = Parser::new(Lexer::new(r#"BEGIN { print NF }"#));
636
637 let program = parser.parse_program();
638
639 assert_eq!(r#"BEGIN { print NF }"#, program.to_string());
640 }
641
642 #[test]
643 fn parse_printf_with_format_and_arguments() {
644 let mut parser = Parser::new(Lexer::new(r#"{ printf "[%10s] [%-16d]\n", $1, $3 }"#));
645
646 let program = parser.parse_program();
647
648 assert_eq!(
649 r#"{ printf "[%10s] [%-16d]\n", $1, $3 }"#,
650 program.to_string()
651 );
652 }
653}