1#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
2#![allow(clippy::must_use_candidate)]
3use std::fmt;
11
12use logos::Logos;
13
/// Tokens produced when lexing COBOL data-division (copybook) source.
///
/// Keyword tokens are matched case-insensitively (`ignore(case)`), and the
/// `priority` values arbitrate overlaps between the numeric / PIC-pattern
/// regexes (e.g. `66` must lex as `Level66`, not as a plain `Number`).
/// Horizontal whitespace is skipped via the `logos(skip …)` attribute and
/// never reaches the token stream; newlines are explicit tokens.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\f]+")]
pub enum Token {
    /// Two-digit level number 01–49; the callback parses the digit pair.
    #[regex(r"0[1-9]|[1-4][0-9]", priority = 5, callback = |lex| lex.slice().parse::<u8>().ok())]
    Level(u8),

    /// Special level 66 (used with RENAMES entries).
    #[token("66", priority = 6)]
    Level66,
    /// Special level 77 (standalone elementary items).
    #[token("77", priority = 6)]
    Level77,
    /// Special level 88 (condition names).
    #[token("88", priority = 6)]
    Level88,

    /// `PIC` / `PICTURE` keyword introducing a picture clause.
    #[token("PIC", ignore(case))]
    #[token("PICTURE", ignore(case))]
    Pic,

    /// `USAGE` keyword.
    #[token("USAGE", ignore(case))]
    Usage,

    /// `DISPLAY` usage keyword.
    #[token("DISPLAY", ignore(case))]
    Display,

    /// `COMP` / `COMPUTATIONAL` usage (logos prefers the longer
    /// `COMP-n` token matches over this prefix).
    #[token("COMP", ignore(case))]
    #[token("COMPUTATIONAL", ignore(case))]
    Comp,

    /// `COMP-3` / `COMPUTATIONAL-3` (packed decimal).
    #[token("COMP-3", ignore(case))]
    #[token("COMPUTATIONAL-3", ignore(case))]
    Comp3,

    /// `COMP-1` / `COMPUTATIONAL-1`.
    #[token("COMP-1", ignore(case))]
    #[token("COMPUTATIONAL-1", ignore(case))]
    Comp1,

    /// `COMP-2` / `COMPUTATIONAL-2`.
    #[token("COMP-2", ignore(case))]
    #[token("COMPUTATIONAL-2", ignore(case))]
    Comp2,

    /// `BINARY` usage keyword.
    #[token("BINARY", ignore(case))]
    Binary,

    /// `REDEFINES` keyword.
    #[token("REDEFINES", ignore(case))]
    Redefines,

    /// `RENAMES` keyword (level-66 entries).
    #[token("RENAMES", ignore(case))]
    Renames,

    /// `OCCURS` keyword.
    #[token("OCCURS", ignore(case))]
    Occurs,

    /// `DEPENDING` keyword (OCCURS … DEPENDING ON).
    #[token("DEPENDING", ignore(case))]
    Depending,

    /// `ON` keyword.
    #[token("ON", ignore(case))]
    On,

    /// `TO` keyword.
    #[token("TO", ignore(case))]
    To,

    /// `TIMES` keyword.
    #[token("TIMES", ignore(case))]
    Times,

    /// `SYNCHRONIZED` / `SYNC` keyword.
    #[token("SYNCHRONIZED", ignore(case))]
    #[token("SYNC", ignore(case))]
    Synchronized,

    /// `VALUE` keyword.
    #[token("VALUE", ignore(case))]
    Value,

    /// `THRU` keyword (range in VALUE clauses).
    #[token("THRU", ignore(case))]
    Thru,

    /// `THROUGH` keyword (long form of THRU; kept as a distinct token).
    #[token("THROUGH", ignore(case))]
    Through,

    /// `SIGN` keyword.
    #[token("SIGN", ignore(case))]
    Sign,

    /// `LEADING` keyword (SIGN clause).
    #[token("LEADING", ignore(case))]
    Leading,

    /// `IS` keyword (noise word in several clauses).
    #[token("IS", ignore(case))]
    Is,

    /// `TRAILING` keyword (SIGN clause).
    #[token("TRAILING", ignore(case))]
    Trailing,

    /// `SEPARATE` keyword (SIGN … SEPARATE).
    #[token("SEPARATE", ignore(case))]
    Separate,

    /// `BLANK` keyword (BLANK WHEN ZERO).
    #[token("BLANK", ignore(case))]
    Blank,

    /// `WHEN` keyword.
    #[token("WHEN", ignore(case))]
    When,

    /// `ZERO` / `ZEROS` / `ZEROES` figurative constant.
    #[token("ZERO", ignore(case))]
    #[token("ZEROS", ignore(case))]
    #[token("ZEROES", ignore(case))]
    Zero,

    /// A plain PICTURE character-string: optional sign `S`, `X`/`9`
    /// repetitions, `(n)` repeat counts, and an implied decimal point `V`.
    /// The raw pattern text is preserved verbatim for later analysis.
    #[regex(r"S?X+", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?X\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?9+", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?9\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?9+V9+", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?9\([0-9]+\)V9+", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?9+V9\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"S?9\([0-9]+\)V9\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
    PicClause(String),

    /// A PICTURE string containing edit symbols (`Z`, `/`, `,`, `$`, `+`,
    /// `-`, `*`) or a run of leading zeros; preserved verbatim. The
    /// priority-5 leading-zeros rule outranks `Level` for 3+ digit runs.
    #[regex(r"0{2,}[0-9]+", priority = 5, callback = |lex| lex.slice().to_string())]
    #[regex(r"[0Z9]+", priority = 3, callback = |lex| lex.slice().to_string())]
    #[regex(r"[Z9]*[/,\$\+\-\*]+[Z9]*", priority = 3, callback = |lex| lex.slice().to_string())]
    EditedPic(String),

    /// Unsigned integer literal (OCCURS counts, VALUE ranges, …).
    #[regex(r"[0-9]+", priority = 4, callback = |lex| lex.slice().parse::<u32>().ok())]
    Number(u32),

    /// Data / condition name: a letter followed by letters, digits or `-`.
    /// Lowest priority so keywords and PIC patterns win when they overlap.
    #[regex(r"[A-Za-z][A-Za-z0-9\-]*", priority = 1, callback = |lex| lex.slice().to_string())]
    Identifier(String),

    /// Quoted literal; the callback strips the surrounding quotes
    /// (the regexes guarantee at least the two quote characters).
    #[regex(r#""[^"]*""#, callback = |lex| lex.slice()[1..lex.slice().len()-1].to_string())]
    #[regex(r"'[^']*'", callback = |lex| lex.slice()[1..lex.slice().len()-1].to_string())]
    StringLiteral(String),

    /// Clause/sentence terminator.
    #[token(".")]
    Period,

    /// Separator comma; priority 4 so a lone `,` beats the edited-PIC
    /// regex that also admits commas.
    #[token(",", priority = 4)]
    Comma,

    /// `(` — PIC repeat counts.
    #[token("(")]
    LeftParen,

    /// `)` — PIC repeat counts.
    #[token(")")]
    RightParen,

    /// `*>` inline comment; the callback drops the marker and trims.
    // NOTE(review): `allow_greedy` is not among logos' documented regex
    // attributes (priority / callback / ignore) — confirm it is accepted
    // by the pinned logos version, otherwise it should be removed.
    #[regex(
        r"\*>[^\r\n]*",
        priority = 6,
        callback = |lex| lex.slice()[2..].trim().to_string(),
        allow_greedy = true
    )]
    InlineComment(String),

    /// Line terminator (LF or CRLF), kept for position tracking.
    #[token("\n")]
    #[token("\r\n")]
    Newline,

    /// Synthetic end-of-input marker appended by `Lexer::tokenize`;
    /// never produced by the logos machinery itself.
    Eof,
}
222
223impl fmt::Display for Token {
224 #[inline]
225 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
226 match self {
227 Token::Level(n) => write!(f, "{n:02}"),
228 Token::Level66 => write!(f, "66"),
229 Token::Level77 => write!(f, "77"),
230 Token::Level88 => write!(f, "88"),
231 Token::Pic => write!(f, "PIC"),
232 Token::Usage => write!(f, "USAGE"),
233 Token::Display => write!(f, "DISPLAY"),
234 Token::Comp => write!(f, "COMP"),
235 Token::Comp3 => write!(f, "COMP-3"),
236 Token::Comp1 => write!(f, "COMP-1"),
237 Token::Comp2 => write!(f, "COMP-2"),
238 Token::Binary => write!(f, "BINARY"),
239 Token::Redefines => write!(f, "REDEFINES"),
240 Token::Renames => write!(f, "RENAMES"),
241 Token::Occurs => write!(f, "OCCURS"),
242 Token::Depending => write!(f, "DEPENDING"),
243 Token::On => write!(f, "ON"),
244 Token::To => write!(f, "TO"),
245 Token::Times => write!(f, "TIMES"),
246 Token::Synchronized => write!(f, "SYNCHRONIZED"),
247 Token::Value => write!(f, "VALUE"),
248 Token::Thru => write!(f, "THRU"),
249 Token::Through => write!(f, "THROUGH"),
250 Token::Sign => write!(f, "SIGN"),
251 Token::Is => write!(f, "IS"),
252 Token::Leading => write!(f, "LEADING"),
253 Token::Trailing => write!(f, "TRAILING"),
254 Token::Separate => write!(f, "SEPARATE"),
255 Token::Blank => write!(f, "BLANK"),
256 Token::When => write!(f, "WHEN"),
257 Token::Zero => write!(f, "ZERO"),
258 Token::PicClause(s) | Token::EditedPic(s) | Token::Identifier(s) => write!(f, "{s}"),
259 Token::Number(n) => write!(f, "{n}"),
260 Token::StringLiteral(s) => write!(f, "\"{s}\""),
261 Token::Period => write!(f, "."),
262 Token::Comma => write!(f, ","),
263 Token::LeftParen => write!(f, "("),
264 Token::RightParen => write!(f, ")"),
265 Token::InlineComment(s) => write!(f, "*> {s}"),
266 Token::Newline => write!(f, "\\n"),
267 Token::Eof => write!(f, "EOF"),
268 }
269 }
270}
271
/// A token together with its position in the preprocessed source text.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenPos {
    /// The lexed token.
    pub token: Token,
    /// 1-based line number in the preprocessed text (after comment
    /// removal and continuation joining), not in the raw input.
    pub line: usize,
    /// 1-based column of the token's start; derived from byte offsets,
    /// which matches character columns for the ASCII input COBOL
    /// copybooks normally use.
    pub column: usize,
    /// Byte range of the token within the preprocessed text.
    pub span: std::ops::Range<usize>,
}
284
/// Source layout of a COBOL file, chosen heuristically by `detect_format`.
///
/// `Eq` is derived alongside `PartialEq` (the comparison is total over
/// unit variants), per clippy's `derive_partial_eq_without_eq` lint.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CobolFormat {
    /// Classic column-oriented layout: sequence area in columns 1–6,
    /// indicator area in column 7, program text in columns 8–72.
    Fixed,
    /// Free-format layout: code may start in any column and `*>`
    /// introduces a comment.
    Free,
}
303
/// Tunables controlling lexer preprocessing.
#[derive(Debug, Clone, Copy)]
pub struct LexerOptions {
    /// Strip `*>` inline comments from free-format lines before lexing.
    pub allow_inline_comments: bool,
    /// When set, inline comments are left in place (stripping disabled).
    pub strict_comments: bool,
}

impl Default for LexerOptions {
    /// Default configuration: inline comments are stripped and
    /// strict-comment mode is off.
    #[inline]
    fn default() -> Self {
        LexerOptions {
            allow_inline_comments: true,
            strict_comments: false,
        }
    }
}
322
/// COBOL copybook lexer: detects the source format, preprocesses each
/// line (comment / continuation classification), and produces positioned
/// tokens via `tokenize`.
pub struct Lexer<'a> {
    // Original input; retained for the lifetime tie, not read again after
    // construction (hence the underscore).
    _input: &'a str,
    // Format chosen by `detect_format` at construction time.
    format: CobolFormat,
    // Per-line preprocessing results, in source order.
    lines: Vec<ProcessedLine<'a>>,
    // Reserved for incremental lexing; currently unused.
    _current_line: usize,
    // Reserved for incremental lexing; currently unused.
    _current_pos: usize,
}
331
/// One source line after format-specific preprocessing.
#[derive(Debug, Clone)]
struct ProcessedLine<'a> {
    // Program-text portion of the line (for fixed format: columns 8..=72).
    content: &'a str,
    // 1-based line number in the original input; kept for diagnostics.
    _original_line: usize,
    // Comment lines are skipped when rebuilding the text to lex.
    is_comment: bool,
    // Continuation lines are spliced onto the previous line.
    is_continuation: bool,
}
340
341impl<'a> Lexer<'a> {
342 #[inline]
344 pub fn new(input: &'a str) -> Self {
345 Self::new_with_options(input, LexerOptions::default())
346 }
347
348 #[inline]
350 pub fn new_with_options(input: &'a str, options: LexerOptions) -> Self {
351 let format = detect_format(input);
352 let lines = preprocess_lines(input, format, options);
353
354 Self {
355 _input: input,
356 format,
357 lines,
358 _current_line: 0,
359 _current_pos: 0,
360 }
361 }
362
363 #[inline]
365 pub fn format(&self) -> CobolFormat {
366 self.format
367 }
368
369 #[inline]
371 pub fn tokenize(&mut self) -> Vec<TokenPos> {
372 let mut tokens = Vec::new();
373 let processed_text = self.build_processed_text();
374
375 let mut lexer = Token::lexer(&processed_text);
376 let mut line = 1;
377 let mut column = 1;
378
379 while let Some(result) = lexer.next() {
380 let span = lexer.span();
381 let token = if let Ok(token) = result {
382 token
383 } else {
384 let text = &processed_text[span.clone()];
385 Token::Identifier(text.to_string())
386 };
387 let start_column = column;
388
389 if token == Token::Newline {
390 line += 1;
391 column = 1;
392 } else {
393 column += span.len();
394 }
395
396 tokens.push(TokenPos {
397 token,
398 line,
399 column: start_column,
400 span,
401 });
402 }
403
404 tokens.push(TokenPos {
405 token: Token::Eof,
406 line,
407 column,
408 span: processed_text.len()..processed_text.len(),
409 });
410
411 tokens
412 }
413
414 fn build_processed_text(&self) -> String {
416 let mut result = String::new();
417 let mut i = 0;
418
419 while i < self.lines.len() {
420 let line = &self.lines[i];
421
422 if line.is_comment {
423 i += 1;
424 continue;
425 }
426
427 if line.is_continuation && i > 0 {
428 if result.ends_with('\n') {
429 result.pop();
430 }
431
432 let mut trimmed_result = result.trim_end().to_string();
433 let continuation_content = line.content.trim();
434
435 if !trimmed_result.is_empty() && !continuation_content.is_empty() {
436 if trimmed_result.ends_with('-') {
437 if let Some(stripped) = continuation_content.strip_prefix('-') {
438 trimmed_result.push_str(stripped);
439 } else {
440 trimmed_result.push_str(continuation_content);
441 }
442 } else {
443 trimmed_result.push(' ');
444 trimmed_result.push_str(continuation_content);
445 }
446 } else if !continuation_content.is_empty() {
447 trimmed_result.push_str(continuation_content);
448 }
449
450 result = trimmed_result;
451 result.push('\n');
452 } else {
453 result.push_str(line.content);
454 result.push('\n');
455 }
456
457 i += 1;
458 }
459
460 result
461 }
462}
463
464fn detect_format(input: &str) -> CobolFormat {
466 let lines: Vec<&str> = input.lines().collect();
467 let mut fixed_form_indicators = 0;
468 let mut total_content_lines = 0;
469
470 for line in &lines {
471 if line.trim().is_empty() || line.trim_start().starts_with('*') {
472 continue;
473 }
474
475 total_content_lines += 1;
476
477 if line.len() >= 8 && line.is_char_boundary(6) && line.is_char_boundary(7) {
478 let first_six = &line[0..6];
479 let col_7 = line.chars().nth(6).unwrap_or(' ');
480 let col_8_onwards = &line[7..];
481
482 if (first_six.chars().all(|c| c.is_ascii_digit() || c == ' '))
483 && (col_7 == ' ' || col_7 == '*' || col_7 == '-' || col_7 == '/')
484 && !col_8_onwards.trim().is_empty()
485 {
486 fixed_form_indicators += 1;
487 }
488 }
489
490 if line.len() == 72 || line.len() == 80 {
491 fixed_form_indicators += 1;
492 }
493 }
494
495 if total_content_lines > 0 && (fixed_form_indicators * 100 / total_content_lines) >= 50 {
496 CobolFormat::Fixed
497 } else {
498 CobolFormat::Free
499 }
500}
501
502fn preprocess_lines(
504 input: &str,
505 format: CobolFormat,
506 options: LexerOptions,
507) -> Vec<ProcessedLine<'_>> {
508 let mut result = Vec::new();
509
510 for (line_num, line) in input.lines().enumerate() {
511 let processed = match format {
512 CobolFormat::Fixed => process_fixed_form_line(line, line_num + 1),
513 CobolFormat::Free => process_free_form_line(line, line_num + 1, options),
514 };
515 result.push(processed);
516 }
517
518 result
519}
520
521fn process_fixed_form_line(line: &str, line_num: usize) -> ProcessedLine<'_> {
523 if line.is_empty() {
524 return ProcessedLine {
525 content: "",
526 _original_line: line_num,
527 is_comment: false,
528 is_continuation: false,
529 };
530 }
531
532 if line.starts_with('*') {
533 return ProcessedLine {
534 content: line,
535 _original_line: line_num,
536 is_comment: true,
537 is_continuation: false,
538 };
539 }
540
541 let is_continuation =
542 line.len() > 6 && line.is_char_boundary(6) && line.chars().nth(6) == Some('-');
543 let content = if line.len() > 7 && line.is_char_boundary(7) {
544 let end_col = if line.len() > 72 { 72 } else { line.len() };
545 let end_col = if line.is_char_boundary(end_col) {
546 end_col
547 } else {
548 let mut b = end_col;
550 while b > 7 && !line.is_char_boundary(b) {
551 b -= 1;
552 }
553 b
554 };
555 &line[7..end_col]
556 } else {
557 ""
558 };
559
560 ProcessedLine {
561 content,
562 _original_line: line_num,
563 is_comment: false,
564 is_continuation,
565 }
566}
567
568fn process_free_form_line(line: &str, line_num: usize, options: LexerOptions) -> ProcessedLine<'_> {
570 let trimmed = line.trim_start();
571
572 if trimmed.starts_with('*') {
573 return ProcessedLine {
574 content: line,
575 _original_line: line_num,
576 is_comment: true,
577 is_continuation: false,
578 };
579 }
580
581 let content = if options.allow_inline_comments && !options.strict_comments {
582 if let Some(comment_pos) = line.find("*>") {
583 line[..comment_pos].trim_end()
584 } else {
585 line
586 }
587 } else {
588 line
589 };
590
591 ProcessedLine {
592 content,
593 _original_line: line_num,
594 is_comment: false,
595 is_continuation: false,
596 }
597}
598
599#[cfg(test)]
600#[allow(clippy::expect_used)]
601#[allow(clippy::unwrap_used)]
602mod tests {
603 use super::*;
604
605 #[test]
606 fn test_format_detection_fixed() {
607 let input = r" * This is a comment
608 01 CUSTOMER-RECORD.
609 05 CUSTOMER-ID PIC X(10).
610 05 CUSTOMER-NAME PIC X(30).
611";
612 assert_eq!(detect_format(input), CobolFormat::Fixed);
613 }
614
615 #[test]
616 fn test_format_detection_free() {
617 let input = r"*> This is a comment
61801 CUSTOMER-RECORD.
619 05 CUSTOMER-ID PIC X(10).
620 05 CUSTOMER-NAME PIC X(30).
621";
622 assert_eq!(detect_format(input), CobolFormat::Free);
623 }
624
625 #[test]
626 fn test_basic_tokenization() {
627 let input = "01 CUSTOMER-ID PIC X(10).";
628 let mut lexer = Lexer::new(input);
629 let tokens = lexer.tokenize();
630
631 assert_eq!(tokens[0].token, Token::Level(1));
632 assert_eq!(
633 tokens[1].token,
634 Token::Identifier("CUSTOMER-ID".to_string())
635 );
636 assert_eq!(tokens[2].token, Token::Pic);
637 assert_eq!(tokens[3].token, Token::PicClause("X(10)".to_string()));
638 assert_eq!(tokens[4].token, Token::Period);
639 }
640
641 #[test]
642 fn test_continuation_handling() {
643 let input = r" 01 VERY-LONG-FIELD-NAME
644 - PIC X(50).";
645 let lexer = Lexer::new(input);
646 let processed = lexer.build_processed_text();
647
648 assert!(processed.contains("VERY-LONG-FIELD-NAME PIC X(50)"));
649 }
650
651 #[test]
652 fn test_edited_pic_detection() {
653 let input = "01 AMOUNT PIC ZZ,ZZZ.99.";
654 let mut lexer = Lexer::new(input);
655 let tokens = lexer.tokenize();
656
657 let pic_token = tokens
658 .iter()
659 .find(|t| matches!(t.token, Token::EditedPic(_)));
660 assert!(pic_token.is_some());
661 }
662
663 #[test]
664 fn test_comma_tokenization_priority() {
665 let input = ",";
666 let mut lexer = Lexer::new(input);
667 let tokens = lexer.tokenize();
668
669 assert_eq!(tokens[0].token, Token::Comma);
670 }
671
672 #[test]
673 fn test_comma_in_level88_value_clause() {
674 let input = r#"88 IS-VALID VALUE "A", "B", "C"."#;
675 let mut lexer = Lexer::new(input);
676 let tokens = lexer.tokenize();
677
678 let comma_tokens: Vec<_> = tokens
679 .iter()
680 .filter(|t| matches!(t.token, Token::Comma))
681 .collect();
682 assert_eq!(comma_tokens.len(), 2);
683
684 let edited_pic_commas: Vec<_> = tokens
685 .iter()
686 .filter(|t| matches!(&t.token, Token::EditedPic(s) if s == ","))
687 .collect();
688 assert_eq!(edited_pic_commas.len(), 0);
689 }
690
691 #[test]
692 fn test_edited_pic_still_detected_after_comma_fix() {
693 let input = "PIC Z,ZZZ.99";
694 let mut lexer = Lexer::new(input);
695 let tokens = lexer.tokenize();
696
697 let edited_pic = tokens
698 .iter()
699 .find(|t| matches!(t.token, Token::EditedPic(_)));
700 assert!(edited_pic.is_some());
701
702 if let Some(token_pos) = edited_pic
703 && let Token::EditedPic(pattern) = &token_pos.token
704 {
705 assert!(pattern.contains(','));
706 }
707 }
708
709 #[test]
710 fn test_comma_vs_edited_pic_disambiguation() {
711 let mut lexer1 = Lexer::new(",");
712 let tokens1 = lexer1.tokenize();
713 assert!(matches!(tokens1[0].token, Token::Comma));
714
715 let mut lexer2 = Lexer::new("Z,ZZZ");
716 let tokens2 = lexer2.tokenize();
717 assert!(matches!(tokens2[0].token, Token::EditedPic(_)));
718
719 let mut lexer3 = Lexer::new(r#""A,B""#);
721 let tokens3 = lexer3.tokenize();
722 assert!(!tokens3.iter().any(|t| matches!(t.token, Token::Comma)));
723 }
724
725 #[test]
726 fn test_commas_with_spaces_realistic_cobol() {
727 let input = r#"VALUE "A", "B", "C""#;
728 let mut lexer = Lexer::new(input);
729 let tokens = lexer.tokenize();
730
731 let comma_count = tokens
732 .iter()
733 .filter(|t| matches!(t.token, Token::Comma))
734 .count();
735 assert_eq!(comma_count, 2);
736 }
737
738 #[test]
739 fn test_comma_inside_string_literal_not_tokenized() {
740 let mut lx = Lexer::new(r#""A,B""#);
741 let toks = lx.tokenize();
742
743 assert!(!toks.iter().any(|t| matches!(t.token, Token::Comma)));
744 let string_tokens: Vec<_> = toks
745 .iter()
746 .filter(|t| matches!(&t.token, Token::StringLiteral(s) if s == "A,B"))
747 .collect();
748 assert_eq!(string_tokens.len(), 1);
749 }
750
751 #[test]
754 fn test_empty_input() {
755 let mut lexer = Lexer::new("");
756 let tokens = lexer.tokenize();
757 assert_eq!(tokens.last().unwrap().token, Token::Eof);
758 }
759
760 #[test]
761 fn test_whitespace_only_input() {
762 let mut lexer = Lexer::new(" \t ");
763 let tokens = lexer.tokenize();
764 assert!(tokens.iter().any(|t| t.token == Token::Eof));
766 }
767
768 #[test]
769 fn test_level_numbers_01_to_49() {
770 for level in 1..=49u8 {
771 let input = format!("{level:02} FIELD PIC X.");
772 let mut lexer = Lexer::new(&input);
773 let tokens = lexer.tokenize();
774 assert_eq!(tokens[0].token, Token::Level(level), "level {level:02}");
775 }
776 }
777
778 #[test]
779 fn test_level_66() {
780 let input = "66 ALIAS-FIELD RENAMES ORIG-FIELD.";
781 let mut lexer = Lexer::new(input);
782 let tokens = lexer.tokenize();
783 assert_eq!(tokens[0].token, Token::Level66);
784 assert_eq!(tokens[2].token, Token::Renames);
785 }
786
787 #[test]
788 fn test_level_77() {
789 let input = "77 STANDALONE-FIELD PIC 9(5).";
790 let mut lexer = Lexer::new(input);
791 let tokens = lexer.tokenize();
792 assert_eq!(tokens[0].token, Token::Level77);
793 }
794
795 #[test]
796 fn test_level_88() {
797 let input = r#"88 IS-TRUE VALUE "Y"."#;
798 let mut lexer = Lexer::new(input);
799 let tokens = lexer.tokenize();
800 assert_eq!(tokens[0].token, Token::Level88);
801 assert_eq!(tokens[2].token, Token::Value);
802 }
803
804 #[test]
805 fn test_pic_keyword_case_insensitive() {
806 for kw in &["PIC", "pic", "Pic", "PICTURE", "picture"] {
807 let input = format!("{kw} X(5)");
808 let mut lexer = Lexer::new(&input);
809 let tokens = lexer.tokenize();
810 assert!(
811 tokens.iter().any(|t| t.token == Token::Pic),
812 "failed for keyword: {kw}"
813 );
814 }
815 }
816
817 #[test]
818 fn test_comp_variants() {
819 let mut lx = Lexer::new("COMP");
820 assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp));
821
822 let mut lx = Lexer::new("COMP-1");
823 assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp1));
824
825 let mut lx = Lexer::new("COMP-2");
826 assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp2));
827
828 let mut lx = Lexer::new("COMP-3");
829 assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp3));
830 }
831
832 #[test]
833 fn test_computational_variants() {
834 let mut lx = Lexer::new("COMPUTATIONAL");
835 assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp));
836
837 let mut lx = Lexer::new("COMPUTATIONAL-3");
838 assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp3));
839 }
840
841 #[test]
842 fn test_keyword_tokens() {
843 let cases = vec![
844 ("USAGE", Token::Usage),
845 ("DISPLAY", Token::Display),
846 ("BINARY", Token::Binary),
847 ("REDEFINES", Token::Redefines),
848 ("OCCURS", Token::Occurs),
849 ("DEPENDING", Token::Depending),
850 ("ON", Token::On),
851 ("TO", Token::To),
852 ("TIMES", Token::Times),
853 ("SYNCHRONIZED", Token::Synchronized),
854 ("SYNC", Token::Synchronized),
855 ("VALUE", Token::Value),
856 ("THRU", Token::Thru),
857 ("THROUGH", Token::Through),
858 ("SIGN", Token::Sign),
859 ("LEADING", Token::Leading),
860 ("IS", Token::Is),
861 ("TRAILING", Token::Trailing),
862 ("SEPARATE", Token::Separate),
863 ("BLANK", Token::Blank),
864 ("WHEN", Token::When),
865 ("ZERO", Token::Zero),
866 ("ZEROS", Token::Zero),
867 ("ZEROES", Token::Zero),
868 ];
869
870 for (input, expected) in cases {
871 let mut lx = Lexer::new(input);
872 let tokens = lx.tokenize();
873 assert!(
874 tokens.iter().any(|t| t.token == expected),
875 "keyword {input} not matched"
876 );
877 }
878 }
879
880 #[test]
881 fn test_string_literal_double_quotes() {
882 let mut lx = Lexer::new(r#""HELLO WORLD""#);
883 let tokens = lx.tokenize();
884 assert_eq!(
885 tokens[0].token,
886 Token::StringLiteral("HELLO WORLD".to_string())
887 );
888 }
889
890 #[test]
891 fn test_string_literal_single_quotes() {
892 let mut lx = Lexer::new("'HELLO WORLD'");
893 let tokens = lx.tokenize();
894 assert_eq!(
895 tokens[0].token,
896 Token::StringLiteral("HELLO WORLD".to_string())
897 );
898 }
899
900 #[test]
901 fn test_number_token() {
902 let mut lx = Lexer::new("OCCURS 100 TIMES");
903 let tokens = lx.tokenize();
904 assert_eq!(tokens[1].token, Token::Number(100));
905 assert_eq!(tokens[2].token, Token::Times);
906 }
907
908 #[test]
909 fn test_parentheses() {
910 let mut lx = Lexer::new("(50)");
911 let tokens = lx.tokenize();
912 assert_eq!(tokens[0].token, Token::LeftParen);
913 assert_eq!(tokens[1].token, Token::Number(50));
914 assert_eq!(tokens[2].token, Token::RightParen);
915 }
916
917 #[test]
918 fn test_period_token() {
919 let mut lx = Lexer::new("FIELD-NAME.");
920 let tokens = lx.tokenize();
921 let last_non_eof = tokens
922 .iter()
923 .rev()
924 .find(|t| t.token != Token::Eof && t.token != Token::Newline)
925 .unwrap();
926 assert_eq!(last_non_eof.token, Token::Period);
927 }
928
929 #[test]
930 fn test_pic_clause_patterns() {
931 let patterns = vec![
932 ("X(10)", "X(10)"),
933 ("9(5)", "9(5)"),
934 ("S9(5)V9(2)", "S9(5)V9(2)"),
935 ("XXX", "XXX"),
936 ("S999V99", "S999V99"),
937 ];
938
939 for (input, expected) in patterns {
940 let full = format!("PIC {input}");
941 let mut lx = Lexer::new(&full);
942 let tokens = lx.tokenize();
943 let pic_clause = tokens
944 .iter()
945 .find(|t| matches!(&t.token, Token::PicClause(_)));
946 assert!(pic_clause.is_some(), "no PicClause for pattern: {input}");
947 if let Some(tp) = pic_clause {
948 assert_eq!(tp.token, Token::PicClause(expected.to_string()));
949 }
950 }
951 }
952
953 #[test]
954 fn test_identifier_with_hyphens() {
955 let mut lx = Lexer::new("CUSTOMER-RECORD-ID");
956 let tokens = lx.tokenize();
957 assert_eq!(
958 tokens[0].token,
959 Token::Identifier("CUSTOMER-RECORD-ID".to_string())
960 );
961 }
962
963 #[test]
964 fn test_inline_comment_in_free_form() {
965 let input = "01 FIELD PIC X. *> this is a comment";
968 let mut lx = Lexer::new(input);
969 let tokens = lx.tokenize();
970 assert!(tokens.iter().any(|t| t.token == Token::Level(1)));
972 assert!(tokens.iter().any(|t| t.token == Token::Period));
973 }
974
975 #[test]
976 fn test_token_display_trait() {
977 assert_eq!(format!("{}", Token::Level(5)), "05");
978 assert_eq!(format!("{}", Token::Level66), "66");
979 assert_eq!(format!("{}", Token::Level77), "77");
980 assert_eq!(format!("{}", Token::Level88), "88");
981 assert_eq!(format!("{}", Token::Pic), "PIC");
982 assert_eq!(format!("{}", Token::Comp3), "COMP-3");
983 assert_eq!(format!("{}", Token::Period), ".");
984 assert_eq!(format!("{}", Token::Comma), ",");
985 assert_eq!(format!("{}", Token::LeftParen), "(");
986 assert_eq!(format!("{}", Token::RightParen), ")");
987 assert_eq!(format!("{}", Token::Eof), "EOF");
988 assert_eq!(format!("{}", Token::Newline), "\\n");
989 assert_eq!(format!("{}", Token::Number(42)), "42");
990 assert_eq!(
991 format!("{}", Token::StringLiteral("test".to_string())),
992 "\"test\""
993 );
994 assert_eq!(
995 format!("{}", Token::InlineComment("comment".to_string())),
996 "*> comment"
997 );
998 }
999
1000 #[test]
1001 fn test_lexer_options_default() {
1002 let opts = LexerOptions::default();
1003 assert!(opts.allow_inline_comments);
1004 assert!(!opts.strict_comments);
1005 }
1006
1007 #[test]
1008 fn test_lexer_format_accessor() {
1009 let lexer = Lexer::new("01 FIELD PIC X.");
1010 assert_eq!(lexer.format(), CobolFormat::Free);
1012 }
1013
1014 #[test]
1015 fn test_cobol_format_eq() {
1016 assert_eq!(CobolFormat::Fixed, CobolFormat::Fixed);
1017 assert_eq!(CobolFormat::Free, CobolFormat::Free);
1018 assert_ne!(CobolFormat::Fixed, CobolFormat::Free);
1019 }
1020
1021 #[test]
1022 fn test_token_last_is_always_eof() {
1023 for input in &["01 X PIC X.", "", "OCCURS 5 TIMES.", " "] {
1024 let mut lx = Lexer::new(input);
1025 let tokens = lx.tokenize();
1026 assert_eq!(tokens.last().unwrap().token, Token::Eof);
1027 }
1028 }
1029
1030 #[test]
1031 fn test_tokenpos_has_position_info() {
1032 let mut lx = Lexer::new("01 FIELD PIC X.");
1033 let tokens = lx.tokenize();
1034 let first = &tokens[0];
1035 assert_eq!(first.line, 1);
1036 assert!(first.column >= 1);
1037 assert!(!first.span.is_empty());
1038 }
1039
1040 #[test]
1041 fn test_occurs_depending_on_clause() {
1042 let input = "OCCURS 1 TO 10 DEPENDING ON COUNTER";
1043 let mut lx = Lexer::new(input);
1044 let tokens = lx.tokenize();
1045 let token_types: Vec<_> = tokens.iter().map(|t| &t.token).collect();
1046 assert!(token_types.contains(&&Token::Occurs));
1047 assert!(token_types.contains(&&Token::To));
1048 assert!(token_types.contains(&&Token::Depending));
1049 assert!(token_types.contains(&&Token::On));
1050 }
1051
1052 #[test]
1053 fn test_value_thru_clause() {
1054 let input = "VALUE 1 THRU 100";
1055 let mut lx = Lexer::new(input);
1056 let tokens = lx.tokenize();
1057 assert!(tokens.iter().any(|t| t.token == Token::Value));
1058 assert!(tokens.iter().any(|t| t.token == Token::Thru));
1059 }
1060}