Skip to main content

copybook_lexer/
lib.rs

1#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
2#![allow(clippy::must_use_candidate)]
3// SPDX-License-Identifier: AGPL-3.0-or-later
4
5//! COBOL lexer microcrate.
6//!
7//! Provides tokenization for COBOL copybooks with fixed/free form preprocessing,
8//! continuation handling, and keyword/picture token recognition.
9
10use std::fmt;
11
12use logos::Logos;
13
14/// COBOL copybook tokens
15///
16/// # Examples
17///
18/// ```
19/// use copybook_lexer::Token;
20/// use logos::Logos;
21///
22/// let mut lex = Token::lexer("01 FIELD PIC X(10).");
23/// assert_eq!(lex.next(), Some(Ok(Token::Level(1))));
24/// ```
25#[derive(Logos, Debug, Clone, PartialEq)]
26#[logos(skip r"[ \t\f]+")]
27pub enum Token {
28    /// COBOL level number (01–49).
29    #[regex(r"0[1-9]|[1-4][0-9]", priority = 5, callback = |lex| lex.slice().parse::<u8>().ok())]
30    Level(u8),
31
32    /// Level-66 (RENAMES).
33    #[token("66", priority = 6)]
34    Level66,
35    /// Level-77 (standalone working-storage).
36    #[token("77", priority = 6)]
37    Level77,
38    /// Level-88 (condition name).
39    #[token("88", priority = 6)]
40    Level88,
41
42    // Keywords
43    /// `PIC` or `PICTURE` keyword.
44    #[token("PIC", ignore(case))]
45    #[token("PICTURE", ignore(case))]
46    Pic,
47
48    /// `USAGE` keyword.
49    #[token("USAGE", ignore(case))]
50    Usage,
51
52    /// `DISPLAY` usage keyword.
53    #[token("DISPLAY", ignore(case))]
54    Display,
55
56    /// `COMP` / `COMPUTATIONAL` keyword (binary native).
57    #[token("COMP", ignore(case))]
58    #[token("COMPUTATIONAL", ignore(case))]
59    Comp,
60
61    /// `COMP-3` / `COMPUTATIONAL-3` keyword (packed decimal).
62    #[token("COMP-3", ignore(case))]
63    #[token("COMPUTATIONAL-3", ignore(case))]
64    Comp3,
65
66    /// `COMP-1` / `COMPUTATIONAL-1` keyword (single-precision float).
67    #[token("COMP-1", ignore(case))]
68    #[token("COMPUTATIONAL-1", ignore(case))]
69    Comp1,
70
71    /// `COMP-2` / `COMPUTATIONAL-2` keyword (double-precision float).
72    #[token("COMP-2", ignore(case))]
73    #[token("COMPUTATIONAL-2", ignore(case))]
74    Comp2,
75
76    /// `BINARY` usage keyword.
77    #[token("BINARY", ignore(case))]
78    Binary,
79
80    /// `REDEFINES` keyword.
81    #[token("REDEFINES", ignore(case))]
82    Redefines,
83
84    /// `RENAMES` keyword (level-66).
85    #[token("RENAMES", ignore(case))]
86    Renames,
87
88    /// `OCCURS` keyword.
89    #[token("OCCURS", ignore(case))]
90    Occurs,
91
92    /// `DEPENDING` keyword (part of OCCURS DEPENDING ON).
93    #[token("DEPENDING", ignore(case))]
94    Depending,
95
96    /// `ON` keyword.
97    #[token("ON", ignore(case))]
98    On,
99
100    /// `TO` keyword.
101    #[token("TO", ignore(case))]
102    To,
103
104    /// `TIMES` keyword.
105    #[token("TIMES", ignore(case))]
106    Times,
107
108    /// `SYNCHRONIZED` / `SYNC` keyword.
109    #[token("SYNCHRONIZED", ignore(case))]
110    #[token("SYNC", ignore(case))]
111    Synchronized,
112
113    /// `VALUE` keyword.
114    #[token("VALUE", ignore(case))]
115    Value,
116
117    /// `THRU` keyword (range delimiter).
118    #[token("THRU", ignore(case))]
119    Thru,
120
121    /// `THROUGH` keyword (range delimiter, synonym of THRU).
122    #[token("THROUGH", ignore(case))]
123    Through,
124
125    /// `SIGN` keyword.
126    #[token("SIGN", ignore(case))]
127    Sign,
128
129    /// `LEADING` keyword (sign position).
130    #[token("LEADING", ignore(case))]
131    Leading,
132
133    /// `IS` keyword.
134    #[token("IS", ignore(case))]
135    Is,
136
137    /// `TRAILING` keyword (sign position).
138    #[token("TRAILING", ignore(case))]
139    Trailing,
140
141    /// `SEPARATE` keyword (sign storage).
142    #[token("SEPARATE", ignore(case))]
143    Separate,
144
145    /// `BLANK` keyword.
146    #[token("BLANK", ignore(case))]
147    Blank,
148
149    /// `WHEN` keyword.
150    #[token("WHEN", ignore(case))]
151    When,
152
153    /// `ZERO` / `ZEROS` / `ZEROES` keyword.
154    #[token("ZERO", ignore(case))]
155    #[token("ZEROS", ignore(case))]
156    #[token("ZEROES", ignore(case))]
157    Zero,
158
159    /// Standard PIC clause pattern (e.g. `9(5)V9(2)`).
160    #[regex(r"S?X+", priority = 3, callback = |lex| lex.slice().to_string())]
161    #[regex(r"S?X\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
162    #[regex(r"S?9+", priority = 3, callback = |lex| lex.slice().to_string())]
163    #[regex(r"S?9\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
164    #[regex(r"S?9+V9+", priority = 3, callback = |lex| lex.slice().to_string())]
165    #[regex(r"S?9\([0-9]+\)V9+", priority = 3, callback = |lex| lex.slice().to_string())]
166    #[regex(r"S?9+V9\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
167    #[regex(r"S?9\([0-9]+\)V9\([0-9]+\)", priority = 3, callback = |lex| lex.slice().to_string())]
168    PicClause(String),
169
170    /// Edited PIC pattern (e.g. `ZZZ9`, `$ZZ,ZZZ.99`).
171    #[regex(r"0{2,}[0-9]+", priority = 5, callback = |lex| lex.slice().to_string())]
172    #[regex(r"[0Z9]+", priority = 3, callback = |lex| lex.slice().to_string())]
173    #[regex(r"[Z9]*[/,\$\+\-\*]+[Z9]*", priority = 3, callback = |lex| lex.slice().to_string())]
174    EditedPic(String),
175
176    /// Unsigned integer literal.
177    #[regex(r"[0-9]+", priority = 4, callback = |lex| lex.slice().parse::<u32>().ok())]
178    Number(u32),
179
180    /// COBOL identifier or data name.
181    #[regex(r"[A-Za-z][A-Za-z0-9\-]*", priority = 1, callback = |lex| lex.slice().to_string())]
182    Identifier(String),
183
184    /// Quoted string literal (single or double quotes).
185    #[regex(r#""[^"]*""#, callback = |lex| lex.slice()[1..lex.slice().len()-1].to_string())]
186    #[regex(r"'[^']*'", callback = |lex| lex.slice()[1..lex.slice().len()-1].to_string())]
187    StringLiteral(String),
188
189    /// Period (`.`) statement terminator.
190    #[token(".")]
191    Period,
192
193    /// Comma separator.
194    #[token(",", priority = 4)]
195    Comma,
196
197    /// Left parenthesis.
198    #[token("(")]
199    LeftParen,
200
201    /// Right parenthesis.
202    #[token(")")]
203    RightParen,
204
205    /// COBOL-2002 inline comment (`*>`).
206    #[regex(
207        r"\*>[^\r\n]*",
208        priority = 6,
209        callback = |lex| lex.slice()[2..].trim().to_string(),
210        allow_greedy = true
211    )]
212    InlineComment(String),
213
214    /// Line break (LF or CRLF).
215    #[token("\n")]
216    #[token("\r\n")]
217    Newline,
218
219    /// Sentinel marking end of input.
220    Eof,
221}
222
223impl fmt::Display for Token {
224    #[inline]
225    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
226        match self {
227            Token::Level(n) => write!(f, "{n:02}"),
228            Token::Level66 => write!(f, "66"),
229            Token::Level77 => write!(f, "77"),
230            Token::Level88 => write!(f, "88"),
231            Token::Pic => write!(f, "PIC"),
232            Token::Usage => write!(f, "USAGE"),
233            Token::Display => write!(f, "DISPLAY"),
234            Token::Comp => write!(f, "COMP"),
235            Token::Comp3 => write!(f, "COMP-3"),
236            Token::Comp1 => write!(f, "COMP-1"),
237            Token::Comp2 => write!(f, "COMP-2"),
238            Token::Binary => write!(f, "BINARY"),
239            Token::Redefines => write!(f, "REDEFINES"),
240            Token::Renames => write!(f, "RENAMES"),
241            Token::Occurs => write!(f, "OCCURS"),
242            Token::Depending => write!(f, "DEPENDING"),
243            Token::On => write!(f, "ON"),
244            Token::To => write!(f, "TO"),
245            Token::Times => write!(f, "TIMES"),
246            Token::Synchronized => write!(f, "SYNCHRONIZED"),
247            Token::Value => write!(f, "VALUE"),
248            Token::Thru => write!(f, "THRU"),
249            Token::Through => write!(f, "THROUGH"),
250            Token::Sign => write!(f, "SIGN"),
251            Token::Is => write!(f, "IS"),
252            Token::Leading => write!(f, "LEADING"),
253            Token::Trailing => write!(f, "TRAILING"),
254            Token::Separate => write!(f, "SEPARATE"),
255            Token::Blank => write!(f, "BLANK"),
256            Token::When => write!(f, "WHEN"),
257            Token::Zero => write!(f, "ZERO"),
258            Token::PicClause(s) | Token::EditedPic(s) | Token::Identifier(s) => write!(f, "{s}"),
259            Token::Number(n) => write!(f, "{n}"),
260            Token::StringLiteral(s) => write!(f, "\"{s}\""),
261            Token::Period => write!(f, "."),
262            Token::Comma => write!(f, ","),
263            Token::LeftParen => write!(f, "("),
264            Token::RightParen => write!(f, ")"),
265            Token::InlineComment(s) => write!(f, "*> {s}"),
266            Token::Newline => write!(f, "\\n"),
267            Token::Eof => write!(f, "EOF"),
268        }
269    }
270}
271
/// Position information for tokens
///
/// Line/column numbers and the byte span refer to the *processed* text
/// produced by the lexer's preprocessing pass (comments removed,
/// continuations joined), not the raw input.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenPos {
    /// The token value.
    pub token: Token,
    /// 1-based line number where the token starts.
    pub line: usize,
    /// 1-based column number where the token starts.
    pub column: usize,
    /// Byte range within the source text.
    pub span: std::ops::Range<usize>,
}
284
/// COBOL format detection
///
/// The format is chosen heuristically from the shape of the input lines;
/// see `Lexer::format` for the detected value of a given source.
///
/// # Examples
///
/// ```
/// use copybook_lexer::CobolFormat;
///
/// let fmt = CobolFormat::Fixed;
/// assert_eq!(fmt, CobolFormat::Fixed);
/// assert_ne!(fmt, CobolFormat::Free);
/// ```
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CobolFormat {
    /// Traditional fixed-format (columns 1–6 sequence, 7 indicator, 8–72 code).
    Fixed,
    /// Free-format source (no column restrictions).
    Free,
}
303
/// Configuration options for the COBOL lexer.
#[derive(Debug, Clone, Copy)]
pub struct LexerOptions {
    /// When `true`, COBOL-2002 inline comments (`*>`) are recognised.
    pub allow_inline_comments: bool,
    /// When `true`, only column-7 indicators are treated as comment markers.
    pub strict_comments: bool,
}

impl Default for LexerOptions {
    /// Defaults: inline comments enabled, strict column-7-only comment
    /// handling disabled.
    #[inline]
    fn default() -> Self {
        Self {
            allow_inline_comments: true,
            strict_comments: false,
        }
    }
}
322
/// Lexer for COBOL copybooks
///
/// Construction detects the source format and preprocesses the input into
/// lines; `tokenize` then lexes the rebuilt text.
pub struct Lexer<'a> {
    /// Original raw input (kept for reference; not read after preprocessing,
    /// hence the underscore prefix).
    _input: &'a str,
    /// Detected source format (fixed or free).
    format: CobolFormat,
    /// Preprocessed lines with comment/continuation flags.
    lines: Vec<ProcessedLine<'a>>,
    /// Cursor state, currently unused — presumably reserved for a future
    /// streaming interface; TODO confirm before removing.
    _current_line: usize,
    _current_pos: usize,
}
331
/// A processed line after format-specific handling
#[derive(Debug, Clone)]
struct ProcessedLine<'a> {
    /// Code portion of the line (sequence/indicator columns stripped in
    /// fixed form; inline comment stripped in free form).
    content: &'a str,
    /// 1-based line number in the original input (currently unused).
    _original_line: usize,
    /// `true` when the whole line is a comment and must be skipped.
    is_comment: bool,
    /// `true` when this line continues the previous one.
    is_continuation: bool,
}
340
341impl<'a> Lexer<'a> {
342    /// Create a new lexer for the given input
343    #[inline]
344    pub fn new(input: &'a str) -> Self {
345        Self::new_with_options(input, LexerOptions::default())
346    }
347
348    /// Create a new lexer for the given input with specific options
349    #[inline]
350    pub fn new_with_options(input: &'a str, options: LexerOptions) -> Self {
351        let format = detect_format(input);
352        let lines = preprocess_lines(input, format, options);
353
354        Self {
355            _input: input,
356            format,
357            lines,
358            _current_line: 0,
359            _current_pos: 0,
360        }
361    }
362
363    /// Get the detected format
364    #[inline]
365    pub fn format(&self) -> CobolFormat {
366        self.format
367    }
368
369    /// Tokenize the input and return all tokens with positions
370    #[inline]
371    pub fn tokenize(&mut self) -> Vec<TokenPos> {
372        let mut tokens = Vec::new();
373        let processed_text = self.build_processed_text();
374
375        let mut lexer = Token::lexer(&processed_text);
376        let mut line = 1;
377        let mut column = 1;
378
379        while let Some(result) = lexer.next() {
380            let span = lexer.span();
381            let token = if let Ok(token) = result {
382                token
383            } else {
384                let text = &processed_text[span.clone()];
385                Token::Identifier(text.to_string())
386            };
387            let start_column = column;
388
389            if token == Token::Newline {
390                line += 1;
391                column = 1;
392            } else {
393                column += span.len();
394            }
395
396            tokens.push(TokenPos {
397                token,
398                line,
399                column: start_column,
400                span,
401            });
402        }
403
404        tokens.push(TokenPos {
405            token: Token::Eof,
406            line,
407            column,
408            span: processed_text.len()..processed_text.len(),
409        });
410
411        tokens
412    }
413
414    /// Build the processed text from all non-comment lines
415    fn build_processed_text(&self) -> String {
416        let mut result = String::new();
417        let mut i = 0;
418
419        while i < self.lines.len() {
420            let line = &self.lines[i];
421
422            if line.is_comment {
423                i += 1;
424                continue;
425            }
426
427            if line.is_continuation && i > 0 {
428                if result.ends_with('\n') {
429                    result.pop();
430                }
431
432                let mut trimmed_result = result.trim_end().to_string();
433                let continuation_content = line.content.trim();
434
435                if !trimmed_result.is_empty() && !continuation_content.is_empty() {
436                    if trimmed_result.ends_with('-') {
437                        if let Some(stripped) = continuation_content.strip_prefix('-') {
438                            trimmed_result.push_str(stripped);
439                        } else {
440                            trimmed_result.push_str(continuation_content);
441                        }
442                    } else {
443                        trimmed_result.push(' ');
444                        trimmed_result.push_str(continuation_content);
445                    }
446                } else if !continuation_content.is_empty() {
447                    trimmed_result.push_str(continuation_content);
448                }
449
450                result = trimmed_result;
451                result.push('\n');
452            } else {
453                result.push_str(line.content);
454                result.push('\n');
455            }
456
457            i += 1;
458        }
459
460        result
461    }
462}
463
464/// Detect whether the input is fixed-form or free-form COBOL
465fn detect_format(input: &str) -> CobolFormat {
466    let lines: Vec<&str> = input.lines().collect();
467    let mut fixed_form_indicators = 0;
468    let mut total_content_lines = 0;
469
470    for line in &lines {
471        if line.trim().is_empty() || line.trim_start().starts_with('*') {
472            continue;
473        }
474
475        total_content_lines += 1;
476
477        if line.len() >= 8 && line.is_char_boundary(6) && line.is_char_boundary(7) {
478            let first_six = &line[0..6];
479            let col_7 = line.chars().nth(6).unwrap_or(' ');
480            let col_8_onwards = &line[7..];
481
482            if (first_six.chars().all(|c| c.is_ascii_digit() || c == ' '))
483                && (col_7 == ' ' || col_7 == '*' || col_7 == '-' || col_7 == '/')
484                && !col_8_onwards.trim().is_empty()
485            {
486                fixed_form_indicators += 1;
487            }
488        }
489
490        if line.len() == 72 || line.len() == 80 {
491            fixed_form_indicators += 1;
492        }
493    }
494
495    if total_content_lines > 0 && (fixed_form_indicators * 100 / total_content_lines) >= 50 {
496        CobolFormat::Fixed
497    } else {
498        CobolFormat::Free
499    }
500}
501
502/// Preprocess lines according to the detected format
503fn preprocess_lines(
504    input: &str,
505    format: CobolFormat,
506    options: LexerOptions,
507) -> Vec<ProcessedLine<'_>> {
508    let mut result = Vec::new();
509
510    for (line_num, line) in input.lines().enumerate() {
511        let processed = match format {
512            CobolFormat::Fixed => process_fixed_form_line(line, line_num + 1),
513            CobolFormat::Free => process_free_form_line(line, line_num + 1, options),
514        };
515        result.push(processed);
516    }
517
518    result
519}
520
521/// Process a fixed-form COBOL line
522fn process_fixed_form_line(line: &str, line_num: usize) -> ProcessedLine<'_> {
523    if line.is_empty() {
524        return ProcessedLine {
525            content: "",
526            _original_line: line_num,
527            is_comment: false,
528            is_continuation: false,
529        };
530    }
531
532    if line.starts_with('*') {
533        return ProcessedLine {
534            content: line,
535            _original_line: line_num,
536            is_comment: true,
537            is_continuation: false,
538        };
539    }
540
541    let is_continuation =
542        line.len() > 6 && line.is_char_boundary(6) && line.chars().nth(6) == Some('-');
543    let content = if line.len() > 7 && line.is_char_boundary(7) {
544        let end_col = if line.len() > 72 { 72 } else { line.len() };
545        let end_col = if line.is_char_boundary(end_col) {
546            end_col
547        } else {
548            // Find the nearest valid char boundary at or before end_col
549            let mut b = end_col;
550            while b > 7 && !line.is_char_boundary(b) {
551                b -= 1;
552            }
553            b
554        };
555        &line[7..end_col]
556    } else {
557        ""
558    };
559
560    ProcessedLine {
561        content,
562        _original_line: line_num,
563        is_comment: false,
564        is_continuation,
565    }
566}
567
568/// Process a free-form COBOL line
569fn process_free_form_line(line: &str, line_num: usize, options: LexerOptions) -> ProcessedLine<'_> {
570    let trimmed = line.trim_start();
571
572    if trimmed.starts_with('*') {
573        return ProcessedLine {
574            content: line,
575            _original_line: line_num,
576            is_comment: true,
577            is_continuation: false,
578        };
579    }
580
581    let content = if options.allow_inline_comments && !options.strict_comments {
582        if let Some(comment_pos) = line.find("*>") {
583            line[..comment_pos].trim_end()
584        } else {
585            line
586        }
587    } else {
588        line
589    };
590
591    ProcessedLine {
592        content,
593        _original_line: line_num,
594        is_comment: false,
595        is_continuation: false,
596    }
597}
598
#[cfg(test)]
#[allow(clippy::expect_used)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    // Format detection: fixture lines use the classic column-7 layout
    // (6-char sequence area, indicator, code from column 8).
    #[test]
    fn test_format_detection_fixed() {
        let input = r"      * This is a comment
       01  CUSTOMER-RECORD.
           05  CUSTOMER-ID     PIC X(10).
           05  CUSTOMER-NAME   PIC X(30).
";
        assert_eq!(detect_format(input), CobolFormat::Fixed);
    }

    #[test]
    fn test_format_detection_free() {
        let input = r"*> This is a comment
01 CUSTOMER-RECORD.
  05 CUSTOMER-ID PIC X(10).
  05 CUSTOMER-NAME PIC X(30).
";
        assert_eq!(detect_format(input), CobolFormat::Free);
    }

    #[test]
    fn test_basic_tokenization() {
        let input = "01 CUSTOMER-ID PIC X(10).";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();

        assert_eq!(tokens[0].token, Token::Level(1));
        assert_eq!(
            tokens[1].token,
            Token::Identifier("CUSTOMER-ID".to_string())
        );
        assert_eq!(tokens[2].token, Token::Pic);
        assert_eq!(tokens[3].token, Token::PicClause("X(10)".to_string()));
        assert_eq!(tokens[4].token, Token::Period);
    }

    // Continuation: `-` in column 7 joins the second line onto the first.
    #[test]
    fn test_continuation_handling() {
        let input = r"       01  VERY-LONG-FIELD-NAME
      -        PIC X(50).";
        let lexer = Lexer::new(input);
        let processed = lexer.build_processed_text();

        assert!(processed.contains("VERY-LONG-FIELD-NAME PIC X(50)"));
    }

    #[test]
    fn test_edited_pic_detection() {
        let input = "01 AMOUNT PIC ZZ,ZZZ.99.";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();

        let pic_token = tokens
            .iter()
            .find(|t| matches!(t.token, Token::EditedPic(_)));
        assert!(pic_token.is_some());
    }

    #[test]
    fn test_comma_tokenization_priority() {
        let input = ",";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();

        assert_eq!(tokens[0].token, Token::Comma);
    }

    // A bare comma between literals must be Comma, never a one-char EditedPic.
    #[test]
    fn test_comma_in_level88_value_clause() {
        let input = r#"88 IS-VALID VALUE "A", "B", "C"."#;
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();

        let comma_tokens: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.token, Token::Comma))
            .collect();
        assert_eq!(comma_tokens.len(), 2);

        let edited_pic_commas: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(&t.token, Token::EditedPic(s) if s == ","))
            .collect();
        assert_eq!(edited_pic_commas.len(), 0);
    }

    #[test]
    fn test_edited_pic_still_detected_after_comma_fix() {
        let input = "PIC Z,ZZZ.99";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();

        let edited_pic = tokens
            .iter()
            .find(|t| matches!(t.token, Token::EditedPic(_)));
        assert!(edited_pic.is_some());

        if let Some(token_pos) = edited_pic
            && let Token::EditedPic(pattern) = &token_pos.token
        {
            assert!(pattern.contains(','));
        }
    }

    #[test]
    fn test_comma_vs_edited_pic_disambiguation() {
        let mut lexer1 = Lexer::new(",");
        let tokens1 = lexer1.tokenize();
        assert!(matches!(tokens1[0].token, Token::Comma));

        let mut lexer2 = Lexer::new("Z,ZZZ");
        let tokens2 = lexer2.tokenize();
        assert!(matches!(tokens2[0].token, Token::EditedPic(_)));

        // Comma inside a string literal is NOT tokenized as Comma
        let mut lexer3 = Lexer::new(r#""A,B""#);
        let tokens3 = lexer3.tokenize();
        assert!(!tokens3.iter().any(|t| matches!(t.token, Token::Comma)));
    }

    #[test]
    fn test_commas_with_spaces_realistic_cobol() {
        let input = r#"VALUE "A", "B", "C""#;
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();

        let comma_count = tokens
            .iter()
            .filter(|t| matches!(t.token, Token::Comma))
            .count();
        assert_eq!(comma_count, 2);
    }

    #[test]
    fn test_comma_inside_string_literal_not_tokenized() {
        let mut lx = Lexer::new(r#""A,B""#);
        let toks = lx.tokenize();

        assert!(!toks.iter().any(|t| matches!(t.token, Token::Comma)));
        let string_tokens: Vec<_> = toks
            .iter()
            .filter(|t| matches!(&t.token, Token::StringLiteral(s) if s == "A,B"))
            .collect();
        assert_eq!(string_tokens.len(), 1);
    }

    // ── Additional tests ─────────────────────────────────────────────

    #[test]
    fn test_empty_input() {
        let mut lexer = Lexer::new("");
        let tokens = lexer.tokenize();
        assert_eq!(tokens.last().unwrap().token, Token::Eof);
    }

    #[test]
    fn test_whitespace_only_input() {
        let mut lexer = Lexer::new("   \t  ");
        let tokens = lexer.tokenize();
        // Should get newline and Eof
        assert!(tokens.iter().any(|t| t.token == Token::Eof));
    }

    #[test]
    fn test_level_numbers_01_to_49() {
        for level in 1..=49u8 {
            let input = format!("{level:02} FIELD PIC X.");
            let mut lexer = Lexer::new(&input);
            let tokens = lexer.tokenize();
            assert_eq!(tokens[0].token, Token::Level(level), "level {level:02}");
        }
    }

    #[test]
    fn test_level_66() {
        let input = "66 ALIAS-FIELD RENAMES ORIG-FIELD.";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].token, Token::Level66);
        assert_eq!(tokens[2].token, Token::Renames);
    }

    #[test]
    fn test_level_77() {
        let input = "77 STANDALONE-FIELD PIC 9(5).";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].token, Token::Level77);
    }

    #[test]
    fn test_level_88() {
        let input = r#"88 IS-TRUE VALUE "Y"."#;
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].token, Token::Level88);
        assert_eq!(tokens[2].token, Token::Value);
    }

    #[test]
    fn test_pic_keyword_case_insensitive() {
        for kw in &["PIC", "pic", "Pic", "PICTURE", "picture"] {
            let input = format!("{kw} X(5)");
            let mut lexer = Lexer::new(&input);
            let tokens = lexer.tokenize();
            assert!(
                tokens.iter().any(|t| t.token == Token::Pic),
                "failed for keyword: {kw}"
            );
        }
    }

    #[test]
    fn test_comp_variants() {
        let mut lx = Lexer::new("COMP");
        assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp));

        let mut lx = Lexer::new("COMP-1");
        assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp1));

        let mut lx = Lexer::new("COMP-2");
        assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp2));

        let mut lx = Lexer::new("COMP-3");
        assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp3));
    }

    #[test]
    fn test_computational_variants() {
        let mut lx = Lexer::new("COMPUTATIONAL");
        assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp));

        let mut lx = Lexer::new("COMPUTATIONAL-3");
        assert!(lx.tokenize().iter().any(|t| t.token == Token::Comp3));
    }

    #[test]
    fn test_keyword_tokens() {
        let cases = vec![
            ("USAGE", Token::Usage),
            ("DISPLAY", Token::Display),
            ("BINARY", Token::Binary),
            ("REDEFINES", Token::Redefines),
            ("OCCURS", Token::Occurs),
            ("DEPENDING", Token::Depending),
            ("ON", Token::On),
            ("TO", Token::To),
            ("TIMES", Token::Times),
            ("SYNCHRONIZED", Token::Synchronized),
            ("SYNC", Token::Synchronized),
            ("VALUE", Token::Value),
            ("THRU", Token::Thru),
            ("THROUGH", Token::Through),
            ("SIGN", Token::Sign),
            ("LEADING", Token::Leading),
            ("IS", Token::Is),
            ("TRAILING", Token::Trailing),
            ("SEPARATE", Token::Separate),
            ("BLANK", Token::Blank),
            ("WHEN", Token::When),
            ("ZERO", Token::Zero),
            ("ZEROS", Token::Zero),
            ("ZEROES", Token::Zero),
        ];

        for (input, expected) in cases {
            let mut lx = Lexer::new(input);
            let tokens = lx.tokenize();
            assert!(
                tokens.iter().any(|t| t.token == expected),
                "keyword {input} not matched"
            );
        }
    }

    #[test]
    fn test_string_literal_double_quotes() {
        let mut lx = Lexer::new(r#""HELLO WORLD""#);
        let tokens = lx.tokenize();
        assert_eq!(
            tokens[0].token,
            Token::StringLiteral("HELLO WORLD".to_string())
        );
    }

    #[test]
    fn test_string_literal_single_quotes() {
        let mut lx = Lexer::new("'HELLO WORLD'");
        let tokens = lx.tokenize();
        assert_eq!(
            tokens[0].token,
            Token::StringLiteral("HELLO WORLD".to_string())
        );
    }

    #[test]
    fn test_number_token() {
        let mut lx = Lexer::new("OCCURS 100 TIMES");
        let tokens = lx.tokenize();
        assert_eq!(tokens[1].token, Token::Number(100));
        assert_eq!(tokens[2].token, Token::Times);
    }

    #[test]
    fn test_parentheses() {
        let mut lx = Lexer::new("(50)");
        let tokens = lx.tokenize();
        assert_eq!(tokens[0].token, Token::LeftParen);
        assert_eq!(tokens[1].token, Token::Number(50));
        assert_eq!(tokens[2].token, Token::RightParen);
    }

    #[test]
    fn test_period_token() {
        let mut lx = Lexer::new("FIELD-NAME.");
        let tokens = lx.tokenize();
        let last_non_eof = tokens
            .iter()
            .rev()
            .find(|t| t.token != Token::Eof && t.token != Token::Newline)
            .unwrap();
        assert_eq!(last_non_eof.token, Token::Period);
    }

    #[test]
    fn test_pic_clause_patterns() {
        let patterns = vec![
            ("X(10)", "X(10)"),
            ("9(5)", "9(5)"),
            ("S9(5)V9(2)", "S9(5)V9(2)"),
            ("XXX", "XXX"),
            ("S999V99", "S999V99"),
        ];

        for (input, expected) in patterns {
            let full = format!("PIC {input}");
            let mut lx = Lexer::new(&full);
            let tokens = lx.tokenize();
            let pic_clause = tokens
                .iter()
                .find(|t| matches!(&t.token, Token::PicClause(_)));
            assert!(pic_clause.is_some(), "no PicClause for pattern: {input}");
            if let Some(tp) = pic_clause {
                assert_eq!(tp.token, Token::PicClause(expected.to_string()));
            }
        }
    }

    #[test]
    fn test_identifier_with_hyphens() {
        let mut lx = Lexer::new("CUSTOMER-RECORD-ID");
        let tokens = lx.tokenize();
        assert_eq!(
            tokens[0].token,
            Token::Identifier("CUSTOMER-RECORD-ID".to_string())
        );
    }

    #[test]
    fn test_inline_comment_in_free_form() {
        // Inline comments are stripped during preprocessing in free form.
        // Verify that content before the comment is preserved.
        let input = "01 FIELD PIC X. *> this is a comment";
        let mut lx = Lexer::new(input);
        let tokens = lx.tokenize();
        // The comment should be stripped; the field definition should parse
        assert!(tokens.iter().any(|t| t.token == Token::Level(1)));
        assert!(tokens.iter().any(|t| t.token == Token::Period));
    }

    #[test]
    fn test_token_display_trait() {
        assert_eq!(format!("{}", Token::Level(5)), "05");
        assert_eq!(format!("{}", Token::Level66), "66");
        assert_eq!(format!("{}", Token::Level77), "77");
        assert_eq!(format!("{}", Token::Level88), "88");
        assert_eq!(format!("{}", Token::Pic), "PIC");
        assert_eq!(format!("{}", Token::Comp3), "COMP-3");
        assert_eq!(format!("{}", Token::Period), ".");
        assert_eq!(format!("{}", Token::Comma), ",");
        assert_eq!(format!("{}", Token::LeftParen), "(");
        assert_eq!(format!("{}", Token::RightParen), ")");
        assert_eq!(format!("{}", Token::Eof), "EOF");
        assert_eq!(format!("{}", Token::Newline), "\\n");
        assert_eq!(format!("{}", Token::Number(42)), "42");
        assert_eq!(
            format!("{}", Token::StringLiteral("test".to_string())),
            "\"test\""
        );
        assert_eq!(
            format!("{}", Token::InlineComment("comment".to_string())),
            "*> comment"
        );
    }

    #[test]
    fn test_lexer_options_default() {
        let opts = LexerOptions::default();
        assert!(opts.allow_inline_comments);
        assert!(!opts.strict_comments);
    }

    #[test]
    fn test_lexer_format_accessor() {
        let lexer = Lexer::new("01 FIELD PIC X.");
        // Free form since it's a short line
        assert_eq!(lexer.format(), CobolFormat::Free);
    }

    #[test]
    fn test_cobol_format_eq() {
        assert_eq!(CobolFormat::Fixed, CobolFormat::Fixed);
        assert_eq!(CobolFormat::Free, CobolFormat::Free);
        assert_ne!(CobolFormat::Fixed, CobolFormat::Free);
    }

    #[test]
    fn test_token_last_is_always_eof() {
        for input in &["01 X PIC X.", "", "OCCURS 5 TIMES.", "  "] {
            let mut lx = Lexer::new(input);
            let tokens = lx.tokenize();
            assert_eq!(tokens.last().unwrap().token, Token::Eof);
        }
    }

    #[test]
    fn test_tokenpos_has_position_info() {
        let mut lx = Lexer::new("01 FIELD PIC X.");
        let tokens = lx.tokenize();
        let first = &tokens[0];
        assert_eq!(first.line, 1);
        assert!(first.column >= 1);
        assert!(!first.span.is_empty());
    }

    #[test]
    fn test_occurs_depending_on_clause() {
        let input = "OCCURS 1 TO 10 DEPENDING ON COUNTER";
        let mut lx = Lexer::new(input);
        let tokens = lx.tokenize();
        let token_types: Vec<_> = tokens.iter().map(|t| &t.token).collect();
        assert!(token_types.contains(&&Token::Occurs));
        assert!(token_types.contains(&&Token::To));
        assert!(token_types.contains(&&Token::Depending));
        assert!(token_types.contains(&&Token::On));
    }

    #[test]
    fn test_value_thru_clause() {
        let input = "VALUE 1 THRU 100";
        let mut lx = Lexer::new(input);
        let tokens = lx.tokenize();
        assert!(tokens.iter().any(|t| t.token == Token::Value));
        assert!(tokens.iter().any(|t| t.token == Token::Thru));
    }
}