Skip to main content

patch_rexx/
lexer.rs

//! REXX lexer — tokenizes source text into a stream of tokens.
//!
//! REXX tokenization is straightforward: the language has no reserved words
//! (keywords are context-sensitive), so the lexer produces generic symbols
//! and lets the parser decide meaning.

7use crate::error::{RexxDiagnostic, RexxError, RexxResult, SourceLoc};
8
/// Token types produced by the lexer.
///
/// Literal payloads are kept as `String`s: numeric interpretation is
/// deferred to later stages, and hex/binary string forms are decoded to
/// their character content before being stored in `StringLit`.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    StringLit(String), // quoted string; hex ('..'X) / binary ('..'B) already decoded
    Number(String),    // numeric literal, stored exactly as written
    Symbol(String),    // identifier/keyword — REXX has no reserved words

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    IntDiv,    // %
    Remainder, // //
    Power,     // **
    Concat,    // ||
    Assign,    // =

    // Comparison
    // NOTE(review): the lexer itself always emits Assign for a lone '=';
    // Equal presumably comes from parser reclassification — confirm there.
    Equal,       // = (context-dependent, same char as Assign)
    NotEqual,    // \= or <>
    Greater,     // >
    Less,        // <
    GreaterEq,   // >= or \<
    LessEq,      // <= or \>
    StrictEq,    // ==
    StrictNotEq, // \==
    StrictGt,    // >>
    StrictLt,    // <<
    StrictGte,   // >>=
    StrictLte,   // <<=

    // Logical
    And, // &
    Or,  // |
    Xor, // &&
    Not, // \ or ¬

    // Delimiters
    LeftParen,
    RightParen,
    Comma,
    Semicolon,
    Colon,
    Dot,

    // Special
    Eol, // End of logical line (clause terminator)
    Eof,
}
60
/// A single lexed token: its kind plus the source location where it began.
#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub loc: SourceLoc,
    /// Whether whitespace (or a comment) appeared before this token.
    /// Used by the parser to distinguish abuttal from blank concatenation,
    /// and function calls (`name(`) from concat-with-parens (`name (`).
    pub space_before: bool,
}
70
71impl Token {
72    pub fn new(kind: TokenKind, loc: SourceLoc, space_before: bool) -> Self {
73        Self {
74            kind,
75            loc,
76            space_before,
77        }
78    }
79}
80
/// Tokenizer over a REXX source text.
pub struct Lexer {
    source: Vec<char>,  // full source as chars, for cheap indexed lookahead
    pos: usize,         // index into `source` of the next unread char
    line: usize,        // 1-based line of the next unread char
    col: usize,         // 1-based column of the next unread char
    lines: Vec<String>, // per-line copies of the source, used for diagnostics
}
88
89impl Lexer {
90    pub fn new(source: &str) -> Self {
91        let lines: Vec<String> = source.lines().map(String::from).collect();
92        Self {
93            source: source.chars().collect(),
94            pos: 0,
95            line: 1,
96            col: 1,
97            lines,
98        }
99    }
100
101    pub fn tokenize(&mut self) -> RexxResult<Vec<Token>> {
102        let mut tokens = Vec::new();
103
104        loop {
105            let pos_before = self.pos;
106            self.skip_whitespace_and_comments()?;
107            let had_space = self.pos > pos_before;
108
109            if self.at_end() {
110                tokens.push(Token::new(TokenKind::Eof, self.loc(), had_space));
111                break;
112            }
113
114            let mut token = self.next_token()?;
115            token.space_before = had_space;
116
117            // Line continuation: if token is a comma and the rest of the
118            // line (ignoring whitespace/comments) is empty, this comma is a
119            // continuation marker — skip it and join with the next line.
120            if matches!(token.kind, TokenKind::Comma) && self.is_line_continuation() {
121                // Consume everything up to and including the newline
122                while let Some(ch) = self.peek() {
123                    if ch == '\n' {
124                        self.advance(); // consume the newline
125                        break;
126                    }
127                    self.advance(); // consume whitespace/comment chars before newline
128                }
129                continue;
130            }
131
132            tokens.push(token);
133        }
134
135        Ok(tokens)
136    }
137
138    fn loc(&self) -> SourceLoc {
139        let mut loc = SourceLoc::new(self.line, self.col);
140        if self.line > 0 && self.line <= self.lines.len() {
141            loc = loc.with_source(self.lines[self.line - 1].clone());
142        }
143        loc
144    }
145
    /// Check if the rest of the current line (ignoring whitespace and block
    /// comments) is empty — i.e., the next non-blank content is a newline or EOF.
    /// Used to detect trailing-comma line continuation.
    ///
    /// Pure lookahead: scans with a local index and never mutates lexer state.
    fn is_line_continuation(&self) -> bool {
        let mut i = self.pos;
        while i < self.source.len() {
            let ch = self.source[i];
            match ch {
                ' ' | '\t' | '\r' => {
                    i += 1;
                }
                '\n' => return true,
                // Block comment: skip it entirely (comments nest, so track
                // depth; an unterminated comment just runs this probe to EOF)
                '/' if i + 1 < self.source.len() && self.source[i + 1] == '*' => {
                    i += 2;
                    let mut depth = 1u32;
                    while depth > 0 && i < self.source.len() {
                        if i + 1 < self.source.len()
                            && self.source[i] == '/'
                            && self.source[i + 1] == '*'
                        {
                            depth += 1;
                            i += 2;
                        } else if i + 1 < self.source.len()
                            && self.source[i] == '*'
                            && self.source[i + 1] == '/'
                        {
                            depth -= 1;
                            i += 2;
                        } else {
                            i += 1;
                        }
                    }
                }
                // Line comment: rest of line is a comment → continuation
                '-' if i + 1 < self.source.len() && self.source[i + 1] == '-' => return true,
                // Any real token content → not a continuation
                _ => return false,
            }
        }
        // Reached EOF — treat as continuation (no more lines, comma at end of file)
        true
    }
188
189    fn at_end(&self) -> bool {
190        self.pos >= self.source.len()
191    }
192
193    fn peek(&self) -> Option<char> {
194        self.source.get(self.pos).copied()
195    }
196
    /// Look `n` characters past the current position without consuming
    /// (`n == 0` is the current character). Returns `None` past EOF.
    fn peek_ahead(&self, n: usize) -> Option<char> {
        self.source.get(self.pos + n).copied()
    }
200
201    fn advance(&mut self) -> Option<char> {
202        let ch = self.source.get(self.pos).copied()?;
203        self.pos += 1;
204        if ch == '\n' {
205            self.line += 1;
206            self.col = 1;
207        } else {
208            self.col += 1;
209        }
210        Some(ch)
211    }
212
    /// Skip spaces, tabs, carriage returns, nestable block comments
    /// (`/* ... */`), and `--` line comments. Newlines are NOT skipped —
    /// they terminate clauses and are lexed as `Eol` tokens.
    ///
    /// Also skips a `#!` shebang line, but only at the very start of the
    /// file; its trailing newline is left in place (so it still yields Eol).
    ///
    /// # Errors
    /// Returns `RexxError::UnmatchedComment` if a block comment is still
    /// open at end of input, located at the comment's opening `/*`.
    fn skip_whitespace_and_comments(&mut self) -> RexxResult<()> {
        // Skip shebang line if at start of file
        if self.pos == 0 && self.peek() == Some('#') && self.peek_ahead(1) == Some('!') {
            while let Some(ch) = self.peek() {
                if ch == '\n' {
                    break;
                }
                self.advance();
            }
        }

        loop {
            // Skip whitespace but NOT newlines — they are clause terminators
            while let Some(ch) = self.peek() {
                if ch == ' ' || ch == '\t' || ch == '\r' {
                    self.advance();
                } else {
                    break;
                }
            }

            // Skip block comments /* ... */ (can nest)
            if self.peek() == Some('/') && self.peek_ahead(1) == Some('*') {
                let loc = self.loc(); // remember the opener for error reporting
                self.advance(); // /
                self.advance(); // *
                let mut depth = 1u32;
                while depth > 0 {
                    if self.at_end() {
                        return Err(RexxDiagnostic::new(RexxError::UnmatchedComment).at(loc));
                    }
                    if self.peek() == Some('/') && self.peek_ahead(1) == Some('*') {
                        self.advance();
                        self.advance();
                        depth += 1;
                    } else if self.peek() == Some('*') && self.peek_ahead(1) == Some('/') {
                        self.advance();
                        self.advance();
                        depth -= 1;
                    } else {
                        self.advance();
                    }
                }
                continue; // re-check for whitespace/comments after the comment
            }

            // Skip line comments -- (ANSI REXX extension)
            if self.peek() == Some('-') && self.peek_ahead(1) == Some('-') {
                while let Some(ch) = self.peek() {
                    if ch == '\n' {
                        break; // leave the newline for the Eol token
                    }
                    self.advance();
                }
                continue;
            }

            break;
        }
        Ok(())
    }
274
    /// Lex a single token starting at the current position.
    ///
    /// Callers must have already run `skip_whitespace_and_comments` (as
    /// `tokenize` does) — in particular that is why `/` reaching this match
    /// can never be the start of a `/*` comment. `space_before` is set to
    /// `false` on every token built here; `tokenize` overwrites it.
    ///
    /// # Errors
    /// `RexxError::InvalidCharacter` for a character that cannot start a token.
    #[allow(clippy::too_many_lines)]
    fn next_token(&mut self) -> RexxResult<Token> {
        let loc = self.loc();
        // Safe: tokenize() checks at_end() before calling next_token().
        let ch = self.peek().unwrap();

        match ch {
            // String literals: 'single' or "double" quoted
            '\'' | '"' => self.lex_string(ch),

            // Numbers
            '0'..='9' => Ok(self.lex_number()),

            // Symbols (identifiers, keywords — REXX has no reserved words)
            'a'..='z' | 'A'..='Z' | '_' | '!' | '?' | '@' | '#' | '$' => Ok(self.lex_symbol()),

            // Dot can start a symbol or be standalone
            '.' => {
                if self
                    .peek_ahead(1)
                    .is_some_and(|c| c.is_alphanumeric() || c == '_')
                {
                    Ok(self.lex_symbol())
                } else {
                    self.advance();
                    Ok(Token::new(TokenKind::Dot, loc, false))
                }
            }

            // Operators and delimiters
            '+' => {
                self.advance();
                Ok(Token::new(TokenKind::Plus, loc, false))
            }
            '-' => {
                // A lone '-': '--' line comments were consumed by the skipper
                self.advance();
                Ok(Token::new(TokenKind::Minus, loc, false))
            }
            '*' => {
                self.advance();
                if self.peek() == Some('*') {
                    self.advance();
                    Ok(Token::new(TokenKind::Power, loc, false))
                } else {
                    Ok(Token::new(TokenKind::Star, loc, false))
                }
            }
            '/' => {
                self.advance();
                if self.peek() == Some('/') {
                    self.advance();
                    Ok(Token::new(TokenKind::Remainder, loc, false))
                } else {
                    Ok(Token::new(TokenKind::Slash, loc, false))
                }
            }
            '%' => {
                self.advance();
                Ok(Token::new(TokenKind::IntDiv, loc, false))
            }
            '|' => {
                self.advance();
                if self.peek() == Some('|') {
                    self.advance();
                    Ok(Token::new(TokenKind::Concat, loc, false))
                } else {
                    Ok(Token::new(TokenKind::Or, loc, false))
                }
            }
            '&' => {
                self.advance();
                if self.peek() == Some('&') {
                    self.advance();
                    Ok(Token::new(TokenKind::Xor, loc, false))
                } else {
                    Ok(Token::new(TokenKind::And, loc, false))
                }
            }
            // Negation prefix: \ (or the EBCDIC-heritage ¬) combines with a
            // following comparison char to form the negated operators.
            '\\' | '¬' => {
                self.advance();
                if self.peek() == Some('=') {
                    self.advance();
                    if self.peek() == Some('=') {
                        self.advance();
                        Ok(Token::new(TokenKind::StrictNotEq, loc, false))
                    } else {
                        Ok(Token::new(TokenKind::NotEqual, loc, false))
                    }
                } else if self.peek() == Some('<') {
                    self.advance();
                    // "not less than" == "greater or equal"
                    Ok(Token::new(TokenKind::GreaterEq, loc, false))
                } else if self.peek() == Some('>') {
                    self.advance();
                    // "not greater than" == "less or equal"
                    Ok(Token::new(TokenKind::LessEq, loc, false))
                } else {
                    Ok(Token::new(TokenKind::Not, loc, false))
                }
            }
            '=' => {
                self.advance();
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(Token::new(TokenKind::StrictEq, loc, false))
                } else {
                    // Parser disambiguates assignment vs comparison
                    Ok(Token::new(TokenKind::Assign, loc, false))
                }
            }
            '>' => {
                self.advance();
                if self.peek() == Some('>') {
                    self.advance();
                    if self.peek() == Some('=') {
                        self.advance();
                        Ok(Token::new(TokenKind::StrictGte, loc, false))
                    } else {
                        Ok(Token::new(TokenKind::StrictGt, loc, false))
                    }
                } else if self.peek() == Some('=') {
                    self.advance();
                    Ok(Token::new(TokenKind::GreaterEq, loc, false))
                } else {
                    Ok(Token::new(TokenKind::Greater, loc, false))
                }
            }
            '<' => {
                self.advance();
                if self.peek() == Some('<') {
                    self.advance();
                    if self.peek() == Some('=') {
                        self.advance();
                        Ok(Token::new(TokenKind::StrictLte, loc, false))
                    } else {
                        Ok(Token::new(TokenKind::StrictLt, loc, false))
                    }
                } else if self.peek() == Some('=') {
                    self.advance();
                    Ok(Token::new(TokenKind::LessEq, loc, false))
                } else if self.peek() == Some('>') {
                    self.advance();
                    // <> is an alternate spelling of "not equal"
                    Ok(Token::new(TokenKind::NotEqual, loc, false))
                } else {
                    Ok(Token::new(TokenKind::Less, loc, false))
                }
            }
            '(' => {
                self.advance();
                Ok(Token::new(TokenKind::LeftParen, loc, false))
            }
            ')' => {
                self.advance();
                Ok(Token::new(TokenKind::RightParen, loc, false))
            }
            ',' => {
                self.advance();
                Ok(Token::new(TokenKind::Comma, loc, false))
            }
            '\n' => {
                self.advance();
                Ok(Token::new(TokenKind::Eol, loc, false))
            }
            ';' => {
                self.advance();
                Ok(Token::new(TokenKind::Semicolon, loc, false))
            }
            ':' => {
                self.advance();
                Ok(Token::new(TokenKind::Colon, loc, false))
            }
            _ => Err(RexxDiagnostic::new(RexxError::InvalidCharacter)
                .at(loc)
                .with_detail(format!("unexpected character '{ch}'"))),
        }
    }
448
449    fn lex_string(&mut self, quote: char) -> RexxResult<Token> {
450        let loc = self.loc();
451        self.advance(); // opening quote
452        let mut value = String::new();
453
454        loop {
455            if self.at_end() {
456                return Err(RexxDiagnostic::new(RexxError::InvalidExpression)
457                    .at(loc)
458                    .with_detail("unterminated string literal"));
459            }
460            let ch = self.advance().unwrap();
461            if ch == quote {
462                // Doubled quote is an escape: '' inside '...' means literal '
463                if self.peek() == Some(quote) {
464                    self.advance();
465                    value.push(quote);
466                } else {
467                    break;
468                }
469            } else {
470                value.push(ch);
471            }
472        }
473
474        // Check for hex/binary string suffix: '...'X or "..."X or B
475        if let Some(suffix) = self.peek() {
476            match suffix.to_ascii_uppercase() {
477                'X' => {
478                    self.advance();
479                    let decoded = hex_string_to_chars(&value).map_err(|e| {
480                        RexxDiagnostic::new(RexxError::InvalidHexBinary)
481                            .at(loc.clone())
482                            .with_detail(e)
483                    })?;
484                    return Ok(Token::new(TokenKind::StringLit(decoded), loc, false));
485                }
486                'B' => {
487                    self.advance();
488                    let decoded = bin_string_to_chars(&value).map_err(|e| {
489                        RexxDiagnostic::new(RexxError::InvalidHexBinary)
490                            .at(loc.clone())
491                            .with_detail(e)
492                    })?;
493                    return Ok(Token::new(TokenKind::StringLit(decoded), loc, false));
494                }
495                _ => {}
496            }
497        }
498
499        Ok(Token::new(TokenKind::StringLit(value), loc, false))
500    }
501
502    fn lex_number(&mut self) -> Token {
503        let loc = self.loc();
504        let mut num = String::new();
505
506        while let Some(ch) = self.peek() {
507            if ch.is_ascii_digit() || ch == '.' {
508                num.push(ch);
509                self.advance();
510            } else {
511                break;
512            }
513        }
514
515        // Exponent part
516        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
517            num.push(self.advance().unwrap());
518            if self.peek().is_some_and(|c| c == '+' || c == '-') {
519                num.push(self.advance().unwrap());
520            }
521            while let Some(ch) = self.peek() {
522                if ch.is_ascii_digit() {
523                    num.push(ch);
524                    self.advance();
525                } else {
526                    break;
527                }
528            }
529        }
530
531        Token::new(TokenKind::Number(num), loc, false)
532    }
533
534    fn lex_symbol(&mut self) -> Token {
535        let loc = self.loc();
536        let mut name = String::new();
537
538        while let Some(ch) = self.peek() {
539            if ch.is_alphanumeric()
540                || ch == '_'
541                || ch == '.'
542                || ch == '!'
543                || ch == '?'
544                || ch == '@'
545                || ch == '#'
546                || ch == '$'
547            {
548                name.push(ch);
549                self.advance();
550            } else {
551                break;
552            }
553        }
554
555        Token::new(TokenKind::Symbol(name), loc, false)
556    }
557}
558
/// Convert a hex string like "48 65 6C" to characters (whitespace between
/// digit pairs is ignored, per REXX hex-string rules).
///
/// Operates on `char`s rather than byte-slicing the `String`: indexing
/// `&s[i..i + 2]` panics on a UTF-8 char boundary when the input contains
/// a non-ASCII character (e.g. the source literal `'aéa'x`), whereas here
/// such input yields an `Err` like any other invalid digit.
///
/// # Errors
/// Odd digit count, or any character that is not a hex digit.
fn hex_string_to_chars(s: &str) -> Result<String, String> {
    let digits: Vec<char> = s.chars().filter(|c| !c.is_whitespace()).collect();
    if digits.len() % 2 != 0 {
        return Err("odd number of hex digits".into());
    }
    let mut result = String::with_capacity(digits.len() / 2);
    for (i, pair) in digits.chunks(2).enumerate() {
        let hi = pair[0]
            .to_digit(16)
            .ok_or_else(|| format!("invalid hex digit at position {}", 2 * i))?;
        let lo = pair[1]
            .to_digit(16)
            .ok_or_else(|| format!("invalid hex digit at position {}", 2 * i + 1))?;
        // Value is <= 0xFF, so the u8 cast is lossless; byte→char maps to
        // the same Latin-1 code point the byte-wise original produced.
        result.push((hi * 16 + lo) as u8 as char);
    }
    Ok(result)
}
573
/// Convert a binary string like "0100 1000" to characters (whitespace is
/// ignored; every 8 bits form one byte).
///
/// Operates on `char`s rather than byte-slicing the `String`: indexing
/// `&s[i..i + 8]` panics on a UTF-8 char boundary when the input contains
/// a non-ASCII character, whereas here such input yields an `Err`.
///
/// # Errors
/// Bit count not a multiple of 8, or any character other than `0`/`1`.
fn bin_string_to_chars(s: &str) -> Result<String, String> {
    let bits: Vec<char> = s.chars().filter(|c| !c.is_whitespace()).collect();
    if bits.len() % 8 != 0 {
        return Err("binary string length must be a multiple of 8".into());
    }
    let mut result = String::with_capacity(bits.len() / 8);
    for (i, octet) in bits.chunks(8).enumerate() {
        let mut byte = 0u8;
        for &bit in octet {
            byte = (byte << 1)
                | match bit {
                    '0' => 0,
                    '1' => 1,
                    _ => return Err(format!("invalid binary digit at position {}", 8 * i)),
                };
        }
        result.push(byte as char);
    }
    Ok(result)
}
588
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple_say() {
        let mut lexer = Lexer::new("say 'Hello, World!'");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Symbol(s) if s == "say"));
        assert!(matches!(&tokens[1].kind, TokenKind::StringLit(s) if s == "Hello, World!"));
        assert!(matches!(&tokens[2].kind, TokenKind::Eof));
    }

    #[test]
    fn arithmetic_tokens() {
        let mut lexer = Lexer::new("3 + 4 * 2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Number(n) if n == "3"));
        assert!(matches!(&tokens[1].kind, TokenKind::Plus));
        assert!(matches!(&tokens[2].kind, TokenKind::Number(n) if n == "4"));
        assert!(matches!(&tokens[3].kind, TokenKind::Star));
        assert!(matches!(&tokens[4].kind, TokenKind::Number(n) if n == "2"));
    }

    #[test]
    fn nested_comments() {
        let mut lexer = Lexer::new("/* outer /* inner */ still comment */ say 'hi'");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Symbol(s) if s == "say"));
    }

    #[test]
    fn line_comment_skipped() {
        let mut lexer = Lexer::new("say 1 -- trailing comment");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Symbol(s) if s == "say"));
        assert!(matches!(&tokens[1].kind, TokenKind::Number(n) if n == "1"));
        assert!(matches!(&tokens[2].kind, TokenKind::Eof));
    }

    #[test]
    fn hex_string() {
        let mut lexer = Lexer::new("'48656C6C6F'x");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::StringLit(s) if s == "Hello"));
    }

    #[test]
    fn binary_string() {
        let mut lexer = Lexer::new("'01001000 01101001'b");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::StringLit(s) if s == "Hi"));
    }

    #[test]
    fn doubled_quote_escape() {
        let mut lexer = Lexer::new("'it''s'");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::StringLit(s) if s == "it's"));
    }

    #[test]
    fn comparison_operators() {
        let mut lexer = Lexer::new("a == b \\= c >> d");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[1].kind, TokenKind::StrictEq));
        assert!(matches!(&tokens[3].kind, TokenKind::NotEqual));
        assert!(matches!(&tokens[5].kind, TokenKind::StrictGt));
    }

    #[test]
    fn comma_line_continuation() {
        // Trailing comma joins the next line into the same clause:
        // no Comma and no Eol token appear in the output.
        let mut lexer = Lexer::new("say a,\n b");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Symbol(s) if s == "say"));
        assert!(matches!(&tokens[1].kind, TokenKind::Symbol(s) if s == "a"));
        assert!(matches!(&tokens[2].kind, TokenKind::Symbol(s) if s == "b"));
        assert!(matches!(&tokens[3].kind, TokenKind::Eof));
    }

    #[test]
    fn space_before_flag() {
        let mut lexer = Lexer::new("f(x) g (y)");
        let tokens = lexer.tokenize().unwrap();
        // `f(` — no space, function-call form
        assert!(!tokens[1].space_before);
        // `g (` — separated paren
        assert!(tokens[5].space_before);
    }

    #[test]
    fn shebang_line_skipped() {
        let mut lexer = Lexer::new("#!/usr/bin/env rexx\nsay 'hello'");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Eol));
        assert!(matches!(&tokens[1].kind, TokenKind::Symbol(s) if s == "say"));
        assert!(matches!(&tokens[2].kind, TokenKind::StringLit(s) if s == "hello"));
    }
}