runmat_lexer/lib.rs

use logos::{Filter, Lexer, Logos};

#[derive(Default, Clone, Copy)]
pub struct LexerExtras {
    pub last_was_value: bool,
    pub line_start: bool,
}

#[derive(Logos, Debug, PartialEq, Clone)]
// Skip spaces, tabs and carriage returns, but NOT newlines; we need newlines to detect '%%' at line start
#[logos(skip r"[ \t\r]+")]
#[logos(extras = LexerExtras)]
pub enum Token {
    // Keywords
    #[token("function")]
    Function,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("elseif")]
    ElseIf,
    #[token("for")]
    For,
    #[token("while")]
    While,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("end")]
    End,

    // Object-oriented and function syntax keywords
    #[token("classdef")]
    ClassDef,
    #[token("properties")]
    Properties,
    #[token("methods")]
    Methods,
    #[token("events")]
    Events,
    #[token("enumeration")]
    Enumeration,
    #[token("arguments")]
    Arguments,

    // Importing packages/classes
    #[token("import")]
    Import,

    // Additional keywords (recognized by lexer; parser may treat as identifiers for now)
    #[token("switch")]
    Switch,
    #[token("case")]
    Case,
    #[token("otherwise")]
    Otherwise,
    #[token("try")]
    Try,
    #[token("catch")]
    Catch,
    #[token("global")]
    Global,
    #[token("persistent")]
    Persistent,
    #[token("true", |lex| { lex.extras.last_was_value = true; })]
    True,
    #[token("false", |lex| { lex.extras.last_was_value = true; })]
    False,

    // Identifiers and literals
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| { lex.extras.last_was_value = true; })]
    Ident,
    // Float with optional underscores as digit separators (strip later)
    #[regex(r"\d(?:_?\d)*\.(?:\d(?:_?\d)*)?(?:[eE][+-]?\d(?:_?\d)*)?", |lex| {
        lex.extras.last_was_value = true;
    })]
    #[regex(r"\d(?:_?\d)*[eE][+-]?\d(?:_?\d)*", |lex| {
        lex.extras.last_was_value = true;
    })]
    Float,
    // Integer with optional underscores as digit separators (strip later)
    #[regex(r"\d(?:_?\d)*", |lex| {
        lex.extras.last_was_value = true;
    })]
    Integer,
    // Apostrophe is handled contextually in tokenize_detailed: either Transpose or a single-quoted string
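    // For example, `A'` lexes as [Ident, Transpose], while in `x = 'hi'` the
    // apostrophe opens a string and the whole literal is emitted as one Str token.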
    #[token("'")]
    Apostrophe,
    // Double-quoted string scalar (treated as Str at lexer level). Always emit.
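    // e.g. "say ""hi""" is a single Str token; doubled quotes are the escape form.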
    #[regex(r#""([^"\n\r]|"")*""#, double_quoted_string_emit, priority = 1)]
    Str,
    #[token("...", ellipsis_emit_and_skip_to_eol)]
    Ellipsis,
    // Section marker: must be at start of line (after optional whitespace). We match until EOL and emit a single token.
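    // e.g. a line beginning with "%% Setup" yields one Section token covering the whole line.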
99    #[regex(r"%%[^\n]*", section_marker, priority = 3)]
100    Section,
101    #[token(".*")]
102    DotStar,
103    #[token("./")]
104    DotSlash,
105    #[token(".\\")]
106    DotBackslash,
107    #[token(".^")]
108    DotCaret,
109    #[token("&&")]
110    AndAnd,
111    #[token("||")]
112    OrOr,
113    #[token("==")]
114    Equal,
115    #[token("~=")]
116    NotEqual,
117    #[token("<=")]
118    LessEqual,
119    #[token(">=")]
120    GreaterEqual,
121    #[token("+")]
122    Plus,
123    #[token("-")]
124    Minus,
125    #[token("*")]
126    Star,
127    #[token("/")]
128    Slash,
129    #[token("\\")]
130    Backslash,
131    #[token("^")]
132    Caret,
133    #[token("&")]
134    And,
135    #[token("|")]
136    Or,
137    #[token("~")]
138    Tilde,
139    #[token("@")]
140    At,
141    // Meta-class (type) query operator: ?ClassName
142    #[token("?")]
143    Question,
144    #[token("<")]
145    Less,
146    #[token(">")]
147    Greater,
148    #[token("=", |lex| { lex.extras.last_was_value = false; })]
149    Assign,
150    #[token(".")]
151    Dot,
152    // Semicolon ends a statement; next token should not be treated as a value.
153    // This helps disambiguate that a following apostrophe starts a string, not a transpose.
154    #[token(";", |lex| { lex.extras.last_was_value = false; })]
155    Semicolon,
156    #[token(",")]
157    Comma,
158    #[token(":")]
159    Colon,
160    #[token("(", |lex| { lex.extras.last_was_value = false; })]
161    LParen,
162    #[token(")", |lex| { lex.extras.last_was_value = true; })]
163    RParen,
164    #[token("[", |lex| { lex.extras.last_was_value = false; })]
165    LBracket,
166    #[token("]", |lex| { lex.extras.last_was_value = true; })]
167    RBracket,
168    #[token("{", |lex| { lex.extras.last_was_value = false; })]
169    LBrace,
170    #[token("}", |lex| { lex.extras.last_was_value = true; })]
171    RBrace,
172
    // Newlines are emitted as Newline tokens (filtered out by `tokenize`) and set
    // line_start so a following '%%' can be detected
    #[regex(r"\n+", newline_skip)]
    Newline,

    // Block comments: '%{' ... '%}' (non-nesting). Skipped entirely.
    #[regex(r"%\{", block_comment_skip, priority = 2)]
    BlockComment,

    // Line comments: single '%' handled here; '%%' and '%{' are matched by other rules first
    #[token("%", line_comment_start, priority = 0)]
    LineComment,

    Error,
    // Synthetic tokens (not produced by Logos directly)
    Transpose,
}

#[derive(Debug, Clone, PartialEq)]
pub struct SpannedToken {
    pub token: Token,
    pub lexeme: String,
    pub start: usize,
    pub end: usize,
}

pub fn tokenize(input: &str) -> Vec<Token> {
    tokenize_detailed(input)
        .into_iter()
        .map(|t| t.token)
        .filter(|tok| !matches!(tok, Token::Newline))
        .collect()
}
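
// Usage sketch (illustrative): tokenize("x = 1;") yields
// [Ident, Assign, Integer, Semicolon]. Newline tokens are dropped here;
// tokenize_detailed keeps them, along with lexemes and byte spans.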

pub fn tokenize_detailed(input: &str) -> Vec<SpannedToken> {
    let mut lex = Token::lexer(input);
    // We begin at the start of a (virtual) line
    lex.extras.line_start = true;
    let mut out: Vec<SpannedToken> = Vec::new();
    while let Some(res) = lex.next() {
        match res {
            Ok(tok) => {
                let mut s = lex.slice().to_string();
                // Normalize numeric literals: remove underscores in integers/floats
                if matches!(tok, Token::Float | Token::Integer) {
                    s.retain(|c| c != '_');
                }
                let span = lex.span();

                // Handle contextual apostrophe before normal push logic
                if matches!(tok, Token::Apostrophe) {
                    // Decide using adjacency + previous token category.
                    // Transpose only when there is no whitespace between and previous token is a value or dot.
                    let (is_adjacent, prev_token_opt) = out
                        .last()
                        .map(|t| (t.end == span.start, Some(&t.token)))
                        .unwrap_or((false, None));
                    let prev_is_value_or_dot = prev_token_opt
                        .map(|t| matches!(t, Token::Dot) || last_is_value_token(t))
                        .unwrap_or(false);
                    if is_adjacent && prev_is_value_or_dot {
                        out.push(SpannedToken {
                            token: Token::Transpose,
                            lexeme: "'".into(),
                            start: span.start,
                            end: span.end,
                        });
                        continue;
                    }
                    // Otherwise, parse a full single-quoted string starting at this apostrophe
                    let rem = lex.remainder();
                    let mut j = 0usize;
                    let bytes = rem.as_bytes();
                    let mut ok = false;
                    while j < rem.len() {
                        let c = bytes[j] as char;
                        if c == '\'' {
                            if j + 1 < rem.len() && bytes[j + 1] as char == '\'' {
                                j += 2; // escaped quote
                            } else {
                                ok = true; // closing quote
                                j += 1; // include closing
                                break;
                            }
                        } else if c == '\n' || c == '\r' {
                            break;
                        } else {
                            j += 1;
                        }
                    }
                    if ok {
                        // Consume what we scanned and emit Str for the entire single-quoted literal
                        let abs_start = span.start;
                        let abs_end = span.end + j;
                        let lexeme = format!("'{}", &rem[..j]);
                        lex.bump(j); // advance past the content following the leading apostrophe
                        lex.extras.last_was_value = true;
                        out.push(SpannedToken {
                            token: Token::Str,
                            lexeme,
                            start: abs_start,
                            end: abs_end,
                        });
                        lex.extras.line_start = false;
                        continue;
                    } else {
                        // Unterminated; treat as Error
                        out.push(SpannedToken {
                            token: Token::Error,
                            lexeme: "'".into(),
                            start: span.start,
                            end: span.end,
                        });
                        continue;
                    }
                }
                // On any emitted token that is not a newline, section, or comment/skip,
                // we are no longer at line start. Section stays exempt because
                // section_marker consumes the trailing newline, so the next token
                // really does begin a fresh line.
                match tok {
                    Token::Newline | Token::Section | Token::LineComment | Token::BlockComment => {}
                    _ => {
                        lex.extras.line_start = false;
                    }
                }
                out.push(SpannedToken {
                    token: tok,
                    lexeme: s,
                    start: span.start,
                    end: span.end,
                });

                // Special-case: immediately after a semicolon, allow a single-quoted string literal
                // to be parsed eagerly to avoid apostrophe/transpose ambiguity.
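                // e.g. in "disp(x); 'msg'" the quoted text after the semicolon is
                // emitted eagerly here as a Str token.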
                if matches!(out.last().map(|t| &t.token), Some(Token::Semicolon)) {
                    // Peek the remainder for optional whitespace + a single-quoted string
                    let rem = lex.remainder();
                    let mut offset = 0usize;
                    for ch in rem.chars() {
                        if ch == ' ' || ch == '\t' || ch == '\r' {
                            offset += ch.len_utf8();
                        } else {
                            break;
                        }
                    }
                    if rem[offset..].starts_with('\'') {
                        // Try to scan a valid single-quoted string with doubled '' escapes
                        let mut j = offset + 1;
                        let bytes = rem.as_bytes();
                        let mut ok = false;
                        while j < rem.len() {
                            let c = bytes[j] as char;
                            if c == '\'' {
                                if j + 1 < rem.len() && bytes[j + 1] as char == '\'' {
                                    j += 2; // escaped quote
                                } else {
                                    ok = true; // closing quote at j
                                    j += 1;
                                    break;
                                }
                            } else if c == '\n' {
                                break;
                            } else {
                                j += 1;
                            }
                        }
                        if ok {
                            // Consume the scanned slice and emit a Str token
                            let abs_start = span.end + offset;
                            let abs_end = span.end + j;
                            let lexeme = &rem[offset..j];
                            lex.bump(j); // advance lexer past the string
                            lex.extras.last_was_value = true;
                            out.push(SpannedToken {
                                token: Token::Str,
                                lexeme: lexeme.to_string(),
                                start: abs_start,
                                end: abs_end,
                            });
                        }
                    }
                }
            }
            Err(_) => {
                // Robust error recovery: scan the remaining slice and emit best-effort tokens
                // so that downstream parsers can continue (e.g., identifiers, whitespace, parens).
                let s = lex.slice();
                let span = lex.span();

                let mut byte_index = 0usize; // offset within s

                while byte_index < s.len() {
                    // Read the next char and its byte length
                    let ch = s[byte_index..].chars().next().unwrap();
                    let ch_len = ch.len_utf8();

                    // Skip whitespace entirely (would normally be skipped by Logos attributes)
                    if ch.is_whitespace() {
                        byte_index += ch_len;
                        continue;
                    }

                    // Double-quoted string recovery: "..." with doubled "" escapes
                    if ch == '"' {
                        let start_off = byte_index;
                        byte_index += ch_len; // consume opening quote
                        while byte_index < s.len() {
                            let nxt = s[byte_index..].chars().next().unwrap();
                            if nxt == '"' {
                                // Check for doubled quote escape
                                let next_two = &s[byte_index..];
                                if next_two.starts_with("\"\"") {
                                    // consume both quotes as escaped quote
                                    byte_index += 2;
                                    continue;
                                } else {
                                    // closing quote
                                    byte_index += 1;
                                    break;
                                }
                            } else if nxt == '\n' || nxt == '\r' {
                                // Unterminated; emit error for opening quote and break to resume normal scan
                                let start = span.start + start_off;
                                out.push(SpannedToken {
                                    token: Token::Error,
                                    lexeme: s[start_off..start_off + 1].to_string(),
                                    start,
                                    end: start + 1,
                                });
                                // do not advance byte_index beyond the opening quote; let normal flow handle following chars
                                break;
                            } else {
                                byte_index += nxt.len_utf8();
                            }
                        }
                        // If we ended on a closing quote, emit Str token
                        if byte_index > start_off + 1
                            && &s[start_off..start_off + 1] == "\""
                            && s[start_off..byte_index].ends_with('"')
                        {
                            let start = span.start + start_off;
                            let end = span.start + byte_index;
                            // Mark as value for downstream transpose logic
                            lex.extras.last_was_value = true;
                            out.push(SpannedToken {
                                token: Token::Str,
                                lexeme: s[start_off..byte_index].to_string(),
                                start,
                                end,
                            });
                            continue;
                        } else {
                            // If not properly closed, fall through; single-char error was already emitted
                            byte_index += ch_len;
                            continue;
                        }
                    }

                    // Coalesce identifiers: [a-zA-Z_][a-zA-Z0-9_]*
                    if ch == '_' || ch.is_ascii_alphabetic() {
                        let start_off = byte_index;
                        byte_index += ch_len;
                        while byte_index < s.len() {
                            let nxt = s[byte_index..].chars().next().unwrap();
                            if nxt == '_' || nxt.is_ascii_alphanumeric() {
                                byte_index += nxt.len_utf8();
                            } else {
                                break;
                            }
                        }
                        let start = span.start + start_off;
                        let end = span.start + byte_index;
                        out.push(SpannedToken {
                            token: Token::Ident,
                            lexeme: s[start_off..byte_index].to_string(),
                            start,
                            end,
                        });
                        continue;
                    }

                    // Numbers: simplistic integer/float scan to avoid splitting
                    if ch.is_ascii_digit() {
                        let start_off = byte_index;
                        byte_index += ch_len;
                        // Accept '+'/'-' only directly after an exponent marker so that
                        // expressions like `1+2` are not swallowed into one number.
                        let mut prev_was_exp = false;
                        while byte_index < s.len() {
                            let nxt = s[byte_index..].chars().next().unwrap();
                            if nxt.is_ascii_digit() {
                                byte_index += nxt.len_utf8();
                                prev_was_exp = false;
                            } else if nxt == '.' {
                                // include one dot and continue scanning digits/exponent
                                byte_index += 1;
                                prev_was_exp = false;
                            } else if nxt == 'e' || nxt == 'E' {
                                byte_index += 1;
                                prev_was_exp = true;
                            } else if (nxt == '+' || nxt == '-') && prev_was_exp {
                                byte_index += 1;
                                prev_was_exp = false;
                            } else {
                                break;
                            }
                        }
                        let start = span.start + start_off;
                        let end = span.start + byte_index;
                        out.push(SpannedToken {
                            token: Token::Integer, // good enough for recovery; detailed kind not required
                            lexeme: s[start_off..byte_index].to_string(),
                            start,
                            end,
                        });
                        continue;
                    }

                    // Single-character punctuation/operators
                    let token = match ch {
                        '\'' => {
                            // In recovery, only treat apostrophe as transpose when the previous token
                            // was a value; otherwise it's likely a broken string start -> mark as error.
                            if lex.extras.last_was_value {
                                Token::Transpose
                            } else {
                                Token::Error
                            }
                        }
                        ';' => Token::Semicolon,
                        ')' => Token::RParen,
                        '(' => Token::LParen,
                        ',' => Token::Comma,
                        ']' => Token::RBracket,
                        '[' => Token::LBracket,
                        '}' => Token::RBrace,
                        '{' => Token::LBrace,
                        ':' => Token::Colon,
                        '.' => Token::Dot,
                        '+' => Token::Plus,
                        '-' => Token::Minus,
                        '*' => Token::Star,
                        '/' => Token::Slash,
                        '\\' => Token::Backslash,
                        '^' => Token::Caret,
                        '&' => Token::And,
                        '|' => Token::Or,
                        '~' => Token::Tilde,
                        '<' => Token::Less,
                        '>' => Token::Greater,
                        '=' => Token::Assign,
                        _ => Token::Error,
                    };

                    let start = span.start + byte_index;
                    let end = start + ch_len;
                    out.push(SpannedToken {
                        token,
                        lexeme: ch.to_string(),
                        start,
                        end,
                    });
                    byte_index += ch_len;
                }
            }
        }
    }
    out
}

fn last_is_value_token(tok: &Token) -> bool {
    matches!(
        tok,
        Token::Ident
            | Token::Integer
            | Token::Float
            | Token::True
            | Token::False
            | Token::RParen
            | Token::RBracket
            | Token::RBrace
            | Token::Str
    )
}

fn double_quoted_string_emit(lexer: &mut Lexer<Token>) -> Filter<()> {
    // Always emit and mark as value
    lexer.extras.last_was_value = true;
    Filter::Emit(())
}

#[allow(dead_code)]
fn transpose_filter(lex: &mut Lexer<Token>) -> Filter<()> {
    // Emit transpose only when the previous token formed a value
    // (e.g., after identifiers, numbers, closing parens/brackets/braces, etc.).
    // Otherwise, skip so that the Str token (full quoted string) can match.
    if lex.extras.last_was_value {
        lex.extras.last_was_value = true;
        Filter::Emit(())
    } else {
        Filter::Skip
    }
}

fn ellipsis_emit_and_skip_to_eol(lex: &mut Lexer<Token>) -> Filter<()> {
    // After an ellipsis, ignore the remainder of the physical line (including comments)
    let rest = lex.remainder();
    if let Some((idx, len)) = find_line_terminator(rest) {
        lex.bump(idx + len); // consume through the newline so no standalone newline token is emitted
    } else {
        lex.bump(rest.len());
    }
    lex.extras.last_was_value = true; // e.g., in '1 + ...\n 2', the continuation does not reset value-ness
    Filter::Emit(())
}
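
// Example for ellipsis_emit_and_skip_to_eol (illustrative): "1 + ...\n2" lexes
// as [Integer, Plus, Ellipsis, Integer]; the rest of the physical line after
// `...` is consumed so no Newline token is emitted for it.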

// Note: despite the `_skip` suffix, this callback emits a Newline token;
// `tokenize` filters Newline out, while `tokenize_detailed` keeps it.
fn newline_skip(lex: &mut Lexer<Token>) -> Filter<()> {
    lex.extras.line_start = true;
    lex.extras.last_was_value = false;
    Filter::Emit(())
}

fn section_marker(lex: &mut Lexer<Token>) -> Filter<()> {
    // Only emit a Section token when at start of line; otherwise, treat as a comment and skip
    if lex.extras.line_start {
        lex.extras.line_start = true;
        lex.extras.last_was_value = false;
        if let Some((_, len)) = find_line_terminator(lex.remainder()) {
            lex.bump(len);
        }
        Filter::Emit(())
    } else {
        // Skip to end of line (already consumed by regex except for the newline char)
        Filter::Skip
    }
}

fn block_comment_skip(lex: &mut Lexer<Token>) -> Filter<()> {
    // We matched '%{'. Skip until the first '%}' or end of input.
    let rest = lex.remainder();
    if let Some(end) = rest.find("%}") {
        lex.bump(end + 2); // consume up to and including '%}'
    } else {
        lex.bump(rest.len()); // consume to end if no terminator
    }
    // Then consume through the end of the physical line so the next token starts a fresh line.
    if let Some((idx, len)) = find_line_terminator(lex.remainder()) {
        lex.bump(idx + len);
        lex.extras.line_start = true;
        lex.extras.last_was_value = false;
    }
    Filter::Skip
}
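
// Illustrative example for block_comment_skip: "%{\nnotes\n%}\nx" produces only
// [Ident]; the whole block and its trailing newline are skipped, and line_start
// is reset for the next line.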

fn line_comment_start(lex: &mut Lexer<Token>) -> Filter<()> {
    // We just consumed a single '%'. Skip to the end of the line.
    let rest = lex.remainder();
    if let Some(pos) = rest.find('\n') {
        lex.bump(pos);
    } else {
        lex.bump(rest.len());
    }
    Filter::Skip
}

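/// Returns the byte offset and byte length of the first line terminator in `s`
/// ('\n' => len 1, '\r\n' => len 2, lone '\r' => len 1), or None if there is none.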
fn find_line_terminator(s: &str) -> Option<(usize, usize)> {
    let bytes = s.as_bytes();
    for (i, &b) in bytes.iter().enumerate() {
        match b {
            b'\n' => return Some((i, 1)),
            b'\r' => {
                if bytes.get(i + 1) == Some(&b'\n') {
                    return Some((i, 2));
                } else {
                    return Some((i, 1));
                }
            }
            _ => continue,
        }
    }
    None
}
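
// A few illustrative smoke tests sketching the contextual rules above; the test
// names and cases here are ours (a sketch, not part of the crate's test suite).
#[cfg(test)]
mod doc_examples {
    use super::*;

    #[test]
    fn apostrophe_is_transpose_after_value() {
        // `A'`: the apostrophe is adjacent to a value, so it is a transpose.
        assert_eq!(tokenize("A'"), vec![Token::Ident, Token::Transpose]);
    }

    #[test]
    fn apostrophe_starts_string_after_assign() {
        // `x = 'hi'`: after `=`, the apostrophe opens a single-quoted string.
        assert_eq!(
            tokenize("x = 'hi'"),
            vec![Token::Ident, Token::Assign, Token::Str]
        );
    }

    #[test]
    fn section_marker_only_at_line_start() {
        // '%%' at the start of a line becomes a Section token; the text to
        // end-of-line is part of its lexeme.
        assert_eq!(tokenize("%% setup\nx"), vec![Token::Section, Token::Ident]);
    }
}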