runmat_lexer/lib.rs
use logos::{Filter, Lexer, Logos};

#[derive(Default, Clone, Copy)]
pub struct LexerExtras {
    pub last_was_value: bool,
    pub line_start: bool,
}

#[derive(Logos, Debug, PartialEq, Clone)]
// Skip spaces, tabs and carriage returns, but NOT newlines; we need newlines to detect '%%' at line start.
#[logos(skip r"[ \t\r]+")]
#[logos(extras = LexerExtras)]
pub enum Token {
    // Keywords
    #[token("function")]
    Function,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("elseif")]
    ElseIf,
    #[token("for")]
    For,
    #[token("while")]
    While,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("end")]
    End,

    // Object-oriented and function syntax keywords
    #[token("classdef")]
    ClassDef,
    #[token("properties")]
    Properties,
    #[token("methods")]
    Methods,
    #[token("events")]
    Events,
    #[token("enumeration")]
    Enumeration,
    #[token("arguments")]
    Arguments,

    // Importing packages/classes
    #[token("import")]
    Import,

    // Additional keywords (recognized by the lexer; the parser may treat them as identifiers for now)
    #[token("switch")]
    Switch,
    #[token("case")]
    Case,
    #[token("otherwise")]
    Otherwise,
    #[token("try")]
    Try,
    #[token("catch")]
    Catch,
    #[token("global")]
    Global,
    #[token("persistent")]
    Persistent,
    #[token("true", |lex| { lex.extras.last_was_value = true; })]
    True,
    #[token("false", |lex| { lex.extras.last_was_value = true; })]
    False,

    // Identifiers and literals
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| { lex.extras.last_was_value = true; })]
    Ident,
    // Float with optional underscores as digit separators (stripped later in tokenize_detailed)
    #[regex(r"\d(?:_?\d)*\.(?:\d(?:_?\d)*)?(?:[eE][+-]?\d(?:_?\d)*)?", |lex| {
        lex.extras.last_was_value = true;
    })]
    #[regex(r"\d(?:_?\d)*[eE][+-]?\d(?:_?\d)*", |lex| {
        lex.extras.last_was_value = true;
    })]
    Float,
    // Integer with optional underscores as digit separators (stripped later in tokenize_detailed)
    #[regex(r"\d(?:_?\d)*", |lex| {
        lex.extras.last_was_value = true;
    })]
    Integer,
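    // e.g. `1_000` lexes as Integer and `1_000.5e-3` as Float; tokenize_detailed
    // then strips the separators from the lexeme ("1_000" -> "1000").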
    // An apostrophe is handled contextually in tokenize_detailed: it becomes either
    // a Transpose operator or the start of a single-quoted string.
    #[token("'")]
    Apostrophe,
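    // e.g. `A'` yields [Ident, Transpose] (the apostrophe is adjacent to a value
    // token), while `x = 'hi'` yields [Ident, Assign, Str] (the previous token is
    // not a value).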
    // Double-quoted string scalar (treated as Str at the lexer level). Always emitted.
    #[regex(r#""([^"\n\r]|"")*""#, double_quoted_string_emit, priority = 1)]
    Str,
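    // e.g. `"say ""hi"""` is a single Str token; a doubled quote is the escape form.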
    // Line continuation: the handler consumes the rest of the physical line.
    #[token("...", ellipsis_emit_and_skip_to_eol)]
    Ellipsis,
    // Section marker: must be at the start of a line (after optional whitespace).
    // The rule matches to end of line and emits a single token.
    #[regex(r"%%[^\n]*", section_marker, priority = 3)]
    Section,
    #[token(".*")]
    DotStar,
    #[token("./")]
    DotSlash,
    #[token(".\\")]
    DotBackslash,
    #[token(".^")]
    DotCaret,
    #[token("&&")]
    AndAnd,
    #[token("||")]
    OrOr,
    #[token("==")]
    Equal,
    #[token("~=")]
    NotEqual,
    #[token("<=")]
    LessEqual,
    #[token(">=")]
    GreaterEqual,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("\\")]
    Backslash,
    #[token("^")]
    Caret,
    #[token("&")]
    And,
    #[token("|")]
    Or,
    #[token("~")]
    Tilde,
    #[token("@")]
    At,
    // Meta-class (type) query operator: ?ClassName
    #[token("?")]
    Question,
    #[token("<")]
    Less,
    #[token(">")]
    Greater,
    #[token("=", |lex| { lex.extras.last_was_value = false; })]
    Assign,
    #[token(".")]
    Dot,
    // A semicolon ends a statement; the next token should not be treated as a value.
    // This disambiguates a following apostrophe as a string start, not a transpose.
    #[token(";", |lex| { lex.extras.last_was_value = false; })]
    Semicolon,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token("(", |lex| { lex.extras.last_was_value = false; })]
    LParen,
    #[token(")", |lex| { lex.extras.last_was_value = true; })]
    RParen,
    #[token("[", |lex| { lex.extras.last_was_value = false; })]
    LBracket,
    #[token("]", |lex| { lex.extras.last_was_value = true; })]
    RBracket,
    #[token("{", |lex| { lex.extras.last_was_value = false; })]
    LBrace,
    #[token("}", |lex| { lex.extras.last_was_value = true; })]
    RBrace,

    // Newlines are emitted as tokens (tokenize filters them out) and mark the
    // start of a new line for '%%' detection.
    #[regex(r"\n+", newline_emit)]
    Newline,

    // Block comments: '%{' ... '%}' (non-nesting). Skipped entirely.
    #[regex(r"%\{", block_comment_skip, priority = 2)]
    BlockComment,

    // Line comments: a single '%' is handled here; '%%' and '%{' are matched by
    // higher-priority rules first.
    #[token("%", line_comment_start, priority = 0)]
    LineComment,

    // Synthetic tokens (never produced by Logos directly; pushed by tokenize_detailed)
    Error,
    Transpose,
}

#[derive(Debug, Clone, PartialEq)]
pub struct SpannedToken {
    pub token: Token,
    pub lexeme: String,
    pub start: usize,
    pub end: usize,
}

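/// Convenience wrapper over `tokenize_detailed` that drops lexemes/spans and
/// filters out `Newline` tokens.
///
/// Illustrative sketch (the crate name is assumed from the file path above):
/// ```ignore
/// use runmat_lexer::{tokenize, Token};
/// assert_eq!(
///     tokenize("x = 1 + 2;"),
///     vec![Token::Ident, Token::Assign, Token::Integer,
///          Token::Plus, Token::Integer, Token::Semicolon]
/// );
/// ```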
pub fn tokenize(input: &str) -> Vec<Token> {
    tokenize_detailed(input)
        .into_iter()
        .map(|t| t.token)
        .filter(|tok| !matches!(tok, Token::Newline))
        .collect()
}
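/// Full tokenizer: returns tokens with lexemes and byte spans (including
/// `Newline`), resolving the apostrophe string/transpose ambiguity and
/// recovering from lexing errors with best-effort tokens.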
pub fn tokenize_detailed(input: &str) -> Vec<SpannedToken> {
    let mut lex = Token::lexer(input);
    // We begin at the start of a (virtual) line.
    lex.extras.line_start = true;
    let mut out: Vec<SpannedToken> = Vec::new();
    while let Some(res) = lex.next() {
        match res {
            Ok(tok) => {
                let mut s = lex.slice().to_string();
                // Normalize numeric literals: remove underscore digit separators.
                if matches!(tok, Token::Float | Token::Integer) {
                    s.retain(|c| c != '_');
                }
                let span = lex.span();

                // Handle the contextual apostrophe before the normal push logic.
                if matches!(tok, Token::Apostrophe) {
                    // Decide using adjacency + the previous token's category.
                    // Transpose only when there is no whitespace in between and the
                    // previous token is a value or a dot.
                    let (is_adjacent, prev_token_opt) = out
                        .last()
                        .map(|t| (t.end == span.start, Some(&t.token)))
                        .unwrap_or((false, None));
                    let prev_is_value_or_dot = prev_token_opt
                        .map(|t| matches!(t, Token::Dot) || last_is_value_token(t))
                        .unwrap_or(false);
                    if is_adjacent && prev_is_value_or_dot {
                        out.push(SpannedToken {
                            token: Token::Transpose,
                            lexeme: "'".into(),
                            start: span.start,
                            end: span.end,
                        });
                        continue;
                    }
                    // Otherwise, parse a full single-quoted string starting at this apostrophe.
                    let rem = lex.remainder();
                    let mut j = 0usize;
                    let bytes = rem.as_bytes();
                    let mut ok = false;
                    while j < rem.len() {
                        let c = bytes[j] as char;
                        if c == '\'' {
                            if j + 1 < rem.len() && bytes[j + 1] as char == '\'' {
                                j += 2; // escaped quote ('')
                            } else {
                                ok = true; // closing quote
                                j += 1; // include the closing quote
                                break;
                            }
                        } else if c == '\n' || c == '\r' {
                            break;
                        } else {
                            j += 1;
                        }
                    }
                    if ok {
                        // Consume what we scanned and emit Str for the entire single-quoted literal.
                        let abs_start = span.start;
                        let abs_end = span.end + j;
                        let lexeme = format!("'{}", &rem[..j]);
                        lex.bump(j); // advance past the content following the leading apostrophe
                        lex.extras.last_was_value = true;
                        out.push(SpannedToken {
                            token: Token::Str,
                            lexeme,
                            start: abs_start,
                            end: abs_end,
                        });
                        lex.extras.line_start = false;
                        continue;
                    } else {
                        // Unterminated string; treat the apostrophe as an Error.
                        out.push(SpannedToken {
                            token: Token::Error,
                            lexeme: "'".into(),
                            start: span.start,
                            end: span.end,
                        });
                        continue;
                    }
                }
                // Any emitted token other than a newline/comment means we are no
                // longer at line start. Section already consumed its line
                // terminator, so the next token genuinely begins a new line.
                match tok {
                    Token::Newline
                    | Token::LineComment
                    | Token::BlockComment
                    | Token::Section => {}
                    _ => {
                        lex.extras.line_start = false;
                    }
                }
                out.push(SpannedToken {
                    token: tok,
                    lexeme: s,
                    start: span.start,
                    end: span.end,
                });

                // Special case: immediately after a semicolon, eagerly parse a
                // single-quoted string literal to avoid the apostrophe/transpose
                // ambiguity.
                if matches!(out.last().map(|t| &t.token), Some(Token::Semicolon)) {
                    // Peek the remainder for optional whitespace + a single-quoted string.
                    let rem = lex.remainder();
                    let mut offset = 0usize;
                    for ch in rem.chars() {
                        if ch == ' ' || ch == '\t' || ch == '\r' {
                            offset += ch.len_utf8();
                        } else {
                            break;
                        }
                    }
                    if rem[offset..].starts_with('\'') {
                        // Try to scan a valid single-quoted string with doubled '' escapes.
                        let mut j = offset + 1;
                        let bytes = rem.as_bytes();
                        let mut ok = false;
                        while j < rem.len() {
                            let c = bytes[j] as char;
                            if c == '\'' {
                                if j + 1 < rem.len() && bytes[j + 1] as char == '\'' {
                                    j += 2; // escaped quote ('')
                                } else {
                                    ok = true; // closing quote at j
                                    j += 1;
                                    break;
                                }
                            } else if c == '\n' {
                                break;
                            } else {
                                j += 1;
                            }
                        }
                        if ok {
                            // Consume the scanned slice and emit a Str token.
                            let abs_start = span.end + offset;
                            let abs_end = span.end + j;
                            let lexeme = &rem[offset..j];
                            lex.bump(j); // advance the lexer past the string
                            lex.extras.last_was_value = true;
                            out.push(SpannedToken {
                                token: Token::Str,
                                lexeme: lexeme.to_string(),
                                start: abs_start,
                                end: abs_end,
                            });
                        }
                    }
                }
            }
            Err(_) => {
                // Robust error recovery: scan the remaining slice and emit best-effort
                // tokens so that downstream parsers can continue (e.g. identifiers,
                // whitespace, parens).
                let s = lex.slice();
                let span = lex.span();

                let mut byte_index = 0usize; // offset within s

                while byte_index < s.len() {
                    // Read the next char and its byte length.
                    let ch = s[byte_index..].chars().next().unwrap();
                    let ch_len = ch.len_utf8();

                    // Skip whitespace entirely (normally skipped by the Logos attributes).
                    if ch.is_whitespace() {
                        byte_index += ch_len;
                        continue;
                    }

                    // Double-quoted string recovery: "..." with doubled "" escapes.
                    if ch == '"' {
                        let start_off = byte_index;
                        byte_index += ch_len; // consume opening quote
                        while byte_index < s.len() {
                            let nxt = s[byte_index..].chars().next().unwrap();
                            if nxt == '"' {
                                // Check for a doubled-quote escape.
                                let next_two = &s[byte_index..];
                                if next_two.starts_with("\"\"") {
                                    // Consume both quotes as an escaped quote.
                                    byte_index += 2;
                                    continue;
                                } else {
                                    // Closing quote.
                                    byte_index += 1;
                                    break;
                                }
                            } else if nxt == '\n' || nxt == '\r' {
                                // Unterminated: emit an Error for the opening quote and
                                // break so the normal scan resumes.
                                let start = span.start + start_off;
                                out.push(SpannedToken {
                                    token: Token::Error,
                                    lexeme: s[start_off..start_off + 1].to_string(),
                                    start,
                                    end: start + 1,
                                });
                                // Do not advance byte_index beyond the opening quote;
                                // the normal flow handles the following chars.
                                break;
                            } else {
                                byte_index += nxt.len_utf8();
                            }
                        }
                        // If we ended on a closing quote, emit a Str token.
                        if byte_index > start_off + 1
                            && &s[start_off..start_off + 1] == "\""
                            && s[start_off..byte_index].ends_with('"')
                        {
                            let start = span.start + start_off;
                            let end = span.start + byte_index;
                            // Mark as a value for downstream transpose logic.
                            lex.extras.last_was_value = true;
                            out.push(SpannedToken {
                                token: Token::Str,
                                lexeme: s[start_off..byte_index].to_string(),
                                start,
                                end,
                            });
                            continue;
                        } else {
                            // Not properly closed; a single-char Error was already
                            // emitted, so skip past the opening quote.
                            byte_index += ch_len;
                            continue;
                        }
                    }

                    // Coalesce identifiers: [a-zA-Z_][a-zA-Z0-9_]*
                    if ch == '_' || ch.is_ascii_alphabetic() {
                        let start_off = byte_index;
                        byte_index += ch_len;
                        while byte_index < s.len() {
                            let nxt = s[byte_index..].chars().next().unwrap();
                            if nxt == '_' || nxt.is_ascii_alphanumeric() {
                                byte_index += nxt.len_utf8();
                            } else {
                                break;
                            }
                        }
                        let start = span.start + start_off;
                        let end = span.start + byte_index;
                        out.push(SpannedToken {
                            token: Token::Ident,
                            lexeme: s[start_off..byte_index].to_string(),
                            start,
                            end,
                        });
                        continue;
                    }

                    // Numbers: a simplistic integer/float scan to avoid splitting.
                    if ch.is_ascii_digit() {
                        let start_off = byte_index;
                        byte_index += ch_len;
                        while byte_index < s.len() {
                            let nxt = s[byte_index..].chars().next().unwrap();
                            if nxt.is_ascii_digit() {
                                byte_index += nxt.len_utf8();
                            } else if nxt == '.' {
                                // Include the dot and continue scanning digits/exponent.
                                byte_index += 1;
                            } else if nxt == 'e' || nxt == 'E' || nxt == '+' || nxt == '-' {
                                byte_index += 1;
                            } else {
                                break;
                            }
                        }
                        let start = span.start + start_off;
                        let end = span.start + byte_index;
                        out.push(SpannedToken {
                            token: Token::Integer, // good enough for recovery; the exact kind is not required
                            lexeme: s[start_off..byte_index].to_string(),
                            start,
                            end,
                        });
                        continue;
                    }

                    // Single-character punctuation/operators.
                    let token = match ch {
                        '\'' => {
                            // In recovery, treat an apostrophe as a transpose only when
                            // the previous token was a value; otherwise it is likely a
                            // broken string start, so mark it as an error.
                            if lex.extras.last_was_value {
                                Token::Transpose
                            } else {
                                Token::Error
                            }
                        }
                        ';' => Token::Semicolon,
                        ')' => Token::RParen,
                        '(' => Token::LParen,
                        ',' => Token::Comma,
                        ']' => Token::RBracket,
                        '[' => Token::LBracket,
                        '}' => Token::RBrace,
                        '{' => Token::LBrace,
                        ':' => Token::Colon,
                        '.' => Token::Dot,
                        '+' => Token::Plus,
                        '-' => Token::Minus,
                        '*' => Token::Star,
                        '/' => Token::Slash,
                        '\\' => Token::Backslash,
                        '^' => Token::Caret,
                        '&' => Token::And,
                        '|' => Token::Or,
                        '~' => Token::Tilde,
                        '<' => Token::Less,
                        '>' => Token::Greater,
                        '=' => Token::Assign,
                        _ => Token::Error,
                    };

                    let start = span.start + byte_index;
                    let end = start + ch_len;
                    out.push(SpannedToken {
                        token,
                        lexeme: ch.to_string(),
                        start,
                        end,
                    });
                    byte_index += ch_len;
                }
            }
        }
    }
    out
}

fn last_is_value_token(tok: &Token) -> bool {
    matches!(
        tok,
        Token::Ident
            | Token::Integer
            | Token::Float
            | Token::True
            | Token::False
            | Token::RParen
            | Token::RBracket
            | Token::RBrace
            | Token::Str
    )
}

fn double_quoted_string_emit(lexer: &mut Lexer<Token>) -> Filter<()> {
    // Always emit and mark as a value.
    lexer.extras.last_was_value = true;
    Filter::Emit(())
}

#[allow(dead_code)]
fn transpose_filter(lex: &mut Lexer<Token>) -> Filter<()> {
    // Currently unused: superseded by the contextual apostrophe handling in
    // tokenize_detailed. Emits a transpose only when the previous token formed a
    // value (identifier, number, closing paren/bracket/brace, etc.); otherwise
    // skips so that the full quoted-string token can match.
    if lex.extras.last_was_value {
        lex.extras.last_was_value = true;
        Filter::Emit(())
    } else {
        Filter::Skip
    }
}

fn ellipsis_emit_and_skip_to_eol(lex: &mut Lexer<Token>) -> Filter<()> {
    // After an ellipsis, ignore the remainder of the physical line (including comments).
    let rest = lex.remainder();
    if let Some((idx, len)) = find_line_terminator(rest) {
        lex.bump(idx + len); // consume through the newline so no standalone Newline token is emitted
    } else {
        lex.bump(rest.len());
    }
    // A continuation does not reset value-ness: in `1 + ...` followed by `2`,
    // the lines join into one expression.
    lex.extras.last_was_value = true;
    Filter::Emit(())
}

fn newline_emit(lex: &mut Lexer<Token>) -> Filter<()> {
    // Record the start of a new line and clear value-ness. The Newline token is
    // emitted here and filtered out later by tokenize.
    lex.extras.line_start = true;
    lex.extras.last_was_value = false;
    Filter::Emit(())
}

fn section_marker(lex: &mut Lexer<Token>) -> Filter<()> {
    // Only emit a Section token at the start of a line; otherwise treat '%%...'
    // as a comment and skip it.
    if lex.extras.line_start {
        lex.extras.line_start = true;
        lex.extras.last_was_value = false;
        // The regex stops before the newline; consume an immediately following
        // terminator so the next token starts on a fresh line.
        if let Some((0, len)) = find_line_terminator(lex.remainder()) {
            lex.bump(len);
        }
        Filter::Emit(())
    } else {
        // The line was already consumed by the regex (except the newline itself).
        Filter::Skip
    }
}

// Removed: replaced by the explicit line_comment_start, which consumes from a single '%'.

fn block_comment_skip(lex: &mut Lexer<Token>) -> Filter<()> {
    // We matched '%{'. Skip until the first '%}' or the end of input.
    let rest = lex.remainder();
    if let Some(end) = rest.find("%}") {
        lex.bump(end + 2); // consume up to and including '%}'
    } else {
        lex.bump(rest.len()); // consume to the end if there is no terminator
    }
    // Consume a line terminator only if it immediately follows '%}'; bumping a
    // terminator found further away would swallow real tokens in between.
    if let Some((0, len)) = find_line_terminator(lex.remainder()) {
        lex.bump(len);
        lex.extras.line_start = true;
        lex.extras.last_was_value = false;
    }
    Filter::Skip
}

fn line_comment_start(lex: &mut Lexer<Token>) -> Filter<()> {
    // We just consumed a single '%'. Skip to the end of the line, leaving the
    // newline itself for the Newline rule.
    let rest = lex.remainder();
    if let Some(pos) = rest.find('\n') {
        lex.bump(pos);
    } else {
        lex.bump(rest.len());
    }
    Filter::Skip
}

// Find the first line terminator in `s`, returning (byte offset, terminator length).
// Recognizes "\n", "\r\n" and a bare "\r".
fn find_line_terminator(s: &str) -> Option<(usize, usize)> {
    let bytes = s.as_bytes();
    for (i, &b) in bytes.iter().enumerate() {
        match b {
            b'\n' => return Some((i, 1)),
            b'\r' => {
                if bytes.get(i + 1) == Some(&b'\n') {
                    return Some((i, 2));
                } else {
                    return Some((i, 1));
                }
            }
            _ => continue,
        }
    }
    None
}
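
#[cfg(test)]
mod tests {
    // Illustrative tests sketching the behavior documented above; the cases and
    // names here are ours, not an original test suite.
    use super::*;

    #[test]
    fn transpose_vs_string() {
        // Adjacent to a value token, an apostrophe is a transpose.
        assert_eq!(tokenize("A'"), vec![Token::Ident, Token::Transpose]);
        // After `=` it starts a string literal; '' is the escaped-quote form.
        assert_eq!(
            tokenize("x = 'a''b'"),
            vec![Token::Ident, Token::Assign, Token::Str]
        );
    }

    #[test]
    fn numeric_separators_are_stripped() {
        let toks = tokenize_detailed("1_000.5");
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].token, Token::Float);
        assert_eq!(toks[0].lexeme, "1000.5");
    }

    #[test]
    fn section_marker_only_at_line_start() {
        assert_eq!(tokenize("%% setup\nx"), vec![Token::Section, Token::Ident]);
        // A mid-line '%%' is treated as an ordinary comment.
        assert_eq!(tokenize("x %% not a section"), vec![Token::Ident]);
    }
}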