1use alloc::{string::String, vec::Vec, format};
8
9use logos::Logos;
10use crate::{Span, LexError};
11
/// All lexical tokens produced by the lexer.
///
/// Whitespace (spaces, tabs, carriage returns, newlines) is skipped between
/// tokens via the `logos(skip …)` attribute below. Tokens that carry payloads
/// (identifiers, numbers, strings, comments) use callbacks defined later in
/// this file to extract/decode their content.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum Token {
    // ---- Core keywords ----------------------------------------------------

    #[token("true")]
    True,

    #[token("false")]
    False,

    #[token("nil")]
    Nil,

    #[token("and")]
    And,

    #[token("break")]
    Break,

    #[token("do")]
    Do,

    #[token("else")]
    Else,

    #[token("elseif")]
    Elseif,

    #[token("end")]
    End,

    #[token("for")]
    For,

    #[token("function")]
    Function,

    #[token("if")]
    If,

    #[token("in")]
    In,

    #[token("local")]
    Local,

    #[token("not")]
    Not,

    #[token("or")]
    Or,

    #[token("repeat")]
    Repeat,

    #[token("return")]
    Return,

    #[token("then")]
    Then,

    #[token("until")]
    Until,

    #[token("while")]
    While,

    // ---- Version-gated keywords -------------------------------------------
    // These are always lexed as keywords here; `lex_for_version` demotes them
    // back to plain identifiers for dialects that lack the feature.

    #[token("continue")]
    Continue,

    #[token("export")]
    Export,

    #[token("type")]
    Type,

    #[token("goto")]
    Goto,

    #[token("const")]
    Const,

    // ---- Literals and trivia ----------------------------------------------

    // Unicode-aware identifier: letter or `_`, then letters/digits/`_`.
    #[regex(r"[\p{L}_][\p{L}\p{N}_]*", |lex| lex.slice().to_string())]
    Identifier(String),

    // Deliberately permissive number pattern (hex with optional binary
    // exponent, decimal with optional exponent, leading-dot floats, binary).
    // Malformed forms that this regex still admits are rejected later by
    // `validate_number`, which gives errors a precise span.
    #[regex(r"0[xX][0-9a-fA-F_]*(\.[0-9a-fA-F_]*)?([pP][+-]?\d*)?|\d[0-9_]*(\.\d[0-9_]*)?([eE][+-]?\d*)?|\.\d[0-9_]*([eE][+-]?\d*)?|0[bB][01_]*", |lex| lex.slice().to_string())]
    Number(String),

    // Quoted string; payload is the unescaped content (quotes stripped).
    #[regex(r#""([^"\\]|\\.)*""#, parse_string)]
    #[regex(r#"'([^'\\]|\\.)*'"#, parse_string)]
    String(String),

    // Backtick-interpolated string; the callback scans to the closing
    // backtick and splits the content into text/expression parts.
    #[token("`", parse_interpolation_parts)]
    InterpolatedString(Vec<InterpolationPart>),

    // `--` comment (line or `--[[ ]]` block); payload is the comment text.
    #[regex(r"--", parse_comment)]
    Comment(String),

    // Long-bracket string `[[...]]`, `[=[...]=]`, etc.
    #[regex(r"\[[=]*\[", parse_long_string)]
    LongString(String),

    // ---- Operators and punctuation ----------------------------------------

    #[token("+")]
    Plus,

    #[token("-")]
    Minus,

    #[token("*")]
    Star,

    #[token("/")]
    Slash,

    #[token("//")]
    FloorDiv,

    #[token("%")]
    Percent,

    #[token("^")]
    Caret,

    #[token("#")]
    Hash,

    #[token("==")]
    EqEq,

    #[token("~=")]
    NotEq,

    #[token("<=")]
    LessEq,

    #[token(">=")]
    GreaterEq,

    #[token("<")]
    Less,

    #[token(">")]
    Greater,

    #[token("=")]
    Eq,

    // Compound assignment operators.

    #[token("+=")]
    PlusEq,

    #[token("-=")]
    MinusEq,

    #[token("*=")]
    StarEq,

    #[token("/=")]
    SlashEq,

    #[token("//=")]
    FloorDivEq,

    #[token("%=")]
    PercentEq,

    #[token("^=")]
    CaretEq,

    #[token("..=")]
    ConcatEq,

    // Delimiters.

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("::")]
    ColonColon,

    #[token(":")]
    Colon,

    #[token(";")]
    Semi,

    #[token(",")]
    Comma,

    #[token("...")]
    Dot3,

    #[token("..")]
    Dot2,

    #[token(".")]
    Dot,

    // Type-annotation / miscellaneous symbols.

    #[token("->")]
    Arrow,

    #[token("|")]
    Pipe,

    #[token("&")]
    Ampersand,

    #[token("?")]
    Question,

    #[token("@")]
    At,

    #[token("<<")]
    LeftShift,

    #[token(">>")]
    RightShift,

    #[token("~")]
    Tilde,

    // Synthetic end-of-input marker, appended by `lex` (never produced by
    // the logos machinery itself).
    Eof,
}
255
/// One segment of a backtick-interpolated string (`InterpolatedString`).
#[derive(Debug, Clone, PartialEq)]
pub enum InterpolationPart {
    /// Literal text between interpolation holes, with escapes already decoded.
    Text(String),
    /// Half-open byte range `start..end` into the lexed source covering the
    /// text of an embedded `{expr}`, excluding the braces themselves; the
    /// expression is parsed later from this span.
    ExprSpan { start: usize, end: usize },
}
267
268fn parse_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
269 let slice = lex.slice();
270 let content = &slice[1..slice.len() - 1];
271 Some(unescape_string(content))
272}
273
274fn parse_interpolation_parts(lex: &mut logos::Lexer<Token>) -> Option<Vec<InterpolationPart>> {
275 let start = lex.span().end;
276 let source = lex.source();
277 let bytes = source.as_bytes();
278
279 let mut parts = Vec::new();
280 let mut current_text = String::new();
281 let mut pos = start;
282
283 while pos < bytes.len() {
284 match bytes[pos] {
285 b'`' => {
286 if !current_text.is_empty() {
287 parts.push(InterpolationPart::Text(current_text));
288 }
289 lex.bump(pos - start + 1);
290 return Some(parts);
291 }
292 b'{' => {
293 if !current_text.is_empty() {
294 parts.push(InterpolationPart::Text(current_text.clone()));
295 current_text.clear();
296 }
297
298 let expr_start = pos + 1;
299 let mut depth = 1;
300 pos += 1;
301
302 while pos < bytes.len() && depth > 0 {
303 match bytes[pos] {
304 b'{' => depth += 1,
305 b'}' => depth -= 1,
306 _ => {}
307 }
308 pos += 1;
309 }
310
311 if depth != 0 {
312 return None;
313 }
314
315 let expr_end = pos - 1;
316 parts.push(InterpolationPart::ExprSpan {
317 start: expr_start,
318 end: expr_end,
319 });
320 }
321 b'\\' if pos + 1 < bytes.len() => {
322 match bytes[pos + 1] {
323 b'n' => {
324 current_text.push('\n');
325 pos += 2;
326 }
327 b't' => {
328 current_text.push('\t');
329 pos += 2;
330 }
331 b'r' => {
332 current_text.push('\r');
333 pos += 2;
334 }
335 b'\\' | b'`' | b'{' | b'}' => {
336 current_text.push(bytes[pos + 1] as char);
337 pos += 2;
338 }
339 _ => {
340 current_text.push(bytes[pos] as char);
341 pos += 1;
342 }
343 }
344 }
345 b => {
346 current_text.push(b as char);
347 pos += 1;
348 }
349 }
350 }
351
352 None
353}
354
/// Logos callback for `--`: consumes the rest of the comment and returns its
/// text (without the `--` marker and without any bracket delimiters).
///
/// Handles both forms:
/// - block comments `--[[ ... ]]`, with any number of `=` between the
///   brackets (`--[==[ ... ]==]`); an unterminated block comment consumes
///   the remainder of the source instead of erroring.
/// - line comments, ending at (and excluding) the next `\n`; the returned
///   content is trimmed of surrounding whitespace.
fn parse_comment(lex: &mut logos::Lexer<Token>) -> Option<String> {
    // `lex.span().end` is the byte offset just past the matched `--`.
    let start = lex.span().end;
    let source = lex.source();
    let rest = &source[start..];

    if rest.starts_with('[') {
        // Possible long-bracket comment: `[` + N * `=` + `[`.
        let after_bracket = &rest[1..];
        let eq_count = after_bracket.chars().take_while(|&c| c == '=').count();
        if after_bracket.len() > eq_count && after_bracket[eq_count..].starts_with('[') {
            // The closing delimiter must carry the same number of `=`.
            let closing = format!("]{}]", "=".repeat(eq_count));
            // `block_start` is relative to `start`; `content_start` is absolute.
            let block_start = 1 + eq_count + 1; let content_start = start + block_start;

            if let Some(end_pos) = source[content_start..].find(&closing) {
                let content = source[content_start..content_start + end_pos].to_string();
                // Advance past opening bracket, content, and closing bracket.
                lex.bump(block_start + end_pos + closing.len());
                return Some(content);
            } else {
                // Unterminated block comment: swallow everything to EOF.
                let content = source[content_start..].to_string();
                lex.bump(source.len() - start);
                return Some(content);
            }
        }
    }

    // Line comment: consume up to, but not including, the newline — the
    // newline itself is left for the whitespace skipper.
    if let Some(newline_pos) = rest.find('\n') {
        let content = rest[..newline_pos].trim().to_string();
        lex.bump(newline_pos);
        Some(content)
    } else {
        // Comment runs to end of input.
        let content = rest.trim().to_string();
        lex.bump(rest.len());
        Some(content)
    }
}
394
395fn parse_long_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
396 let slice = lex.slice();
397
398 let equals_count = slice.chars().filter(|&c| c == '=').count();
399 let closing = format!("]{}]", "=".repeat(equals_count));
400
401 let start = lex.span().end;
402 let source = lex.source();
403
404 let actual_start = if source[start..].starts_with('\n') {
405 start + 1
406 } else if source[start..].starts_with("\r\n") {
407 start + 2
408 } else {
409 start
410 };
411
412 if let Some(end_pos) = source[actual_start..].find(&closing) {
413 let content = source[actual_start..actual_start + end_pos].to_string();
414 lex.bump(actual_start - start + end_pos + closing.len());
415 Some(content)
416 } else {
417 None
418 }
419}
420
/// Decodes backslash escape sequences in a (quote-stripped) string literal.
///
/// Supported escapes: the single-character C-style set (`\n`, `\t`, `\r`,
/// `\\`, `\"`, `\'`, `\0`, `\a`, `\b`, `\f`, `\v`), `\xHH` (one or two hex
/// digits), `\u{…}` (hex code point), `\z` (skips all following
/// whitespace), and `\ddd` (one to three decimal digits, value <= 255).
/// Unrecognized or malformed escapes are kept literally, backslash
/// included. Note that `\0` is matched before the generic `\ddd` arm, so
/// `\065` decodes as NUL followed by the literal text `65`.
fn unescape_string(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut input = s.chars().peekable();

    while let Some(c) = input.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }

        match input.next() {
            Some('n') => out.push('\n'),
            Some('t') => out.push('\t'),
            Some('r') => out.push('\r'),
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
            Some('\'') => out.push('\''),
            // Matched here, before the `\ddd` arm below — see doc comment.
            Some('0') => out.push('\0'),
            Some('a') => out.push('\x07'), // bell
            Some('b') => out.push('\x08'), // backspace
            Some('f') => out.push('\x0C'), // form feed
            Some('v') => out.push('\x0B'), // vertical tab
            Some('x') => {
                // Collect up to two hex digits.
                let mut digits = String::new();
                while digits.len() < 2 {
                    match input.peek() {
                        Some(d) if d.is_ascii_hexdigit() => digits.push(input.next().unwrap()),
                        _ => break,
                    }
                }
                match u8::from_str_radix(&digits, 16) {
                    // The byte value is pushed as its Latin-1 code point.
                    Ok(byte) => out.push(byte as char),
                    // No digits at all: keep `\x` literally.
                    Err(_) => {
                        out.push('\\');
                        out.push('x');
                        out.push_str(&digits);
                    }
                }
            }
            Some('u') => {
                if input.peek() == Some(&'{') {
                    input.next(); // consume '{'
                    let mut digits = String::new();
                    while let Some(&d) = input.peek() {
                        if d == '}' {
                            input.next();
                            break;
                        }
                        if !d.is_ascii_hexdigit() {
                            break;
                        }
                        digits.push(input.next().unwrap());
                    }
                    // Invalid or out-of-range code points are dropped silently.
                    if let Ok(code) = u32::from_str_radix(&digits, 16) {
                        if let Some(decoded) = char::from_u32(code) {
                            out.push(decoded);
                        }
                    }
                } else {
                    // `\u` without a brace: keep it literally.
                    out.push('\\');
                    out.push('u');
                }
            }
            Some('z') => {
                // `\z` swallows all following whitespace.
                while matches!(input.peek(), Some(ch) if ch.is_whitespace()) {
                    input.next();
                }
            }
            Some(d) if d.is_ascii_digit() => {
                // `\ddd`: up to three decimal digits, must fit in a byte.
                let mut digits = String::new();
                digits.push(d);
                while digits.len() < 3 {
                    match input.peek() {
                        Some(n) if n.is_ascii_digit() => digits.push(input.next().unwrap()),
                        _ => break,
                    }
                }
                match digits.parse::<u8>() {
                    Ok(byte) => out.push(byte as char),
                    // Out of range (e.g. `\300`): keep it literally.
                    Err(_) => {
                        out.push('\\');
                        out.push_str(&digits);
                    }
                }
            }
            Some(other) => {
                // Unknown escape: keep backslash and character as-is.
                out.push('\\');
                out.push(other);
            }
            // Trailing lone backslash.
            None => out.push('\\'),
        }
    }

    out
}
538
/// Tokenizes `source` into `(Token, Span)` pairs, appending a final
/// `Token::Eof` with an empty span at the end of input.
///
/// A leading shebang line (`#!…`) is stripped before lexing.
/// NOTE(review): spans are byte offsets into the *stripped* source, so when
/// a shebang is removed they are shifted relative to the original input —
/// confirm callers account for this when reporting diagnostics.
pub fn lex(source: &str) -> Result<Vec<(Token, Span)>, LexError> {
    // Skip a shebang line if present (executable scripts).
    let source = if source.starts_with("#!") {
        match source.find('\n') {
            Some(pos) => &source[pos + 1..],
            None => "",
        }
    } else {
        source
    };

    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);

    while let Some(token_result) = lexer.next() {
        let span = lexer.span();
        match token_result {
            Ok(token) => {
                // The Number regex is deliberately permissive; reject
                // malformed literals here so the error carries a precise span.
                if let Token::Number(ref num) = token {
                    if !validate_number(num) {
                        return Err(LexError::InvalidNumber { span });
                    }
                }
                tokens.push((token, span));
            }
            Err(_) => {
                // NOTE(review): every lexer error (including unrecognized
                // characters) is reported as InvalidNumber — consider a
                // dedicated error variant for non-number failures.
                return Err(LexError::InvalidNumber { span });
            }
        }
    }

    // Synthetic EOF marker with an empty span at end of input.
    let eof_pos = source.len();
    tokens.push((Token::Eof, eof_pos..eof_pos));

    Ok(tokens)
}
581
/// Validates a number literal accepted by the (deliberately permissive)
/// `Number` regex.
///
/// Rules enforced here:
/// - hex (`0x`/`0X`): at least one hex digit; at most one `p`/`P` binary
///   exponent, which must contain at least one decimal digit.
/// - binary (`0b`/`0B`): at least one `0`/`1` digit.
/// - decimal: at least one digit; an `e`/`E` exponent must contain at
///   least one digit.
///
/// Underscore digit separators are ignored everywhere.
fn validate_number(s: &str) -> bool {
    if s.starts_with("0x") || s.starts_with("0X") {
        let after_prefix = &s[2..];
        if after_prefix.is_empty() {
            return false;
        }

        // Split off an optional binary exponent (`p`/`P`).
        let parts: Vec<&str> = after_prefix.split(|c| c == 'p' || c == 'P').collect();
        if parts.len() > 2 {
            return false;
        }

        let hex_part = parts[0].replace('_', "");
        if !hex_part.chars().all(|c| c.is_ascii_hexdigit() || c == '.') {
            return false;
        }
        // Require at least one actual hex digit, so `0x_`, `0x.` and `0xp1`
        // are rejected. (Previously only emptiness of the raw prefix was
        // checked, inconsistent with the binary/decimal branches below.)
        if !hex_part.chars().any(|c| c.is_ascii_hexdigit()) {
            return false;
        }

        if parts.len() == 2 {
            // Exponent: optional sign, then at least one decimal digit.
            let exp = parts[1].replace('_', "");
            let exp = exp.trim_start_matches('+').trim_start_matches('-');
            if exp.is_empty() || !exp.chars().all(|c| c.is_ascii_digit()) {
                return false;
            }
        }
    } else if s.starts_with("0b") || s.starts_with("0B") {
        // Binary: at least one 0/1 after the prefix (underscores stripped).
        let after_prefix = &s[2..].replace('_', "");
        if after_prefix.is_empty() || !after_prefix.chars().all(|c| c == '0' || c == '1') {
            return false;
        }
    } else {
        let cleaned = s.replace('_', "");

        // Must contain at least one digit somewhere (rejects `.`, `e`, …).
        if !cleaned.chars().any(|c| c.is_ascii_digit()) {
            return false;
        }

        if cleaned.contains('e') || cleaned.contains('E') {
            // Exactly one exponent marker allowed.
            let parts: Vec<&str> = cleaned.split(|c| c == 'e' || c == 'E').collect();
            if parts.len() != 2 {
                return false;
            }

            let exp = parts[1].trim_start_matches('+').trim_start_matches('-');
            if exp.is_empty() || !exp.chars().all(|c| c.is_ascii_digit()) {
                return false;
            }
        }
    }

    true
}
642
643pub fn lex_for_version<V: crate::marker::LuaVersion>(
649 source: &str,
650) -> Result<Vec<(Token, Span)>, LexError> {
651 let tokens = lex(source)?;
652
653 Ok(tokens
654 .into_iter()
655 .map(|(token, span)| {
656 let t = match token {
657 Token::Continue if !V::HAS_CONTINUE => Token::Identifier("continue".to_string()),
658 Token::Export if !V::HAS_EXPORT => Token::Identifier("export".to_string()),
659 Token::Type if !V::HAS_TYPE_ANNOTATIONS => Token::Identifier("type".to_string()),
660 Token::Goto if !V::HAS_GOTO => Token::Identifier("goto".to_string()),
661 Token::Const if !V::HAS_CONST => Token::Identifier("const".to_string()),
662 t => t,
663 };
664 (t, span)
665 })
666 .collect())
667}