1use crate::error::{RexxDiagnostic, RexxError, RexxResult, SourceLoc};
8
/// The category of a lexed token, carrying its text where the text matters.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // --- Literals and names ---
    /// Quoted string literal (contents after quote un-doubling / hex / binary decoding).
    StringLit(String),
    /// Numeric literal, kept as its source text.
    Number(String),
    /// Symbol (name / keyword), kept as its source text.
    Symbol(String),

    // --- Arithmetic ---
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    IntDiv,
    /// `//`
    Remainder,
    /// `**`
    Power,

    // --- Concatenation and assignment ---
    /// `||`
    Concat,
    /// A single `=`.
    Assign,

    // --- Comparison ---
    /// Equality comparison.
    // NOTE(review): the lexer visible in this file never emits `Equal` (a lone `=`
    // always becomes `Assign`); presumably a later stage re-tags it — confirm.
    Equal,
    /// Not equal, such as `\=` or `<>`.
    NotEqual,
    /// `>`
    Greater,
    /// `<`
    Less,
    /// Greater-or-equal, such as `>=` or `\<`.
    GreaterEq,
    /// Less-or-equal, such as `<=` or `\>`.
    LessEq,
    /// `==`
    StrictEq,
    /// `\==`
    StrictNotEq,
    /// `>>`
    StrictGt,
    /// `<<`
    StrictLt,
    /// `>>=`
    StrictGte,
    /// `<<=`
    StrictLte,

    // --- Logical ---
    /// `&`
    And,
    /// `|`
    Or,
    /// `&&`
    Xor,
    /// `\` or `¬` used on its own.
    Not,

    // --- Punctuation ---
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `.` that does not start a symbol.
    Dot,

    // --- Structure ---
    /// End of a source line (`\n`).
    Eol,
    /// End of the input.
    Eof,
}
60
61#[derive(Debug, Clone)]
62pub struct Token {
63 pub kind: TokenKind,
64 pub loc: SourceLoc,
65 pub space_before: bool,
69}
70
71impl Token {
72 pub fn new(kind: TokenKind, loc: SourceLoc, space_before: bool) -> Self {
73 Self {
74 kind,
75 loc,
76 space_before,
77 }
78 }
79}
80
/// Character-level scanner that turns source text into [`Token`]s.
pub struct Lexer {
    /// The whole input, one entry per `char`.
    source: Vec<char>,
    /// Index into `source` of the next character to read.
    pos: usize,
    /// Current line number (starts at 1).
    line: usize,
    /// Current column number (starts at 1, reset after every `\n`).
    col: usize,
    /// The input split into lines, used to attach source text to locations.
    lines: Vec<String>,
}
88
89impl Lexer {
90 pub fn new(source: &str) -> Self {
91 let lines: Vec<String> = source.lines().map(String::from).collect();
92 Self {
93 source: source.chars().collect(),
94 pos: 0,
95 line: 1,
96 col: 1,
97 lines,
98 }
99 }
100
101 pub fn tokenize(&mut self) -> RexxResult<Vec<Token>> {
102 let mut tokens = Vec::new();
103
104 loop {
105 let pos_before = self.pos;
106 self.skip_whitespace_and_comments()?;
107 let had_space = self.pos > pos_before;
108
109 if self.at_end() {
110 tokens.push(Token::new(TokenKind::Eof, self.loc(), had_space));
111 break;
112 }
113
114 let mut token = self.next_token()?;
115 token.space_before = had_space;
116
117 if matches!(token.kind, TokenKind::Comma) && self.is_line_continuation() {
121 while let Some(ch) = self.peek() {
123 if ch == '\n' {
124 self.advance(); break;
126 }
127 self.advance(); }
129 continue;
130 }
131
132 tokens.push(token);
133 }
134
135 Ok(tokens)
136 }
137
138 fn loc(&self) -> SourceLoc {
139 let mut loc = SourceLoc::new(self.line, self.col);
140 if self.line > 0 && self.line <= self.lines.len() {
141 loc = loc.with_source(self.lines[self.line - 1].clone());
142 }
143 loc
144 }
145
146 fn is_line_continuation(&self) -> bool {
150 let mut i = self.pos;
151 while i < self.source.len() {
152 let ch = self.source[i];
153 match ch {
154 ' ' | '\t' | '\r' => {
155 i += 1;
156 }
157 '\n' => return true,
158 '/' if i + 1 < self.source.len() && self.source[i + 1] == '*' => {
160 i += 2;
161 let mut depth = 1u32;
162 while depth > 0 && i < self.source.len() {
163 if i + 1 < self.source.len()
164 && self.source[i] == '/'
165 && self.source[i + 1] == '*'
166 {
167 depth += 1;
168 i += 2;
169 } else if i + 1 < self.source.len()
170 && self.source[i] == '*'
171 && self.source[i + 1] == '/'
172 {
173 depth -= 1;
174 i += 2;
175 } else {
176 i += 1;
177 }
178 }
179 }
180 '-' if i + 1 < self.source.len() && self.source[i + 1] == '-' => return true,
182 _ => return false,
183 }
184 }
185 true
187 }
188
189 fn at_end(&self) -> bool {
190 self.pos >= self.source.len()
191 }
192
193 fn peek(&self) -> Option<char> {
194 self.source.get(self.pos).copied()
195 }
196
197 fn peek_ahead(&self, n: usize) -> Option<char> {
198 self.source.get(self.pos + n).copied()
199 }
200
201 fn advance(&mut self) -> Option<char> {
202 let ch = self.source.get(self.pos).copied()?;
203 self.pos += 1;
204 if ch == '\n' {
205 self.line += 1;
206 self.col = 1;
207 } else {
208 self.col += 1;
209 }
210 Some(ch)
211 }
212
213 fn skip_whitespace_and_comments(&mut self) -> RexxResult<()> {
214 if self.pos == 0 && self.peek() == Some('#') && self.peek_ahead(1) == Some('!') {
216 while let Some(ch) = self.peek() {
217 if ch == '\n' {
218 break;
219 }
220 self.advance();
221 }
222 }
223
224 loop {
225 while let Some(ch) = self.peek() {
227 if ch == ' ' || ch == '\t' || ch == '\r' {
228 self.advance();
229 } else {
230 break;
231 }
232 }
233
234 if self.peek() == Some('/') && self.peek_ahead(1) == Some('*') {
236 let loc = self.loc();
237 self.advance(); self.advance(); let mut depth = 1u32;
240 while depth > 0 {
241 if self.at_end() {
242 return Err(RexxDiagnostic::new(RexxError::UnmatchedComment).at(loc));
243 }
244 if self.peek() == Some('/') && self.peek_ahead(1) == Some('*') {
245 self.advance();
246 self.advance();
247 depth += 1;
248 } else if self.peek() == Some('*') && self.peek_ahead(1) == Some('/') {
249 self.advance();
250 self.advance();
251 depth -= 1;
252 } else {
253 self.advance();
254 }
255 }
256 continue;
257 }
258
259 if self.peek() == Some('-') && self.peek_ahead(1) == Some('-') {
261 while let Some(ch) = self.peek() {
262 if ch == '\n' {
263 break;
264 }
265 self.advance();
266 }
267 continue;
268 }
269
270 break;
271 }
272 Ok(())
273 }
274
275 #[allow(clippy::too_many_lines)]
276 fn next_token(&mut self) -> RexxResult<Token> {
277 let loc = self.loc();
278 let ch = self.peek().unwrap();
279
280 match ch {
281 '\'' | '"' => self.lex_string(ch),
283
284 '0'..='9' => Ok(self.lex_number()),
286
287 'a'..='z' | 'A'..='Z' | '_' | '!' | '?' | '@' | '#' | '$' => Ok(self.lex_symbol()),
289
290 '.' => {
292 if self
293 .peek_ahead(1)
294 .is_some_and(|c| c.is_alphanumeric() || c == '_')
295 {
296 Ok(self.lex_symbol())
297 } else {
298 self.advance();
299 Ok(Token::new(TokenKind::Dot, loc, false))
300 }
301 }
302
303 '+' => {
305 self.advance();
306 Ok(Token::new(TokenKind::Plus, loc, false))
307 }
308 '-' => {
309 self.advance();
310 Ok(Token::new(TokenKind::Minus, loc, false))
311 }
312 '*' => {
313 self.advance();
314 if self.peek() == Some('*') {
315 self.advance();
316 Ok(Token::new(TokenKind::Power, loc, false))
317 } else {
318 Ok(Token::new(TokenKind::Star, loc, false))
319 }
320 }
321 '/' => {
322 self.advance();
323 if self.peek() == Some('/') {
324 self.advance();
325 Ok(Token::new(TokenKind::Remainder, loc, false))
326 } else {
327 Ok(Token::new(TokenKind::Slash, loc, false))
328 }
329 }
330 '%' => {
331 self.advance();
332 Ok(Token::new(TokenKind::IntDiv, loc, false))
333 }
334 '|' => {
335 self.advance();
336 if self.peek() == Some('|') {
337 self.advance();
338 Ok(Token::new(TokenKind::Concat, loc, false))
339 } else {
340 Ok(Token::new(TokenKind::Or, loc, false))
341 }
342 }
343 '&' => {
344 self.advance();
345 if self.peek() == Some('&') {
346 self.advance();
347 Ok(Token::new(TokenKind::Xor, loc, false))
348 } else {
349 Ok(Token::new(TokenKind::And, loc, false))
350 }
351 }
352 '\\' | '¬' => {
353 self.advance();
354 if self.peek() == Some('=') {
355 self.advance();
356 if self.peek() == Some('=') {
357 self.advance();
358 Ok(Token::new(TokenKind::StrictNotEq, loc, false))
359 } else {
360 Ok(Token::new(TokenKind::NotEqual, loc, false))
361 }
362 } else if self.peek() == Some('<') {
363 self.advance();
364 Ok(Token::new(TokenKind::GreaterEq, loc, false))
365 } else if self.peek() == Some('>') {
366 self.advance();
367 Ok(Token::new(TokenKind::LessEq, loc, false))
368 } else {
369 Ok(Token::new(TokenKind::Not, loc, false))
370 }
371 }
372 '=' => {
373 self.advance();
374 if self.peek() == Some('=') {
375 self.advance();
376 Ok(Token::new(TokenKind::StrictEq, loc, false))
377 } else {
378 Ok(Token::new(TokenKind::Assign, loc, false))
380 }
381 }
382 '>' => {
383 self.advance();
384 if self.peek() == Some('>') {
385 self.advance();
386 if self.peek() == Some('=') {
387 self.advance();
388 Ok(Token::new(TokenKind::StrictGte, loc, false))
389 } else {
390 Ok(Token::new(TokenKind::StrictGt, loc, false))
391 }
392 } else if self.peek() == Some('=') {
393 self.advance();
394 Ok(Token::new(TokenKind::GreaterEq, loc, false))
395 } else {
396 Ok(Token::new(TokenKind::Greater, loc, false))
397 }
398 }
399 '<' => {
400 self.advance();
401 if self.peek() == Some('<') {
402 self.advance();
403 if self.peek() == Some('=') {
404 self.advance();
405 Ok(Token::new(TokenKind::StrictLte, loc, false))
406 } else {
407 Ok(Token::new(TokenKind::StrictLt, loc, false))
408 }
409 } else if self.peek() == Some('=') {
410 self.advance();
411 Ok(Token::new(TokenKind::LessEq, loc, false))
412 } else if self.peek() == Some('>') {
413 self.advance();
414 Ok(Token::new(TokenKind::NotEqual, loc, false))
415 } else {
416 Ok(Token::new(TokenKind::Less, loc, false))
417 }
418 }
419 '(' => {
420 self.advance();
421 Ok(Token::new(TokenKind::LeftParen, loc, false))
422 }
423 ')' => {
424 self.advance();
425 Ok(Token::new(TokenKind::RightParen, loc, false))
426 }
427 ',' => {
428 self.advance();
429 Ok(Token::new(TokenKind::Comma, loc, false))
430 }
431 '\n' => {
432 self.advance();
433 Ok(Token::new(TokenKind::Eol, loc, false))
434 }
435 ';' => {
436 self.advance();
437 Ok(Token::new(TokenKind::Semicolon, loc, false))
438 }
439 ':' => {
440 self.advance();
441 Ok(Token::new(TokenKind::Colon, loc, false))
442 }
443 _ => Err(RexxDiagnostic::new(RexxError::InvalidCharacter)
444 .at(loc)
445 .with_detail(format!("unexpected character '{ch}'"))),
446 }
447 }
448
449 fn lex_string(&mut self, quote: char) -> RexxResult<Token> {
450 let loc = self.loc();
451 self.advance(); let mut value = String::new();
453
454 loop {
455 if self.at_end() {
456 return Err(RexxDiagnostic::new(RexxError::InvalidExpression)
457 .at(loc)
458 .with_detail("unterminated string literal"));
459 }
460 let ch = self.advance().unwrap();
461 if ch == quote {
462 if self.peek() == Some(quote) {
464 self.advance();
465 value.push(quote);
466 } else {
467 break;
468 }
469 } else {
470 value.push(ch);
471 }
472 }
473
474 if let Some(suffix) = self.peek() {
476 match suffix.to_ascii_uppercase() {
477 'X' => {
478 self.advance();
479 let decoded = hex_string_to_chars(&value).map_err(|e| {
480 RexxDiagnostic::new(RexxError::InvalidHexBinary)
481 .at(loc.clone())
482 .with_detail(e)
483 })?;
484 return Ok(Token::new(TokenKind::StringLit(decoded), loc, false));
485 }
486 'B' => {
487 self.advance();
488 let decoded = bin_string_to_chars(&value).map_err(|e| {
489 RexxDiagnostic::new(RexxError::InvalidHexBinary)
490 .at(loc.clone())
491 .with_detail(e)
492 })?;
493 return Ok(Token::new(TokenKind::StringLit(decoded), loc, false));
494 }
495 _ => {}
496 }
497 }
498
499 Ok(Token::new(TokenKind::StringLit(value), loc, false))
500 }
501
502 fn lex_number(&mut self) -> Token {
503 let loc = self.loc();
504 let mut num = String::new();
505
506 while let Some(ch) = self.peek() {
507 if ch.is_ascii_digit() || ch == '.' {
508 num.push(ch);
509 self.advance();
510 } else {
511 break;
512 }
513 }
514
515 if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
517 num.push(self.advance().unwrap());
518 if self.peek().is_some_and(|c| c == '+' || c == '-') {
519 num.push(self.advance().unwrap());
520 }
521 while let Some(ch) = self.peek() {
522 if ch.is_ascii_digit() {
523 num.push(ch);
524 self.advance();
525 } else {
526 break;
527 }
528 }
529 }
530
531 Token::new(TokenKind::Number(num), loc, false)
532 }
533
534 fn lex_symbol(&mut self) -> Token {
535 let loc = self.loc();
536 let mut name = String::new();
537
538 while let Some(ch) = self.peek() {
539 if ch.is_alphanumeric()
540 || ch == '_'
541 || ch == '.'
542 || ch == '!'
543 || ch == '?'
544 || ch == '@'
545 || ch == '#'
546 || ch == '$'
547 {
548 name.push(ch);
549 self.advance();
550 } else {
551 break;
552 }
553 }
554
555 Token::new(TokenKind::Symbol(name), loc, false)
556 }
557}
558
/// Decodes the contents of a hexadecimal string literal into characters.
///
/// Whitespace in `s` is ignored. The remaining characters are read as pairs of
/// hex digits (`0-9`, `a-f`, `A-F`); each pair becomes the character whose code
/// point is that byte value (`0x00..=0xFF`).
///
/// Works on characters rather than bytes, so non-ASCII input is rejected with
/// an error instead of panicking on a char boundary, and sign characters such
/// as `+` are not accepted as digits.
///
/// # Errors
///
/// * `"odd number of hex digits"` when the digits cannot be paired up.
/// * `"invalid hex digit at position N"` when a pair contains a non-hex
///   character; `N` is the index of the pair's first digit, counted after
///   whitespace removal.
fn hex_string_to_chars(s: &str) -> Result<String, String> {
    let digits: Vec<char> = s.chars().filter(|c| !c.is_whitespace()).collect();

    let pairs = digits.chunks_exact(2);
    if !pairs.remainder().is_empty() {
        return Err("odd number of hex digits".into());
    }

    let mut decoded = String::with_capacity(digits.len() / 2);
    for (pair_index, pair) in pairs.enumerate() {
        let position = pair_index * 2;
        // Two hex digits never exceed 0xFF, so `from_u32` always succeeds here.
        let ch = pair
            .iter()
            .try_fold(0u32, |acc, c| Some(acc * 16 + c.to_digit(16)?))
            .and_then(char::from_u32)
            .ok_or_else(|| format!("invalid hex digit at position {position}"))?;
        decoded.push(ch);
    }
    Ok(decoded)
}
573
/// Decodes the contents of a binary string literal into characters.
///
/// Whitespace in `s` is ignored. The remaining characters are read in groups
/// of eight binary digits (`0`/`1`); each group becomes the character whose
/// code point is that byte value (`0x00..=0xFF`).
///
/// Works on characters rather than bytes, so non-ASCII input is rejected with
/// an error instead of panicking on a char boundary, and sign characters such
/// as `+` are not accepted as digits.
///
/// # Errors
///
/// * `"binary string length must be a multiple of 8"` when the digits do not
///   form whole bytes.
/// * `"invalid binary digit at position N"` when a group contains a character
///   other than `0` or `1`; `N` is the index of the group's first digit,
///   counted after whitespace removal.
fn bin_string_to_chars(s: &str) -> Result<String, String> {
    let bits: Vec<char> = s.chars().filter(|c| !c.is_whitespace()).collect();

    let groups = bits.chunks_exact(8);
    if !groups.remainder().is_empty() {
        return Err("binary string length must be a multiple of 8".into());
    }

    let mut decoded = String::with_capacity(bits.len() / 8);
    for (group_index, group) in groups.enumerate() {
        let position = group_index * 8;
        // Eight binary digits never exceed 0xFF, so `from_u32` always succeeds here.
        let ch = group
            .iter()
            .try_fold(0u32, |acc, c| Some(acc * 2 + c.to_digit(2)?))
            .and_then(char::from_u32)
            .ok_or_else(|| format!("invalid binary digit at position {position}"))?;
        decoded.push(ch);
    }
    Ok(decoded)
}
588
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and returns only the token kinds; panics on a lexer error.
    fn lex_kinds(src: &str) -> Vec<TokenKind> {
        Lexer::new(src)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|token| token.kind)
            .collect()
    }

    fn symbol(name: &str) -> TokenKind {
        TokenKind::Symbol(name.to_owned())
    }

    fn string(text: &str) -> TokenKind {
        TokenKind::StringLit(text.to_owned())
    }

    fn number(text: &str) -> TokenKind {
        TokenKind::Number(text.to_owned())
    }

    #[test]
    fn simple_say() {
        let kinds = lex_kinds("say 'Hello, World!'");
        assert_eq!(kinds[0], symbol("say"));
        assert_eq!(kinds[1], string("Hello, World!"));
        assert_eq!(kinds[2], TokenKind::Eof);
    }

    #[test]
    fn arithmetic_tokens() {
        let kinds = lex_kinds("3 + 4 * 2");
        assert_eq!(kinds[0], number("3"));
        assert_eq!(kinds[1], TokenKind::Plus);
        assert_eq!(kinds[2], number("4"));
        assert_eq!(kinds[3], TokenKind::Star);
        assert_eq!(kinds[4], number("2"));
    }

    #[test]
    fn nested_comments() {
        let kinds = lex_kinds("/* outer /* inner */ still comment */ say 'hi'");
        assert_eq!(kinds[0], symbol("say"));
    }

    #[test]
    fn hex_string() {
        let kinds = lex_kinds("'48656C6C6F'x");
        assert_eq!(kinds[0], string("Hello"));
    }

    #[test]
    fn doubled_quote_escape() {
        let kinds = lex_kinds("'it''s'");
        assert_eq!(kinds[0], string("it's"));
    }

    #[test]
    fn comparison_operators() {
        let kinds = lex_kinds("a == b \\= c >> d");
        assert_eq!(kinds[1], TokenKind::StrictEq);
        assert_eq!(kinds[3], TokenKind::NotEqual);
        assert_eq!(kinds[5], TokenKind::StrictGt);
    }

    #[test]
    fn shebang_line_skipped() {
        let kinds = lex_kinds("#!/usr/bin/env rexx\nsay 'hello'");
        assert_eq!(kinds[0], TokenKind::Eol);
        assert_eq!(kinds[1], symbol("say"));
        assert_eq!(kinds[2], string("hello"));
    }
}