1use crate::token::Token;
8
/// Character-level scanner that turns source text into a stream of tokens.
///
/// The input is stored as a vector of `char`s so the cursor can index and
/// look ahead by character rather than by byte.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete input, one entry per Unicode scalar value.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    pos: usize,
    /// 1-based line number of the cursor, used for error reporting.
    line: usize,
    /// 1-based column number of the cursor, used for error reporting.
    col: usize,
}
16
/// Error produced when the input cannot be tokenized.
///
/// Carries a human-readable description together with the line and column
/// the lexer's cursor was at when the problem was detected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Description of what went wrong.
    pub message: String,
    /// 1-based line number at which the error was raised.
    pub line: usize,
    /// 1-based column number at which the error was raised.
    pub col: usize,
}
24
25impl LexError {
26 fn new(message: impl Into<String>, line: usize, col: usize) -> Self {
27 LexError {
28 message: message.into(),
29 line,
30 col,
31 }
32 }
33}
34
35impl std::fmt::Display for LexError {
36 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37 write!(
38 f,
39 "Lex error at {}:{}: {}",
40 self.line, self.col, self.message
41 )
42 }
43}
44
45impl Lexer {
48 pub fn new(input: &str) -> Self {
49 Lexer {
50 input: input.chars().collect(),
51 pos: 0,
52 line: 1,
53 col: 1,
54 }
55 }
56
57 fn current(&self) -> Option<char> {
59 self.input.get(self.pos).copied()
60 }
61
62 fn peek(&self) -> Option<char> {
64 self.input.get(self.pos + 1).copied()
65 }
66
67 fn advance(&mut self) -> Option<char> {
69 let ch = self.current();
70 if let Some(c) = ch {
71 self.pos += 1;
72 if c == '\n' {
73 self.line += 1;
74 self.col = 1;
75 } else {
76 self.col += 1;
77 }
78 }
79 ch
80 }
81
82 fn error(&self, message: impl Into<String>) -> LexError {
84 LexError::new(message, self.line, self.col)
85 }
86
87 fn skip_whitespace(&mut self) {
90 while let Some(c) = self.current() {
91 if c.is_whitespace() {
92 self.advance();
93 } else {
94 break;
95 }
96 }
97 }
98
99 fn skip_line_comment(&mut self) {
100 while let Some(c) = self.current() {
101 if c == '\n' {
102 break;
103 }
104 self.advance();
105 }
106 }
107
108 fn skip_block_comment(&mut self) {
109 self.advance();
111 self.advance();
112 let mut depth = 1;
113 while depth > 0 {
114 match self.current() {
115 Some('/') if self.peek() == Some('*') => {
116 self.advance();
117 self.advance();
118 depth += 1;
119 }
120 Some('*') if self.peek() == Some('/') => {
121 self.advance();
122 self.advance();
123 depth -= 1;
124 }
125 Some(_) => {
126 self.advance();
127 }
128 None => break,
129 }
130 }
131 }
132
133 fn read_string(&mut self) -> Result<Token, LexError> {
137 self.advance(); let mut s = String::new();
139 loop {
140 match self.current() {
141 Some('"') => {
142 self.advance();
143 return Ok(Token::StringLiteral(s));
144 }
145 Some('\\') => {
146 self.advance();
147 match self.current() {
148 Some('n') => {
149 s.push('\n');
150 self.advance();
151 }
152 Some('t') => {
153 s.push('\t');
154 self.advance();
155 }
156 Some('r') => {
157 s.push('\r');
158 self.advance();
159 }
160 Some('\\') => {
161 s.push('\\');
162 self.advance();
163 }
164 Some('"') => {
165 s.push('"');
166 self.advance();
167 }
168 Some('0') => {
169 s.push('\0');
170 self.advance();
171 }
172 Some(c) => {
173 return Err(self.error(format!("Unknown escape sequence: \\{c}")));
174 }
175 None => {
176 return Err(self.error("Unterminated string"));
177 }
178 }
179 }
180 Some(c) => {
181 s.push(c);
182 self.advance();
183 }
184 None => {
185 return Err(self.error("Unterminated string"));
186 }
187 }
188 }
189 }
190
191 fn read_char(&mut self) -> Result<Token, LexError> {
193 self.advance(); let ch = match self.current() {
195 Some('\\') => {
196 self.advance();
197 match self.current() {
198 Some('n') => '\n',
199 Some('t') => '\t',
200 Some('r') => '\r',
201 Some('\\') => '\\',
202 Some('\'') => '\'',
203 Some('0') => '\0',
204 _ => return Err(self.error("Invalid char escape")),
205 }
206 }
207 Some(c) => c,
208 None => return Err(self.error("Unterminated char literal")),
209 };
210 self.advance();
211 if self.current() != Some('\'') {
212 return Err(self.error("Unterminated char literal"));
213 }
214 self.advance(); Ok(Token::CharLiteral(ch))
216 }
217
218 fn read_number(&mut self) -> Result<Token, LexError> {
221 let mut num = String::new();
222 let mut is_float = false;
223
224 if self.current() == Some('0') {
226 match self.peek() {
227 Some('x') | Some('X') => {
228 num.push('0');
229 self.advance();
230 num.push('x');
231 self.advance();
232 while let Some(c) = self.current() {
233 if c.is_ascii_hexdigit() || c == '_' {
234 if c != '_' {
235 num.push(c);
236 }
237 self.advance();
238 } else {
239 break;
240 }
241 }
242 let val = i64::from_str_radix(&num[2..], 16).map_err(|_| {
243 self.error(format!("Invalid hex literal: {num}"))
244 })?;
245 return Ok(Token::IntLiteral(val));
246 }
247 Some('b') | Some('B') => {
248 self.advance();
249 self.advance();
250 while let Some(c) = self.current() {
251 if c == '0' || c == '1' || c == '_' {
252 if c != '_' {
253 num.push(c);
254 }
255 self.advance();
256 } else {
257 break;
258 }
259 }
260 let val = i64::from_str_radix(&num, 2).map_err(|_| {
261 self.error(format!("Invalid binary literal: 0b{num}"))
262 })?;
263 return Ok(Token::IntLiteral(val));
264 }
265 Some('o') | Some('O') => {
266 self.advance();
267 self.advance();
268 while let Some(c) = self.current() {
269 if ('0'..='7').contains(&c) || c == '_' {
270 if c != '_' {
271 num.push(c);
272 }
273 self.advance();
274 } else {
275 break;
276 }
277 }
278 let val = i64::from_str_radix(&num, 8).map_err(|_| {
279 self.error(format!("Invalid octal literal: 0o{num}"))
280 })?;
281 return Ok(Token::IntLiteral(val));
282 }
283 _ => {}
284 }
285 }
286
287 while let Some(c) = self.current() {
288 if c.is_ascii_digit() || c == '_' {
289 if c != '_' {
290 num.push(c);
291 }
292 self.advance();
293 } else if c == '.' && !is_float {
294 if let Some(next) = self.peek() {
296 if next.is_ascii_digit() {
297 is_float = true;
298 num.push(c);
299 self.advance();
300 } else {
301 break;
302 }
303 } else {
304 break;
305 }
306 } else {
307 break;
308 }
309 }
310
311 if let Some(c) = self.current() {
313 if c == 'i' || c == 'u' || c == 'f' {
314 let start = self.pos;
315 let mut suffix = String::new();
316 while let Some(sc) = self.current() {
317 if sc.is_alphanumeric() {
318 suffix.push(sc);
319 self.advance();
320 } else {
321 break;
322 }
323 }
324 match suffix.as_str() {
326 "i8" | "i16" | "i32" | "i64" | "i128" | "isize" | "u8" | "u16" | "u32"
327 | "u64" | "u128" | "usize" | "f32" | "f64" => {
328 if suffix.starts_with('f') {
329 is_float = true;
330 }
331 }
332 _ => {
333 self.pos = start;
335 }
336 }
337 }
338 }
339
340 if is_float {
341 let val: f64 = num.parse().map_err(|_| {
342 self.error(format!("Invalid float literal: {num}"))
343 })?;
344 Ok(Token::FloatLiteral(val))
345 } else {
346 let val: i64 = num.parse().map_err(|_| {
347 self.error(format!("Invalid integer literal: {num}"))
348 })?;
349 Ok(Token::IntLiteral(val))
350 }
351 }
352
353 fn read_ident(&mut self) -> Token {
355 let mut ident = String::new();
356 while let Some(c) = self.current() {
357 if c.is_alphanumeric() || c == '_' {
358 ident.push(c);
359 self.advance();
360 } else {
361 break;
362 }
363 }
364 Token::keyword_from_str(&ident).unwrap_or(Token::Ident(ident))
365 }
366
367 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
371 let mut tokens = Vec::new();
372
373 loop {
374 self.skip_whitespace();
375
376 match self.current() {
377 None => {
378 tokens.push(Token::Eof);
379 return Ok(tokens);
380 }
381 Some('/') => match self.peek() {
382 Some('/') => {
383 self.skip_line_comment();
384 continue;
385 }
386 Some('*') => {
387 self.skip_block_comment();
388 continue;
389 }
390 Some('=') => {
391 self.advance();
392 self.advance();
393 tokens.push(Token::SlashEq);
394 }
395 _ => {
396 self.advance();
397 tokens.push(Token::Slash);
398 }
399 },
400 Some('"') => {
401 tokens.push(self.read_string()?);
402 }
403 Some('\'') => {
404 tokens.push(self.read_char()?);
406 }
407 Some(c) if c.is_ascii_digit() => {
408 tokens.push(self.read_number()?);
409 }
410 Some(c) if c.is_alphabetic() || c == '_' => {
411 tokens.push(self.read_ident());
412 }
413 Some('+') => {
414 self.advance();
415 if self.current() == Some('=') {
416 self.advance();
417 tokens.push(Token::PlusEq);
418 } else {
419 tokens.push(Token::Plus);
420 }
421 }
422 Some('-') => {
423 self.advance();
424 if self.current() == Some('>') {
425 self.advance();
426 tokens.push(Token::Arrow);
427 } else if self.current() == Some('=') {
428 self.advance();
429 tokens.push(Token::MinusEq);
430 } else {
431 tokens.push(Token::Minus);
432 }
433 }
434 Some('*') => {
435 self.advance();
436 if self.current() == Some('=') {
437 self.advance();
438 tokens.push(Token::StarEq);
439 } else {
440 tokens.push(Token::Star);
441 }
442 }
443 Some('%') => {
444 self.advance();
445 if self.current() == Some('=') {
446 self.advance();
447 tokens.push(Token::PercentEq);
448 } else {
449 tokens.push(Token::Percent);
450 }
451 }
452 Some('=') => {
453 self.advance();
454 if self.current() == Some('=') {
455 self.advance();
456 tokens.push(Token::EqEq);
457 } else if self.current() == Some('>') {
458 self.advance();
459 tokens.push(Token::FatArrow);
460 } else {
461 tokens.push(Token::Eq);
462 }
463 }
464 Some('!') => {
465 self.advance();
466 if self.current() == Some('=') {
467 self.advance();
468 tokens.push(Token::NotEq);
469 } else {
470 tokens.push(Token::Not);
471 }
472 }
473 Some('<') => {
474 self.advance();
475 if self.current() == Some('=') {
476 self.advance();
477 tokens.push(Token::LtEq);
478 } else if self.current() == Some('<') {
479 self.advance();
480 tokens.push(Token::Shl);
481 } else {
482 tokens.push(Token::Lt);
483 }
484 }
485 Some('>') => {
486 self.advance();
487 if self.current() == Some('=') {
488 self.advance();
489 tokens.push(Token::GtEq);
490 } else if self.current() == Some('>') {
491 self.advance();
492 tokens.push(Token::Shr);
493 } else {
494 tokens.push(Token::Gt);
495 }
496 }
497 Some('&') => {
498 self.advance();
499 if self.current() == Some('&') {
500 self.advance();
501 tokens.push(Token::And);
502 } else {
503 tokens.push(Token::Ampersand);
504 }
505 }
506 Some('|') => {
507 self.advance();
508 if self.current() == Some('|') {
509 self.advance();
510 tokens.push(Token::Or);
511 } else {
512 tokens.push(Token::Pipe);
513 }
514 }
515 Some('^') => {
516 self.advance();
517 tokens.push(Token::Caret);
518 }
519 Some('~') => {
520 self.advance();
521 tokens.push(Token::Tilde);
522 }
523 Some('(') => {
524 self.advance();
525 tokens.push(Token::LParen);
526 }
527 Some(')') => {
528 self.advance();
529 tokens.push(Token::RParen);
530 }
531 Some('{') => {
532 self.advance();
533 tokens.push(Token::LBrace);
534 }
535 Some('}') => {
536 self.advance();
537 tokens.push(Token::RBrace);
538 }
539 Some('[') => {
540 self.advance();
541 tokens.push(Token::LBracket);
542 }
543 Some(']') => {
544 self.advance();
545 tokens.push(Token::RBracket);
546 }
547 Some(',') => {
548 self.advance();
549 tokens.push(Token::Comma);
550 }
551 Some(';') => {
552 self.advance();
553 tokens.push(Token::Semicolon);
554 }
555 Some(':') => {
556 self.advance();
557 if self.current() == Some(':') {
558 self.advance();
559 tokens.push(Token::ColonColon);
560 } else {
561 tokens.push(Token::Colon);
562 }
563 }
564 Some('.') => {
565 self.advance();
566 if self.current() == Some('.') {
567 self.advance();
568 if self.current() == Some('=') {
569 self.advance();
570 tokens.push(Token::DotDotEq);
571 } else {
572 tokens.push(Token::DotDot);
573 }
574 } else {
575 tokens.push(Token::Dot);
576 }
577 }
578 Some('#') => {
579 self.advance();
580 if self.current() == Some('[') {
582 let mut depth = 1;
583 self.advance();
584 while depth > 0 {
585 match self.current() {
586 Some('[') => {
587 depth += 1;
588 self.advance();
589 }
590 Some(']') => {
591 depth -= 1;
592 self.advance();
593 }
594 Some(_) => {
595 self.advance();
596 }
597 None => break,
598 }
599 }
600 continue;
601 }
602 tokens.push(Token::Hash);
603 }
604 Some(c) => {
605 return Err(self.error(format!("Unexpected character: '{c}'")));
606 }
607 }
608 }
609 }
610}