1use crate::tokens::{Token, TokenType};
14
/// Byte-oriented lexer over a borrowed source string.
///
/// The lexer walks the source one byte at a time while tracking the line and
/// column of the next unread byte so every token can carry its start position.
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// Raw bytes of the source text being scanned.
    source: &'a [u8],
    /// Index into `source` of the next unread byte.
    pos: usize,
    /// 1-based line number of the next unread byte.
    line: usize,
    /// 1-based column of the next unread byte (advances once per byte).
    col: usize,
}
21
22impl<'a> Lexer<'a> {
23 pub fn new(source: &'a str) -> Self {
24 Self {
25 source: source.as_bytes(),
26 pos: 0,
27 line: 1,
28 col: 1,
29 }
30 }
31
32 pub fn tokenize(source: &str) -> Result<Vec<Token>, LexError> {
34 let mut lexer = Lexer::new(source);
35 let mut tokens = Vec::new();
36 loop {
37 let tok = lexer.next_token()?;
38 let is_eof = tok.token_type == TokenType::Eof;
39 tokens.push(tok);
40 if is_eof {
41 break;
42 }
43 }
44 Ok(tokens)
45 }
46
47 pub fn tokenize_with_comments(source: &str) -> Result<Vec<Token>, LexError> {
50 let mut lexer = Lexer::new(source);
51 let mut tokens = Vec::new();
52 loop {
53 let tok = lexer.next_token_with_comments()?;
54 let is_eof = tok.token_type == TokenType::Eof;
55 tokens.push(tok);
56 if is_eof {
57 break;
58 }
59 }
60 Ok(tokens)
61 }
62
63 fn peek(&self) -> Option<u8> {
64 self.source.get(self.pos).copied()
65 }
66
67 fn peek_at(&self, offset: usize) -> Option<u8> {
68 self.source.get(self.pos + offset).copied()
69 }
70
71 fn advance(&mut self) -> Option<u8> {
72 let ch = self.source.get(self.pos).copied()?;
73 self.pos += 1;
74 if ch == b'\n' {
75 self.line += 1;
76 self.col = 1;
77 } else {
78 self.col += 1;
79 }
80 Some(ch)
81 }
82
83 fn skip_whitespace_and_comments(&mut self) {
84 loop {
85 while let Some(ch) = self.peek() {
87 if ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r' {
88 self.advance();
89 } else {
90 break;
91 }
92 }
93 if self.peek() == Some(b'/') && self.peek_at(1) == Some(b'/') {
95 while let Some(ch) = self.peek() {
97 if ch == b'\n' {
98 break;
99 }
100 self.advance();
101 }
102 continue;
104 }
105 break;
106 }
107 }
108
109 fn skip_whitespace_and_maybe_comment(&mut self) -> Option<Token> {
112 while let Some(ch) = self.peek() {
114 if ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r' {
115 self.advance();
116 } else {
117 break;
118 }
119 }
120 if self.peek() == Some(b'/') && self.peek_at(1) == Some(b'/') {
122 let line = self.line;
123 let col = self.col;
124 let start = self.pos;
125 while let Some(ch) = self.peek() {
127 if ch == b'\n' {
128 break;
129 }
130 self.advance();
131 }
132 let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
133 return Some(Token::new(TokenType::Comment, text, line, col));
134 }
135 None
136 }
137
138 fn next_token_with_comments(&mut self) -> Result<Token, LexError> {
140 if let Some(comment_tok) = self.skip_whitespace_and_maybe_comment() {
141 return Ok(comment_tok);
142 }
143
144 let line = self.line;
145 let col = self.col;
146
147 let ch = match self.peek() {
148 Some(ch) => ch,
149 None => return Ok(Token::new(TokenType::Eof, "", line, col)),
150 };
151
152 self.lex_token_char(ch, line, col)
154 }
155
156 fn next_token(&mut self) -> Result<Token, LexError> {
157 self.skip_whitespace_and_comments();
158
159 let line = self.line;
160 let col = self.col;
161
162 let ch = match self.peek() {
163 Some(ch) => ch,
164 None => return Ok(Token::new(TokenType::Eof, "", line, col)),
165 };
166
167 self.lex_token_char(ch, line, col)
168 }
169
170 fn lex_token_char(&mut self, ch: u8, line: usize, col: usize) -> Result<Token, LexError> {
172 match ch {
173 b'(' => {
174 self.advance();
175 Ok(Token::new(TokenType::LParen, "(", line, col))
176 }
177 b')' => {
178 self.advance();
179 Ok(Token::new(TokenType::RParen, ")", line, col))
180 }
181 b'[' => {
182 self.advance();
183 Ok(Token::new(TokenType::LBracket, "[", line, col))
184 }
185 b']' => {
186 self.advance();
187 Ok(Token::new(TokenType::RBracket, "]", line, col))
188 }
189 b',' => {
190 self.advance();
191 Ok(Token::new(TokenType::Comma, ",", line, col))
192 }
193 b';' => {
194 self.advance();
195 Ok(Token::new(TokenType::Semicolon, ";", line, col))
196 }
197 b':' => {
198 self.advance();
199 Ok(Token::new(TokenType::Colon, ":", line, col))
200 }
201 b'~' => {
202 self.advance();
203 Ok(Token::new(TokenType::Tilde, "~", line, col))
204 }
205 b'.' => {
206 if self.matches_ahead(b".attr(") {
208 for _ in 0..6 {
209 self.advance();
210 }
211 Ok(Token::new(TokenType::DotAttrLParen, ".attr(", line, col))
212 } else {
213 self.advance();
214 Ok(Token::new(TokenType::Dot, ".", line, col))
215 }
216 }
217 b'=' => {
218 if self.peek_at(1) == Some(b'=') || self.is_operator_char_at(1) {
221 self.lex_operator(line, col)
222 } else {
223 self.advance();
224 Ok(Token::new(TokenType::Eq, "=", line, col))
225 }
226 }
227 b'"' => self.lex_string(line, col),
228 b'-' => {
229 if self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
235 self.lex_number(line, col)
236 } else {
237 self.lex_operator(line, col)
238 }
239 }
240 _ if ch.is_ascii_digit() => self.lex_number(line, col),
241 _ if is_ident_start(ch) => self.lex_identifier(line, col),
242 _ if is_operator_char(ch) => self.lex_operator(line, col),
243 _ => Err(LexError {
244 message: format!("Unexpected character '{}'", ch as char),
245 line,
246 column: col,
247 }),
248 }
249 }
250
251 fn matches_ahead(&self, pattern: &[u8]) -> bool {
252 if self.pos + pattern.len() > self.source.len() {
253 return false;
254 }
255 &self.source[self.pos..self.pos + pattern.len()] == pattern
256 }
257
258 fn is_operator_char_at(&self, offset: usize) -> bool {
259 self.peek_at(offset).is_some_and(is_operator_char)
260 }
261
262 fn lex_identifier(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
265 let start = self.pos;
266 self.advance();
268 while let Some(ch) = self.peek() {
270 if ch.is_ascii_alphanumeric() || ch == b'_' {
271 self.advance();
272 } else {
273 break;
274 }
275 }
276 while self.peek() == Some(b'-') {
279 if let Some(next) = self.peek_at(1) {
282 if next.is_ascii_alphanumeric() || next == b'_' {
283 self.advance(); while let Some(ch) = self.peek() {
286 if ch.is_ascii_alphanumeric() || ch == b'_' {
287 self.advance();
288 } else {
289 break;
290 }
291 }
292 } else {
293 break;
294 }
295 } else {
296 break;
297 }
298 }
299
300 let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
301 let token_type = keyword_or_ident(text);
302 Ok(Token::new(token_type, text, line, col))
303 }
304
305 fn lex_number(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
307 let start = self.pos;
308
309 if self.peek() == Some(b'-') {
311 self.advance();
312 }
313
314 self.consume_digits();
316
317 if self.peek() == Some(b'.') {
319 let after_dot = self.peek_at(1);
323 let consume_dot = match after_dot {
324 Some(d) if d.is_ascii_digit() => true,
325 Some(d) if is_ident_start(d) => false,
328 _ => true, };
330 if consume_dot {
331 self.advance(); self.consume_digits(); }
334 }
335
336 if let Some(ch) = self.peek() {
338 if ch == b'e' || ch == b'E' {
339 self.advance(); if let Some(sign) = self.peek() {
342 if sign == b'+' || sign == b'-' {
343 self.advance();
344 }
345 }
346 self.consume_digits();
347 }
348 }
349
350 let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
351 Ok(Token::new(TokenType::NumberLit, text, line, col))
352 }
353
354 fn consume_digits(&mut self) {
355 while let Some(ch) = self.peek() {
356 if ch.is_ascii_digit() {
357 self.advance();
358 } else {
359 break;
360 }
361 }
362 }
363
364 fn lex_string(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
366 let start = self.pos;
367 self.advance(); loop {
370 match self.peek() {
371 Some(b'"') => {
372 self.advance(); break;
374 }
375 Some(b'\\') => {
376 self.advance(); self.advance(); }
379 Some(_) => {
380 self.advance();
381 }
382 None => {
383 return Err(LexError {
384 message: "Unterminated string literal".to_string(),
385 line,
386 column: col,
387 });
388 }
389 }
390 }
391
392 let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
393 Ok(Token::new(TokenType::StringLit, text, line, col))
394 }
395
396 fn lex_operator(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
399 let start = self.pos;
400 while let Some(ch) = self.peek() {
401 if is_operator_char(ch) || ch == b'=' {
402 self.advance();
403 } else {
404 break;
405 }
406 }
407 let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
408 Ok(Token::new(TokenType::Operator, text, line, col))
409 }
410}
411
/// Returns `true` when `ch` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(ch: u8) -> bool {
    matches!(ch, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
415
/// Returns `true` when `ch` is one of the bytes that can appear in an
/// operator name: `*`, `+`, `/`, `%`, `!`, `<`, `>`, `&`, `|`, `^` or `?`.
fn is_operator_char(ch: u8) -> bool {
    const OPERATOR_BYTES: &[u8] = b"*+/%!<>&|^?";
    OPERATOR_BYTES.contains(&ch)
}
422
423fn keyword_or_ident(text: &str) -> TokenType {
424 match text {
425 "wire" => TokenType::Wire,
426 "in" => TokenType::In,
427 "out" => TokenType::Out,
428 "state" => TokenType::State,
429 "msg" => TokenType::Msg,
430 "feedback" => TokenType::Feedback,
431 "signal" => TokenType::Signal,
432 "float" => TokenType::Float,
433 "int" => TokenType::Int,
434 "bang" => TokenType::Bang,
435 "list" => TokenType::List,
436 "symbol" => TokenType::Symbol,
437 _ => TokenType::Identifier,
438 }
439}
440
/// Error reported when the lexer cannot form a token.
///
/// Displayed as `Lex error at <line>:<column>: <message>`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// 1-based line on which the offending token starts.
    pub line: usize,
    /// 1-based column at which the offending token starts.
    pub column: usize,
}

impl std::fmt::Display for LexError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            message,
            line,
            column,
        } = self;
        write!(f, "Lex error at {}:{}: {}", line, column, message)
    }
}

impl std::error::Error for LexError {}
459
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokens::TokenType::*;

    /// Token types produced by `Lexer::tokenize`; panics on a lex error.
    fn types(source: &str) -> Vec<TokenType> {
        Lexer::tokenize(source)
            .unwrap()
            .into_iter()
            .map(|t| t.token_type)
            .collect()
    }

    /// Lexemes produced by `Lexer::tokenize`; panics on a lex error.
    fn lexemes(source: &str) -> Vec<String> {
        Lexer::tokenize(source)
            .unwrap()
            .into_iter()
            .map(|t| t.lexeme)
            .collect()
    }

    /// The error produced by `Lexer::tokenize`; panics if lexing succeeds.
    fn lex_error(source: &str) -> LexError {
        Lexer::tokenize(source)
            .err()
            .expect("expected the source to fail lexing")
    }

    #[test]
    fn test_simple_wire() {
        assert_eq!(
            types("wire osc = cycle~(440);"),
            vec![
                Wire, Identifier, Eq, Identifier, Tilde, LParen, NumberLit, RParen, Semicolon,
                Eof,
            ]
        );
    }

    #[test]
    fn test_in_decl() {
        assert_eq!(
            types("in 0 (freq): signal;"),
            vec![In, NumberLit, LParen, Identifier, RParen, Colon, Signal, Semicolon, Eof]
        );
    }

    #[test]
    fn test_dotted_identifier() {
        // Dots are separate tokens; the trailing empty lexeme is `Eof`.
        assert_eq!(
            lexemes("jit.gl.render(440)"),
            vec!["jit", ".", "gl", ".", "render", "(", "440", ")", ""]
        );
    }

    #[test]
    fn test_port_access() {
        assert_eq!(
            types("w_1.in[0]"),
            vec![Identifier, Dot, In, LBracket, NumberLit, RBracket, Eof]
        );
    }

    #[test]
    fn test_output_port_access() {
        assert_eq!(
            types("w_1.out[1]"),
            vec![Identifier, Dot, Out, LBracket, NumberLit, RBracket, Eof]
        );
    }

    #[test]
    fn test_numbers() {
        // Each literal must come back as a single token followed by `Eof`.
        for literal in ["42", "3.14", "-7", "100.", "1e-6", "3.14E+5"] {
            assert_eq!(lexemes(literal), vec![literal, ""]);
        }
    }

    #[test]
    fn test_string() {
        let toks = Lexer::tokenize(r#""hello world""#).unwrap();
        assert_eq!(toks.len(), 2);
        assert_eq!(toks[0].token_type, StringLit);
        assert_eq!(toks[0].lexeme, r#""hello world""#);
    }

    #[test]
    fn test_string_with_escapes() {
        let toks = Lexer::tokenize(r#""hello \"world\"""#).unwrap();
        assert_eq!(toks[0].token_type, StringLit);
        assert_eq!(toks[0].lexeme, r#""hello \"world\"""#);
    }

    #[test]
    fn test_operator_names() {
        assert_eq!(
            types("?(a, b)"),
            vec![Operator, LParen, Identifier, Comma, Identifier, RParen, Eof]
        );

        let lex = lexemes("*(x, y)");
        assert_eq!(lex[0], "*");
    }

    #[test]
    fn test_comment_skipped() {
        assert_eq!(
            types("// comment\nwire x = 1;"),
            vec![Wire, Identifier, Eq, NumberLit, Semicolon, Eof]
        );
    }

    #[test]
    fn test_tokenize_with_comments_keeps_comments() {
        let toks = Lexer::tokenize_with_comments("// note\nwire x;").unwrap();
        // The comment lexeme stops before the newline and starts at 1:1.
        assert_eq!(toks[0].lexeme, "// note");
        assert_eq!((toks[0].line, toks[0].column), (1, 1));

        let kinds: Vec<TokenType> = toks.into_iter().map(|t| t.token_type).collect();
        assert_eq!(kinds, vec![Comment, Wire, Identifier, Semicolon, Eof]);
    }

    #[test]
    fn test_hyphenated_identifier() {
        assert_eq!(lexemes("drunk-walk"), vec!["drunk-walk", ""]);
    }

    #[test]
    fn test_dot_attr_lparen() {
        assert_eq!(
            types(".attr(minimum: 0)"),
            vec![DotAttrLParen, Identifier, Colon, NumberLit, RParen, Eof]
        );
    }

    #[test]
    fn test_negative_float() {
        assert_eq!(lexemes("-3.14"), vec!["-3.14", ""]);
    }

    #[test]
    fn test_line_column_tracking() {
        let toks = Lexer::tokenize("wire x\n = 1;").unwrap();
        let positions: Vec<(usize, usize)> =
            toks.iter().take(4).map(|t| (t.line, t.column)).collect();
        // wire, x, =, 1
        assert_eq!(positions, vec![(1, 1), (1, 6), (2, 3), (2, 5)]);
    }

    #[test]
    fn test_empty_source() {
        assert_eq!(types(""), vec![Eof]);
    }

    #[test]
    fn test_out_assignment_tokens() {
        assert_eq!(
            types("out[0] = osc;"),
            vec![Out, LBracket, NumberLit, RBracket, Eq, Identifier, Semicolon, Eof]
        );
    }

    #[test]
    fn test_operator_eq_disambiguation() {
        // `==` is an operator name, not two assignment tokens.
        let lex = lexemes("==(a, b)");
        assert_eq!(lex[0], "==");
        assert_eq!(
            types("==(a, b)"),
            vec![Operator, LParen, Identifier, Comma, Identifier, RParen, Eof]
        );
    }

    #[test]
    fn test_dotted_segment_with_digit() {
        // A segment starting with a digit lexes as a number followed by an identifier.
        assert_eq!(lexemes("jit.3m"), vec!["jit", ".", "3", "m", ""]);
    }

    #[test]
    fn test_msg_tokens() {
        assert_eq!(
            types(r#"msg click = "bang";"#),
            vec![Msg, Identifier, Eq, StringLit, Semicolon, Eof]
        );
    }

    #[test]
    fn test_feedback_tokens() {
        assert_eq!(
            types("feedback fb: signal;"),
            vec![Feedback, Identifier, Colon, Signal, Semicolon, Eof]
        );
    }

    #[test]
    fn test_state_tokens() {
        assert_eq!(
            types("state counter: int = 0;"),
            vec![State, Identifier, Colon, Int, Eq, NumberLit, Semicolon, Eof]
        );
    }

    #[test]
    fn test_string_with_url() {
        // `//` inside a string literal must not be treated as a comment.
        let toks = Lexer::tokenize(r#""http://example.com""#).unwrap();
        assert_eq!(toks.len(), 2);
        assert_eq!(toks[0].token_type, StringLit);
        assert_eq!(toks[0].lexeme, r#""http://example.com""#);
    }

    #[test]
    fn test_complex_expr() {
        assert_eq!(
            lexemes("mul~(osc, 0.5)"),
            vec!["mul", "~", "(", "osc", ",", "0.5", ")", ""]
        );
    }

    #[test]
    fn test_unterminated_string_is_error() {
        let err = lex_error("wire s = \"abc");
        assert_eq!(err.message, "Unterminated string literal");
        // The error points at the opening quote.
        assert_eq!((err.line, err.column), (1, 10));
    }

    #[test]
    fn test_unexpected_character_is_error() {
        let err = lex_error("wire @");
        assert_eq!(err.message, "Unexpected character '@'");
        assert_eq!((err.line, err.column), (1, 6));
        assert_eq!(
            err.to_string(),
            "Lex error at 1:6: Unexpected character '@'"
        );
    }
}