1use crate::tokens::{keyword_type, Token, TokenType};
21
/// Error produced when the lexer cannot turn the input into tokens.
///
/// `line` and `column` are the 1-based position the lexer associated with
/// the problem (the start of the offending construct for unterminated
/// strings/comments and unexpected characters).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// 1-based line number of the error location.
    pub line: u32,
    /// 1-based column number of the error location.
    pub column: u32,
}

impl std::fmt::Display for LexerError {
    /// Formats the error as `line:column: message`, so callers can prefix a
    /// file name to get the conventional `file:line:column: message` shape.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}:{}: {}", self.line, self.column, self.message)
    }
}

// Implementing `Error` lets `LexerError` flow through `?` into
// `Box<dyn std::error::Error>` and similar error containers.
impl std::error::Error for LexerError {}
30
31pub struct Lexer {
34 source: Vec<char>,
35 _filename: String,
36 pos: usize,
37 line: u32,
38 column: u32,
39 tokens: Vec<Token>,
40 strip_comments: bool,
44}
45
46impl Lexer {
47 pub fn new(source: &str, filename: &str) -> Self {
48 Lexer {
49 source: source.chars().collect(),
50 _filename: filename.to_string(),
51 pos: 0,
52 line: 1,
53 column: 1,
54 tokens: Vec::new(),
55 strip_comments: false,
56 }
57 }
58
59 pub fn tokenize(self) -> Result<Vec<Token>, LexerError> {
62 self.tokenize_with(false)
63 }
64
65 pub fn tokenize_with(mut self, strip_comments: bool) -> Result<Vec<Token>, LexerError> {
72 self.strip_comments = strip_comments;
73 while !self.at_end() {
74 self.consume_trivia()?;
75 if self.at_end() {
76 break;
77 }
78 self.scan_token()?;
79 }
80 self.tokens.push(Token {
81 ttype: TokenType::Eof,
82 value: String::new(),
83 line: self.line,
84 column: self.column,
85 });
86 Ok(self.tokens)
87 }
88
89 fn at_end(&self) -> bool {
92 self.pos >= self.source.len()
93 }
94
95 fn peek(&self) -> char {
96 if self.at_end() {
97 '\0'
98 } else {
99 self.source[self.pos]
100 }
101 }
102
103 fn peek_next(&self) -> char {
104 if self.pos + 1 >= self.source.len() {
105 '\0'
106 } else {
107 self.source[self.pos + 1]
108 }
109 }
110
111 fn advance(&mut self) -> char {
112 let ch = self.source[self.pos];
113 self.pos += 1;
114 if ch == '\n' {
115 self.line += 1;
116 self.column = 1;
117 } else {
118 self.column += 1;
119 }
120 ch
121 }
122
123 fn match_char(&mut self, expected: char) -> bool {
124 if self.at_end() || self.source[self.pos] != expected {
125 return false;
126 }
127 self.advance();
128 true
129 }
130
131 fn emit(&mut self, ttype: TokenType, value: &str, line: u32, column: u32) {
132 self.tokens.push(Token {
133 ttype,
134 value: value.to_string(),
135 line,
136 column,
137 });
138 }
139
140 fn consume_trivia(&mut self) -> Result<(), LexerError> {
150 while !self.at_end() {
151 let ch = self.peek();
152 if ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' {
153 self.advance();
154 } else if ch == '/' && self.peek_next() == '/' {
155 self.consume_line_comment();
156 } else if ch == '/' && self.peek_next() == '*' {
157 self.consume_block_comment()?;
158 } else {
159 break;
160 }
161 }
162 Ok(())
163 }
164
165 fn consume_line_comment(&mut self) {
176 let line = self.line;
177 let col = self.column;
178 self.advance(); self.advance(); let is_outer_doc = self.peek() == '/' && self.peek_next() != '/';
181 let is_inner_doc = !is_outer_doc && self.peek() == '!';
182 if is_outer_doc || is_inner_doc {
183 self.advance(); }
185
186 let body_start = self.pos;
187 while !self.at_end() && self.peek() != '\n' {
188 self.advance();
189 }
190 let body: String = self.source[body_start..self.pos].iter().collect();
191
192 let (ttype, full_text) = if is_outer_doc {
193 (TokenType::DocLineComment, format!("///{body}"))
194 } else if is_inner_doc {
195 (TokenType::InnerDocLineComment, format!("//!{body}"))
196 } else {
197 (TokenType::LineComment, format!("//{body}"))
198 };
199
200 if !self.strip_comments {
201 self.emit(ttype, &full_text, line, col);
202 }
203 }
204
205 fn consume_block_comment(&mut self) -> Result<(), LexerError> {
214 let line = self.line;
215 let col = self.column;
216 self.advance(); self.advance(); let is_outer_doc = self.peek() == '*' && self.peek_next() != '/';
219 let is_inner_doc = !is_outer_doc && self.peek() == '!';
220 if is_outer_doc || is_inner_doc {
221 self.advance(); }
223
224 let body_start = self.pos;
225 while !self.at_end() {
226 if self.peek() == '*' && self.peek_next() == '/' {
227 let body: String = self.source[body_start..self.pos].iter().collect();
228 self.advance(); self.advance(); let (ttype, full_text) = if is_outer_doc {
231 (TokenType::DocBlockComment, format!("/**{body}*/"))
232 } else if is_inner_doc {
233 (TokenType::InnerDocBlockComment, format!("/*!{body}*/"))
234 } else {
235 (TokenType::BlockComment, format!("/*{body}*/"))
236 };
237 if !self.strip_comments {
238 self.emit(ttype, &full_text, line, col);
239 }
240 return Ok(());
241 }
242 self.advance();
243 }
244 Err(LexerError {
245 message: "Unterminated block comment".to_string(),
246 line,
247 column: col,
248 })
249 }
250
251 fn scan_token(&mut self) -> Result<(), LexerError> {
254 let line = self.line;
255 let col = self.column;
256 let ch = self.advance();
257
258 match ch {
259 '{' => self.emit(TokenType::LBrace, "{", line, col),
260 '}' => self.emit(TokenType::RBrace, "}", line, col),
261 '(' => self.emit(TokenType::LParen, "(", line, col),
262 ')' => self.emit(TokenType::RParen, ")", line, col),
263 '[' => self.emit(TokenType::LBracket, "[", line, col),
264 ']' => self.emit(TokenType::RBracket, "]", line, col),
265 ':' => self.emit(TokenType::Colon, ":", line, col),
266 ',' => self.emit(TokenType::Comma, ",", line, col),
267 '?' => self.emit(TokenType::Question, "?", line, col),
268 '@' => self.emit(TokenType::At, "@", line, col),
269 '+' => self.emit(TokenType::Plus, "+", line, col),
270 '*' => self.emit(TokenType::Star, "*", line, col),
271
272 '.' => {
273 if self.match_char('.') {
274 self.emit(TokenType::DotDot, "..", line, col);
275 } else {
276 self.emit(TokenType::Dot, ".", line, col);
277 }
278 }
279
280 '-' => {
281 if self.match_char('>') {
282 self.emit(TokenType::Arrow, "->", line, col);
283 } else if !self.at_end() && self.peek().is_ascii_digit() {
284 self.scan_number(line, col, '\0', true)?;
285 } else {
286 self.emit(TokenType::Minus, "-", line, col);
287 }
288 }
289
290 '/' => self.emit(TokenType::Slash, "/", line, col),
291
292 '<' => {
293 if self.match_char('=') {
294 self.emit(TokenType::Lte, "<=", line, col);
295 } else {
296 self.emit(TokenType::Lt, "<", line, col);
297 }
298 }
299 '>' => {
300 if self.match_char('=') {
301 self.emit(TokenType::Gte, ">=", line, col);
302 } else {
303 self.emit(TokenType::Gt, ">", line, col);
304 }
305 }
306 '=' => {
307 if self.match_char('=') {
308 self.emit(TokenType::Eq, "==", line, col);
309 } else {
310 self.emit(TokenType::Assign, "=", line, col);
311 }
312 }
313 '!' => {
314 if self.match_char('=') {
315 self.emit(TokenType::Neq, "!=", line, col);
316 } else {
317 return Err(LexerError {
318 message: "Unexpected '!'. Did you mean '!='?".to_string(),
319 line,
320 column: col,
321 });
322 }
323 }
324
325 '"' => self.scan_string(line, col)?,
326
327 c if c.is_ascii_digit() => self.scan_number(line, col, c, false)?,
328 c if c.is_alphabetic() || c == '_' => self.scan_identifier(line, col, c),
329
330 '$' => {
338 return Err(LexerError {
339 message: "Unexpected '$'. Interpolation `${name}` / `$name` is only \
340 valid inside a string literal — wrap it in quotes, e.g. \
341 `\"${name}\"`."
342 .to_string(),
343 line,
344 column: col,
345 });
346 }
347
348 c => {
349 return Err(LexerError {
350 message: format!("Unexpected character {:?}", c),
351 line,
352 column: col,
353 });
354 }
355 }
356
357 Ok(())
358 }
359
360 fn scan_string(&mut self, start_line: u32, start_col: u32) -> Result<(), LexerError> {
363 let mut chars = String::new();
364 while !self.at_end() && self.peek() != '"' {
365 if self.peek() == '\n' {
366 chars.push(self.advance());
367 continue;
368 }
369 if self.peek() == '\\' {
370 self.advance(); if self.at_end() {
372 return Err(LexerError {
373 message: "Unterminated escape sequence".to_string(),
374 line: self.line,
375 column: self.column,
376 });
377 }
378 let esc = self.advance();
379 match esc {
380 'n' => chars.push('\n'),
381 't' => chars.push('\t'),
382 '\\' => chars.push('\\'),
383 '"' => chars.push('"'),
384 c => chars.push(c),
385 }
386 } else {
387 chars.push(self.advance());
388 }
389 }
390 if self.at_end() {
391 return Err(LexerError {
392 message: "Unterminated string".to_string(),
393 line: start_line,
394 column: start_col,
395 });
396 }
397 self.advance(); self.emit(TokenType::StringLit, &chars, start_line, start_col);
399 Ok(())
400 }
401
402 fn scan_number(
405 &mut self,
406 start_line: u32,
407 start_col: u32,
408 first_char: char,
409 negative: bool,
410 ) -> Result<(), LexerError> {
411 let mut digits = String::new();
412 if negative {
413 digits.push('-');
414 }
415 if first_char != '\0' {
416 digits.push(first_char);
417 }
418
419 while !self.at_end() && self.peek().is_ascii_digit() {
421 digits.push(self.advance());
422 }
423
424 let mut is_float = false;
425
426 if !self.at_end() && self.peek() == '.' && self.peek_next() != '.' {
428 is_float = true;
429 digits.push(self.advance()); if self.at_end() || !self.peek().is_ascii_digit() {
431 return Err(LexerError {
432 message: "Expected digit after decimal point".to_string(),
433 line: self.line,
434 column: self.column,
435 });
436 }
437 while !self.at_end() && self.peek().is_ascii_digit() {
438 digits.push(self.advance());
439 }
440 }
441
442 let raw = digits.clone();
443
444 if !self.at_end() && self.peek().is_alphabetic() {
446 let saved_pos = self.pos;
447 let saved_col = self.column;
448 let mut suffix = String::new();
449 while !self.at_end() && self.peek().is_alphabetic() {
450 suffix.push(self.advance());
451 }
452 if matches!(suffix.as_str(), "s" | "ms" | "m" | "h" | "d") {
453 let value = format!("{}{}", raw, suffix);
454 self.emit(TokenType::Duration, &value, start_line, start_col);
455 return Ok(());
456 } else {
457 self.pos = saved_pos;
459 self.column = saved_col;
460 }
461 }
462
463 if is_float {
464 self.emit(TokenType::Float, &raw, start_line, start_col);
465 } else {
466 self.emit(TokenType::Integer, &raw, start_line, start_col);
467 }
468 Ok(())
469 }
470
471 fn scan_identifier(&mut self, start_line: u32, start_col: u32, first_char: char) {
472 let mut word = String::new();
473 word.push(first_char);
474 while !self.at_end() && (self.peek().is_alphanumeric() || self.peek() == '_') {
475 word.push(self.advance());
476 }
477 let ttype = keyword_type(&word);
478 self.emit(ttype, &word, start_line, start_col);
479 }
480}
481
482#[cfg(test)]
483mod fase_1_to_5_end_to_end {
484 use super::*;
488
489 fn kinds(source: &str) -> Vec<TokenType> {
490 Lexer::new(source, "<test>")
491 .tokenize()
492 .expect("lex ok")
493 .into_iter()
494 .map(|t| t.ttype)
495 .collect()
496 }
497
498 #[test]
499 fn resource_decl_tokenizes() {
500 let kinds = kinds("resource Db { kind: postgres lifetime: linear }");
501 assert!(kinds.contains(&TokenType::Resource));
502 assert!(kinds.contains(&TokenType::LBrace));
503 assert!(kinds.contains(&TokenType::RBrace));
504 }
505
506 #[test]
507 fn fabric_manifest_observe_tokenize() {
508 let src = r#"
509 fabric Vpc { provider: aws region: "us-east-1" zones: 2 }
510 manifest M { resources: [Db] fabric: Vpc }
511 observe O { sources: [M] quorum: 1 on_partition: degrade }
512 "#;
513 let k = kinds(src);
514 assert!(k.contains(&TokenType::Fabric));
515 assert!(k.contains(&TokenType::Manifest));
516 assert!(k.contains(&TokenType::Observe));
517 }
518
519 #[test]
520 fn reconcile_lease_ensemble_tokenize() {
521 let src = r#"
522 reconcile R { manifest: M observe: O max_retries: 3 period: "60s" }
523 lease L { resource: Db ttl: "30m" renewable: true }
524 ensemble E { daemons: [] quorum: 1 disagreement: degrade }
525 "#;
526 let k = kinds(src);
527 assert!(k.contains(&TokenType::Reconcile));
528 assert!(k.contains(&TokenType::Lease));
529 assert!(k.contains(&TokenType::Ensemble));
530 }
531
532 #[test]
533 fn topology_and_session_pi_calculus_tokenize() {
534 let src = r#"
535 session S {
536 client: [send Request end]
537 server: [receive Request end]
538 }
539 topology T { nodes: [A, B] edges: [A -> B : S] }
540 "#;
541 let k = kinds(src);
542 assert!(k.contains(&TokenType::Session));
543 assert!(k.contains(&TokenType::Send));
544 assert!(k.contains(&TokenType::Receive));
545 assert!(k.contains(&TokenType::End));
546 assert!(k.contains(&TokenType::Topology));
547 }
548
549 #[test]
550 fn immune_reflex_heal_tokenize() {
551 let src = r#"
552 immune I { sensitivity: 0.5 window: "1m" baseline: "7d" action: alert }
553 reflex Rf { on: drift action: throttle }
554 heal H { target: I max_patches: 3 rollback_on: divergence }
555 "#;
556 let k = kinds(src);
557 assert!(k.contains(&TokenType::Immune));
558 assert!(k.contains(&TokenType::Reflex));
559 assert!(k.contains(&TokenType::Heal));
560 }
561
562 #[test]
563 fn new_keywords_do_not_collide_with_identifiers() {
564 let k = kinds("resource_group manifested observer reconciled leased");
566 for tt in k.iter() {
567 assert!(
568 !matches!(
569 tt,
570 TokenType::Resource
571 | TokenType::Manifest
572 | TokenType::Observe
573 | TokenType::Reconcile
574 | TokenType::Lease
575 ),
576 "near-match identifier wrongly classified as keyword: {tt:?}"
577 );
578 }
579 }
580}
581
582#[cfg(test)]
585mod fase14a_trivia_tests {
586 use super::*;
587
588 fn lex(src: &str) -> Vec<Token> {
589 Lexer::new(src, "<test>").tokenize().expect("lex")
590 }
591
592 fn lex_strip(src: &str) -> Vec<Token> {
593 Lexer::new(src, "<test>").tokenize_with(true).expect("lex")
594 }
595
596 fn non_eof(toks: &[Token]) -> Vec<&Token> {
597 toks.iter().filter(|t| t.ttype != TokenType::Eof).collect()
598 }
599
600 #[test]
601 fn regular_line_comment_emitted_as_line_comment() {
602 let toks = lex("// hi");
603 let body: Vec<_> = non_eof(&toks);
604 assert_eq!(body.len(), 1);
605 assert_eq!(body[0].ttype, TokenType::LineComment);
606 assert_eq!(body[0].value, "// hi");
607 }
608
609 #[test]
610 fn doc_line_comment_emitted_with_doc_kind() {
611 let toks = lex("/// docs");
612 let body: Vec<_> = non_eof(&toks);
613 assert_eq!(body[0].ttype, TokenType::DocLineComment);
614 assert_eq!(body[0].value, "/// docs");
615 }
616
617 #[test]
618 fn four_slash_banner_is_regular_not_doc() {
619 let toks = lex("//// banner");
621 assert_eq!(non_eof(&toks)[0].ttype, TokenType::LineComment);
622 }
623
624 #[test]
625 fn regular_block_comment_emitted() {
626 let toks = lex("/* body */");
627 assert_eq!(non_eof(&toks)[0].ttype, TokenType::BlockComment);
628 assert_eq!(non_eof(&toks)[0].value, "/* body */");
629 }
630
631 #[test]
632 fn doc_block_comment_emitted() {
633 let toks = lex("/** docs */");
634 assert_eq!(non_eof(&toks)[0].ttype, TokenType::DocBlockComment);
635 }
636
637 #[test]
638 fn empty_block_is_regular_not_doc() {
639 let toks = lex("/**/");
641 assert_eq!(non_eof(&toks)[0].ttype, TokenType::BlockComment);
642 }
643
644 #[test]
645 fn strip_comments_opt_in_legacy() {
646 let src = "// dropped\nflow F() -> Out { }";
647 let toks = lex_strip(src);
648 for t in &toks {
649 assert!(
650 !matches!(
651 t.ttype,
652 TokenType::LineComment
653 | TokenType::BlockComment
654 | TokenType::DocLineComment
655 | TokenType::DocBlockComment
656 | TokenType::InnerDocLineComment
657 | TokenType::InnerDocBlockComment
658 ),
659 "strip_comments=true must not emit any comment kind, got {:?}",
660 t.ttype
661 );
662 }
663 }
664
665 #[test]
668 fn inner_doc_line_comment_emitted_with_inner_doc_kind() {
669 let toks = lex("//! module docs");
670 let body: Vec<_> = non_eof(&toks);
671 assert_eq!(body[0].ttype, TokenType::InnerDocLineComment);
672 assert_eq!(body[0].value, "//! module docs");
673 }
674
675 #[test]
676 fn inner_doc_block_comment_emitted_with_inner_doc_kind() {
677 let toks = lex("/*! module docs */");
678 let body: Vec<_> = non_eof(&toks);
679 assert_eq!(body[0].ttype, TokenType::InnerDocBlockComment);
680 assert_eq!(body[0].value, "/*! module docs */");
681 }
682
683 #[test]
684 fn outer_and_inner_doc_distinguished() {
685 let toks = lex("/// outer\n//! inner\n// plain");
688 let body: Vec<_> = non_eof(&toks);
689 assert_eq!(body.len(), 3);
690 assert_eq!(body[0].ttype, TokenType::DocLineComment);
691 assert_eq!(body[1].ttype, TokenType::InnerDocLineComment);
692 assert_eq!(body[2].ttype, TokenType::LineComment);
693 }
694
695 #[test]
696 fn block_outer_and_inner_doc_distinguished() {
697 let toks = lex("/** outer */\n/*! inner */\n/* plain */");
698 let body: Vec<_> = non_eof(&toks);
699 assert_eq!(body.len(), 3);
700 assert_eq!(body[0].ttype, TokenType::DocBlockComment);
701 assert_eq!(body[1].ttype, TokenType::InnerDocBlockComment);
702 assert_eq!(body[2].ttype, TokenType::BlockComment);
703 }
704
705 #[test]
706 fn comment_loc_preserved_across_lines() {
707 let toks = lex("// a\n/// b\n/* c */");
708 let body: Vec<_> = non_eof(&toks);
709 assert_eq!(body[0].line, 1);
710 assert_eq!(body[1].line, 2);
711 assert_eq!(body[2].line, 3);
712 }
713
714 #[test]
715 fn unterminated_block_still_errors() {
716 let result = Lexer::new("/* never closes", "<test>").tokenize();
717 assert!(result.is_err());
718 let err = result.unwrap_err();
719 assert!(err.message.contains("Unterminated"));
720 }
721
722 #[test]
723 fn trivia_helpers_strip_markers() {
724 use crate::tokens::{Trivia, TriviaKind};
725 let t = Trivia {
726 kind: TriviaKind::DocLine,
727 text: "/// hi".into(),
728 line: 1,
729 column: 1,
730 };
731 assert!(t.is_doc());
732 assert_eq!(t.stripped_text(), " hi");
733 let b = Trivia {
734 kind: TriviaKind::DocBlock,
735 text: "/** body */".into(),
736 line: 1,
737 column: 1,
738 };
739 assert!(b.is_doc());
740 assert_eq!(b.stripped_text(), " body ");
741 let r = Trivia {
742 kind: TriviaKind::Line,
743 text: "// regular".into(),
744 line: 1,
745 column: 1,
746 };
747 assert!(!r.is_doc());
748 assert_eq!(r.stripped_text(), " regular");
749 }
750}