1use std::fmt;
2use thiserror::Error;
3
/// The kind of a lexical token, together with any literal payload it carries.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- Literals and names -------------------------------------------
    /// Integer literal.
    Int(i64),
    /// Floating-point literal.
    Float(f64),
    /// String literal that contains no interpolation.
    Str(String),
    /// String literal containing `{...}` interpolation, as `(flag, text)`
    /// parts in source order. The lexer sets the flag to `true` for the raw
    /// text of an interpolated expression and `false` for literal text.
    InterpStr(Vec<(bool, String)>),
    /// Boolean literal (`true` / `false`).
    Bool(bool),
    /// Identifier that is not a keyword.
    Ident(String),

    // ---- Keywords -----------------------------------------------------
    /// `module`
    Module,
    /// `depends`
    Depends,
    /// `exposes`
    Exposes,
    /// `intent`
    Intent,
    /// `type`
    Type,
    /// `record`
    Record,
    /// `fn`
    Fn,
    /// `effect`
    Effect,
    /// `effects`
    Effects,
    /// `service`
    Service,
    /// `needs`
    Needs,
    /// `decision`
    Decision,
    /// `verify`
    Verify,
    /// `case`
    Case,
    /// `match`
    Match,
    /// `where`
    Where,
    /// `input`
    Input,
    /// `expect`
    Expect,

    // ---- Operators and punctuation ------------------------------------
    /// `->`
    Arrow,
    /// `|>`
    Pipe,
    /// `=>`
    FatArrow,
    /// `==`
    Eq,
    /// `!=`
    Neq,
    /// `<=`
    Lte,
    /// `>=`
    Gte,
    /// `=`
    Assign,
    /// `!`
    Bang,
    /// `?`
    Question,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,

    // ---- Layout -------------------------------------------------------
    /// Indentation increased relative to the enclosing block.
    Indent,
    /// Indentation decreased; one token is emitted per closed block.
    Dedent,
    /// End of a line that contained at least one token.
    Newline,
    /// End of input.
    Eof,
}
65
66impl fmt::Display for TokenKind {
67 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68 match self {
69 TokenKind::Int(n) => write!(f, "integer '{}'", n),
70 TokenKind::Float(n) => write!(f, "float '{}'", n),
71 TokenKind::Str(s) => write!(f, "string \"{}\"", s),
72 TokenKind::InterpStr(_) => write!(f, "interpolated string"),
73 TokenKind::Bool(b) => write!(f, "'{}'", b),
74 TokenKind::Ident(s) => write!(f, "'{}'", s),
75 TokenKind::Module => write!(f, "'module'"),
76 TokenKind::Depends => write!(f, "'depends'"),
77 TokenKind::Exposes => write!(f, "'exposes'"),
78 TokenKind::Intent => write!(f, "'intent'"),
79 TokenKind::Type => write!(f, "'type'"),
80 TokenKind::Record => write!(f, "'record'"),
81 TokenKind::Fn => write!(f, "'fn'"),
82 TokenKind::Effect => write!(f, "'effect'"),
83 TokenKind::Effects => write!(f, "'effects'"),
84 TokenKind::Service => write!(f, "'service'"),
85 TokenKind::Needs => write!(f, "'needs'"),
86 TokenKind::Decision => write!(f, "'decision'"),
87 TokenKind::Verify => write!(f, "'verify'"),
88 TokenKind::Case => write!(f, "'case'"),
89 TokenKind::Match => write!(f, "'match'"),
90 TokenKind::Where => write!(f, "'where'"),
91 TokenKind::Input => write!(f, "'input'"),
92 TokenKind::Expect => write!(f, "'expect'"),
93 TokenKind::Arrow => write!(f, "'->'"),
94 TokenKind::Pipe => write!(f, "'|>'"),
95 TokenKind::FatArrow => write!(f, "'=>'"),
96 TokenKind::Eq => write!(f, "'=='"),
97 TokenKind::Neq => write!(f, "'!='"),
98 TokenKind::Lte => write!(f, "'<='"),
99 TokenKind::Gte => write!(f, "'>='"),
100 TokenKind::Assign => write!(f, "'='"),
101 TokenKind::Bang => write!(f, "'!'"),
102 TokenKind::Question => write!(f, "'?'"),
103 TokenKind::Lt => write!(f, "'<'"),
104 TokenKind::Gt => write!(f, "'>'"),
105 TokenKind::Plus => write!(f, "'+'"),
106 TokenKind::Minus => write!(f, "'-'"),
107 TokenKind::Star => write!(f, "'*'"),
108 TokenKind::Slash => write!(f, "'/'"),
109 TokenKind::Dot => write!(f, "'.'"),
110 TokenKind::Colon => write!(f, "':'"),
111 TokenKind::Comma => write!(f, "','"),
112 TokenKind::LParen => write!(f, "'('"),
113 TokenKind::RParen => write!(f, "')'"),
114 TokenKind::LBracket => write!(f, "'['"),
115 TokenKind::RBracket => write!(f, "']'"),
116 TokenKind::LBrace => write!(f, "'{{'"),
117 TokenKind::RBrace => write!(f, "'}}'"),
118 TokenKind::Indent => write!(f, "indentation"),
119 TokenKind::Dedent => write!(f, "end of block"),
120 TokenKind::Newline => write!(f, "end of line"),
121 TokenKind::Eof => write!(f, "end of file"),
122 }
123 }
124}
125
/// A single lexical token together with the source position it was found at.
#[derive(Debug, Clone)]
pub struct Token {
    /// What the token is, including any literal payload.
    pub kind: TokenKind,
    /// Line of the token; the lexer starts counting lines at 1.
    pub line: usize,
    /// Column of the token; the lexer starts counting columns at 1.
    pub col: usize,
}
132
/// Error produced while tokenizing source text.
#[derive(Debug, Error)]
pub enum LexerError {
    /// A lexing failure at a specific position.
    ///
    /// Displayed as `error[<line>:<col>]: <msg>`.
    #[error("error[{line}:{col}]: {msg}")]
    Error {
        /// Description of what went wrong.
        msg: String,
        /// Line at which the error was raised.
        line: usize,
        /// Column at which the error was raised.
        col: usize,
    },
}
142
143fn keyword(s: &str) -> Option<TokenKind> {
144 match s {
145 "module" => Some(TokenKind::Module),
146 "depends" => Some(TokenKind::Depends),
147 "exposes" => Some(TokenKind::Exposes),
148 "intent" => Some(TokenKind::Intent),
149 "type" => Some(TokenKind::Type),
150 "record" => Some(TokenKind::Record),
151 "fn" => Some(TokenKind::Fn),
152 "effect" => Some(TokenKind::Effect),
153 "effects" => Some(TokenKind::Effects),
154 "service" => Some(TokenKind::Service),
155 "needs" => Some(TokenKind::Needs),
156 "decision" => Some(TokenKind::Decision),
157 "verify" => Some(TokenKind::Verify),
158 "case" => Some(TokenKind::Case),
159 "match" => Some(TokenKind::Match),
160 "where" => Some(TokenKind::Where),
161 "input" => Some(TokenKind::Input),
162 "expect" => Some(TokenKind::Expect),
163 "true" => Some(TokenKind::Bool(true)),
164 "false" => Some(TokenKind::Bool(false)),
165 _ => None,
166 }
167}
168
/// Indentation-aware lexer that turns source text into a flat token list.
pub struct Lexer {
    /// The source text, split into characters for index-based access.
    chars: Vec<char>,
    /// Index into `chars` of the next character to read.
    pos: usize,
    /// Current line; starts at 1 and is incremented after each `'\n'`.
    line: usize,
    /// Current column; starts at 1 and is reset to 1 after each `'\n'`.
    col: usize,
    /// Stack of active indentation widths (in spaces); the bottom entry is 0.
    indent_stack: Vec<usize>,
    /// True when the next character read is the first one on its line, so
    /// indentation still has to be measured.
    at_line_start: bool,
}
177
178impl Lexer {
179 pub fn new(source: &str) -> Self {
180 Lexer {
181 chars: source.chars().collect(),
182 pos: 0,
183 line: 1,
184 col: 1,
185 indent_stack: vec![0],
186 at_line_start: true,
187 }
188 }
189
190 fn error(&self, msg: impl Into<String>) -> LexerError {
191 LexerError::Error {
192 msg: msg.into(),
193 line: self.line,
194 col: self.col,
195 }
196 }
197
198 fn peek(&self, offset: usize) -> Option<char> {
199 self.chars.get(self.pos + offset).copied()
200 }
201
202 fn current(&self) -> Option<char> {
203 self.chars.get(self.pos).copied()
204 }
205
206 fn advance(&mut self) -> Option<char> {
207 let ch = self.chars.get(self.pos).copied()?;
208 self.pos += 1;
209 if ch == '\n' {
210 self.line += 1;
211 self.col = 1;
212 } else {
213 self.col += 1;
214 }
215 Some(ch)
216 }
217
218 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
219 let mut tokens = Vec::new();
220
221 while self.pos < self.chars.len() {
222 self.scan_token(&mut tokens)?;
223 }
224
225 while self.indent_stack.len() > 1 {
227 self.indent_stack.pop();
228 tokens.push(Token {
229 kind: TokenKind::Dedent,
230 line: self.line,
231 col: self.col,
232 });
233 }
234
235 tokens.push(Token {
236 kind: TokenKind::Eof,
237 line: self.line,
238 col: self.col,
239 });
240
241 Ok(tokens)
242 }
243
244 fn scan_token(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
245 if self.at_line_start {
246 self.handle_indentation(tokens)?;
247 if self.pos >= self.chars.len() {
248 return Ok(());
249 }
250 }
251
252 let ch = match self.current() {
253 Some(c) => c,
254 None => return Ok(()),
255 };
256
257 if ch == ' ' {
259 self.advance();
260 return Ok(());
261 }
262
263 if ch == '\n' {
265 let line = self.line;
266 let col = self.col;
267 self.advance();
268
269 let last_is_structural = tokens
270 .last()
271 .map(|t| {
272 matches!(
273 t.kind,
274 TokenKind::Newline | TokenKind::Indent | TokenKind::Dedent
275 )
276 })
277 .unwrap_or(true);
278
279 if !tokens.is_empty() && !last_is_structural {
280 tokens.push(Token {
281 kind: TokenKind::Newline,
282 line,
283 col,
284 });
285 }
286 self.at_line_start = true;
287 return Ok(());
288 }
289
290 if ch == '\r' {
292 self.advance();
293 return Ok(());
294 }
295
296 if ch == '/' && self.peek(1) == Some('/') {
298 self.skip_comment();
299 return Ok(());
300 }
301
302 if ch == '"' {
304 let tok = self.scan_string()?;
305 tokens.push(tok);
306 return Ok(());
307 }
308
309 if ch.is_ascii_digit() {
311 let tok = self.scan_number()?;
312 tokens.push(tok);
313 return Ok(());
314 }
315
316 if ch.is_alphabetic() || ch == '_' {
318 let tok = self.scan_identifier();
319 tokens.push(tok);
320 return Ok(());
321 }
322
323 let tok = self.scan_operator()?;
325 tokens.push(tok);
326 Ok(())
327 }
328
329 fn handle_indentation(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
330 self.at_line_start = false;
331 let mut indent = 0;
332
333 while self.pos < self.chars.len() && self.chars[self.pos] == ' ' {
334 indent += 1;
335 self.pos += 1;
336 self.col += 1;
337 }
338
339 if self.pos < self.chars.len() {
341 let ch = self.chars[self.pos];
342 if ch == '\n' || ch == '\r' {
343 return Ok(());
344 }
345 if ch == '/' && self.pos + 1 < self.chars.len() && self.chars[self.pos + 1] == '/' {
346 return Ok(());
347 }
348 } else {
349 return Ok(());
350 }
351
352 let current = *self.indent_stack.last().unwrap();
353 let line = self.line;
354
355 if indent > current {
356 self.indent_stack.push(indent);
357 tokens.push(Token {
358 kind: TokenKind::Indent,
359 line,
360 col: 1,
361 });
362 } else if indent < current {
363 while self.indent_stack.len() > 1 && *self.indent_stack.last().unwrap() > indent {
364 self.indent_stack.pop();
365 tokens.push(Token {
366 kind: TokenKind::Dedent,
367 line,
368 col: 1,
369 });
370 }
371 if *self.indent_stack.last().unwrap() != indent {
372 return Err(self.error(format!("Invalid indentation level: {}", indent)));
373 }
374 }
375
376 Ok(())
377 }
378
379 fn skip_comment(&mut self) {
380 while self.pos < self.chars.len() && self.chars[self.pos] != '\n' {
381 self.advance();
382 }
383 }
384
385 fn scan_string(&mut self) -> Result<Token, LexerError> {
386 let line = self.line;
387 let col = self.col;
388 self.advance(); let mut parts: Vec<(bool, String)> = Vec::new(); let mut current = String::new();
392 let mut has_interp = false;
393
394 loop {
395 match self.current() {
396 None => return Err(self.error("Unterminated string literal")),
397 Some('"') => {
398 self.advance();
399 break;
400 }
401 Some('{') => {
402 if self.chars.get(self.pos + 1).copied() == Some('{') {
404 current.push('{');
405 self.advance(); self.advance(); } else {
408 has_interp = true;
409 if !current.is_empty() {
410 parts.push((false, current.clone()));
411 current.clear();
412 }
413 self.advance(); let mut expr_text = String::new();
415 let mut depth = 1usize;
416 while self.pos < self.chars.len() && depth > 0 {
417 match self.chars[self.pos] {
418 '{' => {
419 depth += 1;
420 expr_text.push('{');
421 self.advance();
422 }
423 '}' => {
424 depth -= 1;
425 if depth > 0 {
426 expr_text.push('}');
427 }
428 self.advance();
429 }
430 c => {
431 expr_text.push(c);
432 self.advance();
433 }
434 }
435 }
436 parts.push((true, expr_text));
437 }
438 }
439 Some('}') => {
440 if self.chars.get(self.pos + 1).copied() == Some('}') {
442 current.push('}');
443 self.advance(); self.advance(); } else {
446 current.push('}');
447 self.advance();
448 }
449 }
450 Some('\\') => {
451 self.advance();
452 match self.advance() {
453 Some('n') => current.push('\n'),
454 Some('t') => current.push('\t'),
455 Some('r') => current.push('\r'),
456 Some('"') => current.push('"'),
457 Some('\\') => current.push('\\'),
458 Some(c) => current.push(c),
459 None => return Err(self.error("Unterminated string literal")),
460 }
461 }
462 Some('\n') => return Err(self.error("Unterminated string literal")),
463 Some(c) => {
464 current.push(c);
465 self.advance();
466 }
467 }
468 }
469
470 if !current.is_empty() {
471 parts.push((false, current));
472 }
473
474 if has_interp {
475 Ok(Token {
476 kind: TokenKind::InterpStr(parts),
477 line,
478 col,
479 })
480 } else {
481 let plain = parts.into_iter().map(|(_, s)| s).collect::<String>();
482 Ok(Token {
483 kind: TokenKind::Str(plain),
484 line,
485 col,
486 })
487 }
488 }
489
490 fn scan_number(&mut self) -> Result<Token, LexerError> {
491 let line = self.line;
492 let col = self.col;
493 let mut num_str = String::new();
494 let mut is_float = false;
495
496 while let Some(c) = self.current() {
497 if c.is_ascii_digit() {
498 num_str.push(c);
499 self.advance();
500 } else {
501 break;
502 }
503 }
504
505 if self.current() == Some('.') && self.peek(1).map(|c| c.is_ascii_digit()).unwrap_or(false)
506 {
507 is_float = true;
508 num_str.push('.');
509 self.advance(); while let Some(c) = self.current() {
511 if c.is_ascii_digit() {
512 num_str.push(c);
513 self.advance();
514 } else {
515 break;
516 }
517 }
518 }
519
520 if is_float {
521 let f: f64 = num_str
522 .parse()
523 .map_err(|_| self.error("Invalid floating-point number"))?;
524 Ok(Token {
525 kind: TokenKind::Float(f),
526 line,
527 col,
528 })
529 } else {
530 let i: i64 = num_str
531 .parse()
532 .map_err(|_| self.error("Invalid integer literal"))?;
533 Ok(Token {
534 kind: TokenKind::Int(i),
535 line,
536 col,
537 })
538 }
539 }
540
541 fn scan_identifier(&mut self) -> Token {
542 let line = self.line;
543 let col = self.col;
544 let mut ident = String::new();
545
546 while let Some(c) = self.current() {
547 if c.is_alphanumeric() || c == '_' {
548 ident.push(c);
549 self.advance();
550 } else {
551 break;
552 }
553 }
554
555 let kind = keyword(&ident).unwrap_or_else(|| TokenKind::Ident(ident));
556 Token { kind, line, col }
557 }
558
559 fn scan_operator(&mut self) -> Result<Token, LexerError> {
560 let line = self.line;
561 let col = self.col;
562 let ch = self.advance().unwrap();
563
564 let kind = match ch {
565 '-' if self.current() == Some('>') => {
566 self.advance();
567 TokenKind::Arrow
568 }
569 '|' if self.current() == Some('>') => {
570 self.advance();
571 TokenKind::Pipe
572 }
573 '=' if self.current() == Some('>') => {
574 self.advance();
575 TokenKind::FatArrow
576 }
577 '=' if self.current() == Some('=') => {
578 self.advance();
579 TokenKind::Eq
580 }
581 '!' if self.current() == Some('=') => {
582 self.advance();
583 TokenKind::Neq
584 }
585 '<' if self.current() == Some('=') => {
586 self.advance();
587 TokenKind::Lte
588 }
589 '>' if self.current() == Some('=') => {
590 self.advance();
591 TokenKind::Gte
592 }
593 '=' => TokenKind::Assign,
594 '<' => TokenKind::Lt,
595 '>' => TokenKind::Gt,
596 '+' => TokenKind::Plus,
597 '-' => TokenKind::Minus,
598 '*' => TokenKind::Star,
599 '/' => TokenKind::Slash,
600 '!' => TokenKind::Bang,
601 '?' => TokenKind::Question,
602 '.' => TokenKind::Dot,
603 ':' => TokenKind::Colon,
604 ',' => TokenKind::Comma,
605 '(' => TokenKind::LParen,
606 ')' => TokenKind::RParen,
607 '[' => TokenKind::LBracket,
608 ']' => TokenKind::RBracket,
609 '{' => TokenKind::LBrace,
610 '}' => TokenKind::RBrace,
611 other => return Err(self.error(format!("Unknown character: {:?}", other))),
612 };
613
614 Ok(Token { kind, line, col })
615 }
616}