1use crate::error::{CompileError, CompileResult, Span};
2
/// A lexical token produced by [`Lexer::tokenize`].
///
/// Variants with a trailing underscore (`Option_`, `String_`, `Assign_`)
/// are named that way to stay distinct from other identifiers:
/// `Assign` is the `assign` keyword while `Assign_` is the `=` operator.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords ----
    /// `program`
    Program,
    /// `ss`
    Ss,
    /// `state`
    State,
    /// `when`
    When,
    /// `entry`
    Entry,
    /// `exit`
    Exit,
    /// `option`
    Option_,
    /// `assign` (the keyword, not the `=` operator)
    Assign,
    /// `to`
    To,
    /// `monitor`
    Monitor,
    /// `sync`
    Sync,
    /// `evflag`
    EvFlag,
    /// `if`
    If,
    /// `else`
    Else,
    /// `while`
    While,
    /// `for`
    For,
    /// `break`
    Break,
    /// `return`
    Return,

    // ---- Type keywords ----
    /// `int`
    Int,
    /// `short`
    Short,
    /// `long`
    Long,
    /// `float`
    Float,
    /// `double`
    Double,
    /// `string`
    String_,
    /// `char`
    Char,
    /// `unsigned`
    Unsigned,

    // ---- Literals and identifiers ----
    /// Integer literal. Character literals and `TRUE`/`FALSE`/`true`/`false`
    /// are also lexed into this variant.
    IntLit(i64),
    /// Floating-point literal.
    FloatLit(f64),
    /// String literal with escape sequences already decoded.
    StringLit(String),
    /// Any identifier that is not a keyword.
    Ident(String),

    // ---- Delimiters ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `->`
    Arrow,

    // ---- Arithmetic operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,

    // ---- Comparison operators ----
    /// `==`
    Eq,
    /// `!=`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,

    // ---- Logical and bitwise operators ----
    /// `&&`
    And,
    /// `||`
    Or,
    /// `!`
    Not,
    /// `&`
    BitAnd,
    /// `|`
    BitOr,
    /// `^`
    BitXor,
    /// `~`
    BitNot,
    /// `<<`
    Shl,
    /// `>>`
    Shr,

    // ---- Assignment and increment operators ----
    /// `=` (the operator, not the `assign` keyword)
    Assign_,
    /// `+=`
    PlusEq,
    /// `-=`
    MinusEq,
    /// `*=`
    StarEq,
    /// `/=`
    SlashEq,
    /// `++`
    PlusPlus,
    /// `--`
    MinusMinus,

    // ---- Miscellaneous ----
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `#`. The tokenizer in this file skips `#` lines entirely rather than
    /// emitting this variant.
    Hash,
    /// `%%`. The tokenizer in this file emits [`Token::EmbeddedLine`] for
    /// `%%` lines rather than this variant.
    DoublePercent,
    /// Verbatim embedded code: the remainder of a `%%` line, or the text
    /// between `%{` and `}%`.
    EmbeddedLine(String),
    /// End of input; always the last token of a successful tokenization.
    Eof,
}
86
87#[derive(Debug, Clone)]
88pub struct SpannedToken {
89 pub token: Token,
90 pub span: Span,
91}
92
/// Byte-oriented lexer over a borrowed source string.
pub struct Lexer<'a> {
    /// Raw bytes of the source text being lexed.
    input: &'a [u8],
    /// Byte offset of the cursor into `input`.
    pos: usize,
    /// Line of the cursor; starts at 1.
    line: usize,
    /// Column of the cursor; starts at 1 and is counted in bytes.
    col: usize,
}
99
100impl<'a> Lexer<'a> {
101 pub fn new(input: &'a str) -> Self {
102 Self {
103 input: input.as_bytes(),
104 pos: 0,
105 line: 1,
106 col: 1,
107 }
108 }
109
110 fn span(&self) -> Span {
111 Span {
112 offset: self.pos,
113 line: self.line,
114 column: self.col,
115 }
116 }
117
118 fn peek(&self) -> Option<u8> {
119 self.input.get(self.pos).copied()
120 }
121
122 fn peek2(&self) -> Option<u8> {
123 self.input.get(self.pos + 1).copied()
124 }
125
126 fn advance(&mut self) -> Option<u8> {
127 let ch = self.input.get(self.pos).copied()?;
128 self.pos += 1;
129 if ch == b'\n' {
130 self.line += 1;
131 self.col = 1;
132 } else {
133 self.col += 1;
134 }
135 Some(ch)
136 }
137
138 fn skip_whitespace_and_comments(&mut self) {
139 loop {
140 while self.peek().map_or(false, |c| c.is_ascii_whitespace()) {
142 self.advance();
143 }
144
145 if self.peek() == Some(b'/') && self.peek2() == Some(b'/') {
147 while self.peek().map_or(false, |c| c != b'\n') {
148 self.advance();
149 }
150 continue;
151 }
152
153 if self.peek() == Some(b'/') && self.peek2() == Some(b'*') {
155 self.advance();
156 self.advance();
157 let mut depth = 1;
158 while depth > 0 {
159 match self.advance() {
160 Some(b'*') if self.peek() == Some(b'/') => {
161 self.advance();
162 depth -= 1;
163 }
164 Some(b'/') if self.peek() == Some(b'*') => {
165 self.advance();
166 depth += 1;
167 }
168 None => break,
169 _ => {}
170 }
171 }
172 continue;
173 }
174
175 break;
176 }
177 }
178
179 fn read_string(&mut self) -> CompileResult<String> {
180 let span = self.span();
181 self.advance(); let mut s = String::new();
183 loop {
184 match self.advance() {
185 Some(b'"') => return Ok(s),
186 Some(b'\\') => match self.advance() {
187 Some(b'n') => s.push('\n'),
188 Some(b't') => s.push('\t'),
189 Some(b'\\') => s.push('\\'),
190 Some(b'"') => s.push('"'),
191 Some(b'0') => s.push('\0'),
192 Some(c) => {
193 s.push('\\');
194 s.push(c as char);
195 }
196 None => return Err(CompileError::syntax(span, "unterminated string")),
197 },
198 Some(c) => s.push(c as char),
199 None => return Err(CompileError::syntax(span, "unterminated string")),
200 }
201 }
202 }
203
204 fn read_char_literal(&mut self) -> CompileResult<i64> {
205 let span = self.span();
206 self.advance(); let ch = match self.advance() {
208 Some(b'\\') => match self.advance() {
209 Some(b'n') => b'\n',
210 Some(b't') => b'\t',
211 Some(b'\\') => b'\\',
212 Some(b'\'') => b'\'',
213 Some(b'0') => b'\0',
214 Some(b'r') => b'\r',
215 Some(b'a') => 7, Some(b'b') => 8, Some(c) => c,
218 None => return Err(CompileError::syntax(span, "unterminated char literal")),
219 },
220 Some(c) => c,
221 None => return Err(CompileError::syntax(span, "unterminated char literal")),
222 };
223 match self.advance() {
224 Some(b'\'') => Ok(ch as i64),
225 _ => Err(CompileError::syntax(span, "unterminated char literal")),
226 }
227 }
228
229 fn read_number(&mut self) -> Token {
230 let start = self.pos;
231 let mut is_float = false;
232
233 if self.peek() == Some(b'0')
235 && self.input.get(self.pos + 1).map_or(false, |&c| c == b'x' || c == b'X')
236 {
237 self.advance();
238 self.advance();
239 while self.peek().map_or(false, |c| c.is_ascii_hexdigit()) {
240 self.advance();
241 }
242 let s = std::str::from_utf8(&self.input[start..self.pos]).unwrap();
243 return Token::IntLit(i64::from_str_radix(&s[2..], 16).unwrap_or(0));
244 }
245
246 while self.peek().map_or(false, |c| c.is_ascii_digit()) {
247 self.advance();
248 }
249
250 if self.peek() == Some(b'.') && self.peek2().map_or(false, |c| c.is_ascii_digit()) {
251 is_float = true;
252 self.advance();
253 while self.peek().map_or(false, |c| c.is_ascii_digit()) {
254 self.advance();
255 }
256 }
257
258 if self.peek().map_or(false, |c| c == b'e' || c == b'E') {
260 is_float = true;
261 self.advance();
262 if self.peek().map_or(false, |c| c == b'+' || c == b'-') {
263 self.advance();
264 }
265 while self.peek().map_or(false, |c| c.is_ascii_digit()) {
266 self.advance();
267 }
268 }
269
270 if self.peek().map_or(false, |c| c == b'f' || c == b'F') {
272 is_float = true;
273 self.advance();
274 }
275
276 let s = std::str::from_utf8(&self.input[start..self.pos]).unwrap();
277 let s = s.trim_end_matches(|c: char| c == 'f' || c == 'F');
278
279 if is_float {
280 Token::FloatLit(s.parse().unwrap_or(0.0))
281 } else {
282 Token::IntLit(s.parse().unwrap_or(0))
283 }
284 }
285
286 fn read_ident(&mut self) -> String {
287 let start = self.pos;
288 while self
289 .peek()
290 .map_or(false, |c| c.is_ascii_alphanumeric() || c == b'_')
291 {
292 self.advance();
293 }
294 std::str::from_utf8(&self.input[start..self.pos])
295 .unwrap()
296 .to_string()
297 }
298
299 pub fn tokenize(&mut self) -> CompileResult<Vec<SpannedToken>> {
300 let mut tokens = Vec::new();
301
302 loop {
303 self.skip_whitespace_and_comments();
304
305 let span = self.span();
306
307 let Some(ch) = self.peek() else {
308 tokens.push(SpannedToken {
309 token: Token::Eof,
310 span,
311 });
312 break;
313 };
314
315 let token = match ch {
316 b'(' => { self.advance(); Token::LParen }
317 b')' => { self.advance(); Token::RParen }
318 b'{' => { self.advance(); Token::LBrace }
319 b'}' => { self.advance(); Token::RBrace }
320 b'[' => { self.advance(); Token::LBracket }
321 b']' => { self.advance(); Token::RBracket }
322 b';' => { self.advance(); Token::Semi }
323 b',' => { self.advance(); Token::Comma }
324 b'.' => { self.advance(); Token::Dot }
325 b'~' => { self.advance(); Token::BitNot }
326 b'?' => { self.advance(); Token::Question }
327 b':' => { self.advance(); Token::Colon }
328 b'"' => Token::StringLit(self.read_string()?),
329 b'\'' => Token::IntLit(self.read_char_literal()?),
330 b'#' => {
331 self.advance();
332 let start = self.pos;
334 while self.peek().map_or(false, |c| c != b'\n') {
335 self.advance();
336 }
337 let _line = std::str::from_utf8(&self.input[start..self.pos]).unwrap();
338 continue; }
340 b'%' if self.peek2() == Some(b'%') => {
341 self.advance();
342 self.advance();
343 let start = self.pos;
345 while self.peek().map_or(false, |c| c != b'\n') {
346 self.advance();
347 }
348 let code = std::str::from_utf8(&self.input[start..self.pos]).unwrap().to_string();
349 tokens.push(SpannedToken {
350 token: Token::EmbeddedLine(code),
351 span,
352 });
353 continue;
354 }
355 b'+' => {
356 self.advance();
357 match self.peek() {
358 Some(b'+') => { self.advance(); Token::PlusPlus }
359 Some(b'=') => { self.advance(); Token::PlusEq }
360 _ => Token::Plus,
361 }
362 }
363 b'-' => {
364 self.advance();
365 match self.peek() {
366 Some(b'-') => { self.advance(); Token::MinusMinus }
367 Some(b'=') => { self.advance(); Token::MinusEq }
368 Some(b'>') => { self.advance(); Token::Arrow }
369 _ => Token::Minus,
370 }
371 }
372 b'*' => {
373 self.advance();
374 if self.peek() == Some(b'=') { self.advance(); Token::StarEq }
375 else { Token::Star }
376 }
377 b'/' => {
378 self.advance();
379 if self.peek() == Some(b'=') { self.advance(); Token::SlashEq }
380 else { Token::Slash }
381 }
382 b'%' if self.peek2() == Some(b'{') => {
383 self.advance();
384 self.advance();
385 let start = self.pos;
387 loop {
388 match self.peek() {
389 Some(b'}') if self.input.get(self.pos + 1) == Some(&b'%') => {
390 let code = std::str::from_utf8(&self.input[start..self.pos]).unwrap().to_string();
391 self.advance();
392 self.advance();
393 tokens.push(SpannedToken {
394 token: Token::EmbeddedLine(code),
395 span,
396 });
397 break;
398 }
399 Some(_) => { self.advance(); }
400 None => {
401 return Err(CompileError::syntax(span, "unterminated %{ }% block"));
402 }
403 }
404 }
405 continue;
406 }
407 b'%' => {
408 self.advance();
409 Token::Percent
410 }
411 b'=' => {
412 self.advance();
413 if self.peek() == Some(b'=') { self.advance(); Token::Eq }
414 else { Token::Assign_ }
415 }
416 b'!' => {
417 self.advance();
418 if self.peek() == Some(b'=') { self.advance(); Token::Ne }
419 else { Token::Not }
420 }
421 b'<' => {
422 self.advance();
423 match self.peek() {
424 Some(b'=') => { self.advance(); Token::Le }
425 Some(b'<') => { self.advance(); Token::Shl }
426 _ => Token::Lt,
427 }
428 }
429 b'>' => {
430 self.advance();
431 match self.peek() {
432 Some(b'=') => { self.advance(); Token::Ge }
433 Some(b'>') => { self.advance(); Token::Shr }
434 _ => Token::Gt,
435 }
436 }
437 b'&' => {
438 self.advance();
439 if self.peek() == Some(b'&') { self.advance(); Token::And }
440 else { Token::BitAnd }
441 }
442 b'|' => {
443 self.advance();
444 if self.peek() == Some(b'|') { self.advance(); Token::Or }
445 else { Token::BitOr }
446 }
447 b'^' => { self.advance(); Token::BitXor }
448 c if c.is_ascii_digit() => self.read_number(),
449 c if c.is_ascii_alphabetic() || c == b'_' => {
450 let ident = self.read_ident();
451 match ident.as_str() {
452 "program" => Token::Program,
453 "ss" => Token::Ss,
454 "state" => Token::State,
455 "when" => Token::When,
456 "entry" => Token::Entry,
457 "exit" => Token::Exit,
458 "option" => Token::Option_,
459 "assign" => Token::Assign,
460 "to" => Token::To,
461 "monitor" => Token::Monitor,
462 "sync" => Token::Sync,
463 "evflag" => Token::EvFlag,
464 "if" => Token::If,
465 "else" => Token::Else,
466 "while" => Token::While,
467 "for" => Token::For,
468 "break" => Token::Break,
469 "return" => Token::Return,
470 "int" => Token::Int,
471 "short" => Token::Short,
472 "long" => Token::Long,
473 "float" => Token::Float,
474 "double" => Token::Double,
475 "string" => Token::String_,
476 "char" => Token::Char,
477 "unsigned" => Token::Unsigned,
478 "TRUE" | "true" => Token::IntLit(1),
479 "FALSE" | "false" => Token::IntLit(0),
480 _ => Token::Ident(ident),
481 }
482 }
483 _ => {
484 return Err(CompileError::syntax(
485 span,
486 format!("unexpected character: '{}'", ch as char),
487 ));
488 }
489 };
490
491 tokens.push(SpannedToken { token, span });
492 }
493
494 Ok(tokens)
495 }
496}
497
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` and drops the spans; panics if lexing fails.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|st| st.token)
            .collect()
    }

    #[test]
    fn test_keywords() {
        let tokens = lex("program ss state when entry exit");
        assert_eq!(
            tokens,
            vec![
                Token::Program,
                Token::Ss,
                Token::State,
                Token::When,
                Token::Entry,
                Token::Exit,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_operators() {
        let tokens = lex("+ - * / == != <= >= && || ++ -- += -=");
        assert_eq!(
            tokens,
            vec![
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eq,
                Token::Ne,
                Token::Le,
                Token::Ge,
                Token::And,
                Token::Or,
                Token::PlusPlus,
                Token::MinusMinus,
                Token::PlusEq,
                Token::MinusEq,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_shift_and_arrow() {
        let tokens = lex("a->b << >> < >");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("a".into()),
                Token::Arrow,
                Token::Ident("b".into()),
                Token::Shl,
                Token::Shr,
                Token::Lt,
                Token::Gt,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_numbers() {
        let tokens = lex("42 3.14 0xFF 1e5");
        assert_eq!(
            tokens,
            vec![
                Token::IntLit(42),
                Token::FloatLit(3.14),
                Token::IntLit(255),
                Token::FloatLit(1e5),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_string() {
        let tokens = lex(r#""hello\nworld""#);
        assert_eq!(
            tokens,
            vec![Token::StringLit("hello\nworld".to_string()), Token::Eof]
        );
    }

    #[test]
    fn test_comment_skip() {
        let tokens = lex("a /* comment */ b // line\nc");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("a".into()),
                Token::Ident("b".into()),
                Token::Ident("c".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_simple_program() {
        let tokens = lex(r#"
            program test
            option +s;
            double x;
            assign x to "PV:x";
            monitor x;
        "#);
        // Check the whole stream, not just the first few tokens.
        assert_eq!(
            tokens,
            vec![
                Token::Program,
                Token::Ident("test".into()),
                Token::Option_,
                Token::Plus,
                Token::Ident("s".into()),
                Token::Semi,
                Token::Double,
                Token::Ident("x".into()),
                Token::Semi,
                Token::Assign,
                Token::Ident("x".into()),
                Token::To,
                Token::StringLit("PV:x".into()),
                Token::Semi,
                Token::Monitor,
                Token::Ident("x".into()),
                Token::Semi,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_preprocessor_skipped() {
        let tokens = lex("#include \"foo.h\"\nint x;");
        assert_eq!(
            tokens,
            vec![Token::Int, Token::Ident("x".into()), Token::Semi, Token::Eof]
        );
    }

    #[test]
    fn test_char_literal() {
        let tokens = lex("'A' '\\n' '\\0'");
        assert_eq!(
            tokens,
            vec![
                Token::IntLit(65),
                Token::IntLit(10),
                Token::IntLit(0),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_embedded_line() {
        let tokens = lex("%% use std::io;\nint x;");
        // EmbeddedLine, Int, Ident, Semi, Eof
        assert_eq!(tokens.len(), 5);
        assert!(matches!(&tokens[0], Token::EmbeddedLine(s) if s.contains("use std::io")));
    }

    #[test]
    fn test_embedded_block() {
        let tokens = lex("%{ some code }%\nint x;");
        // EmbeddedLine, Int, Ident, Semi, Eof
        assert_eq!(tokens.len(), 5);
        assert!(matches!(&tokens[0], Token::EmbeddedLine(s) if s.contains("some code")));
    }

    #[test]
    fn test_true_false() {
        let tokens = lex("TRUE FALSE true false");
        assert_eq!(
            tokens,
            vec![
                Token::IntLit(1),
                Token::IntLit(0),
                Token::IntLit(1),
                Token::IntLit(0),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_malformed_input_is_an_error() {
        assert!(Lexer::new("\"abc").tokenize().is_err());
        assert!(Lexer::new("'a").tokenize().is_err());
        assert!(Lexer::new("'ab'").tokenize().is_err());
        assert!(Lexer::new("%{ code").tokenize().is_err());
        assert!(Lexer::new("@").tokenize().is_err());
    }

    #[test]
    fn test_spans_track_line_and_column() {
        let tokens = Lexer::new("a\n  b").tokenize().unwrap();
        assert_eq!((tokens[0].span.line, tokens[0].span.column), (1, 1));
        assert_eq!((tokens[1].span.line, tokens[1].span.column), (2, 3));
        assert_eq!(tokens[1].span.offset, 4);
    }
}