1use crate::error::ParseError;
4
/// Every category of token the lexer can emit.
///
/// Keyword variants stand for reserved words, punctuation variants for
/// operator and delimiter spellings, and the payload-carrying variants hold
/// the lexed text or numeric value.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // --- Reserved words -------------------------------------------------
    /// The keyword `pattern`.
    Pattern,
    /// The keyword `stage`.
    Stage,
    /// The keyword `unless`.
    Unless,
    /// The keyword `between`.
    Between,
    /// The keyword `after`.
    After,
    /// The keyword `graph`.
    Graph,
    /// The keyword `now`.
    Now,
    /// The keyword `temporal`.
    Temporal,
    /// The keyword `true`.
    True,
    /// The keyword `false`.
    False,
    /// The keyword `compose`.
    Compose,
    /// The keyword `sharing`.
    Sharing,
    /// The keyword `concurrent`.
    Concurrent,

    // --- Punctuation and operators --------------------------------------
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `.`
    Dot,
    /// `->`
    Arrow,
    /// `=` (the lexer also accepts the spelling `==`).
    Eq,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    Lte,
    /// `>=`
    Gte,
    /// `!`
    Bang,
    /// `@`
    At,
    /// `..`
    DotDot,
    /// `?`
    Question,
    /// `>>`
    GtGt,
    /// `|`
    Pipe,
    /// `*`
    Star,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `:`
    Colon,
    /// `;`
    Semicolon,

    // --- Tokens carrying a payload --------------------------------------
    /// An identifier that is not a reserved word; holds its text.
    Ident(String),
    /// A string literal; holds the text between the quotes.
    String(String),
    /// A numeric literal, stored as `f64`.
    Number(f64),

    /// End-of-input marker.
    Eof,
}
55
/// A single lexed token together with where it was found in the source.
#[derive(Debug, Clone)]
pub struct Token {
    /// What was lexed, including any payload (identifier text, string
    /// contents, numeric value).
    pub kind: TokenKind,
    /// Line on which the token starts; the lexer begins counting at 1.
    pub line: usize,
    /// Column at which the token starts; the lexer begins counting at 1 and
    /// advances one column per byte consumed.
    pub column: usize,
    /// Byte offset of the token's first byte within the source text.
    pub offset: usize,
    /// Length of the token in bytes (0 for the end-of-input token).
    pub len: usize,
}
65
66impl Token {
67 pub fn span(&self) -> (usize, usize) {
68 (self.offset, self.offset + self.len)
69 }
70}
71
/// Byte-oriented lexer over a borrowed source string.
pub struct Lexer<'a> {
    /// The full source text; sliced to build identifier, string and number
    /// tokens.
    source: &'a str,
    /// The same text viewed as bytes; this is what the scanner walks over.
    bytes: &'a [u8],
    /// Index into `bytes` of the next unread byte.
    pos: usize,
    /// Current line, starting at 1 and incremented on every `\n` consumed.
    line: usize,
    /// Current column, starting at 1 and reset to 1 after every `\n`.
    col: usize,
}
80
81impl<'a> Lexer<'a> {
82 pub fn new(source: &'a str) -> Self {
83 Self {
84 source,
85 bytes: source.as_bytes(),
86 pos: 0,
87 line: 1,
88 col: 1,
89 }
90 }
91
92 pub fn tokenize(&mut self) -> Result<Vec<Token>, ParseError> {
93 let mut tokens = Vec::new();
94 loop {
95 self.skip_whitespace_and_comments();
96 if self.pos >= self.bytes.len() {
97 tokens.push(Token {
98 kind: TokenKind::Eof,
99 line: self.line,
100 column: self.col,
101 offset: self.pos,
102 len: 0,
103 });
104 break;
105 }
106 tokens.push(self.next_token()?);
107 }
108 Ok(tokens)
109 }
110
111 fn skip_whitespace_and_comments(&mut self) {
112 loop {
113 while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_whitespace() {
115 if self.bytes[self.pos] == b'\n' {
116 self.line += 1;
117 self.col = 1;
118 } else {
119 self.col += 1;
120 }
121 self.pos += 1;
122 }
123 if self.pos + 1 < self.bytes.len()
125 && self.bytes[self.pos] == b'/'
126 && self.bytes[self.pos + 1] == b'/'
127 {
128 while self.pos < self.bytes.len() && self.bytes[self.pos] != b'\n' {
129 self.pos += 1;
130 }
131 continue;
132 }
133 break;
134 }
135 }
136
137 fn next_token(&mut self) -> Result<Token, ParseError> {
138 let start = self.pos;
139 let line = self.line;
140 let col = self.col;
141 let ch = self.bytes[self.pos];
142
143 match ch {
144 b'{' => {
145 self.advance();
146 Ok(Token {
147 kind: TokenKind::LBrace,
148 line,
149 column: col,
150 offset: start,
151 len: 1,
152 })
153 }
154 b'}' => {
155 self.advance();
156 Ok(Token {
157 kind: TokenKind::RBrace,
158 line,
159 column: col,
160 offset: start,
161 len: 1,
162 })
163 }
164 b'@' => {
165 self.advance();
166 Ok(Token {
167 kind: TokenKind::At,
168 line,
169 column: col,
170 offset: start,
171 len: 1,
172 })
173 }
174 b'?' => {
175 self.advance();
176 Ok(Token {
177 kind: TokenKind::Question,
178 line,
179 column: col,
180 offset: start,
181 len: 1,
182 })
183 }
184 b'|' => {
185 self.advance();
186 Ok(Token {
187 kind: TokenKind::Pipe,
188 line,
189 column: col,
190 offset: start,
191 len: 1,
192 })
193 }
194 b'*' => {
195 self.advance();
196 Ok(Token {
197 kind: TokenKind::Star,
198 line,
199 column: col,
200 offset: start,
201 len: 1,
202 })
203 }
204 b'(' => {
205 self.advance();
206 Ok(Token {
207 kind: TokenKind::LParen,
208 line,
209 column: col,
210 offset: start,
211 len: 1,
212 })
213 }
214 b')' => {
215 self.advance();
216 Ok(Token {
217 kind: TokenKind::RParen,
218 line,
219 column: col,
220 offset: start,
221 len: 1,
222 })
223 }
224 b',' => {
225 self.advance();
226 Ok(Token {
227 kind: TokenKind::Comma,
228 line,
229 column: col,
230 offset: start,
231 len: 1,
232 })
233 }
234 b'!' => {
235 self.advance();
236 Ok(Token {
237 kind: TokenKind::Bang,
238 line,
239 column: col,
240 offset: start,
241 len: 1,
242 })
243 }
244 b'.' => {
245 self.advance();
246 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'.' {
247 self.advance();
248 Ok(Token {
249 kind: TokenKind::DotDot,
250 line,
251 column: col,
252 offset: start,
253 len: 2,
254 })
255 } else {
256 Ok(Token {
257 kind: TokenKind::Dot,
258 line,
259 column: col,
260 offset: start,
261 len: 1,
262 })
263 }
264 }
265 b'-' => {
266 self.advance();
267 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'>' {
268 self.advance();
269 Ok(Token {
270 kind: TokenKind::Arrow,
271 line,
272 column: col,
273 offset: start,
274 len: 2,
275 })
276 } else {
277 Ok(Token {
278 kind: TokenKind::Minus,
279 line,
280 column: col,
281 offset: start,
282 len: 1,
283 })
284 }
285 }
286 b'+' => {
287 self.advance();
288 Ok(Token {
289 kind: TokenKind::Plus,
290 line,
291 column: col,
292 offset: start,
293 len: 1,
294 })
295 }
296 b':' => {
297 self.advance();
298 Ok(Token {
299 kind: TokenKind::Colon,
300 line,
301 column: col,
302 offset: start,
303 len: 1,
304 })
305 }
306 b';' => {
307 self.advance();
308 Ok(Token {
309 kind: TokenKind::Semicolon,
310 line,
311 column: col,
312 offset: start,
313 len: 1,
314 })
315 }
316 b'=' => {
317 self.advance();
318 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
320 self.advance();
321 }
322 Ok(Token {
323 kind: TokenKind::Eq,
324 line,
325 column: col,
326 offset: start,
327 len: self.pos - start,
328 })
329 }
330 b'<' => {
331 self.advance();
332 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
333 self.advance();
334 Ok(Token {
335 kind: TokenKind::Lte,
336 line,
337 column: col,
338 offset: start,
339 len: 2,
340 })
341 } else {
342 Ok(Token {
343 kind: TokenKind::Lt,
344 line,
345 column: col,
346 offset: start,
347 len: 1,
348 })
349 }
350 }
351 b'>' => {
352 self.advance();
353 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
354 self.advance();
355 Ok(Token {
356 kind: TokenKind::Gte,
357 line,
358 column: col,
359 offset: start,
360 len: 2,
361 })
362 } else if self.pos < self.bytes.len() && self.bytes[self.pos] == b'>' {
363 self.advance();
364 Ok(Token {
365 kind: TokenKind::GtGt,
366 line,
367 column: col,
368 offset: start,
369 len: 2,
370 })
371 } else {
372 Ok(Token {
373 kind: TokenKind::Gt,
374 line,
375 column: col,
376 offset: start,
377 len: 1,
378 })
379 }
380 }
381 b'"' => self.read_string(line, col),
382 b'0'..=b'9' => self.read_number(start, line, col),
383 b'a'..=b'z' | b'A'..=b'Z' | b'_' => self.read_ident(start, line, col),
384 _ => Err(self.error_at(
385 line,
386 col,
387 start,
388 &format!("unexpected character '{}'", ch as char),
389 )),
390 }
391 }
392
393 fn advance(&mut self) {
394 self.pos += 1;
395 self.col += 1;
396 }
397
398 fn read_string(&mut self, line: usize, col: usize) -> Result<Token, ParseError> {
399 let start = self.pos;
400 self.advance(); if self.pos + 1 < self.bytes.len()
404 && self.bytes[self.pos] == b'"'
405 && self.bytes[self.pos + 1] == b'"'
406 {
407 self.advance(); self.advance(); return self.read_triple_string(start, line, col);
410 }
411
412 let content_start = self.pos;
414 while self.pos < self.bytes.len() && self.bytes[self.pos] != b'"' {
415 if self.bytes[self.pos] == b'\n' {
416 return Err(self.error_at(line, col, start, "unterminated string literal"));
417 }
418 self.pos += 1;
419 self.col += 1;
420 }
421 if self.pos >= self.bytes.len() {
422 return Err(self.error_at(line, col, start, "unterminated string literal"));
423 }
424 let s = self.source[content_start..self.pos].to_string();
425 self.advance(); Ok(Token {
427 kind: TokenKind::String(s),
428 line,
429 column: col,
430 offset: start,
431 len: self.pos - start,
432 })
433 }
434
435 fn read_triple_string(
436 &mut self,
437 start: usize,
438 line: usize,
439 col: usize,
440 ) -> Result<Token, ParseError> {
441 let content_start = self.pos;
442 loop {
443 if self.pos >= self.bytes.len() {
444 return Err(self.error_at(line, col, start, "unterminated triple-quoted string"));
445 }
446 if self.pos + 2 < self.bytes.len()
447 && self.bytes[self.pos] == b'"'
448 && self.bytes[self.pos + 1] == b'"'
449 && self.bytes[self.pos + 2] == b'"'
450 {
451 let s = self.source[content_start..self.pos].to_string();
452 self.advance(); self.advance(); self.advance(); return Ok(Token {
456 kind: TokenKind::String(s),
457 line,
458 column: col,
459 offset: start,
460 len: self.pos - start,
461 });
462 }
463 if self.bytes[self.pos] == b'\n' {
464 self.line += 1;
465 self.col = 1;
466 self.pos += 1;
467 } else {
468 self.pos += 1;
469 self.col += 1;
470 }
471 }
472 }
473
474 fn read_number(&mut self, start: usize, line: usize, col: usize) -> Result<Token, ParseError> {
475 while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
476 self.pos += 1;
477 self.col += 1;
478 }
479 if self.pos + 1 < self.bytes.len()
481 && self.bytes[self.pos] == b'.'
482 && self.bytes[self.pos + 1] != b'.'
483 && self.bytes[self.pos + 1].is_ascii_digit()
484 {
485 self.pos += 1;
486 self.col += 1;
487 while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
488 self.pos += 1;
489 self.col += 1;
490 }
491 }
492 let num_str = &self.source[start..self.pos];
493 let val: f64 = num_str.parse().map_err(|_| {
494 self.error_at(line, col, start, &format!("invalid number '{}'", num_str))
495 })?;
496 Ok(Token {
497 kind: TokenKind::Number(val),
498 line,
499 column: col,
500 offset: start,
501 len: self.pos - start,
502 })
503 }
504
505 fn read_ident(&mut self, start: usize, line: usize, col: usize) -> Result<Token, ParseError> {
506 while self.pos < self.bytes.len()
507 && (self.bytes[self.pos].is_ascii_alphanumeric() || self.bytes[self.pos] == b'_')
508 {
509 self.pos += 1;
510 self.col += 1;
511 }
512 let word = &self.source[start..self.pos];
513 let kind = match word {
514 "pattern" => TokenKind::Pattern,
515 "stage" => TokenKind::Stage,
516 "unless" => TokenKind::Unless,
517 "between" => TokenKind::Between,
518 "after" => TokenKind::After,
519 "graph" => TokenKind::Graph,
520 "now" => TokenKind::Now,
521 "temporal" => TokenKind::Temporal,
522 "true" => TokenKind::True,
523 "false" => TokenKind::False,
524 "compose" => TokenKind::Compose,
525 "sharing" => TokenKind::Sharing,
526 "concurrent" => TokenKind::Concurrent,
527 _ => TokenKind::Ident(word.to_string()),
528 };
529 Ok(Token {
530 kind,
531 line,
532 column: col,
533 offset: start,
534 len: self.pos - start,
535 })
536 }
537
538 fn error_at(&self, line: usize, col: usize, offset: usize, msg: &str) -> ParseError {
539 ParseError {
540 line,
541 column: col,
542 span: (offset, self.pos.max(offset + 1)),
543 message: msg.to_string(),
544 }
545 }
546}
547
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and returns just the token kinds, panicking on a lex error.
    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::new(src)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|token| token.kind)
            .collect()
    }

    /// Shorthand for an identifier token kind.
    fn ident(name: &str) -> TokenKind {
        TokenKind::Ident(name.to_string())
    }

    /// Shorthand for a string-literal token kind.
    fn string(text: &str) -> TokenKind {
        TokenKind::String(text.to_string())
    }

    #[test]
    fn tokenize_simple_pattern() {
        let found = kinds(r#"pattern test { stage e1 { e1.eventType = "enter" } }"#);
        assert_eq!(
            found[..4],
            [
                TokenKind::Pattern,
                ident("test"),
                TokenKind::LBrace,
                TokenKind::Stage,
            ]
        );
    }

    #[test]
    fn tokenize_graph() {
        let found = kinds(r#"graph { @1 ev.type = "enter" @2..5 ev2.type = "siege" }"#);
        assert_eq!(found[0], TokenKind::Graph);
        assert_eq!(found[2], TokenKind::At);
        assert_eq!(found[3], TokenKind::Number(1.0));
    }

    #[test]
    fn tokenize_comments() {
        let found = kinds("// this is a comment\npattern test {}");
        assert_eq!(found[0], TokenKind::Pattern);
    }

    #[test]
    fn tokenize_arrow_and_question() {
        let found = kinds("e1.actor -> ?guest");
        assert_eq!(
            found[..6],
            [
                ident("e1"),
                TokenKind::Dot,
                ident("actor"),
                TokenKind::Arrow,
                TokenKind::Question,
                ident("guest"),
            ]
        );
    }

    #[test]
    fn tokenize_new_symbols() {
        let found = kinds("+ - : ;");
        assert_eq!(
            found[..4],
            [
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Colon,
                TokenKind::Semicolon,
            ]
        );
    }

    #[test]
    fn tokenize_minus_not_folded_into_number() {
        let found = kinds("-5");
        assert_eq!(found[..2], [TokenKind::Minus, TokenKind::Number(5.0)]);
    }

    #[test]
    fn tokenize_arrow_still_works() {
        let found = kinds("-> -5");
        assert_eq!(
            found[..3],
            [TokenKind::Arrow, TokenKind::Minus, TokenKind::Number(5.0)]
        );
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        let src = r#""""hello
world""""#;
        assert_eq!(kinds(src)[0], string("hello\nworld"));
    }

    #[test]
    fn tokenize_triple_quoted_empty() {
        let src = "\"\"\"\"\"\"";
        assert_eq!(kinds(src)[0], string(""));
    }

    #[test]
    fn tokenize_triple_quoted_with_single_quotes_inside() {
        let src = r#""""say "hello" to them""""#;
        assert_eq!(kinds(src)[0], string(r#"say "hello" to them"#));
    }

    #[test]
    fn tokenize_triple_quoted_double_quotes_inside() {
        let src = r#""""has ""two"" inside""""#;
        assert_eq!(kinds(src)[0], string(r#"has ""two"" inside"#));
    }

    #[test]
    fn tokenize_salience_style() {
        let found = kinds(r#"lifecycle: oneshot; priority: normal; adjust ?e2.depth + 1"#);
        assert_eq!(
            found[..4],
            [
                ident("lifecycle"),
                TokenKind::Colon,
                ident("oneshot"),
                TokenKind::Semicolon,
            ]
        );
        assert_eq!(found[13], TokenKind::Plus);
        assert_eq!(found[14], TokenKind::Number(1.0));
    }
}