Skip to main content

fabula_dsl/
lexer.rs

1//! Tokenizer for the fabula DSL.
2
3use crate::error::ParseError;
4
/// Every kind of token the lexer can emit.
///
/// Keyword variants correspond to reserved words of the DSL, symbol variants to
/// punctuation, and the literal variants carry the lexed payload.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- Keywords ----
    /// `pattern`
    Pattern,
    /// `stage`
    Stage,
    /// `unless`
    Unless,
    /// `between`
    Between,
    /// `after`
    After,
    /// `graph`
    Graph,
    /// `now`
    Now,
    /// `temporal`
    Temporal,
    /// `true`
    True,
    /// `false`
    False,
    /// `compose`
    Compose,
    /// `sharing`
    Sharing,
    /// `concurrent`
    Concurrent,

    // ---- Symbols ----
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `.`
    Dot,
    /// `->`
    Arrow,
    /// `=` (the lexer also accepts `==` for this kind)
    Eq,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    Lte,
    /// `>=`
    Gte,
    /// `!`
    Bang,
    /// `@`
    At,
    /// `..`
    DotDot,
    /// `?`
    Question,
    /// `>>`
    GtGt,
    /// `|`
    Pipe,
    /// `*`
    Star,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `:`
    Colon,
    /// `;`
    Semicolon,

    // ---- Literals ----
    /// An identifier that is not one of the reserved keywords.
    Ident(String),
    /// The contents of a string literal, without the surrounding quotes.
    String(String),
    /// A numeric literal, stored as `f64`.
    Number(f64),

    /// End-of-input marker.
    Eof,
}
55
56/// A token with its source location.
57#[derive(Debug, Clone)]
58pub struct Token {
59    pub kind: TokenKind,
60    pub line: usize,
61    pub column: usize,
62    pub offset: usize,
63    pub len: usize,
64}
65
66impl Token {
67    pub fn span(&self) -> (usize, usize) {
68        (self.offset, self.offset + self.len)
69    }
70}
71
/// Turns DSL source text into a sequence of tokens.
pub struct Lexer<'a> {
    /// The text being lexed; token payloads are sliced out of it.
    source: &'a str,
    /// Byte view of `source`, used for scanning.
    bytes: &'a [u8],
    /// Byte offset of the next unread byte.
    pos: usize,
    /// Current line number, starting at 1.
    line: usize,
    /// Current column number, starting at 1.
    col: usize,
}
80
81impl<'a> Lexer<'a> {
82    pub fn new(source: &'a str) -> Self {
83        Self {
84            source,
85            bytes: source.as_bytes(),
86            pos: 0,
87            line: 1,
88            col: 1,
89        }
90    }
91
92    pub fn tokenize(&mut self) -> Result<Vec<Token>, ParseError> {
93        let mut tokens = Vec::new();
94        loop {
95            self.skip_whitespace_and_comments();
96            if self.pos >= self.bytes.len() {
97                tokens.push(Token {
98                    kind: TokenKind::Eof,
99                    line: self.line,
100                    column: self.col,
101                    offset: self.pos,
102                    len: 0,
103                });
104                break;
105            }
106            tokens.push(self.next_token()?);
107        }
108        Ok(tokens)
109    }
110
111    fn skip_whitespace_and_comments(&mut self) {
112        loop {
113            // Skip whitespace
114            while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_whitespace() {
115                if self.bytes[self.pos] == b'\n' {
116                    self.line += 1;
117                    self.col = 1;
118                } else {
119                    self.col += 1;
120                }
121                self.pos += 1;
122            }
123            // Skip line comments
124            if self.pos + 1 < self.bytes.len()
125                && self.bytes[self.pos] == b'/'
126                && self.bytes[self.pos + 1] == b'/'
127            {
128                while self.pos < self.bytes.len() && self.bytes[self.pos] != b'\n' {
129                    self.pos += 1;
130                }
131                continue;
132            }
133            break;
134        }
135    }
136
137    fn next_token(&mut self) -> Result<Token, ParseError> {
138        let start = self.pos;
139        let line = self.line;
140        let col = self.col;
141        let ch = self.bytes[self.pos];
142
143        match ch {
144            b'{' => {
145                self.advance();
146                Ok(Token {
147                    kind: TokenKind::LBrace,
148                    line,
149                    column: col,
150                    offset: start,
151                    len: 1,
152                })
153            }
154            b'}' => {
155                self.advance();
156                Ok(Token {
157                    kind: TokenKind::RBrace,
158                    line,
159                    column: col,
160                    offset: start,
161                    len: 1,
162                })
163            }
164            b'@' => {
165                self.advance();
166                Ok(Token {
167                    kind: TokenKind::At,
168                    line,
169                    column: col,
170                    offset: start,
171                    len: 1,
172                })
173            }
174            b'?' => {
175                self.advance();
176                Ok(Token {
177                    kind: TokenKind::Question,
178                    line,
179                    column: col,
180                    offset: start,
181                    len: 1,
182                })
183            }
184            b'|' => {
185                self.advance();
186                Ok(Token {
187                    kind: TokenKind::Pipe,
188                    line,
189                    column: col,
190                    offset: start,
191                    len: 1,
192                })
193            }
194            b'*' => {
195                self.advance();
196                Ok(Token {
197                    kind: TokenKind::Star,
198                    line,
199                    column: col,
200                    offset: start,
201                    len: 1,
202                })
203            }
204            b'(' => {
205                self.advance();
206                Ok(Token {
207                    kind: TokenKind::LParen,
208                    line,
209                    column: col,
210                    offset: start,
211                    len: 1,
212                })
213            }
214            b')' => {
215                self.advance();
216                Ok(Token {
217                    kind: TokenKind::RParen,
218                    line,
219                    column: col,
220                    offset: start,
221                    len: 1,
222                })
223            }
224            b',' => {
225                self.advance();
226                Ok(Token {
227                    kind: TokenKind::Comma,
228                    line,
229                    column: col,
230                    offset: start,
231                    len: 1,
232                })
233            }
234            b'!' => {
235                self.advance();
236                Ok(Token {
237                    kind: TokenKind::Bang,
238                    line,
239                    column: col,
240                    offset: start,
241                    len: 1,
242                })
243            }
244            b'.' => {
245                self.advance();
246                if self.pos < self.bytes.len() && self.bytes[self.pos] == b'.' {
247                    self.advance();
248                    Ok(Token {
249                        kind: TokenKind::DotDot,
250                        line,
251                        column: col,
252                        offset: start,
253                        len: 2,
254                    })
255                } else {
256                    Ok(Token {
257                        kind: TokenKind::Dot,
258                        line,
259                        column: col,
260                        offset: start,
261                        len: 1,
262                    })
263                }
264            }
265            b'-' => {
266                self.advance();
267                if self.pos < self.bytes.len() && self.bytes[self.pos] == b'>' {
268                    self.advance();
269                    Ok(Token {
270                        kind: TokenKind::Arrow,
271                        line,
272                        column: col,
273                        offset: start,
274                        len: 2,
275                    })
276                } else {
277                    Ok(Token {
278                        kind: TokenKind::Minus,
279                        line,
280                        column: col,
281                        offset: start,
282                        len: 1,
283                    })
284                }
285            }
286            b'+' => {
287                self.advance();
288                Ok(Token {
289                    kind: TokenKind::Plus,
290                    line,
291                    column: col,
292                    offset: start,
293                    len: 1,
294                })
295            }
296            b':' => {
297                self.advance();
298                Ok(Token {
299                    kind: TokenKind::Colon,
300                    line,
301                    column: col,
302                    offset: start,
303                    len: 1,
304                })
305            }
306            b';' => {
307                self.advance();
308                Ok(Token {
309                    kind: TokenKind::Semicolon,
310                    line,
311                    column: col,
312                    offset: start,
313                    len: 1,
314                })
315            }
316            b'=' => {
317                self.advance();
318                // == is just = (equality), since we don't have assignment
319                if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
320                    self.advance();
321                }
322                Ok(Token {
323                    kind: TokenKind::Eq,
324                    line,
325                    column: col,
326                    offset: start,
327                    len: self.pos - start,
328                })
329            }
330            b'<' => {
331                self.advance();
332                if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
333                    self.advance();
334                    Ok(Token {
335                        kind: TokenKind::Lte,
336                        line,
337                        column: col,
338                        offset: start,
339                        len: 2,
340                    })
341                } else {
342                    Ok(Token {
343                        kind: TokenKind::Lt,
344                        line,
345                        column: col,
346                        offset: start,
347                        len: 1,
348                    })
349                }
350            }
351            b'>' => {
352                self.advance();
353                if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
354                    self.advance();
355                    Ok(Token {
356                        kind: TokenKind::Gte,
357                        line,
358                        column: col,
359                        offset: start,
360                        len: 2,
361                    })
362                } else if self.pos < self.bytes.len() && self.bytes[self.pos] == b'>' {
363                    self.advance();
364                    Ok(Token {
365                        kind: TokenKind::GtGt,
366                        line,
367                        column: col,
368                        offset: start,
369                        len: 2,
370                    })
371                } else {
372                    Ok(Token {
373                        kind: TokenKind::Gt,
374                        line,
375                        column: col,
376                        offset: start,
377                        len: 1,
378                    })
379                }
380            }
381            b'"' => self.read_string(line, col),
382            b'0'..=b'9' => self.read_number(start, line, col),
383            b'a'..=b'z' | b'A'..=b'Z' | b'_' => self.read_ident(start, line, col),
384            _ => Err(self.error_at(
385                line,
386                col,
387                start,
388                &format!("unexpected character '{}'", ch as char),
389            )),
390        }
391    }
392
393    fn advance(&mut self) {
394        self.pos += 1;
395        self.col += 1;
396    }
397
398    fn read_string(&mut self, line: usize, col: usize) -> Result<Token, ParseError> {
399        let start = self.pos;
400        self.advance(); // skip first "
401
402        // Check for triple-quoted string: """..."""
403        if self.pos + 1 < self.bytes.len()
404            && self.bytes[self.pos] == b'"'
405            && self.bytes[self.pos + 1] == b'"'
406        {
407            self.advance(); // skip second "
408            self.advance(); // skip third "
409            return self.read_triple_string(start, line, col);
410        }
411
412        // Single-quoted string: "..."
413        let content_start = self.pos;
414        while self.pos < self.bytes.len() && self.bytes[self.pos] != b'"' {
415            if self.bytes[self.pos] == b'\n' {
416                return Err(self.error_at(line, col, start, "unterminated string literal"));
417            }
418            self.pos += 1;
419            self.col += 1;
420        }
421        if self.pos >= self.bytes.len() {
422            return Err(self.error_at(line, col, start, "unterminated string literal"));
423        }
424        let s = self.source[content_start..self.pos].to_string();
425        self.advance(); // skip closing "
426        Ok(Token {
427            kind: TokenKind::String(s),
428            line,
429            column: col,
430            offset: start,
431            len: self.pos - start,
432        })
433    }
434
435    fn read_triple_string(
436        &mut self,
437        start: usize,
438        line: usize,
439        col: usize,
440    ) -> Result<Token, ParseError> {
441        let content_start = self.pos;
442        loop {
443            if self.pos >= self.bytes.len() {
444                return Err(self.error_at(line, col, start, "unterminated triple-quoted string"));
445            }
446            if self.pos + 2 < self.bytes.len()
447                && self.bytes[self.pos] == b'"'
448                && self.bytes[self.pos + 1] == b'"'
449                && self.bytes[self.pos + 2] == b'"'
450            {
451                let s = self.source[content_start..self.pos].to_string();
452                self.advance(); // skip first closing "
453                self.advance(); // skip second closing "
454                self.advance(); // skip third closing "
455                return Ok(Token {
456                    kind: TokenKind::String(s),
457                    line,
458                    column: col,
459                    offset: start,
460                    len: self.pos - start,
461                });
462            }
463            if self.bytes[self.pos] == b'\n' {
464                self.line += 1;
465                self.col = 1;
466                self.pos += 1;
467            } else {
468                self.pos += 1;
469                self.col += 1;
470            }
471        }
472    }
473
474    fn read_number(&mut self, start: usize, line: usize, col: usize) -> Result<Token, ParseError> {
475        while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
476            self.pos += 1;
477            self.col += 1;
478        }
479        // Check for decimal point (but not ..)
480        if self.pos + 1 < self.bytes.len()
481            && self.bytes[self.pos] == b'.'
482            && self.bytes[self.pos + 1] != b'.'
483            && self.bytes[self.pos + 1].is_ascii_digit()
484        {
485            self.pos += 1;
486            self.col += 1;
487            while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
488                self.pos += 1;
489                self.col += 1;
490            }
491        }
492        let num_str = &self.source[start..self.pos];
493        let val: f64 = num_str.parse().map_err(|_| {
494            self.error_at(line, col, start, &format!("invalid number '{}'", num_str))
495        })?;
496        Ok(Token {
497            kind: TokenKind::Number(val),
498            line,
499            column: col,
500            offset: start,
501            len: self.pos - start,
502        })
503    }
504
505    fn read_ident(&mut self, start: usize, line: usize, col: usize) -> Result<Token, ParseError> {
506        while self.pos < self.bytes.len()
507            && (self.bytes[self.pos].is_ascii_alphanumeric() || self.bytes[self.pos] == b'_')
508        {
509            self.pos += 1;
510            self.col += 1;
511        }
512        let word = &self.source[start..self.pos];
513        let kind = match word {
514            "pattern" => TokenKind::Pattern,
515            "stage" => TokenKind::Stage,
516            "unless" => TokenKind::Unless,
517            "between" => TokenKind::Between,
518            "after" => TokenKind::After,
519            "graph" => TokenKind::Graph,
520            "now" => TokenKind::Now,
521            "temporal" => TokenKind::Temporal,
522            "true" => TokenKind::True,
523            "false" => TokenKind::False,
524            "compose" => TokenKind::Compose,
525            "sharing" => TokenKind::Sharing,
526            "concurrent" => TokenKind::Concurrent,
527            _ => TokenKind::Ident(word.to_string()),
528        };
529        Ok(Token {
530            kind,
531            line,
532            column: col,
533            offset: start,
534            len: self.pos - start,
535        })
536    }
537
538    fn error_at(&self, line: usize, col: usize, offset: usize, msg: &str) -> ParseError {
539        ParseError {
540            line,
541            column: col,
542            span: (offset, self.pos.max(offset + 1)),
543            message: msg.to_string(),
544        }
545    }
546}
547
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and returns just the token kinds; panics on a lex error.
    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::new(src)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|tok| tok.kind)
            .collect()
    }

    /// Shorthand for an identifier token kind.
    fn ident(name: &str) -> TokenKind {
        TokenKind::Ident(name.to_string())
    }

    /// Shorthand for a string-literal token kind.
    fn string(text: &str) -> TokenKind {
        TokenKind::String(text.to_string())
    }

    #[test]
    fn tokenize_simple_pattern() {
        let got = kinds(r#"pattern test { stage e1 { e1.eventType = "enter" } }"#);
        assert_eq!(got[0], TokenKind::Pattern);
        assert_eq!(got[1], ident("test"));
        assert_eq!(got[2], TokenKind::LBrace);
        assert_eq!(got[3], TokenKind::Stage);
    }

    #[test]
    fn tokenize_graph() {
        let got = kinds(r#"graph { @1 ev.type = "enter" @2..5 ev2.type = "siege" }"#);
        assert_eq!(got[0], TokenKind::Graph);
        assert_eq!(got[2], TokenKind::At);
        assert_eq!(got[3], TokenKind::Number(1.0));
    }

    #[test]
    fn tokenize_comments() {
        let got = kinds("// this is a comment\npattern test {}");
        assert_eq!(got[0], TokenKind::Pattern);
    }

    #[test]
    fn tokenize_arrow_and_question() {
        let got = kinds("e1.actor -> ?guest");
        let want = [
            ident("e1"),
            TokenKind::Dot,
            ident("actor"),
            TokenKind::Arrow,
            TokenKind::Question,
            ident("guest"),
        ];
        for (idx, expected) in want.iter().enumerate() {
            assert_eq!(&got[idx], expected);
        }
    }

    #[test]
    fn tokenize_new_symbols() {
        let got = kinds("+ - : ;");
        assert_eq!(got[0], TokenKind::Plus);
        assert_eq!(got[1], TokenKind::Minus);
        assert_eq!(got[2], TokenKind::Colon);
        assert_eq!(got[3], TokenKind::Semicolon);
    }

    #[test]
    fn tokenize_minus_not_folded_into_number() {
        let got = kinds("-5");
        assert_eq!(got[0], TokenKind::Minus);
        assert_eq!(got[1], TokenKind::Number(5.0));
    }

    #[test]
    fn tokenize_arrow_still_works() {
        let got = kinds("-> -5");
        assert_eq!(got[0], TokenKind::Arrow);
        assert_eq!(got[1], TokenKind::Minus);
        assert_eq!(got[2], TokenKind::Number(5.0));
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        let src = r#""""hello
world""""#;
        let got = kinds(src);
        assert_eq!(got[0], string("hello\nworld"));
    }

    #[test]
    fn tokenize_triple_quoted_empty() {
        let src = "\"\"\"\"\"\""; // 6 quote chars: """  """
        let got = kinds(src);
        assert_eq!(got[0], TokenKind::String(String::new()));
    }

    #[test]
    fn tokenize_triple_quoted_with_single_quotes_inside() {
        let got = kinds(r#""""say "hello" to them""""#);
        assert_eq!(got[0], string(r#"say "hello" to them"#));
    }

    #[test]
    fn tokenize_triple_quoted_double_quotes_inside() {
        let got = kinds(r#""""has ""two"" inside""""#);
        assert_eq!(got[0], string(r#"has ""two"" inside"#));
    }

    #[test]
    fn tokenize_salience_style() {
        // lifecycle: oneshot, priority: normal, adjust ?e2.depth + 1
        let got = kinds(r#"lifecycle: oneshot; priority: normal; adjust ?e2.depth + 1"#);
        assert_eq!(got[0], ident("lifecycle"));
        assert_eq!(got[1], TokenKind::Colon);
        assert_eq!(got[2], ident("oneshot"));
        assert_eq!(got[3], TokenKind::Semicolon);
        assert_eq!(got[13], TokenKind::Plus);
        assert_eq!(got[14], TokenKind::Number(1.0));
    }
}