// repl_core/dsl/lexer.rs
1//! Lexer for the Symbiont DSL
2//!
3//! Converts raw text input into a stream of tokens for parsing.
4
5use crate::error::{ReplError, Result};
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
/// Token types recognized by the lexer.
///
/// Literal variants carry their parsed payload. `Duration` and `Size`
/// carry the numeric value together with the raw unit string as written
/// in the source (e.g. `(30, "s")`, `(512, "MB")`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum TokenType {
    // Literals
    String(String),
    Number(f64),   // floating-point literal (source contained a '.')
    Integer(i64),  // integer literal (no '.')
    // NOTE(review): `true`/`false` lex as `Keyword(True)`/`Keyword(False)`,
    // so the lexer itself never emits this variant — confirm parser usage.
    Boolean(bool),
    Duration(u64, String), // value, unit ("s" | "m" | "h" | "d" | "ms")
    Size(u64, String),     // value, unit ("B" | "KB" | "MB" | "GB" | "TB")

    // Identifiers and keywords
    Identifier(String),
    Keyword(Keyword),

    // Operators
    Plus,
    Minus,
    Multiply,
    Divide,
    Modulo,
    Equal,
    NotEqual,
    LessThan,
    LessThanOrEqual,
    GreaterThan,
    GreaterThanOrEqual,
    And,
    Or,
    Not,
    BitwiseAnd,
    BitwiseOr,
    BitwiseXor,
    BitwiseNot,
    LeftShift,
    RightShift,
    Assign,
    Question,

    // Delimiters
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Semicolon,
    Colon,
    Dot,
    Arrow,    // ->
    FatArrow, // =>

    // Special
    Newline,         // '\n' is significant and tokenized, not skipped
    Eof,             // always the final token of a `tokenize()` stream
    Comment(String), // comment text without the `//` or `/* */` delimiters
}
67
/// Keywords in the DSL.
///
/// Lookup is case-sensitive: most keywords are lowercase in source, but
/// the sandbox tiers are written capitalized (`Tier1`..`Tier4`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Keyword {
    // Top-level declarations
    Agent,
    Behavior,
    Function,
    Struct,
    // Statements and control flow
    Let,
    If,
    Else,
    Match,
    For,
    While,
    Try,
    Catch,
    Return,
    Emit,
    Require,
    Check,
    On,
    In,
    Invoke,
    // Literal-like keywords
    True,
    False,
    Null,
    // Capability / policy vocabulary
    Capability,
    Capabilities,
    Policy,
    Has,
    // Agent metadata fields
    Name,
    Version,
    Author,
    Description,
    Resources,
    Security,
    Policies,
    Input,
    Output,
    Steps,
    // Resource kinds
    Memory,
    Cpu,
    Network,
    Storage,
    // Security configuration
    Tier,
    Sandbox,
    Allow,
    Strict,
    Moderate,
    Permissive,
    // Failure handling
    Timeout,
    Retry,
    Failure,
    Terminate,
    Restart,
    Escalate,
    Ignore,
    // Sandbox tiers (capitalized in source)
    Tier1,
    Tier2,
    Tier3,
    Tier4,
}
129
/// Token with location information.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Token {
    /// The kind of token, including any literal payload.
    pub token_type: TokenType,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column at which the token starts.
    pub column: usize,
    /// Character (not byte) offset of the token start in the input.
    pub offset: usize,
    /// Token length in characters (0 for `Eof`).
    pub length: usize,
}
139
/// Lexer for the Symbiont DSL.
pub struct Lexer {
    /// Input decomposed into `char`s for index-based access.
    input: Vec<char>,
    /// Current index into `input`, in characters.
    position: usize,
    /// Current 1-based line number.
    line: usize,
    /// Current 1-based column number.
    column: usize,
    /// Reserved-word lookup table, built once in `Lexer::new`.
    keywords: HashMap<String, Keyword>,
}
148
149impl Lexer {
150    /// Create a new lexer for the given input
151    pub fn new(input: &str) -> Self {
152        let mut keywords = HashMap::new();
153
154        // Insert all keywords
155        keywords.insert("agent".to_string(), Keyword::Agent);
156        keywords.insert("behavior".to_string(), Keyword::Behavior);
157        keywords.insert("function".to_string(), Keyword::Function);
158        keywords.insert("struct".to_string(), Keyword::Struct);
159        keywords.insert("let".to_string(), Keyword::Let);
160        keywords.insert("if".to_string(), Keyword::If);
161        keywords.insert("else".to_string(), Keyword::Else);
162        keywords.insert("match".to_string(), Keyword::Match);
163        keywords.insert("for".to_string(), Keyword::For);
164        keywords.insert("while".to_string(), Keyword::While);
165        keywords.insert("try".to_string(), Keyword::Try);
166        keywords.insert("catch".to_string(), Keyword::Catch);
167        keywords.insert("return".to_string(), Keyword::Return);
168        keywords.insert("emit".to_string(), Keyword::Emit);
169        keywords.insert("require".to_string(), Keyword::Require);
170        keywords.insert("check".to_string(), Keyword::Check);
171        keywords.insert("on".to_string(), Keyword::On);
172        keywords.insert("in".to_string(), Keyword::In);
173        keywords.insert("invoke".to_string(), Keyword::Invoke);
174        keywords.insert("true".to_string(), Keyword::True);
175        keywords.insert("false".to_string(), Keyword::False);
176        keywords.insert("null".to_string(), Keyword::Null);
177        keywords.insert("capability".to_string(), Keyword::Capability);
178        keywords.insert("capabilities".to_string(), Keyword::Capabilities);
179        keywords.insert("policy".to_string(), Keyword::Policy);
180        keywords.insert("has".to_string(), Keyword::Has);
181        keywords.insert("name".to_string(), Keyword::Name);
182        keywords.insert("version".to_string(), Keyword::Version);
183        keywords.insert("author".to_string(), Keyword::Author);
184        keywords.insert("description".to_string(), Keyword::Description);
185        keywords.insert("resources".to_string(), Keyword::Resources);
186        keywords.insert("security".to_string(), Keyword::Security);
187        keywords.insert("policies".to_string(), Keyword::Policies);
188        keywords.insert("input".to_string(), Keyword::Input);
189        keywords.insert("output".to_string(), Keyword::Output);
190        keywords.insert("steps".to_string(), Keyword::Steps);
191        keywords.insert("memory".to_string(), Keyword::Memory);
192        keywords.insert("cpu".to_string(), Keyword::Cpu);
193        keywords.insert("network".to_string(), Keyword::Network);
194        keywords.insert("storage".to_string(), Keyword::Storage);
195        keywords.insert("tier".to_string(), Keyword::Tier);
196        keywords.insert("sandbox".to_string(), Keyword::Sandbox);
197        keywords.insert("allow".to_string(), Keyword::Allow);
198        keywords.insert("strict".to_string(), Keyword::Strict);
199        keywords.insert("moderate".to_string(), Keyword::Moderate);
200        keywords.insert("permissive".to_string(), Keyword::Permissive);
201        keywords.insert("timeout".to_string(), Keyword::Timeout);
202        keywords.insert("retry".to_string(), Keyword::Retry);
203        keywords.insert("failure".to_string(), Keyword::Failure);
204        keywords.insert("terminate".to_string(), Keyword::Terminate);
205        keywords.insert("restart".to_string(), Keyword::Restart);
206        keywords.insert("escalate".to_string(), Keyword::Escalate);
207        keywords.insert("ignore".to_string(), Keyword::Ignore);
208        keywords.insert("Tier1".to_string(), Keyword::Tier1);
209        keywords.insert("Tier2".to_string(), Keyword::Tier2);
210        keywords.insert("Tier3".to_string(), Keyword::Tier3);
211        keywords.insert("Tier4".to_string(), Keyword::Tier4);
212
213        Self {
214            input: input.chars().collect(),
215            position: 0,
216            line: 1,
217            column: 1,
218            keywords,
219        }
220    }
221
222    /// Tokenize the entire input
223    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
224        let mut tokens = Vec::new();
225
226        loop {
227            let token = self.next_token()?;
228            let is_eof = matches!(token.token_type, TokenType::Eof);
229            tokens.push(token);
230
231            if is_eof {
232                break;
233            }
234        }
235
236        Ok(tokens)
237    }
238
239    /// Get the next token
240    pub fn next_token(&mut self) -> Result<Token> {
241        self.skip_whitespace();
242
243        let start_line = self.line;
244        let start_column = self.column;
245        let start_offset = self.position;
246
247        if self.position >= self.input.len() {
248            return Ok(Token {
249                token_type: TokenType::Eof,
250                line: start_line,
251                column: start_column,
252                offset: start_offset,
253                length: 0,
254            });
255        }
256
257        let ch = self.current_char();
258
259        let token_type = match ch {
260            // Comments
261            '/' if self.peek_char() == Some('/') => {
262                let comment = self.read_line_comment();
263                TokenType::Comment(comment)
264            }
265            '/' if self.peek_char() == Some('*') => {
266                let comment = self.read_block_comment()?;
267                TokenType::Comment(comment)
268            }
269
270            // String literals
271            '"' => {
272                let string = self.read_string()?;
273                TokenType::String(string)
274            }
275
276            // Numbers
277            c if c.is_ascii_digit() => self.read_number()?,
278
279            // Identifiers and keywords
280            c if c.is_alphabetic() || c == '_' => {
281                let identifier = self.read_identifier();
282                if let Some(keyword) = self.keywords.get(&identifier) {
283                    TokenType::Keyword(keyword.clone())
284                } else {
285                    TokenType::Identifier(identifier)
286                }
287            }
288
289            // Operators and punctuation
290            '+' => {
291                self.advance();
292                TokenType::Plus
293            }
294            '-' if self.peek_char() == Some('>') => {
295                self.advance(); // -
296                self.advance(); // >
297                TokenType::Arrow
298            }
299            '-' => {
300                self.advance();
301                TokenType::Minus
302            }
303            '*' => {
304                self.advance();
305                TokenType::Multiply
306            }
307            '/' => {
308                self.advance();
309                TokenType::Divide
310            }
311            '%' => {
312                self.advance();
313                TokenType::Modulo
314            }
315            '=' if self.peek_char() == Some('=') => {
316                self.advance(); // =
317                self.advance(); // =
318                TokenType::Equal
319            }
320            '=' if self.peek_char() == Some('>') => {
321                self.advance(); // =
322                self.advance(); // >
323                TokenType::FatArrow
324            }
325            '=' => {
326                self.advance();
327                TokenType::Assign
328            }
329            '!' if self.peek_char() == Some('=') => {
330                self.advance(); // !
331                self.advance(); // =
332                TokenType::NotEqual
333            }
334            '!' => {
335                self.advance();
336                TokenType::Not
337            }
338            '<' if self.peek_char() == Some('=') => {
339                self.advance(); // <
340                self.advance(); // =
341                TokenType::LessThanOrEqual
342            }
343            '<' if self.peek_char() == Some('<') => {
344                self.advance(); // <
345                self.advance(); // <
346                TokenType::LeftShift
347            }
348            '<' => {
349                self.advance();
350                TokenType::LessThan
351            }
352            '>' if self.peek_char() == Some('=') => {
353                self.advance(); // >
354                self.advance(); // =
355                TokenType::GreaterThanOrEqual
356            }
357            '>' if self.peek_char() == Some('>') => {
358                self.advance(); // >
359                self.advance(); // >
360                TokenType::RightShift
361            }
362            '>' => {
363                self.advance();
364                TokenType::GreaterThan
365            }
366            '&' if self.peek_char() == Some('&') => {
367                self.advance(); // &
368                self.advance(); // &
369                TokenType::And
370            }
371            '&' => {
372                self.advance();
373                TokenType::BitwiseAnd
374            }
375            '|' if self.peek_char() == Some('|') => {
376                self.advance(); // |
377                self.advance(); // |
378                TokenType::Or
379            }
380            '|' => {
381                self.advance();
382                TokenType::BitwiseOr
383            }
384            '^' => {
385                self.advance();
386                TokenType::BitwiseXor
387            }
388            '~' => {
389                self.advance();
390                TokenType::BitwiseNot
391            }
392            '?' => {
393                self.advance();
394                TokenType::Question
395            }
396
397            // Delimiters
398            '(' => {
399                self.advance();
400                TokenType::LeftParen
401            }
402            ')' => {
403                self.advance();
404                TokenType::RightParen
405            }
406            '{' => {
407                self.advance();
408                TokenType::LeftBrace
409            }
410            '}' => {
411                self.advance();
412                TokenType::RightBrace
413            }
414            '[' => {
415                self.advance();
416                TokenType::LeftBracket
417            }
418            ']' => {
419                self.advance();
420                TokenType::RightBracket
421            }
422            ',' => {
423                self.advance();
424                TokenType::Comma
425            }
426            ';' => {
427                self.advance();
428                TokenType::Semicolon
429            }
430            ':' => {
431                self.advance();
432                TokenType::Colon
433            }
434            '.' => {
435                self.advance();
436                TokenType::Dot
437            }
438
439            // Newlines
440            '\n' => {
441                self.advance();
442                self.line += 1;
443                self.column = 1;
444                TokenType::Newline
445            }
446
447            // Unexpected character
448            _ => {
449                return Err(ReplError::Lexing(format!(
450                    "Unexpected character '{}' at line {}, column {}",
451                    ch, self.line, self.column
452                )));
453            }
454        };
455
456        let length = self.position - start_offset;
457
458        Ok(Token {
459            token_type,
460            line: start_line,
461            column: start_column,
462            offset: start_offset,
463            length,
464        })
465    }
466
467    /// Skip whitespace characters
468    fn skip_whitespace(&mut self) {
469        while let Some(ch) = self.current_char_opt() {
470            if ch.is_whitespace() && ch != '\n' {
471                self.advance();
472            } else {
473                break;
474            }
475        }
476    }
477
478    /// Read a string literal
479    fn read_string(&mut self) -> Result<String> {
480        self.advance(); // Skip opening quote
481        let mut string = String::new();
482
483        while let Some(ch) = self.current_char_opt() {
484            match ch {
485                '"' => {
486                    self.advance(); // Skip closing quote
487                    return Ok(string);
488                }
489                '\\' => {
490                    self.advance(); // Skip backslash
491                    if let Some(escaped) = self.current_char_opt() {
492                        match escaped {
493                            'n' => string.push('\n'),
494                            't' => string.push('\t'),
495                            'r' => string.push('\r'),
496                            '\\' => string.push('\\'),
497                            '"' => string.push('"'),
498                            _ => {
499                                string.push('\\');
500                                string.push(escaped);
501                            }
502                        }
503                        self.advance();
504                    } else {
505                        return Err(ReplError::Lexing("Unterminated string literal".to_string()));
506                    }
507                }
508                '\n' => {
509                    self.line += 1;
510                    self.column = 1;
511                    string.push(ch);
512                    self.advance();
513                }
514                _ => {
515                    string.push(ch);
516                    self.advance();
517                }
518            }
519        }
520
521        Err(ReplError::Lexing("Unterminated string literal".to_string()))
522    }
523
524    /// Read a number (integer or float) with optional units
525    fn read_number(&mut self) -> Result<TokenType> {
526        let mut number_str = String::new();
527        let mut has_dot = false;
528
529        // Read digits and optional decimal point
530        while let Some(ch) = self.current_char_opt() {
531            if ch.is_ascii_digit() {
532                number_str.push(ch);
533                self.advance();
534            } else if ch == '.' && !has_dot {
535                has_dot = true;
536                number_str.push(ch);
537                self.advance();
538            } else {
539                break;
540            }
541        }
542
543        // Check for units (duration or size)
544        if let Some(ch) = self.current_char_opt() {
545            if ch.is_alphabetic() {
546                let unit = self.read_unit();
547                let value = if has_dot {
548                    number_str
549                        .parse::<f64>()
550                        .map_err(|_| ReplError::Lexing(format!("Invalid number: {}", number_str)))?
551                        as u64
552                } else {
553                    number_str
554                        .parse::<u64>()
555                        .map_err(|_| ReplError::Lexing(format!("Invalid number: {}", number_str)))?
556                };
557
558                // Determine if it's a duration or size unit
559                if matches!(unit.as_str(), "s" | "m" | "h" | "d" | "ms") {
560                    return Ok(TokenType::Duration(value, unit));
561                } else if matches!(unit.as_str(), "B" | "KB" | "MB" | "GB" | "TB") {
562                    return Ok(TokenType::Size(value, unit));
563                }
564            }
565        }
566
567        // Parse as regular number
568        if has_dot {
569            let value = number_str
570                .parse::<f64>()
571                .map_err(|_| ReplError::Lexing(format!("Invalid number: {}", number_str)))?;
572            Ok(TokenType::Number(value))
573        } else {
574            let value = number_str
575                .parse::<i64>()
576                .map_err(|_| ReplError::Lexing(format!("Invalid number: {}", number_str)))?;
577            Ok(TokenType::Integer(value))
578        }
579    }
580
581    /// Read a unit suffix
582    fn read_unit(&mut self) -> String {
583        let mut unit = String::new();
584        while let Some(ch) = self.current_char_opt() {
585            if ch.is_alphabetic() {
586                unit.push(ch);
587                self.advance();
588            } else {
589                break;
590            }
591        }
592        unit
593    }
594
595    /// Read an identifier
596    fn read_identifier(&mut self) -> String {
597        let mut identifier = String::new();
598
599        while let Some(ch) = self.current_char_opt() {
600            if ch.is_alphanumeric() || ch == '_' {
601                identifier.push(ch);
602                self.advance();
603            } else {
604                break;
605            }
606        }
607
608        identifier
609    }
610
611    /// Read a line comment
612    fn read_line_comment(&mut self) -> String {
613        self.advance(); // /
614        self.advance(); // /
615
616        let mut comment = String::new();
617        while let Some(ch) = self.current_char_opt() {
618            if ch == '\n' {
619                break;
620            }
621            comment.push(ch);
622            self.advance();
623        }
624
625        comment
626    }
627
628    /// Read a block comment
629    fn read_block_comment(&mut self) -> Result<String> {
630        self.advance(); // /
631        self.advance(); // *
632
633        let mut comment = String::new();
634
635        while self.position < self.input.len() - 1 {
636            let ch = self.current_char();
637            let next_ch = self.peek_char();
638
639            if ch == '*' && next_ch == Some('/') {
640                self.advance(); // *
641                self.advance(); // /
642                return Ok(comment);
643            }
644
645            if ch == '\n' {
646                self.line += 1;
647                self.column = 1;
648            }
649
650            comment.push(ch);
651            self.advance();
652        }
653
654        Err(ReplError::Lexing("Unterminated block comment".to_string()))
655    }
656
657    /// Get the current character
658    fn current_char(&self) -> char {
659        self.input[self.position]
660    }
661
662    /// Get the current character as an option
663    fn current_char_opt(&self) -> Option<char> {
664        self.input.get(self.position).copied()
665    }
666
667    /// Peek at the next character
668    fn peek_char(&self) -> Option<char> {
669        self.input.get(self.position + 1).copied()
670    }
671
672    /// Advance to the next character
673    fn advance(&mut self) {
674        if self.position < self.input.len() {
675            self.position += 1;
676            self.column += 1;
677        }
678    }
679}
680
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `src` to completion; panics if the lexer reports an error.
    fn lex(src: &str) -> Vec<Token> {
        let mut lexer = Lexer::new(src);
        lexer.tokenize().unwrap()
    }

    #[test]
    fn test_basic_tokens() {
        let toks = lex("let x = 42");

        // Expected stream: let, x, =, 42, EOF.
        assert_eq!(toks.len(), 5);
        assert!(matches!(
            toks[0].token_type,
            TokenType::Keyword(Keyword::Let)
        ));
        assert!(matches!(toks[1].token_type, TokenType::Identifier(_)));
        assert!(matches!(toks[2].token_type, TokenType::Assign));
        assert!(matches!(toks[3].token_type, TokenType::Integer(42)));
        assert!(matches!(toks[4].token_type, TokenType::Eof));
    }

    #[test]
    fn test_string_literal() {
        let toks = lex(r#""Hello, world!""#);

        // Expected stream: string, EOF.
        assert_eq!(toks.len(), 2);
        assert!(matches!(toks[0].token_type, TokenType::String(ref s) if s == "Hello, world!"));
    }

    #[test]
    fn test_duration_literal() {
        let toks = lex("30s 5m 2h");

        // Expected stream: three duration literals, then EOF.
        assert_eq!(toks.len(), 4);
        let expected = [(30u64, "s"), (5, "m"), (2, "h")];
        for (tok, (value, unit)) in toks.iter().zip(expected) {
            assert!(
                matches!(tok.token_type, TokenType::Duration(v, ref u) if v == value && u == unit)
            );
        }
    }

    #[test]
    fn test_size_literal() {
        let toks = lex("1KB 512MB 2GB");

        // Expected stream: three size literals, then EOF.
        assert_eq!(toks.len(), 4);
        let expected = [(1u64, "KB"), (512, "MB"), (2, "GB")];
        for (tok, (value, unit)) in toks.iter().zip(expected) {
            assert!(
                matches!(tok.token_type, TokenType::Size(v, ref u) if v == value && u == unit)
            );
        }
    }

    #[test]
    fn test_comments() {
        let toks = lex("// line comment\n/* block comment */");

        // Expected stream: line comment, newline, block comment, EOF.
        assert_eq!(toks.len(), 4);
        assert!(matches!(toks[0].token_type, TokenType::Comment(_)));
        assert!(matches!(toks[1].token_type, TokenType::Newline));
        assert!(matches!(toks[2].token_type, TokenType::Comment(_)));
    }
}