// arc_lang/lexer.rs

//! Arc lexer — tokenizes `.arc` source into a flat stream of tokens.
//! Designed for maximum error recovery: never panics, always produces tokens.
3
/// The kind of a lexed token, including any text payload it carries.
///
/// Keyword variants are produced only for the exact lowercase canonical
/// spellings (`service`, `db`, ...); every other word is an [`TokenKind::Ident`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    // Keywords / types
    /// `service`
    Service,
    /// `db`
    Db,
    /// `cache`
    Cache,
    /// `queue`
    Queue,
    /// `gateway`
    Gateway,
    /// `user`
    User,
    /// `store`
    Store,
    /// `fn`
    Fn,
    /// `worker`
    Worker,
    /// `external`
    External,
    /// `group`
    Group,
    /// `include`
    Include,

    // Directives
    /// `@`
    At,

    // Arrows
    /// `->`
    Arrow,
    /// `-->`
    DashedArrow,
    /// `<->`
    BiArrow,
    /// `-x`
    BlockArrow,

    // Delimiters
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `:`
    Colon,
    /// `,`
    Comma,

    // Literals
    /// A double-quoted string; the payload is the decoded contents without
    /// the surrounding quotes.
    QuotedString(String),
    /// Any word that is not one of the canonical keywords.
    Ident(String),

    // Structure
    /// A line break.
    Newline,
    /// A `#` comment; the payload is the text after `#`, trimmed.
    Comment(String),

    // Error recovery
    /// A character the lexer does not recognise; kept so later stages can
    /// report it instead of the lexer failing.
    Unknown(char),
}
34
35#[derive(Debug, Clone)]
36pub struct Token {
37    pub kind: TokenKind,
38    pub line: usize,
39    pub col: usize,
40    pub len: usize,
41}
42
43pub fn tokenize(input: &str) -> Vec<Token> {
44    let mut tokens = Vec::new();
45    let chars: Vec<char> = input.chars().collect();
46    let mut pos = 0usize;
47    let mut line = 1usize;
48    let mut col = 1usize;
49
50    while pos < chars.len() {
51        let ch = chars[pos];
52
53        // Skip spaces and tabs (not newlines)
54        if ch == ' ' || ch == '\t' {
55            pos += 1;
56            col += 1;
57            continue;
58        }
59
60        // Newlines
61        if ch == '\n' {
62            tokens.push(Token { kind: TokenKind::Newline, line, col, len: 1 });
63            pos += 1;
64            line += 1;
65            col = 1;
66            continue;
67        }
68        if ch == '\r' {
69            pos += 1;
70            if pos < chars.len() && chars[pos] == '\n' {
71                pos += 1;
72            }
73            tokens.push(Token { kind: TokenKind::Newline, line, col, len: 1 });
74            line += 1;
75            col = 1;
76            continue;
77        }
78
79        // Comments
80        if ch == '#' {
81            let start_col = col;
82            let start = pos;
83            pos += 1;
84            col += 1;
85            while pos < chars.len() && chars[pos] != '\n' && chars[pos] != '\r' {
86                pos += 1;
87                col += 1;
88            }
89            let text: String = chars[start + 1..pos].iter().collect();
90            tokens.push(Token { kind: TokenKind::Comment(text.trim().to_string()), line, col: start_col, len: pos - start });
91            continue;
92        }
93
94        // Quoted strings
95        if ch == '"' {
96            let start_col = col;
97            let start = pos;
98            pos += 1;
99            col += 1;
100            let mut s = String::new();
101            while pos < chars.len() && chars[pos] != '"' && chars[pos] != '\n' {
102                if chars[pos] == '\\' && pos + 1 < chars.len() {
103                    match chars[pos + 1] {
104                        'n' => { s.push('\n'); pos += 2; col += 2; continue; }
105                        '"' => { s.push('"'); pos += 2; col += 2; continue; }
106                        '\\' => { s.push('\\'); pos += 2; col += 2; continue; }
107                        _ => {}
108                    }
109                }
110                s.push(chars[pos]);
111                pos += 1;
112                col += 1;
113            }
114            if pos < chars.len() && chars[pos] == '"' {
115                pos += 1;
116                col += 1;
117            }
118            // Forgiving: if string not closed, still produce token
119            tokens.push(Token { kind: TokenKind::QuotedString(s), line, col: start_col, len: pos - start });
120            continue;
121        }
122
123        // Arrows: ->, -->, <->, -x
124        if ch == '-' {
125            let start_col = col;
126            if pos + 1 < chars.len() && chars[pos + 1] == '>' {
127                tokens.push(Token { kind: TokenKind::Arrow, line, col: start_col, len: 2 });
128                pos += 2; col += 2;
129                continue;
130            }
131            if pos + 2 < chars.len() && chars[pos + 1] == '-' && chars[pos + 2] == '>' {
132                tokens.push(Token { kind: TokenKind::DashedArrow, line, col: start_col, len: 3 });
133                pos += 3; col += 3;
134                continue;
135            }
136            if pos + 1 < chars.len() && chars[pos + 1] == 'x' {
137                tokens.push(Token { kind: TokenKind::BlockArrow, line, col: start_col, len: 2 });
138                pos += 2; col += 2;
139                continue;
140            }
141            // Just a dash — part of an ident? Fall through to ident
142        }
143        if ch == '<' && pos + 2 < chars.len() && chars[pos + 1] == '-' && chars[pos + 2] == '>' {
144            let start_col = col;
145            tokens.push(Token { kind: TokenKind::BiArrow, line, col: start_col, len: 3 });
146            pos += 3; col += 3;
147            continue;
148        }
149
150        // Single char tokens
151        match ch {
152            '@' => { tokens.push(Token { kind: TokenKind::At, line, col, len: 1 }); pos += 1; col += 1; continue; }
153            '{' => { tokens.push(Token { kind: TokenKind::LBrace, line, col, len: 1 }); pos += 1; col += 1; continue; }
154            '}' => { tokens.push(Token { kind: TokenKind::RBrace, line, col, len: 1 }); pos += 1; col += 1; continue; }
155            '[' => { tokens.push(Token { kind: TokenKind::LBracket, line, col, len: 1 }); pos += 1; col += 1; continue; }
156            ']' => { tokens.push(Token { kind: TokenKind::RBracket, line, col, len: 1 }); pos += 1; col += 1; continue; }
157            '(' => { tokens.push(Token { kind: TokenKind::LParen, line, col, len: 1 }); pos += 1; col += 1; continue; }
158            ')' => { tokens.push(Token { kind: TokenKind::RParen, line, col, len: 1 }); pos += 1; col += 1; continue; }
159            ':' => { tokens.push(Token { kind: TokenKind::Colon, line, col, len: 1 }); pos += 1; col += 1; continue; }
160            ',' => { tokens.push(Token { kind: TokenKind::Comma, line, col, len: 1 }); pos += 1; col += 1; continue; }
161            _ => {}
162        }
163
164        // Identifiers and keywords
165        if ch.is_alphanumeric() || ch == '_' || ch == '-' {
166            let start_col = col;
167            let start = pos;
168            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_' || chars[pos] == '-') {
169                pos += 1;
170                col += 1;
171            }
172            let word: String = chars[start..pos].iter().collect();
173            let len = pos - start;
174            // Only match EXACT canonical type names as keywords.
175            // Aliases (svc, database, client, etc.) are handled by
176            // NodeType::from_str_fuzzy in the parser — NOT in the lexer.
177            // This prevents words like "Client" from being misidentified.
178            let kind = match word.as_str() {
179                "service" => TokenKind::Service,
180                "db" => TokenKind::Db,
181                "cache" => TokenKind::Cache,
182                "queue" => TokenKind::Queue,
183                "gateway" => TokenKind::Gateway,
184                "user" => TokenKind::User,
185                "store" => TokenKind::Store,
186                "fn" => TokenKind::Fn,
187                "worker" => TokenKind::Worker,
188                "external" => TokenKind::External,
189                "group" => TokenKind::Group,
190                "include" => TokenKind::Include,
191                _ => TokenKind::Ident(word),
192            };
193            tokens.push(Token { kind, line, col: start_col, len });
194            continue;
195        }
196
197        // Unknown character — error recovery: skip it
198        tokens.push(Token { kind: TokenKind::Unknown(ch), line, col, len: 1 });
199        pos += 1;
200        col += 1;
201    }
202
203    tokens
204}
205
206impl TokenKind {
207    pub fn is_node_type(&self) -> bool {
208        matches!(self,
209            TokenKind::Service | TokenKind::Db | TokenKind::Cache |
210            TokenKind::Queue | TokenKind::Gateway | TokenKind::User |
211            TokenKind::Store | TokenKind::Fn | TokenKind::Worker |
212            TokenKind::External
213        )
214    }
215
216    pub fn to_node_type(&self) -> Option<crate::ast::NodeType> {
217        match self {
218            TokenKind::Service => Some(crate::ast::NodeType::Service),
219            TokenKind::Db => Some(crate::ast::NodeType::Db),
220            TokenKind::Cache => Some(crate::ast::NodeType::Cache),
221            TokenKind::Queue => Some(crate::ast::NodeType::Queue),
222            TokenKind::Gateway => Some(crate::ast::NodeType::Gateway),
223            TokenKind::User => Some(crate::ast::NodeType::User),
224            TokenKind::Store => Some(crate::ast::NodeType::Store),
225            TokenKind::Fn => Some(crate::ast::NodeType::Fn),
226            TokenKind::Worker => Some(crate::ast::NodeType::Worker),
227            TokenKind::External => Some(crate::ast::NodeType::External),
228            _ => None,
229        }
230    }
231
232    pub fn is_arrow(&self) -> bool {
233        matches!(self,
234            TokenKind::Arrow | TokenKind::DashedArrow |
235            TokenKind::BiArrow | TokenKind::BlockArrow
236        )
237    }
238
239    pub fn to_arrow_kind(&self) -> Option<crate::ast::ArrowKind> {
240        match self {
241            TokenKind::Arrow => Some(crate::ast::ArrowKind::Solid),
242            TokenKind::DashedArrow => Some(crate::ast::ArrowKind::Dashed),
243            TokenKind::BiArrow => Some(crate::ast::ArrowKind::Bidirectional),
244            TokenKind::BlockArrow => Some(crate::ast::ArrowKind::Blocked),
245            _ => None,
246        }
247    }
248}