1use thiserror::Error;
2
3use crate::number::{ExactNum, ParseNumError};
4
5#[derive(Debug, Clone)]
6pub enum TokenKind {
7 Let,
8 Yield,
9 Null,
10 True,
11 False,
12 Bytes,
13 Seq,
14 Set,
15 Bag,
16 Map,
17 Prod,
18 BagKV,
19 Some,
20 None,
21 Ok,
22 Fail,
23 Union,
24 Inter,
25 Diff,
26 BUnion,
27 BDiff,
28 In,
29 And,
30 Or,
31 Not,
32 Arrow,
33 FatArrow,
34 Pipe,
35 Eq,
36 Neq,
37 Lt,
38 Le,
39 Gt,
40 Ge,
41 Plus,
42 Minus,
43 Star,
44 Slash,
45 Concat,
46 LParen,
47 RParen,
48 LBrack,
49 RBrack,
50 LBrace,
51 RBrace,
52 QMark,
53 Bang,
54 Colon,
55 Comma,
56 Semi,
57 Bar,
58 SelL,
59 SelR,
60 Ident(String),
61 Str(String),
62 Num(ExactNum),
63 Placeholder,
64 Eof,
65}
66
67impl PartialEq for TokenKind {
68 fn eq(&self, other: &Self) -> bool {
69 match (self, other) {
70 (TokenKind::Num(a), TokenKind::Num(b)) => a == b,
71 (TokenKind::Ident(a), TokenKind::Ident(b)) => a == b,
72 (TokenKind::Str(a), TokenKind::Str(b)) => a == b,
73 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
74 }
75 }
76}
77
/// A single lexical token together with where it started in the input.
#[derive(Debug, Clone)]
pub struct Token {
    /// What was recognised, including any payload (identifier text, string
    /// contents, parsed number).
    pub kind: TokenKind,
    /// Index of the token's first character. `lex` counts in `char`s of the
    /// source (it indexes a `Vec<char>`), not in bytes; the trailing `Eof`
    /// token carries the total number of characters.
    pub pos: usize,
}
83
/// Errors reported by `lex`.
///
/// Every position stored here is a character index into the source, using the
/// same convention as `Token::pos`.
#[derive(Debug, Error)]
pub enum LexError {
    /// A character that cannot start any token; carries the character and its
    /// position.
    #[error("Unexpected character '{0}' at position {1}")]
    UnexpectedChar(char, usize),
    /// A string literal that reached end of input before its closing `"`
    /// (including input that ends right after a backslash); carries the
    /// position of the opening quote.
    #[error("Unterminated string at position {0}")]
    UnterminatedString(usize),
    /// Text that was scanned as a numeric literal but rejected by
    /// `ExactNum::parse_literal`.
    #[error("Invalid numeric literal '{literal}' at position {pos}: {source}")]
    InvalidNumber {
        /// The exact source text that was handed to the number parser.
        literal: String,
        /// Position of the literal's first character.
        pos: usize,
        /// The underlying parse failure.
        #[source]
        source: ParseNumError,
    },
}
98
99pub fn lex(src: &str) -> Result<Vec<Token>, LexError> {
100 let chars: Vec<char> = src.chars().collect();
101 let mut pos = 0;
102 let mut tokens = Vec::new();
103
104 while pos < chars.len() {
105 let start = pos;
106 let ch = chars[pos];
107
108 if ch.is_whitespace() {
109 pos += 1;
110 continue;
111 }
112
113 if ch == ';' && pos + 1 < chars.len() && chars[pos + 1] == ';' {
114 while pos < chars.len() && chars[pos] != '\n' {
115 pos += 1;
116 }
117 continue;
118 }
119
120 if ch.is_ascii_digit() || (ch == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit()) {
121 let num_start = pos;
122 while pos < chars.len() && (chars[pos].is_ascii_digit() || chars[pos] == '.') {
123 pos += 1;
124 }
125 if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
126 pos += 1;
127 if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
128 pos += 1;
129 }
130 while pos < chars.len() && chars[pos].is_ascii_digit() {
131 pos += 1;
132 }
133 }
134 let num_str: String = chars[num_start..pos].iter().collect();
135 let n = ExactNum::parse_literal(&num_str).map_err(|source| LexError::InvalidNumber {
136 literal: num_str.clone(),
137 pos: start,
138 source,
139 })?;
140 tokens.push(Token {
141 kind: TokenKind::Num(n),
142 pos: start,
143 });
144 continue;
145 }
146
147 if ch == '"' {
148 pos += 1;
149 let mut s = String::new();
150 while pos < chars.len() && chars[pos] != '"' {
151 if chars[pos] == '\\' {
152 pos += 1;
153 if pos >= chars.len() {
154 return Err(LexError::UnterminatedString(start));
155 }
156 match chars[pos] {
157 'n' => s.push('\n'),
158 't' => s.push('\t'),
159 'r' => s.push('\r'),
160 '"' => s.push('"'),
161 '\\' => s.push('\\'),
162 c => {
163 s.push('\\');
164 s.push(c);
165 }
166 }
167 } else {
168 s.push(chars[pos]);
169 }
170 pos += 1;
171 }
172 if pos >= chars.len() {
173 return Err(LexError::UnterminatedString(start));
174 }
175 pos += 1;
176 tokens.push(Token {
177 kind: TokenKind::Str(s),
178 pos: start,
179 });
180 continue;
181 }
182
183 if ch.is_alphabetic() || ch == '_' {
184 if ch == '_' {
185 let next = pos + 1;
186 if next >= chars.len() || (!chars[next].is_alphanumeric() && chars[next] != '_') {
187 pos += 1;
188 tokens.push(Token {
189 kind: TokenKind::Placeholder,
190 pos: start,
191 });
192 continue;
193 }
194 }
195 let id_start = pos;
196 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
197 pos += 1;
198 }
199 let ident: String = chars[id_start..pos].iter().collect();
200 let kind = match ident.to_ascii_lowercase().as_str() {
201 "let" => TokenKind::Let,
202 "yield" => TokenKind::Yield,
203 "null" => TokenKind::Null,
204 "true" => TokenKind::True,
205 "false" => TokenKind::False,
206 "bytes" => TokenKind::Bytes,
207 "seq" => TokenKind::Seq,
208 "set" => TokenKind::Set,
209 "bag" => TokenKind::Bag,
210 "map" => TokenKind::Map,
211 "prod" => TokenKind::Prod,
212 "bagkv" => TokenKind::BagKV,
213 "some" => TokenKind::Some,
214 "none" => TokenKind::None,
215 "ok" => TokenKind::Ok,
216 "fail" => TokenKind::Fail,
217 "union" => TokenKind::Union,
218 "inter" => TokenKind::Inter,
219 "diff" => TokenKind::Diff,
220 "bunion" => TokenKind::BUnion,
221 "bdiff" => TokenKind::BDiff,
222 "in" => TokenKind::In,
223 "and" => TokenKind::And,
224 "or" => TokenKind::Or,
225 "not" => TokenKind::Not,
226 _ => TokenKind::Ident(ident),
227 };
228 tokens.push(Token { kind, pos: start });
229 continue;
230 }
231
232 let kind = match ch {
233 '→' => {
234 pos += 1;
235 TokenKind::Arrow
236 }
237 '↦' => {
238 pos += 1;
239 TokenKind::FatArrow
240 }
241 '∈' => {
242 pos += 1;
243 TokenKind::In
244 }
245 '∧' => {
246 pos += 1;
247 TokenKind::And
248 }
249 '∨' => {
250 pos += 1;
251 TokenKind::Or
252 }
253 '¬' => {
254 pos += 1;
255 TokenKind::Not
256 }
257 '⟨' => {
258 pos += 1;
259 TokenKind::SelL
260 }
261 '⟩' => {
262 pos += 1;
263 TokenKind::SelR
264 }
265 '∣' => {
266 pos += 1;
267 TokenKind::Bar
268 }
269 '⊎' => {
270 pos += 1;
271 TokenKind::BUnion
272 }
273 '⊖' => {
274 pos += 1;
275 TokenKind::BDiff
276 }
277 '≠' => {
278 pos += 1;
279 TokenKind::Neq
280 }
281 '≤' => {
282 pos += 1;
283 TokenKind::Le
284 }
285 '≥' => {
286 pos += 1;
287 TokenKind::Ge
288 }
289 '•' => {
290 pos += 1;
291 TokenKind::Placeholder
292 }
293 _ => match ch {
294 '-' => {
295 if pos + 1 < chars.len() && chars[pos + 1] == '>' {
296 pos += 2;
297 TokenKind::Arrow
298 } else {
299 pos += 1;
300 TokenKind::Minus
301 }
302 }
303 '=' => {
304 if pos + 1 < chars.len() && chars[pos + 1] == '>' {
305 pos += 2;
306 TokenKind::FatArrow
307 } else {
308 pos += 1;
309 TokenKind::Eq
310 }
311 }
312 '!' => {
313 if pos + 1 < chars.len() && chars[pos + 1] == '=' {
314 pos += 2;
315 TokenKind::Neq
316 } else {
317 pos += 1;
318 TokenKind::Bang
319 }
320 }
321 '<' => {
322 if pos + 1 < chars.len() && chars[pos + 1] == '=' {
323 pos += 2;
324 TokenKind::Le
325 } else {
326 pos += 1;
327 TokenKind::Lt
328 }
329 }
330 '>' => {
331 if pos + 1 < chars.len() && chars[pos + 1] == '=' {
332 pos += 2;
333 TokenKind::Ge
334 } else {
335 pos += 1;
336 TokenKind::Gt
337 }
338 }
339 '|' => {
340 if pos + 1 < chars.len() && chars[pos + 1] == '>' {
341 pos += 2;
342 TokenKind::Pipe
343 } else {
344 pos += 1;
345 TokenKind::Bar
346 }
347 }
348 '+' => {
349 if pos + 1 < chars.len() && chars[pos + 1] == '+' {
350 pos += 2;
351 TokenKind::Concat
352 } else {
353 pos += 1;
354 TokenKind::Plus
355 }
356 }
357 '*' => {
358 pos += 1;
359 TokenKind::Star
360 }
361 '/' => {
362 pos += 1;
363 TokenKind::Slash
364 }
365 '(' => {
366 pos += 1;
367 TokenKind::LParen
368 }
369 ')' => {
370 pos += 1;
371 TokenKind::RParen
372 }
373 '[' => {
374 pos += 1;
375 TokenKind::LBrack
376 }
377 ']' => {
378 pos += 1;
379 TokenKind::RBrack
380 }
381 '{' => {
382 pos += 1;
383 TokenKind::LBrace
384 }
385 '}' => {
386 pos += 1;
387 TokenKind::RBrace
388 }
389 '?' => {
390 pos += 1;
391 TokenKind::QMark
392 }
393 ':' => {
394 pos += 1;
395 TokenKind::Colon
396 }
397 ',' => {
398 pos += 1;
399 TokenKind::Comma
400 }
401 ';' => {
402 pos += 1;
403 TokenKind::Semi
404 }
405 _ => return Err(LexError::UnexpectedChar(ch, start)),
406 },
407 };
408 tokens.push(Token { kind, pos: start });
409 }
410
411 tokens.push(Token {
412 kind: TokenKind::Eof,
413 pos: chars.len(),
414 });
415 Ok(tokens)
416}