//! Lexer for sda_lib (`sda_lib/lexer.rs`): converts source text into a
//! token stream consumed by the parser.
1use thiserror::Error;
2
3use crate::number::{ExactNum, ParseNumError};
4
/// The kinds of lexical tokens produced by [`lex`].
#[derive(Debug, Clone)]
pub enum TokenKind {
    // Keywords (matched case-insensitively by the lexer).
    Let,
    Yield,
    Null,
    True,
    False,
    Bytes,
    Seq,
    Set,
    Bag,
    Map,
    Prod,
    BagKV,
    Some,
    None,
    Ok,
    Fail,
    Union,
    Inter,
    Diff,
    BUnion, // keyword `bunion` or symbol `⊎`
    BDiff,  // keyword `bdiff` or symbol `⊖`
    In,     // keyword `in` or symbol `∈`
    And,    // keyword `and` or symbol `∧`
    Or,     // keyword `or` or symbol `∨`
    Not,    // keyword `not` or symbol `¬`
    // Operators and punctuation.
    Arrow,    // `->` or `→`
    FatArrow, // `=>` or `↦`
    Pipe,     // `|>`
    Eq,       // `=`
    Neq,      // `!=` or `≠`
    Lt,       // `<`
    Le,       // `<=` or `≤`
    Gt,       // `>`
    Ge,       // `>=` or `≥`
    Plus,     // `+`
    Minus,    // `-`
    Star,     // `*`
    Slash,    // `/`
    Concat,   // `++`
    LParen,   // `(`
    RParen,   // `)`
    LBrack,   // `[`
    RBrack,   // `]`
    LBrace,   // `{`
    RBrace,   // `}`
    QMark,    // `?`
    Bang,     // `!`
    Colon,    // `:`
    Comma,    // `,`
    Semi,     // `;` — note `;;` starts a line comment instead
    Bar,      // `|` or `∣`
    SelL,     // `⟨`
    SelR,     // `⟩`
    // Payload-carrying tokens.
    Ident(String), // identifier, original casing preserved
    Str(String),   // string literal with escapes already decoded
    Num(ExactNum), // numeric literal
    Placeholder,   // lone `_` or `•`
    Eof,           // appended exactly once at end of input
}
66
67impl PartialEq for TokenKind {
68    fn eq(&self, other: &Self) -> bool {
69        match (self, other) {
70            (TokenKind::Num(a), TokenKind::Num(b)) => a == b,
71            (TokenKind::Ident(a), TokenKind::Ident(b)) => a == b,
72            (TokenKind::Str(a), TokenKind::Str(b)) => a == b,
73            _ => std::mem::discriminant(self) == std::mem::discriminant(other),
74        }
75    }
76}
77
/// A single lexed token together with where it begins in the source.
#[derive(Debug, Clone)]
pub struct Token {
    /// Which token this is (including any payload).
    pub kind: TokenKind,
    /// Start offset, counted in characters (not bytes) of the source string.
    pub pos: usize,
}
83
/// Errors produced by [`lex`]. All positions are character offsets.
#[derive(Debug, Error)]
pub enum LexError {
    /// A character that cannot start any token.
    #[error("Unexpected character '{0}' at position {1}")]
    UnexpectedChar(char, usize),
    /// A string literal whose closing `"` was never found (including input
    /// that ends in the middle of an escape sequence).
    #[error("Unterminated string at position {0}")]
    UnterminatedString(usize),
    /// A numeric literal that `ExactNum::parse_literal` rejected.
    #[error("Invalid numeric literal '{literal}' at position {pos}: {source}")]
    InvalidNumber {
        /// The literal text exactly as scanned.
        literal: String,
        /// Character offset where the literal starts.
        pos: usize,
        /// The underlying parse failure.
        #[source]
        source: ParseNumError,
    },
}
98
99pub fn lex(src: &str) -> Result<Vec<Token>, LexError> {
100    let chars: Vec<char> = src.chars().collect();
101    let mut pos = 0;
102    let mut tokens = Vec::new();
103
104    while pos < chars.len() {
105        let start = pos;
106        let ch = chars[pos];
107
108        if ch.is_whitespace() {
109            pos += 1;
110            continue;
111        }
112
113        if ch == ';' && pos + 1 < chars.len() && chars[pos + 1] == ';' {
114            while pos < chars.len() && chars[pos] != '\n' {
115                pos += 1;
116            }
117            continue;
118        }
119
120        if ch.is_ascii_digit() || (ch == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit()) {
121            let num_start = pos;
122            while pos < chars.len() && (chars[pos].is_ascii_digit() || chars[pos] == '.') {
123                pos += 1;
124            }
125            if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
126                pos += 1;
127                if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
128                    pos += 1;
129                }
130                while pos < chars.len() && chars[pos].is_ascii_digit() {
131                    pos += 1;
132                }
133            }
134            let num_str: String = chars[num_start..pos].iter().collect();
135            let n = ExactNum::parse_literal(&num_str).map_err(|source| LexError::InvalidNumber {
136                literal: num_str.clone(),
137                pos: start,
138                source,
139            })?;
140            tokens.push(Token {
141                kind: TokenKind::Num(n),
142                pos: start,
143            });
144            continue;
145        }
146
147        if ch == '"' {
148            pos += 1;
149            let mut s = String::new();
150            while pos < chars.len() && chars[pos] != '"' {
151                if chars[pos] == '\\' {
152                    pos += 1;
153                    if pos >= chars.len() {
154                        return Err(LexError::UnterminatedString(start));
155                    }
156                    match chars[pos] {
157                        'n' => s.push('\n'),
158                        't' => s.push('\t'),
159                        'r' => s.push('\r'),
160                        '"' => s.push('"'),
161                        '\\' => s.push('\\'),
162                        c => {
163                            s.push('\\');
164                            s.push(c);
165                        }
166                    }
167                } else {
168                    s.push(chars[pos]);
169                }
170                pos += 1;
171            }
172            if pos >= chars.len() {
173                return Err(LexError::UnterminatedString(start));
174            }
175            pos += 1;
176            tokens.push(Token {
177                kind: TokenKind::Str(s),
178                pos: start,
179            });
180            continue;
181        }
182
183        if ch.is_alphabetic() || ch == '_' {
184            if ch == '_' {
185                let next = pos + 1;
186                if next >= chars.len() || (!chars[next].is_alphanumeric() && chars[next] != '_') {
187                    pos += 1;
188                    tokens.push(Token {
189                        kind: TokenKind::Placeholder,
190                        pos: start,
191                    });
192                    continue;
193                }
194            }
195            let id_start = pos;
196            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
197                pos += 1;
198            }
199            let ident: String = chars[id_start..pos].iter().collect();
200            let kind = match ident.to_ascii_lowercase().as_str() {
201                "let" => TokenKind::Let,
202                "yield" => TokenKind::Yield,
203                "null" => TokenKind::Null,
204                "true" => TokenKind::True,
205                "false" => TokenKind::False,
206                "bytes" => TokenKind::Bytes,
207                "seq" => TokenKind::Seq,
208                "set" => TokenKind::Set,
209                "bag" => TokenKind::Bag,
210                "map" => TokenKind::Map,
211                "prod" => TokenKind::Prod,
212                "bagkv" => TokenKind::BagKV,
213                "some" => TokenKind::Some,
214                "none" => TokenKind::None,
215                "ok" => TokenKind::Ok,
216                "fail" => TokenKind::Fail,
217                "union" => TokenKind::Union,
218                "inter" => TokenKind::Inter,
219                "diff" => TokenKind::Diff,
220                "bunion" => TokenKind::BUnion,
221                "bdiff" => TokenKind::BDiff,
222                "in" => TokenKind::In,
223                "and" => TokenKind::And,
224                "or" => TokenKind::Or,
225                "not" => TokenKind::Not,
226                _ => TokenKind::Ident(ident),
227            };
228            tokens.push(Token { kind, pos: start });
229            continue;
230        }
231
232        let kind = match ch {
233            '→' => {
234                pos += 1;
235                TokenKind::Arrow
236            }
237            '↦' => {
238                pos += 1;
239                TokenKind::FatArrow
240            }
241            '∈' => {
242                pos += 1;
243                TokenKind::In
244            }
245            '∧' => {
246                pos += 1;
247                TokenKind::And
248            }
249            '∨' => {
250                pos += 1;
251                TokenKind::Or
252            }
253            '¬' => {
254                pos += 1;
255                TokenKind::Not
256            }
257            '⟨' => {
258                pos += 1;
259                TokenKind::SelL
260            }
261            '⟩' => {
262                pos += 1;
263                TokenKind::SelR
264            }
265            '∣' => {
266                pos += 1;
267                TokenKind::Bar
268            }
269            '⊎' => {
270                pos += 1;
271                TokenKind::BUnion
272            }
273            '⊖' => {
274                pos += 1;
275                TokenKind::BDiff
276            }
277            '≠' => {
278                pos += 1;
279                TokenKind::Neq
280            }
281            '≤' => {
282                pos += 1;
283                TokenKind::Le
284            }
285            '≥' => {
286                pos += 1;
287                TokenKind::Ge
288            }
289            '•' => {
290                pos += 1;
291                TokenKind::Placeholder
292            }
293            _ => match ch {
294                '-' => {
295                    if pos + 1 < chars.len() && chars[pos + 1] == '>' {
296                        pos += 2;
297                        TokenKind::Arrow
298                    } else {
299                        pos += 1;
300                        TokenKind::Minus
301                    }
302                }
303                '=' => {
304                    if pos + 1 < chars.len() && chars[pos + 1] == '>' {
305                        pos += 2;
306                        TokenKind::FatArrow
307                    } else {
308                        pos += 1;
309                        TokenKind::Eq
310                    }
311                }
312                '!' => {
313                    if pos + 1 < chars.len() && chars[pos + 1] == '=' {
314                        pos += 2;
315                        TokenKind::Neq
316                    } else {
317                        pos += 1;
318                        TokenKind::Bang
319                    }
320                }
321                '<' => {
322                    if pos + 1 < chars.len() && chars[pos + 1] == '=' {
323                        pos += 2;
324                        TokenKind::Le
325                    } else {
326                        pos += 1;
327                        TokenKind::Lt
328                    }
329                }
330                '>' => {
331                    if pos + 1 < chars.len() && chars[pos + 1] == '=' {
332                        pos += 2;
333                        TokenKind::Ge
334                    } else {
335                        pos += 1;
336                        TokenKind::Gt
337                    }
338                }
339                '|' => {
340                    if pos + 1 < chars.len() && chars[pos + 1] == '>' {
341                        pos += 2;
342                        TokenKind::Pipe
343                    } else {
344                        pos += 1;
345                        TokenKind::Bar
346                    }
347                }
348                '+' => {
349                    if pos + 1 < chars.len() && chars[pos + 1] == '+' {
350                        pos += 2;
351                        TokenKind::Concat
352                    } else {
353                        pos += 1;
354                        TokenKind::Plus
355                    }
356                }
357                '*' => {
358                    pos += 1;
359                    TokenKind::Star
360                }
361                '/' => {
362                    pos += 1;
363                    TokenKind::Slash
364                }
365                '(' => {
366                    pos += 1;
367                    TokenKind::LParen
368                }
369                ')' => {
370                    pos += 1;
371                    TokenKind::RParen
372                }
373                '[' => {
374                    pos += 1;
375                    TokenKind::LBrack
376                }
377                ']' => {
378                    pos += 1;
379                    TokenKind::RBrack
380                }
381                '{' => {
382                    pos += 1;
383                    TokenKind::LBrace
384                }
385                '}' => {
386                    pos += 1;
387                    TokenKind::RBrace
388                }
389                '?' => {
390                    pos += 1;
391                    TokenKind::QMark
392                }
393                ':' => {
394                    pos += 1;
395                    TokenKind::Colon
396                }
397                ',' => {
398                    pos += 1;
399                    TokenKind::Comma
400                }
401                ';' => {
402                    pos += 1;
403                    TokenKind::Semi
404                }
405                _ => return Err(LexError::UnexpectedChar(ch, start)),
406            },
407        };
408        tokens.push(Token { kind, pos: start });
409    }
410
411    tokens.push(Token {
412        kind: TokenKind::Eof,
413        pos: chars.len(),
414    });
415    Ok(tokens)
416}