seraphine_core/tokenizer.rs
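
//! Tokenizer for the Seraphine language: converts source text into a flat
//! list of [`Token`]s terminated by a single [`TokenKind::Eof`] token.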

use std::{iter::Peekable, str::Chars};

use crate::{common::Pos, error::TokenizeError};

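/// Reserved words; an identifier that matches one of these spellings is
/// emitted as a [`TokenKind::Keyword`] instead of an identifier.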
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Keyword {
    Fn,
    If,
    Else,
    While,
    For,
    In,
    Continue,
    Break,
    Return,
    True,
    False,
    Null,
}

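/// Operator tokens; each variant's doc comment shows its source spelling.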
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operator {
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `^`
    Caret,
    /// `!`
    Exclamation,
    /// `==`
    Equal,
    /// `!=`
    Unequal,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,
    /// `&&`
    And,
    /// `||`
    Or,
}

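/// The kind of a [`Token`], including any literal payload (identifier text,
/// number value, string contents).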
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    Keyword(Keyword),
    Identifier(String),
    Number(f64),
    String(String),
    Operator(Operator),
    /// `,`
    Comma,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `=`
    Equal,
    /// `.`
    Dot,
    /// `:`
    Colon,
    Newline,
    /// End of file
    Eof,
}

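/// A single token together with the position where it starts in the source.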
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub pos: Pos,
    pub kind: TokenKind,
}

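/// Tokenizes `s` into a list of [`Token`]s; the last token is always
/// [`TokenKind::Eof`].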
pub fn tokenize(s: &str) -> Result<Vec<Token>, TokenizeError> {
    Tokenizer::new(s).tokenize()
}

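/// Internal tokenizer state: a peekable character iterator plus `idx`, the
/// number of characters consumed so far (used to compute token positions).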
struct Tokenizer<'a> {
    chars: Peekable<Chars<'a>>,
    idx: usize,
}

impl<'a> Tokenizer<'a> {
    fn new(s: &'a str) -> Self {
        Self {
            chars: s.chars().peekable(),
            idx: 0,
        }
    }

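    /// Consumes the next character. Note that `idx` is incremented even when
    /// the input is exhausted, which the final EOF position relies on.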
    fn next(&mut self) -> Option<char> {
        self.idx += 1;
        self.chars.next()
    }

    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    fn tokenize(&mut self) -> Result<Vec<Token>, TokenizeError> {
        let mut tokens = vec![];

        while let Some(c) = self.next() {
            let token_start_pos = self.idx - 1;
            let token_kind = match (c, self.peek()) {
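                // `//` starts a line comment: skip everything up to (but not
                // including) the terminating newline.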
                ('/', Some('/')) => {
                    self.next();
                    while let Some(c) = self.peek() {
                        if *c == '\n' {
                            break;
                        }
                        self.next();
                    }
                    continue;
                }
                ('!', Some('=')) => {
                    self.next();
                    TokenKind::Operator(Operator::Unequal)
                }
                ('=', Some('=')) => {
                    self.next();
                    TokenKind::Operator(Operator::Equal)
                }
                ('<', Some('=')) => {
                    self.next();
                    TokenKind::Operator(Operator::LessThanOrEqual)
                }
                ('>', Some('=')) => {
                    self.next();
                    TokenKind::Operator(Operator::GreaterThanOrEqual)
                }
                ('&', Some('&')) => {
                    self.next();
                    TokenKind::Operator(Operator::And)
                }
                ('|', Some('|')) => {
                    self.next();
                    TokenKind::Operator(Operator::Or)
                }
                ('!', _) => TokenKind::Operator(Operator::Exclamation),
                ('<', _) => TokenKind::Operator(Operator::LessThan),
                ('>', _) => TokenKind::Operator(Operator::GreaterThan),
                ('+', _) => TokenKind::Operator(Operator::Plus),
                ('-', _) => TokenKind::Operator(Operator::Minus),
                ('*', _) => TokenKind::Operator(Operator::Star),
                ('/', _) => TokenKind::Operator(Operator::Slash),
                ('^', _) => TokenKind::Operator(Operator::Caret),
                ('%', _) => TokenKind::Operator(Operator::Percent),
                (',', _) => TokenKind::Comma,
                ('(', _) => TokenKind::LParen,
                (')', _) => TokenKind::RParen,
                ('{', _) => TokenKind::LBrace,
                ('}', _) => TokenKind::RBrace,
                ('[', _) => TokenKind::LBracket,
                (']', _) => TokenKind::RBracket,
                ('=', _) => TokenKind::Equal,
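                // Number literal: digits with an optional `.` fraction and an
                // optional `e`/`e-` exponent; a leading `.` is only accepted
                // when a digit follows.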
                (c @ ('0'..='9'), _) | (c @ '.', Some('0'..='9')) => {
                    let mut has_dot = c == '.';
                    let mut has_e = false;

                    let mut num = String::new();
                    num.push(c);
                    while let Some(c) = self.peek() {
                        match c {
                            '.' => {
                                if has_dot || has_e {
                                    return Err(TokenizeError::UnexpectedChar {
                                        got: '.',
                                        pos: self.idx,
                                    });
                                }
                                has_dot = true;
                            }
                            'e' => {
                                if has_e {
                                    return Err(TokenizeError::UnexpectedChar {
                                        got: 'e',
                                        pos: self.idx,
                                    });
                                }
                                has_e = true;
                            }
                            '0'..='9' => (),
                            _ => break,
                        }
                        let c = self.next().unwrap();
                        num.push(c);

                        if c == 'e' && self.peek() == Some(&'-') {
                            num.push(self.next().unwrap());
                        }
                    }

                    if num == "." {
                        return Err(TokenizeError::UnexpectedChar {
                            got: '.',
                            pos: self.idx - 1,
                        });
                    }

                    let Ok(n) = num.parse() else {
                        return Err(TokenizeError::MalformedNumber {
                            number_str: num,
                            pos: token_start_pos,
                        });
                    };
                    TokenKind::Number(n)
                }
                ('.', _) => TokenKind::Dot,
                (':', _) => TokenKind::Colon,
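                // String literal: consume until the closing `"`, translating
                // the escape sequences \" \n \r \t \\ and \0.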
                ('"', _) => {
                    let mut str = String::new();
                    let mut terminated = false;
                    while let Some(c) = self.next() {
                        match c {
                            '"' => {
                                terminated = true;
                                break;
                            }
                            '\n' => break,
                            '\\' => match self.next() {
                                Some('"') => str.push('"'),
                                Some('n') => str.push('\n'),
                                Some('r') => str.push('\r'),
                                Some('t') => str.push('\t'),
                                Some('\\') => str.push('\\'),
                                Some('0') => str.push('\0'),
                                Some(c) => {
                                    return Err(TokenizeError::UnexpectedChar {
                                        got: c,
                                        pos: self.idx - 1,
                                    })
                                }
                                None => break,
                            },
                            _ => str.push(c),
                        }
                    }

                    if !terminated {
                        return Err(TokenizeError::UnterminatedString {
                            pos: token_start_pos,
                        });
                    }

                    TokenKind::String(str)
                }
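                // Identifier or keyword: `[A-Za-z_]` followed by any number of
                // `[A-Za-z0-9_]` characters.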
                (c @ ('a'..='z' | 'A'..='Z' | '_'), _) => {
                    let mut ident = String::new();
                    ident.push(c);
                    while let Some(c) = self.peek() {
                        match c {
                            'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
                                let c = self.next().unwrap();
                                ident.push(c);
                            }
                            _ => break,
                        }
                    }

                    match ident.as_str() {
                        "fn" => TokenKind::Keyword(Keyword::Fn),
                        "if" => TokenKind::Keyword(Keyword::If),
                        "else" => TokenKind::Keyword(Keyword::Else),
                        "while" => TokenKind::Keyword(Keyword::While),
                        "for" => TokenKind::Keyword(Keyword::For),
                        "in" => TokenKind::Keyword(Keyword::In),
                        "continue" => TokenKind::Keyword(Keyword::Continue),
                        "break" => TokenKind::Keyword(Keyword::Break),
                        "return" => TokenKind::Keyword(Keyword::Return),
                        "true" => TokenKind::Keyword(Keyword::True),
                        "false" => TokenKind::Keyword(Keyword::False),
                        "null" => TokenKind::Keyword(Keyword::Null),
                        _ => TokenKind::Identifier(ident),
                    }
                }
                // TODO: Account for \r\n
                ('\n', _) => TokenKind::Newline,
                (c, _) if c.is_ascii_whitespace() => continue,
                (c, _) => {
                    return Err(TokenizeError::UnexpectedChar {
                        got: c,
                        pos: self.idx - 1,
                    })
                }
            };

            let token = Token {
                kind: token_kind,
                pos: token_start_pos,
            };
            tokens.push(token);
        }

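        // The stream always ends with an explicit EOF token. `next()` bumps
        // `idx` even on the final `None`, so `idx - 1` is one past the last
        // source character here.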
        let eof_token = Token {
            kind: TokenKind::Eof,
            pos: self.idx - 1,
        };
        tokens.push(eof_token);

        Ok(tokens)
    }
}
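
// A minimal usage sketch, not part of the original module: it checks only the
// produced `TokenKind`s, so it stays independent of how `Pos` and
// `TokenizeError` are defined elsewhere in the crate.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_a_simple_statement() {
        // "x = 1 + 2\n" should yield an identifier, `=`, two number literals
        // joined by `+`, a newline, and the trailing EOF token.
        let Ok(tokens) = tokenize("x = 1 + 2\n") else {
            panic!("tokenize failed");
        };
        let kinds: Vec<TokenKind> = tokens.into_iter().map(|t| t.kind).collect();
        assert_eq!(
            kinds,
            vec![
                TokenKind::Identifier("x".to_string()),
                TokenKind::Equal,
                TokenKind::Number(1.0),
                TokenKind::Operator(Operator::Plus),
                TokenKind::Number(2.0),
                TokenKind::Newline,
                TokenKind::Eof,
            ]
        );
    }
}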