somni_parser/lexer.rs

use std::str::CharIndices;

/// The kinds of errors the lexer can report.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum ErrorKind {
    UnexpectedCharacter,
    UnterminatedString,
    InvalidNumericLiteral,
}

impl std::fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnexpectedCharacter => write!(f, "unexpected character"),
            Self::UnterminatedString => write!(f, "unterminated string"),
            Self::InvalidNumericLiteral => write!(f, "invalid numeric literal"),
        }
    }
}

/// An error reported by the lexer, along with the source span it refers to.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub location: Location,
    pub error: ErrorKind,
}

impl std::fmt::Display for LexerError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Lexer error: {}", self.error)
    }
}

/// A byte range (`start..end`) into the source text.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub struct Location {
    pub start: usize,
    pub end: usize,
}

impl Location {
    /// Returns a placeholder location that does not refer to real source text.
    pub const fn dummy() -> Self {
        Location { start: 0, end: 0 }
    }

    /// Returns the slice of `source` that this location covers.
    pub fn extract(self, source: &str) -> &str {
        &source[self.start..self.end]
    }
}

/// The category a lexed token belongs to.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum TokenKind {
    Symbol,
    Comment,
    DecimalInteger,
    BinaryInteger,
    HexInteger,
    Float,
    Identifier,
    String,
}

/// A single token: its kind and the span of source text it was lexed from.
#[derive(Clone, Copy, Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub location: Location,
}

impl Token {
    /// Returns the slice of `source` that this token covers.
    pub fn source(self, source: &str) -> &str {
        self.location.extract(source)
    }
}

/// Returns `true` if `c` may appear in an identifier.
fn ident_char(c: char) -> bool {
    c.is_alphanumeric() || ['-', '_'].contains(&c)
}

/// A cursor over the source text with single- and multi-character lookahead.
struct CharProvider<'s> {
    chars: CharIndices<'s>,
}

impl CharProvider<'_> {
    /// Consumes and returns the next character, if any.
    fn next(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

    /// Consumes the next `n` characters.
    fn consume_n(&mut self, n: usize) {
        for _ in 0..n {
            self.chars.next();
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.clone().next().map(|(_, c)| c)
    }

    /// Returns the next `n` characters as a string slice without consuming them.
    /// Note that `nth(n)` must succeed, so this returns `None` unless at least
    /// `n + 1` characters remain in the input.
    fn peek_n(&self, n: usize) -> Option<&str> {
        let start = self.offset();
        self.chars
            .clone()
            .nth(n)
            .map(|(idx, _)| &self.chars.as_str()[..idx - start])
    }

    /// Returns the byte offset of the next character within the source text.
    fn offset(&self) -> usize {
        self.chars.offset()
    }
}

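/// Tokenizes `source`, producing an iterator of `Result<Token, LexerError>`.
///
/// A minimal usage sketch. It assumes this module is exported from the crate as
/// `somni_parser::lexer`; if the crate exposes it under a different path, adjust
/// the `use` line accordingly (the example is marked `ignore` for that reason).
///
/// ```ignore
/// use somni_parser::lexer::{tokenize, TokenKind};
///
/// let source = "foo + 1";
/// let first = tokenize(source).next().unwrap().unwrap();
/// assert_eq!(first.kind, TokenKind::Identifier);
/// assert_eq!(first.source(source), "foo");
/// ```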
pub fn tokenize(source: &str) -> impl Iterator<Item = Result<Token, LexerError>> + '_ {
    let mut chars = CharProvider {
        chars: source.char_indices(),
    };

    let mut location = Location { start: 0, end: 0 };

    std::iter::from_fn(move || {
        loop {
            // Start lexing a new token
            location.start = chars.offset();

            // Comments, prefixed numbers, some symbols
            if let Some(doubles) = chars.peek_n(2) {
                match doubles {
                    "//" => {
                        chars.consume_n(2);
                        while let Some(maybe_newline) = chars.peek() {
                            if maybe_newline == '\n' {
                                break;
                            }

                            chars.next();
                        }
                        location.end = chars.offset();

                        return Some(Ok(Token {
                            kind: TokenKind::Comment,
                            location,
                        }));
                    }
                    "==" | "!=" | "<=" | ">=" | "=>" | "&&" | "||" | "**" | "->" | "<<" | ">>" => {
                        chars.consume_n(2);
                        location.end = chars.offset();
                        return Some(Ok(Token {
                            kind: TokenKind::Symbol,
                            location,
                        }));
                    }
                    "0x" | "0b" => {
                        let kind = if doubles == "0x" {
                            TokenKind::HexInteger
                        } else {
                            TokenKind::BinaryInteger
                        };
                        chars.consume_n(2);

                        let mut has_digit = false;
                        while let Some(maybe_digit) = chars.peek() {
                            if !ident_char(maybe_digit) {
                                break;
                            }
                            has_digit = true;
                            chars.next();
                        }
                        location.end = chars.offset();

                        return if has_digit {
                            Some(Ok(Token { kind, location }))
                        } else {
                            Some(Err(LexerError {
                                location,
                                error: ErrorKind::InvalidNumericLiteral,
                            }))
                        };
                    }
                    _ => {}
                }
            }

            // Single characters
            let Some(next) = chars.next() else {
                break;
            };
            location.end = chars.offset();

            match next {
                c if c.is_whitespace() => {
                    // skip whitespace
                }
                '+' | '-' | '*' | '(' | ')' | '{' | '}' | '[' | ']' | ',' | ';' | '/' | ':'
                | '<' | '>' | '&' | '|' | '^' | '=' | '!' => {
                    return Some(Ok(Token {
                        kind: TokenKind::Symbol,
                        location,
                    }));
                }
                c if c.is_numeric() => {
                    // maybe float
                    let mut is_float = false;
                    while let Some(maybe_boundary) = chars.peek() {
                        if !maybe_boundary.is_numeric() {
                            if maybe_boundary == '.' && is_float {
                                break;
                            }

                            if maybe_boundary == '.' {
                                is_float = true;
                            } else {
                                break;
                            }
                        }
                        chars.next();
                    }
                    location.end = chars.offset();
                    let kind = if is_float {
                        TokenKind::Float
                    } else {
                        TokenKind::DecimalInteger
                    };
                    return Some(Ok(Token { kind, location }));
                }
                c if ident_char(c) => {
                    // Identifier
                    while let Some(maybe_boundary) = chars.peek() {
                        if !ident_char(maybe_boundary) {
                            break;
                        }
                        chars.next();
                    }
                    location.end = chars.offset();

                    return Some(Ok(Token {
                        kind: TokenKind::Identifier,
                        location,
                    }));
                }
                '"' => {
                    // Strings
                    let mut escape_start = None;
                    while let Some(c) = chars.peek() {
                        // Terminating double quote mark?
                        if c == '"' && escape_start.is_none() {
                            chars.next();
                            location.end = chars.offset();

                            return Some(Ok(Token {
                                kind: TokenKind::String,
                                location,
                            }));
                        }

                        if escape_start.take().is_none() && c == '\\' {
                            escape_start = Some(chars.offset());
                        }

                        chars.next();
                    }
                    location.end = source.len();

                    return Some(Err(LexerError {
                        location,
                        error: ErrorKind::UnterminatedString,
                    }));
                }
                _ => {
                    return Some(Err(LexerError {
                        location,
                        error: ErrorKind::UnexpectedCharacter,
                    }));
                }
            }
        }

        None
    })
}

#[cfg(test)]
mod test {
    use super::*;

    fn test_tokenizer(source: &str, expectations: &[Result<(&'static str, TokenKind), ErrorKind>]) {
        let result = tokenize(source).collect::<Vec<Result<_, _>>>();

        for (idx, expectation) in expectations.iter().enumerate() {
            match expectation.clone() {
                Ok((expected, kind)) => {
                    assert_eq!(
                        expected,
                        result[idx].as_ref().unwrap().location.extract(source)
                    );
                    assert_eq!(kind, result[idx].as_ref().unwrap().kind);
                }
                Err(err) => {
                    assert_eq!(err, result[idx].as_ref().unwrap_err().error);
                }
            }
        }
    }

    #[test]
    fn test_lex_numbers() {
        let source = "2 2. 2.3 2.34 23.4 234 0b00 0b10 0b2 0x123 0xf 0xF";
        let expectations = [
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("2.", TokenKind::Float)),
            Ok(("2.3", TokenKind::Float)),
            Ok(("2.34", TokenKind::Float)),
            Ok(("23.4", TokenKind::Float)),
            Ok(("234", TokenKind::DecimalInteger)),
            Ok(("0b00", TokenKind::BinaryInteger)),
            Ok(("0b10", TokenKind::BinaryInteger)),
            Ok(("0b2", TokenKind::BinaryInteger)),
            Ok(("0x123", TokenKind::HexInteger)),
            Ok(("0xf", TokenKind::HexInteger)),
            Ok(("0xF", TokenKind::HexInteger)),
        ];

        test_tokenizer(source, &expectations);
    }
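
    // Illustrative addition (not part of the original test suite): checks that
    // token locations map back to the expected source slices via `Token::source`.
    // The input string here is an arbitrary example chosen for this sketch.
    #[test]
    fn test_token_spans() {
        let source = "foo + 0x1f";
        let tokens: Vec<Token> = tokenize(source).map(|t| t.unwrap()).collect();

        let texts: Vec<&str> = tokens.iter().map(|t| t.source(source)).collect();
        assert_eq!(texts, ["foo", "+", "0x1f"]);
        assert_eq!(tokens[2].kind, TokenKind::HexInteger);
    }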

    #[test]
    fn test_lexer() {
        let source = "   \n // **a\n2 \n // b\nfoo,ar \"string\\\"\" \"\" () -> {}";

        let expectations = [
            Ok(("// **a", TokenKind::Comment)),
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("// b", TokenKind::Comment)),
            Ok(("foo", TokenKind::Identifier)),
            Ok((",", TokenKind::Symbol)),
            Ok(("ar", TokenKind::Identifier)),
            Ok(("\"string\\\"\"", TokenKind::String)),
            Ok(("\"\"", TokenKind::String)),
            Ok(("(", TokenKind::Symbol)),
            Ok((")", TokenKind::Symbol)),
            Ok(("->", TokenKind::Symbol)),
            Ok(("{", TokenKind::Symbol)),
            Ok(("}", TokenKind::Symbol)),
        ];

        test_tokenizer(source, &expectations);
    }
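
    // Illustrative addition (not part of the original test suite): exercises the
    // three error paths of the lexer on a small, hand-picked input.
    #[test]
    fn test_lex_errors() {
        // "0x" with no digits, an unsupported character, then an unterminated string.
        let source = "0x @ \"oops";
        let expectations = [
            Err(ErrorKind::InvalidNumericLiteral),
            Err(ErrorKind::UnexpectedCharacter),
            Err(ErrorKind::UnterminatedString),
        ];

        test_tokenizer(source, &expectations);
    }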
}