somni_parser/
lexer.rs

1use std::str::CharIndices;
2
3use crate::{Error, Location};
4
/// Classification the lexer assigns to each token.
///
/// The lexer only decides the *shape* of a token; it does not validate or
/// convert literal contents (for example, `0b2` is still reported as a
/// [`TokenKind::BinaryInteger`]).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// One- or two-character punctuation or operator, such as `+` or `->`.
    Symbol,
    /// Line comment starting with `//`, up to but not including the newline.
    Comment,
    /// Unprefixed integer literal; may contain `_` separators.
    DecimalInteger,
    /// Integer literal with a `0b` prefix.
    BinaryInteger,
    /// Integer literal with a `0x` prefix.
    HexInteger,
    /// Numeric literal containing a decimal point.
    Float,
    /// Identifier.
    Identifier,
    /// Double-quoted string literal, quotes included.
    String,
}
16
17#[derive(Clone, Copy, Debug)]
18pub struct Token {
19    pub kind: TokenKind,
20    pub location: Location,
21}
22impl Token {
23    pub fn source(self, source: &str) -> &str {
24        self.location.extract(source)
25    }
26}
27
/// Reports whether `c` is an identifier character: any Unicode alphanumeric
/// character, `-`, or `_`.
fn ident_char(c: char) -> bool {
    matches!(c, '-' | '_') || c.is_alphanumeric()
}
31
/// Cursor over the source text with non-consuming lookahead.
#[derive(Clone)]
struct CharProvider<'s> {
    /// The input that has not been consumed yet.
    chars: CharIndices<'s>,
}

impl CharProvider<'_> {
    /// Consumes and returns the next character, or `None` at end of input.
    fn next(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

    /// Consumes up to `n` characters, stopping silently at end of input.
    fn consume_n(&mut self, n: usize) {
        for _ in 0..n {
            self.chars.next();
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.as_str().chars().next()
    }

    /// Returns the next `n` characters as a string slice without consuming
    /// them, or `None` if fewer than `n` characters remain.
    fn peek_n(&self, n: usize) -> Option<&str> {
        let rest = self.chars.as_str();
        // Candidate end positions are the start of every remaining character
        // plus the end of the input. Including the latter is what lets a
        // lookahead succeed when exactly `n` characters are left.
        let end = rest
            .char_indices()
            .map(|(idx, _)| idx)
            .chain(std::iter::once(rest.len()))
            .nth(n)?;
        Some(&rest[..end])
    }

    /// Byte offset of the next unread character within the original source,
    /// or the source length once the input is exhausted.
    fn offset(&self) -> usize {
        self.chars.offset()
    }
}
64
65pub(crate) trait Tokenizer: Iterator<Item = Result<Token, crate::Error>> + Clone {}
66impl<I> Tokenizer for I where I: Iterator<Item = Result<Token, crate::Error>> + Clone {}
67
68pub(crate) fn tokenize(source: &str) -> impl Tokenizer + '_ {
69    let mut chars = CharProvider {
70        chars: source.char_indices(),
71    };
72
73    let mut location = Location { start: 0, end: 0 };
74
75    std::iter::from_fn(move || {
76        loop {
77            // Start lexing a new token
78            location.start = chars.offset();
79
80            // Comments, prefixed numbers, some symbols
81            if let Some(doubles) = chars.peek_n(2) {
82                match doubles {
83                    "//" => {
84                        chars.consume_n(2);
85                        while let Some(maybe_newline) = chars.peek() {
86                            if maybe_newline == '\n' {
87                                break;
88                            }
89
90                            chars.next();
91                        }
92                        location.end = chars.offset();
93
94                        return Some(Ok(Token {
95                            kind: TokenKind::Comment,
96                            location,
97                        }));
98                    }
99                    "==" | "!=" | "<=" | ">=" | "=>" | "&&" | "||" | "**" | "->" | "<<" | ">>" => {
100                        chars.consume_n(2);
101                        location.end = chars.offset();
102                        return Some(Ok(Token {
103                            kind: TokenKind::Symbol,
104                            location,
105                        }));
106                    }
107                    "0x" | "0b" => {
108                        let kind = if doubles == "0x" {
109                            TokenKind::HexInteger
110                        } else {
111                            TokenKind::BinaryInteger
112                        };
113                        chars.consume_n(2);
114
115                        let mut has_digit = false;
116                        while let Some(maybe_digit) = chars.peek() {
117                            // Ignore underscores completely
118                            if maybe_digit == '_' {
119                                chars.next();
120                                continue;
121                            }
122                            if !ident_char(maybe_digit) {
123                                break;
124                            }
125                            has_digit = true;
126                            chars.next();
127                        }
128                        location.end = chars.offset();
129
130                        return if has_digit {
131                            Some(Ok(Token { kind, location }))
132                        } else {
133                            Some(Err(Error {
134                                location,
135                                error: String::from("invalid numeric literal").into_boxed_str(),
136                            }))
137                        };
138                    }
139                    _ => {}
140                }
141            }
142
143            // Single characters
144            let Some(next) = chars.next() else {
145                break;
146            };
147            location.end = chars.offset();
148
149            match next {
150                c if c.is_whitespace() => {
151                    // skip whitespace
152                }
153                '+' | '-' | '*' | '(' | ')' | '{' | '}' | '[' | ']' | ',' | ';' | '/' | ':'
154                | '<' | '>' | '&' | '|' | '^' | '=' | '!' | '%' => {
155                    return Some(Ok(Token {
156                        kind: TokenKind::Symbol,
157                        location,
158                    }));
159                }
160                c if c.is_numeric() => {
161                    // maybe float
162                    let mut is_float = false;
163                    while let Some(maybe_boundary) = chars.peek() {
164                        if !maybe_boundary.is_numeric() {
165                            // Ignore underscores completely
166                            if maybe_boundary == '_' {
167                                chars.next();
168                                continue;
169                            }
170                            if maybe_boundary == '.' && is_float {
171                                break;
172                            }
173
174                            if maybe_boundary == '.' {
175                                is_float = true;
176                            } else {
177                                break;
178                            }
179                        }
180                        chars.next();
181                    }
182                    location.end = chars.offset();
183                    let kind = if is_float {
184                        TokenKind::Float
185                    } else {
186                        TokenKind::DecimalInteger
187                    };
188                    return Some(Ok(Token { kind, location }));
189                }
190                c if ident_char(c) => {
191                    // Identifier
192                    while let Some(maybe_boundary) = chars.peek() {
193                        if !ident_char(maybe_boundary) {
194                            break;
195                        }
196                        chars.next();
197                    }
198                    location.end = chars.offset();
199
200                    return Some(Ok(Token {
201                        kind: TokenKind::Identifier,
202                        location,
203                    }));
204                }
205                '"' => {
206                    // Strings
207                    let mut escape_start = None;
208                    while let Some(c) = chars.peek() {
209                        // Terminating double quote mark?
210                        if c == '"' && escape_start.is_none() {
211                            chars.next();
212                            location.end = chars.offset();
213
214                            return Some(Ok(Token {
215                                kind: TokenKind::String,
216                                location,
217                            }));
218                        }
219
220                        if escape_start.take().is_none() && c == '\\' {
221                            escape_start = Some(chars.offset());
222                        }
223
224                        chars.next();
225                    }
226                    location.end = source.len();
227
228                    return Some(Err(Error {
229                        location,
230                        error: String::from("unterminated string").into_boxed_str(),
231                    }));
232                }
233                _ => {
234                    return Some(Err(Error {
235                        location,
236                        error: String::from("unexpected character").into_boxed_str(),
237                    }));
238                }
239            }
240        }
241
242        None
243    })
244}
245
#[cfg(test)]
mod test {
    use super::*;

    /// Lexes `source` and compares the output with `expectations`.
    ///
    /// `Ok((text, kind))` expects a token covering `text` with the given kind;
    /// `Err(message)` expects a lexing error with that message. The number of
    /// lexed items must match the number of expectations exactly, so neither
    /// missing nor surplus tokens go unnoticed.
    fn test_tokenizer(
        source: &str,
        expectations: &[Result<(&'static str, TokenKind), &'static str>],
    ) {
        let result = tokenize(source).collect::<Vec<Result<_, _>>>();

        assert_eq!(
            expectations.len(),
            result.len(),
            "unexpected number of tokens: {result:?}"
        );

        for (expectation, actual) in expectations.iter().zip(&result) {
            match *expectation {
                Ok((expected, kind)) => {
                    let token = actual.as_ref().unwrap();
                    assert_eq!(expected, token.location.extract(source));
                    assert_eq!(kind, token.kind);
                }
                Err(err) => {
                    assert_eq!(err, actual.as_ref().unwrap_err().error.as_ref());
                }
            }
        }
    }

    #[test]
    fn test_lex_numbers() {
        let source = "2 2. 2.3 2.34 23.4 1_000.4 234 0b00 0b10 0b2 0x123 0xf 0xF 1_000 0x10_00";
        let expectations = [
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("2.", TokenKind::Float)),
            Ok(("2.3", TokenKind::Float)),
            Ok(("2.34", TokenKind::Float)),
            Ok(("23.4", TokenKind::Float)),
            Ok(("1_000.4", TokenKind::Float)),
            Ok(("234", TokenKind::DecimalInteger)),
            Ok(("0b00", TokenKind::BinaryInteger)),
            Ok(("0b10", TokenKind::BinaryInteger)),
            Ok(("0b2", TokenKind::BinaryInteger)),
            Ok(("0x123", TokenKind::HexInteger)),
            Ok(("0xf", TokenKind::HexInteger)),
            Ok(("0xF", TokenKind::HexInteger)),
            Ok(("1_000", TokenKind::DecimalInteger)),
            Ok(("0x10_00", TokenKind::HexInteger)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lexer() {
        let source = "   \n // **a\n2 \n // b\nfoo,ar \"string\\\"\" \"\" () -> {}";

        let expectations = [
            Ok(("// **a", TokenKind::Comment)),
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("// b", TokenKind::Comment)),
            Ok(("foo", TokenKind::Identifier)),
            Ok((",", TokenKind::Symbol)),
            Ok(("ar", TokenKind::Identifier)),
            Ok(("\"string\\\"\"", TokenKind::String)),
            Ok(("\"\"", TokenKind::String)),
            Ok(("(", TokenKind::Symbol)),
            Ok((")", TokenKind::Symbol)),
            Ok(("->", TokenKind::Symbol)),
            Ok(("{", TokenKind::Symbol)),
            Ok(("}", TokenKind::Symbol)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lex_errors() {
        // A prefix with no digits after it.
        test_tokenizer("0x ", &[Err("invalid numeric literal")]);
        // A string that never closes.
        test_tokenizer("\"abc", &[Err("unterminated string")]);
        // A character that cannot start any token.
        test_tokenizer("@", &[Err("unexpected character")]);
    }
}