somni_parser/
lexer.rs

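//! Lexer for `somni_parser`: splits source text into tokens (symbols,
//! comments, numeric literals, identifiers and strings), each carrying the
//! byte range it occupies in the input.
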
use std::str::CharIndices;

use crate::{Error, Location};

#[derive(Clone, Copy, Debug, PartialEq)]
pub enum TokenKind {
    Symbol,
    Comment,
    DecimalInteger,
    BinaryInteger,
    HexInteger,
    Float,
    Identifier,
    String,
}

#[derive(Clone, Copy, Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub location: Location,
}

impl Token {
    /// Returns the text of this token within the original `source`.
    pub fn source(self, source: &str) -> &str {
        self.location.extract(source)
    }
}

/// Returns `true` for characters that may appear in an identifier.
fn ident_char(c: char) -> bool {
    c.is_alphanumeric() || ['-', '_'].contains(&c)
}

#[derive(Clone)]
struct CharProvider<'s> {
    chars: CharIndices<'s>,
}

impl CharProvider<'_> {
    /// Consumes and returns the next character, if any.
    fn next(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

    /// Consumes up to `n` characters.
    fn consume_n(&mut self, n: usize) {
        for _ in 0..n {
            self.chars.next();
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.clone().next().map(|(_, c)| c)
    }

    /// Returns the next `n` characters as a string slice without consuming
    /// them. If fewer than `n` characters remain, the whole remainder is
    /// returned, so a trailing two-character token (e.g. `==` at the very end
    /// of the input) still lexes as a single token.
    fn peek_n(&self, n: usize) -> Option<&str> {
        let remaining = self.chars.as_str();
        if remaining.is_empty() {
            return None;
        }
        let end = remaining
            .char_indices()
            .nth(n)
            .map_or(remaining.len(), |(idx, _)| idx);
        Some(&remaining[..end])
    }

    /// Byte offset of the next character from the start of the source.
    fn offset(&self) -> usize {
        self.chars.offset()
    }
}

// Trait alias for the cloneable token iterator returned by `tokenize`.
pub(crate) trait Tokenizer: Iterator<Item = Result<Token, crate::Error>> + Clone {}
impl<I> Tokenizer for I where I: Iterator<Item = Result<Token, crate::Error>> + Clone {}

/// Lazily tokenizes `source`, yielding either the next token or a lexer
/// error. Whitespace is skipped; comments are emitted as tokens.
pub(crate) fn tokenize(source: &str) -> impl Tokenizer + '_ {
    let mut chars = CharProvider {
        chars: source.char_indices(),
    };

    let mut location = Location { start: 0, end: 0 };

    std::iter::from_fn(move || {
        loop {
            // Start lexing a new token
            location.start = chars.offset();

            // Comments, prefixed numbers, some symbols
            if let Some(doubles) = chars.peek_n(2) {
                match doubles {
                    "//" => {
                        chars.consume_n(2);
                        while let Some(maybe_newline) = chars.peek() {
                            if maybe_newline == '\n' {
                                break;
                            }

                            chars.next();
                        }
                        location.end = chars.offset();

                        return Some(Ok(Token {
                            kind: TokenKind::Comment,
                            location,
                        }));
                    }
                    "==" | "!=" | "<=" | ">=" | "=>" | "&&" | "||" | "**" | "->" | "<<" | ">>" => {
                        chars.consume_n(2);
                        location.end = chars.offset();
                        return Some(Ok(Token {
                            kind: TokenKind::Symbol,
                            location,
                        }));
                    }
                    "0x" | "0b" => {
                        let kind = if doubles == "0x" {
                            TokenKind::HexInteger
                        } else {
                            TokenKind::BinaryInteger
                        };
                        chars.consume_n(2);

                        let mut has_digit = false;
                        while let Some(maybe_digit) = chars.peek() {
                            if !ident_char(maybe_digit) {
                                break;
                            }
                            has_digit = true;
                            chars.next();
                        }
                        location.end = chars.offset();

                        return if has_digit {
                            Some(Ok(Token { kind, location }))
                        } else {
                            Some(Err(Error {
                                location,
                                error: String::from("invalid numeric literal").into_boxed_str(),
                            }))
                        };
                    }
                    _ => {}
                }
            }

            // Single characters
            let Some(next) = chars.next() else {
                break;
            };
            location.end = chars.offset();

            match next {
                c if c.is_whitespace() => {
                    // skip whitespace
                }
                '+' | '-' | '*' | '(' | ')' | '{' | '}' | '[' | ']' | ',' | ';' | '/' | ':'
                | '<' | '>' | '&' | '|' | '^' | '=' | '!' | '%' => {
                    return Some(Ok(Token {
                        kind: TokenKind::Symbol,
                        location,
                    }));
                }
                c if c.is_numeric() => {
                    // Decimal integer or float literal
                    let mut is_float = false;
                    while let Some(maybe_boundary) = chars.peek() {
                        if !maybe_boundary.is_numeric() {
                            if maybe_boundary == '.' && is_float {
                                break;
                            }

                            if maybe_boundary == '.' {
                                is_float = true;
                            } else {
                                break;
                            }
                        }
                        chars.next();
                    }
                    location.end = chars.offset();
                    let kind = if is_float {
                        TokenKind::Float
                    } else {
                        TokenKind::DecimalInteger
                    };
                    return Some(Ok(Token { kind, location }));
                }
                c if ident_char(c) => {
                    // Identifier
                    while let Some(maybe_boundary) = chars.peek() {
                        if !ident_char(maybe_boundary) {
                            break;
                        }
                        chars.next();
                    }
                    location.end = chars.offset();

                    return Some(Ok(Token {
                        kind: TokenKind::Identifier,
                        location,
                    }));
                }
                '"' => {
                    // Strings
                    let mut escape_start = None;
                    while let Some(c) = chars.peek() {
                        // Terminating double quote mark?
                        if c == '"' && escape_start.is_none() {
                            chars.next();
                            location.end = chars.offset();

                            return Some(Ok(Token {
                                kind: TokenKind::String,
                                location,
                            }));
                        }

                        if escape_start.take().is_none() && c == '\\' {
                            escape_start = Some(chars.offset());
                        }

                        chars.next();
                    }
                    location.end = source.len();

                    return Some(Err(Error {
                        location,
                        error: String::from("unterminated string").into_boxed_str(),
                    }));
                }
                _ => {
                    return Some(Err(Error {
                        location,
                        error: String::from("unexpected character").into_boxed_str(),
                    }));
                }
            }
        }

        None
    })
}

#[cfg(test)]
mod test {
    use super::*;

    fn test_tokenizer(
        source: &str,
        expectations: &[Result<(&'static str, TokenKind), &'static str>],
    ) {
        let result = tokenize(source).collect::<Vec<Result<_, _>>>();

        // Every expectation must be matched by a token, and vice versa.
        assert_eq!(expectations.len(), result.len());

        for (idx, expectation) in expectations.iter().enumerate() {
            match *expectation {
                Ok((expected, kind)) => {
                    assert_eq!(
                        expected,
                        result[idx].as_ref().unwrap().location.extract(source)
                    );
                    assert_eq!(kind, result[idx].as_ref().unwrap().kind);
                }
                Err(err) => {
                    assert_eq!(err, result[idx].as_ref().unwrap_err().error.as_ref());
                }
            }
        }
    }

    #[test]
    fn test_lex_numbers() {
        let source = "2 2. 2.3 2.34 23.4 234 0b00 0b10 0b2 0x123 0xf 0xF";
        let expectations = [
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("2.", TokenKind::Float)),
            Ok(("2.3", TokenKind::Float)),
            Ok(("2.34", TokenKind::Float)),
            Ok(("23.4", TokenKind::Float)),
            Ok(("234", TokenKind::DecimalInteger)),
            Ok(("0b00", TokenKind::BinaryInteger)),
            Ok(("0b10", TokenKind::BinaryInteger)),
            Ok(("0b2", TokenKind::BinaryInteger)),
            Ok(("0x123", TokenKind::HexInteger)),
            Ok(("0xf", TokenKind::HexInteger)),
            Ok(("0xF", TokenKind::HexInteger)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lexer() {
        let source = "   \n // **a\n2 \n // b\nfoo,ar \"string\\\"\" \"\" () -> {}";

        let expectations = [
            Ok(("// **a", TokenKind::Comment)),
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("// b", TokenKind::Comment)),
            Ok(("foo", TokenKind::Identifier)),
            Ok((",", TokenKind::Symbol)),
            Ok(("ar", TokenKind::Identifier)),
            Ok(("\"string\\\"\"", TokenKind::String)),
            Ok(("\"\"", TokenKind::String)),
            Ok(("(", TokenKind::Symbol)),
            Ok((")", TokenKind::Symbol)),
            Ok(("->", TokenKind::Symbol)),
            Ok(("{", TokenKind::Symbol)),
            Ok(("}", TokenKind::Symbol)),
        ];

        test_tokenizer(source, &expectations);
    }
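
    // Additional tests, added here as a sketch on top of the original suite:
    // they exercise the two-character operator branch and the lexer's error
    // paths through the same `test_tokenizer` helper.
    #[test]
    fn test_lex_double_symbols() {
        let source = "a == b && c << 2";
        let expectations = [
            Ok(("a", TokenKind::Identifier)),
            Ok(("==", TokenKind::Symbol)),
            Ok(("b", TokenKind::Identifier)),
            Ok(("&&", TokenKind::Symbol)),
            Ok(("c", TokenKind::Identifier)),
            Ok(("<<", TokenKind::Symbol)),
            Ok(("2", TokenKind::DecimalInteger)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lex_errors() {
        // An unterminated string is reported as an error.
        test_tokenizer("\"oops", &[Err("unterminated string")]);
        // A character that fits no token class is rejected.
        test_tokenizer(
            "2 @",
            &[
                Ok(("2", TokenKind::DecimalInteger)),
                Err("unexpected character"),
            ],
        );
    }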
}