1use std::str::CharIndices;
2
/// The category of a lexing failure.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ErrorKind {
    /// A character that cannot begin any token was encountered.
    UnexpectedCharacter,
    /// The input ended before a string literal's closing `"` was found.
    UnterminatedString,
    /// A `0x` / `0b` prefix was not followed by any digit characters.
    InvalidNumericLiteral,
}

impl std::fmt::Display for ErrorKind {
    /// Writes the short, lower-case description of the error category.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match *self {
            ErrorKind::UnexpectedCharacter => "unexpected character",
            ErrorKind::UnterminatedString => "unterminated string",
            ErrorKind::InvalidNumericLiteral => "invalid numeric literal",
        };
        f.write_str(message)
    }
}
19
20#[derive(Debug, Clone, PartialEq)]
21pub struct LexerError {
22 pub location: Location,
23 pub error: ErrorKind,
24}
25
26impl std::fmt::Display for LexerError {
27 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28 write!(f, "Lexer error: {}", self.error)
29 }
30}
31
/// A half-open byte range `start..end` into the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Location {
    /// Byte offset of the first byte of the span.
    pub start: usize,
    /// Byte offset one past the last byte of the span.
    pub end: usize,
}

impl Location {
    /// Returns the empty placeholder location `0..0`.
    pub const fn dummy() -> Self {
        Self { start: 0, end: 0 }
    }

    /// Returns the slice of `source` covered by this location.
    ///
    /// # Panics
    ///
    /// Panics if the range is out of bounds for `source` or does not fall on
    /// `char` boundaries (the usual `str` slicing rules).
    pub fn extract(self, source: &str) -> &str {
        let Self { start, end } = self;
        &source[start..end]
    }
}
49
/// The lexical category of a token.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenKind {
    /// A one- or two-character operator or punctuation mark, e.g. `+` or `->`.
    Symbol,
    /// A `//` line comment, up to but not including the newline.
    Comment,
    /// A run of digits without a `.`.
    DecimalInteger,
    /// A literal introduced by the `0b` prefix.
    BinaryInteger,
    /// A literal introduced by the `0x` prefix.
    HexInteger,
    /// A run of digits containing a single `.` (a trailing dot, as in `2.`, counts).
    Float,
    /// A run of alphanumeric characters, `-` and `_`.
    Identifier,
    /// A double-quoted string literal, including both quotes.
    String,
}
61
62#[derive(Clone, Copy, Debug)]
63pub struct Token {
64 pub kind: TokenKind,
65 pub location: Location,
66}
67impl Token {
68 pub fn source(self, source: &str) -> &str {
69 self.location.extract(source)
70 }
71}
72
/// Returns `true` if `c` may appear in an identifier: any alphanumeric
/// character (Unicode-aware), `-` or `_`.
fn ident_char(c: char) -> bool {
    matches!(c, '-' | '_') || c.is_alphanumeric()
}
76
/// A cursor over the source text with single- and multi-character lookahead.
struct CharProvider<'s> {
    /// Remaining input; also tracks the byte offset of the cursor.
    chars: CharIndices<'s>,
}

impl CharProvider<'_> {
    /// Consumes and returns the next character, or `None` at end of input.
    fn next(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

    /// Consumes up to `n` characters, stopping early at end of input.
    fn consume_n(&mut self, n: usize) {
        // `nth(k)` consumes `k + 1` items, so `n == 0` must consume nothing.
        if let Some(last) = n.checked_sub(1) {
            self.chars.nth(last);
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.as_str().chars().next()
    }

    /// Returns the next `n` characters as a slice without consuming them, or
    /// `None` if fewer than `n` characters remain.
    ///
    /// The end of the window is derived from its own last character rather
    /// than from the character that follows it, so a window that reaches
    /// exactly to the end of the input is still returned.
    fn peek_n(&self, n: usize) -> Option<&str> {
        let rest = self.chars.as_str();
        let Some(last) = n.checked_sub(1) else {
            // An empty window is always available.
            return Some("");
        };
        let (idx, c) = rest.char_indices().nth(last)?;
        Some(&rest[..idx + c.len_utf8()])
    }

    /// Returns the byte offset of the next unread character, or the length of
    /// the source once the input is exhausted.
    fn offset(&self) -> usize {
        self.chars.offset()
    }
}
108
109pub fn tokenize(source: &str) -> impl Iterator<Item = Result<Token, LexerError>> + '_ {
110 let mut chars = CharProvider {
111 chars: source.char_indices(),
112 };
113
114 let mut location = Location { start: 0, end: 0 };
115
116 std::iter::from_fn(move || {
117 loop {
118 location.start = chars.offset();
120
121 if let Some(doubles) = chars.peek_n(2) {
123 match doubles {
124 "//" => {
125 chars.consume_n(2);
126 while let Some(maybe_newline) = chars.peek() {
127 if maybe_newline == '\n' {
128 break;
129 }
130
131 chars.next();
132 }
133 location.end = chars.offset();
134
135 return Some(Ok(Token {
136 kind: TokenKind::Comment,
137 location,
138 }));
139 }
140 "==" | "!=" | "<=" | ">=" | "=>" | "&&" | "||" | "**" | "->" | "<<" | ">>" => {
141 chars.consume_n(2);
142 location.end = chars.offset();
143 return Some(Ok(Token {
144 kind: TokenKind::Symbol,
145 location,
146 }));
147 }
148 "0x" | "0b" => {
149 let kind = if doubles == "0x" {
150 TokenKind::HexInteger
151 } else {
152 TokenKind::BinaryInteger
153 };
154 chars.consume_n(2);
155
156 let mut has_digit = false;
157 while let Some(maybe_digit) = chars.peek() {
158 if !ident_char(maybe_digit) {
159 break;
160 }
161 has_digit = true;
162 chars.next();
163 }
164 location.end = chars.offset();
165
166 return if has_digit {
167 Some(Ok(Token { kind, location }))
168 } else {
169 Some(Err(LexerError {
170 location,
171 error: ErrorKind::InvalidNumericLiteral,
172 }))
173 };
174 }
175 _ => {}
176 }
177 }
178
179 let Some(next) = chars.next() else {
181 break;
182 };
183 location.end = chars.offset();
184
185 match next {
186 c if c.is_whitespace() => {
187 }
189 '+' | '-' | '*' | '(' | ')' | '{' | '}' | '[' | ']' | ',' | ';' | '/' | ':'
190 | '<' | '>' | '&' | '|' | '^' | '=' | '!' => {
191 return Some(Ok(Token {
192 kind: TokenKind::Symbol,
193 location,
194 }));
195 }
196 c if c.is_numeric() => {
197 let mut is_float = false;
199 while let Some(maybe_boundary) = chars.peek() {
200 if !maybe_boundary.is_numeric() {
201 if maybe_boundary == '.' && is_float {
202 break;
203 }
204
205 if maybe_boundary == '.' {
206 is_float = true;
207 } else {
208 break;
209 }
210 }
211 chars.next();
212 }
213 location.end = chars.offset();
214 let kind = if is_float {
215 TokenKind::Float
216 } else {
217 TokenKind::DecimalInteger
218 };
219 return Some(Ok(Token { kind, location }));
220 }
221 c if ident_char(c) => {
222 while let Some(maybe_boundary) = chars.peek() {
224 if !ident_char(maybe_boundary) {
225 break;
226 }
227 chars.next();
228 }
229 location.end = chars.offset();
230
231 return Some(Ok(Token {
232 kind: TokenKind::Identifier,
233 location,
234 }));
235 }
236 '"' => {
237 let mut escape_start = None;
239 while let Some(c) = chars.peek() {
240 if c == '"' && escape_start.is_none() {
242 chars.next();
243 location.end = chars.offset();
244
245 return Some(Ok(Token {
246 kind: TokenKind::String,
247 location,
248 }));
249 }
250
251 if escape_start.take().is_none() && c == '\\' {
252 escape_start = Some(chars.offset());
253 }
254
255 chars.next();
256 }
257 location.end = source.len();
258
259 return Some(Err(LexerError {
260 location,
261 error: ErrorKind::UnterminatedString,
262 }));
263 }
264 _ => {
265 return Some(Err(LexerError {
266 location,
267 error: ErrorKind::UnexpectedCharacter,
268 }));
269 }
270 }
271 }
272
273 None
274 })
275}
276
#[cfg(test)]
mod test {
    use super::*;

    /// Lexes `source` and checks the produced items against `expectations`.
    ///
    /// An `Ok((text, kind))` expectation must match a token with that source
    /// text and kind; an `Err(kind)` expectation must match a lexer error of
    /// that kind. The number of produced items must equal the number of
    /// expectations, so surplus trailing tokens are reported as failures.
    fn test_tokenizer(
        source: &str,
        expectations: &[Result<(&'static str, TokenKind), ErrorKind>],
    ) {
        let result = tokenize(source).collect::<Vec<Result<_, _>>>();

        assert_eq!(
            expectations.len(),
            result.len(),
            "unexpected number of items: {result:?}"
        );

        for (idx, (expectation, actual)) in expectations.iter().zip(&result).enumerate() {
            match (expectation, actual) {
                (Ok((text, kind)), Ok(token)) => {
                    assert_eq!(*text, token.location.extract(source), "text of item #{idx}");
                    assert_eq!(*kind, token.kind, "kind of item #{idx}");
                }
                (Err(expected), Err(failure)) => {
                    assert_eq!(*expected, failure.error, "error of item #{idx}");
                }
                _ => panic!("item #{idx}: expected {expectation:?}, got {actual:?}"),
            }
        }
    }

    #[test]
    fn test_lex_numbers() {
        let source = "2 2. 2.3 2.34 23.4 234 0b00 0b10 0b2 0x123 0xf 0xF";
        let expectations = [
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("2.", TokenKind::Float)),
            Ok(("2.3", TokenKind::Float)),
            Ok(("2.34", TokenKind::Float)),
            Ok(("23.4", TokenKind::Float)),
            Ok(("234", TokenKind::DecimalInteger)),
            Ok(("0b00", TokenKind::BinaryInteger)),
            Ok(("0b10", TokenKind::BinaryInteger)),
            Ok(("0b2", TokenKind::BinaryInteger)),
            Ok(("0x123", TokenKind::HexInteger)),
            Ok(("0xf", TokenKind::HexInteger)),
            Ok(("0xF", TokenKind::HexInteger)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lexer() {
        let source = " \n // **a\n2 \n // b\nfoo,ar \"string\\\"\" \"\" () -> {}";

        let expectations = [
            Ok(("// **a", TokenKind::Comment)),
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("// b", TokenKind::Comment)),
            Ok(("foo", TokenKind::Identifier)),
            Ok((",", TokenKind::Symbol)),
            Ok(("ar", TokenKind::Identifier)),
            Ok(("\"string\\\"\"", TokenKind::String)),
            Ok(("\"\"", TokenKind::String)),
            Ok(("(", TokenKind::Symbol)),
            Ok((")", TokenKind::Symbol)),
            Ok(("->", TokenKind::Symbol)),
            Ok(("{", TokenKind::Symbol)),
            Ok(("}", TokenKind::Symbol)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lex_errors() {
        // Errors are reported in place and lexing resumes after each of them.
        let source = "a 0x $ \"abc";

        let expectations = [
            Ok(("a", TokenKind::Identifier)),
            Err(ErrorKind::InvalidNumericLiteral),
            Err(ErrorKind::UnexpectedCharacter),
            Err(ErrorKind::UnterminatedString),
        ];

        test_tokenizer(source, &expectations);
    }
}