1use std::str::CharIndices;
2
3use crate::{Error, Location};
4
/// The lexical class of a token produced by [`tokenize`].
///
/// `Eq` and `Hash` are derived in addition to `PartialEq`: the enum is
/// fieldless, so equality is total (clippy: `derive_partial_eq_without_eq`),
/// and hashing lets callers use kinds as set/map keys.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    Symbol,
    Comment,
    DecimalInteger,
    BinaryInteger,
    HexInteger,
    Float,
    Identifier,
    String,
}
16
/// A single lexed token: its class plus the span of source text it covers.
#[derive(Clone, Copy, Debug)]
pub struct Token {
    /// What kind of lexeme this is (symbol, literal, identifier, ...).
    pub kind: TokenKind,
    /// The span (byte offsets) of the token's text within the source string;
    /// use [`Token::source`] / `Location::extract` to recover the text.
    pub location: Location,
}
22impl Token {
23 pub fn source(self, source: &str) -> &str {
24 self.location.extract(source)
25 }
26}
27
/// True for characters allowed inside an identifier:
/// any alphanumeric, `-`, or `_`.
fn ident_char(c: char) -> bool {
    c.is_alphanumeric() || matches!(c, '-' | '_')
}
31
/// Clonable cursor over a source string's `CharIndices`, with cheap
/// single- and multi-character lookahead.
#[derive(Clone)]
struct CharProvider<'s> {
    chars: CharIndices<'s>,
}

impl CharProvider<'_> {
    /// Consumes and returns the next character, if any.
    fn next(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

    /// Consumes up to `n` characters (fewer if the input ends first).
    fn consume_n(&mut self, n: usize) {
        for _ in 0..n {
            self.chars.next();
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.clone().next().map(|(_, c)| c)
    }

    /// Returns the next `n` characters as a `&str` without consuming them,
    /// or `None` if fewer than `n` characters remain.
    ///
    /// Bug fix: the previous implementation used `.nth(n)`, which inspects
    /// the character *after* the lookahead window, so it required `n + 1`
    /// remaining characters. An `n`-character lookahead at end of input —
    /// e.g. a source ending in `==` or a trailing `//` comment — wrongly
    /// returned `None`, making the tokenizer mis-lex those final tokens.
    fn peek_n(&self, n: usize) -> Option<&str> {
        let start = self.offset();
        let mut lookahead = self.chars.clone();
        for _ in 0..n {
            // Bail out with `None` if the input ends before `n` chars.
            lookahead.next()?;
        }
        // `lookahead.offset()` is the byte offset just past the n-th char,
        // relative to the original source string.
        Some(&self.chars.as_str()[..lookahead.offset() - start])
    }

    /// Byte offset of the cursor within the original source string.
    fn offset(&self) -> usize {
        self.chars.offset()
    }
}
64
/// Alias trait: any clonable iterator yielding `Result<Token, Error>`.
/// Cloning allows callers (e.g. a parser) to snapshot and backtrack.
pub(crate) trait Tokenizer: Iterator<Item = Result<Token, crate::Error>> + Clone {}
/// Blanket impl so every matching iterator automatically is a `Tokenizer`.
impl<I> Tokenizer for I where I: Iterator<Item = Result<Token, crate::Error>> + Clone {}
67
/// Lazily tokenizes `source`, yielding one `Result<Token, Error>` per lexeme.
///
/// Whitespace separates tokens and is skipped; `//` line comments are
/// emitted as `Comment` tokens. Errors are produced for a bare `0x`/`0b`
/// prefix with no digits, an unterminated string, and any character
/// outside the recognized set.
pub(crate) fn tokenize(source: &str) -> impl Tokenizer + '_ {
    let mut chars = CharProvider {
        chars: source.char_indices(),
    };

    // Reused across iterations; `start`/`end` are byte offsets into `source`.
    let mut location = Location { start: 0, end: 0 };

    std::iter::from_fn(move || {
        loop {
            location.start = chars.offset();

            // Two-character lookahead handles comments, two-char operators,
            // and the hex/binary integer prefixes before single-char rules.
            if let Some(doubles) = chars.peek_n(2) {
                match doubles {
                    "//" => {
                        chars.consume_n(2);
                        // Consume up to (but not including) the newline, so
                        // the comment token's span excludes it.
                        while let Some(maybe_newline) = chars.peek() {
                            if maybe_newline == '\n' {
                                break;
                            }

                            chars.next();
                        }
                        location.end = chars.offset();

                        return Some(Ok(Token {
                            kind: TokenKind::Comment,
                            location,
                        }));
                    }
                    "==" | "!=" | "<=" | ">=" | "=>" | "&&" | "||" | "**" | "->" | "<<" | ">>" => {
                        chars.consume_n(2);
                        location.end = chars.offset();
                        return Some(Ok(Token {
                            kind: TokenKind::Symbol,
                            location,
                        }));
                    }
                    "0x" | "0b" => {
                        let kind = if doubles == "0x" {
                            TokenKind::HexInteger
                        } else {
                            TokenKind::BinaryInteger
                        };
                        chars.consume_n(2);

                        // Any identifier char counts as a "digit" here, so
                        // e.g. `0b2` is consumed as one literal token rather
                        // than splitting into number + identifier; digit
                        // *values* are not validated at this stage.
                        // `_` separators are skipped and don't count as digits.
                        let mut has_digit = false;
                        while let Some(maybe_digit) = chars.peek() {
                            if maybe_digit == '_' {
                                chars.next();
                                continue;
                            }
                            if !ident_char(maybe_digit) {
                                break;
                            }
                            has_digit = true;
                            chars.next();
                        }
                        location.end = chars.offset();

                        // A bare `0x`/`0b` prefix with no digits is an error.
                        return if has_digit {
                            Some(Ok(Token { kind, location }))
                        } else {
                            Some(Err(Error {
                                location,
                                error: String::from("invalid numeric literal").into_boxed_str(),
                            }))
                        };
                    }
                    _ => {}
                }
            }

            let Some(next) = chars.next() else {
                // End of input: leave the loop and end the iterator.
                break;
            };
            location.end = chars.offset();

            match next {
                // Whitespace produces no token; loop again for the next one.
                c if c.is_whitespace() => {
                }
                '+' | '-' | '*' | '(' | ')' | '{' | '}' | '[' | ']' | ',' | ';' | '/' | ':'
                | '<' | '>' | '&' | '|' | '^' | '=' | '!' | '%' => {
                    return Some(Ok(Token {
                        kind: TokenKind::Symbol,
                        location,
                    }));
                }
                c if c.is_numeric() => {
                    // Decimal number: digits with optional `_` separators and
                    // at most one `.` (which flips the kind to Float).
                    let mut is_float = false;
                    while let Some(maybe_boundary) = chars.peek() {
                        if !maybe_boundary.is_numeric() {
                            if maybe_boundary == '_' {
                                chars.next();
                                continue;
                            }
                            // A second `.` ends the token, so `1.2.3` lexes
                            // as `1.2` followed by whatever `.3` yields.
                            if maybe_boundary == '.' && is_float {
                                break;
                            }

                            if maybe_boundary == '.' {
                                is_float = true;
                            } else {
                                break;
                            }
                        }
                        chars.next();
                    }
                    location.end = chars.offset();
                    let kind = if is_float {
                        TokenKind::Float
                    } else {
                        TokenKind::DecimalInteger
                    };
                    return Some(Ok(Token { kind, location }));
                }
                c if ident_char(c) => {
                    // Identifier: a run of alphanumerics / `-` / `_`.
                    // (A leading `-` can't reach here — it matched the
                    // symbol arm above.)
                    while let Some(maybe_boundary) = chars.peek() {
                        if !ident_char(maybe_boundary) {
                            break;
                        }
                        chars.next();
                    }
                    location.end = chars.offset();

                    return Some(Ok(Token {
                        kind: TokenKind::Identifier,
                        location,
                    }));
                }
                '"' => {
                    // String literal. `escape_start` is `Some` exactly while
                    // the previous char was an unescaped `\`, so an escaped
                    // `"` (or `\\` pair) does not terminate the string.
                    let mut escape_start = None;
                    while let Some(c) = chars.peek() {
                        if c == '"' && escape_start.is_none() {
                            chars.next();
                            location.end = chars.offset();

                            return Some(Ok(Token {
                                kind: TokenKind::String,
                                location,
                            }));
                        }

                        // `take()` clears a pending escape; a fresh `\`
                        // (not itself escaped) starts a new one.
                        if escape_start.take().is_none() && c == '\\' {
                            escape_start = Some(chars.offset());
                        }

                        chars.next();
                    }
                    // Ran off the end without a closing quote.
                    location.end = source.len();

                    return Some(Err(Error {
                        location,
                        error: String::from("unterminated string").into_boxed_str(),
                    }));
                }
                _ => {
                    return Some(Err(Error {
                        location,
                        error: String::from("unexpected character").into_boxed_str(),
                    }));
                }
            }
        }

        None
    })
}
245
#[cfg(test)]
mod test {
    use super::*;

    /// Runs the tokenizer over `source` and checks every produced item
    /// against `expectations`: `Ok((text, kind))` asserts the token's
    /// extracted source text and kind; `Err(msg)` asserts the error message.
    ///
    /// Fix: the token count is now asserted to match exactly. Previously,
    /// surplus tokens were silently ignored and missing tokens caused an
    /// opaque index panic instead of a clear assertion failure.
    fn test_tokenizer(
        source: &str,
        expectations: &[Result<(&'static str, TokenKind), &'static str>],
    ) {
        let result = tokenize(source).collect::<Vec<Result<_, _>>>();

        assert_eq!(
            expectations.len(),
            result.len(),
            "token count mismatch for source {source:?}"
        );

        for (idx, expectation) in expectations.iter().enumerate() {
            match expectation {
                Ok((expected, kind)) => {
                    let token = result[idx].as_ref().unwrap();
                    assert_eq!(*expected, token.location.extract(source));
                    assert_eq!(*kind, token.kind);
                }
                Err(err) => {
                    assert_eq!(*err, result[idx].as_ref().unwrap_err().error.as_ref());
                }
            }
        }
    }

    #[test]
    fn test_lex_numbers() {
        let source = "2 2. 2.3 2.34 23.4 1_000.4 234 0b00 0b10 0b2 0x123 0xf 0xF 1_000 0x10_00";
        let expectations = [
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("2.", TokenKind::Float)),
            Ok(("2.3", TokenKind::Float)),
            Ok(("2.34", TokenKind::Float)),
            Ok(("23.4", TokenKind::Float)),
            Ok(("1_000.4", TokenKind::Float)),
            Ok(("234", TokenKind::DecimalInteger)),
            Ok(("0b00", TokenKind::BinaryInteger)),
            Ok(("0b10", TokenKind::BinaryInteger)),
            Ok(("0b2", TokenKind::BinaryInteger)),
            Ok(("0x123", TokenKind::HexInteger)),
            Ok(("0xf", TokenKind::HexInteger)),
            Ok(("0xF", TokenKind::HexInteger)),
            Ok(("1_000", TokenKind::DecimalInteger)),
            Ok(("0x10_00", TokenKind::HexInteger)),
        ];

        test_tokenizer(source, &expectations);
    }

    #[test]
    fn test_lexer() {
        let source = " \n // **a\n2 \n // b\nfoo,ar \"string\\\"\" \"\" () -> {}";

        let expectations = [
            Ok(("// **a", TokenKind::Comment)),
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("// b", TokenKind::Comment)),
            Ok(("foo", TokenKind::Identifier)),
            Ok((",", TokenKind::Symbol)),
            Ok(("ar", TokenKind::Identifier)),
            Ok(("\"string\\\"\"", TokenKind::String)),
            Ok(("\"\"", TokenKind::String)),
            Ok(("(", TokenKind::Symbol)),
            Ok((")", TokenKind::Symbol)),
            Ok(("->", TokenKind::Symbol)),
            Ok(("{", TokenKind::Symbol)),
            Ok(("}", TokenKind::Symbol)),
        ];

        test_tokenizer(source, &expectations);
    }
}