1use std::str::CharIndices;
2
3use crate::{Error, Location};
4
/// The category of a lexed token.
///
/// Fieldless, so it is `Copy` and cheap to compare; `Eq` and `Hash` are
/// derived so kinds can be used in sets/maps and compared totally.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Operators and punctuation (one or two characters, e.g. `+`, `==`).
    Symbol,
    /// A `//` line comment (comments are yielded as tokens, not skipped).
    Comment,
    /// A base-10 integer literal, e.g. `234`.
    DecimalInteger,
    /// A `0b`-prefixed literal, e.g. `0b10`.
    BinaryInteger,
    /// A `0x`-prefixed literal, e.g. `0xF`.
    HexInteger,
    /// A literal containing a decimal point, e.g. `2.3` (also `2.`).
    Float,
    /// An identifier: alphanumerics plus `-` and `_`.
    Identifier,
    /// A `"`-delimited string literal, including its quotes.
    String,
}
16
/// A single lexed token: its kind plus the byte range it occupies in the
/// source text. Carries no copy of the text itself; use [`Token::source`]
/// to recover it from the original input.
#[derive(Clone, Copy, Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub location: Location,
}
impl Token {
    /// Returns the token's text, sliced out of the original `source` string
    /// via its recorded location. `source` must be the same string the
    /// token was lexed from — offsets into any other string are meaningless.
    pub fn source(self, source: &str) -> &str {
        self.location.extract(source)
    }
}
27
/// True for characters that may appear in an identifier: any Unicode
/// alphanumeric character, plus `-` and `_`.
fn ident_char(c: char) -> bool {
    matches!(c, '-' | '_') || c.is_alphanumeric()
}
31
/// A cloneable cursor over the source text. Wraps `CharIndices` so the
/// tokenizer can cheaply clone it for lookahead while tracking byte offsets.
#[derive(Clone)]
struct CharProvider<'s> {
    chars: CharIndices<'s>,
}

impl CharProvider<'_> {
    /// Consumes and returns the next character, or `None` at end of input.
    fn next(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

    /// Advances by up to `n` characters (fewer if the input ends first).
    fn consume_n(&mut self, n: usize) {
        for _ in 0..n {
            self.chars.next();
        }
    }

    /// Returns the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.chars.clone().next().map(|(_, c)| c)
    }

    /// Returns the next `n` characters as a `&str` without consuming them,
    /// or `None` if fewer than `n` characters remain.
    ///
    /// Bug fix: the previous implementation relied solely on the byte index
    /// of the character *after* the window (`.nth(n)`), so it required n+1
    /// remaining characters and returned `None` when exactly `n` remained.
    /// That made two-character tokens (`//`, `==`, `->`, `0x`, ...) at the
    /// very end of the input unrecognizable.
    fn peek_n(&self, n: usize) -> Option<&str> {
        let rest = self.chars.as_str();
        match rest.char_indices().nth(n) {
            // The (n+1)-th character's byte index is the end of the window.
            Some((end, _)) => Some(&rest[..end]),
            // Fewer than n + 1 characters remain: the whole remainder is the
            // window iff it holds exactly n characters.
            None => (rest.chars().count() == n).then_some(rest),
        }
    }

    /// Byte offset of the cursor within the original source string.
    fn offset(&self) -> usize {
        self.chars.offset()
    }
}
64
/// Marker trait for the token stream produced by [`tokenize`]: a cloneable
/// iterator of `Result<Token, Error>`. `Clone` lets a parser snapshot the
/// stream for backtracking.
pub(crate) trait Tokenizer: Iterator<Item = Result<Token, crate::Error>> + Clone {}
/// Blanket impl: any cloneable iterator with the right item type qualifies.
impl<I> Tokenizer for I where I: Iterator<Item = Result<Token, crate::Error>> + Clone {}
67
/// Lazily tokenizes `source`, yielding one `Result<Token, Error>` per token.
///
/// Whitespace is skipped; `//` comments ARE yielded (as
/// [`TokenKind::Comment`], spanning to — but not including — the newline).
/// Errors are produced for `0x`/`0b` prefixes with no following characters,
/// unterminated strings, and characters no rule matches. The iterator is
/// `Clone`, so callers may snapshot lexer state for lookahead.
pub(crate) fn tokenize(source: &str) -> impl Tokenizer + '_ {
    let mut chars = CharProvider {
        chars: source.char_indices(),
    };

    // Reused across iterations; start/end are byte offsets into `source`.
    let mut location = Location { start: 0, end: 0 };

    std::iter::from_fn(move || {
        // Loop so whitespace can be skipped without yielding anything.
        loop {
            location.start = chars.offset();

            // First, try every rule that needs two characters of lookahead.
            if let Some(doubles) = chars.peek_n(2) {
                match doubles {
                    // Line comment: consume to end of line (newline is left
                    // in the stream and skipped as whitespace next round).
                    "//" => {
                        chars.consume_n(2);
                        while let Some(maybe_newline) = chars.peek() {
                            if maybe_newline == '\n' {
                                break;
                            }

                            chars.next();
                        }
                        location.end = chars.offset();

                        return Some(Ok(Token {
                            kind: TokenKind::Comment,
                            location,
                        }));
                    }
                    // Two-character operators; must be checked before the
                    // single-character symbol rule below.
                    "==" | "!=" | "<=" | ">=" | "=>" | "&&" | "||" | "**" | "->" | "<<" | ">>" => {
                        chars.consume_n(2);
                        location.end = chars.offset();
                        return Some(Ok(Token {
                            kind: TokenKind::Symbol,
                            location,
                        }));
                    }
                    // Hex/binary literal prefix.
                    "0x" | "0b" => {
                        let kind = if doubles == "0x" {
                            TokenKind::HexInteger
                        } else {
                            TokenKind::BinaryInteger
                        };
                        chars.consume_n(2);

                        // NOTE: any identifier character is accepted as a
                        // "digit" here (the unit tests pin `0b2` as a valid
                        // BinaryInteger) — presumably digit validity is
                        // checked at a later stage; confirm in the parser.
                        let mut has_digit = false;
                        while let Some(maybe_digit) = chars.peek() {
                            if !ident_char(maybe_digit) {
                                break;
                            }
                            has_digit = true;
                            chars.next();
                        }
                        location.end = chars.offset();

                        // A bare prefix (`0x` / `0b` with nothing after) is
                        // an error; the span covers just the prefix.
                        return if has_digit {
                            Some(Ok(Token { kind, location }))
                        } else {
                            Some(Err(Error {
                                location,
                                error: String::from("invalid numeric literal").into_boxed_str(),
                            }))
                        };
                    }
                    // Not a two-character token: fall through to the
                    // single-character rules.
                    _ => {}
                }
            }

            // End of input: terminate the iterator.
            let Some(next) = chars.next() else {
                break;
            };
            location.end = chars.offset();

            match next {
                // Whitespace yields nothing; loop back for the next token.
                c if c.is_whitespace() => {
                }
                // Single-character operators and punctuation.
                '+' | '-' | '*' | '(' | ')' | '{' | '}' | '[' | ']' | ',' | ';' | '/' | ':'
                | '<' | '>' | '&' | '|' | '^' | '=' | '!' | '%' => {
                    return Some(Ok(Token {
                        kind: TokenKind::Symbol,
                        location,
                    }));
                }
                // Decimal integer or float. At most one '.' is absorbed:
                // a second '.' ends the token (so `2.` lexes as a Float).
                c if c.is_numeric() => {
                    let mut is_float = false;
                    while let Some(maybe_boundary) = chars.peek() {
                        if !maybe_boundary.is_numeric() {
                            if maybe_boundary == '.' && is_float {
                                break;
                            }

                            if maybe_boundary == '.' {
                                is_float = true;
                            } else {
                                break;
                            }
                        }
                        chars.next();
                    }
                    location.end = chars.offset();
                    let kind = if is_float {
                        TokenKind::Float
                    } else {
                        TokenKind::DecimalInteger
                    };
                    return Some(Ok(Token { kind, location }));
                }
                // Identifier: greedily absorb identifier characters.
                // (Numeric-leading inputs were caught by the arm above.)
                c if ident_char(c) => {
                    while let Some(maybe_boundary) = chars.peek() {
                        if !ident_char(maybe_boundary) {
                            break;
                        }
                        chars.next();
                    }
                    location.end = chars.offset();

                    return Some(Ok(Token {
                        kind: TokenKind::Identifier,
                        location,
                    }));
                }
                // String literal; the yielded span includes both quotes and
                // keeps escapes verbatim (e.g. `\"` stays two characters).
                '"' => {
                    // `Some(offset)` means the previous character was an
                    // unconsumed-escape backslash, so the current character
                    // is escaped. `.take()` clears it each step, which makes
                    // `\\` escape itself rather than the following quote.
                    let mut escape_start = None;
                    while let Some(c) = chars.peek() {
                        if c == '"' && escape_start.is_none() {
                            chars.next();
                            location.end = chars.offset();

                            return Some(Ok(Token {
                                kind: TokenKind::String,
                                location,
                            }));
                        }

                        if escape_start.take().is_none() && c == '\\' {
                            escape_start = Some(chars.offset());
                        }

                        chars.next();
                    }
                    // Ran off the end without a closing quote: report the
                    // whole rest of the input as the error span.
                    location.end = source.len();

                    return Some(Err(Error {
                        location,
                        error: String::from("unterminated string").into_boxed_str(),
                    }));
                }
                // No rule matched this character.
                _ => {
                    return Some(Err(Error {
                        location,
                        error: String::from("unexpected character").into_boxed_str(),
                    }));
                }
            }
        }

        None
    })
}
235
#[cfg(test)]
mod test {
    use super::*;

    /// Runs the tokenizer over `source` and checks every produced token (or
    /// error) against `expectations`, in order.
    ///
    /// Also asserts that the number of produced items equals the number of
    /// expectations, so extra trailing tokens fail the test instead of
    /// passing unnoticed (previously only the first `expectations.len()`
    /// items were inspected).
    fn test_tokenizer(
        source: &str,
        expectations: &[Result<(&'static str, TokenKind), &'static str>],
    ) {
        let result = tokenize(source).collect::<Vec<Result<_, _>>>();

        assert_eq!(
            expectations.len(),
            result.len(),
            "token count mismatch: {result:?}"
        );

        for (idx, expectation) in expectations.iter().enumerate() {
            match expectation {
                Ok((expected, kind)) => {
                    let token = result[idx].as_ref().unwrap();
                    assert_eq!(*expected, token.location.extract(source));
                    assert_eq!(*kind, token.kind);
                }
                Err(err) => {
                    assert_eq!(*err, result[idx].as_ref().unwrap_err().error.as_ref());
                }
            }
        }
    }

    /// Numeric literals: decimal, trailing-dot floats, and the lax
    /// binary/hex digit rule (`0b2` is accepted at lex time).
    #[test]
    fn test_lex_numbers() {
        let source = "2 2. 2.3 2.34 23.4 234 0b00 0b10 0b2 0x123 0xf 0xF";
        let expectations = [
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("2.", TokenKind::Float)),
            Ok(("2.3", TokenKind::Float)),
            Ok(("2.34", TokenKind::Float)),
            Ok(("23.4", TokenKind::Float)),
            Ok(("234", TokenKind::DecimalInteger)),
            Ok(("0b00", TokenKind::BinaryInteger)),
            Ok(("0b10", TokenKind::BinaryInteger)),
            Ok(("0b2", TokenKind::BinaryInteger)),
            Ok(("0x123", TokenKind::HexInteger)),
            Ok(("0xf", TokenKind::HexInteger)),
            Ok(("0xF", TokenKind::HexInteger)),
        ];

        test_tokenizer(source, &expectations);
    }

    /// Mixed stream: comments, identifiers, symbols, escaped and empty
    /// strings, and a two-character operator.
    #[test]
    fn test_lexer() {
        let source = " \n // **a\n2 \n // b\nfoo,ar \"string\\\"\" \"\" () -> {}";

        let expectations = [
            Ok(("// **a", TokenKind::Comment)),
            Ok(("2", TokenKind::DecimalInteger)),
            Ok(("// b", TokenKind::Comment)),
            Ok(("foo", TokenKind::Identifier)),
            Ok((",", TokenKind::Symbol)),
            Ok(("ar", TokenKind::Identifier)),
            Ok(("\"string\\\"\"", TokenKind::String)),
            Ok(("\"\"", TokenKind::String)),
            Ok(("(", TokenKind::Symbol)),
            Ok((")", TokenKind::Symbol)),
            Ok(("->", TokenKind::Symbol)),
            Ok(("{", TokenKind::Symbol)),
            Ok(("}", TokenKind::Symbol)),
        ];

        test_tokenizer(source, &expectations);
    }
}