1use crate::lexer::codes::{is_token_type, token_type};
2use crate::lexer::cursor::{Cursor, CursorItem};
3use crate::lexer::error::LexerResult;
4use crate::lexer::token::{
5 Bracket, ComparisonOperator, Identifier, LogicalOperator, Operator, Token,
6 TokenKind,
7};
8use crate::lexer::{LexerError, QuotationMark, TemplateString};
9use std::str::FromStr;
10
11#[derive(Debug, Default)]
12pub struct Lexer<'arena> {
13 tokens: Vec<Token<'arena>>,
14}
15
16impl<'arena> Lexer<'arena> {
17 pub fn new() -> Self {
18 Self::default()
19 }
20
21 pub fn tokenize(
22 &mut self,
23 source: &'arena str,
24 ) -> LexerResult<&[Token<'arena>]> {
25 self.tokens.clear();
26
27 Scanner::new(source, &mut self.tokens).scan()?;
28 Ok(&self.tokens)
29 }
30}
31
32struct Scanner<'arena, 'self_ref> {
33 cursor: Cursor<'arena>,
34 tokens: &'self_ref mut Vec<Token<'arena>>,
35 source: &'arena str,
36}
37
38impl<'arena, 'self_ref> Scanner<'arena, 'self_ref> {
39 pub fn new(
40 source: &'arena str,
41 tokens: &'self_ref mut Vec<Token<'arena>>,
42 ) -> Self {
43 Self { cursor: Cursor::from(source), source, tokens }
44 }
45
46 pub fn scan(&mut self) -> LexerResult<()> {
47 while let Some(cursor_item) = self.cursor.peek() {
48 self.scan_cursor_item(cursor_item)?;
49 }
50
51 Ok(())
52 }
53
54 pub(crate) fn scan_cursor_item(
55 &mut self,
56 cursor_item: CursorItem,
57 ) -> LexerResult<()> {
58 let (i, s) = cursor_item;
59
60 match s {
61 token_type!("space") => {
62 self.cursor.next();
63 Ok(())
64 },
65 '\'' => self.string(QuotationMark::SingleQuote),
66 '"' => self.string(QuotationMark::DoubleQuote),
67 token_type!("digit") => self.number(),
68 token_type!("bracket") => self.bracket(),
69 token_type!("cmp_operator") => self.cmp_operator(),
70 token_type!("operator") => self.operator(),
71 token_type!("question_mark") => self.question_mark(),
72 '=' => self.equals(),
73 '`' => self.template_string(),
74 '.' => self.dot(),
75 ';' => self.semi(),
76 token_type!("alpha") => self.identifier(),
77 _ => Err(LexerError::UnmatchedSymbol {
78 symbol: s,
79 position: i as u32,
80 }),
81 }
82 }
83
84 fn next(&self) -> LexerResult<CursorItem> {
85 self.cursor.next().ok_or_else(|| {
86 let (a, b) = self.cursor.peek_back().unwrap_or((0, ' '));
87
88 LexerError::UnexpectedEof { symbol: b, position: a as u32 }
89 })
90 }
91
92 fn push(
93 &mut self,
94 token: Token<'arena>,
95 ) {
96 self.tokens.push(token);
97 }
98
99 fn template_string(&mut self) -> LexerResult<()> {
100 let (start, _) = self.next()?;
101
102 self.tokens.push(Token {
103 kind: TokenKind::QuotationMark(QuotationMark::Backtick),
104 span: (start as u32, (start + 1) as u32),
105 value: QuotationMark::Backtick.into(),
106 });
107
108 let mut in_expression = false;
109 let mut str_start = start + 1;
110 loop {
111 let (e, c) = self.next()?;
112
113 match (c, in_expression) {
114 ('`', _) => {
115 if str_start < e {
116 self.tokens.push(Token {
117 kind: TokenKind::Literal,
118 span: (str_start as u32, e as u32),
119 value: &self.source[str_start..e],
120 });
121 }
122
123 self.tokens.push(Token {
124 kind: TokenKind::QuotationMark(QuotationMark::Backtick),
125 span: (e as u32, (e + 1) as u32),
126 value: QuotationMark::Backtick.into(),
127 });
128
129 break;
130 },
131 ('$', false) => {
132 in_expression = self.cursor.next_if_is("{");
133 if in_expression {
134 self.tokens.push(Token {
135 kind: TokenKind::Literal,
136 span: (str_start as u32, e as u32),
137 value: &self.source[str_start..e],
138 });
139
140 self.tokens.push(Token {
141 kind: TokenKind::TemplateString(
142 TemplateString::ExpressionStart,
143 ),
144 span: (e as u32, (e + 2) as u32),
145 value: TemplateString::ExpressionStart.into(),
146 });
147 }
148 },
149 ('}', true) => {
150 in_expression = false;
151 self.tokens.push(Token {
152 kind: TokenKind::TemplateString(
153 TemplateString::ExpressionEnd,
154 ),
155 span: (str_start as u32, e as u32),
156 value: TemplateString::ExpressionEnd.into(),
157 });
158
159 str_start = e + 1;
160 },
161 (_, false) => {
162 },
164 (_, true) => {
165 self.cursor.back();
166 self.scan_cursor_item((e, c))?;
167 },
168 }
169 }
170
171 Ok(())
172 }
173
174 fn string(
175 &mut self,
176 quote_kind: QuotationMark,
177 ) -> LexerResult<()> {
178 let (start, opener) = self.next()?;
179 let end: usize;
180
181 loop {
182 let (e, c) = self.next()?;
183 if c == opener {
184 end = e;
185 break;
186 }
187 }
188
189 self.push(Token {
190 kind: TokenKind::QuotationMark(quote_kind),
191 span: (start as u32, (start + 1) as u32),
192 value: quote_kind.into(),
193 });
194
195 self.push(Token {
196 kind: TokenKind::Literal,
197 span: ((start + 1) as u32, end as u32),
198 value: &self.source[start + 1..end],
199 });
200
201 self.push(Token {
202 kind: TokenKind::QuotationMark(quote_kind),
203 span: (end as u32, (end + 1) as u32),
204 value: quote_kind.into(),
205 });
206
207 Ok(())
208 }
209
210 fn number(&mut self) -> LexerResult<()> {
211 let (start, _) = self.next()?;
212 let mut end = start;
213 let mut fractal = false;
214
215 while let Some((e, c)) = self
216 .cursor
217 .next_if(|c| is_token_type!(c, "digit") || c == '_' || c == '.')
218 {
219 if fractal && c == '.' {
220 self.cursor.back();
221 break;
222 }
223
224 if c == '.' {
225 if let Some((_, p)) = self.cursor.peek() {
226 if p == '.' {
227 self.cursor.back();
228 break;
229 }
230
231 fractal = true
232 }
233 }
234
235 end = e;
236 }
237
238 if let Some((e_pos, _)) = self.cursor.next_if(|c| c == 'e') {
239 end = e_pos;
240
241 if let Some((sign_pos, _)) =
242 self.cursor.next_if(|c| c == '+' || c == '-')
243 {
244 end = sign_pos;
245 }
246
247 let mut has_exponent_digits = false;
248 while let Some((exp_pos, _)) =
249 self.cursor.next_if(|c| is_token_type!(c, "digit"))
250 {
251 end = exp_pos;
252 has_exponent_digits = true;
253 }
254
255 if !has_exponent_digits {
256 while self.cursor.position() > e_pos {
257 self.cursor.back();
258 }
259
260 end = e_pos - 1;
261 }
262 }
263
264 self.push(Token {
265 kind: TokenKind::Number,
266 span: (start as u32, (end + 1) as u32),
267 value: &self.source[start..=end],
268 });
269
270 Ok(())
271 }
272
273 fn bracket(&mut self) -> LexerResult<()> {
274 let (start, _) = self.next()?;
275
276 let value = &self.source[start..=start];
277 let span = (start as u32, (start + 1) as u32);
278 self.push(Token {
279 kind: TokenKind::Bracket(Bracket::from_str(value).map_err(
280 |_| LexerError::UnexpectedSymbol {
281 symbol: value.to_string(),
282 span,
283 },
284 )?),
285 span,
286 value,
287 });
288
289 Ok(())
290 }
291
292 fn dot(&mut self) -> LexerResult<()> {
293 let (start, _) = self.next()?;
294 let mut end = start;
295
296 if self.cursor.next_if(|c| c == '.').is_some() {
297 end += 1;
298 }
299
300 let value = &self.source[start..=end];
301 let span = (start as u32, (end + 1) as u32);
302 self.push(Token {
303 kind: TokenKind::Operator(Operator::from_str(value).map_err(
304 |_| LexerError::UnexpectedSymbol {
305 symbol: value.to_string(),
306 span,
307 },
308 )?),
309 span,
310 value,
311 });
312
313 Ok(())
314 }
315
316 fn cmp_operator(&mut self) -> LexerResult<()> {
317 let (start, _) = self.next()?;
318 let mut end = start;
319
320 if self.cursor.next_if(|c| c == '=').is_some() {
321 end += 1;
322 }
323
324 let value = &self.source[start..=end];
325 self.push(Token {
326 kind: TokenKind::Operator(Operator::from_str(value).map_err(
327 |_| LexerError::UnexpectedSymbol {
328 symbol: value.to_string(),
329 span: (start as u32, (end + 1) as u32),
330 },
331 )?),
332 span: (start as u32, (end + 1) as u32),
333 value,
334 });
335
336 Ok(())
337 }
338
339 fn semi(&mut self) -> LexerResult<()> {
340 let (start, _) = self.next()?;
341 self.push(Token {
342 kind: TokenKind::Operator(Operator::Semi),
343 span: (start as u32, (start + 1) as u32),
344 value: &self.source[start..=start],
345 });
346
347 Ok(())
348 }
349
350 fn equals(&mut self) -> LexerResult<()> {
351 let (start, _) = self.next()?;
352 let Some((end, _)) = self.cursor.next_if(|c| c == '=') else {
353 self.push(Token {
354 kind: TokenKind::Operator(Operator::Assign),
355 span: (start as u32, (start + 1) as u32),
356 value: &self.source[start..=start],
357 });
358
359 return Ok(());
360 };
361
362 self.push(Token {
363 kind: TokenKind::Operator(Operator::Comparison(
364 ComparisonOperator::Equal,
365 )),
366 span: (start as u32, (end + 1) as u32),
367 value: &self.source[start..=end],
368 });
369
370 Ok(())
371 }
372
373 fn question_mark(&mut self) -> LexerResult<()> {
374 let (start, _) = self.next()?;
375 let mut kind = TokenKind::Operator(Operator::QuestionMark);
376 let mut end = start;
377
378 if self.cursor.next_if(|c| c == '?').is_some() {
379 kind = TokenKind::Operator(Operator::Logical(
380 LogicalOperator::NullishCoalescing,
381 ));
382 end += 1;
383 }
384
385 let value = &self.source[start..=end];
386 self.push(Token {
387 kind,
388 value,
389 span: (start as u32, (end + 1) as u32),
390 });
391
392 Ok(())
393 }
394
395 fn operator(&mut self) -> LexerResult<()> {
396 let (start, _) = self.next()?;
397
398 let value = &self.source[start..=start];
399 let span = (start as u32, (start + 1) as u32);
400 self.push(Token {
401 kind: TokenKind::Operator(Operator::from_str(value).map_err(
402 |_| LexerError::UnexpectedSymbol {
403 symbol: value.to_string(),
404 span,
405 },
406 )?),
407 span,
408 value,
409 });
410
411 Ok(())
412 }
413
414 fn not(
415 &mut self,
416 start: usize,
417 ) -> LexerResult<()> {
418 if self.cursor.next_if_is(" in ") {
419 let end = self.cursor.position();
420
421 self.push(Token {
422 kind: TokenKind::Operator(Operator::Comparison(
423 ComparisonOperator::NotIn,
424 )),
425 span: (start as u32, (end - 1) as u32),
426 value: "not in",
427 })
428 } else {
429 let end = self.cursor.position();
430
431 self.push(Token {
432 kind: TokenKind::Operator(Operator::Logical(
433 LogicalOperator::Not,
434 )),
435 span: (start as u32, end as u32),
436 value: "not",
437 })
438 }
439
440 Ok(())
441 }
442
443 fn identifier(&mut self) -> LexerResult<()> {
444 let (start, _) = self.next()?;
445 let mut end = start;
446
447 while let Some((e, _)) =
448 self.cursor.next_if(|c| is_token_type!(c, "alphanumeric"))
449 {
450 end = e;
451 }
452
453 let value = &self.source[start..=end];
454 match value {
455 "and" => self.push(Token {
456 kind: TokenKind::Operator(Operator::Logical(
457 LogicalOperator::And,
458 )),
459 span: (start as u32, (end + 1) as u32),
460 value,
461 }),
462 "or" => self.push(Token {
463 kind: TokenKind::Operator(Operator::Logical(
464 LogicalOperator::Or,
465 )),
466 span: (start as u32, (end + 1) as u32),
467 value,
468 }),
469 "in" => self.push(Token {
470 kind: TokenKind::Operator(Operator::Comparison(
471 ComparisonOperator::In,
472 )),
473 span: (start as u32, (end + 1) as u32),
474 value,
475 }),
476 "true" => self.push(Token {
477 kind: TokenKind::Boolean(true),
478 span: (start as u32, (end + 1) as u32),
479 value,
480 }),
481 "false" => self.push(Token {
482 kind: TokenKind::Boolean(false),
483 span: (start as u32, (end + 1) as u32),
484 value,
485 }),
486 "not" => self.not(start)?,
487 _ => self.push(Token {
488 kind: Identifier::try_from(value)
489 .map(|identifier| TokenKind::Identifier(identifier))
490 .unwrap_or(TokenKind::Literal),
491 span: (start as u32, (end + 1) as u32),
492 value,
493 }),
494 }
495
496 Ok(())
497 }
498}