1use crate::lexer::codes::{is_token_type, token_type};
2use crate::lexer::cursor::{Cursor, CursorItem};
3use crate::lexer::error::LexerResult;
4use crate::lexer::token::{
5 Bracket, ComparisonOperator, Identifier, LogicalOperator, Operator, Token, TokenKind,
6};
7use crate::lexer::{LexerError, QuotationMark, TemplateString};
8use std::str::FromStr;
9
10#[derive(Debug, Default)]
11pub struct Lexer<'arena> {
12 tokens: Vec<Token<'arena>>,
13}
14
15impl<'arena> Lexer<'arena> {
16 pub fn new() -> Self {
17 Self::default()
18 }
19
20 pub fn tokenize(&mut self, source: &'arena str) -> LexerResult<&[Token<'arena>]> {
21 self.tokens.clear();
22
23 Scanner::new(source, &mut self.tokens).scan()?;
24 Ok(&self.tokens)
25 }
26}
27
28struct Scanner<'arena, 'self_ref> {
29 cursor: Cursor<'arena>,
30 tokens: &'self_ref mut Vec<Token<'arena>>,
31 source: &'arena str,
32}
33
34impl<'arena, 'self_ref> Scanner<'arena, 'self_ref> {
35 pub fn new(source: &'arena str, tokens: &'self_ref mut Vec<Token<'arena>>) -> Self {
36 Self {
37 cursor: Cursor::from(source),
38 source,
39 tokens,
40 }
41 }
42
43 pub fn scan(&mut self) -> LexerResult<()> {
44 while let Some(cursor_item) = self.cursor.peek() {
45 self.scan_cursor_item(cursor_item)?;
46 }
47
48 Ok(())
49 }
50
51 pub(crate) fn scan_cursor_item(&mut self, cursor_item: CursorItem) -> LexerResult<()> {
52 let (i, s) = cursor_item;
53
54 match s {
55 token_type!("space") => {
56 self.cursor.next();
57 Ok(())
58 }
59 '\'' => self.string(QuotationMark::SingleQuote),
60 '"' => self.string(QuotationMark::DoubleQuote),
61 token_type!("digit") => self.number(),
62 token_type!("bracket") => self.bracket(),
63 token_type!("cmp_operator") => self.cmp_operator(),
64 token_type!("operator") => self.operator(),
65 token_type!("question_mark") => self.question_mark(),
66 '`' => self.template_string(),
67 '.' => self.dot(),
68 token_type!("alpha") => self.identifier(),
69 _ => Err(LexerError::UnmatchedSymbol {
70 symbol: s,
71 position: i as u32,
72 }),
73 }
74 }
75
76 fn next(&self) -> LexerResult<CursorItem> {
77 self.cursor.next().ok_or_else(|| {
78 let (a, b) = self.cursor.peek_back().unwrap_or((0, ' '));
79
80 LexerError::UnexpectedEof {
81 symbol: b,
82 position: a as u32,
83 }
84 })
85 }
86
87 fn push(&mut self, token: Token<'arena>) {
88 self.tokens.push(token);
89 }
90
91 fn template_string(&mut self) -> LexerResult<()> {
92 let (start, _) = self.next()?;
93
94 self.tokens.push(Token {
95 kind: TokenKind::QuotationMark(QuotationMark::Backtick),
96 span: (start as u32, (start + 1) as u32),
97 value: QuotationMark::Backtick.into(),
98 });
99
100 let mut in_expression = false;
101 let mut str_start = start + 1;
102 loop {
103 let (e, c) = self.next()?;
104
105 match (c, in_expression) {
106 ('`', _) => {
107 if str_start < e {
108 self.tokens.push(Token {
109 kind: TokenKind::Literal,
110 span: (str_start as u32, e as u32),
111 value: &self.source[str_start..e],
112 });
113 }
114
115 self.tokens.push(Token {
116 kind: TokenKind::QuotationMark(QuotationMark::Backtick),
117 span: (e as u32, (e + 1) as u32),
118 value: QuotationMark::Backtick.into(),
119 });
120
121 break;
122 }
123 ('$', false) => {
124 in_expression = self.cursor.next_if_is("{");
125 if in_expression {
126 self.tokens.push(Token {
127 kind: TokenKind::Literal,
128 span: (str_start as u32, e as u32),
129 value: &self.source[str_start..e],
130 });
131
132 self.tokens.push(Token {
133 kind: TokenKind::TemplateString(TemplateString::ExpressionStart),
134 span: (e as u32, (e + 2) as u32),
135 value: TemplateString::ExpressionStart.into(),
136 });
137 }
138 }
139 ('}', true) => {
140 in_expression = false;
141 self.tokens.push(Token {
142 kind: TokenKind::TemplateString(TemplateString::ExpressionEnd),
143 span: (str_start as u32, e as u32),
144 value: TemplateString::ExpressionEnd.into(),
145 });
146
147 str_start = e + 1;
148 }
149 (_, false) => {
150 }
152 (_, true) => {
153 self.cursor.back();
154 self.scan_cursor_item((e, c))?;
155 }
156 }
157 }
158
159 Ok(())
160 }
161
162 fn string(&mut self, quote_kind: QuotationMark) -> LexerResult<()> {
163 let (start, opener) = self.next()?;
164 let end: usize;
165
166 loop {
167 let (e, c) = self.next()?;
168 if c == opener {
169 end = e;
170 break;
171 }
172 }
173
174 self.push(Token {
175 kind: TokenKind::QuotationMark(quote_kind),
176 span: (start as u32, (start + 1) as u32),
177 value: quote_kind.into(),
178 });
179
180 self.push(Token {
181 kind: TokenKind::Literal,
182 span: ((start + 1) as u32, end as u32),
183 value: &self.source[start + 1..end],
184 });
185
186 self.push(Token {
187 kind: TokenKind::QuotationMark(quote_kind),
188 span: (end as u32, (end + 1) as u32),
189 value: quote_kind.into(),
190 });
191
192 Ok(())
193 }
194
195 fn number(&mut self) -> LexerResult<()> {
196 let (start, _) = self.next()?;
197 let mut end = start;
198 let mut fractal = false;
199
200 while let Some((e, c)) = self
201 .cursor
202 .next_if(|c| is_token_type!(c, "digit") || c == '_' || c == '.')
203 {
204 if fractal && c == '.' {
205 self.cursor.back();
206 break;
207 }
208
209 if c == '.' {
210 if let Some((_, p)) = self.cursor.peek() {
211 if p == '.' {
212 self.cursor.back();
213 break;
214 }
215
216 fractal = true
217 }
218 }
219
220 end = e;
221 }
222
223 if let Some((e_pos, _)) = self.cursor.next_if(|c| c == 'e') {
224 end = e_pos;
225
226 if let Some((sign_pos, _)) = self.cursor.next_if(|c| c == '+' || c == '-') {
227 end = sign_pos;
228 }
229
230 let mut has_exponent_digits = false;
231 while let Some((exp_pos, _)) = self.cursor.next_if(|c| is_token_type!(c, "digit")) {
232 end = exp_pos;
233 has_exponent_digits = true;
234 }
235
236 if !has_exponent_digits {
237 while self.cursor.position() > e_pos {
238 self.cursor.back();
239 }
240
241 end = e_pos - 1;
242 }
243 }
244
245 self.push(Token {
246 kind: TokenKind::Number,
247 span: (start as u32, (end + 1) as u32),
248 value: &self.source[start..=end],
249 });
250
251 Ok(())
252 }
253
254 fn bracket(&mut self) -> LexerResult<()> {
255 let (start, _) = self.next()?;
256
257 let value = &self.source[start..=start];
258 let span = (start as u32, (start + 1) as u32);
259 self.push(Token {
260 kind: TokenKind::Bracket(Bracket::from_str(value).map_err(|_| {
261 LexerError::UnexpectedSymbol {
262 symbol: value.to_string(),
263 span,
264 }
265 })?),
266 span,
267 value,
268 });
269
270 Ok(())
271 }
272
273 fn dot(&mut self) -> LexerResult<()> {
274 let (start, _) = self.next()?;
275 let mut end = start;
276
277 if self.cursor.next_if(|c| c == '.').is_some() {
278 end += 1;
279 }
280
281 let value = &self.source[start..=end];
282 let span = (start as u32, (end + 1) as u32);
283 self.push(Token {
284 kind: TokenKind::Operator(Operator::from_str(value).map_err(|_| {
285 LexerError::UnexpectedSymbol {
286 symbol: value.to_string(),
287 span,
288 }
289 })?),
290 span,
291 value,
292 });
293
294 Ok(())
295 }
296
297 fn cmp_operator(&mut self) -> LexerResult<()> {
298 let (start, _) = self.next()?;
299 let mut end = start;
300
301 if self.cursor.next_if(|c| c == '=').is_some() {
302 end += 1;
303 }
304
305 let value = &self.source[start..=end];
306 self.push(Token {
307 kind: TokenKind::Operator(Operator::from_str(value).map_err(|_| {
308 LexerError::UnexpectedSymbol {
309 symbol: value.to_string(),
310 span: (start as u32, (end + 1) as u32),
311 }
312 })?),
313 span: (start as u32, (end + 1) as u32),
314 value,
315 });
316
317 Ok(())
318 }
319
320 fn question_mark(&mut self) -> LexerResult<()> {
321 let (start, _) = self.next()?;
322 let mut kind = TokenKind::Operator(Operator::QuestionMark);
323 let mut end = start;
324
325 if self.cursor.next_if(|c| c == '?').is_some() {
326 kind = TokenKind::Operator(Operator::Logical(LogicalOperator::NullishCoalescing));
327 end += 1;
328 }
329
330 let value = &self.source[start..=end];
331 self.push(Token {
332 kind,
333 value,
334 span: (start as u32, (end + 1) as u32),
335 });
336
337 Ok(())
338 }
339
340 fn operator(&mut self) -> LexerResult<()> {
341 let (start, _) = self.next()?;
342
343 let value = &self.source[start..=start];
344 let span = (start as u32, (start + 1) as u32);
345 self.push(Token {
346 kind: TokenKind::Operator(Operator::from_str(value).map_err(|_| {
347 LexerError::UnexpectedSymbol {
348 symbol: value.to_string(),
349 span,
350 }
351 })?),
352 span,
353 value,
354 });
355
356 Ok(())
357 }
358
359 fn not(&mut self, start: usize) -> LexerResult<()> {
360 if self.cursor.next_if_is(" in ") {
361 let end = self.cursor.position();
362
363 self.push(Token {
364 kind: TokenKind::Operator(Operator::Comparison(ComparisonOperator::NotIn)),
365 span: (start as u32, (end - 1) as u32),
366 value: "not in",
367 })
368 } else {
369 let end = self.cursor.position();
370
371 self.push(Token {
372 kind: TokenKind::Operator(Operator::Logical(LogicalOperator::Not)),
373 span: (start as u32, end as u32),
374 value: "not",
375 })
376 }
377
378 Ok(())
379 }
380
381 fn identifier(&mut self) -> LexerResult<()> {
382 let (start, _) = self.next()?;
383 let mut end = start;
384
385 while let Some((e, _)) = self.cursor.next_if(|c| is_token_type!(c, "alphanumeric")) {
386 end = e;
387 }
388
389 let value = &self.source[start..=end];
390 match value {
391 "and" => self.push(Token {
392 kind: TokenKind::Operator(Operator::Logical(LogicalOperator::And)),
393 span: (start as u32, (end + 1) as u32),
394 value,
395 }),
396 "or" => self.push(Token {
397 kind: TokenKind::Operator(Operator::Logical(LogicalOperator::Or)),
398 span: (start as u32, (end + 1) as u32),
399 value,
400 }),
401 "in" => self.push(Token {
402 kind: TokenKind::Operator(Operator::Comparison(ComparisonOperator::In)),
403 span: (start as u32, (end + 1) as u32),
404 value,
405 }),
406 "true" => self.push(Token {
407 kind: TokenKind::Boolean(true),
408 span: (start as u32, (end + 1) as u32),
409 value,
410 }),
411 "false" => self.push(Token {
412 kind: TokenKind::Boolean(false),
413 span: (start as u32, (end + 1) as u32),
414 value,
415 }),
416 "not" => self.not(start)?,
417 _ => self.push(Token {
418 kind: Identifier::try_from(value)
419 .map(|identifier| TokenKind::Identifier(identifier))
420 .unwrap_or(TokenKind::Literal),
421 span: (start as u32, (end + 1) as u32),
422 value,
423 }),
424 }
425
426 Ok(())
427 }
428}