pub mod keyword;
pub mod token;

use crate::ast::span::{Location, Span};
use crate::dialect::Dialect;
use crate::error::{ParserError, Result};
use keyword::Keyword;
use token::{Token, TokenWithSpan, Word};

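/// A single-pass SQL tokenizer over an input string. It tracks the byte
/// position plus 1-based line/column so every produced token carries a
/// `Span`, and it delegates identifier rules to the configured [`Dialect`].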
#[derive(Debug, Clone)]
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    input: &'a str,
    pos: usize,
    line: u64,
    column: u64,
}

impl<'a> Tokenizer<'a> {
    pub fn new(dialect: &'a dyn Dialect, input: &'a str) -> Self {
        Self {
            dialect,
            input,
            pos: 0,
            line: 1,
            column: 1,
        }
    }

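    /// Tokenizes the whole input into a vector of spanned tokens, ending with
    /// a synthetic `Token::EOF`. Whitespace and `--` comments are skipped
    /// between tokens; an unrecognized character yields
    /// `ParserError::UnexpectedToken`.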
    pub fn tokenize(&mut self) -> Result<Vec<TokenWithSpan>> {
        let mut tokens = Vec::new();

        loop {
            self.skip_ignored();

            if self.peek_char().is_none() {
                let loc = Location::new(self.line, self.column);
                tokens.push(TokenWithSpan {
                    token: Token::EOF,
                    span: Span::new(loc, loc),
                });
                break;
            }

            let token = match self.peek_char().unwrap() {
                c if self.is_identifier_start(c) => self.lex_word()?,
                c if c.is_ascii_digit() => self.lex_number()?,
                '\'' => self.lex_string()?,
                ',' => self.single_char_token(Token::Comma),
                '=' => self.single_char_token(Token::Eq),
                '<' => self.lex_lt_related()?,
                '>' => self.lex_gt_related()?,
                '!' => self.lex_bang_related()?,
                '+' => self.single_char_token(Token::Plus),
                '-' => self.single_char_token(Token::Minus),
                '*' => self.single_char_token(Token::Mul),
                '/' => self.single_char_token(Token::Div),
                '%' => self.single_char_token(Token::Mod),
                '(' => self.single_char_token(Token::LParen),
                ')' => self.single_char_token(Token::RParen),
                '[' => self.single_char_token(Token::LBracket),
                ']' => self.single_char_token(Token::RBracket),
                '.' => self.single_char_token(Token::Period),
                ':' => self.single_char_token(Token::Colon),
                ';' => self.single_char_token(Token::SemiColon),
                '|' => self.lex_pipe_related()?,
                other => {
                    return Err(ParserError::UnexpectedToken {
                        line: self.line,
                        column: self.column,
                        expected: "valid token".into(),
                        found: other.to_string(),
                    });
                }
            };

            tokens.push(token);
        }

        Ok(tokens)
    }

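    /// Skips whitespace and `--` line comments, repeating until the next
    /// character is significant (or the input is exhausted).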
    fn skip_ignored(&mut self) {
        loop {
            while let Some(ch) = self.peek_char() {
                if ch.is_whitespace() {
                    self.next_char();
                } else {
                    break;
                }
            }

            let mut skip_comment = false;
            if self.peek_char() == Some('-') && self.peek_next_char() == Some('-') {
                self.next_char();
                self.next_char();
                skip_comment = true;
                while let Some((ch, _)) = self.next_char() {
                    if ch == '\n' {
                        break;
                    }
                }
            }

            if !skip_comment {
                break;
            }
        }
    }

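    /// Lexes an identifier or keyword: an identifier-start character followed
    /// by any number of identifier-part characters, both as defined by the
    /// dialect. The resulting text is also resolved to a `Keyword`.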
    fn lex_word(&mut self) -> Result<TokenWithSpan> {
        let start_pos = self.pos;
        let (_, start_loc) = self.next_char().expect("peek ensured Some");
        let mut last_loc = start_loc;

        while let Some(ch) = self.peek_char() {
            if self.is_identifier_part(ch) {
                let (_, loc) = self.next_char().unwrap();
                last_loc = loc;
            } else {
                break;
            }
        }

        let value = &self.input[start_pos..self.pos];
        let keyword = Keyword::from_str(value);
        let word = Word {
            value: value.to_string(),
            quote_style: None,
            keyword,
        };

        Ok(TokenWithSpan {
            token: Token::Word(word),
            span: Span::new(start_loc, last_loc),
        })
    }

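    /// Lexes a numeric literal: integer digits, an optional fractional part
    /// (the '.' is consumed only when followed by a digit), and an optional
    /// `e`/`E` exponent with an optional sign. A second '.' or an exponent
    /// without digits is reported as `ParserError::InvalidNumber`.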
    fn lex_number(&mut self) -> Result<TokenWithSpan> {
        let start_pos = self.pos;
        let (_, start_loc) = self.next_char().expect("peek ensured Some");
        let mut last_loc = start_loc;
        let mut seen_dot = false;
        let mut seen_exp = false;

        while let Some(ch) = self.peek_char() {
            if ch.is_ascii_digit() {
                let (_, loc) = self.next_char().unwrap();
                last_loc = loc;
                continue;
            }

            if ch == '.' {
                if seen_dot {
                    // A second '.' in the literal: consume the rest of the malformed
                    // number so the error reports its full text.
                    let mut value = self.input[start_pos..self.pos].to_string();
                    let _ = self.next_char().unwrap();
                    value.push('.');
                    while let Some(c) = self.peek_char() {
                        if c.is_ascii_digit() {
                            let (d, _) = self.next_char().unwrap();
                            value.push(d);
                        } else {
                            break;
                        }
                    }
                    return Err(ParserError::InvalidNumber {
                        line: start_loc.line,
                        column: start_loc.column,
                        value,
                    });
                }

                // Treat the '.' as part of the number only when a digit follows;
                // otherwise leave it for the caller (e.g. `1.` lexes as Number then Period).
                if self
                    .peek_next_char()
                    .map(|c| c.is_ascii_digit())
                    .unwrap_or(false)
                {
                    let (_, loc) = self.next_char().unwrap();
                    last_loc = loc;
                    seen_dot = true;
                    continue;
                } else {
                    break;
                }
            }

            if ch == 'e' || ch == 'E' {
                if seen_exp {
                    break;
                }
                seen_exp = true;
                let (_, exp_loc) = self.next_char().unwrap();
                last_loc = exp_loc;

                // Optional exponent sign.
                if matches!(self.peek_char(), Some('+' | '-')) {
                    let (_, loc) = self.next_char().unwrap();
                    last_loc = loc;
                }

                // The exponent must contain at least one digit.
                if !self
                    .peek_char()
                    .map(|c| c.is_ascii_digit())
                    .unwrap_or(false)
                {
                    let value = self.input[start_pos..self.pos].to_string();
                    return Err(ParserError::InvalidNumber {
                        line: start_loc.line,
                        column: start_loc.column,
                        value,
                    });
                }

                while let Some(c) = self.peek_char() {
                    if c.is_ascii_digit() {
                        let (_, loc) = self.next_char().unwrap();
                        last_loc = loc;
                    } else {
                        break;
                    }
                }
                continue;
            }

            break;
        }

        let value = self.input[start_pos..self.pos].to_string();
        Ok(TokenWithSpan {
            token: Token::Number(value),
            span: Span::new(start_loc, last_loc),
        })
    }

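    /// Lexes a single-quoted string literal. A doubled quote (`''`) inside the
    /// literal is the escape for one literal quote; reaching end of input
    /// before the closing quote yields `ParserError::UnterminatedString`.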
    #[allow(unused_assignments)]
    fn lex_string(&mut self) -> Result<TokenWithSpan> {
        let (_, start_loc) = self.next_char().expect("peek ensured Some");
        let mut last_loc = start_loc;
        let mut content = String::new();

        loop {
            let Some((ch, loc)) = self.next_char() else {
                return Err(ParserError::UnterminatedString {
                    line: start_loc.line,
                    column: start_loc.column,
                });
            };
            last_loc = loc;

            if ch == '\'' {
                if self.peek_char() == Some('\'') {
                    let _ = self.next_char().unwrap();
                    content.push('\'');
                    continue;
                } else {
                    break;
                }
            } else {
                content.push(ch);
            }
        }

        Ok(TokenWithSpan {
            token: Token::SingleQuotedString(content),
            span: Span::new(start_loc, last_loc),
        })
    }

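    /// Lexes `<`, `<=`, or `<>` (the latter produced as `Token::Neq`).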
    fn lex_lt_related(&mut self) -> Result<TokenWithSpan> {
        let (_, start_loc) = self.next_char().unwrap();
        let mut last_loc = start_loc;

        let token = match self.peek_char() {
            Some('=') => {
                let (_, loc) = self.next_char().unwrap();
                last_loc = loc;
                Token::LtEq
            }
            Some('>') => {
                let (_, loc) = self.next_char().unwrap();
                last_loc = loc;
                Token::Neq
            }
            _ => Token::Lt,
        };

        Ok(TokenWithSpan {
            token,
            span: Span::new(start_loc, last_loc),
        })
    }

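    /// Lexes `>` or `>=`.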
    fn lex_gt_related(&mut self) -> Result<TokenWithSpan> {
        let (_, start_loc) = self.next_char().unwrap();
        let mut last_loc = start_loc;

        let token = match self.peek_char() {
            Some('=') => {
                let (_, loc) = self.next_char().unwrap();
                last_loc = loc;
                Token::GtEq
            }
            _ => Token::Gt,
        };

        Ok(TokenWithSpan {
            token,
            span: Span::new(start_loc, last_loc),
        })
    }

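    /// Lexes `!=`; a bare `!` is rejected as an unexpected token.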
    fn lex_bang_related(&mut self) -> Result<TokenWithSpan> {
        let (_, start_loc) = self.next_char().unwrap();

        if self.peek_char() == Some('=') {
            let (_, loc) = self.next_char().unwrap();
            return Ok(TokenWithSpan {
                token: Token::Neq,
                span: Span::new(start_loc, loc),
            });
        }

        Err(ParserError::UnexpectedToken {
            line: start_loc.line,
            column: start_loc.column,
            expected: "valid operator".into(),
            found: "!".into(),
        })
    }

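    /// Lexes the string-concatenation operator `||`; a bare `|` is rejected.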
    fn lex_pipe_related(&mut self) -> Result<TokenWithSpan> {
        let (_, start_loc) = self.next_char().unwrap();

        if self.peek_char() == Some('|') {
            let (_, loc) = self.next_char().unwrap();
            Ok(TokenWithSpan {
                token: Token::StringConcat,
                span: Span::new(start_loc, loc),
            })
        } else {
            Err(ParserError::UnexpectedToken {
                line: start_loc.line,
                column: start_loc.column,
                expected: "||".into(),
                found: "|".into(),
            })
        }
    }

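    /// Consumes exactly one character and wraps `token` in a span covering
    /// that single location.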
    fn single_char_token(&mut self, token: Token) -> TokenWithSpan {
        let (_, start_loc) = self.next_char().unwrap();
        TokenWithSpan {
            token,
            span: Span::new(start_loc, start_loc),
        }
    }

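    /// Returns the character at the current position without consuming it.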
    fn peek_char(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    fn peek_next_char(&self) -> Option<char> {
        let mut iter = self.input[self.pos..].chars();
        iter.next();
        iter.next()
    }

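    /// Consumes the next character, returning it together with the location
    /// where it started; advances `pos` by its UTF-8 length and updates the
    /// line/column counters (a newline resets the column to 1).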
    fn next_char(&mut self) -> Option<(char, Location)> {
        let ch = self.peek_char()?;
        let loc = Location::new(self.line, self.column);
        self.pos += ch.len_utf8();
        if ch == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        Some((ch, loc))
    }

    fn is_identifier_start(&self, ch: char) -> bool {
        self.dialect.is_identifier_start(ch)
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        self.dialect.is_identifier_part(ch)
    }
}
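
// A minimal usage sketch, not part of the original module: it assumes the
// `Dialect` trait requires only the two methods used above
// (`is_identifier_start` / `is_identifier_part`) plus `Debug` (implied by
// `#[derive(Debug)]` on `Tokenizer`). `TestDialect` is hypothetical; adjust
// it if the real trait differs.
#[cfg(test)]
mod tests {
    use super::*;

    /// Hypothetical dialect with simple ASCII identifier rules, defined here
    /// only so the sketch is self-contained.
    #[derive(Debug)]
    struct TestDialect;

    impl Dialect for TestDialect {
        fn is_identifier_start(&self, ch: char) -> bool {
            ch.is_ascii_alphabetic() || ch == '_'
        }

        fn is_identifier_part(&self, ch: char) -> bool {
            ch.is_ascii_alphanumeric() || ch == '_'
        }
    }

    #[test]
    fn tokenizes_simple_select() {
        let dialect = TestDialect;
        let mut tokenizer = Tokenizer::new(&dialect, "SELECT a + 1 FROM t -- comment");
        let tokens = tokenizer.tokenize().expect("tokenization should succeed");

        // SELECT, a, +, 1, FROM, t, then the synthetic EOF token.
        assert_eq!(tokens.len(), 7);
        assert!(matches!(tokens.last().unwrap().token, Token::EOF));
    }
}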