//! alopex_sql/tokenizer/mod.rs — lexer turning SQL input text into a token stream.

1pub mod keyword;
2pub mod token;
3
4use crate::ast::span::{Location, Span};
5use crate::dialect::Dialect;
6use crate::error::{ParserError, Result};
7use keyword::Keyword;
8use token::{Token, TokenWithSpan, Word};
9
/// Lexical analyzer: converts an input string into a sequence of tokens.
#[derive(Debug, Clone)]
pub struct Tokenizer<'a> {
    // Dialect hooks deciding which characters may start/continue identifiers.
    dialect: &'a dyn Dialect,
    // Full source text being tokenized.
    input: &'a str,
    // Byte offset into `input`; always kept on a char boundary
    // (advanced by `len_utf8` in `next_char`).
    pos: usize,
    // Current 1-based line number.
    line: u64,
    // Current 1-based column number, counted in chars (not bytes).
    column: u64,
}
19
20impl<'a> Tokenizer<'a> {
21    pub fn new(dialect: &'a dyn Dialect, input: &'a str) -> Self {
22        Self {
23            dialect,
24            input,
25            pos: 0,
26            line: 1,
27            column: 1,
28        }
29    }
30
31    pub fn tokenize(&mut self) -> Result<Vec<TokenWithSpan>> {
32        let mut tokens = Vec::new();
33
34        loop {
35            self.skip_ignored();
36
37            if self.peek_char().is_none() {
38                let loc = Location::new(self.line, self.column);
39                tokens.push(TokenWithSpan {
40                    token: Token::EOF,
41                    span: Span::new(loc, loc),
42                });
43                break;
44            }
45
46            let token = match self.peek_char().unwrap() {
47                c if self.is_identifier_start(c) => self.lex_word()?,
48                c if c.is_ascii_digit() => self.lex_number()?,
49                '\'' => self.lex_string()?,
50                ',' => self.single_char_token(Token::Comma),
51                '=' => self.single_char_token(Token::Eq),
52                '<' => self.lex_lt_related()?,
53                '>' => self.lex_gt_related()?,
54                '!' => self.lex_bang_related()?,
55                '+' => self.single_char_token(Token::Plus),
56                '-' => self.single_char_token(Token::Minus),
57                '*' => self.single_char_token(Token::Mul),
58                '/' => self.single_char_token(Token::Div),
59                '%' => self.single_char_token(Token::Mod),
60                '(' => self.single_char_token(Token::LParen),
61                ')' => self.single_char_token(Token::RParen),
62                '[' => self.single_char_token(Token::LBracket),
63                ']' => self.single_char_token(Token::RBracket),
64                '.' => self.single_char_token(Token::Period),
65                ':' => self.single_char_token(Token::Colon),
66                ';' => self.single_char_token(Token::SemiColon),
67                '|' => self.lex_pipe_related()?,
68                other => {
69                    return Err(ParserError::UnexpectedToken {
70                        line: self.line,
71                        column: self.column,
72                        expected: "valid token".into(),
73                        found: other.to_string(),
74                    });
75                }
76            };
77
78            tokens.push(token);
79        }
80
81        Ok(tokens)
82    }
83
84    fn skip_ignored(&mut self) {
85        loop {
86            // Whitespace
87            while let Some(ch) = self.peek_char() {
88                if ch.is_whitespace() {
89                    self.next_char();
90                } else {
91                    break;
92                }
93            }
94
95            // Single-line comment: --
96            let mut skip_comment = false;
97            if self.peek_char() == Some('-') && self.peek_next_char() == Some('-') {
98                // consume the two dashes
99                self.next_char();
100                self.next_char();
101                skip_comment = true;
102                while let Some((ch, _)) = self.next_char() {
103                    if ch == '\n' {
104                        break;
105                    }
106                }
107            }
108
109            if !skip_comment {
110                break;
111            }
112        }
113    }
114
115    fn lex_word(&mut self) -> Result<TokenWithSpan> {
116        let start_pos = self.pos;
117        let (_, start_loc) = self.next_char().expect("peek ensured Some");
118        let mut last_loc = start_loc;
119
120        while let Some(ch) = self.peek_char() {
121            if self.is_identifier_part(ch) {
122                let (_, loc) = self.next_char().unwrap();
123                last_loc = loc;
124            } else {
125                break;
126            }
127        }
128
129        let value = &self.input[start_pos..self.pos];
130        let keyword = Keyword::from_str(value);
131        let word = Word {
132            value: value.to_string(),
133            quote_style: None,
134            keyword,
135        };
136
137        Ok(TokenWithSpan {
138            token: Token::Word(word),
139            span: Span::new(start_loc, last_loc),
140        })
141    }
142
    /// Lexes a numeric literal: integer, decimal (`1.5`), and/or exponent
    /// form (`1e5`, `1.5E-3`). The leading character is known to be an
    /// ASCII digit (guaranteed by the dispatch in `tokenize`).
    ///
    /// # Errors
    /// Returns `ParserError::InvalidNumber` when a second decimal point is
    /// found (e.g. `1.2.3`) or when an exponent marker is not followed by
    /// at least one digit (e.g. `1e`, `1e+`).
    fn lex_number(&mut self) -> Result<TokenWithSpan> {
        let start_pos = self.pos;
        let (_, start_loc) = self.next_char().expect("peek ensured Some");
        let mut last_loc = start_loc;
        let mut seen_dot = false;
        let mut seen_exp = false;

        while let Some(ch) = self.peek_char() {
            if ch.is_ascii_digit() {
                let (_, loc) = self.next_char().unwrap();
                last_loc = loc;
                continue;
            }

            if ch == '.' {
                if seen_dot {
                    // A second '.' makes the literal malformed. Keep
                    // consuming the trailing digit run so the error message
                    // reports the whole bad literal (e.g. "1.2.3").
                    let mut value = self.input[start_pos..self.pos].to_string();
                    let _ = self.next_char().unwrap(); // consume the second '.'
                    value.push('.');
                    while let Some(c) = self.peek_char() {
                        if c.is_ascii_digit() {
                            let (d, _) = self.next_char().unwrap();
                            value.push(d);
                        } else {
                            break;
                        }
                    }
                    return Err(ParserError::InvalidNumber {
                        line: start_loc.line,
                        column: start_loc.column,
                        value,
                    });
                }

                // Only treat as float if next char is digit; otherwise stop
                // and leave the '.' to be lexed as a `Period` token.
                if self
                    .peek_next_char()
                    .map(|c| c.is_ascii_digit())
                    .unwrap_or(false)
                {
                    let (_, loc) = self.next_char().unwrap();
                    last_loc = loc;
                    seen_dot = true;
                    continue;
                } else {
                    break;
                }
            }

            if ch == 'e' || ch == 'E' {
                if seen_exp {
                    // A second exponent marker ends the literal; the 'e' is
                    // left in the input for the next token.
                    break;
                }
                seen_exp = true;
                let (_, exp_loc) = self.next_char().unwrap();
                last_loc = exp_loc;

                // Optional sign after the exponent marker.
                if let Some(sign @ ('+' | '-')) = self.peek_char() {
                    let (_, loc) = self.next_char().unwrap();
                    last_loc = loc;
                    // record sign but no need to store
                    let _ = sign;
                }

                // The exponent must contain at least one digit.
                if !self
                    .peek_char()
                    .map(|c| c.is_ascii_digit())
                    .unwrap_or(false)
                {
                    let value = self.input[start_pos..self.pos].to_string();
                    return Err(ParserError::InvalidNumber {
                        line: start_loc.line,
                        column: start_loc.column,
                        value,
                    });
                }

                // Consume the exponent digits.
                while let Some(c) = self.peek_char() {
                    if c.is_ascii_digit() {
                        let (_, loc) = self.next_char().unwrap();
                        last_loc = loc;
                    } else {
                        break;
                    }
                }
                continue;
            }

            // Any other character ends the literal.
            break;
        }

        let value = self.input[start_pos..self.pos].to_string();
        Ok(TokenWithSpan {
            token: Token::Number(value),
            span: Span::new(start_loc, last_loc),
        })
    }
241
242    #[allow(unused_assignments)]
243    fn lex_string(&mut self) -> Result<TokenWithSpan> {
244        let (_, start_loc) = self.next_char().expect("peek ensured Some"); // opening quote
245        let mut last_loc = start_loc;
246        let mut content = String::new();
247
248        loop {
249            let Some((ch, loc)) = self.next_char() else {
250                return Err(ParserError::UnterminatedString {
251                    line: start_loc.line,
252                    column: start_loc.column,
253                });
254            };
255            last_loc = loc;
256
257            if ch == '\'' {
258                if self.peek_char() == Some('\'') {
259                    let _ = self.next_char().unwrap();
260                    content.push('\'');
261                    continue;
262                } else {
263                    break; // end of string
264                }
265            } else {
266                content.push(ch);
267            }
268        }
269
270        Ok(TokenWithSpan {
271            token: Token::SingleQuotedString(content),
272            span: Span::new(start_loc, last_loc),
273        })
274    }
275
276    fn lex_lt_related(&mut self) -> Result<TokenWithSpan> {
277        let (_, start_loc) = self.next_char().unwrap();
278        let mut last_loc = start_loc;
279
280        let token = match self.peek_char() {
281            Some('=') => {
282                let (_, loc) = self.next_char().unwrap();
283                last_loc = loc;
284                Token::LtEq
285            }
286            Some('>') => {
287                let (_, loc) = self.next_char().unwrap();
288                last_loc = loc;
289                Token::Neq
290            }
291            _ => Token::Lt,
292        };
293
294        Ok(TokenWithSpan {
295            token,
296            span: Span::new(start_loc, last_loc),
297        })
298    }
299
300    fn lex_gt_related(&mut self) -> Result<TokenWithSpan> {
301        let (_, start_loc) = self.next_char().unwrap();
302        let mut last_loc = start_loc;
303
304        let token = match self.peek_char() {
305            Some('=') => {
306                let (_, loc) = self.next_char().unwrap();
307                last_loc = loc;
308                Token::GtEq
309            }
310            _ => Token::Gt,
311        };
312
313        Ok(TokenWithSpan {
314            token,
315            span: Span::new(start_loc, last_loc),
316        })
317    }
318
319    fn lex_bang_related(&mut self) -> Result<TokenWithSpan> {
320        let (_, start_loc) = self.next_char().unwrap();
321
322        if self.peek_char() == Some('=') {
323            let (_, loc) = self.next_char().unwrap();
324            return Ok(TokenWithSpan {
325                token: Token::Neq,
326                span: Span::new(start_loc, loc),
327            });
328        }
329
330        Err(ParserError::UnexpectedToken {
331            line: start_loc.line,
332            column: start_loc.column,
333            expected: "valid operator".into(),
334            found: "!".into(),
335        })
336    }
337
338    fn lex_pipe_related(&mut self) -> Result<TokenWithSpan> {
339        let (_, start_loc) = self.next_char().unwrap();
340
341        if self.peek_char() == Some('|') {
342            let (_, loc) = self.next_char().unwrap();
343            Ok(TokenWithSpan {
344                token: Token::StringConcat,
345                span: Span::new(start_loc, loc),
346            })
347        } else {
348            Err(ParserError::UnexpectedToken {
349                line: start_loc.line,
350                column: start_loc.column,
351                expected: "||".into(),
352                found: "|".into(),
353            })
354        }
355    }
356
357    fn single_char_token(&mut self, token: Token) -> TokenWithSpan {
358        let (_, start_loc) = self.next_char().unwrap();
359        TokenWithSpan {
360            token,
361            span: Span::new(start_loc, start_loc),
362        }
363    }
364
365    fn peek_char(&self) -> Option<char> {
366        self.input[self.pos..].chars().next()
367    }
368
369    fn peek_next_char(&self) -> Option<char> {
370        let mut iter = self.input[self.pos..].chars();
371        iter.next();
372        iter.next()
373    }
374
375    fn next_char(&mut self) -> Option<(char, Location)> {
376        let ch = self.peek_char()?;
377        let loc = Location::new(self.line, self.column);
378        self.pos += ch.len_utf8();
379        if ch == '\n' {
380            self.line += 1;
381            self.column = 1;
382        } else {
383            self.column += 1;
384        }
385        Some((ch, loc))
386    }
387
    /// Delegates to the dialect: may `ch` begin an identifier?
    fn is_identifier_start(&self, ch: char) -> bool {
        self.dialect.is_identifier_start(ch)
    }
391
    /// Delegates to the dialect: may `ch` continue an identifier?
    fn is_identifier_part(&self, ch: char) -> bool {
        self.dialect.is_identifier_part(ch)
    }
395}