Skip to main content

scopeql_parser/
tokenizer.rs

1// Copyright 2025 ScopeDB, Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::ops::Range;
16
17use logos::Lexer;
18use logos::Logos;
19
20#[derive(Debug)]
21pub struct Tokenizer<'source> {
22    lexer: Lexer<'source, TokenKind>,
23    eoi: bool,
24}
25
26impl<'source> Tokenizer<'source> {
27    pub fn new(source: &'source str) -> Self {
28        Self {
29            lexer: TokenKind::lexer(source),
30            eoi: false,
31        }
32    }
33
34    pub fn slice(&self) -> &'source str {
35        self.lexer.slice()
36    }
37
38    pub fn span(&self) -> Range<usize> {
39        self.lexer.span()
40    }
41}
42
43impl<'source> Iterator for Tokenizer<'source> {
44    type Item = Result<TokenKind, ()>;
45
46    fn next(&mut self) -> Option<Self::Item> {
47        match self.lexer.next() {
48            Some(Err(())) => Some(Err(())),
49            Some(Ok(kind)) => Some(Ok(kind)),
50            None => {
51                if self.eoi {
52                    // already emitted EOI
53                    None
54                } else {
55                    // emit EOI; the next call will return None
56                    self.eoi = true;
57                    Some(Ok(TokenKind::EOI))
58                }
59            }
60        }
61    }
62}
63
64#[derive(Logos, Clone, Copy, Debug, PartialEq, Eq)]
65pub enum TokenKind {
66    /// A special token representing the end of input.
67    EOI,
68
69    // Skipped tokens
70    /// Whitespace characters.
71    #[regex(r"[ \t\r\n\f]+")]
72    Whitespace,
73
74    /// Single-line or multi-line comments.
75    #[regex(r"--[^\r\n\f]*")]
76    #[regex(r"/\*([^\*]|(\*[^/]))*\*/")]
77    Comment,
78
79    /// Unquoted identifiers.
80    ///
81    /// The identifier will be normalized to lowercase, and thus only ASCII letters are allowed.
82    /// Otherwise, the normalization may subtly change the intent of the identifier.
83    #[regex(r#"[_a-zA-Z][_a-zA-Z0-9]*"#)]
84    Ident,
85
86    #[regex(r#"'([^'\\]|\\.|'')*'"#)]
87    #[regex(r#""([^"\\]|\\.|"")*""#)]
88    #[regex(r#"`([^`\\]|\\.|``)*`"#)]
89    LiteralString,
90    #[regex(r"[xX]'[a-fA-F0-9]*'")]
91    LiteralHexBinaryString,
92
93    #[regex(r"[0-9]+(_|[0-9])*")]
94    LiteralInteger,
95    /// Hexadecimal integer literals with '0x' prefix.
96    #[regex(r"0[xX][a-fA-F0-9]+")]
97    LiteralHexInteger,
98    /// Floating point literals.
99    #[regex(r"[0-9]+[eE][+-]?[0-9]+")]
100    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?")]
101    LiteralFloat,
102
103    // Symbols
104    #[token("=")]
105    Eq,
106    #[token("<>")]
107    #[token("!=")]
108    NotEq,
109    #[token("<")]
110    Lt,
111    #[token(">")]
112    Gt,
113    #[token("<=")]
114    Lte,
115    #[token(">=")]
116    Gte,
117    #[token("+")]
118    Plus,
119    #[token("-")]
120    Minus,
121    #[token("*")]
122    Multiply,
123    #[token("/")]
124    Divide,
125    #[token("%")]
126    Modulo,
127    #[token("||")]
128    Concat,
129    #[token("(")]
130    LParen,
131    #[token(")")]
132    RParen,
133    #[token("[")]
134    LBracket,
135    #[token("]")]
136    RBracket,
137    #[token("{")]
138    LBrace,
139    #[token("}")]
140    RBrace,
141    #[token(",")]
142    Comma,
143    #[token(".")]
144    Dot,
145    #[token(":")]
146    Colon,
147    #[token("::")]
148    DoubleColon,
149    #[token(";")]
150    SemiColon,
151    #[token("$")]
152    Dollar,
153    #[token("=>")]
154    Arrow,
155
156    // Case-insensitive keywords
157    #[token("ADD", ignore(case))]
158    ADD,
159    #[token("AGGREGATE", ignore(case))]
160    AGGREGATE,
161    #[token("ALL", ignore(case))]
162    ALL,
163    #[token("ALTER", ignore(case))]
164    ALTER,
165    #[token("ANALYZE", ignore(case))]
166    ANALYZE,
167    #[token("AND", ignore(case))]
168    AND,
169    #[token("ANY", ignore(case))]
170    ANY,
171    #[token("ARRAY", ignore(case))]
172    ARRAY,
173    #[token("AS", ignore(case))]
174    AS,
175    #[token("ASC", ignore(case))]
176    ASC,
177    #[token("BEGIN", ignore(case))]
178    BEGIN,
179    #[token("BETWEEN", ignore(case))]
180    BETWEEN,
181    #[token("BOOLEAN", ignore(case))]
182    BOOLEAN,
183    #[token("BY", ignore(case))]
184    BY,
185    #[token("CASE", ignore(case))]
186    CASE,
187    #[token("CAST", ignore(case))]
188    CAST,
189    #[token("CLUSTER", ignore(case))]
190    CLUSTER,
191    #[token("COLUMN", ignore(case))]
192    COLUMN,
193    #[token("COMMENT", ignore(case))]
194    COMMENT,
195    #[token("CREATE", ignore(case))]
196    CREATE,
197    #[token("DATABASES", ignore(case))]
198    DATABASES,
199    #[token("DATABASE", ignore(case))]
200    DATABASE,
201    #[token("DELETE", ignore(case))]
202    DELETE,
203    #[token("DESC", ignore(case))]
204    DESC,
205    #[token("DESCRIBE", ignore(case))]
206    DESCRIBE,
207    #[token("DISTINCT", ignore(case))]
208    DISTINCT,
209    #[token("DROP", ignore(case))]
210    DROP,
211    #[token("ELSE", ignore(case))]
212    ELSE,
213    #[token("END", ignore(case))]
214    END,
215    #[token("EXCLUDE", ignore(case))]
216    EXCLUDE,
217    #[token("EXEC", ignore(case))]
218    EXEC,
219    #[token("EXISTS", ignore(case))]
220    EXISTS,
221    #[token("EXPLAIN", ignore(case))]
222    EXPLAIN,
223    #[token("FALSE", ignore(case))]
224    FALSE,
225    #[token("FIRST", ignore(case))]
226    FIRST,
227    #[token("FLOAT", ignore(case))]
228    FLOAT,
229    #[token("FROM", ignore(case))]
230    FROM,
231    #[token("FULL", ignore(case))]
232    FULL,
233    #[token("GROUP", ignore(case))]
234    GROUP,
235    #[token("IF", ignore(case))]
236    IF,
237    #[token("IN", ignore(case))]
238    IN,
239    #[token("INDEX", ignore(case))]
240    INDEX,
241    #[token("INNER", ignore(case))]
242    INNER,
243    #[token("INSERT", ignore(case))]
244    INSERT,
245    #[token("INT", ignore(case))]
246    INT,
247    #[token("INTERVAL", ignore(case))]
248    INTERVAL,
249    #[token("INTO", ignore(case))]
250    INTO,
251    #[token("IS", ignore(case))]
252    IS,
253    #[token("JOB", ignore(case))]
254    JOB,
255    #[token("JOBS", ignore(case))]
256    JOBS,
257    #[token("JOIN", ignore(case))]
258    JOIN,
259    #[token("KEY", ignore(case))]
260    KEY,
261    #[token("LAST", ignore(case))]
262    LAST,
263    #[token("LEFT", ignore(case))]
264    LEFT,
265    #[token("LIMIT", ignore(case))]
266    LIMIT,
267    #[token("MATERIALIZED", ignore(case))]
268    MATERIALIZED,
269    #[token("NODEGROUP", ignore(case))]
270    NODEGROUP,
271    #[token("NOT", ignore(case))]
272    NOT,
273    #[token("NULL", ignore(case))]
274    NULL,
275    #[token("NULLS", ignore(case))]
276    NULLS,
277    #[token("OBJECT", ignore(case))]
278    OBJECT,
279    #[token("OFFSET", ignore(case))]
280    OFFSET,
281    #[token("ON", ignore(case))]
282    ON,
283    #[token("OPTIMIZE", ignore(case))]
284    OPTIMIZE,
285    #[token("OR", ignore(case))]
286    OR,
287    #[token("ORDER", ignore(case))]
288    ORDER,
289    #[token("OUTER", ignore(case))]
290    OUTER,
291    #[token("PARTITION", ignore(case))]
292    PARTITION,
293    #[token("PERCENT", ignore(case))]
294    PERCENT,
295    #[token("PLAN", ignore(case))]
296    PLAN,
297    #[token("POINT", ignore(case))]
298    POINT,
299    #[token("RANGE", ignore(case))]
300    RANGE,
301    #[token("RENAME", ignore(case))]
302    RENAME,
303    #[token("REPLACE", ignore(case))]
304    REPLACE,
305    #[token("RESUME", ignore(case))]
306    RESUME,
307    #[token("RIGHT", ignore(case))]
308    RIGHT,
309    #[token("SAMPLE", ignore(case))]
310    SAMPLE,
311    #[token("SCHEDULE", ignore(case))]
312    SCHEDULE,
313    #[token("SCHEMAS", ignore(case))]
314    SCHEMAS,
315    #[token("SCHEMA", ignore(case))]
316    SCHEMA,
317    #[token("SEARCH", ignore(case))]
318    SEARCH,
319    #[token("SELECT", ignore(case))]
320    SELECT,
321    #[token("SET", ignore(case))]
322    SET,
323    #[token("SHOW", ignore(case))]
324    SHOW,
325    #[token("STATEMENTS", ignore(case))]
326    STATEMENTS,
327    #[token("STRING", ignore(case))]
328    STRING,
329    #[token("SUSPEND", ignore(case))]
330    SUSPEND,
331    #[token("TABLE", ignore(case))]
332    TABLE,
333    #[token("TABLES", ignore(case))]
334    TABLES,
335    #[token("THEN", ignore(case))]
336    THEN,
337    #[token("TIMESTAMP", ignore(case))]
338    TIMESTAMP,
339    #[token("TO", ignore(case))]
340    TO,
341    #[token("TRUE", ignore(case))]
342    TRUE,
343    #[token("UINT", ignore(case))]
344    UINT,
345    #[token("UNION", ignore(case))]
346    UNION,
347    #[token("UPDATE", ignore(case))]
348    UPDATE,
349    #[token("VACUUM", ignore(case))]
350    VACUUM,
351    #[token("VALUES", ignore(case))]
352    VALUES,
353    #[token("VIEW", ignore(case))]
354    VIEW,
355    #[token("VIEWS", ignore(case))]
356    VIEWS,
357    #[token("WHEN", ignore(case))]
358    WHEN,
359    #[token("WHERE", ignore(case))]
360    WHERE,
361    #[token("WINDOW", ignore(case))]
362    WINDOW,
363    #[token("WITH", ignore(case))]
364    WITH,
365    #[token("WITHIN", ignore(case))]
366    WITHIN,
367    #[token("XOR", ignore(case))]
368    XOR,
369
370    // Command-line only tokens
371    #[cfg(feature = "command")]
372    #[token("\\")]
373    BackSlash,
374    #[cfg(feature = "command")]
375    #[token("CANCEL", ignore(case))]
376    CANCEL,
377}
378
379impl TokenKind {
380    pub fn is_literal(&self) -> bool {
381        use TokenKind::*;
382
383        matches!(
384            self,
385            LiteralFloat
386                | LiteralInteger
387                | LiteralString
388                | LiteralHexBinaryString
389                | LiteralHexInteger
390        )
391    }
392
393    pub fn is_symbol(&self) -> bool {
394        use TokenKind::*;
395
396        #[cfg(feature = "command")]
397        if matches!(self, BackSlash) {
398            return true;
399        }
400
401        matches!(
402            self,
403            Eq | NotEq
404                | Lt
405                | Gt
406                | Lte
407                | Gte
408                | Plus
409                | Minus
410                | Multiply
411                | Divide
412                | Modulo
413                | Concat
414                | LParen
415                | RParen
416                | LBracket
417                | RBracket
418                | LBrace
419                | RBrace
420                | Comma
421                | Dot
422                | Colon
423                | DoubleColon
424                | SemiColon
425                | Dollar
426                | Arrow
427        )
428    }
429
430    pub fn is_keyword(&self) -> bool {
431        use TokenKind::*;
432
433        !self.is_literal()
434            && !self.is_symbol()
435            && !matches!(self, Ident | EOI | Whitespace | Comment)
436    }
437
438    pub fn is_reserved_keyword(&self) -> bool {
439        use TokenKind::*;
440
441        matches!(
442            self,
443            FROM | JOIN
444                | VALUES
445                | WHERE
446                | ORDER
447                | DISTINCT
448                | LIMIT
449                | SELECT
450                | AGGREGATE
451                | WINDOW
452                | WITHIN
453                | GROUP
454                | INSERT
455                | UNION
456                | SAMPLE
457                | NULL
458                | TRUE
459                | FALSE
460                | AS
461                | BY
462                | ON
463                | CASE
464                | WHEN
465                | THEN
466                | ELSE
467                | END
468                | CAST
469                | NOT
470                | IS
471                | IN
472                | BETWEEN
473                | AND
474                | OR
475        )
476    }
477}