1use std::ops::Range;
16
17use logos::Lexer;
18use logos::Logos;
19
/// Streaming tokenizer over a SQL-like source string.
///
/// Wraps a [`logos`] lexer over [`TokenKind`] and, via its [`Iterator`]
/// impl, emits a single trailing [`TokenKind::EOI`] token once the
/// underlying input is exhausted.
#[derive(Debug)]
pub struct Tokenizer<'source> {
    // Underlying logos lexer that produces `TokenKind`s from the source.
    lexer: Lexer<'source, TokenKind>,
    // Set to `true` once the end-of-input token has been emitted, so
    // `EOI` is yielded exactly once.
    eoi: bool,
}
25
impl<'source> Tokenizer<'source> {
    /// Creates a tokenizer over the given source text.
    pub fn new(source: &'source str) -> Self {
        Self {
            lexer: TokenKind::lexer(source),
            eoi: false,
        }
    }

    /// Returns the slice of source text matched by the most recent token.
    pub fn slice(&self) -> &'source str {
        self.lexer.slice()
    }

    /// Returns the byte range within the source of the most recent token.
    pub fn span(&self) -> Range<usize> {
        self.lexer.span()
    }
}
42
43impl<'source> Iterator for Tokenizer<'source> {
44 type Item = Result<TokenKind, ()>;
45
46 fn next(&mut self) -> Option<Self::Item> {
47 match self.lexer.next() {
48 Some(Err(())) => Some(Err(())),
49 Some(Ok(kind)) => Some(Ok(kind)),
50 None => {
51 if self.eoi {
52 None
54 } else {
55 self.eoi = true;
57 Some(Ok(TokenKind::EOI))
58 }
59 }
60 }
61 }
62}
63
/// The set of tokens recognized by the [`Tokenizer`].
///
/// Derived with [`Logos`], so each variant's `#[token]` / `#[regex]`
/// attribute defines the text it matches. `EOI` carries no attribute:
/// it is never matched from input and is appended by the `Tokenizer`
/// iterator after the lexer runs out of input.
#[derive(Logos, Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// End of input; emitted once by `Tokenizer`, not matched from text.
    EOI,

    /// Runs of spaces, tabs, carriage returns, newlines, and form feeds.
    #[regex(r"[ \t\r\n\f]+")]
    Whitespace,

    /// `--` line comments and `/* ... */` block comments.
    #[regex(r"--[^\r\n\f]*")]
    #[regex(r"/\*([^\*]|(\*[^/]))*\*/")]
    Comment,

    /// Unquoted identifier: underscore or letter, then underscores,
    /// letters, or digits.
    #[regex(r#"[_a-zA-Z][_a-zA-Z0-9]*"#)]
    Ident,

    /// Quoted string in single, double, or backtick quotes; permits
    /// backslash escapes and a doubled quote of the same kind.
    #[regex(r#"'([^'\\]|\\.|'')*'"#)]
    #[regex(r#""([^"\\]|\\.|"")*""#)]
    #[regex(r#"`([^`\\]|\\.|``)*`"#)]
    LiteralString,
    /// Hex/binary string literal such as `x'1F'` (empty body allowed).
    #[regex(r"[xX]'[a-fA-F0-9]*'")]
    LiteralHexBinaryString,

    /// Decimal integer; underscores allowed after the first digit.
    #[regex(r"[0-9]+(_|[0-9])*")]
    LiteralInteger,
    /// Hexadecimal integer with a `0x` / `0X` prefix.
    #[regex(r"0[xX][a-fA-F0-9]+")]
    LiteralHexInteger,
    /// Float: exponent form (`1e10`) or decimal-point form (`1.5`,
    /// optionally with an exponent such as `1.5e-3`).
    #[regex(r"[0-9]+[eE][+-]?[0-9]+")]
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?")]
    LiteralFloat,

    // --- Operators and punctuation ---
    #[token("=")]
    Eq,
    #[token("<>")]
    #[token("!=")]
    NotEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    Lte,
    #[token(">=")]
    Gte,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Multiply,
    #[token("/")]
    Divide,
    #[token("%")]
    Modulo,
    #[token("||")]
    Concat,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token(":")]
    Colon,
    #[token("::")]
    DoubleColon,
    #[token(";")]
    SemiColon,
    #[token("$")]
    Dollar,
    #[token("=>")]
    Arrow,

    // --- Keywords (all matched case-insensitively) ---
    #[token("ADD", ignore(case))]
    ADD,
    #[token("AGGREGATE", ignore(case))]
    AGGREGATE,
    #[token("ALL", ignore(case))]
    ALL,
    #[token("ALTER", ignore(case))]
    ALTER,
    #[token("ANALYZE", ignore(case))]
    ANALYZE,
    #[token("AND", ignore(case))]
    AND,
    #[token("ANY", ignore(case))]
    ANY,
    #[token("ARRAY", ignore(case))]
    ARRAY,
    #[token("AS", ignore(case))]
    AS,
    #[token("ASC", ignore(case))]
    ASC,
    #[token("BEGIN", ignore(case))]
    BEGIN,
    #[token("BETWEEN", ignore(case))]
    BETWEEN,
    #[token("BOOLEAN", ignore(case))]
    BOOLEAN,
    #[token("BY", ignore(case))]
    BY,
    #[token("CASE", ignore(case))]
    CASE,
    #[token("CAST", ignore(case))]
    CAST,
    #[token("CLUSTER", ignore(case))]
    CLUSTER,
    #[token("COLUMN", ignore(case))]
    COLUMN,
    // Keyword `COMMENT`; distinct from the `Comment` trivia variant above.
    #[token("COMMENT", ignore(case))]
    COMMENT,
    #[token("CREATE", ignore(case))]
    CREATE,
    #[token("DATABASES", ignore(case))]
    DATABASES,
    #[token("DATABASE", ignore(case))]
    DATABASE,
    #[token("DELETE", ignore(case))]
    DELETE,
    #[token("DESC", ignore(case))]
    DESC,
    #[token("DESCRIBE", ignore(case))]
    DESCRIBE,
    #[token("DISTINCT", ignore(case))]
    DISTINCT,
    #[token("DROP", ignore(case))]
    DROP,
    #[token("ELSE", ignore(case))]
    ELSE,
    #[token("END", ignore(case))]
    END,
    #[token("EQUALITY", ignore(case))]
    EQUALITY,
    #[token("EXCLUDE", ignore(case))]
    EXCLUDE,
    #[token("EXEC", ignore(case))]
    EXEC,
    #[token("EXISTS", ignore(case))]
    EXISTS,
    #[token("EXPLAIN", ignore(case))]
    EXPLAIN,
    #[token("FALSE", ignore(case))]
    FALSE,
    #[token("FIRST", ignore(case))]
    FIRST,
    #[token("FLOAT", ignore(case))]
    FLOAT,
    #[token("FROM", ignore(case))]
    FROM,
    #[token("FULL", ignore(case))]
    FULL,
    #[token("GROUP", ignore(case))]
    GROUP,
    #[token("IF", ignore(case))]
    IF,
    #[token("IN", ignore(case))]
    IN,
    #[token("INDEX", ignore(case))]
    INDEX,
    #[token("INNER", ignore(case))]
    INNER,
    #[token("INSERT", ignore(case))]
    INSERT,
    #[token("INT", ignore(case))]
    INT,
    #[token("INTERVAL", ignore(case))]
    INTERVAL,
    #[token("INTO", ignore(case))]
    INTO,
    #[token("IS", ignore(case))]
    IS,
    #[token("JOB", ignore(case))]
    JOB,
    #[token("JOBS", ignore(case))]
    JOBS,
    #[token("JOIN", ignore(case))]
    JOIN,
    #[token("KEY", ignore(case))]
    KEY,
    #[token("LAST", ignore(case))]
    LAST,
    #[token("LEFT", ignore(case))]
    LEFT,
    #[token("LIMIT", ignore(case))]
    LIMIT,
    #[token("MATERIALIZED", ignore(case))]
    MATERIALIZED,
    #[token("NODEGROUP", ignore(case))]
    NODEGROUP,
    #[token("NOT", ignore(case))]
    NOT,
    #[token("NULL", ignore(case))]
    NULL,
    #[token("NULLS", ignore(case))]
    NULLS,
    #[token("OBJECT", ignore(case))]
    OBJECT,
    #[token("OFFSET", ignore(case))]
    OFFSET,
    #[token("ON", ignore(case))]
    ON,
    #[token("OPTIMIZE", ignore(case))]
    OPTIMIZE,
    #[token("OR", ignore(case))]
    OR,
    #[token("ORDER", ignore(case))]
    ORDER,
    #[token("OUTER", ignore(case))]
    OUTER,
    #[token("PERCENT", ignore(case))]
    PERCENT,
    #[token("PLAN", ignore(case))]
    PLAN,
    #[token("RANGE", ignore(case))]
    RANGE,
    #[token("RENAME", ignore(case))]
    RENAME,
    #[token("REPLACE", ignore(case))]
    REPLACE,
    #[token("RESUME", ignore(case))]
    RESUME,
    #[token("RIGHT", ignore(case))]
    RIGHT,
    #[token("SAMPLE", ignore(case))]
    SAMPLE,
    #[token("SCHEDULE", ignore(case))]
    SCHEDULE,
    #[token("SCHEMAS", ignore(case))]
    SCHEMAS,
    #[token("SCHEMA", ignore(case))]
    SCHEMA,
    #[token("SEARCH", ignore(case))]
    SEARCH,
    #[token("SELECT", ignore(case))]
    SELECT,
    #[token("SET", ignore(case))]
    SET,
    #[token("SHOW", ignore(case))]
    SHOW,
    #[token("STATEMENTS", ignore(case))]
    STATEMENTS,
    #[token("STRING", ignore(case))]
    STRING,
    #[token("SUSPEND", ignore(case))]
    SUSPEND,
    #[token("TABLE", ignore(case))]
    TABLE,
    #[token("TABLES", ignore(case))]
    TABLES,
    #[token("THEN", ignore(case))]
    THEN,
    #[token("TIMESTAMP", ignore(case))]
    TIMESTAMP,
    #[token("TO", ignore(case))]
    TO,
    #[token("TRUE", ignore(case))]
    TRUE,
    #[token("UINT", ignore(case))]
    UINT,
    #[token("UNION", ignore(case))]
    UNION,
    #[token("UPDATE", ignore(case))]
    UPDATE,
    #[token("VACUUM", ignore(case))]
    VACUUM,
    #[token("VALUES", ignore(case))]
    VALUES,
    #[token("VIEW", ignore(case))]
    VIEW,
    #[token("VIEWS", ignore(case))]
    VIEWS,
    #[token("WHEN", ignore(case))]
    WHEN,
    #[token("WHERE", ignore(case))]
    WHERE,
    #[token("WINDOW", ignore(case))]
    WINDOW,
    #[token("WITH", ignore(case))]
    WITH,
    #[token("WITHIN", ignore(case))]
    WITHIN,
    #[token("XOR", ignore(case))]
    XOR,

    // --- Command-mode tokens (compiled only with the `command` feature) ---
    #[cfg(feature = "command")]
    #[token("\\")]
    BackSlash,
    #[cfg(feature = "command")]
    #[token("CANCEL", ignore(case))]
    CANCEL,
}
376
377impl TokenKind {
378 pub fn is_literal(&self) -> bool {
379 use TokenKind::*;
380
381 matches!(
382 self,
383 LiteralFloat
384 | LiteralInteger
385 | LiteralString
386 | LiteralHexBinaryString
387 | LiteralHexInteger
388 )
389 }
390
391 pub fn is_symbol(&self) -> bool {
392 use TokenKind::*;
393
394 #[cfg(feature = "command")]
395 if matches!(self, BackSlash) {
396 return true;
397 }
398
399 matches!(
400 self,
401 Eq | NotEq
402 | Lt
403 | Gt
404 | Lte
405 | Gte
406 | Plus
407 | Minus
408 | Multiply
409 | Divide
410 | Modulo
411 | Concat
412 | LParen
413 | RParen
414 | LBracket
415 | RBracket
416 | LBrace
417 | RBrace
418 | Comma
419 | Dot
420 | Colon
421 | DoubleColon
422 | SemiColon
423 | Dollar
424 | Arrow
425 )
426 }
427
428 pub fn is_keyword(&self) -> bool {
429 use TokenKind::*;
430
431 !self.is_literal()
432 && !self.is_symbol()
433 && !matches!(self, Ident | EOI | Whitespace | Comment)
434 }
435
436 pub fn is_reserved_keyword(&self) -> bool {
437 use TokenKind::*;
438
439 matches!(
440 self,
441 FROM | JOIN
442 | VALUES
443 | WHERE
444 | ORDER
445 | DISTINCT
446 | LIMIT
447 | SELECT
448 | AGGREGATE
449 | WINDOW
450 | WITHIN
451 | GROUP
452 | INSERT
453 | UNION
454 | SAMPLE
455 | NULL
456 | TRUE
457 | FALSE
458 | AS
459 | BY
460 | ON
461 | CASE
462 | WHEN
463 | THEN
464 | ELSE
465 | END
466 | CAST
467 | NOT
468 | IS
469 | IN
470 | BETWEEN
471 | AND
472 | OR
473 )
474 }
475}