graphos_adapters/query/gql/lexer.rs

//! GQL Lexer.

use graphos_common::utils::error::SourceSpan;

/// A token in the GQL language.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The token kind.
    pub kind: TokenKind,
    /// The source text.
    pub text: String,
    /// Source span.
    pub span: SourceSpan,
}

/// Token kinds in GQL.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Keywords
    /// MATCH keyword.
    Match,
    /// RETURN keyword.
    Return,
    /// WHERE keyword.
    Where,
    /// AND keyword.
    And,
    /// OR keyword.
    Or,
    /// NOT keyword.
    Not,
    /// INSERT keyword.
    Insert,
    /// DELETE keyword.
    Delete,
    /// SET keyword.
    Set,
    /// CREATE keyword.
    Create,
    /// NODE keyword.
    Node,
    /// EDGE keyword.
    Edge,
    /// TYPE keyword.
    Type,
    /// AS keyword.
    As,
    /// DISTINCT keyword.
    Distinct,
    /// ORDER keyword.
    Order,
    /// BY keyword.
    By,
    /// ASC keyword.
    Asc,
    /// DESC keyword.
    Desc,
    /// SKIP keyword.
    Skip,
    /// LIMIT keyword.
    Limit,
    /// NULL keyword.
    Null,
    /// TRUE keyword.
    True,
    /// FALSE keyword.
    False,
    /// DETACH keyword.
    Detach,
    /// CALL keyword.
    Call,
    /// YIELD keyword.
    Yield,
    /// IN keyword.
    In,
    /// LIKE keyword.
    Like,
    /// IS keyword.
    Is,
    /// CASE keyword.
    Case,
    /// WHEN keyword.
    When,
    /// THEN keyword.
    Then,
    /// ELSE keyword.
    Else,
    /// END keyword.
    End,
    /// OPTIONAL keyword.
    Optional,
    /// WITH keyword.
    With,
    /// EXISTS keyword for subquery expressions.
    Exists,

    // Literals
    /// Integer literal.
    Integer,
    /// Float literal.
    Float,
    /// String literal.
    String,

    // Identifiers
    /// Identifier.
    Identifier,

    // Operators
    /// = operator.
    Eq,
    /// <> operator.
    Ne,
    /// < operator.
    Lt,
    /// <= operator.
    Le,
    /// > operator.
    Gt,
    /// >= operator.
    Ge,
    /// + operator.
    Plus,
    /// - operator.
    Minus,
    /// * operator.
    Star,
    /// / operator.
    Slash,
    /// % operator.
    Percent,
    /// || operator.
    Concat,

    // Punctuation
    /// ( punctuation.
    LParen,
    /// ) punctuation.
    RParen,
    /// [ punctuation.
    LBracket,
    /// ] punctuation.
    RBracket,
    /// { punctuation.
    LBrace,
    /// } punctuation.
    RBrace,
    /// : punctuation.
    Colon,
    /// , punctuation.
    Comma,
    /// . punctuation.
    Dot,
    /// -> arrow.
    Arrow,
    /// <- arrow.
    LeftArrow,
    /// -- double dash.
    DoubleDash,

    /// Parameter ($name).
    Parameter,

    /// End of input.
    Eof,

    /// Error token.
    Error,
}

/// GQL Lexer.
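///
/// A minimal usage sketch (not compiled as a doctest, since the import path
/// depends on how this module is exposed): tokens are pulled one at a time
/// until an `Eof` token is returned.
///
/// ```ignore
/// let mut lexer = Lexer::new("MATCH (n) WHERE n.age > $min_age RETURN n");
/// loop {
///     let token = lexer.next_token();
///     if token.kind == TokenKind::Eof {
///         break;
///     }
///     println!("{:?}: {}", token.kind, token.text);
/// }
/// ```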
pub struct Lexer<'a> {
    input: &'a str,
    position: usize,
    line: u32,
    column: u32,
}

impl<'a> Lexer<'a> {
    /// Creates a new lexer for the given input.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            position: 0,
            line: 1,
            column: 1,
        }
    }

    /// Returns the next token.
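    ///
    /// Once the end of input is reached, every subsequent call returns an
    /// `Eof` token.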
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace();

        let start = self.position;
        let start_line = self.line;
        let start_column = self.column;

        if self.position >= self.input.len() {
            return Token {
                kind: TokenKind::Eof,
                text: String::new(),
                span: SourceSpan::new(start, start, start_line, start_column),
            };
        }

        let ch = self.current_char();

        let kind = match ch {
            '(' => {
                self.advance();
                TokenKind::LParen
            }
            ')' => {
                self.advance();
                TokenKind::RParen
            }
            '[' => {
                self.advance();
                TokenKind::LBracket
            }
            ']' => {
                self.advance();
                TokenKind::RBracket
            }
            '{' => {
                self.advance();
                TokenKind::LBrace
            }
            '}' => {
                self.advance();
                TokenKind::RBrace
            }
            ':' => {
                self.advance();
                TokenKind::Colon
            }
            ',' => {
                self.advance();
                TokenKind::Comma
            }
            '.' => {
                self.advance();
                TokenKind::Dot
            }
            '+' => {
                self.advance();
                TokenKind::Plus
            }
            '*' => {
                self.advance();
                TokenKind::Star
            }
            '/' => {
                self.advance();
                TokenKind::Slash
            }
            '%' => {
                self.advance();
                TokenKind::Percent
            }
            '=' => {
                self.advance();
                TokenKind::Eq
            }
            '<' => {
                self.advance();
                if self.current_char() == '>' {
                    self.advance();
                    TokenKind::Ne
                } else if self.current_char() == '=' {
                    self.advance();
                    TokenKind::Le
                } else if self.current_char() == '-' {
                    self.advance();
                    TokenKind::LeftArrow
                } else {
                    TokenKind::Lt
                }
            }
            '>' => {
                self.advance();
                if self.current_char() == '=' {
                    self.advance();
                    TokenKind::Ge
                } else {
                    TokenKind::Gt
                }
            }
            '-' => {
                self.advance();
                if self.current_char() == '>' {
                    self.advance();
                    TokenKind::Arrow
                } else if self.current_char() == '-' {
                    self.advance();
                    TokenKind::DoubleDash
                } else {
                    TokenKind::Minus
                }
            }
            '|' => {
                self.advance();
                if self.current_char() == '|' {
                    self.advance();
                    TokenKind::Concat
                } else {
                    TokenKind::Error
                }
            }
            '\'' | '"' => self.scan_string(),
            '$' => self.scan_parameter(),
            _ if ch.is_ascii_digit() => self.scan_number(),
            _ if ch.is_ascii_alphabetic() || ch == '_' => self.scan_identifier(),
            _ => {
                self.advance();
                TokenKind::Error
            }
        };

        let text = self.input[start..self.position].to_string();
        Token {
            kind,
            text,
            span: SourceSpan::new(start, self.position, start_line, start_column),
        }
    }

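    /// Skips over whitespace, keeping the line and column counters in sync.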
    fn skip_whitespace(&mut self) {
        while self.position < self.input.len() {
            let ch = self.current_char();
            if ch.is_whitespace() {
                if ch == '\n' {
                    self.line += 1;
                    self.column = 1;
                } else {
                    self.column += 1;
                }
                self.position += ch.len_utf8();
            } else {
                break;
            }
        }
    }

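    /// Returns the character at the current position, or `'\0'` once the
    /// input is exhausted.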
    fn current_char(&self) -> char {
        self.input[self.position..].chars().next().unwrap_or('\0')
    }

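    /// Advances past the current character by its UTF-8 width, bumping the
    /// column counter.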
    fn advance(&mut self) {
        if let Some(ch) = self.input[self.position..].chars().next() {
            self.position += ch.len_utf8();
            self.column += 1;
        }
    }

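    /// Scans a single- or double-quoted string literal, skipping over
    /// backslash escapes; returns `Error` for an unterminated string.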
    fn scan_string(&mut self) -> TokenKind {
        let quote = self.current_char();
        self.advance();

        while self.position < self.input.len() {
            let ch = self.current_char();
            if ch == quote {
                self.advance();
                return TokenKind::String;
            }
            if ch == '\\' {
                self.advance();
            }
            self.advance();
        }

        TokenKind::Error // Unterminated string
    }

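    /// Scans an integer literal, or a float literal when a `.` follows the
    /// integer part.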
    fn scan_number(&mut self) -> TokenKind {
        while self.position < self.input.len() && self.current_char().is_ascii_digit() {
            self.advance();
        }

        if self.current_char() == '.' {
            self.advance();
            while self.position < self.input.len() && self.current_char().is_ascii_digit() {
                self.advance();
            }
            TokenKind::Float
        } else {
            TokenKind::Integer
        }
    }

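    /// Scans a `$name` parameter; the name must start with an ASCII letter
    /// or underscore.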
    fn scan_parameter(&mut self) -> TokenKind {
        // Skip the '$'
        self.advance();

        // Parameter name must start with a letter or underscore
        if self.position >= self.input.len() {
            return TokenKind::Error;
        }

        let ch = self.current_char();
        if !ch.is_ascii_alphabetic() && ch != '_' {
            return TokenKind::Error;
        }

        // Scan the rest of the identifier
        while self.position < self.input.len() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        TokenKind::Parameter
    }

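    /// Scans an identifier and maps reserved words (matched case-insensitively)
    /// to their keyword tokens.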
    fn scan_identifier(&mut self) -> TokenKind {
        let start = self.position;
        while self.position < self.input.len() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        match text.to_uppercase().as_str() {
            "MATCH" => TokenKind::Match,
            "RETURN" => TokenKind::Return,
            "WHERE" => TokenKind::Where,
            "AND" => TokenKind::And,
            "OR" => TokenKind::Or,
            "NOT" => TokenKind::Not,
            "INSERT" => TokenKind::Insert,
            "DELETE" => TokenKind::Delete,
            "SET" => TokenKind::Set,
            "CREATE" => TokenKind::Create,
            "NODE" => TokenKind::Node,
            "EDGE" => TokenKind::Edge,
            "TYPE" => TokenKind::Type,
            "AS" => TokenKind::As,
            "DISTINCT" => TokenKind::Distinct,
            "ORDER" => TokenKind::Order,
            "BY" => TokenKind::By,
            "ASC" => TokenKind::Asc,
            "DESC" => TokenKind::Desc,
            "SKIP" => TokenKind::Skip,
            "LIMIT" => TokenKind::Limit,
            "NULL" => TokenKind::Null,
            "TRUE" => TokenKind::True,
            "FALSE" => TokenKind::False,
            "DETACH" => TokenKind::Detach,
            "CALL" => TokenKind::Call,
            "YIELD" => TokenKind::Yield,
            "IN" => TokenKind::In,
            "LIKE" => TokenKind::Like,
            "IS" => TokenKind::Is,
            "CASE" => TokenKind::Case,
            "WHEN" => TokenKind::When,
            "THEN" => TokenKind::Then,
            "ELSE" => TokenKind::Else,
            "END" => TokenKind::End,
            "EXISTS" => TokenKind::Exists,
            "OPTIONAL" => TokenKind::Optional,
            "WITH" => TokenKind::With,
            _ => TokenKind::Identifier,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokens() {
        let mut lexer = Lexer::new("MATCH (n) RETURN n");

        assert_eq!(lexer.next_token().kind, TokenKind::Match);
        assert_eq!(lexer.next_token().kind, TokenKind::LParen);
        assert_eq!(lexer.next_token().kind, TokenKind::Identifier);
        assert_eq!(lexer.next_token().kind, TokenKind::RParen);
        assert_eq!(lexer.next_token().kind, TokenKind::Return);
        assert_eq!(lexer.next_token().kind, TokenKind::Identifier);
        assert_eq!(lexer.next_token().kind, TokenKind::Eof);
    }

    #[test]
    fn test_arrow_tokens() {
        let mut lexer = Lexer::new("->  <-  --");

        assert_eq!(lexer.next_token().kind, TokenKind::Arrow);
        assert_eq!(lexer.next_token().kind, TokenKind::LeftArrow);
        assert_eq!(lexer.next_token().kind, TokenKind::DoubleDash);
    }

    #[test]
    fn test_number_tokens() {
        let mut lexer = Lexer::new("42 3.14");

        let int_token = lexer.next_token();
        assert_eq!(int_token.kind, TokenKind::Integer);
        assert_eq!(int_token.text, "42");

        let float_token = lexer.next_token();
        assert_eq!(float_token.kind, TokenKind::Float);
        assert_eq!(float_token.text, "3.14");
    }

    #[test]
    fn test_string_tokens() {
        let mut lexer = Lexer::new("'hello' \"world\"");

        let s1 = lexer.next_token();
        assert_eq!(s1.kind, TokenKind::String);
        assert_eq!(s1.text, "'hello'");

        let s2 = lexer.next_token();
        assert_eq!(s2.kind, TokenKind::String);
        assert_eq!(s2.text, "\"world\"");
    }

    #[test]
    fn test_parameter_tokens() {
        let mut lexer = Lexer::new("$param1 $another_param");

        let p1 = lexer.next_token();
        assert_eq!(p1.kind, TokenKind::Parameter);
        assert_eq!(p1.text, "$param1");

        let p2 = lexer.next_token();
        assert_eq!(p2.kind, TokenKind::Parameter);
        assert_eq!(p2.text, "$another_param");
    }

    #[test]
    fn test_parameter_in_query() {
        let mut lexer = Lexer::new("n.age > $min_age");

        assert_eq!(lexer.next_token().kind, TokenKind::Identifier); // n
        assert_eq!(lexer.next_token().kind, TokenKind::Dot);
        assert_eq!(lexer.next_token().kind, TokenKind::Identifier); // age
        assert_eq!(lexer.next_token().kind, TokenKind::Gt);

        let param = lexer.next_token();
        assert_eq!(param.kind, TokenKind::Parameter);
        assert_eq!(param.text, "$min_age");
    }
}