// graphos_adapters/query/gql/lexer.rs

//! GQL Lexer.

use graphos_common::utils::error::SourceSpan;
/// A token in the GQL language.
///
/// Pairs the classified [`TokenKind`] with the raw source text it was
/// scanned from and the span covering it.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The token kind.
    pub kind: TokenKind,
    /// The source text, copied verbatim from the input (empty for `Eof`).
    pub text: String,
    /// Source span: byte offsets plus the starting line/column.
    pub span: SourceSpan,
}
/// Token kinds in GQL.
///
/// Grouped as keywords, literals, identifiers, operators, punctuation,
/// and the sentinel kinds `Eof` and `Error`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Keywords (matched case-insensitively by the lexer)
    /// MATCH keyword.
    Match,
    /// RETURN keyword.
    Return,
    /// WHERE keyword.
    Where,
    /// AND keyword.
    And,
    /// OR keyword.
    Or,
    /// NOT keyword.
    Not,
    /// INSERT keyword.
    Insert,
    /// DELETE keyword.
    Delete,
    /// SET keyword.
    Set,
    /// CREATE keyword.
    Create,
    /// NODE keyword.
    Node,
    /// EDGE keyword.
    Edge,
    /// TYPE keyword.
    Type,
    /// AS keyword.
    As,
    /// DISTINCT keyword.
    Distinct,
    /// ORDER keyword.
    Order,
    /// BY keyword.
    By,
    /// ASC keyword.
    Asc,
    /// DESC keyword.
    Desc,
    /// SKIP keyword.
    Skip,
    /// LIMIT keyword.
    Limit,
    /// NULL keyword.
    Null,
    /// TRUE keyword.
    True,
    /// FALSE keyword.
    False,
    /// DETACH keyword.
    Detach,
    /// CALL keyword.
    Call,
    /// YIELD keyword.
    Yield,
    /// IN keyword.
    In,
    /// LIKE keyword.
    Like,
    /// IS keyword.
    Is,
    /// CASE keyword.
    Case,
    /// WHEN keyword.
    When,
    /// THEN keyword.
    Then,
    /// ELSE keyword.
    Else,
    /// END keyword.
    End,

    // Literals
    /// Integer literal.
    Integer,
    /// Float literal.
    Float,
    /// String literal (single- or double-quoted).
    String,

    // Identifiers
    /// Identifier (any non-keyword word).
    Identifier,

    // Operators
    /// = operator.
    Eq,
    /// <> operator.
    Ne,
    /// < operator.
    Lt,
    /// <= operator.
    Le,
    /// > operator.
    Gt,
    /// >= operator.
    Ge,
    /// + operator.
    Plus,
    /// - operator.
    Minus,
    /// * operator.
    Star,
    /// / operator.
    Slash,
    /// % operator.
    Percent,
    /// || operator.
    Concat,

    // Punctuation
    /// ( punctuation.
    LParen,
    /// ) punctuation.
    RParen,
    /// [ punctuation.
    LBracket,
    /// ] punctuation.
    RBracket,
    /// { punctuation.
    LBrace,
    /// } punctuation.
    RBrace,
    /// : punctuation.
    Colon,
    /// , punctuation.
    Comma,
    /// . punctuation.
    Dot,
    /// -> arrow.
    Arrow,
    /// <- arrow.
    LeftArrow,
    /// -- double dash.
    DoubleDash,

    /// End of input.
    Eof,

    /// Error token: unterminated string, a lone `|`, or any character
    /// that does not start a token.
    Error,
}
/// GQL Lexer.
///
/// A hand-written scanner; call [`Lexer::next_token`] repeatedly until it
/// yields a token of kind `Eof`.
pub struct Lexer<'a> {
    /// The full source text being scanned.
    input: &'a str,
    /// Current byte offset into `input`.
    position: usize,
    /// Current 1-based line number (incremented on '\n').
    line: u32,
    /// Current 1-based column number (reset to 1 after a newline).
    column: u32,
}
170impl<'a> Lexer<'a> {
171    /// Creates a new lexer for the given input.
172    pub fn new(input: &'a str) -> Self {
173        Self {
174            input,
175            position: 0,
176            line: 1,
177            column: 1,
178        }
179    }
180
181    /// Returns the next token.
182    pub fn next_token(&mut self) -> Token {
183        self.skip_whitespace();
184
185        let start = self.position;
186        let start_line = self.line;
187        let start_column = self.column;
188
189        if self.position >= self.input.len() {
190            return Token {
191                kind: TokenKind::Eof,
192                text: String::new(),
193                span: SourceSpan::new(start, start, start_line, start_column),
194            };
195        }
196
197        let ch = self.current_char();
198
199        let kind = match ch {
200            '(' => {
201                self.advance();
202                TokenKind::LParen
203            }
204            ')' => {
205                self.advance();
206                TokenKind::RParen
207            }
208            '[' => {
209                self.advance();
210                TokenKind::LBracket
211            }
212            ']' => {
213                self.advance();
214                TokenKind::RBracket
215            }
216            '{' => {
217                self.advance();
218                TokenKind::LBrace
219            }
220            '}' => {
221                self.advance();
222                TokenKind::RBrace
223            }
224            ':' => {
225                self.advance();
226                TokenKind::Colon
227            }
228            ',' => {
229                self.advance();
230                TokenKind::Comma
231            }
232            '.' => {
233                self.advance();
234                TokenKind::Dot
235            }
236            '+' => {
237                self.advance();
238                TokenKind::Plus
239            }
240            '*' => {
241                self.advance();
242                TokenKind::Star
243            }
244            '/' => {
245                self.advance();
246                TokenKind::Slash
247            }
248            '%' => {
249                self.advance();
250                TokenKind::Percent
251            }
252            '=' => {
253                self.advance();
254                TokenKind::Eq
255            }
256            '<' => {
257                self.advance();
258                if self.current_char() == '>' {
259                    self.advance();
260                    TokenKind::Ne
261                } else if self.current_char() == '=' {
262                    self.advance();
263                    TokenKind::Le
264                } else if self.current_char() == '-' {
265                    self.advance();
266                    TokenKind::LeftArrow
267                } else {
268                    TokenKind::Lt
269                }
270            }
271            '>' => {
272                self.advance();
273                if self.current_char() == '=' {
274                    self.advance();
275                    TokenKind::Ge
276                } else {
277                    TokenKind::Gt
278                }
279            }
280            '-' => {
281                self.advance();
282                if self.current_char() == '>' {
283                    self.advance();
284                    TokenKind::Arrow
285                } else if self.current_char() == '-' {
286                    self.advance();
287                    TokenKind::DoubleDash
288                } else {
289                    TokenKind::Minus
290                }
291            }
292            '|' => {
293                self.advance();
294                if self.current_char() == '|' {
295                    self.advance();
296                    TokenKind::Concat
297                } else {
298                    TokenKind::Error
299                }
300            }
301            '\'' | '"' => self.scan_string(),
302            _ if ch.is_ascii_digit() => self.scan_number(),
303            _ if ch.is_ascii_alphabetic() || ch == '_' => self.scan_identifier(),
304            _ => {
305                self.advance();
306                TokenKind::Error
307            }
308        };
309
310        let text = self.input[start..self.position].to_string();
311        Token {
312            kind,
313            text,
314            span: SourceSpan::new(start, self.position, start_line, start_column),
315        }
316    }
317
318    fn skip_whitespace(&mut self) {
319        while self.position < self.input.len() {
320            let ch = self.current_char();
321            if ch.is_whitespace() {
322                if ch == '\n' {
323                    self.line += 1;
324                    self.column = 1;
325                } else {
326                    self.column += 1;
327                }
328                self.position += 1;
329            } else {
330                break;
331            }
332        }
333    }
334
335    fn current_char(&self) -> char {
336        self.input[self.position..].chars().next().unwrap_or('\0')
337    }
338
339    fn advance(&mut self) {
340        if self.position < self.input.len() {
341            self.position += 1;
342            self.column += 1;
343        }
344    }
345
346    fn scan_string(&mut self) -> TokenKind {
347        let quote = self.current_char();
348        self.advance();
349
350        while self.position < self.input.len() {
351            let ch = self.current_char();
352            if ch == quote {
353                self.advance();
354                return TokenKind::String;
355            }
356            if ch == '\\' {
357                self.advance();
358            }
359            self.advance();
360        }
361
362        TokenKind::Error // Unterminated string
363    }
364
365    fn scan_number(&mut self) -> TokenKind {
366        while self.position < self.input.len() && self.current_char().is_ascii_digit() {
367            self.advance();
368        }
369
370        if self.current_char() == '.' {
371            self.advance();
372            while self.position < self.input.len() && self.current_char().is_ascii_digit() {
373                self.advance();
374            }
375            TokenKind::Float
376        } else {
377            TokenKind::Integer
378        }
379    }
380
381    fn scan_identifier(&mut self) -> TokenKind {
382        let start = self.position;
383        while self.position < self.input.len() {
384            let ch = self.current_char();
385            if ch.is_ascii_alphanumeric() || ch == '_' {
386                self.advance();
387            } else {
388                break;
389            }
390        }
391
392        let text = &self.input[start..self.position];
393        match text.to_uppercase().as_str() {
394            "MATCH" => TokenKind::Match,
395            "RETURN" => TokenKind::Return,
396            "WHERE" => TokenKind::Where,
397            "AND" => TokenKind::And,
398            "OR" => TokenKind::Or,
399            "NOT" => TokenKind::Not,
400            "INSERT" => TokenKind::Insert,
401            "DELETE" => TokenKind::Delete,
402            "SET" => TokenKind::Set,
403            "CREATE" => TokenKind::Create,
404            "NODE" => TokenKind::Node,
405            "EDGE" => TokenKind::Edge,
406            "TYPE" => TokenKind::Type,
407            "AS" => TokenKind::As,
408            "DISTINCT" => TokenKind::Distinct,
409            "ORDER" => TokenKind::Order,
410            "BY" => TokenKind::By,
411            "ASC" => TokenKind::Asc,
412            "DESC" => TokenKind::Desc,
413            "SKIP" => TokenKind::Skip,
414            "LIMIT" => TokenKind::Limit,
415            "NULL" => TokenKind::Null,
416            "TRUE" => TokenKind::True,
417            "FALSE" => TokenKind::False,
418            "DETACH" => TokenKind::Detach,
419            "CALL" => TokenKind::Call,
420            "YIELD" => TokenKind::Yield,
421            "IN" => TokenKind::In,
422            "LIKE" => TokenKind::Like,
423            "IS" => TokenKind::Is,
424            "CASE" => TokenKind::Case,
425            "WHEN" => TokenKind::When,
426            "THEN" => TokenKind::Then,
427            "ELSE" => TokenKind::Else,
428            "END" => TokenKind::End,
429            _ => TokenKind::Identifier,
430        }
431    }
432}
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects all token kinds up to (excluding) `Eof`.
    fn kinds(input: &str) -> Vec<TokenKind> {
        let mut lexer = Lexer::new(input);
        let mut out = Vec::new();
        loop {
            let tok = lexer.next_token();
            if tok.kind == TokenKind::Eof {
                break;
            }
            out.push(tok.kind);
        }
        out
    }

    #[test]
    fn test_simple_tokens() {
        assert_eq!(
            kinds("MATCH (n) RETURN n"),
            vec![
                TokenKind::Match,
                TokenKind::LParen,
                TokenKind::Identifier,
                TokenKind::RParen,
                TokenKind::Return,
                TokenKind::Identifier,
            ]
        );

        // EOF is sticky: repeated calls after exhaustion keep returning it.
        let mut lexer = Lexer::new("");
        assert_eq!(lexer.next_token().kind, TokenKind::Eof);
        assert_eq!(lexer.next_token().kind, TokenKind::Eof);
    }

    #[test]
    fn test_arrow_tokens() {
        assert_eq!(
            kinds("->  <-  --"),
            vec![TokenKind::Arrow, TokenKind::LeftArrow, TokenKind::DoubleDash]
        );
    }

    #[test]
    fn test_comparison_operators() {
        assert_eq!(
            kinds("<> <= >= < > ="),
            vec![
                TokenKind::Ne,
                TokenKind::Le,
                TokenKind::Ge,
                TokenKind::Lt,
                TokenKind::Gt,
                TokenKind::Eq,
            ]
        );
    }

    #[test]
    fn test_concat_and_lone_pipe() {
        assert_eq!(kinds("||"), vec![TokenKind::Concat]);
        // A lone '|' is not a valid token.
        assert_eq!(kinds("|"), vec![TokenKind::Error]);
    }

    #[test]
    fn test_keywords_case_insensitive() {
        assert_eq!(
            kinds("match Return WHERE"),
            vec![TokenKind::Match, TokenKind::Return, TokenKind::Where]
        );
    }

    #[test]
    fn test_number_tokens() {
        let mut lexer = Lexer::new("42 3.14");

        let int_token = lexer.next_token();
        assert_eq!(int_token.kind, TokenKind::Integer);
        assert_eq!(int_token.text, "42");

        let float_token = lexer.next_token();
        assert_eq!(float_token.kind, TokenKind::Float);
        assert_eq!(float_token.text, "3.14");
    }

    #[test]
    fn test_string_tokens() {
        let mut lexer = Lexer::new("'hello' \"world\"");

        let s1 = lexer.next_token();
        assert_eq!(s1.kind, TokenKind::String);
        assert_eq!(s1.text, "'hello'");

        let s2 = lexer.next_token();
        assert_eq!(s2.kind, TokenKind::String);
        assert_eq!(s2.text, "\"world\"");
    }

    #[test]
    fn test_unterminated_string() {
        assert_eq!(kinds("'never closed"), vec![TokenKind::Error]);
    }
}
485}