eventql_parser/
lexer.rs

1//! Lexical analysis (tokenization) for EventQL.
2//!
3//! This module provides the tokenizer that converts raw EventQL query strings
4//! into a sequence of tokens. The tokenizer is built using the `nom` parser
5//! combinator library.
6//!
7//! # Main Function
8//!
9//! - [`tokenize`] - Convert a query string into a vector of tokens
10use crate::token::{Operator, Sym, Symbol, Text, Token};
11use nom::branch::alt;
12use nom::bytes::complete::take_while;
13use nom::character::complete::{alpha1, alphanumeric0, char, multispace0};
14use nom::character::one_of;
15use nom::combinator::{eof, opt, recognize};
16use nom::error::{Error, context};
17use nom::number::complete::double;
18use nom::sequence::{delimited, pair};
19use nom::{IResult, Parser};
20
21/// Tokenize an EventQL query string.
22///
23/// This function performs lexical analysis on the input string, converting it
24/// into a sequence of tokens. Each token includes position information (line
25/// and column numbers) for error reporting.
26/// # Recognized Tokens
27///
28/// - **Identifiers**: Alphanumeric names starting with a letter (e.g., `events`, `e`)
29/// - **Keywords**: Case-insensitive SQL-like keywords detected by the parser
30/// - **Numbers**: Floating-point literals (e.g., `42`, `3.14`)
31/// - **Strings**: Double-quoted string literals (e.g., `"hello"`)
32/// - **Operators**: Arithmetic (`+`, `-`, `*`, `/`), comparison (`==`, `!=`, `<`, `<=`, `>`, `>=`), logical (`AND`, `OR`, `XOR`, `NOT`)
33/// - **Symbols**: Structural characters (`(`, `)`, `[`, `]`, `{`, `}`, `.`, `,`, `:`)
34pub fn tokenize(input: &str) -> Result<Vec<Token<'_>>, nom::Err<Error<Text<'_>>>> {
35    let mut input = Text::new(input);
36    let mut tokens = Vec::new();
37
38    loop {
39        let (remaining, token) = token(input)?;
40        input = remaining;
41
42        tokens.push(token);
43
44        if matches!(token.sym, Sym::Eof) {
45            break;
46        }
47    }
48
49    Ok(tokens)
50}
51
52fn token(input: Text) -> IResult<Text, Token> {
53    delimited(
54        multispace0,
55        alt((end_of_file, symbol, operator, ident, number, string)),
56        multispace0,
57    )
58    .parse(input)
59}
60
61fn symbol(input: Text) -> IResult<Text, Token> {
62    one_of("().,:[]{}")
63        .map(|c| match c {
64            '(' => Symbol::OpenParen,
65            ')' => Symbol::CloseParen,
66            '.' => Symbol::Dot,
67            ',' => Symbol::Comma,
68            ':' => Symbol::Colon,
69            '[' => Symbol::OpenBracket,
70            ']' => Symbol::CloseBracket,
71            '{' => Symbol::OpenBrace,
72            '}' => Symbol::CloseBrace,
73            _ => unreachable!(),
74        })
75        .map(move |sym| Token {
76            sym: Sym::Symbol(sym),
77            line: input.location_line(),
78            col: input.get_column() as u32,
79        })
80        .parse(input)
81}
82
83fn end_of_file(input: Text) -> IResult<Text, Token> {
84    eof.map(|_| Token {
85        sym: Sym::Eof,
86        line: input.location_line(),
87        col: input.get_column() as u32,
88    })
89    .parse(input)
90}
91
92fn operator(input: Text) -> IResult<Text, Token> {
93    alt((operator_1, operator_2)).parse(input)
94}
95
96fn operator_1(input: Text) -> IResult<Text, Token> {
97    one_of("+-*/^")
98        .map(|c| match c {
99            '+' => Operator::Add,
100            '-' => Operator::Sub,
101            '*' => Operator::Mul,
102            '/' => Operator::Div,
103            _ => unreachable!(),
104        })
105        .map(move |op| Token {
106            sym: Sym::Operator(op),
107            line: input.location_line(),
108            col: input.get_column() as u32,
109        })
110        .parse(input)
111}
112
113fn operator_2(input: Text) -> IResult<Text, Token> {
114    one_of("<>!=")
115        .flat_map(|c| {
116            context(
117                "valid character when parsing an operator",
118                opt(char('=')).map_opt(move |eq_opt| match (c, eq_opt.is_some()) {
119                    ('<', false) => Some(Operator::Lt),
120                    ('<', true) => Some(Operator::Lte),
121                    ('>', false) => Some(Operator::Gt),
122                    ('>', true) => Some(Operator::Gte),
123                    ('!', true) => Some(Operator::Neq),
124                    ('=', true) => Some(Operator::Eq),
125                    _ => None,
126                }),
127            )
128        })
129        .map(move |op| Token {
130            sym: Sym::Operator(op),
131            line: input.location_line(),
132            col: input.get_column() as u32,
133        })
134        .parse(input)
135}
136
137fn ident(input: Text) -> IResult<Text, Token> {
138    recognize(pair(alpha1, alphanumeric0))
139        .map(|value: Text| {
140            let sym = if value.fragment().eq_ignore_ascii_case("and") {
141                Sym::Operator(Operator::And)
142            } else if value.fragment().eq_ignore_ascii_case("or") {
143                Sym::Operator(Operator::Or)
144            } else if value.fragment().eq_ignore_ascii_case("xor") {
145                Sym::Operator(Operator::Xor)
146            } else if value.fragment().eq_ignore_ascii_case("not") {
147                Sym::Operator(Operator::Not)
148            } else {
149                Sym::Id(value.fragment())
150            };
151
152            Token {
153                sym,
154                line: value.location_line(),
155                col: value.get_column() as u32,
156            }
157        })
158        .parse(input)
159}
160
161fn number(input: Text) -> IResult<Text, Token> {
162    double
163        .map(|value| Token {
164            sym: Sym::Number(value),
165            line: input.location_line(),
166            col: input.get_column() as u32,
167        })
168        .parse(input)
169}
170
171fn string(input: Text) -> IResult<Text, Token> {
172    delimited(char('"'), take_while(|c| c != '"'), char('"'))
173        .map(|value: Text| Token {
174            sym: Sym::String(value.fragment()),
175            line: input.location_line(),
176            col: input.get_column() as u32,
177        })
178        .parse(input)
179}