Skip to main content

eventql_parser/
lexer.rs

1//! Lexical analysis (tokenization) for EventQL.
2//!
3//! This module provides the tokenizer that converts raw EventQL query strings
4//! into a sequence of tokens. The tokenizer is built using the `nom` parser
5//! combinator library.
6//!
7//! # Main Function
8//!
9//! - [`tokenize`] - Convert a query string into a vector of tokens
10use crate::error::LexerError;
11use crate::token::{Operator, Sym, Symbol, Text, Token};
12use nom::branch::alt;
13use nom::bytes::complete::{tag, take_while};
14use nom::character::complete::{alpha1, char, multispace1, satisfy};
15use nom::character::one_of;
16use nom::combinator::{eof, opt, recognize};
17use nom::error::{Error, context};
18use nom::multi::many0;
19use nom::number::complete::double;
20use nom::sequence::{delimited, pair};
21use nom::{AsChar, IResult, Parser};
22
23/// Tokenize an EventQL query string.
24///
25/// This function performs lexical analysis on the input string, converting it
26/// into a sequence of tokens. Each token includes position information (line
27/// and column numbers) for error reporting.
28/// # Recognized Tokens
29///
30/// - **Identifiers**: Alphanumeric names starting with a letter (e.g., `events`, `e`)
31/// - **Keywords**: Case-insensitive SQL-like keywords detected by the parser
32/// - **Numbers**: Floating-point literals (e.g., `42`, `3.14`)
33/// - **Strings**: Double-quoted string literals (e.g., `"hello"`)
34/// - **Operators**: Arithmetic (`+`, `-`, `*`, `/`), comparison (`==`, `!=`, `<`, `<=`, `>`, `>=`), logical (`AND`, `OR`, `XOR`, `NOT`)
35/// - **Symbols**: Structural characters (`(`, `)`, `[`, `]`, `{`, `}`, `.`, `,`, `:`)
36pub(crate) fn tokenize(input: &str) -> Result<Vec<Token<'_>>, LexerError> {
37    let mut input = Text::new(input);
38    let mut tokens = Vec::new();
39
40    loop {
41        let (remaining, token) = token(input).map_err(massage_nom_error)?;
42        input = remaining;
43
44        tokens.push(token);
45
46        if matches!(token.sym, Sym::Eof) {
47            break;
48        }
49    }
50
51    Ok(tokens)
52}
53
54fn massage_nom_error(err: nom::Err<Error<Text>>) -> LexerError {
55    match err {
56        nom::Err::Incomplete(_) => LexerError::IncompleteInput,
57        nom::Err::Error(err) => {
58            LexerError::InvalidSymbol(err.input.location_line(), err.input.get_column() as u32)
59        }
60        nom::Err::Failure(err) => {
61            LexerError::InvalidSymbol(err.input.location_line(), err.input.get_column() as u32)
62        }
63    }
64}
65
66fn token(input: Text) -> IResult<Text, Token> {
67    delimited(
68        skip_whitespace_and_comments,
69        parse_token,
70        skip_whitespace_and_comments,
71    )
72    .parse(input)
73}
74
75fn skip_whitespace_and_comments(input: Text) -> IResult<Text, Text> {
76    recognize(many0(alt((comment, multispace1)))).parse(input)
77}
78
79fn comment(input: Text) -> IResult<Text, Text> {
80    recognize(pair(
81        pair(tag("//"), take_while(|c: char| c != '\n' && c != '\r')),
82        opt(alt((tag("\r\n"), tag("\n"), tag("\r")))),
83    ))
84    .parse(input)
85}
86
87fn parse_token(input: Text) -> IResult<Text, Token> {
88    alt((end_of_file, symbol, operator, ident, number, string)).parse(input)
89}
90
91fn symbol(input: Text) -> IResult<Text, Token> {
92    one_of("().,:[]{}")
93        .map(|c| match c {
94            '(' => Symbol::OpenParen,
95            ')' => Symbol::CloseParen,
96            '.' => Symbol::Dot,
97            ',' => Symbol::Comma,
98            ':' => Symbol::Colon,
99            '[' => Symbol::OpenBracket,
100            ']' => Symbol::CloseBracket,
101            '{' => Symbol::OpenBrace,
102            '}' => Symbol::CloseBrace,
103            _ => unreachable!(),
104        })
105        .map(move |sym| Token {
106            sym: Sym::Symbol(sym),
107            line: input.location_line(),
108            col: input.get_column() as u32,
109        })
110        .parse(input)
111}
112
113fn end_of_file(input: Text) -> IResult<Text, Token> {
114    eof.map(|_| Token {
115        sym: Sym::Eof,
116        line: input.location_line(),
117        col: input.get_column() as u32,
118    })
119    .parse(input)
120}
121
122fn operator(input: Text) -> IResult<Text, Token> {
123    alt((operator_1, operator_2)).parse(input)
124}
125
126fn operator_1(input: Text) -> IResult<Text, Token> {
127    one_of("+-*/^")
128        .map(|c| match c {
129            '+' => Operator::Add,
130            '-' => Operator::Sub,
131            '*' => Operator::Mul,
132            '/' => Operator::Div,
133            _ => unreachable!(),
134        })
135        .map(move |op| Token {
136            sym: Sym::Operator(op),
137            line: input.location_line(),
138            col: input.get_column() as u32,
139        })
140        .parse(input)
141}
142
/// Parse a comparison operator: `<`, `<=`, `>`, `>=`, `!=`, or `==`.
fn operator_2(input: Text) -> IResult<Text, Token> {
    one_of("<>!=")
        .flat_map(|c| {
            // After the first character, an optional trailing `=` selects the
            // final operator. `map_opt` fails the parse for the combinations
            // mapped to `None` below — a bare `!` or a bare `=` is not a
            // valid operator on its own.
            context(
                "valid character when parsing an operator",
                opt(char('=')).map_opt(move |eq_opt| match (c, eq_opt.is_some()) {
                    ('<', false) => Some(Operator::Lt),
                    ('<', true) => Some(Operator::Lte),
                    ('>', false) => Some(Operator::Gt),
                    ('>', true) => Some(Operator::Gte),
                    ('!', true) => Some(Operator::Neq),
                    ('=', true) => Some(Operator::Eq),
                    _ => None,
                }),
            )
        })
        // Position information comes from the input *before* the operator
        // was consumed, i.e. the operator's first character.
        .map(move |op| Token {
            sym: Sym::Operator(op),
            line: input.location_line(),
            col: input.get_column() as u32,
        })
        .parse(input)
}
166
167fn ident(input: Text) -> IResult<Text, Token> {
168    recognize(pair(
169        alpha1,
170        many0(satisfy(|c| AsChar::is_alphanum(c) || c == '_')),
171    ))
172    .map(|value: Text| {
173        let sym = if value.fragment().eq_ignore_ascii_case("and") {
174            Sym::Operator(Operator::And)
175        } else if value.fragment().eq_ignore_ascii_case("or") {
176            Sym::Operator(Operator::Or)
177        } else if value.fragment().eq_ignore_ascii_case("xor") {
178            Sym::Operator(Operator::Xor)
179        } else if value.fragment().eq_ignore_ascii_case("not") {
180            Sym::Operator(Operator::Not)
181        } else if value.fragment().eq_ignore_ascii_case("contains") {
182            Sym::Operator(Operator::Contains)
183        } else if value.fragment().eq_ignore_ascii_case("as") {
184            Sym::Operator(Operator::As)
185        } else {
186            Sym::Id(value.fragment())
187        };
188
189        Token {
190            sym,
191            line: value.location_line(),
192            col: value.get_column() as u32,
193        }
194    })
195    .parse(input)
196}
197
198fn number(input: Text) -> IResult<Text, Token> {
199    double
200        .map(|value| Token {
201            sym: Sym::Number(value),
202            line: input.location_line(),
203            col: input.get_column() as u32,
204        })
205        .parse(input)
206}
207
208fn string(input: Text) -> IResult<Text, Token> {
209    delimited(char('"'), take_while(|c| c != '"'), char('"'))
210        .map(|value: Text| Token {
211            sym: Sym::String(value.fragment()),
212            line: input.location_line(),
213            col: input.get_column() as u32,
214        })
215        .parse(input)
216}