usiem/components/query/
mod.rs

/// Character-level lexer for the query language.
///
/// The lexer keeps the whole query as a vector of `char`s and walks it one
/// character at a time through `read_char`.
#[derive(Debug, Clone)]
pub struct QueryLexer {
    /// Full query text, one element per `char`.
    input: Vec<char>,
    /// Index in `input` of the character currently held in `ch`.
    pub position: usize,
    /// Index in `input` of the character the next `read_char` call will load.
    pub read_position: usize,
    /// Character currently under examination. Holds the placeholder `'0'`
    /// before the first `read_char` call and once the input is exhausted.
    pub ch: char,
}
7
/// Returns `true` when `name` is one of the built-in query functions.
///
/// The comparison is exact and case-sensitive.
fn is_function(name: &str) -> bool {
    const BUILTINS: [&str; 10] = [
        "to_number",
        "to_string",
        "lowercase",
        "uppercase",
        "replace",
        "len",
        "floor",
        "trim",
        "to_integer",
        "to_float",
    ];
    BUILTINS.iter().any(|builtin| *builtin == name)
}
23
/// Returns `true` for characters that may appear in an identifier besides
/// digits: ASCII letters, the underscore and the dot.
fn is_letter(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_' | '.')
}
27
/// Counts the `*` characters in `input` that are not immediately preceded by
/// a backslash.
///
/// Accepts any `char` slice; a `&Vec<char>` argument coerces automatically.
fn count_asterix(input: &[char]) -> usize {
    input
        .iter()
        .enumerate()
        .filter(|&(idx, &ch)| {
            // The first character has no predecessor, so it can never be escaped.
            ch == '*' && (idx == 0 || input[idx - 1] != '\\')
        })
        .count()
}
39
/// Maps the character that follows a backslash to the character it stands for.
///
/// `n`, `t`, `r` and `0` become newline, tab, carriage return and NUL; any
/// other character maps to itself. An asterisk yields `Err(())`, leaving it to
/// the caller to decide how an escaped `*` is represented.
fn transform_escape_char(ch: char) -> Result<char, ()> {
    if ch == '*' {
        return Err(());
    }
    let mapped = match ch {
        'n' => '\n',
        't' => '\t',
        'r' => '\r',
        '0' => '\0',
        other => other,
    };
    Ok(mapped)
}
50
51impl QueryLexer {
52    pub fn new(input: Vec<char>) -> Self {
53        Self {
54            input,
55            position: 0,
56            read_position: 0,
57            ch: '0',
58        }
59    }
60
61    pub fn read_char(&mut self) {
62        if self.read_position >= self.input.len() {
63            self.ch = '0';
64        } else {
65            self.ch = self.input[self.read_position];
66        }
67        self.position = self.read_position;
68        self.read_position += 1;
69    }
70
71    pub fn skip_whitespace(&mut self) {
72        loop {
73            let ch = self.ch;
74            if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
75                self.read_char();
76            } else {
77                return;
78            }
79        }
80    }
81
82    pub fn next_token(&mut self) -> Token {
83        let read_identifier = |l: &mut QueryLexer| -> Vec<char> {
84            let position = l.position;
85            while l.position < l.input.len() && (is_letter(l.ch) || l.ch.is_ascii_digit()) {
86                l.read_char();
87            }
88            l.input[position..l.position].to_vec()
89        };
90        let read_literal_string = |l: &mut QueryLexer| -> Vec<char> {
91            let mut is_escape = false;
92            let mut to_ret = Vec::with_capacity(32);
93            while (is_escape || l.ch != '\'') && l.position < l.input.len() {
94                if l.ch == '\\' {
95                    if is_escape {
96                        to_ret.push('\\');
97                    }
98                    is_escape = !is_escape;
99                } else {
100                    if is_escape {
101                        match transform_escape_char(l.ch) {
102                            Ok(ch) => to_ret.push(ch),
103                            Err(_) => {
104                                to_ret.push(l.ch);
105                            }
106                        };
107                    } else {
108                        to_ret.push(l.ch);
109                    }
110                    is_escape = false;
111                }
112                l.read_char();
113            }
114            to_ret
115        };
116
117        let read_string = |l: &mut QueryLexer| -> Vec<char> {
118            let mut is_escape = false;
119            let mut to_ret = Vec::with_capacity(32);
120            while (is_escape || l.ch != '"') && l.position < l.input.len() {
121                if l.ch == '\\' {
122                    if is_escape {
123                        to_ret.push('\\');
124                    }
125                    is_escape = !is_escape;
126                } else {
127                    if is_escape {
128                        match transform_escape_char(l.ch) {
129                            Ok(ch) => to_ret.push(ch),
130                            Err(_) => {
131                                // The \\ character is used
132                                to_ret.push('\\');
133                                to_ret.push(l.ch);
134                            }
135                        };
136                    } else {
137                        to_ret.push(l.ch);
138                    }
139                    is_escape = false;
140                }
141                l.read_char();
142            }
143            to_ret
144        };
145
146        let read_number = |l: &mut QueryLexer| -> Vec<char> {
147            let position = l.position;
148            while l.position < l.input.len() && l.ch.is_ascii_digit() {
149                l.read_char();
150            }
151            l.input[position..l.position].to_vec()
152        };
153
154        let tok: Token;
155        self.skip_whitespace();
156        match self.ch {
157            '=' => {
158                tok = Token::ASSIGN;
159            }
160            '|' => {
161                tok = Token::PIPE;
162            }
163            '+' => {
164                tok = Token::PLUS(self.ch);
165            }
166            '-' => {
167                tok = Token::MINUS(self.ch);
168            }
169            '!' => {
170                tok = Token::BANG(self.ch);
171            }
172            '/' => {
173                tok = Token::SLASH(self.ch);
174            }
175            '*' => {
176                tok = Token::ASTERISK(self.ch);
177            }
178            '<' => {
179                tok = Token::LT(self.ch);
180            }
181            '>' => {
182                tok = Token::GT(self.ch);
183            }
184            ';' => {
185                tok = Token::SEMICOLON(self.ch);
186            }
187            '(' => {
188                tok = Token::LPAREN(self.ch);
189            }
190            ')' => {
191                tok = Token::RPAREN(self.ch);
192            }
193            ',' => {
194                tok = Token::COMMA(self.ch);
195            }
196            '{' => {
197                tok = Token::LBRACE(self.ch);
198            }
199            '}' => {
200                tok = Token::RBRACE(self.ch);
201            }
202            '0' => {
203                tok = Token::EOF;
204            }
205            '\'' => {
206                self.read_char();
207                let data = read_literal_string(self);
208                tok = Token::String(data.iter().collect())
209            }
210            '"' => {
211                self.read_char();
212                let data = read_string(self);
213                if data.len() > 1 {
214                    let n_asterix = count_asterix(&data);
215                    //Test if can be a start_with, contains, ends_with or like
216                    if n_asterix > 2 {
217                        tok = Token::Like(data.iter().collect())
218                    } else {
219                        let starts_astx = data[0] == '*';
220                        let ends_astx = data[data.len() - 1] == '*';
221                        if starts_astx && ends_astx {
222                            tok = Token::Contains(data.iter().filter(|c| *c != &'*').collect())
223                        } else if starts_astx {
224                            tok = Token::StartsWith(data.iter().filter(|c| *c != &'*').collect())
225                        } else if ends_astx {
226                            tok = Token::EndsWith(data.iter().filter(|c| *c != &'*').collect())
227                        } else if n_asterix == 0 {
228                            tok = Token::String(data.iter().collect())
229                        } else {
230                            tok = Token::Like(data.iter().collect())
231                        }
232                    }
233                } else {
234                    tok = Token::String(data.iter().collect())
235                }
236            }
237            _ => {
238                if is_letter(self.ch) {
239                    let ident: Vec<char> = read_identifier(self);
240                    match get_keyword_token(&ident) {
241                        Ok(keywork_token) => {
242                            return keywork_token;
243                        }
244                        Err(_err) => {
245                            return Token::FIELD(ident.into_iter().collect());
246                        }
247                    }
248                } else if self.ch.is_ascii_digit() {
249                    let ident: Vec<char> = read_number(self);
250                    return Token::INT(ident.into_iter().collect());
251                } else {
252                    return Token::ILLEGAL;
253                }
254            }
255        }
256        self.read_char();
257        tok
258    }
259}
260
/// Tokens produced by the query lexer.
///
/// Variants with a `char` payload carry the character that was matched;
/// variants with a `String` payload carry the lexed text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A character that cannot start any token.
    ILLEGAL,
    /// End of the input.
    EOF,
    /// An identifier that is neither a keyword nor a function name.
    FIELD(String),
    /// A run of ASCII digits, kept as text.
    INT(String),
    /// `=`
    ASSIGN,
    /// `|`
    PIPE,
    /// `+`
    PLUS(char),
    /// `,`
    COMMA(char),
    /// `;`
    SEMICOLON(char),
    /// `(`
    LPAREN(char),
    /// `)`
    RPAREN(char),
    /// `{`
    LBRACE(char),
    /// `}`
    RBRACE(char),
    /// Name of a built-in function such as `to_string` or `len`.
    FUNCTION(String),
    /// Keyword `true`.
    TRUE,
    /// Keyword `false`.
    FALSE,
    /// Keyword `AND`.
    AND,
    /// Keyword `OR`.
    OR,
    /// Keyword `NOT`.
    NOT,
    // NOTE(review): no code visible in this file produces `RETURN`; confirm
    // whether it is still needed.
    RETURN,
    /// `-`
    MINUS(char),
    /// `!`
    BANG(char),
    /// `*`
    ASTERISK(char),
    /// `/`
    SLASH(char),
    /// `<`
    LT(char),
    /// `>`
    GT(char),
    /// Keyword `filter`.
    FILTER,
    /// Keyword `fields`.
    FIELDS,
    /// Keyword `as`.
    AS,
    /// A quoted string without wildcards.
    String(String),
    // NOTE(review): no code visible in this file produces `RegexField`;
    // confirm its intended source.
    RegexField(String),
    /// A double-quoted pattern with a leading `*` wildcard.
    StartsWith(String),
    /// A double-quoted pattern with a trailing `*` wildcard.
    EndsWith(String),
    /// A double-quoted pattern whose wildcards do not fit the simpler
    /// variants; the pattern text is kept as read.
    Like(String),
    /// A double-quoted pattern with a `*` wildcard at both ends.
    Contains(String),
}
299
300pub fn get_keyword_token(ident: &[char]) -> Result<Token, String> {
301    let identifier: String = ident.iter().collect();
302    match &identifier[..] {
303        "true" => Ok(Token::TRUE),
304        "false" => Ok(Token::FALSE),
305        "AND" => Ok(Token::AND),
306        "OR" => Ok(Token::OR),
307        "NOT" => Ok(Token::NOT),
308        "filter" => Ok(Token::FILTER),
309        "fields" => Ok(Token::FIELDS),
310        "as" => Ok(Token::AS),
311        _ => {
312            if is_function(&identifier) {
313                return Ok(Token::FUNCTION(identifier));
314            }
315            Err(String::from("Not a keyword"))
316        }
317    }
318}
319
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `query` until `EOF` or `ILLEGAL` and returns every token,
    /// including the terminating one.
    fn lex(query: &str) -> Vec<Token> {
        let mut lexer = QueryLexer::new(query.chars().collect());
        // Prime the lexer so that `ch` holds the first character.
        lexer.read_char();
        let mut tokens = Vec::new();
        loop {
            let token = lexer.next_token();
            let done = matches!(token, Token::EOF | Token::ILLEGAL);
            tokens.push(token);
            if done {
                return tokens;
            }
        }
    }

    #[test]
    fn should_parse_the_query() {
        let tokens = lex("filter field_name2=\"*something\" | fields os.actor_process as osap | filter to_string(osap,'something') = \"12345\"");
        let expected = vec![
            Token::FILTER,
            Token::FIELD(String::from("field_name2")),
            Token::ASSIGN,
            Token::StartsWith(String::from("something")),
            Token::PIPE,
            Token::FIELDS,
            Token::FIELD(String::from("os.actor_process")),
            Token::AS,
            Token::FIELD(String::from("osap")),
            Token::PIPE,
            Token::FILTER,
            Token::FUNCTION(String::from("to_string")),
            Token::LPAREN('('),
            Token::FIELD(String::from("osap")),
            Token::COMMA(','),
            Token::String(String::from("something")),
            Token::RPAREN(')'),
            Token::ASSIGN,
            Token::String(String::from("12345")),
            Token::EOF,
        ];
        assert_eq!(tokens, expected);
    }
}