1use thiserror::Error;
6
/// A lexical token produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum Token {
    // Literals.
    Ident(String),
    String(String),
    Number(u64),
    Regex(String),
    // Boolean connectives.
    And,
    Or,
    Not,
    // Predicate keywords.
    In,
    Contains,
    IContains,
    StartsWith,
    EndsWith,
    Matches,
    IsEmpty,
    NotEmpty,
    Header,
    IEquals,
    Rand,
    // Comparison operators.
    Eq,
    Ne,
    // Punctuation.
    Dot,
    LParen,
    RParen,
    LBrace,
    RBrace,
    Comma,
    // End of input.
    Eof,
}
46
/// Errors produced by [`Lexer`]; each variant carries the byte offset
/// in the input where the problem was detected.
#[derive(Debug, Clone, Error, PartialEq)]
#[allow(missing_docs)]
pub enum LexError {
    /// A character that cannot begin any token.
    #[error("Unexpected character '{0}' at position {1}")]
    UnexpectedChar(char, usize),

    /// Input ended before the closing quote of a string literal.
    #[error("Unterminated string starting at position {0}")]
    UnterminatedString(usize),

    /// Input ended before the closing delimiter of a regex literal.
    #[error("Unterminated regex starting at position {0}")]
    UnterminatedRegex(usize),

    /// A backslash followed by an unsupported escape character
    /// (or by end of input) inside a string literal.
    #[error("Invalid escape sequence at position {0}")]
    InvalidEscape(usize),

    /// A numeric literal that could not be parsed as `u64`.
    #[error("Invalid number at position {0}")]
    InvalidNumber(usize),
}
66
/// Streaming tokenizer over a borrowed input string.
pub struct Lexer<'a> {
    // Full input text; `peek_char` re-slices it by byte offset for lookahead.
    input: &'a str,
    // Char cursor that also yields each character's byte offset.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    // Byte offset of the token currently being read (set in `next_token`).
    position: usize,
}
73
74impl<'a> Lexer<'a> {
75 pub fn new(input: &'a str) -> Self {
77 Self {
78 input,
79 chars: input.char_indices().peekable(),
80 position: 0,
81 }
82 }
83
84 pub fn next_token(&mut self) -> Result<Token, LexError> {
89 self.skip_whitespace();
90
91 let Some(&(pos, ch)) = self.chars.peek() else {
92 return Ok(Token::Eof);
93 };
94
95 self.position = pos;
96
97 match ch {
98 '(' => {
100 self.chars.next();
101 Ok(Token::LParen)
102 }
103 ')' => {
104 self.chars.next();
105 Ok(Token::RParen)
106 }
107 '{' => {
108 self.chars.next();
109 Ok(Token::LBrace)
110 }
111 '}' => {
112 self.chars.next();
113 Ok(Token::RBrace)
114 }
115 ',' => {
116 self.chars.next();
117 Ok(Token::Comma)
118 }
119 '.' => {
120 self.chars.next();
121 Ok(Token::Dot)
122 }
123
124 '=' => {
126 self.chars.next();
127 if self.chars.peek().map(|&(_, c)| c) == Some('=') {
128 self.chars.next();
129 Ok(Token::Eq)
130 } else {
131 Ok(Token::Eq) }
133 }
134 '!' => {
135 self.chars.next();
136 if self.chars.peek().map(|&(_, c)| c) == Some('=') {
137 self.chars.next();
138 Ok(Token::Ne)
139 } else {
140 Ok(Token::Not)
141 }
142 }
143 '&' => {
144 self.chars.next();
145 if self.chars.peek().map(|&(_, c)| c) == Some('&') {
146 self.chars.next();
147 }
148 Ok(Token::And)
149 }
150 '|' => {
151 self.chars.next();
152 if self.chars.peek().map(|&(_, c)| c) == Some('|') {
153 self.chars.next();
154 }
155 Ok(Token::Or)
156 }
157
158 '"' | '\'' => self.read_string(ch),
160
161 'r' if self.peek_char(1) == Some('"') => self.read_regex_r(),
163 '/' => self.read_regex_slash(),
164
165 '0'..='9' => self.read_number(),
167
168 'a'..='z' | 'A'..='Z' | '_' => self.read_ident(),
170
171 _ => Err(LexError::UnexpectedChar(ch, pos)),
172 }
173 }
174
175 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
180 let mut tokens = Vec::new();
181 loop {
182 let token = self.next_token()?;
183 if token == Token::Eof {
184 break;
185 }
186 tokens.push(token);
187 }
188 Ok(tokens)
189 }
190
191 fn skip_whitespace(&mut self) {
192 while let Some(&(_, ch)) = self.chars.peek() {
193 if ch.is_whitespace() {
194 self.chars.next();
195 } else if ch == '#' {
196 while let Some(&(_, c)) = self.chars.peek() {
198 if c == '\n' {
199 self.chars.next();
200 break;
201 }
202 self.chars.next();
203 }
204 } else {
205 break;
206 }
207 }
208 }
209
210 fn peek_char(&self, offset: usize) -> Option<char> {
211 self.input[self.position..].chars().nth(offset)
212 }
213
214 fn read_string(&mut self, quote: char) -> Result<Token, LexError> {
215 let start = self.position;
216 self.chars.next(); let mut value = String::new();
219
220 loop {
221 match self.chars.next() {
222 Some((_, ch)) if ch == quote => {
223 return Ok(Token::String(value));
224 }
225 Some((pos, '\\')) => {
226 match self.chars.next() {
228 Some((_, 'n')) => value.push('\n'),
229 Some((_, 'r')) => value.push('\r'),
230 Some((_, 't')) => value.push('\t'),
231 Some((_, '\\')) => value.push('\\'),
232 Some((_, c)) if c == quote => value.push(c),
233 Some((_, '"')) => value.push('"'),
234 Some((_, '\'')) => value.push('\''),
235 _ => return Err(LexError::InvalidEscape(pos)),
236 }
237 }
238 Some((_, ch)) => value.push(ch),
239 None => return Err(LexError::UnterminatedString(start)),
240 }
241 }
242 }
243
244 fn read_regex_r(&mut self) -> Result<Token, LexError> {
245 let start = self.position;
246 self.chars.next(); self.chars.next(); let mut pattern = String::new();
250
251 loop {
252 match self.chars.next() {
253 Some((_, '"')) => {
254 return Ok(Token::Regex(pattern));
255 }
256 Some((_, '\\')) => {
257 pattern.push('\\');
259 if let Some((_, ch)) = self.chars.next() {
260 pattern.push(ch);
261 }
262 }
263 Some((_, ch)) => pattern.push(ch),
264 None => return Err(LexError::UnterminatedRegex(start)),
265 }
266 }
267 }
268
269 fn read_regex_slash(&mut self) -> Result<Token, LexError> {
270 let start = self.position;
271 self.chars.next(); let mut pattern = String::new();
274
275 loop {
276 match self.chars.next() {
277 Some((_, '/')) => {
278 while let Some(&(_, ch)) = self.chars.peek() {
280 if ch.is_ascii_alphabetic() {
281 self.chars.next();
282 } else {
284 break;
285 }
286 }
287 return Ok(Token::Regex(pattern));
288 }
289 Some((_, '\\')) => {
290 pattern.push('\\');
291 if let Some((_, ch)) = self.chars.next() {
292 pattern.push(ch);
293 }
294 }
295 Some((_, ch)) => pattern.push(ch),
296 None => return Err(LexError::UnterminatedRegex(start)),
297 }
298 }
299 }
300
301 fn read_number(&mut self) -> Result<Token, LexError> {
302 let start = self.position;
303 let mut num_str = String::new();
304
305 while let Some(&(_, ch)) = self.chars.peek() {
306 if ch.is_ascii_digit() {
307 num_str.push(ch);
308 self.chars.next();
309 } else {
310 break;
311 }
312 }
313
314 num_str
315 .parse::<u64>()
316 .map(Token::Number)
317 .map_err(|_| LexError::InvalidNumber(start))
318 }
319
320 fn read_ident(&mut self) -> Result<Token, LexError> {
321 let mut ident = String::new();
322
323 while let Some(&(_, ch)) = self.chars.peek() {
324 if ch.is_ascii_alphanumeric() || ch == '_' {
325 ident.push(ch);
326 self.chars.next();
327 } else {
328 break;
329 }
330 }
331
332 let token = match ident.to_ascii_lowercase().as_str() {
334 "and" => Token::And,
335 "or" => Token::Or,
336 "not" => Token::Not,
337 "in" => Token::In,
338 "contains" => Token::Contains,
339 "icontains" => Token::IContains,
340 "starts_with" | "startswith" => Token::StartsWith,
341 "ends_with" | "endswith" => Token::EndsWith,
342 "matches" => Token::Matches,
343 "is_empty" | "isempty" | "empty" => Token::IsEmpty,
344 "not_empty" | "notempty" => Token::NotEmpty,
345 "header" => Token::Header,
346 "iequals" | "ieq" => Token::IEquals,
347 "rand" | "random" => Token::Rand,
348 "true" => return Ok(Token::Ident("true".into())),
349 "false" => return Ok(Token::Ident("false".into())),
350 _ => Token::Ident(ident),
351 };
352
353 Ok(token)
354 }
355}
356
#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience wrapper: lex `input` to completion.
    fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
        Lexer::new(input).tokenize()
    }

    #[test]
    fn test_simple_tokens() {
        assert_eq!(
            tokenize("(){},.").unwrap(),
            vec![
                Token::LParen,
                Token::RParen,
                Token::LBrace,
                Token::RBrace,
                Token::Comma,
                Token::Dot,
            ]
        );
    }

    #[test]
    fn test_operators() {
        assert_eq!(
            tokenize("== != && ||").unwrap(),
            vec![Token::Eq, Token::Ne, Token::And, Token::Or,]
        );
    }

    // `=`, `&`, and `|` are single-character aliases of `==`, `&&`, `||`.
    #[test]
    fn test_single_char_operator_aliases() {
        assert_eq!(
            tokenize("a = b").unwrap(),
            vec![
                Token::Ident("a".into()),
                Token::Eq,
                Token::Ident("b".into()),
            ]
        );
        assert_eq!(
            tokenize("a & b | c").unwrap(),
            vec![
                Token::Ident("a".into()),
                Token::And,
                Token::Ident("b".into()),
                Token::Or,
                Token::Ident("c".into()),
            ]
        );
    }

    // A bare `!` (not followed by `=`) is logical NOT.
    #[test]
    fn test_bang_is_not() {
        assert_eq!(
            tokenize("!x").unwrap(),
            vec![Token::Not, Token::Ident("x".into())]
        );
    }

    #[test]
    fn test_strings() {
        assert_eq!(
            tokenize(r#""hello" 'world'"#).unwrap(),
            vec![Token::String("hello".into()), Token::String("world".into()),]
        );
    }

    #[test]
    fn test_string_escapes() {
        assert_eq!(
            tokenize(r#""hello\nworld""#).unwrap(),
            vec![Token::String("hello\nworld".into()),]
        );
    }

    #[test]
    fn test_invalid_escape() {
        assert!(matches!(
            tokenize(r#""a\qb""#),
            Err(LexError::InvalidEscape(_))
        ));
    }

    #[test]
    fn test_numbers() {
        assert_eq!(
            tokenize("123 456").unwrap(),
            vec![Token::Number(123), Token::Number(456),]
        );
    }

    // Numbers are lexed as u64; anything larger is InvalidNumber.
    #[test]
    fn test_number_overflow() {
        assert!(matches!(
            tokenize("99999999999999999999"),
            Err(LexError::InvalidNumber(_))
        ));
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            tokenize("AND OR NOT contains matches").unwrap(),
            vec![
                Token::And,
                Token::Or,
                Token::Not,
                Token::Contains,
                Token::Matches,
            ]
        );
    }

    #[test]
    fn test_case_insensitive_keywords() {
        assert_eq!(
            tokenize("and AND And").unwrap(),
            vec![Token::And, Token::And, Token::And,]
        );
    }

    #[test]
    fn test_identifiers() {
        assert_eq!(
            tokenize("MESSAGE_TYPE field1").unwrap(),
            vec![
                Token::Ident("MESSAGE_TYPE".into()),
                Token::Ident("field1".into()),
            ]
        );
    }

    // Boolean words are identifiers, normalized to lowercase regardless of
    // the input's casing.
    #[test]
    fn test_boolean_idents_normalized() {
        assert_eq!(
            tokenize("TRUE False").unwrap(),
            vec![Token::Ident("true".into()), Token::Ident("false".into())]
        );
    }

    #[test]
    fn test_regex_r_syntax() {
        assert_eq!(
            tokenize(r#"r"hello.*world""#).unwrap(),
            vec![Token::Regex("hello.*world".into()),]
        );
    }

    #[test]
    fn test_regex_slash_syntax() {
        assert_eq!(
            tokenize(r#"/hello.*world/"#).unwrap(),
            vec![Token::Regex("hello.*world".into()),]
        );
    }

    // Trailing flags on a slash regex are consumed but not represented in
    // the token.
    #[test]
    fn test_regex_slash_flags_discarded() {
        assert_eq!(
            tokenize("/abc/i").unwrap(),
            vec![Token::Regex("abc".into())]
        );
    }

    #[test]
    fn test_complex_expression() {
        let input = r#"MESSAGE_TYPE == "2" AND payload contains "error""#;
        assert_eq!(
            tokenize(input).unwrap(),
            vec![
                Token::Ident("MESSAGE_TYPE".into()),
                Token::Eq,
                Token::String("2".into()),
                Token::And,
                Token::Ident("payload".into()),
                Token::Contains,
                Token::String("error".into()),
            ]
        );
    }

    #[test]
    fn test_rand() {
        assert_eq!(
            tokenize("rand(100)").unwrap(),
            vec![
                Token::Rand,
                Token::LParen,
                Token::Number(100),
                Token::RParen,
            ]
        );
    }

    #[test]
    fn test_in_set() {
        assert_eq!(
            tokenize(r#"field in {"a", "b", "c"}"#).unwrap(),
            vec![
                Token::Ident("field".into()),
                Token::In,
                Token::LBrace,
                Token::String("a".into()),
                Token::Comma,
                Token::String("b".into()),
                Token::Comma,
                Token::String("c".into()),
                Token::RBrace,
            ]
        );
    }

    #[test]
    fn test_header_syntax() {
        assert_eq!(
            tokenize(r#"field.header("X-Custom") iequals "value""#).unwrap(),
            vec![
                Token::Ident("field".into()),
                Token::Dot,
                Token::Header,
                Token::LParen,
                Token::String("X-Custom".into()),
                Token::RParen,
                Token::IEquals,
                Token::String("value".into()),
            ]
        );
    }

    #[test]
    fn test_end_of_line_comments() {
        let input = "MESSAGE_TYPE == \"2\" # check type\nAND MESSAGE_SUB_TYPE == \"11\" # CUSTOM PROBE";
        assert_eq!(
            tokenize(input).unwrap(),
            vec![
                Token::Ident("MESSAGE_TYPE".into()),
                Token::Eq,
                Token::String("2".into()),
                Token::And,
                Token::Ident("MESSAGE_SUB_TYPE".into()),
                Token::Eq,
                Token::String("11".into()),
            ]
        );

        assert_eq!(
            tokenize("true # done").unwrap(),
            vec![Token::Ident("true".into()),]
        );

        assert_eq!(tokenize("# nothing here").unwrap(), vec![]);
    }

    #[test]
    fn test_unexpected_char() {
        assert!(matches!(
            tokenize("@"),
            Err(LexError::UnexpectedChar('@', 0))
        ));
    }

    #[test]
    fn test_unterminated_string() {
        assert!(matches!(
            tokenize(r#""hello"#),
            Err(LexError::UnterminatedString(_))
        ));
    }

    #[test]
    fn test_unterminated_regex() {
        assert!(matches!(
            tokenize(r#"/hello"#),
            Err(LexError::UnterminatedRegex(_))
        ));
        assert!(matches!(
            tokenize(r#"r"hello"#),
            Err(LexError::UnterminatedRegex(_))
        ));
    }
}