// sqry_core/query/lexer.rs
1//! Lexer for the query language
2//!
3//! This module implements tokenization of query strings into a stream of tokens.
4//! Supports keywords (AND, OR, NOT), operators (:, ~=, >, <, etc.), string literals,
5//! regex literals with flags, numbers, and identifiers.
6
7use crate::query::error::LexError;
8use crate::query::types::{RegexFlags, Span};
9use log::trace;
10use std::cell::RefCell;
11use std::env;
12use std::str::Chars;
13use std::thread_local;
14
// When running tests with the `dhat-heap` feature, route all heap allocations
// through dhat's instrumented allocator so allocation profiles can be captured.
#[cfg(all(test, feature = "dhat-heap"))]
#[global_allocator]
static DHAT_ALLOC: dhat::Alloc = dhat::Alloc;
18
/// A token type in the query language
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
    // Keywords (matched case-insensitively by the lexer)
    /// AND keyword
    And,
    /// OR keyword
    Or,
    /// NOT keyword
    Not,

    // Operators
    /// `:` operator (exact match / glob for paths)
    Colon,
    /// `~=` operator (regex match)
    RegexOp,
    /// `>` operator (greater than)
    Greater,
    /// `<` operator (less than)
    Less,
    /// `>=` operator (greater than or equal)
    GreaterEq,
    /// `<=` operator (less than or equal)
    LessEq,
    /// `|` pipe operator (for aggregation pipeline)
    Pipe,

    // Delimiters
    /// `(` left parenthesis
    LParen,
    /// `)` right parenthesis
    RParen,

    // Values
    /// Identifier (field names; emitted only when the word is directly
    /// followed by an operator character — see `RawLexer::read_word_token`)
    Identifier(String),
    /// String literal (double or single quoted, escapes already decoded)
    StringLiteral(String),
    /// Regex literal with pattern and flags
    RegexLiteral {
        /// Regex pattern
        pattern: String,
        /// Regex flags (case-insensitive, multiline, dot-all)
        flags: RegexFlags,
    },
    /// Number literal
    NumberLiteral(i64),
    /// Boolean literal
    BooleanLiteral(bool),
    /// Bare word (unquoted value)
    Word(String),
    /// Variable reference (`$name`)
    Variable(String),

    // Special
    /// End of input
    Eof,
}
77
/// A token with position information
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The type of token
    pub token_type: TokenType,
    /// Source position (byte offsets plus 1-based line/column of the start)
    pub span: Span,
}
86
87impl Token {
88    /// Create a new token
89    #[must_use]
90    pub fn new(token_type: TokenType, span: Span) -> Self {
91        Self { token_type, span }
92    }
93}
94
/// Lexer state for tokenization
pub(crate) struct RawLexer<'a> {
    /// Input string
    input: &'a str,
    /// Character iterator over `input`
    chars: Chars<'a>,
    /// Current position (byte offset into `input`); does NOT include a
    /// character that has only been peeked, not consumed
    position: usize,
    /// Current line (1-based)
    line: usize,
    /// Current column (1-based, counted in characters)
    column: usize,
    /// Peeked character (one-character lookahead buffer for `peek_char`)
    peeked: Option<char>,
}
110
111impl<'a> RawLexer<'a> {
112    /// Create a new lexer for the given input
113    pub fn new(input: &'a str) -> Self {
114        Self {
115            input,
116            chars: input.chars(),
117            position: 0,
118            line: 1,
119            column: 1,
120            peeked: None,
121        }
122    }
123
124    /// Reset lexer state to the beginning of the current input.
125    pub fn restart(&mut self) {
126        self.chars = self.input.chars();
127        self.position = 0;
128        self.line = 1;
129        self.column = 1;
130        self.peeked = None;
131    }
132
133    /// Tokenize the input, appending tokens to the provided buffer.
134    pub fn tokenize_into(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexError> {
135        loop {
136            let token = self.next_token()?;
137            let is_eof = matches!(token.token_type, TokenType::Eof);
138            tokens.push(token);
139
140            if is_eof {
141                break;
142            }
143        }
144
145        Ok(())
146    }
147
    /// Get the next token.
    ///
    /// Skips leading whitespace, dispatches on the first character, and wraps
    /// the result in a [`Span`] covering everything consumed. At end of input
    /// an [`TokenType::Eof`] token is returned (never an error).
    ///
    /// # Errors
    ///
    /// Returns [`LexError`] for unexpected characters, unterminated
    /// strings/regexes, invalid escapes, bad regex flags, or number overflow.
    #[allow(clippy::too_many_lines)] // Lexer state machine is clearer when kept in one place.
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.skip_whitespace();

        let start_pos = self.position;
        let start_line = self.line;
        let start_col = self.column;

        let Some(ch) = self.peek_char() else {
            return Ok(Token::new(
                TokenType::Eof,
                Span::with_position(self.position, self.position, self.line, self.column),
            ));
        };

        // NOTE: dispatch order matters. Number-start must be checked before
        // word-start so `-5` lexes as a number, and `/` must be checked before
        // words so it starts a regex literal. A `-` not followed by a digit
        // falls through to the UnexpectedChar error below.
        let token_type = if let Some(token) = self.read_simple_token(ch) {
            token
        } else if ch == '$' {
            self.read_variable_token(start_pos, start_line, start_col)?
        } else if ch == '~' {
            self.read_regex_operator(start_pos, start_line, start_col)?
        } else if ch == '>' || ch == '<' {
            self.read_comparison_operator(ch)
        } else if ch == '"' || ch == '\'' {
            let s = self.read_quoted_string(ch)?;
            TokenType::StringLiteral(s)
        } else if ch == '/' {
            let (pattern, flags) = self.read_regex()?;
            TokenType::RegexLiteral { pattern, flags }
        } else if self.is_number_start(ch) {
            let n = self.read_number()?;
            TokenType::NumberLiteral(n)
        } else if Self::is_word_start(ch) {
            self.read_word_token()
        } else {
            return Err(LexError::UnexpectedChar {
                char: ch,
                span: Span::with_position(
                    start_pos,
                    start_pos + ch.len_utf8(),
                    start_line,
                    start_col,
                ),
            });
        };

        Ok(Token::new(
            token_type,
            Span::with_position(start_pos, self.position, start_line, start_col),
        ))
    }
200
201    fn read_simple_token(&mut self, ch: char) -> Option<TokenType> {
202        let token = match ch {
203            '(' => TokenType::LParen,
204            ')' => TokenType::RParen,
205            ':' => TokenType::Colon,
206            '|' => TokenType::Pipe,
207            _ => return None,
208        };
209        self.next_char();
210        Some(token)
211    }
212
213    fn read_regex_operator(
214        &mut self,
215        start_pos: usize,
216        start_line: usize,
217        start_col: usize,
218    ) -> Result<TokenType, LexError> {
219        self.next_char();
220        if self.peek_char() == Some('=') {
221            self.next_char();
222            Ok(TokenType::RegexOp)
223        } else {
224            Err(LexError::UnexpectedChar {
225                char: '~',
226                span: Span::with_position(start_pos, self.position, start_line, start_col),
227            })
228        }
229    }
230
231    fn read_comparison_operator(&mut self, ch: char) -> TokenType {
232        self.next_char();
233        let (equal, plain) = if ch == '>' {
234            (TokenType::GreaterEq, TokenType::Greater)
235        } else {
236            (TokenType::LessEq, TokenType::Less)
237        };
238        if self.peek_char() == Some('=') {
239            self.next_char();
240            equal
241        } else {
242            plain
243        }
244    }
245
246    fn is_number_start(&self, ch: char) -> bool {
247        ch.is_ascii_digit() || (ch == '-' && self.peek_ahead(1).is_some_and(|c| c.is_ascii_digit()))
248    }
249
250    fn is_word_start(ch: char) -> bool {
251        ch.is_ascii_alphabetic() || ch == '_'
252    }
253
254    /// Read a variable token: `$name` where name is alphanumeric/underscore.
255    fn read_variable_token(
256        &mut self,
257        start_pos: usize,
258        start_line: usize,
259        start_col: usize,
260    ) -> Result<TokenType, LexError> {
261        self.next_char(); // consume '$'
262
263        // Read the variable name (must be non-empty, alphanumeric + underscore)
264        let mut name = String::new();
265        while let Some(c) = self.peek_char() {
266            if c.is_ascii_alphanumeric() || c == '_' {
267                name.push(c);
268                self.next_char();
269            } else {
270                break;
271            }
272        }
273
274        if name.is_empty() {
275            return Err(LexError::UnexpectedChar {
276                char: '$',
277                span: Span::with_position(start_pos, self.position, start_line, start_col),
278            });
279        }
280
281        Ok(TokenType::Variable(name))
282    }
283
284    fn read_word_token(&mut self) -> TokenType {
285        let word = self.read_word();
286        match word.to_uppercase().as_str() {
287            "AND" => TokenType::And,
288            "OR" => TokenType::Or,
289            "NOT" => TokenType::Not,
290            "TRUE" => TokenType::BooleanLiteral(true),
291            "FALSE" => TokenType::BooleanLiteral(false),
292            _ => {
293                self.skip_whitespace();
294                match self.peek_char() {
295                    Some(':' | '~' | '>' | '<') => TokenType::Identifier(word),
296                    _ => TokenType::Word(word),
297                }
298            }
299        }
300    }
301
302    /// Peek at the next character without consuming it
303    fn peek_char(&mut self) -> Option<char> {
304        if self.peeked.is_none() {
305            self.peeked = self.chars.next();
306        }
307        self.peeked
308    }
309
310    /// Peek ahead n characters
311    fn peek_ahead(&self, n: usize) -> Option<char> {
312        self.input[self.position..].chars().nth(n)
313    }
314
315    /// Consume and return the next character
316    fn next_char(&mut self) -> Option<char> {
317        let ch = if let Some(c) = self.peeked.take() {
318            Some(c)
319        } else {
320            self.chars.next()
321        };
322
323        if let Some(c) = ch {
324            self.position += c.len_utf8();
325            if c == '\n' {
326                self.line += 1;
327                self.column = 1;
328            } else {
329                self.column += 1;
330            }
331        }
332
333        ch
334    }
335
336    /// Skip whitespace characters
337    fn skip_whitespace(&mut self) {
338        while let Some(c) = self.peek_char() {
339            if c.is_whitespace() {
340                self.next_char();
341            } else {
342                break;
343            }
344        }
345    }
346
347    /// Read a quoted string with escape handling
348    fn read_quoted_string(&mut self, quote: char) -> Result<String, LexError> {
349        let start_pos = self.position;
350        let start_line = self.line;
351        let start_col = self.column;
352        self.next_char(); // Skip opening quote
353
354        let mut result = String::new();
355
356        loop {
357            match self.next_char() {
358                Some(c) if c == quote => {
359                    // Closing quote
360                    return Ok(result);
361                }
362                Some('\\') => {
363                    let escaped = self.read_escape_sequence(start_pos, start_line, start_col)?;
364                    result.push(escaped);
365                }
366                Some(c) => result.push(c),
367                None => {
368                    return Err(LexError::UnterminatedString {
369                        span: Span::with_position(start_pos, self.position, start_line, start_col),
370                    });
371                }
372            }
373        }
374    }
375
376    fn read_escape_sequence(
377        &mut self,
378        start_pos: usize,
379        start_line: usize,
380        start_col: usize,
381    ) -> Result<char, LexError> {
382        match self.next_char() {
383            Some('"') => Ok('"'),
384            Some('\'') => Ok('\''),
385            Some('\\') => Ok('\\'),
386            Some('n') => Ok('\n'),
387            Some('t') => Ok('\t'),
388            Some('r') => Ok('\r'),
389            Some('u') => self.read_unicode_escape(),
390            // Glob metacharacter escapes - passed through literally for glob pattern matching
391            Some('*') => Ok('*'),
392            Some('?') => Ok('?'),
393            Some('[') => Ok('['),
394            Some(']') => Ok(']'),
395            Some('{') => Ok('{'),
396            Some('}') => Ok('}'),
397            Some(c) => Err(LexError::InvalidEscape {
398                char: c,
399                span: Span::with_position(self.position - 2, self.position, self.line, self.column),
400            }),
401            None => Err(LexError::UnterminatedString {
402                span: Span::with_position(start_pos, self.position, start_line, start_col),
403            }),
404        }
405    }
406
407    fn read_unicode_escape(&mut self) -> Result<char, LexError> {
408        // Unicode escape: \uXXXX
409        let hex = self.read_hex_digits(4)?;
410        let code_point =
411            u32::from_str_radix(&hex, 16).map_err(|_| LexError::InvalidUnicodeEscape {
412                got: hex.chars().next().unwrap_or('?'),
413                span: Span::with_position(
414                    self.position - hex.len() - 2,
415                    self.position,
416                    self.line,
417                    self.column,
418                ),
419            })?;
420        let ch = char::from_u32(code_point).ok_or_else(|| LexError::InvalidUnicodeEscape {
421            got: hex.chars().next().unwrap_or('?'),
422            span: Span::with_position(
423                self.position - hex.len() - 2,
424                self.position,
425                self.line,
426                self.column,
427            ),
428        })?;
429        Ok(ch)
430    }
431
432    /// Read a regex literal: /pattern/flags
433    fn read_regex(&mut self) -> Result<(String, RegexFlags), LexError> {
434        let start_pos = self.position;
435        let start_line = self.line;
436        let start_col = self.column;
437        self.next_char(); // Skip opening /
438
439        let pattern = self.read_regex_pattern(start_pos, start_line, start_col)?;
440        let flags = self.read_regex_flags(start_pos, start_line, start_col, &pattern)?;
441        self.validate_regex_pattern(&pattern, &flags, start_pos, start_line, start_col)?;
442        Ok((pattern, flags))
443    }
444
445    fn read_regex_pattern(
446        &mut self,
447        start_pos: usize,
448        start_line: usize,
449        start_col: usize,
450    ) -> Result<String, LexError> {
451        let mut pattern = String::new();
452
453        // Read pattern until closing /
454        loop {
455            match self.next_char() {
456                Some('/') => {
457                    // Count trailing backslashes to determine if slash is escaped
458                    let trailing_backslashes =
459                        pattern.chars().rev().take_while(|&c| c == '\\').count();
460
461                    if trailing_backslashes % 2 == 1 {
462                        // Odd number of backslashes: last one escapes the slash
463                        pattern.push('/');
464                        continue;
465                    }
466                    // Even number (or zero): slash is not escaped, end of pattern
467                    break;
468                }
469                Some(c) => pattern.push(c),
470                None => {
471                    return Err(LexError::UnterminatedRegex {
472                        span: Span::with_position(start_pos, self.position, start_line, start_col),
473                    });
474                }
475            }
476        }
477
478        Ok(pattern)
479    }
480
481    fn read_regex_flags(
482        &mut self,
483        start_pos: usize,
484        start_line: usize,
485        start_col: usize,
486        pattern: &str,
487    ) -> Result<RegexFlags, LexError> {
488        let mut flags = RegexFlags::default();
489        while let Some(ch) = self.peek_char() {
490            match ch {
491                'i' => {
492                    flags.case_insensitive = true;
493                    self.next_char();
494                }
495                'm' => {
496                    flags.multiline = true;
497                    self.next_char();
498                }
499                's' => {
500                    flags.dot_all = true;
501                    self.next_char();
502                }
503                _ if ch.is_ascii_alphabetic() => {
504                    // Unknown flag - return error
505                    return Err(LexError::InvalidRegex {
506                        pattern: pattern.to_string(),
507                        error: format!("Unknown regex flag '{ch}'"),
508                        span: Span::with_position(
509                            start_pos,
510                            self.position + 1,
511                            start_line,
512                            start_col,
513                        ),
514                    });
515                }
516                _ => break,
517            }
518        }
519
520        Ok(flags)
521    }
522
523    fn validate_regex_pattern(
524        &self,
525        pattern: &str,
526        flags: &RegexFlags,
527        start_pos: usize,
528        start_line: usize,
529        start_col: usize,
530    ) -> Result<(), LexError> {
531        let mut builder = regex::RegexBuilder::new(pattern);
532        builder
533            .case_insensitive(flags.case_insensitive)
534            .multi_line(flags.multiline)
535            .dot_matches_new_line(flags.dot_all);
536
537        if let Err(e) = builder.build() {
538            return Err(LexError::InvalidRegex {
539                pattern: pattern.to_string(),
540                error: e.to_string(),
541                span: Span::with_position(start_pos, self.position, start_line, start_col),
542            });
543        }
544
545        Ok(())
546    }
547
548    /// Read hexadecimal digits for Unicode escapes
549    fn read_hex_digits(&mut self, count: usize) -> Result<String, LexError> {
550        let mut hex = String::new();
551
552        for _ in 0..count {
553            match self.next_char() {
554                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
555                Some(c) => {
556                    return Err(LexError::InvalidUnicodeEscape {
557                        got: c,
558                        span: Span::with_position(
559                            self.position - 1,
560                            self.position,
561                            self.line,
562                            self.column.saturating_sub(1),
563                        ),
564                    });
565                }
566                None => {
567                    return Err(LexError::InvalidUnicodeEscape {
568                        got: '?',
569                        span: Span::with_position(
570                            self.position,
571                            self.position,
572                            self.line,
573                            self.column,
574                        ),
575                    });
576                }
577            }
578        }
579
580        Ok(hex)
581    }
582
583    /// Read a number (integer, possibly negative, possibly with underscores)
584    fn read_number(&mut self) -> Result<i64, LexError> {
585        let start_pos = self.position;
586        let start_line = self.line;
587        let start_col = self.column;
588        let mut num_str = String::new();
589
590        // Handle negative sign
591        if self.peek_char() == Some('-') {
592            num_str.push('-');
593            self.next_char();
594        }
595
596        // Read digits (with optional underscores)
597        while let Some(c) = self.peek_char() {
598            if c.is_ascii_digit() {
599                num_str.push(c);
600                self.next_char();
601            } else if c == '_' {
602                // Skip underscores
603                self.next_char();
604            } else {
605                break;
606            }
607        }
608
609        // Parse the number
610        num_str
611            .parse::<i64>()
612            .map_err(|e| LexError::NumberOverflow {
613                text: num_str.clone(),
614                error: e.to_string(),
615                span: Span::with_position(start_pos, self.position, start_line, start_col),
616            })
617    }
618
619    /// Read a word (identifier or keyword).
620    /// Supports characters: [a-zA-Z0-9_.*?/-]+ plus generic segments like `<T,U>`.
621    fn read_word(&mut self) -> String {
622        let mut word = String::new();
623
624        while let Some(c) = self.peek_char() {
625            match self.classify_word_char(c) {
626                WordCharType::Basic => {
627                    word.push(c);
628                    self.next_char();
629                }
630                WordCharType::DoubleColon => {
631                    word.push_str("::");
632                    self.next_char();
633                    self.next_char();
634                }
635                WordCharType::GenericStart => {
636                    self.consume_generic_segment(&mut word);
637                }
638                WordCharType::End => break,
639            }
640        }
641
642        word
643    }
644
645    /// Classify a character for word parsing.
646    fn classify_word_char(&self, c: char) -> WordCharType {
647        if c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '*' | '?' | '/' | '-' | '[' | ']') {
648            WordCharType::Basic
649        } else if c == ':' && self.peek_ahead(1) == Some(':') {
650            WordCharType::DoubleColon
651        } else if c == '<' && self.has_generic_closing_angle() {
652            WordCharType::GenericStart
653        } else {
654            WordCharType::End
655        }
656    }
657
    /// Consume a generic segment like `<T,U>` into the word buffer.
    ///
    /// Called with the opening `<` still unconsumed. Copies characters while
    /// tracking nested `<`/`>` depth until the matching `>` is consumed.
    /// Whitespace stops the copy early; `has_generic_closing_angle` has
    /// already verified a matching `>` exists before any whitespace, so that
    /// break is defensive rather than an expected path.
    fn consume_generic_segment(&mut self, word: &mut String) {
        word.push('<');
        self.next_char();

        // Depth starts at 1 to account for the '<' consumed above.
        let mut depth = 1usize;
        while let Some(ch) = self.peek_char() {
            if ch.is_whitespace() {
                break;
            }
            depth = match ch {
                '<' => depth.saturating_add(1),
                '>' => depth.saturating_sub(1),
                _ => depth,
            };
            // Push before checking depth so the closing '>' is kept in `word`.
            word.push(ch);
            self.next_char();
            if depth == 0 {
                break;
            }
        }
    }
680
681    /// Check if there's a matching closing angle bracket for generics.
682    fn has_generic_closing_angle(&self) -> bool {
683        let mut depth = 0usize;
684
685        for ch in self.input[self.position..].chars() {
686            if ch.is_whitespace() {
687                return false;
688            }
689            match ch {
690                '<' => depth = depth.saturating_add(1),
691                '>' => {
692                    if depth == 0 {
693                        return false;
694                    }
695                    depth = depth.saturating_sub(1);
696                    if depth == 0 {
697                        return true;
698                    }
699                }
700                _ => {}
701            }
702        }
703
704        false
705    }
706}
707
/// Classification of characters for word parsing (internal to
/// `RawLexer::read_word` / `classify_word_char`).
enum WordCharType {
    /// Basic word character (alphanumeric or special symbols).
    Basic,
    /// Double colon `::` for namespace separation.
    DoubleColon,
    /// Start of a generic segment `<` with a matching `>` ahead.
    GenericStart,
    /// End of word (not a valid word character).
    End,
}
719
/// Public lexer wrapper retained for compatibility until pooling integration lands.
pub struct Lexer<'a> {
    /// The underlying state machine doing the actual tokenization.
    raw: RawLexer<'a>,
}
724
725impl<'a> Lexer<'a> {
726    /// Create a new lexer for the given input.
727    #[must_use]
728    pub fn new(input: &'a str) -> Self {
729        Self {
730            raw: RawLexer::new(input),
731        }
732    }
733
734    /// Tokenize the entire input into a vector of tokens.
735    ///
736    /// # Errors
737    ///
738    /// Returns [`LexError`] when lexical analysis fails (unterminated strings, invalid regexes, etc.).
739    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
740        let mut tokens = Vec::with_capacity(16);
741        self.raw.restart();
742        self.raw.tokenize_into(&mut tokens)?;
743        Ok(tokens)
744    }
745
746    /// Fetch the next token from the stream (used in parser tests).
747    ///
748    /// # Errors
749    ///
750    /// Returns [`LexError`] when the next token cannot be produced.
751    pub fn next_token(&mut self) -> Result<Token, LexError> {
752        self.raw.next_token()
753    }
754}
755
/// Policy controlling how aggressively reusable lexer buffers shrink.
/// (Consumed by `ReusableLexer`; the shrink logic itself is not in this view.)
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct ShrinkPolicy {
    // Presumably the capacity ceiling retained buffers may keep — confirm in
    // `ReusableLexer`'s shrink logic.
    pub max_capacity: usize,
    // Presumably a divisor controlling how far oversized buffers shrink —
    // confirm in `ReusableLexer`'s shrink logic.
    pub shrink_ratio: usize,
}
762
763impl Default for ShrinkPolicy {
764    fn default() -> Self {
765        Self {
766            max_capacity: 256,
767            shrink_ratio: 8,
768        }
769    }
770}
771
// Runtime knobs (env-first) controlling the lexer pool.
/// Default maximum number of pooled lexers per thread.
const POOL_MAX_DEFAULT: usize = 4;
/// Env var overriding the pool's maximum size (0 disables pooling).
const ENV_POOL_MAX: &str = "SQRY_LEXER_POOL_MAX";
/// Env var overriding `ShrinkPolicy::max_capacity` (clamped to >= 1).
const ENV_POOL_MAX_CAP: &str = "SQRY_LEXER_POOL_MAX_CAP";
/// Env var overriding `ShrinkPolicy::shrink_ratio` (clamped to >= 1).
const ENV_POOL_SHRINK_RATIO: &str = "SQRY_LEXER_POOL_SHRINK_RATIO";
777
/// Effective pool settings: built-in defaults merged with env-var overrides.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct PoolConfig {
    /// Maximum pooled lexers per thread; 0 disables pooling (see `with_lexer`).
    max_size: usize,
    /// Shrink policy handed to each lexer the pool creates.
    shrink_policy: ShrinkPolicy,
}
783
784impl PoolConfig {
785    fn default() -> Self {
786        Self {
787            max_size: POOL_MAX_DEFAULT,
788            shrink_policy: ShrinkPolicy::default(),
789        }
790    }
791
792    fn from_environment() -> Self {
793        let mut config = Self::default();
794
795        if let Some(value) = read_env_usize(ENV_POOL_MAX) {
796            config.max_size = value;
797        }
798
799        if let Some(value) = read_env_usize(ENV_POOL_MAX_CAP) {
800            config.shrink_policy.max_capacity = value.max(1);
801        }
802
803        if let Some(value) = read_env_usize(ENV_POOL_SHRINK_RATIO) {
804            config.shrink_policy.shrink_ratio = value.max(1);
805        }
806
807        config
808    }
809}
810
811fn read_env_usize(var: &str) -> Option<usize> {
812    match env::var(var) {
813        Ok(value) => match value.parse::<usize>() {
814            Ok(parsed) => Some(parsed),
815            Err(err) => {
816                trace!("Ignoring invalid value for {var}: {err}");
817                None
818            }
819        },
820        Err(std::env::VarError::NotPresent) => None,
821        Err(std::env::VarError::NotUnicode(_)) => {
822            trace!("Ignoring non-unicode value for {var}");
823            None
824        }
825    }
826}
827
thread_local! {
    /// Per-thread lexer pool; its config is refreshed by `with_lexer`.
    static LEXER_POOL: RefCell<LexerPool> = RefCell::new(LexerPool::new(PoolConfig::default()));
}
831
/// Thread-local pool of reusable lexers.
struct LexerPool {
    /// Idle lexers available for reuse.
    stash: Vec<ReusableLexer>,
    /// Count of pooled lexers currently handed out via `acquire`.
    in_flight: usize,
    /// Active pool configuration.
    config: PoolConfig,
}
837
838impl LexerPool {
839    fn new(config: PoolConfig) -> Self {
840        Self {
841            stash: Vec::new(),
842            in_flight: 0,
843            config,
844        }
845    }
846
847    fn apply_config(&mut self, config: PoolConfig) {
848        if self.config == config {
849            return;
850        }
851
852        trace!(
853            "sqry::query::lexer: updating pool config -> max_size={}, max_capacity={}, shrink_ratio={}",
854            config.max_size, config.shrink_policy.max_capacity, config.shrink_policy.shrink_ratio
855        );
856
857        self.config = config;
858        self.stash.clear();
859        self.in_flight = 0;
860    }
861
862    fn acquire(&mut self) -> LexerHandle {
863        if let Some(lexer) = self.stash.pop() {
864            self.in_flight += 1;
865            return LexerHandle::pooled(lexer);
866        }
867
868        if self.in_flight < self.config.max_size {
869            self.in_flight += 1;
870            let lexer = ReusableLexer::with_policy(self.config.shrink_policy);
871            return LexerHandle::pooled(lexer);
872        }
873
874        LexerHandle::temporary(ReusableLexer::with_policy(self.config.shrink_policy))
875    }
876
877    fn release(&mut self, lexer: ReusableLexer) {
878        if self.config.max_size == 0 {
879            self.in_flight = self.in_flight.saturating_sub(1);
880            return;
881        }
882
883        self.in_flight = self.in_flight.saturating_sub(1);
884        if self.stash.len() < self.config.max_size {
885            self.stash.push(lexer);
886        }
887    }
888
889    #[cfg(test)]
890    fn stats(&self) -> (usize, usize, PoolConfig) {
891        (self.stash.len(), self.in_flight, self.config)
892    }
893
894    #[cfg(test)]
895    fn reset(&mut self, config: PoolConfig) {
896        self.stash.clear();
897        self.in_flight = 0;
898        self.config = config;
899    }
900}
901
/// RAII handle over a lexer: pooled handles return their lexer to
/// `LEXER_POOL` on drop, temporary (overflow) handles just drop it.
struct LexerHandle {
    /// The wrapped lexer; `None` only transiently inside `Drop`.
    lexer: Option<ReusableLexer>,
    /// Whether the lexer must be returned to the pool on drop.
    pooled: bool,
}
906
impl LexerHandle {
    /// Wrap a lexer that will be returned to the pool when dropped.
    fn pooled(lexer: ReusableLexer) -> Self {
        Self {
            lexer: Some(lexer),
            pooled: true,
        }
    }

    /// Wrap an overflow lexer that is simply dropped when done.
    fn temporary(lexer: ReusableLexer) -> Self {
        Self {
            lexer: Some(lexer),
            pooled: false,
        }
    }

    /// Access the wrapped lexer.
    // INVARIANT: `self.lexer` is always `Some` during the handle's lifetime.
    // It only becomes `None` inside `Drop` (after moving the lexer back to
    // the pool), so this `expect` cannot fire during normal use.
    fn lexer_mut(&mut self) -> &mut ReusableLexer {
        self.lexer.as_mut().expect("lexer handle missing lexer")
    }

    /// Point the wrapped lexer at a new input string.
    fn reset(&mut self, input: &str) {
        self.lexer_mut().reset(input);
    }

    /// Tokenize the current input; the returned batch borrows this handle.
    fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
        self.lexer_mut().tokenize()
    }
}
937
938impl Drop for LexerHandle {
939    fn drop(&mut self) {
940        if !self.pooled {
941            return;
942        }
943
944        if let Some(lexer) = self.lexer.take() {
945            LEXER_POOL.with(|cell| {
946                cell.borrow_mut().release(lexer);
947            });
948        }
949    }
950}
951
/// Test helper: hard-reset the thread-local pool to the given settings.
#[cfg(test)]
pub(crate) fn configure_pool_for_tests(max_size: usize, shrink_policy: ShrinkPolicy) {
    let config = PoolConfig {
        max_size,
        shrink_policy,
    };
    LEXER_POOL.with(|cell| cell.borrow_mut().reset(config));
}
961
/// Test helper: restore the pool to its compiled-in defaults.
#[cfg(test)]
pub(crate) fn reset_pool_to_default_for_tests() {
    configure_pool_for_tests(POOL_MAX_DEFAULT, ShrinkPolicy::default());
}
966
/// Test helper: snapshot `(stashed, in_flight, max_size)` for assertions.
#[cfg(test)]
pub(crate) fn pool_stats_for_tests() -> (usize, usize, usize) {
    LEXER_POOL.with(|cell| {
        let pool = cell.borrow();
        let (stash, in_flight, config) = pool.stats();
        (stash, in_flight, config.max_size)
    })
}
974
/// Run `f` against the token batch for `input`, using the thread-local pool.
///
/// Pool configuration is re-read from the environment on every call; a
/// `max_size` of 0 disables pooling and lexes with a throwaway lexer.
pub(crate) fn with_lexer<F, T>(input: &str, f: F) -> Result<T, LexError>
where
    F: FnOnce(TokenBatch<'_>) -> Result<T, LexError>,
{
    let config = PoolConfig::from_environment();

    if config.max_size == 0 {
        // Pooling disabled: still push the config into the pool so lexers
        // stashed under a previous config get discarded, then lex standalone.
        LEXER_POOL.with(|cell| {
            cell.borrow_mut().apply_config(config);
        });
        let mut lexer = ReusableLexer::with_policy(config.shrink_policy);
        lexer.reset(input);
        let batch = lexer.tokenize()?;
        return f(batch);
    }

    let mut handle = LEXER_POOL.with(|cell| {
        let mut pool = cell.borrow_mut();
        pool.apply_config(config);
        pool.acquire()
    });

    handle.reset(input);
    let batch = handle.tokenize()?;
    // `batch` borrows `handle`: run `f` (which consumes the borrow) before
    // dropping the handle so the lexer can return to the pool.
    let result = f(batch);
    drop(handle);
    result
}
1003
/// Tokenize using the thread-local lexer pool, returning owned tokens.
///
/// This is useful for benches and integration points that only need the token
/// stream and do not want to work with the internal `TokenBatch` guard.
///
/// # Errors
///
/// Returns [`LexError`] when lexical analysis fails.
pub fn tokenize_with_pool(input: &str) -> Result<Vec<Token>, LexError> {
    // `into_vec` drains the pooled buffer, so no tokens are cloned here.
    with_lexer(input, |batch| Ok(batch.into_vec()))
}
1015
/// Debug-only counters describing how the reusable lexer's buffer behaves.
#[cfg(debug_assertions)]
#[allow(dead_code)]
#[derive(Debug, Default, Clone, Copy)]
struct LexerDiagnostics {
    // Number of tokenize calls that reused the token buffer.
    reuse_count: usize,
    // Largest token-buffer capacity observed across runs.
    max_capacity_seen: usize,
    // How many times the shrink policy trimmed the buffer on drop.
    shrink_count: usize,
}
1024
1025#[cfg(debug_assertions)]
1026#[allow(dead_code)]
1027impl LexerDiagnostics {
1028    fn record_reuse(&mut self, capacity: usize) {
1029        self.reuse_count += 1;
1030        if capacity > self.max_capacity_seen {
1031            self.max_capacity_seen = capacity;
1032        }
1033    }
1034
1035    fn record_shrink(&mut self) {
1036        self.shrink_count += 1;
1037    }
1038}
1039
/// Reusable lexer that owns its input, token buffer, and shrink policy.
///
/// `reset` + `tokenize` can be called repeatedly; the owned `String` and
/// `Vec<Token>` are reused across calls to avoid per-query allocations.
/// (NOTE(review): the previous leading doc line described a shrink *policy*,
/// which is documented on `ShrinkPolicy` itself, not on this struct.)
#[allow(dead_code)]
pub(crate) struct ReusableLexer {
    // Owned copy of the query text currently being tokenized.
    input: String,
    // Token storage reused across `tokenize` calls.
    token_buffer: Vec<Token>,
    // Controls when the token buffer's capacity is trimmed back.
    shrink_policy: ShrinkPolicy,
    #[cfg(debug_assertions)]
    diagnostics: LexerDiagnostics,
}
1050
#[allow(dead_code)]
impl ReusableLexer {
    /// Create a lexer with the default shrink policy.
    pub fn new() -> Self {
        Self::with_policy(ShrinkPolicy::default())
    }

    /// Create a lexer with an explicit shrink policy.
    pub fn with_policy(shrink_policy: ShrinkPolicy) -> Self {
        Self {
            input: String::new(),
            // Small starting capacity; typical queries produce few tokens.
            token_buffer: Vec::with_capacity(16),
            shrink_policy,
            #[cfg(debug_assertions)]
            diagnostics: LexerDiagnostics::default(),
        }
    }

    /// Reset the lexer to a new input string.
    pub fn reset(&mut self, input: &str) {
        // clear + push_str reuses the existing String allocation.
        self.input.clear();
        self.input.push_str(input);
        self.token_buffer.clear();
    }

    /// Tokenize the current input, returning an RAII guard over the buffer.
    pub fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
        self.token_buffer.clear();
        let mut raw = RawLexer::new(self.input.as_str());
        raw.tokenize_into(&mut self.token_buffer)?;
        #[cfg(debug_assertions)]
        self.diagnostics.record_reuse(self.token_buffer.capacity());
        // The guard mutably borrows the buffer, so no second tokenize can
        // start until the batch is dropped or consumed.
        Ok(TokenBatch {
            tokens: &mut self.token_buffer,
            shrink_policy: self.shrink_policy,
            #[cfg(debug_assertions)]
            diagnostics: &mut self.diagnostics,
        })
    }

    /// Debug-only access to reuse/shrink counters.
    #[cfg(debug_assertions)]
    fn diagnostics(&self) -> &LexerDiagnostics {
        &self.diagnostics
    }
}
1094
/// RAII guard for the reusable token buffer.
///
/// Provides read-only access via `as_slice()` or transfers ownership via
/// `into_vec()`, draining the reusable buffer without cloning. The guard holds
/// a mutable borrow to the underlying buffer so additional tokenization cannot
/// start until it is dropped. On drop the buffer is cleared and, if the shrink
/// policy deems it oversized, the capacity is reduced.
#[allow(dead_code)]
pub(crate) struct TokenBatch<'a> {
    // Mutable borrow of the owning lexer's reusable buffer.
    tokens: &'a mut Vec<Token>,
    // Copy of the owning lexer's policy, consulted in Drop.
    shrink_policy: ShrinkPolicy,
    #[cfg(debug_assertions)]
    diagnostics: &'a mut LexerDiagnostics,
}
1109
#[allow(dead_code)]
impl TokenBatch<'_> {
    /// Borrow the tokens without transferring ownership.
    pub fn as_slice(&self) -> &[Token] {
        self.tokens.as_slice()
    }

    /// Move the tokens out, leaving the reusable buffer empty.
    ///
    /// `drain` transfers elements into a fresh Vec while the buffer keeps its
    /// allocation for the next tokenize call.
    #[allow(unused_mut)]
    pub fn into_vec(mut self) -> Vec<Token> {
        let result = self.tokens.drain(..).collect();
        #[cfg(debug_assertions)]
        let _ = &mut *self.diagnostics; // keep diagnostics reference alive for Drop
        result
    }
}
1124
1125impl Drop for TokenBatch<'_> {
1126    fn drop(&mut self) {
1127        if !self.tokens.is_empty() {
1128            self.tokens.clear();
1129        }
1130
1131        let shrink_threshold = self
1132            .shrink_policy
1133            .max_capacity
1134            .saturating_mul(self.shrink_policy.shrink_ratio);
1135        if shrink_threshold > 0 && self.tokens.capacity() > shrink_threshold {
1136            self.tokens.shrink_to(self.shrink_policy.max_capacity);
1137            #[cfg(debug_assertions)]
1138            self.diagnostics.record_shrink();
1139        }
1140    }
1141}
1142
1143#[cfg(test)]
1144mod tests {
1145    use super::*;
1146    use std::panic::{AssertUnwindSafe, catch_unwind};
1147    use std::sync::{Mutex, OnceLock};
1148
1149    #[cfg(feature = "dhat-heap")]
1150    use dhat::{HeapStats, Profiler};
1151
    /// Rebuild the pool from the current environment variables.
    fn reset_pool_from_env() {
        let config = PoolConfig::from_environment();
        LEXER_POOL.with(|cell| {
            cell.borrow_mut().reset(config);
        });
    }

    /// Clear all pool-related env vars, then rebuild the pool with defaults.
    fn reset_pool_default() {
        unsafe {
            std::env::remove_var(ENV_POOL_MAX);
            std::env::remove_var(ENV_POOL_MAX_CAP);
            std::env::remove_var(ENV_POOL_SHRINK_RATIO);
        }
        reset_pool_from_env();
    }

    /// Set an env var (unsafe because env mutation is process-global).
    fn set_env(var: &str, value: &str) {
        unsafe {
            std::env::set_var(var, value);
        }
    }

    /// Remove an env var (unsafe because env mutation is process-global).
    fn remove_env(var: &str) {
        unsafe {
            std::env::remove_var(var);
        }
    }

    /// Process-wide lock serializing tests that touch env vars.
    fn env_lock() -> &'static Mutex<()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(()))
    }
1184
    // The batch must point into the lexer's own buffer, and the allocation
    // must be reused (same pointer) across reset/tokenize cycles.
    #[test]
    fn reusable_lexer_reuses_buffer_across_calls() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let first_ptr = {
            let batch = lexer.tokenize().unwrap();
            let ptr = batch.as_slice().as_ptr();
            assert!(!batch.as_slice().is_empty());
            ptr
        };
        assert_eq!(first_ptr, lexer.token_buffer.as_ptr());

        lexer.reset("name:test");
        let second_ptr = {
            let batch = lexer.tokenize().unwrap();
            let ptr = batch.as_slice().as_ptr();
            assert!(!batch.as_slice().is_empty());
            ptr
        };
        assert_eq!(second_ptr, lexer.token_buffer.as_ptr());
        assert_eq!(first_ptr, second_ptr);
        #[cfg(debug_assertions)]
        {
            let diagnostics = lexer.diagnostics();
            assert!(diagnostics.reuse_count >= 2);
            assert!(diagnostics.max_capacity_seen >= lexer.token_buffer.capacity());
        }
    }

    // TokenBatch's Drop must clear the shared buffer even when unwinding.
    #[test]
    fn reusable_lexer_clears_buffer_on_panic() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let result = catch_unwind(AssertUnwindSafe(|| {
            let _batch = lexer.tokenize().unwrap();
            panic!("boom");
        }));

        assert!(result.is_err());
        assert_eq!(lexer.token_buffer.len(), 0);
    }

    // into_vec() transfers tokens out and leaves the reusable buffer empty.
    #[test]
    fn reusable_lexer_into_vec_drains_tokens() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let tokens = {
            let batch = lexer.tokenize().unwrap();
            batch.into_vec()
        };

        assert_eq!(tokens.len(), 4);
        assert_eq!(lexer.token_buffer.len(), 0);
    }

    // Once capacity exceeds max_capacity * shrink_ratio, dropping a batch
    // must shrink the buffer back down to max_capacity.
    #[test]
    fn reusable_lexer_shrink_policy_applies() {
        let policy = ShrinkPolicy {
            max_capacity: 8,
            shrink_ratio: 2,
        };

        let mut lexer = ReusableLexer::with_policy(policy);
        let large_query = (0..128)
            .map(|i| format!("name:value{i}"))
            .collect::<Vec<_>>()
            .join(" ");
        lexer.reset(&large_query);

        {
            let batch = lexer.tokenize().unwrap();
            let _ = batch.into_vec();
        }

        // Force the capacity above the shrink threshold if tokenizing alone
        // did not get it there.
        if lexer.token_buffer.capacity() <= policy.max_capacity * policy.shrink_ratio {
            lexer
                .token_buffer
                .reserve(policy.max_capacity * policy.shrink_ratio * 2);
        }
        assert!(lexer.token_buffer.capacity() > policy.max_capacity * policy.shrink_ratio);

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            drop(batch);
        }

        assert!(lexer.token_buffer.capacity() <= policy.max_capacity);

        #[cfg(debug_assertions)]
        {
            let diagnostics = lexer.diagnostics();
            assert!(diagnostics.shrink_count >= 1);
        }
    }
1283
    // After a pooled tokenize completes, the lexer must be back in the stash.
    #[test]
    fn lexer_pool_returns_lexers_to_stash() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        assert_eq!(PoolConfig::from_environment().max_size, POOL_MAX_DEFAULT);

        let tokens = with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
        assert_eq!(tokens.len(), 4);

        LEXER_POOL.with(|cell| {
            let (stash_len, in_flight, config) = cell.borrow().stats();
            assert_eq!(config.max_size, POOL_MAX_DEFAULT);
            assert_eq!(in_flight, 0);
            assert_eq!(stash_len, 1);
        });
    }

    // With max_size forced to 0 via env, pooling is disabled: nothing stashed.
    #[test]
    fn lexer_pool_respects_zero_capacity_env() {
        let _guard = env_lock().lock().unwrap();
        set_env(ENV_POOL_MAX, "0");
        reset_pool_from_env();

        let tokens = with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
        assert_eq!(tokens.len(), 4);

        LEXER_POOL.with(|cell| {
            let (stash_len, in_flight, config) = cell.borrow().stats();
            assert_eq!(config.max_size, 0);
            assert_eq!(in_flight, 0);
            assert_eq!(stash_len, 0);
        });

        remove_env(ENV_POOL_MAX);
        reset_pool_default();
    }

    // A single-slot pool serves back-to-back queries from one stash entry.
    #[test]
    fn lexer_pool_reuses_single_slot() {
        let _guard = env_lock().lock().unwrap();
        set_env(ENV_POOL_MAX, "1");
        reset_pool_from_env();

        assert_eq!(PoolConfig::from_environment().max_size, 1);

        for query in ["kind:function", "name:test"] {
            let _ = with_lexer(query, |batch| Ok(batch.into_vec())).unwrap();
        }

        LEXER_POOL.with(|cell| {
            let (stash_len, in_flight, config) = cell.borrow().stats();
            assert_eq!(config.max_size, 1);
            assert_eq!(in_flight, 0);
            assert_eq!(stash_len, 1);
        });

        remove_env(ENV_POOL_MAX);
        reset_pool_default();
    }
1344
    // `::` inside a bare value must stay part of a single Word token.
    #[test]
    fn lexer_handles_double_colon_in_words() {
        let mut lexer = Lexer::new("callers:Player::takeDamage");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 4); // Identifier, Colon, Word, Eof
        assert_eq!(
            tokens[0].token_type,
            TokenType::Identifier("callers".to_string())
        );
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert_eq!(
            tokens[2].token_type,
            TokenType::Word("Player::takeDamage".to_string())
        );
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    // Nested with_lexer calls must not deadlock or corrupt the pool.
    #[test]
    #[ignore = "Test depends on clean env_lock state. Run in isolation with: cargo test -p sqry-core --lib with_lexer_allows_reentrant_usage -- --ignored --test-threads=1"]
    fn with_lexer_allows_reentrant_usage() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        let result = with_lexer("kind:function", |batch| {
            assert!(!batch.as_slice().is_empty());
            with_lexer("name:test", |inner_batch| {
                assert!(!inner_batch.as_slice().is_empty());
                Ok(())
            })
        });

        assert!(result.is_ok());
        reset_pool_default();
    }

    // Each thread gets its own pool; concurrent use must stay consistent.
    #[test]
    fn lexer_pool_thread_local_isolation() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        let handles: Vec<_> = (0..4)
            .map(|_| {
                std::thread::spawn(|| {
                    for _ in 0..50 {
                        for query in ["kind:function", "name:test", "lang:rust"] {
                            with_lexer(query, |batch| {
                                assert!(!batch.as_slice().is_empty());
                                Ok(batch.into_vec())
                            })
                            .unwrap();
                        }
                    }

                    let (stash, in_flight, max_size) = crate::query::lexer::pool_stats_for_tests();
                    assert!(stash <= max_size);
                    assert_eq!(in_flight, 0);
                })
            })
            .collect();

        for handle in handles {
            handle.join().unwrap();
        }

        reset_pool_default();
    }
1411
    // Heap-profiled smoke test: repeated pooled tokenization should allocate
    // only a bounded number of heap blocks thanks to buffer reuse.
    #[cfg(feature = "dhat-heap")]
    #[test]
    #[ignore = "Heap profiling test must run in isolation. Run with: cargo test -p sqry-core --lib lexer_reuse_minimizes_heap_allocations -- --ignored --test-threads=1"]
    fn lexer_reuse_minimizes_heap_allocations() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        let profiler = Profiler::new_heap();

        for _ in 0..5 {
            with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
        }

        let stats = HeapStats::get();
        drop(profiler);
        // Threshold adjusted for integration test environment: when running with --tests,
        // all plugin crates (dev-dependencies) are loaded, increasing baseline allocations
        // from ~28 blocks (unit tests only) to ~58 blocks (with plugins loaded)
        assert!(
            stats.total_blocks <= 65,
            "expected limited allocations, observed {} blocks (threshold accounts for plugin loading in integration tests)",
            stats.total_blocks
        );

        reset_pool_default();
    }
1438
    // Capacity grows for large inputs and is retained for later small ones.
    #[test]
    fn reusable_lexer_capacity_growth_and_retention() {
        let mut lexer = ReusableLexer::new();

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }
        let initial_capacity = lexer.token_buffer.capacity();

        let large_query = (0..50)
            .map(|i| format!("name:value{i}"))
            .collect::<Vec<_>>()
            .join(" AND ");
        lexer.reset(&large_query);
        {
            let batch = lexer.tokenize().unwrap();
            assert!(batch.as_slice().len() > 50);
        }
        let grown_capacity = lexer.token_buffer.capacity();
        assert!(grown_capacity > initial_capacity);

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }
        let retained_capacity = lexer.token_buffer.capacity();
        assert_eq!(retained_capacity, grown_capacity);

        #[cfg(debug_assertions)]
        {
            let diagnostics = lexer.diagnostics();
            assert!(diagnostics.reuse_count >= 3);
            assert!(diagnostics.max_capacity_seen >= grown_capacity);
        }
    }

    // A lex error must leave the lexer reusable for subsequent queries.
    #[test]
    fn reusable_lexer_error_recovery_clears_buffer() {
        let mut lexer = ReusableLexer::new();

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }

        lexer.reset("kind@invalid");
        let result = lexer.tokenize();
        assert!(result.is_err());
        drop(result);

        lexer.reset("name:test");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }
    }

    // Panicking after into_vec still leaves the shared buffer empty.
    #[test]
    fn reusable_lexer_panic_after_into_vec_has_clean_buffer() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let result = catch_unwind(AssertUnwindSafe(|| {
            let batch = lexer.tokenize().unwrap();
            let _tokens = batch.into_vec();
            panic!("boom");
        }));

        assert!(result.is_err());
        assert_eq!(lexer.token_buffer.len(), 0);
    }
1514
    // Baseline: `field:value` yields Identifier, Colon, Word, Eof.
    #[test]
    fn test_tokenize_simple_query() {
        let mut lexer = Lexer::new("kind:function");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "kind"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "function"));
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    // Angle brackets inside a value are kept as one Word (generic type names).
    #[test]
    fn test_tokenize_generic_type_value() {
        let mut lexer = Lexer::new("returns:Optional<User>");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "returns"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "Optional<User>"));
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    // Nested generics (including a comma) also stay one Word.
    #[test]
    fn test_tokenize_nested_generic_value() {
        let mut lexer = Lexer::new("returns:Map<String,List<Order>>");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "returns"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(
            matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "Map<String,List<Order>>")
        );
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    // `>` directly after an identifier is a comparison, not part of a Word.
    #[test]
    fn test_tokenize_numeric_comparison_after_identifier() {
        let mut lexer = Lexer::new("line>10");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "line"));
        assert!(matches!(tokens[1].token_type, TokenType::Greater));
        assert!(matches!(tokens[2].token_type, TokenType::NumberLiteral(10)));
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    // AND/OR/NOT keywords are recognized regardless of case.
    #[test]
    fn test_tokenize_keywords_case_insensitive() {
        let mut lexer = Lexer::new("AND and Or NOT not");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::And));
        assert!(matches!(tokens[1].token_type, TokenType::And));
        assert!(matches!(tokens[2].token_type, TokenType::Or));
        assert!(matches!(tokens[3].token_type, TokenType::Not));
        assert!(matches!(tokens[4].token_type, TokenType::Not));
    }

    // All comparison/match operators tokenize individually.
    #[test]
    fn test_tokenize_operators() {
        let mut lexer = Lexer::new(": ~= > < >= <=");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Colon));
        assert!(matches!(tokens[1].token_type, TokenType::RegexOp));
        assert!(matches!(tokens[2].token_type, TokenType::Greater));
        assert!(matches!(tokens[3].token_type, TokenType::Less));
        assert!(matches!(tokens[4].token_type, TokenType::GreaterEq));
        assert!(matches!(tokens[5].token_type, TokenType::LessEq));
    }

    // Parentheses are standalone delimiter tokens.
    #[test]
    fn test_tokenize_parentheses() {
        let mut lexer = Lexer::new("( )");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::LParen));
        assert!(matches!(tokens[1].token_type, TokenType::RParen));
    }
1595
    // Double quotes delimit a StringLiteral; interior whitespace is kept.
    #[test]
    fn test_tokenize_double_quoted_string() {
        let mut lexer = Lexer::new(r#"name:"hello world""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "name"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(
            matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s == "hello world")
        );
    }

    // Single quotes work identically to double quotes.
    #[test]
    fn test_tokenize_single_quoted_string() {
        let mut lexer = Lexer::new(r"name:'hello world'");
        let tokens = lexer.tokenize().unwrap();

        assert!(
            matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s == "hello world")
        );
    }

    // \n, \t, \" and \\ escapes are decoded inside string literals.
    #[test]
    fn test_string_escape_sequences() {
        let mut lexer = Lexer::new(r#""line1\nline2\ttab\"quote\\backslash""#);
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
            assert_eq!(s, "line1\nline2\ttab\"quote\\backslash");
        } else {
            panic!("Expected string literal");
        }
    }

    // \uXXXX escapes decode to the corresponding Unicode scalar.
    #[test]
    fn test_unicode_escape() {
        let mut lexer = Lexer::new(r#""\u0041BC""#);
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
            assert_eq!(s, "ABC");
        } else {
            panic!("Expected string literal");
        }
    }

    // A missing closing quote is a dedicated UnterminatedString error.
    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#"name:"unclosed"#);
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::UnterminatedString { .. })));
    }

    // Unknown escape characters are rejected, reporting the offending char.
    #[test]
    fn test_invalid_escape() {
        let mut lexer = Lexer::new(r#""\x""#);
        let result = lexer.tokenize();
        assert!(matches!(
            result,
            Err(LexError::InvalidEscape { char: 'x', .. })
        ));
    }

    #[test]
    fn test_glob_metacharacter_escape_sequences() {
        // Test that glob metacharacters can be escaped in quoted strings
        // This is required for path: predicates with literal glob chars
        let mut lexer = Lexer::new(r#""src/\[test\]/\*\?file\{a,b\}""#);
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
            assert_eq!(s, "src/[test]/*?file{a,b}");
        } else {
            panic!("Expected string literal, got {:?}", tokens[0].token_type);
        }
    }

    #[test]
    fn test_path_predicate_with_escaped_glob_chars() {
        // Simulate a path: predicate with escaped glob characters
        let mut lexer = Lexer::new(r#"path:"src/\[test\]/**""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "path"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        if let TokenType::StringLiteral(s) = &tokens[2].token_type {
            // The escaped brackets become literal brackets, ** remains as-is
            assert_eq!(s, "src/[test]/**");
        } else {
            panic!("Expected string literal");
        }
    }
1688
    // `~=` followed by /pattern/flags yields RegexOp + RegexLiteral.
    #[test]
    fn test_tokenize_regex() {
        let mut lexer = Lexer::new(r"name~=/^test_/i");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "name"));
        assert!(matches!(tokens[1].token_type, TokenType::RegexOp));

        if let TokenType::RegexLiteral { pattern, flags } = &tokens[2].token_type {
            assert_eq!(pattern, "^test_");
            assert!(flags.case_insensitive);
            assert!(!flags.multiline);
            assert!(!flags.dot_all);
        } else {
            panic!("Expected regex literal");
        }
    }

    // All three flags (i, m, s) can be combined on one literal.
    #[test]
    fn test_regex_multiple_flags() {
        let mut lexer = Lexer::new(r"/pattern/ims");
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::RegexLiteral { flags, .. } = &tokens[0].token_type {
            assert!(flags.case_insensitive);
            assert!(flags.multiline);
            assert!(flags.dot_all);
        } else {
            panic!("Expected regex literal");
        }
    }

    // `\/` inside a regex does not terminate it; the escape is preserved.
    #[test]
    fn test_regex_escaped_slash() {
        let mut lexer = Lexer::new(r"/path\/to\/file/");
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::RegexLiteral { pattern, .. } = &tokens[0].token_type {
            assert_eq!(pattern, r"path\/to\/file");
        } else {
            panic!("Expected regex literal");
        }
    }

    #[test]
    fn test_regex_escaped_backslash_then_slash() {
        // Input raw string r"/a\\\\/": This contains /a\\\\/
        // In the raw string, \\\\ is 4 literal backslash characters
        // So pattern before / is: a\\\\
        // Trailing backslashes: 4 (even number)
        // Even count means slash is NOT escaped, so pattern ends
        // Result pattern should be: a\\\\ (4 backslashes)
        let mut lexer = Lexer::new(r"/a\\\\/");
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::RegexLiteral { pattern, .. } => {
                assert_eq!(pattern, r"a\\\\"); // 4 backslashes
            }
            _ => panic!("Expected RegexLiteral"),
        }
    }

    // A single escaped slash mid-pattern stays part of the pattern.
    #[test]
    fn test_regex_single_escaped_slash() {
        let mut lexer = Lexer::new(r"/a\/b/"); // Pattern: a/b with escaped slash
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::RegexLiteral { pattern, .. } => {
                assert_eq!(pattern, r"a\/b");
            }
            _ => panic!("Expected RegexLiteral"),
        }
    }

    // A missing closing slash is a dedicated UnterminatedRegex error.
    #[test]
    fn test_unterminated_regex() {
        let mut lexer = Lexer::new(r"/unclosed");
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::UnterminatedRegex { .. })));
    }

    // Syntactically invalid patterns are rejected at lex time.
    #[test]
    fn test_invalid_regex_pattern() {
        let mut lexer = Lexer::new(r"/^[/");
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::InvalidRegex { .. })));
    }

    // Flags outside {i, m, s} produce an InvalidRegex error naming the flag.
    #[test]
    fn test_regex_unknown_flag() {
        let mut lexer = Lexer::new("/pattern/x");
        let err = lexer.next_token().unwrap_err();
        match err {
            LexError::InvalidRegex { error, .. } => {
                assert!(error.contains("Unknown regex flag"));
            }
            _ => panic!("Expected InvalidRegex error"),
        }
    }
1788
    // Plain digits after `:` become a NumberLiteral.
    #[test]
    fn test_tokenize_positive_number() {
        let mut lexer = Lexer::new("lines:42");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[2].token_type, TokenType::NumberLiteral(42)));
    }

    // A leading minus sign is part of the number, not a separate token.
    #[test]
    fn test_tokenize_negative_number() {
        let mut lexer = Lexer::new("lines:-42");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::NumberLiteral(-42)
        ));
    }

    // Underscore digit separators are accepted and ignored.
    #[test]
    fn test_tokenize_number_with_underscores() {
        let mut lexer = Lexer::new("lines:1_000_000");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::NumberLiteral(1_000_000)
        ));
    }

    // Values exceeding i64 range produce a NumberOverflow error.
    #[test]
    fn test_number_overflow() {
        let mut lexer = Lexer::new("lines:99999999999999999999");
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::NumberOverflow { .. })));
    }

    // `true` becomes a BooleanLiteral rather than a Word.
    #[test]
    fn test_tokenize_boolean_true() {
        let mut lexer = Lexer::new("async:true");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::BooleanLiteral(true)
        ));
    }

    // Boolean recognition is case-insensitive.
    #[test]
    fn test_tokenize_boolean_false() {
        let mut lexer = Lexer::new("async:FALSE");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::BooleanLiteral(false)
        ));
    }
1847
    // End-to-end token ordering for a query mixing predicates and operators.
    #[test]
    fn test_tokenize_complex_query() {
        let mut lexer = Lexer::new(r"kind:function AND async:true OR name~=/^test_/i");
        let tokens = lexer.tokenize().unwrap();

        // Verify token sequence
        assert!(matches!(tokens[0].token_type, TokenType::Identifier(_)));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(matches!(tokens[2].token_type, TokenType::Word(_)));
        assert!(matches!(tokens[3].token_type, TokenType::And));
        assert!(matches!(tokens[4].token_type, TokenType::Identifier(_)));
        assert!(matches!(tokens[5].token_type, TokenType::Colon));
        assert!(matches!(
            tokens[6].token_type,
            TokenType::BooleanLiteral(true)
        ));
        assert!(matches!(tokens[7].token_type, TokenType::Or));
        assert!(matches!(tokens[8].token_type, TokenType::Identifier(_)));
        assert!(matches!(tokens[9].token_type, TokenType::RegexOp));
        assert!(matches!(
            tokens[10].token_type,
            TokenType::RegexLiteral { .. }
        ));
        assert!(matches!(tokens[11].token_type, TokenType::Eof));
    }

    // Leading/trailing/inner whitespace never produces tokens.
    #[test]
    fn test_whitespace_handling() {
        let mut lexer = Lexer::new("  kind  :  function  ");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens.len(), 4); // kind, :, function, EOF
    }

    // Characters outside the grammar raise UnexpectedChar with the culprit.
    #[test]
    fn test_unexpected_character() {
        let mut lexer = Lexer::new("kind@function");
        let result = lexer.tokenize();
        assert!(matches!(
            result,
            Err(LexError::UnexpectedChar { char: '@', .. })
        ));
    }

    // `""` is a valid, empty StringLiteral.
    #[test]
    fn test_empty_string_literal() {
        let mut lexer = Lexer::new(r#"name:"""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s.is_empty()));
    }

    // `//` is a valid regex literal with an empty pattern.
    #[test]
    fn test_empty_regex_literal() {
        let mut lexer = Lexer::new(r"name~=//");
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::RegexLiteral { pattern, .. } = &tokens[2].token_type {
            assert_eq!(pattern, "");
        } else {
            panic!("Expected regex literal");
        }
    }

    // Byte offsets in each token's span must be contiguous and exact.
    #[test]
    fn test_span_tracking() {
        let mut lexer = Lexer::new("kind:function");
        let tokens = lexer.tokenize().unwrap();

        // Verify spans are set
        assert!(tokens[0].span.start == 0);
        assert!(tokens[0].span.end == 4); // "kind"
        assert!(tokens[1].span.start == 4);
        assert!(tokens[1].span.end == 5); // ":"
        assert!(tokens[2].span.start == 5);
        assert!(tokens[2].span.end == 13); // "function"
    }
1925
1926    #[test]
1927    fn test_identifier_vs_word() {
1928        let mut lexer = Lexer::new("kind:value value");
1929        let tokens = lexer.tokenize().unwrap();
1930
1931        // First 'value' after ':' should be a Word
1932        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "value"));
1933        // Second 'value' standalone should also be a Word
1934        assert!(matches!(tokens[3].token_type, TokenType::Word(ref s) if s == "value"));
1935    }
1936
1937    #[test]
1938    fn test_bare_word_with_glob() {
1939        let mut lexer = Lexer::new("path:src/*.rs");
1940        lexer.next_token().unwrap(); // path (identifier)
1941        lexer.next_token().unwrap(); // :
1942        let token = lexer.next_token().unwrap();
1943        match token.token_type {
1944            TokenType::Word(s) => assert_eq!(s, "src/*.rs"),
1945            _ => panic!(
1946                "Expected Word with glob pattern, got {:?}",
1947                token.token_type
1948            ),
1949        }
1950    }
1951
1952    #[test]
1953    fn test_bare_word_with_hyphen() {
1954        let mut lexer = Lexer::new("name:foo-bar");
1955        lexer.next_token().unwrap(); // name
1956        lexer.next_token().unwrap(); // :
1957        let token = lexer.next_token().unwrap();
1958        match token.token_type {
1959            TokenType::Word(s) => assert_eq!(s, "foo-bar"),
1960            _ => panic!("Expected Word with hyphen, got {:?}", token.token_type),
1961        }
1962    }
1963
1964    #[test]
1965    fn test_bare_word_with_dot() {
1966        let mut lexer = Lexer::new("path:foo.rs");
1967        lexer.next_token().unwrap(); // path
1968        lexer.next_token().unwrap(); // :
1969        let token = lexer.next_token().unwrap();
1970        match token.token_type {
1971            TokenType::Word(s) => assert_eq!(s, "foo.rs"),
1972            _ => panic!("Expected Word with dot, got {:?}", token.token_type),
1973        }
1974    }
1975
1976    #[test]
1977    fn test_variable_token() {
1978        let mut lexer = Lexer::new("$name");
1979        let tokens = lexer.tokenize().unwrap();
1980        assert_eq!(tokens.len(), 2); // Variable, Eof
1981        assert_eq!(
1982            tokens[0].token_type,
1983            TokenType::Variable("name".to_string())
1984        );
1985        assert!(matches!(tokens[1].token_type, TokenType::Eof));
1986    }
1987
1988    #[test]
1989    fn test_variable_token_with_underscores() {
1990        let mut lexer = Lexer::new("$my_var");
1991        let tokens = lexer.tokenize().unwrap();
1992        assert_eq!(tokens.len(), 2); // Variable, Eof
1993        assert_eq!(
1994            tokens[0].token_type,
1995            TokenType::Variable("my_var".to_string())
1996        );
1997    }
1998
1999    #[test]
2000    fn test_pipe_token() {
2001        let mut lexer = Lexer::new("|");
2002        let tokens = lexer.tokenize().unwrap();
2003        assert_eq!(tokens.len(), 2); // Pipe, Eof
2004        assert!(matches!(tokens[0].token_type, TokenType::Pipe));
2005    }
2006
2007    #[test]
2008    fn test_dollar_sign_alone_error() {
2009        let mut lexer = Lexer::new("$ ");
2010        let result = lexer.tokenize();
2011        assert!(
2012            matches!(result, Err(LexError::UnexpectedChar { char: '$', .. })),
2013            "Bare '$' should produce an error, got: {result:?}"
2014        );
2015    }
2016}