sqry_core/query/
lexer.rs

1//! Lexer for the query language
2//!
3//! This module implements tokenization of query strings into a stream of tokens.
4//! Supports keywords (AND, OR, NOT), operators (:, ~=, >, <, etc.), string literals,
5//! regex literals with flags, numbers, and identifiers.
6
7use crate::query::error::LexError;
8use crate::query::types::{RegexFlags, Span};
9use log::trace;
10use std::cell::RefCell;
11use std::env;
12use std::str::Chars;
13use std::thread_local;
14
// Only compiled for test builds with the `dhat-heap` feature enabled: installs
// `dhat::Alloc` as the process-wide global allocator.
#[cfg(all(test, feature = "dhat-heap"))]
#[global_allocator]
static DHAT_ALLOC: dhat::Alloc = dhat::Alloc;
18
/// A token type in the query language
///
/// Payload-carrying variants own their text, so a token stays valid after the
/// input string it was lexed from is gone.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
    // Keywords (the lexer matches them case-insensitively)
    /// AND keyword
    And,
    /// OR keyword
    Or,
    /// NOT keyword
    Not,

    // Operators
    /// `:` operator (exact match / glob for paths)
    Colon,
    /// `~=` operator (regex match)
    RegexOp,
    /// `>` operator (greater than)
    Greater,
    /// `<` operator (less than)
    Less,
    /// `>=` operator (greater than or equal)
    GreaterEq,
    /// `<=` operator (less than or equal)
    LessEq,
    /// `|` pipe operator (for aggregation pipeline)
    Pipe,

    // Delimiters
    /// `(` left parenthesis
    LParen,
    /// `)` right parenthesis
    RParen,

    // Values
    /// Identifier (field names): a non-keyword word that is followed, after
    /// optional whitespace, by `:`, `~`, `>` or `<`
    Identifier(String),
    /// String literal (double or single quoted), with escape sequences resolved
    StringLiteral(String),
    /// Regex literal with pattern and flags
    RegexLiteral {
        /// Regex pattern (the text between the slashes; an escaped `\/` is kept verbatim)
        pattern: String,
        /// Regex flags (case-insensitive, multiline, dot-all)
        flags: RegexFlags,
    },
    /// Number literal (signed 64-bit integer; `_` separators are dropped while lexing)
    NumberLiteral(i64),
    /// Boolean literal (`true` / `false`, case-insensitive)
    BooleanLiteral(bool),
    /// Bare word (unquoted value): any non-keyword word that is not an identifier
    Word(String),
    /// Variable reference (`$name`); holds the name without the leading `$`
    Variable(String),

    // Special
    /// End of input
    Eof,
}
77
/// A token with position information
///
/// The lexer builds the span with `Span::with_position` from the byte offsets
/// where the token starts and ends plus the line/column where it starts.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The type of token (and its payload, if any)
    pub token_type: TokenType,
    /// Source position
    pub span: Span,
}
86
87impl Token {
88    /// Create a new token
89    #[must_use]
90    pub fn new(token_type: TokenType, span: Span) -> Self {
91        Self { token_type, span }
92    }
93}
94
/// Lexer state for tokenization
///
/// Borrows the input for `'a`; all position bookkeeping is advanced by
/// `next_char`, while `peek_char` only fills the one-character lookahead.
pub(crate) struct RawLexer<'a> {
    /// Input string
    input: &'a str,
    /// Character iterator
    chars: Chars<'a>,
    /// Current position (byte offset of the next unconsumed character)
    position: usize,
    /// Current line (1-based)
    line: usize,
    /// Current column (1-based, counted in characters)
    column: usize,
    /// Peeked character (for lookahead); already pulled from `chars` but not yet consumed
    peeked: Option<char>,
}
110
111impl<'a> RawLexer<'a> {
112    /// Create a new lexer for the given input
113    pub fn new(input: &'a str) -> Self {
114        Self {
115            input,
116            chars: input.chars(),
117            position: 0,
118            line: 1,
119            column: 1,
120            peeked: None,
121        }
122    }
123
124    /// Reset lexer state to the beginning of the current input.
125    pub fn restart(&mut self) {
126        self.chars = self.input.chars();
127        self.position = 0;
128        self.line = 1;
129        self.column = 1;
130        self.peeked = None;
131    }
132
133    /// Tokenize the input, appending tokens to the provided buffer.
134    pub fn tokenize_into(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexError> {
135        loop {
136            let token = self.next_token()?;
137            let is_eof = matches!(token.token_type, TokenType::Eof);
138            tokens.push(token);
139
140            if is_eof {
141                break;
142            }
143        }
144
145        Ok(())
146    }
147
148    /// Get the next token
149    #[allow(clippy::too_many_lines)] // Lexer state machine is clearer when kept in one place.
150    pub fn next_token(&mut self) -> Result<Token, LexError> {
151        self.skip_whitespace();
152
153        let start_pos = self.position;
154        let start_line = self.line;
155        let start_col = self.column;
156
157        let Some(ch) = self.peek_char() else {
158            return Ok(Token::new(
159                TokenType::Eof,
160                Span::with_position(self.position, self.position, self.line, self.column),
161            ));
162        };
163
164        let token_type = if let Some(token) = self.read_simple_token(ch) {
165            token
166        } else if ch == '$' {
167            self.read_variable_token(start_pos, start_line, start_col)?
168        } else if ch == '~' {
169            self.read_regex_operator(start_pos, start_line, start_col)?
170        } else if ch == '>' || ch == '<' {
171            self.read_comparison_operator(ch)
172        } else if ch == '"' || ch == '\'' {
173            let s = self.read_quoted_string(ch)?;
174            TokenType::StringLiteral(s)
175        } else if ch == '/' {
176            let (pattern, flags) = self.read_regex()?;
177            TokenType::RegexLiteral { pattern, flags }
178        } else if self.is_number_start(ch) {
179            let n = self.read_number()?;
180            TokenType::NumberLiteral(n)
181        } else if Self::is_word_start(ch) {
182            self.read_word_token()
183        } else {
184            return Err(LexError::UnexpectedChar {
185                char: ch,
186                span: Span::with_position(
187                    start_pos,
188                    start_pos + ch.len_utf8(),
189                    start_line,
190                    start_col,
191                ),
192            });
193        };
194
195        Ok(Token::new(
196            token_type,
197            Span::with_position(start_pos, self.position, start_line, start_col),
198        ))
199    }
200
201    fn read_simple_token(&mut self, ch: char) -> Option<TokenType> {
202        let token = match ch {
203            '(' => TokenType::LParen,
204            ')' => TokenType::RParen,
205            ':' => TokenType::Colon,
206            '|' => TokenType::Pipe,
207            _ => return None,
208        };
209        self.next_char();
210        Some(token)
211    }
212
213    fn read_regex_operator(
214        &mut self,
215        start_pos: usize,
216        start_line: usize,
217        start_col: usize,
218    ) -> Result<TokenType, LexError> {
219        self.next_char();
220        if self.peek_char() == Some('=') {
221            self.next_char();
222            Ok(TokenType::RegexOp)
223        } else {
224            Err(LexError::UnexpectedChar {
225                char: '~',
226                span: Span::with_position(start_pos, self.position, start_line, start_col),
227            })
228        }
229    }
230
231    fn read_comparison_operator(&mut self, ch: char) -> TokenType {
232        self.next_char();
233        let (equal, plain) = if ch == '>' {
234            (TokenType::GreaterEq, TokenType::Greater)
235        } else {
236            (TokenType::LessEq, TokenType::Less)
237        };
238        if self.peek_char() == Some('=') {
239            self.next_char();
240            equal
241        } else {
242            plain
243        }
244    }
245
246    fn is_number_start(&self, ch: char) -> bool {
247        ch.is_ascii_digit() || (ch == '-' && self.peek_ahead(1).is_some_and(|c| c.is_ascii_digit()))
248    }
249
250    fn is_word_start(ch: char) -> bool {
251        ch.is_ascii_alphabetic() || ch == '_'
252    }
253
254    /// Read a variable token: `$name` where name is alphanumeric/underscore.
255    fn read_variable_token(
256        &mut self,
257        start_pos: usize,
258        start_line: usize,
259        start_col: usize,
260    ) -> Result<TokenType, LexError> {
261        self.next_char(); // consume '$'
262
263        // Read the variable name (must be non-empty, alphanumeric + underscore)
264        let mut name = String::new();
265        while let Some(c) = self.peek_char() {
266            if c.is_ascii_alphanumeric() || c == '_' {
267                name.push(c);
268                self.next_char();
269            } else {
270                break;
271            }
272        }
273
274        if name.is_empty() {
275            return Err(LexError::UnexpectedChar {
276                char: '$',
277                span: Span::with_position(start_pos, self.position, start_line, start_col),
278            });
279        }
280
281        Ok(TokenType::Variable(name))
282    }
283
284    fn read_word_token(&mut self) -> TokenType {
285        let word = self.read_word();
286        match word.to_uppercase().as_str() {
287            "AND" => TokenType::And,
288            "OR" => TokenType::Or,
289            "NOT" => TokenType::Not,
290            "TRUE" => TokenType::BooleanLiteral(true),
291            "FALSE" => TokenType::BooleanLiteral(false),
292            _ => {
293                self.skip_whitespace();
294                match self.peek_char() {
295                    Some(':' | '~' | '>' | '<') => TokenType::Identifier(word),
296                    _ => TokenType::Word(word),
297                }
298            }
299        }
300    }
301
302    /// Peek at the next character without consuming it
303    fn peek_char(&mut self) -> Option<char> {
304        if self.peeked.is_none() {
305            self.peeked = self.chars.next();
306        }
307        self.peeked
308    }
309
310    /// Peek ahead n characters
311    fn peek_ahead(&self, n: usize) -> Option<char> {
312        self.input[self.position..].chars().nth(n)
313    }
314
315    /// Consume and return the next character
316    fn next_char(&mut self) -> Option<char> {
317        let ch = if let Some(c) = self.peeked.take() {
318            Some(c)
319        } else {
320            self.chars.next()
321        };
322
323        if let Some(c) = ch {
324            self.position += c.len_utf8();
325            if c == '\n' {
326                self.line += 1;
327                self.column = 1;
328            } else {
329                self.column += 1;
330            }
331        }
332
333        ch
334    }
335
336    /// Skip whitespace characters
337    fn skip_whitespace(&mut self) {
338        while let Some(c) = self.peek_char() {
339            if c.is_whitespace() {
340                self.next_char();
341            } else {
342                break;
343            }
344        }
345    }
346
347    /// Read a quoted string with escape handling
348    fn read_quoted_string(&mut self, quote: char) -> Result<String, LexError> {
349        let start_pos = self.position;
350        let start_line = self.line;
351        let start_col = self.column;
352        self.next_char(); // Skip opening quote
353
354        let mut result = String::new();
355
356        loop {
357            match self.next_char() {
358                Some(c) if c == quote => {
359                    // Closing quote
360                    return Ok(result);
361                }
362                Some('\\') => {
363                    let escaped = self.read_escape_sequence(start_pos, start_line, start_col)?;
364                    result.push(escaped);
365                }
366                Some(c) => result.push(c),
367                None => {
368                    return Err(LexError::UnterminatedString {
369                        span: Span::with_position(start_pos, self.position, start_line, start_col),
370                    });
371                }
372            }
373        }
374    }
375
376    fn read_escape_sequence(
377        &mut self,
378        start_pos: usize,
379        start_line: usize,
380        start_col: usize,
381    ) -> Result<char, LexError> {
382        match self.next_char() {
383            Some('"') => Ok('"'),
384            Some('\'') => Ok('\''),
385            Some('\\') => Ok('\\'),
386            Some('n') => Ok('\n'),
387            Some('t') => Ok('\t'),
388            Some('r') => Ok('\r'),
389            Some('u') => self.read_unicode_escape(),
390            // Glob metacharacter escapes - passed through literally for glob pattern matching
391            Some('*') => Ok('*'),
392            Some('?') => Ok('?'),
393            Some('[') => Ok('['),
394            Some(']') => Ok(']'),
395            Some('{') => Ok('{'),
396            Some('}') => Ok('}'),
397            Some(c) => Err(LexError::InvalidEscape {
398                char: c,
399                span: Span::with_position(self.position - 2, self.position, self.line, self.column),
400            }),
401            None => Err(LexError::UnterminatedString {
402                span: Span::with_position(start_pos, self.position, start_line, start_col),
403            }),
404        }
405    }
406
407    fn read_unicode_escape(&mut self) -> Result<char, LexError> {
408        // Unicode escape: \uXXXX
409        let hex = self.read_hex_digits(4)?;
410        let code_point =
411            u32::from_str_radix(&hex, 16).map_err(|_| LexError::InvalidUnicodeEscape {
412                got: hex.chars().next().unwrap_or('?'),
413                span: Span::with_position(
414                    self.position - hex.len() - 2,
415                    self.position,
416                    self.line,
417                    self.column,
418                ),
419            })?;
420        let ch = char::from_u32(code_point).ok_or_else(|| LexError::InvalidUnicodeEscape {
421            got: hex.chars().next().unwrap_or('?'),
422            span: Span::with_position(
423                self.position - hex.len() - 2,
424                self.position,
425                self.line,
426                self.column,
427            ),
428        })?;
429        Ok(ch)
430    }
431
432    /// Read a regex literal: /pattern/flags
433    fn read_regex(&mut self) -> Result<(String, RegexFlags), LexError> {
434        let start_pos = self.position;
435        let start_line = self.line;
436        let start_col = self.column;
437        self.next_char(); // Skip opening /
438
439        let pattern = self.read_regex_pattern(start_pos, start_line, start_col)?;
440        let flags = self.read_regex_flags(start_pos, start_line, start_col, &pattern)?;
441        self.validate_regex_pattern(&pattern, &flags, start_pos, start_line, start_col)?;
442        Ok((pattern, flags))
443    }
444
445    fn read_regex_pattern(
446        &mut self,
447        start_pos: usize,
448        start_line: usize,
449        start_col: usize,
450    ) -> Result<String, LexError> {
451        let mut pattern = String::new();
452
453        // Read pattern until closing /
454        loop {
455            match self.next_char() {
456                Some('/') => {
457                    // Count trailing backslashes to determine if slash is escaped
458                    let trailing_backslashes =
459                        pattern.chars().rev().take_while(|&c| c == '\\').count();
460
461                    if trailing_backslashes % 2 == 1 {
462                        // Odd number of backslashes: last one escapes the slash
463                        pattern.push('/');
464                        continue;
465                    }
466                    // Even number (or zero): slash is not escaped, end of pattern
467                    break;
468                }
469                Some(c) => pattern.push(c),
470                None => {
471                    return Err(LexError::UnterminatedRegex {
472                        span: Span::with_position(start_pos, self.position, start_line, start_col),
473                    });
474                }
475            }
476        }
477
478        Ok(pattern)
479    }
480
481    fn read_regex_flags(
482        &mut self,
483        start_pos: usize,
484        start_line: usize,
485        start_col: usize,
486        pattern: &str,
487    ) -> Result<RegexFlags, LexError> {
488        let mut flags = RegexFlags::default();
489        while let Some(ch) = self.peek_char() {
490            match ch {
491                'i' => {
492                    flags.case_insensitive = true;
493                    self.next_char();
494                }
495                'm' => {
496                    flags.multiline = true;
497                    self.next_char();
498                }
499                's' => {
500                    flags.dot_all = true;
501                    self.next_char();
502                }
503                _ if ch.is_ascii_alphabetic() => {
504                    // Unknown flag - return error
505                    return Err(LexError::InvalidRegex {
506                        pattern: pattern.to_string(),
507                        error: format!("Unknown regex flag '{ch}'"),
508                        span: Span::with_position(
509                            start_pos,
510                            self.position + 1,
511                            start_line,
512                            start_col,
513                        ),
514                    });
515                }
516                _ => break,
517            }
518        }
519
520        Ok(flags)
521    }
522
523    fn validate_regex_pattern(
524        &self,
525        pattern: &str,
526        flags: &RegexFlags,
527        start_pos: usize,
528        start_line: usize,
529        start_col: usize,
530    ) -> Result<(), LexError> {
531        let mut builder = regex::RegexBuilder::new(pattern);
532        builder
533            .case_insensitive(flags.case_insensitive)
534            .multi_line(flags.multiline)
535            .dot_matches_new_line(flags.dot_all);
536
537        if let Err(e) = builder.build() {
538            return Err(LexError::InvalidRegex {
539                pattern: pattern.to_string(),
540                error: e.to_string(),
541                span: Span::with_position(start_pos, self.position, start_line, start_col),
542            });
543        }
544
545        Ok(())
546    }
547
548    /// Read hexadecimal digits for Unicode escapes
549    fn read_hex_digits(&mut self, count: usize) -> Result<String, LexError> {
550        let mut hex = String::new();
551
552        for _ in 0..count {
553            match self.next_char() {
554                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
555                Some(c) => {
556                    return Err(LexError::InvalidUnicodeEscape {
557                        got: c,
558                        span: Span::with_position(
559                            self.position - 1,
560                            self.position,
561                            self.line,
562                            self.column.saturating_sub(1),
563                        ),
564                    });
565                }
566                None => {
567                    return Err(LexError::InvalidUnicodeEscape {
568                        got: '?',
569                        span: Span::with_position(
570                            self.position,
571                            self.position,
572                            self.line,
573                            self.column,
574                        ),
575                    });
576                }
577            }
578        }
579
580        Ok(hex)
581    }
582
583    /// Read a number (integer, possibly negative, possibly with underscores)
584    fn read_number(&mut self) -> Result<i64, LexError> {
585        let start_pos = self.position;
586        let start_line = self.line;
587        let start_col = self.column;
588        let mut num_str = String::new();
589
590        // Handle negative sign
591        if self.peek_char() == Some('-') {
592            num_str.push('-');
593            self.next_char();
594        }
595
596        // Read digits (with optional underscores)
597        while let Some(c) = self.peek_char() {
598            if c.is_ascii_digit() {
599                num_str.push(c);
600                self.next_char();
601            } else if c == '_' {
602                // Skip underscores
603                self.next_char();
604            } else {
605                break;
606            }
607        }
608
609        // Parse the number
610        num_str
611            .parse::<i64>()
612            .map_err(|e| LexError::NumberOverflow {
613                text: num_str.clone(),
614                error: e.to_string(),
615                span: Span::with_position(start_pos, self.position, start_line, start_col),
616            })
617    }
618
619    /// Read a word (identifier or keyword).
620    /// Supports characters: [a-zA-Z0-9_.*?/-]+ plus generic segments like `<T,U>`.
621    fn read_word(&mut self) -> String {
622        let mut word = String::new();
623
624        while let Some(c) = self.peek_char() {
625            match self.classify_word_char(c) {
626                WordCharType::Basic => {
627                    word.push(c);
628                    self.next_char();
629                }
630                WordCharType::DoubleColon => {
631                    word.push_str("::");
632                    self.next_char();
633                    self.next_char();
634                }
635                WordCharType::GenericStart => {
636                    self.consume_generic_segment(&mut word);
637                }
638                WordCharType::End => break,
639            }
640        }
641
642        word
643    }
644
645    /// Classify a character for word parsing.
646    fn classify_word_char(&self, c: char) -> WordCharType {
647        if c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '*' | '?' | '/' | '-' | '[' | ']') {
648            WordCharType::Basic
649        } else if c == ':' && self.peek_ahead(1) == Some(':') {
650            WordCharType::DoubleColon
651        } else if c == '<' && self.has_generic_closing_angle() {
652            WordCharType::GenericStart
653        } else {
654            WordCharType::End
655        }
656    }
657
658    /// Consume a generic segment like `<T,U>` into the word buffer.
659    fn consume_generic_segment(&mut self, word: &mut String) {
660        word.push('<');
661        self.next_char();
662
663        let mut depth = 1usize;
664        while let Some(ch) = self.peek_char() {
665            if ch.is_whitespace() {
666                break;
667            }
668            depth = match ch {
669                '<' => depth.saturating_add(1),
670                '>' => depth.saturating_sub(1),
671                _ => depth,
672            };
673            word.push(ch);
674            self.next_char();
675            if depth == 0 {
676                break;
677            }
678        }
679    }
680
681    /// Check if there's a matching closing angle bracket for generics.
682    fn has_generic_closing_angle(&self) -> bool {
683        let mut depth = 0usize;
684
685        for ch in self.input[self.position..].chars() {
686            if ch.is_whitespace() {
687                return false;
688            }
689            match ch {
690                '<' => depth = depth.saturating_add(1),
691                '>' => {
692                    if depth == 0 {
693                        return false;
694                    }
695                    depth = depth.saturating_sub(1);
696                    if depth == 0 {
697                        return true;
698                    }
699                }
700                _ => {}
701            }
702        }
703
704        false
705    }
706}
707
/// Classification of characters for word parsing.
///
/// Returned by `RawLexer::classify_word_char` and acted on by `RawLexer::read_word`.
enum WordCharType {
    /// Basic word character (ASCII alphanumeric or one of `_ . * ? / - [ ]`).
    Basic,
    /// Double colon `::` for namespace separation; both colons are consumed.
    DoubleColon,
    /// Start of a generic segment `<` that has a matching `>` before any whitespace.
    GenericStart,
    /// End of word (not a valid word character).
    End,
}
719
/// Public, non-pooled lexer wrapper over [`RawLexer`], re-exported for the query
/// parser tests and external callers that want a one-shot tokenizer. Production
/// query parsing goes through the thread-local pool via `with_lexer` (see
/// `parser_new`); this wrapper stays for the simpler owned-`Vec<Token>` API.
pub struct Lexer<'a> {
    /// The underlying lexer that does the actual tokenization.
    raw: RawLexer<'a>,
}
727
728impl<'a> Lexer<'a> {
729    /// Create a new lexer for the given input.
730    #[must_use]
731    pub fn new(input: &'a str) -> Self {
732        Self {
733            raw: RawLexer::new(input),
734        }
735    }
736
737    /// Tokenize the entire input into a vector of tokens.
738    ///
739    /// # Errors
740    ///
741    /// Returns [`LexError`] when lexical analysis fails (unterminated strings, invalid regexes, etc.).
742    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
743        let mut tokens = Vec::with_capacity(16);
744        self.raw.restart();
745        self.raw.tokenize_into(&mut tokens)?;
746        Ok(tokens)
747    }
748
749    /// Fetch the next token from the stream (used in parser tests).
750    ///
751    /// # Errors
752    ///
753    /// Returns [`LexError`] when the next token cannot be produced.
754    pub fn next_token(&mut self) -> Result<Token, LexError> {
755        self.raw.next_token()
756    }
757}
758
/// Policy controlling how aggressively reusable lexer buffers shrink.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct ShrinkPolicy {
    /// Capacity limit used by the shrink logic (defaults to 256).
    // NOTE(review): presumably the token-buffer capacity above which a buffer
    // is shrunk - confirm against `ReusableLexer`.
    pub max_capacity: usize,
    /// Ratio used by the shrink logic (defaults to 8).
    // NOTE(review): presumably capacity-to-usage - confirm against `ReusableLexer`.
    pub shrink_ratio: usize,
}

impl Default for ShrinkPolicy {
    /// Built-in policy: `max_capacity` of 256 and `shrink_ratio` of 8.
    fn default() -> Self {
        ShrinkPolicy {
            shrink_ratio: 8,
            max_capacity: 256,
        }
    }
}
773
// Runtime knobs (env-first) controlling the lexer pool.
// Each environment variable, when set to a valid `usize`, overrides the
// corresponding built-in default; invalid values are ignored.

/// Default pool size (`PoolConfig::max_size`) when the environment does not override it.
const POOL_MAX_DEFAULT: usize = 4;
/// Environment variable overriding the pool size; `0` makes `with_lexer` bypass the pool.
const ENV_POOL_MAX: &str = "SQRY_LEXER_POOL_MAX";
/// Environment variable overriding `ShrinkPolicy::max_capacity` (values below 1 are raised to 1).
const ENV_POOL_MAX_CAP: &str = "SQRY_LEXER_POOL_MAX_CAP";
/// Environment variable overriding `ShrinkPolicy::shrink_ratio` (values below 1 are raised to 1).
const ENV_POOL_SHRINK_RATIO: &str = "SQRY_LEXER_POOL_SHRINK_RATIO";
779
/// Settings for the thread-local lexer pool.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct PoolConfig {
    /// Upper bound on pooled lexers handed out at once and on lexers kept in
    /// the stash; `0` disables pooling.
    max_size: usize,
    /// Shrink policy given to every lexer the pool creates.
    shrink_policy: ShrinkPolicy,
}
785
786impl PoolConfig {
787    fn default() -> Self {
788        Self {
789            max_size: POOL_MAX_DEFAULT,
790            shrink_policy: ShrinkPolicy::default(),
791        }
792    }
793
794    fn from_environment() -> Self {
795        let mut config = Self::default();
796
797        if let Some(value) = read_env_usize(ENV_POOL_MAX) {
798            config.max_size = value;
799        }
800
801        if let Some(value) = read_env_usize(ENV_POOL_MAX_CAP) {
802            config.shrink_policy.max_capacity = value.max(1);
803        }
804
805        if let Some(value) = read_env_usize(ENV_POOL_SHRINK_RATIO) {
806            config.shrink_policy.shrink_ratio = value.max(1);
807        }
808
809        config
810    }
811}
812
813fn read_env_usize(var: &str) -> Option<usize> {
814    match env::var(var) {
815        Ok(value) => match value.parse::<usize>() {
816            Ok(parsed) => Some(parsed),
817            Err(err) => {
818                trace!("Ignoring invalid value for {var}: {err}");
819                None
820            }
821        },
822        Err(std::env::VarError::NotPresent) => None,
823        Err(std::env::VarError::NotUnicode(_)) => {
824            trace!("Ignoring non-unicode value for {var}");
825            None
826        }
827    }
828}
829
thread_local! {
    // Per-thread lexer pool. It is created with the built-in default config;
    // `with_lexer` applies the environment-derived config on every call.
    static LEXER_POOL: RefCell<LexerPool> = RefCell::new(LexerPool::new(PoolConfig::default()));
}
833
/// Pool of reusable lexers.
struct LexerPool {
    /// Idle lexers available for reuse.
    stash: Vec<ReusableLexer>,
    /// Number of pooled lexers currently handed out and not yet released.
    in_flight: usize,
    /// Configuration currently in effect.
    config: PoolConfig,
}
839
840impl LexerPool {
841    fn new(config: PoolConfig) -> Self {
842        Self {
843            stash: Vec::new(),
844            in_flight: 0,
845            config,
846        }
847    }
848
849    fn apply_config(&mut self, config: PoolConfig) {
850        if self.config == config {
851            return;
852        }
853
854        trace!(
855            "sqry::query::lexer: updating pool config -> max_size={}, max_capacity={}, shrink_ratio={}",
856            config.max_size, config.shrink_policy.max_capacity, config.shrink_policy.shrink_ratio
857        );
858
859        self.config = config;
860        self.stash.clear();
861        self.in_flight = 0;
862    }
863
864    fn acquire(&mut self) -> LexerHandle {
865        if let Some(lexer) = self.stash.pop() {
866            self.in_flight += 1;
867            return LexerHandle::pooled(lexer);
868        }
869
870        if self.in_flight < self.config.max_size {
871            self.in_flight += 1;
872            let lexer = ReusableLexer::with_policy(self.config.shrink_policy);
873            return LexerHandle::pooled(lexer);
874        }
875
876        LexerHandle::temporary(ReusableLexer::with_policy(self.config.shrink_policy))
877    }
878
879    fn release(&mut self, lexer: ReusableLexer) {
880        if self.config.max_size == 0 {
881            self.in_flight = self.in_flight.saturating_sub(1);
882            return;
883        }
884
885        self.in_flight = self.in_flight.saturating_sub(1);
886        if self.stash.len() < self.config.max_size {
887            self.stash.push(lexer);
888        }
889    }
890
891    #[cfg(test)]
892    fn stats(&self) -> (usize, usize, PoolConfig) {
893        (self.stash.len(), self.in_flight, self.config)
894    }
895
896    #[cfg(test)]
897    fn reset(&mut self, config: PoolConfig) {
898        self.stash.clear();
899        self.in_flight = 0;
900        self.config = config;
901    }
902}
903
/// Guard around a lexer obtained from the pool; dropping it returns a pooled
/// lexer to the thread-local pool.
struct LexerHandle {
    /// The wrapped lexer; taken out (left as `None`) when the handle is dropped.
    lexer: Option<ReusableLexer>,
    /// `true` when the lexer must be released back to the pool on drop.
    pooled: bool,
}
908
909impl LexerHandle {
910    fn pooled(lexer: ReusableLexer) -> Self {
911        Self {
912            lexer: Some(lexer),
913            pooled: true,
914        }
915    }
916
917    fn temporary(lexer: ReusableLexer) -> Self {
918        Self {
919            lexer: Some(lexer),
920            pooled: false,
921        }
922    }
923
924    fn lexer_mut(&mut self) -> &mut ReusableLexer {
925        // SAFETY: self.lexer is always Some during LexerHandle's lifetime.
926        // It only becomes None during Drop (after moving to pool), which occurs
927        // after all user operations complete. This expect() cannot panic during normal use.
928        self.lexer.as_mut().expect("lexer handle missing lexer")
929    }
930
931    fn reset(&mut self, input: &str) {
932        self.lexer_mut().reset(input);
933    }
934
935    fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
936        self.lexer_mut().tokenize()
937    }
938}
939
940impl Drop for LexerHandle {
941    fn drop(&mut self) {
942        if !self.pooled {
943            return;
944        }
945
946        if let Some(lexer) = self.lexer.take() {
947            LEXER_POOL.with(|cell| {
948                cell.borrow_mut().release(lexer);
949            });
950        }
951    }
952}
953
/// Test helper: empty this thread's lexer pool and install the given settings.
#[cfg(test)]
pub(crate) fn configure_pool_for_tests(max_size: usize, shrink_policy: ShrinkPolicy) {
    let config = PoolConfig {
        max_size,
        shrink_policy,
    };
    LEXER_POOL.with(|cell| cell.borrow_mut().reset(config));
}
963
/// Test helper: empty this thread's lexer pool and restore the built-in defaults.
#[cfg(test)]
pub(crate) fn reset_pool_to_default_for_tests() {
    let default_policy = ShrinkPolicy::default();
    configure_pool_for_tests(POOL_MAX_DEFAULT, default_policy);
}
968
/// Test helper: `(stashed lexers, in-flight lexers, configured max size)` for
/// this thread's pool.
#[cfg(test)]
pub(crate) fn pool_stats_for_tests() -> (usize, usize, usize) {
    let (stashed, in_flight, config) = LEXER_POOL.with(|cell| cell.borrow().stats());
    (stashed, in_flight, config.max_size)
}
976
/// Tokenize `input` with a lexer from the thread-local pool and pass the
/// resulting token batch to `f`.
///
/// The pool configuration is re-read from the environment on every call and
/// applied to the pool before a lexer is acquired. When the configured pool
/// size is `0`, the pool is bypassed and a fresh lexer is used for this call.
///
/// # Errors
///
/// Returns the [`LexError`] from tokenization, or whatever error `f` returns.
pub(crate) fn with_lexer<F, T>(input: &str, f: F) -> Result<T, LexError>
where
    F: FnOnce(TokenBatch<'_>) -> Result<T, LexError>,
{
    let config = PoolConfig::from_environment();

    // Pooling disabled: still sync the pool's config, then use a one-off lexer.
    if config.max_size == 0 {
        LEXER_POOL.with(|cell| {
            cell.borrow_mut().apply_config(config);
        });
        let mut lexer = ReusableLexer::with_policy(config.shrink_policy);
        lexer.reset(input);
        let batch = lexer.tokenize()?;
        return f(batch);
    }

    // The pool borrow ends with this closure, so `f` runs without the pool borrowed.
    let mut handle = LEXER_POOL.with(|cell| {
        let mut pool = cell.borrow_mut();
        pool.apply_config(config);
        pool.acquire()
    });

    handle.reset(input);
    // On a tokenization error the handle is dropped by the early return, which
    // still releases a pooled lexer.
    let batch = handle.tokenize()?;
    let result = f(batch);
    // The batch borrows from the handle, so the handle is released only after `f` consumed it.
    drop(handle);
    result
}
1005
1006/// Tokenize using the thread-local lexer pool, returning owned tokens.
1007///
1008/// This is useful for benches and integration points that only need the token
1009/// stream and do not want to work with the internal `TokenBatch` guard.
1010///
1011/// # Errors
1012///
1013/// Returns [`LexError`] when lexical analysis fails.
1014pub fn tokenize_with_pool(input: &str) -> Result<Vec<Token>, LexError> {
1015    with_lexer(input, |batch| Ok(batch.into_vec()))
1016}
1017
/// Counters describing how a reusable lexer's token buffer has been used.
/// Only compiled when debug assertions are enabled.
#[cfg(debug_assertions)]
#[derive(Debug, Default, Clone, Copy)]
struct LexerDiagnostics {
    /// Number of `record_reuse` calls so far.
    reuse_count: usize,
    /// Largest capacity ever passed to `record_reuse`.
    max_capacity_seen: usize,
    /// Number of `record_shrink` calls so far.
    shrink_count: usize,
}

#[cfg(debug_assertions)]
impl LexerDiagnostics {
    /// Count one reuse and remember `capacity` if it is a new maximum.
    fn record_reuse(&mut self, capacity: usize) {
        self.reuse_count += 1;
        self.max_capacity_seen = self.max_capacity_seen.max(capacity);
    }

    /// Count one buffer shrink.
    fn record_shrink(&mut self) {
        self.shrink_count += 1;
    }
}
1039
/// Reusable lexer that owns its input, token buffer, and shrink policy.
pub(crate) struct ReusableLexer {
    /// Owned copy of the query text currently being tokenized.
    input: String,
    /// Token storage reused across `tokenize` calls.
    token_buffer: Vec<Token>,
    /// Policy handed to every `TokenBatch` to decide when to shrink the buffer.
    shrink_policy: ShrinkPolicy,
    /// Reuse/shrink counters; only present when debug assertions are enabled.
    #[cfg(debug_assertions)]
    diagnostics: LexerDiagnostics,
}
1049
1050impl ReusableLexer {
1051    // Only the in-crate test suite constructs a lexer with the default policy;
1052    // the pool always goes through `with_policy`, so gate the lint to non-test
1053    // builds instead of silencing it everywhere.
1054    #[cfg_attr(not(test), allow(dead_code))]
1055    pub fn new() -> Self {
1056        Self::with_policy(ShrinkPolicy::default())
1057    }
1058
1059    pub fn with_policy(shrink_policy: ShrinkPolicy) -> Self {
1060        Self {
1061            input: String::new(),
1062            token_buffer: Vec::with_capacity(16),
1063            shrink_policy,
1064            #[cfg(debug_assertions)]
1065            diagnostics: LexerDiagnostics::default(),
1066        }
1067    }
1068
1069    /// Reset the lexer to a new input string.
1070    pub fn reset(&mut self, input: &str) {
1071        self.input.clear();
1072        self.input.push_str(input);
1073        self.token_buffer.clear();
1074    }
1075
1076    /// Tokenize the current input, returning an RAII guard over the buffer.
1077    pub fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
1078        self.token_buffer.clear();
1079        let mut raw = RawLexer::new(self.input.as_str());
1080        raw.tokenize_into(&mut self.token_buffer)?;
1081        #[cfg(debug_assertions)]
1082        self.diagnostics.record_reuse(self.token_buffer.capacity());
1083        Ok(TokenBatch {
1084            tokens: &mut self.token_buffer,
1085            shrink_policy: self.shrink_policy,
1086            #[cfg(debug_assertions)]
1087            diagnostics: &mut self.diagnostics,
1088        })
1089    }
1090
1091    // Read back by the debug-assertion reuse/shrink tests only; the runtime path
1092    // records into `diagnostics` but never reads it back.
1093    #[cfg(debug_assertions)]
1094    #[cfg_attr(not(test), allow(dead_code))]
1095    fn diagnostics(&self) -> &LexerDiagnostics {
1096        &self.diagnostics
1097    }
1098}
1099
/// RAII guard for the reusable token buffer.
///
/// Provides read-only access via `as_slice()` or transfers ownership via
/// `into_vec()`, draining the reusable buffer without cloning. The guard holds
/// a mutable borrow to the underlying buffer so additional tokenization cannot
/// start until it is dropped. On drop the buffer is cleared and, if the shrink
/// policy deems it oversized, the capacity is reduced.
pub(crate) struct TokenBatch<'a> {
    /// The lexer's token buffer, exclusively borrowed for the guard's lifetime.
    tokens: &'a mut Vec<Token>,
    /// Copy of the lexer's shrink policy, consulted in `Drop`.
    shrink_policy: ShrinkPolicy,
    /// The lexer's debug counters; `Drop` records a shrink here.
    #[cfg(debug_assertions)]
    diagnostics: &'a mut LexerDiagnostics,
}
1113
1114impl TokenBatch<'_> {
1115    // Runtime callers drain the batch via `into_vec`; only the tests borrow the
1116    // buffer read-only, so keep the lint live outside test builds.
1117    #[cfg_attr(not(test), allow(dead_code))]
1118    pub fn as_slice(&self) -> &[Token] {
1119        self.tokens.as_slice()
1120    }
1121
1122    #[allow(unused_mut)]
1123    pub fn into_vec(mut self) -> Vec<Token> {
1124        let result = self.tokens.drain(..).collect();
1125        #[cfg(debug_assertions)]
1126        let _ = &mut *self.diagnostics; // keep diagnostics reference alive for Drop
1127        result
1128    }
1129}
1130
1131impl Drop for TokenBatch<'_> {
1132    fn drop(&mut self) {
1133        if !self.tokens.is_empty() {
1134            self.tokens.clear();
1135        }
1136
1137        let shrink_threshold = self
1138            .shrink_policy
1139            .max_capacity
1140            .saturating_mul(self.shrink_policy.shrink_ratio);
1141        if shrink_threshold > 0 && self.tokens.capacity() > shrink_threshold {
1142            self.tokens.shrink_to(self.shrink_policy.max_capacity);
1143            #[cfg(debug_assertions)]
1144            self.diagnostics.record_shrink();
1145        }
1146    }
1147}
1148
1149#[cfg(test)]
1150mod tests {
1151    use super::*;
1152    use std::panic::{AssertUnwindSafe, catch_unwind};
1153    use std::sync::{Mutex, OnceLock};
1154
1155    #[cfg(feature = "dhat-heap")]
1156    use dhat::{HeapStats, Profiler};
1157
1158    fn reset_pool_from_env() {
1159        let config = PoolConfig::from_environment();
1160        LEXER_POOL.with(|cell| {
1161            cell.borrow_mut().reset(config);
1162        });
1163    }
1164
1165    fn reset_pool_default() {
1166        unsafe {
1167            std::env::remove_var(ENV_POOL_MAX);
1168            std::env::remove_var(ENV_POOL_MAX_CAP);
1169            std::env::remove_var(ENV_POOL_SHRINK_RATIO);
1170        }
1171        reset_pool_from_env();
1172    }
1173
1174    fn set_env(var: &str, value: &str) {
1175        unsafe {
1176            std::env::set_var(var, value);
1177        }
1178    }
1179
1180    fn remove_env(var: &str) {
1181        unsafe {
1182            std::env::remove_var(var);
1183        }
1184    }
1185
1186    fn env_lock() -> &'static Mutex<()> {
1187        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
1188        LOCK.get_or_init(|| Mutex::new(()))
1189    }
1190
1191    #[test]
1192    fn reusable_lexer_reuses_buffer_across_calls() {
1193        let mut lexer = ReusableLexer::new();
1194        lexer.reset("kind:function");
1195
1196        let first_ptr = {
1197            let batch = lexer.tokenize().unwrap();
1198            let ptr = batch.as_slice().as_ptr();
1199            assert!(!batch.as_slice().is_empty());
1200            ptr
1201        };
1202        assert_eq!(first_ptr, lexer.token_buffer.as_ptr());
1203
1204        lexer.reset("name:test");
1205        let second_ptr = {
1206            let batch = lexer.tokenize().unwrap();
1207            let ptr = batch.as_slice().as_ptr();
1208            assert!(!batch.as_slice().is_empty());
1209            ptr
1210        };
1211        assert_eq!(second_ptr, lexer.token_buffer.as_ptr());
1212        assert_eq!(first_ptr, second_ptr);
1213        #[cfg(debug_assertions)]
1214        {
1215            let diagnostics = lexer.diagnostics();
1216            assert!(diagnostics.reuse_count >= 2);
1217            assert!(diagnostics.max_capacity_seen >= lexer.token_buffer.capacity());
1218        }
1219    }
1220
1221    #[test]
1222    fn reusable_lexer_clears_buffer_on_panic() {
1223        let mut lexer = ReusableLexer::new();
1224        lexer.reset("kind:function");
1225
1226        let result = catch_unwind(AssertUnwindSafe(|| {
1227            let _batch = lexer.tokenize().unwrap();
1228            panic!("boom");
1229        }));
1230
1231        assert!(result.is_err());
1232        assert_eq!(lexer.token_buffer.len(), 0);
1233    }
1234
1235    #[test]
1236    fn reusable_lexer_into_vec_drains_tokens() {
1237        let mut lexer = ReusableLexer::new();
1238        lexer.reset("kind:function");
1239
1240        let tokens = {
1241            let batch = lexer.tokenize().unwrap();
1242            batch.into_vec()
1243        };
1244
1245        assert_eq!(tokens.len(), 4);
1246        assert_eq!(lexer.token_buffer.len(), 0);
1247    }
1248
1249    #[test]
1250    fn reusable_lexer_shrink_policy_applies() {
1251        let policy = ShrinkPolicy {
1252            max_capacity: 8,
1253            shrink_ratio: 2,
1254        };
1255
1256        let mut lexer = ReusableLexer::with_policy(policy);
1257        let large_query = (0..128)
1258            .map(|i| format!("name:value{i}"))
1259            .collect::<Vec<_>>()
1260            .join(" ");
1261        lexer.reset(&large_query);
1262
1263        {
1264            let batch = lexer.tokenize().unwrap();
1265            let _ = batch.into_vec();
1266        }
1267
1268        if lexer.token_buffer.capacity() <= policy.max_capacity * policy.shrink_ratio {
1269            lexer
1270                .token_buffer
1271                .reserve(policy.max_capacity * policy.shrink_ratio * 2);
1272        }
1273        assert!(lexer.token_buffer.capacity() > policy.max_capacity * policy.shrink_ratio);
1274
1275        lexer.reset("kind:function");
1276        {
1277            let batch = lexer.tokenize().unwrap();
1278            drop(batch);
1279        }
1280
1281        assert!(lexer.token_buffer.capacity() <= policy.max_capacity);
1282
1283        #[cfg(debug_assertions)]
1284        {
1285            let diagnostics = lexer.diagnostics();
1286            assert!(diagnostics.shrink_count >= 1);
1287        }
1288    }
1289
1290    // The pool env tests below hold the local env_lock() (pool state
1291    // discipline) AND join the crate-wide #[serial_test::serial] lane:
1292    // SQRY_LEXER_POOL_MAX is listed in config::snapshot::CONFIG_INVENTORY,
1293    // whose serial-laned tests remove every inventory var. The two
1294    // exclusion domains don't coordinate, so the mutex alone still
1295    // races those tests (codex iter2 MAJOR).
1296    #[test]
1297    #[serial_test::serial]
1298    fn lexer_pool_returns_lexers_to_stash() {
1299        let _guard = env_lock().lock().unwrap();
1300        reset_pool_default();
1301
1302        assert_eq!(PoolConfig::from_environment().max_size, POOL_MAX_DEFAULT);
1303
1304        let tokens = with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
1305        assert_eq!(tokens.len(), 4);
1306
1307        LEXER_POOL.with(|cell| {
1308            let (stash_len, in_flight, config) = cell.borrow().stats();
1309            assert_eq!(config.max_size, POOL_MAX_DEFAULT);
1310            assert_eq!(in_flight, 0);
1311            assert_eq!(stash_len, 1);
1312        });
1313    }
1314
1315    #[test]
1316    #[serial_test::serial]
1317    fn lexer_pool_respects_zero_capacity_env() {
1318        let _guard = env_lock().lock().unwrap();
1319        set_env(ENV_POOL_MAX, "0");
1320        reset_pool_from_env();
1321
1322        let tokens = with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
1323        assert_eq!(tokens.len(), 4);
1324
1325        LEXER_POOL.with(|cell| {
1326            let (stash_len, in_flight, config) = cell.borrow().stats();
1327            assert_eq!(config.max_size, 0);
1328            assert_eq!(in_flight, 0);
1329            assert_eq!(stash_len, 0);
1330        });
1331
1332        remove_env(ENV_POOL_MAX);
1333        reset_pool_default();
1334    }
1335
1336    #[test]
1337    #[serial_test::serial]
1338    fn lexer_pool_reuses_single_slot() {
1339        let _guard = env_lock().lock().unwrap();
1340        set_env(ENV_POOL_MAX, "1");
1341        reset_pool_from_env();
1342
1343        assert_eq!(PoolConfig::from_environment().max_size, 1);
1344
1345        for query in ["kind:function", "name:test"] {
1346            let _ = with_lexer(query, |batch| Ok(batch.into_vec())).unwrap();
1347        }
1348
1349        LEXER_POOL.with(|cell| {
1350            let (stash_len, in_flight, config) = cell.borrow().stats();
1351            assert_eq!(config.max_size, 1);
1352            assert_eq!(in_flight, 0);
1353            assert_eq!(stash_len, 1);
1354        });
1355
1356        remove_env(ENV_POOL_MAX);
1357        reset_pool_default();
1358    }
1359
1360    #[test]
1361    fn lexer_handles_double_colon_in_words() {
1362        let mut lexer = Lexer::new("callers:Player::takeDamage");
1363        let tokens = lexer.tokenize().unwrap();
1364        assert_eq!(tokens.len(), 4); // Identifier, Colon, Word, Eof
1365        assert_eq!(
1366            tokens[0].token_type,
1367            TokenType::Identifier("callers".to_string())
1368        );
1369        assert!(matches!(tokens[1].token_type, TokenType::Colon));
1370        assert_eq!(
1371            tokens[2].token_type,
1372            TokenType::Word("Player::takeDamage".to_string())
1373        );
1374        assert!(matches!(tokens[3].token_type, TokenType::Eof));
1375    }
1376
1377    #[test]
1378    #[serial_test::serial]
1379    #[ignore = "Test depends on clean env_lock state. Run in isolation with: cargo test -p sqry-core --lib with_lexer_allows_reentrant_usage -- --ignored --test-threads=1"]
1380    fn with_lexer_allows_reentrant_usage() {
1381        let _guard = env_lock().lock().unwrap();
1382        reset_pool_default();
1383
1384        let result = with_lexer("kind:function", |batch| {
1385            assert!(!batch.as_slice().is_empty());
1386            with_lexer("name:test", |inner_batch| {
1387                assert!(!inner_batch.as_slice().is_empty());
1388                Ok(())
1389            })
1390        });
1391
1392        assert!(result.is_ok());
1393        reset_pool_default();
1394    }
1395
1396    #[test]
1397    #[serial_test::serial]
1398    fn lexer_pool_thread_local_isolation() {
1399        let _guard = env_lock().lock().unwrap();
1400        reset_pool_default();
1401
1402        let handles: Vec<_> = (0..4)
1403            .map(|_| {
1404                std::thread::spawn(|| {
1405                    for _ in 0..50 {
1406                        for query in ["kind:function", "name:test", "lang:rust"] {
1407                            with_lexer(query, |batch| {
1408                                assert!(!batch.as_slice().is_empty());
1409                                Ok(batch.into_vec())
1410                            })
1411                            .unwrap();
1412                        }
1413                    }
1414
1415                    let (stash, in_flight, max_size) = crate::query::lexer::pool_stats_for_tests();
1416                    assert!(stash <= max_size);
1417                    assert_eq!(in_flight, 0);
1418                })
1419            })
1420            .collect();
1421
1422        for handle in handles {
1423            handle.join().unwrap();
1424        }
1425
1426        reset_pool_default();
1427    }
1428
1429    #[cfg(feature = "dhat-heap")]
1430    #[test]
1431    #[serial_test::serial]
1432    #[ignore = "Heap profiling test must run in isolation. Run with: cargo test -p sqry-core --lib lexer_reuse_minimizes_heap_allocations -- --ignored --test-threads=1"]
1433    fn lexer_reuse_minimizes_heap_allocations() {
1434        let _guard = env_lock().lock().unwrap();
1435        reset_pool_default();
1436
1437        let profiler = Profiler::new_heap();
1438
1439        for _ in 0..5 {
1440            with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
1441        }
1442
1443        let stats = HeapStats::get();
1444        drop(profiler);
1445        // Threshold adjusted for integration test environment: when running with --tests,
1446        // all plugin crates (dev-dependencies) are loaded, increasing baseline allocations
1447        // from ~28 blocks (unit tests only) to ~58 blocks (with plugins loaded)
1448        assert!(
1449            stats.total_blocks <= 65,
1450            "expected limited allocations, observed {} blocks (threshold accounts for plugin loading in integration tests)",
1451            stats.total_blocks
1452        );
1453
1454        reset_pool_default();
1455    }
1456
1457    #[test]
1458    fn reusable_lexer_capacity_growth_and_retention() {
1459        let mut lexer = ReusableLexer::new();
1460
1461        lexer.reset("kind:function");
1462        {
1463            let batch = lexer.tokenize().unwrap();
1464            assert!(!batch.as_slice().is_empty());
1465        }
1466        let initial_capacity = lexer.token_buffer.capacity();
1467
1468        let large_query = (0..50)
1469            .map(|i| format!("name:value{i}"))
1470            .collect::<Vec<_>>()
1471            .join(" AND ");
1472        lexer.reset(&large_query);
1473        {
1474            let batch = lexer.tokenize().unwrap();
1475            assert!(batch.as_slice().len() > 50);
1476        }
1477        let grown_capacity = lexer.token_buffer.capacity();
1478        assert!(grown_capacity > initial_capacity);
1479
1480        lexer.reset("kind:function");
1481        {
1482            let batch = lexer.tokenize().unwrap();
1483            assert!(!batch.as_slice().is_empty());
1484        }
1485        let retained_capacity = lexer.token_buffer.capacity();
1486        assert_eq!(retained_capacity, grown_capacity);
1487
1488        #[cfg(debug_assertions)]
1489        {
1490            let diagnostics = lexer.diagnostics();
1491            assert!(diagnostics.reuse_count >= 3);
1492            assert!(diagnostics.max_capacity_seen >= grown_capacity);
1493        }
1494    }
1495
1496    #[test]
1497    fn reusable_lexer_error_recovery_clears_buffer() {
1498        let mut lexer = ReusableLexer::new();
1499
1500        lexer.reset("kind:function");
1501        {
1502            let batch = lexer.tokenize().unwrap();
1503            assert!(!batch.as_slice().is_empty());
1504        }
1505
1506        lexer.reset("kind@invalid");
1507        let result = lexer.tokenize();
1508        assert!(result.is_err());
1509        drop(result);
1510
1511        lexer.reset("name:test");
1512        {
1513            let batch = lexer.tokenize().unwrap();
1514            assert!(!batch.as_slice().is_empty());
1515        }
1516    }
1517
1518    #[test]
1519    fn reusable_lexer_panic_after_into_vec_has_clean_buffer() {
1520        let mut lexer = ReusableLexer::new();
1521        lexer.reset("kind:function");
1522
1523        let result = catch_unwind(AssertUnwindSafe(|| {
1524            let batch = lexer.tokenize().unwrap();
1525            let _tokens = batch.into_vec();
1526            panic!("boom");
1527        }));
1528
1529        assert!(result.is_err());
1530        assert_eq!(lexer.token_buffer.len(), 0);
1531    }
1532
1533    #[test]
1534    fn test_tokenize_simple_query() {
1535        let mut lexer = Lexer::new("kind:function");
1536        let tokens = lexer.tokenize().unwrap();
1537
1538        assert_eq!(tokens.len(), 4);
1539        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "kind"));
1540        assert!(matches!(tokens[1].token_type, TokenType::Colon));
1541        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "function"));
1542        assert!(matches!(tokens[3].token_type, TokenType::Eof));
1543    }
1544
1545    #[test]
1546    fn test_tokenize_generic_type_value() {
1547        let mut lexer = Lexer::new("returns:Optional<User>");
1548        let tokens = lexer.tokenize().unwrap();
1549
1550        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "returns"));
1551        assert!(matches!(tokens[1].token_type, TokenType::Colon));
1552        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "Optional<User>"));
1553        assert!(matches!(tokens[3].token_type, TokenType::Eof));
1554    }
1555
1556    #[test]
1557    fn test_tokenize_nested_generic_value() {
1558        let mut lexer = Lexer::new("returns:Map<String,List<Order>>");
1559        let tokens = lexer.tokenize().unwrap();
1560
1561        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "returns"));
1562        assert!(matches!(tokens[1].token_type, TokenType::Colon));
1563        assert!(
1564            matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "Map<String,List<Order>>")
1565        );
1566        assert!(matches!(tokens[3].token_type, TokenType::Eof));
1567    }
1568
1569    #[test]
1570    fn test_tokenize_numeric_comparison_after_identifier() {
1571        let mut lexer = Lexer::new("line>10");
1572        let tokens = lexer.tokenize().unwrap();
1573
1574        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "line"));
1575        assert!(matches!(tokens[1].token_type, TokenType::Greater));
1576        assert!(matches!(tokens[2].token_type, TokenType::NumberLiteral(10)));
1577        assert!(matches!(tokens[3].token_type, TokenType::Eof));
1578    }
1579
1580    #[test]
1581    fn test_tokenize_keywords_case_insensitive() {
1582        let mut lexer = Lexer::new("AND and Or NOT not");
1583        let tokens = lexer.tokenize().unwrap();
1584
1585        assert!(matches!(tokens[0].token_type, TokenType::And));
1586        assert!(matches!(tokens[1].token_type, TokenType::And));
1587        assert!(matches!(tokens[2].token_type, TokenType::Or));
1588        assert!(matches!(tokens[3].token_type, TokenType::Not));
1589        assert!(matches!(tokens[4].token_type, TokenType::Not));
1590    }
1591
1592    #[test]
1593    fn test_tokenize_operators() {
1594        let mut lexer = Lexer::new(": ~= > < >= <=");
1595        let tokens = lexer.tokenize().unwrap();
1596
1597        assert!(matches!(tokens[0].token_type, TokenType::Colon));
1598        assert!(matches!(tokens[1].token_type, TokenType::RegexOp));
1599        assert!(matches!(tokens[2].token_type, TokenType::Greater));
1600        assert!(matches!(tokens[3].token_type, TokenType::Less));
1601        assert!(matches!(tokens[4].token_type, TokenType::GreaterEq));
1602        assert!(matches!(tokens[5].token_type, TokenType::LessEq));
1603    }
1604
1605    #[test]
1606    fn test_tokenize_parentheses() {
1607        let mut lexer = Lexer::new("( )");
1608        let tokens = lexer.tokenize().unwrap();
1609
1610        assert!(matches!(tokens[0].token_type, TokenType::LParen));
1611        assert!(matches!(tokens[1].token_type, TokenType::RParen));
1612    }
1613
1614    #[test]
1615    fn test_tokenize_double_quoted_string() {
1616        let mut lexer = Lexer::new(r#"name:"hello world""#);
1617        let tokens = lexer.tokenize().unwrap();
1618
1619        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "name"));
1620        assert!(matches!(tokens[1].token_type, TokenType::Colon));
1621        assert!(
1622            matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s == "hello world")
1623        );
1624    }
1625
1626    #[test]
1627    fn test_tokenize_single_quoted_string() {
1628        let mut lexer = Lexer::new(r"name:'hello world'");
1629        let tokens = lexer.tokenize().unwrap();
1630
1631        assert!(
1632            matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s == "hello world")
1633        );
1634    }
1635
1636    #[test]
1637    fn test_string_escape_sequences() {
1638        let mut lexer = Lexer::new(r#""line1\nline2\ttab\"quote\\backslash""#);
1639        let tokens = lexer.tokenize().unwrap();
1640
1641        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
1642            assert_eq!(s, "line1\nline2\ttab\"quote\\backslash");
1643        } else {
1644            panic!("Expected string literal");
1645        }
1646    }
1647
1648    #[test]
1649    fn test_unicode_escape() {
1650        let mut lexer = Lexer::new(r#""\u0041BC""#);
1651        let tokens = lexer.tokenize().unwrap();
1652
1653        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
1654            assert_eq!(s, "ABC");
1655        } else {
1656            panic!("Expected string literal");
1657        }
1658    }
1659
1660    #[test]
1661    fn test_unterminated_string() {
1662        let mut lexer = Lexer::new(r#"name:"unclosed"#);
1663        let result = lexer.tokenize();
1664        assert!(matches!(result, Err(LexError::UnterminatedString { .. })));
1665    }
1666
1667    #[test]
1668    fn test_invalid_escape() {
1669        let mut lexer = Lexer::new(r#""\x""#);
1670        let result = lexer.tokenize();
1671        assert!(matches!(
1672            result,
1673            Err(LexError::InvalidEscape { char: 'x', .. })
1674        ));
1675    }
1676
1677    #[test]
1678    fn test_glob_metacharacter_escape_sequences() {
1679        // Test that glob metacharacters can be escaped in quoted strings
1680        // This is required for path: predicates with literal glob chars
1681        let mut lexer = Lexer::new(r#""src/\[test\]/\*\?file\{a,b\}""#);
1682        let tokens = lexer.tokenize().unwrap();
1683
1684        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
1685            assert_eq!(s, "src/[test]/*?file{a,b}");
1686        } else {
1687            panic!("Expected string literal, got {:?}", tokens[0].token_type);
1688        }
1689    }
1690
1691    #[test]
1692    fn test_path_predicate_with_escaped_glob_chars() {
1693        // Simulate a path: predicate with escaped glob characters
1694        let mut lexer = Lexer::new(r#"path:"src/\[test\]/**""#);
1695        let tokens = lexer.tokenize().unwrap();
1696
1697        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "path"));
1698        assert!(matches!(tokens[1].token_type, TokenType::Colon));
1699        if let TokenType::StringLiteral(s) = &tokens[2].token_type {
1700            // The escaped brackets become literal brackets, ** remains as-is
1701            assert_eq!(s, "src/[test]/**");
1702        } else {
1703            panic!("Expected string literal");
1704        }
1705    }
1706
1707    #[test]
1708    fn test_tokenize_regex() {
1709        let mut lexer = Lexer::new(r"name~=/^test_/i");
1710        let tokens = lexer.tokenize().unwrap();
1711
1712        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "name"));
1713        assert!(matches!(tokens[1].token_type, TokenType::RegexOp));
1714
1715        if let TokenType::RegexLiteral { pattern, flags } = &tokens[2].token_type {
1716            assert_eq!(pattern, "^test_");
1717            assert!(flags.case_insensitive);
1718            assert!(!flags.multiline);
1719            assert!(!flags.dot_all);
1720        } else {
1721            panic!("Expected regex literal");
1722        }
1723    }
1724
1725    #[test]
1726    fn test_regex_multiple_flags() {
1727        let mut lexer = Lexer::new(r"/pattern/ims");
1728        let tokens = lexer.tokenize().unwrap();
1729
1730        if let TokenType::RegexLiteral { flags, .. } = &tokens[0].token_type {
1731            assert!(flags.case_insensitive);
1732            assert!(flags.multiline);
1733            assert!(flags.dot_all);
1734        } else {
1735            panic!("Expected regex literal");
1736        }
1737    }
1738
1739    #[test]
1740    fn test_regex_escaped_slash() {
1741        let mut lexer = Lexer::new(r"/path\/to\/file/");
1742        let tokens = lexer.tokenize().unwrap();
1743
1744        if let TokenType::RegexLiteral { pattern, .. } = &tokens[0].token_type {
1745            assert_eq!(pattern, r"path\/to\/file");
1746        } else {
1747            panic!("Expected regex literal");
1748        }
1749    }
1750
1751    #[test]
1752    fn test_regex_escaped_backslash_then_slash() {
1753        // Input raw string r"/a\\\\/": This contains /a\\\\/
1754        // In the raw string, \\\\ is 4 literal backslash characters
1755        // So pattern before / is: a\\\\
1756        // Trailing backslashes: 4 (even number)
1757        // Even count means slash is NOT escaped, so pattern ends
1758        // Result pattern should be: a\\\\ (4 backslashes)
1759        let mut lexer = Lexer::new(r"/a\\\\/");
1760        let token = lexer.next_token().unwrap();
1761        match token.token_type {
1762            TokenType::RegexLiteral { pattern, .. } => {
1763                assert_eq!(pattern, r"a\\\\"); // 4 backslashes
1764            }
1765            _ => panic!("Expected RegexLiteral"),
1766        }
1767    }
1768
1769    #[test]
1770    fn test_regex_single_escaped_slash() {
1771        let mut lexer = Lexer::new(r"/a\/b/"); // Pattern: a/b with escaped slash
1772        let token = lexer.next_token().unwrap();
1773        match token.token_type {
1774            TokenType::RegexLiteral { pattern, .. } => {
1775                assert_eq!(pattern, r"a\/b");
1776            }
1777            _ => panic!("Expected RegexLiteral"),
1778        }
1779    }
1780
1781    #[test]
1782    fn test_unterminated_regex() {
1783        let mut lexer = Lexer::new(r"/unclosed");
1784        let result = lexer.tokenize();
1785        assert!(matches!(result, Err(LexError::UnterminatedRegex { .. })));
1786    }
1787
1788    #[test]
1789    fn test_invalid_regex_pattern() {
1790        let mut lexer = Lexer::new(r"/^[/");
1791        let result = lexer.tokenize();
1792        assert!(matches!(result, Err(LexError::InvalidRegex { .. })));
1793    }
1794
1795    #[test]
1796    fn test_regex_unknown_flag() {
1797        let mut lexer = Lexer::new("/pattern/x");
1798        let err = lexer.next_token().unwrap_err();
1799        match err {
1800            LexError::InvalidRegex { error, .. } => {
1801                assert!(error.contains("Unknown regex flag"));
1802            }
1803            _ => panic!("Expected InvalidRegex error"),
1804        }
1805    }
1806
1807    #[test]
1808    fn test_tokenize_positive_number() {
1809        let mut lexer = Lexer::new("lines:42");
1810        let tokens = lexer.tokenize().unwrap();
1811
1812        assert!(matches!(tokens[2].token_type, TokenType::NumberLiteral(42)));
1813    }
1814
1815    #[test]
1816    fn test_tokenize_negative_number() {
1817        let mut lexer = Lexer::new("lines:-42");
1818        let tokens = lexer.tokenize().unwrap();
1819
1820        assert!(matches!(
1821            tokens[2].token_type,
1822            TokenType::NumberLiteral(-42)
1823        ));
1824    }
1825
1826    #[test]
1827    fn test_tokenize_number_with_underscores() {
1828        let mut lexer = Lexer::new("lines:1_000_000");
1829        let tokens = lexer.tokenize().unwrap();
1830
1831        assert!(matches!(
1832            tokens[2].token_type,
1833            TokenType::NumberLiteral(1_000_000)
1834        ));
1835    }
1836
1837    #[test]
1838    fn test_number_overflow() {
1839        let mut lexer = Lexer::new("lines:99999999999999999999");
1840        let result = lexer.tokenize();
1841        assert!(matches!(result, Err(LexError::NumberOverflow { .. })));
1842    }
1843
1844    #[test]
1845    fn test_tokenize_boolean_true() {
1846        let mut lexer = Lexer::new("async:true");
1847        let tokens = lexer.tokenize().unwrap();
1848
1849        assert!(matches!(
1850            tokens[2].token_type,
1851            TokenType::BooleanLiteral(true)
1852        ));
1853    }
1854
/// Lexing `async:FALSE` yields `BooleanLiteral(false)`: the upper-case
/// spelling is accepted as a boolean.
#[test]
fn test_tokenize_boolean_false() {
    let mut lx = Lexer::new("async:FALSE");
    let toks = lx.tokenize().unwrap();

    let value = &toks[2].token_type;
    assert!(matches!(value, TokenType::BooleanLiteral(false)));
}
1865
/// Tokenizes a query that combines field filters, `AND`/`OR`, a boolean
/// value, and a regex match, then checks the kind of every token up to and
/// including the final `Eof`.
#[test]
fn test_tokenize_complex_query() {
    let query = r"kind:function AND async:true OR name~=/^test_/i";
    let mut lx = Lexer::new(query);
    let toks = lx.tokenize().unwrap();

    // Project the stream down to token kinds so each check reads as one line.
    let kinds: Vec<&TokenType> = toks.iter().map(|tok| &tok.token_type).collect();

    // kind:function
    assert!(matches!(kinds[0], TokenType::Identifier(_)));
    assert!(matches!(kinds[1], TokenType::Colon));
    assert!(matches!(kinds[2], TokenType::Word(_)));

    // AND async:true
    assert!(matches!(kinds[3], TokenType::And));
    assert!(matches!(kinds[4], TokenType::Identifier(_)));
    assert!(matches!(kinds[5], TokenType::Colon));
    assert!(matches!(kinds[6], TokenType::BooleanLiteral(true)));

    // OR name~=/^test_/i
    assert!(matches!(kinds[7], TokenType::Or));
    assert!(matches!(kinds[8], TokenType::Identifier(_)));
    assert!(matches!(kinds[9], TokenType::RegexOp));
    assert!(matches!(kinds[10], TokenType::RegexLiteral { .. }));

    // End of input
    assert!(matches!(kinds[11], TokenType::Eof));
}
1891
/// Leading, trailing, and interior runs of spaces add no tokens to the
/// stream.
#[test]
fn test_whitespace_handling() {
    let padded = "  kind  :  function  ";
    let mut lx = Lexer::new(padded);
    let toks = lx.tokenize().unwrap();

    // Expected stream: kind, :, function, EOF
    let produced = toks.len();
    assert_eq!(produced, 4);
}
1899
/// The `@` in `kind@function` makes `tokenize` fail with
/// `LexError::UnexpectedChar` naming that character.
#[test]
fn test_unexpected_character() {
    let mut lx = Lexer::new("kind@function");

    assert!(matches!(
        lx.tokenize(),
        Err(LexError::UnexpectedChar { char: '@', .. })
    ));
}
1909
/// An empty pair of double quotes after `:` lexes to a `StringLiteral`
/// holding the empty string.
#[test]
fn test_empty_string_literal() {
    let mut lx = Lexer::new(r#"name:"""#);
    let toks = lx.tokenize().unwrap();

    let expected = TokenType::StringLiteral(String::new());
    assert_eq!(toks[2].token_type, expected);
}
1917
/// `//` after the regex operator lexes to a `RegexLiteral` whose pattern is
/// the empty string.
#[test]
fn test_empty_regex_literal() {
    let mut lx = Lexer::new(r"name~=//");
    let toks = lx.tokenize().unwrap();

    match &toks[2].token_type {
        TokenType::RegexLiteral { pattern, .. } => assert_eq!(pattern, ""),
        _ => panic!("Expected regex literal"),
    }
}
1929
/// Checks the span recorded for each token of `kind:function`.
///
/// Each comparison uses `assert_eq!` rather than `assert!(a == b)` so that a
/// regression reports both the expected and the actual offset instead of
/// only the failed expression text.
#[test]
fn test_span_tracking() {
    let mut lexer = Lexer::new("kind:function");
    let tokens = lexer.tokenize().unwrap();

    // "kind" starts at 0 and ends at 4.
    assert_eq!(tokens[0].span.start, 0);
    assert_eq!(tokens[0].span.end, 4);

    // ":" starts where "kind" ended.
    assert_eq!(tokens[1].span.start, 4);
    assert_eq!(tokens[1].span.end, 5);

    // "function" runs to the end of the 13-character input.
    assert_eq!(tokens[2].span.start, 5);
    assert_eq!(tokens[2].span.end, 13);
}
1943
/// In `kind:value value`, both occurrences of `value` lex to
/// `Word("value")`: the one after `:` and the standalone one.
#[test]
fn test_identifier_vs_word() {
    let mut lx = Lexer::new("kind:value value");
    let toks = lx.tokenize().unwrap();

    let is_value_word =
        |kind: &TokenType| matches!(kind, TokenType::Word(text) if text == "value");

    // First 'value', directly after ':'.
    assert!(is_value_word(&toks[2].token_type));
    // Second 'value', standing alone.
    assert!(is_value_word(&toks[3].token_type));
}
1954
/// A bare value containing `/`, `*`, and `.` is kept together as one
/// `Word` token.
#[test]
fn test_bare_word_with_glob() {
    let mut lx = Lexer::new("path:src/*.rs");

    // Discard the `path` identifier and the `:` operator.
    for _ in 0..2 {
        lx.next_token().unwrap();
    }

    let value = lx.next_token().unwrap().token_type;
    if let TokenType::Word(ref text) = value {
        assert_eq!(text, "src/*.rs");
    } else {
        panic!("Expected Word with glob pattern, got {:?}", value);
    }
}
1969
/// A bare value containing a hyphen is kept together as one `Word` token.
#[test]
fn test_bare_word_with_hyphen() {
    let mut lx = Lexer::new("name:foo-bar");

    // Discard the `name` identifier and the `:` operator.
    for _ in 0..2 {
        lx.next_token().unwrap();
    }

    let value = lx.next_token().unwrap().token_type;
    if let TokenType::Word(ref text) = value {
        assert_eq!(text, "foo-bar");
    } else {
        panic!("Expected Word with hyphen, got {:?}", value);
    }
}
1981
/// A bare value containing a dot is kept together as one `Word` token.
#[test]
fn test_bare_word_with_dot() {
    let mut lx = Lexer::new("path:foo.rs");

    // Discard the `path` identifier and the `:` operator.
    for _ in 0..2 {
        lx.next_token().unwrap();
    }

    let value = lx.next_token().unwrap().token_type;
    if let TokenType::Word(ref text) = value {
        assert_eq!(text, "foo.rs");
    } else {
        panic!("Expected Word with dot, got {:?}", value);
    }
}
1993
/// `$name` lexes to `Variable("name")` followed by `Eof`; the `$` sigil is
/// not part of the stored name.
#[test]
fn test_variable_token() {
    let mut lx = Lexer::new("$name");
    let toks = lx.tokenize().unwrap();
    let expected = TokenType::Variable(String::from("name"));

    // Stream is exactly: Variable, Eof
    assert_eq!(toks.len(), 2);
    assert_eq!(toks[0].token_type, expected);
    assert!(matches!(&toks[1].token_type, TokenType::Eof));
}
2005
/// `$my_var` lexes to `Variable("my_var")`: underscores are part of the
/// variable name.
#[test]
fn test_variable_token_with_underscores() {
    let mut lx = Lexer::new("$my_var");
    let toks = lx.tokenize().unwrap();
    let expected = TokenType::Variable(String::from("my_var"));

    // Stream is exactly: Variable, Eof
    assert_eq!(toks.len(), 2);
    assert_eq!(toks[0].token_type, expected);
}
2016
/// A lone `|` lexes to a `Pipe` token followed by `Eof`.
#[test]
fn test_pipe_token() {
    let mut lx = Lexer::new("|");
    let toks = lx.tokenize().unwrap();

    // Stream is exactly: Pipe, Eof
    assert_eq!(toks.len(), 2);
    let first = &toks[0].token_type;
    assert!(matches!(first, TokenType::Pipe));
}
2024
/// A `$` with no name after it is rejected with `LexError::UnexpectedChar`
/// reporting the `$` itself.
#[test]
fn test_dollar_sign_alone_error() {
    let mut lx = Lexer::new("$ ");
    let result = lx.tokenize();

    let rejected_dollar = matches!(result, Err(LexError::UnexpectedChar { char: '$', .. }));
    assert!(
        rejected_dollar,
        "Bare '$' should produce an error, got: {result:?}"
    );
}
2034}
sqry_core/query/lexer.rs

sqry_core/query/
lexer.rs