1use crate::query::error::LexError;
8use crate::query::types::{RegexFlags, Span};
9use log::trace;
10use std::cell::RefCell;
11use std::env;
12use std::str::Chars;
13use std::thread_local;
14
// Installs dhat's allocator as the global allocator, but only for test builds
// compiled with the `dhat-heap` feature (used by the heap-profiling test below).
#[cfg(all(test, feature = "dhat-heap"))]
#[global_allocator]
static DHAT_ALLOC: dhat::Alloc = dhat::Alloc;
18
/// Kinds of tokens produced by the query lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
    /// The `AND` keyword (matched case-insensitively).
    And,
    /// The `OR` keyword (matched case-insensitively).
    Or,
    /// The `NOT` keyword (matched case-insensitively).
    Not,

    /// The `:` separator.
    Colon,
    /// The `~=` regex-match operator.
    RegexOp,
    /// The `>` operator.
    Greater,
    /// The `<` operator.
    Less,
    /// The `>=` operator.
    GreaterEq,
    /// The `<=` operator.
    LessEq,
    /// The `|` character.
    Pipe,

    /// `(`.
    LParen,
    /// `)`.
    RParen,

    /// A bare word whose next non-whitespace character is `:`, `~`, `>` or `<`.
    Identifier(String),
    /// Contents of a single- or double-quoted string with escapes resolved.
    StringLiteral(String),
    /// A `/pattern/flags` regex literal.
    RegexLiteral {
        /// Text between the slashes; an escaped `\/` is kept verbatim.
        pattern: String,
        /// Flags collected from the trailing `i`, `m` and `s` letters.
        flags: RegexFlags,
    },
    /// Integer literal; an optional leading `-` is allowed and `_` separators are dropped.
    NumberLiteral(i64),
    /// The `true` / `false` keywords (matched case-insensitively).
    BooleanLiteral(bool),
    /// Any other bare word.
    Word(String),
    /// A `$name` variable reference; the payload does not include the `$`.
    Variable(String),

    /// End of input.
    Eof,
}
77
78#[derive(Debug, Clone, PartialEq)]
80pub struct Token {
81 pub token_type: TokenType,
83 pub span: Span,
85}
86
87impl Token {
88 #[must_use]
90 pub fn new(token_type: TokenType, span: Span) -> Self {
91 Self { token_type, span }
92 }
93}
94
/// Character-level lexer over a borrowed input string.
pub(crate) struct RawLexer<'a> {
    /// The full input; kept so the lexer can restart and look ahead by slicing.
    input: &'a str,
    /// Iterator over the characters that have not been pulled yet.
    chars: Chars<'a>,
    /// Byte offset of the next unconsumed character (advanced by `len_utf8`).
    position: usize,
    /// Current line, starting at 1 and bumped on every consumed `'\n'`.
    line: usize,
    /// Current column, starting at 1 and counted in characters.
    column: usize,
    /// One-character lookahead buffer filled by `peek_char`.
    peeked: Option<char>,
}
110
111impl<'a> RawLexer<'a> {
112 pub fn new(input: &'a str) -> Self {
114 Self {
115 input,
116 chars: input.chars(),
117 position: 0,
118 line: 1,
119 column: 1,
120 peeked: None,
121 }
122 }
123
124 pub fn restart(&mut self) {
126 self.chars = self.input.chars();
127 self.position = 0;
128 self.line = 1;
129 self.column = 1;
130 self.peeked = None;
131 }
132
133 pub fn tokenize_into(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexError> {
135 loop {
136 let token = self.next_token()?;
137 let is_eof = matches!(token.token_type, TokenType::Eof);
138 tokens.push(token);
139
140 if is_eof {
141 break;
142 }
143 }
144
145 Ok(())
146 }
147
148 #[allow(clippy::too_many_lines)] pub fn next_token(&mut self) -> Result<Token, LexError> {
151 self.skip_whitespace();
152
153 let start_pos = self.position;
154 let start_line = self.line;
155 let start_col = self.column;
156
157 let Some(ch) = self.peek_char() else {
158 return Ok(Token::new(
159 TokenType::Eof,
160 Span::with_position(self.position, self.position, self.line, self.column),
161 ));
162 };
163
164 let token_type = if let Some(token) = self.read_simple_token(ch) {
165 token
166 } else if ch == '$' {
167 self.read_variable_token(start_pos, start_line, start_col)?
168 } else if ch == '~' {
169 self.read_regex_operator(start_pos, start_line, start_col)?
170 } else if ch == '>' || ch == '<' {
171 self.read_comparison_operator(ch)
172 } else if ch == '"' || ch == '\'' {
173 let s = self.read_quoted_string(ch)?;
174 TokenType::StringLiteral(s)
175 } else if ch == '/' {
176 let (pattern, flags) = self.read_regex()?;
177 TokenType::RegexLiteral { pattern, flags }
178 } else if self.is_number_start(ch) {
179 let n = self.read_number()?;
180 TokenType::NumberLiteral(n)
181 } else if Self::is_word_start(ch) {
182 self.read_word_token()
183 } else {
184 return Err(LexError::UnexpectedChar {
185 char: ch,
186 span: Span::with_position(
187 start_pos,
188 start_pos + ch.len_utf8(),
189 start_line,
190 start_col,
191 ),
192 });
193 };
194
195 Ok(Token::new(
196 token_type,
197 Span::with_position(start_pos, self.position, start_line, start_col),
198 ))
199 }
200
201 fn read_simple_token(&mut self, ch: char) -> Option<TokenType> {
202 let token = match ch {
203 '(' => TokenType::LParen,
204 ')' => TokenType::RParen,
205 ':' => TokenType::Colon,
206 '|' => TokenType::Pipe,
207 _ => return None,
208 };
209 self.next_char();
210 Some(token)
211 }
212
213 fn read_regex_operator(
214 &mut self,
215 start_pos: usize,
216 start_line: usize,
217 start_col: usize,
218 ) -> Result<TokenType, LexError> {
219 self.next_char();
220 if self.peek_char() == Some('=') {
221 self.next_char();
222 Ok(TokenType::RegexOp)
223 } else {
224 Err(LexError::UnexpectedChar {
225 char: '~',
226 span: Span::with_position(start_pos, self.position, start_line, start_col),
227 })
228 }
229 }
230
231 fn read_comparison_operator(&mut self, ch: char) -> TokenType {
232 self.next_char();
233 let (equal, plain) = if ch == '>' {
234 (TokenType::GreaterEq, TokenType::Greater)
235 } else {
236 (TokenType::LessEq, TokenType::Less)
237 };
238 if self.peek_char() == Some('=') {
239 self.next_char();
240 equal
241 } else {
242 plain
243 }
244 }
245
246 fn is_number_start(&self, ch: char) -> bool {
247 ch.is_ascii_digit() || (ch == '-' && self.peek_ahead(1).is_some_and(|c| c.is_ascii_digit()))
248 }
249
/// Returns `true` if `ch` may begin a word: an ASCII letter or `_`.
fn is_word_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
253
254 fn read_variable_token(
256 &mut self,
257 start_pos: usize,
258 start_line: usize,
259 start_col: usize,
260 ) -> Result<TokenType, LexError> {
261 self.next_char(); let mut name = String::new();
265 while let Some(c) = self.peek_char() {
266 if c.is_ascii_alphanumeric() || c == '_' {
267 name.push(c);
268 self.next_char();
269 } else {
270 break;
271 }
272 }
273
274 if name.is_empty() {
275 return Err(LexError::UnexpectedChar {
276 char: '$',
277 span: Span::with_position(start_pos, self.position, start_line, start_col),
278 });
279 }
280
281 Ok(TokenType::Variable(name))
282 }
283
284 fn read_word_token(&mut self) -> TokenType {
285 let word = self.read_word();
286 match word.to_uppercase().as_str() {
287 "AND" => TokenType::And,
288 "OR" => TokenType::Or,
289 "NOT" => TokenType::Not,
290 "TRUE" => TokenType::BooleanLiteral(true),
291 "FALSE" => TokenType::BooleanLiteral(false),
292 _ => {
293 self.skip_whitespace();
294 match self.peek_char() {
295 Some(':' | '~' | '>' | '<') => TokenType::Identifier(word),
296 _ => TokenType::Word(word),
297 }
298 }
299 }
300 }
301
302 fn peek_char(&mut self) -> Option<char> {
304 if self.peeked.is_none() {
305 self.peeked = self.chars.next();
306 }
307 self.peeked
308 }
309
310 fn peek_ahead(&self, n: usize) -> Option<char> {
312 self.input[self.position..].chars().nth(n)
313 }
314
315 fn next_char(&mut self) -> Option<char> {
317 let ch = if let Some(c) = self.peeked.take() {
318 Some(c)
319 } else {
320 self.chars.next()
321 };
322
323 if let Some(c) = ch {
324 self.position += c.len_utf8();
325 if c == '\n' {
326 self.line += 1;
327 self.column = 1;
328 } else {
329 self.column += 1;
330 }
331 }
332
333 ch
334 }
335
336 fn skip_whitespace(&mut self) {
338 while let Some(c) = self.peek_char() {
339 if c.is_whitespace() {
340 self.next_char();
341 } else {
342 break;
343 }
344 }
345 }
346
347 fn read_quoted_string(&mut self, quote: char) -> Result<String, LexError> {
349 let start_pos = self.position;
350 let start_line = self.line;
351 let start_col = self.column;
352 self.next_char(); let mut result = String::new();
355
356 loop {
357 match self.next_char() {
358 Some(c) if c == quote => {
359 return Ok(result);
361 }
362 Some('\\') => {
363 let escaped = self.read_escape_sequence(start_pos, start_line, start_col)?;
364 result.push(escaped);
365 }
366 Some(c) => result.push(c),
367 None => {
368 return Err(LexError::UnterminatedString {
369 span: Span::with_position(start_pos, self.position, start_line, start_col),
370 });
371 }
372 }
373 }
374 }
375
376 fn read_escape_sequence(
377 &mut self,
378 start_pos: usize,
379 start_line: usize,
380 start_col: usize,
381 ) -> Result<char, LexError> {
382 match self.next_char() {
383 Some('"') => Ok('"'),
384 Some('\'') => Ok('\''),
385 Some('\\') => Ok('\\'),
386 Some('n') => Ok('\n'),
387 Some('t') => Ok('\t'),
388 Some('r') => Ok('\r'),
389 Some('u') => self.read_unicode_escape(),
390 Some('*') => Ok('*'),
392 Some('?') => Ok('?'),
393 Some('[') => Ok('['),
394 Some(']') => Ok(']'),
395 Some('{') => Ok('{'),
396 Some('}') => Ok('}'),
397 Some(c) => Err(LexError::InvalidEscape {
398 char: c,
399 span: Span::with_position(self.position - 2, self.position, self.line, self.column),
400 }),
401 None => Err(LexError::UnterminatedString {
402 span: Span::with_position(start_pos, self.position, start_line, start_col),
403 }),
404 }
405 }
406
407 fn read_unicode_escape(&mut self) -> Result<char, LexError> {
408 let hex = self.read_hex_digits(4)?;
410 let code_point =
411 u32::from_str_radix(&hex, 16).map_err(|_| LexError::InvalidUnicodeEscape {
412 got: hex.chars().next().unwrap_or('?'),
413 span: Span::with_position(
414 self.position - hex.len() - 2,
415 self.position,
416 self.line,
417 self.column,
418 ),
419 })?;
420 let ch = char::from_u32(code_point).ok_or_else(|| LexError::InvalidUnicodeEscape {
421 got: hex.chars().next().unwrap_or('?'),
422 span: Span::with_position(
423 self.position - hex.len() - 2,
424 self.position,
425 self.line,
426 self.column,
427 ),
428 })?;
429 Ok(ch)
430 }
431
432 fn read_regex(&mut self) -> Result<(String, RegexFlags), LexError> {
434 let start_pos = self.position;
435 let start_line = self.line;
436 let start_col = self.column;
437 self.next_char(); let pattern = self.read_regex_pattern(start_pos, start_line, start_col)?;
440 let flags = self.read_regex_flags(start_pos, start_line, start_col, &pattern)?;
441 self.validate_regex_pattern(&pattern, &flags, start_pos, start_line, start_col)?;
442 Ok((pattern, flags))
443 }
444
445 fn read_regex_pattern(
446 &mut self,
447 start_pos: usize,
448 start_line: usize,
449 start_col: usize,
450 ) -> Result<String, LexError> {
451 let mut pattern = String::new();
452
453 loop {
455 match self.next_char() {
456 Some('/') => {
457 let trailing_backslashes =
459 pattern.chars().rev().take_while(|&c| c == '\\').count();
460
461 if trailing_backslashes % 2 == 1 {
462 pattern.push('/');
464 continue;
465 }
466 break;
468 }
469 Some(c) => pattern.push(c),
470 None => {
471 return Err(LexError::UnterminatedRegex {
472 span: Span::with_position(start_pos, self.position, start_line, start_col),
473 });
474 }
475 }
476 }
477
478 Ok(pattern)
479 }
480
481 fn read_regex_flags(
482 &mut self,
483 start_pos: usize,
484 start_line: usize,
485 start_col: usize,
486 pattern: &str,
487 ) -> Result<RegexFlags, LexError> {
488 let mut flags = RegexFlags::default();
489 while let Some(ch) = self.peek_char() {
490 match ch {
491 'i' => {
492 flags.case_insensitive = true;
493 self.next_char();
494 }
495 'm' => {
496 flags.multiline = true;
497 self.next_char();
498 }
499 's' => {
500 flags.dot_all = true;
501 self.next_char();
502 }
503 _ if ch.is_ascii_alphabetic() => {
504 return Err(LexError::InvalidRegex {
506 pattern: pattern.to_string(),
507 error: format!("Unknown regex flag '{ch}'"),
508 span: Span::with_position(
509 start_pos,
510 self.position + 1,
511 start_line,
512 start_col,
513 ),
514 });
515 }
516 _ => break,
517 }
518 }
519
520 Ok(flags)
521 }
522
523 fn validate_regex_pattern(
524 &self,
525 pattern: &str,
526 flags: &RegexFlags,
527 start_pos: usize,
528 start_line: usize,
529 start_col: usize,
530 ) -> Result<(), LexError> {
531 let mut builder = regex::RegexBuilder::new(pattern);
532 builder
533 .case_insensitive(flags.case_insensitive)
534 .multi_line(flags.multiline)
535 .dot_matches_new_line(flags.dot_all);
536
537 if let Err(e) = builder.build() {
538 return Err(LexError::InvalidRegex {
539 pattern: pattern.to_string(),
540 error: e.to_string(),
541 span: Span::with_position(start_pos, self.position, start_line, start_col),
542 });
543 }
544
545 Ok(())
546 }
547
548 fn read_hex_digits(&mut self, count: usize) -> Result<String, LexError> {
550 let mut hex = String::new();
551
552 for _ in 0..count {
553 match self.next_char() {
554 Some(c) if c.is_ascii_hexdigit() => hex.push(c),
555 Some(c) => {
556 return Err(LexError::InvalidUnicodeEscape {
557 got: c,
558 span: Span::with_position(
559 self.position - 1,
560 self.position,
561 self.line,
562 self.column.saturating_sub(1),
563 ),
564 });
565 }
566 None => {
567 return Err(LexError::InvalidUnicodeEscape {
568 got: '?',
569 span: Span::with_position(
570 self.position,
571 self.position,
572 self.line,
573 self.column,
574 ),
575 });
576 }
577 }
578 }
579
580 Ok(hex)
581 }
582
583 fn read_number(&mut self) -> Result<i64, LexError> {
585 let start_pos = self.position;
586 let start_line = self.line;
587 let start_col = self.column;
588 let mut num_str = String::new();
589
590 if self.peek_char() == Some('-') {
592 num_str.push('-');
593 self.next_char();
594 }
595
596 while let Some(c) = self.peek_char() {
598 if c.is_ascii_digit() {
599 num_str.push(c);
600 self.next_char();
601 } else if c == '_' {
602 self.next_char();
604 } else {
605 break;
606 }
607 }
608
609 num_str
611 .parse::<i64>()
612 .map_err(|e| LexError::NumberOverflow {
613 text: num_str.clone(),
614 error: e.to_string(),
615 span: Span::with_position(start_pos, self.position, start_line, start_col),
616 })
617 }
618
619 fn read_word(&mut self) -> String {
622 let mut word = String::new();
623
624 while let Some(c) = self.peek_char() {
625 match self.classify_word_char(c) {
626 WordCharType::Basic => {
627 word.push(c);
628 self.next_char();
629 }
630 WordCharType::DoubleColon => {
631 word.push_str("::");
632 self.next_char();
633 self.next_char();
634 }
635 WordCharType::GenericStart => {
636 self.consume_generic_segment(&mut word);
637 }
638 WordCharType::End => break,
639 }
640 }
641
642 word
643 }
644
645 fn classify_word_char(&self, c: char) -> WordCharType {
647 if c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '*' | '?' | '/' | '-' | '[' | ']') {
648 WordCharType::Basic
649 } else if c == ':' && self.peek_ahead(1) == Some(':') {
650 WordCharType::DoubleColon
651 } else if c == '<' && self.has_generic_closing_angle() {
652 WordCharType::GenericStart
653 } else {
654 WordCharType::End
655 }
656 }
657
658 fn consume_generic_segment(&mut self, word: &mut String) {
660 word.push('<');
661 self.next_char();
662
663 let mut depth = 1usize;
664 while let Some(ch) = self.peek_char() {
665 if ch.is_whitespace() {
666 break;
667 }
668 depth = match ch {
669 '<' => depth.saturating_add(1),
670 '>' => depth.saturating_sub(1),
671 _ => depth,
672 };
673 word.push(ch);
674 self.next_char();
675 if depth == 0 {
676 break;
677 }
678 }
679 }
680
681 fn has_generic_closing_angle(&self) -> bool {
683 let mut depth = 0usize;
684
685 for ch in self.input[self.position..].chars() {
686 if ch.is_whitespace() {
687 return false;
688 }
689 match ch {
690 '<' => depth = depth.saturating_add(1),
691 '>' => {
692 if depth == 0 {
693 return false;
694 }
695 depth = depth.saturating_sub(1);
696 if depth == 0 {
697 return true;
698 }
699 }
700 _ => {}
701 }
702 }
703
704 false
705 }
706}
707
/// How a character continues (or ends) the word currently being read.
enum WordCharType {
    /// An ordinary word character, appended as-is.
    Basic,
    /// The first `:` of a `::` pair; both colons are consumed together.
    DoubleColon,
    /// A `<` that opens a balanced generic segment.
    GenericStart,
    /// Not part of the word; reading stops here.
    End,
}
719
720pub struct Lexer<'a> {
725 raw: RawLexer<'a>,
726}
727
728impl<'a> Lexer<'a> {
729 #[must_use]
731 pub fn new(input: &'a str) -> Self {
732 Self {
733 raw: RawLexer::new(input),
734 }
735 }
736
737 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
743 let mut tokens = Vec::with_capacity(16);
744 self.raw.restart();
745 self.raw.tokenize_into(&mut tokens)?;
746 Ok(tokens)
747 }
748
749 pub fn next_token(&mut self) -> Result<Token, LexError> {
755 self.raw.next_token()
756 }
757}
758
/// Controls when a reusable token buffer gives memory back.
///
/// A buffer is shrunk to `max_capacity` once its capacity exceeds
/// `max_capacity * shrink_ratio`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct ShrinkPolicy {
    /// Capacity the buffer is shrunk to.
    pub max_capacity: usize,
    /// Multiplier applied to `max_capacity` to obtain the shrink threshold.
    pub shrink_ratio: usize,
}

impl Default for ShrinkPolicy {
    /// Shrinks to 256 entries once capacity exceeds 256 * 8.
    fn default() -> Self {
        let (max_capacity, shrink_ratio) = (256, 8);
        ShrinkPolicy {
            max_capacity,
            shrink_ratio,
        }
    }
}
773
/// Pool size used when `SQRY_LEXER_POOL_MAX` is unset or invalid.
const POOL_MAX_DEFAULT: usize = 4;
/// Environment variable overriding the pool size (`0` disables pooling).
const ENV_POOL_MAX: &str = "SQRY_LEXER_POOL_MAX";
/// Environment variable overriding `ShrinkPolicy::max_capacity` (clamped to at least 1).
const ENV_POOL_MAX_CAP: &str = "SQRY_LEXER_POOL_MAX_CAP";
/// Environment variable overriding `ShrinkPolicy::shrink_ratio` (clamped to at least 1).
const ENV_POOL_SHRINK_RATIO: &str = "SQRY_LEXER_POOL_SHRINK_RATIO";
779
/// Settings for the thread-local lexer pool.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct PoolConfig {
    /// Upper bound on pooled lexers; `0` disables pooling.
    max_size: usize,
    /// Shrink policy handed to every lexer the pool creates.
    shrink_policy: ShrinkPolicy,
}
785
786impl PoolConfig {
787 fn default() -> Self {
788 Self {
789 max_size: POOL_MAX_DEFAULT,
790 shrink_policy: ShrinkPolicy::default(),
791 }
792 }
793
794 fn from_environment() -> Self {
795 let mut config = Self::default();
796
797 if let Some(value) = read_env_usize(ENV_POOL_MAX) {
798 config.max_size = value;
799 }
800
801 if let Some(value) = read_env_usize(ENV_POOL_MAX_CAP) {
802 config.shrink_policy.max_capacity = value.max(1);
803 }
804
805 if let Some(value) = read_env_usize(ENV_POOL_SHRINK_RATIO) {
806 config.shrink_policy.shrink_ratio = value.max(1);
807 }
808
809 config
810 }
811}
812
813fn read_env_usize(var: &str) -> Option<usize> {
814 match env::var(var) {
815 Ok(value) => match value.parse::<usize>() {
816 Ok(parsed) => Some(parsed),
817 Err(err) => {
818 trace!("Ignoring invalid value for {var}: {err}");
819 None
820 }
821 },
822 Err(std::env::VarError::NotPresent) => None,
823 Err(std::env::VarError::NotUnicode(_)) => {
824 trace!("Ignoring non-unicode value for {var}");
825 None
826 }
827 }
828}
829
// One lexer pool per thread. It starts with the default configuration;
// `with_lexer` applies the environment-derived configuration on each call.
thread_local! {
    static LEXER_POOL: RefCell<LexerPool> = RefCell::new(LexerPool::new(PoolConfig::default()));
}
833
/// Pool of reusable lexers.
struct LexerPool {
    /// Idle lexers ready to be handed out again.
    stash: Vec<ReusableLexer>,
    /// Number of pooled lexers currently handed out.
    in_flight: usize,
    /// Active pool settings.
    config: PoolConfig,
}
839
840impl LexerPool {
841 fn new(config: PoolConfig) -> Self {
842 Self {
843 stash: Vec::new(),
844 in_flight: 0,
845 config,
846 }
847 }
848
849 fn apply_config(&mut self, config: PoolConfig) {
850 if self.config == config {
851 return;
852 }
853
854 trace!(
855 "sqry::query::lexer: updating pool config -> max_size={}, max_capacity={}, shrink_ratio={}",
856 config.max_size, config.shrink_policy.max_capacity, config.shrink_policy.shrink_ratio
857 );
858
859 self.config = config;
860 self.stash.clear();
861 self.in_flight = 0;
862 }
863
864 fn acquire(&mut self) -> LexerHandle {
865 if let Some(lexer) = self.stash.pop() {
866 self.in_flight += 1;
867 return LexerHandle::pooled(lexer);
868 }
869
870 if self.in_flight < self.config.max_size {
871 self.in_flight += 1;
872 let lexer = ReusableLexer::with_policy(self.config.shrink_policy);
873 return LexerHandle::pooled(lexer);
874 }
875
876 LexerHandle::temporary(ReusableLexer::with_policy(self.config.shrink_policy))
877 }
878
879 fn release(&mut self, lexer: ReusableLexer) {
880 if self.config.max_size == 0 {
881 self.in_flight = self.in_flight.saturating_sub(1);
882 return;
883 }
884
885 self.in_flight = self.in_flight.saturating_sub(1);
886 if self.stash.len() < self.config.max_size {
887 self.stash.push(lexer);
888 }
889 }
890
891 #[cfg(test)]
892 fn stats(&self) -> (usize, usize, PoolConfig) {
893 (self.stash.len(), self.in_flight, self.config)
894 }
895
896 #[cfg(test)]
897 fn reset(&mut self, config: PoolConfig) {
898 self.stash.clear();
899 self.in_flight = 0;
900 self.config = config;
901 }
902}
903
904struct LexerHandle {
905 lexer: Option<ReusableLexer>,
906 pooled: bool,
907}
908
909impl LexerHandle {
910 fn pooled(lexer: ReusableLexer) -> Self {
911 Self {
912 lexer: Some(lexer),
913 pooled: true,
914 }
915 }
916
917 fn temporary(lexer: ReusableLexer) -> Self {
918 Self {
919 lexer: Some(lexer),
920 pooled: false,
921 }
922 }
923
924 fn lexer_mut(&mut self) -> &mut ReusableLexer {
925 self.lexer.as_mut().expect("lexer handle missing lexer")
929 }
930
931 fn reset(&mut self, input: &str) {
932 self.lexer_mut().reset(input);
933 }
934
935 fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
936 self.lexer_mut().tokenize()
937 }
938}
939
940impl Drop for LexerHandle {
941 fn drop(&mut self) {
942 if !self.pooled {
943 return;
944 }
945
946 if let Some(lexer) = self.lexer.take() {
947 LEXER_POOL.with(|cell| {
948 cell.borrow_mut().release(lexer);
949 });
950 }
951 }
952}
953
/// Test helper: empties this thread's pool and installs the given settings.
#[cfg(test)]
pub(crate) fn configure_pool_for_tests(max_size: usize, shrink_policy: ShrinkPolicy) {
    let config = PoolConfig {
        max_size,
        shrink_policy,
    };
    LEXER_POOL.with(|cell| cell.borrow_mut().reset(config));
}
963
/// Test helper: empties this thread's pool and restores the default settings.
#[cfg(test)]
pub(crate) fn reset_pool_to_default_for_tests() {
    let default_policy = ShrinkPolicy::default();
    configure_pool_for_tests(POOL_MAX_DEFAULT, default_policy);
}
968
/// Test helper: returns `(idle lexers, lexers in flight, max pool size)` for
/// this thread's pool.
#[cfg(test)]
pub(crate) fn pool_stats_for_tests() -> (usize, usize, usize) {
    let (stash, in_flight, config) = LEXER_POOL.with(|cell| cell.borrow().stats());
    (stash, in_flight, config.max_size)
}
976
977pub(crate) fn with_lexer<F, T>(input: &str, f: F) -> Result<T, LexError>
978where
979 F: FnOnce(TokenBatch<'_>) -> Result<T, LexError>,
980{
981 let config = PoolConfig::from_environment();
982
983 if config.max_size == 0 {
984 LEXER_POOL.with(|cell| {
985 cell.borrow_mut().apply_config(config);
986 });
987 let mut lexer = ReusableLexer::with_policy(config.shrink_policy);
988 lexer.reset(input);
989 let batch = lexer.tokenize()?;
990 return f(batch);
991 }
992
993 let mut handle = LEXER_POOL.with(|cell| {
994 let mut pool = cell.borrow_mut();
995 pool.apply_config(config);
996 pool.acquire()
997 });
998
999 handle.reset(input);
1000 let batch = handle.tokenize()?;
1001 let result = f(batch);
1002 drop(handle);
1003 result
1004}
1005
1006pub fn tokenize_with_pool(input: &str) -> Result<Vec<Token>, LexError> {
1015 with_lexer(input, |batch| Ok(batch.into_vec()))
1016}
1017
/// Debug-build counters describing how a reusable lexer's buffer is used.
#[cfg(debug_assertions)]
#[derive(Debug, Default, Clone, Copy)]
struct LexerDiagnostics {
    /// Number of recorded reuses.
    reuse_count: usize,
    /// Largest capacity passed to `record_reuse`.
    max_capacity_seen: usize,
    /// Number of recorded shrinks.
    shrink_count: usize,
}

#[cfg(debug_assertions)]
impl LexerDiagnostics {
    /// Counts one reuse and remembers `capacity` if it is the largest seen so far.
    fn record_reuse(&mut self, capacity: usize) {
        self.reuse_count += 1;
        self.max_capacity_seen = self.max_capacity_seen.max(capacity);
    }

    /// Counts one buffer shrink.
    fn record_shrink(&mut self) {
        self.shrink_count += 1;
    }
}
1039
1040pub(crate) struct ReusableLexer {
1043 input: String,
1044 token_buffer: Vec<Token>,
1045 shrink_policy: ShrinkPolicy,
1046 #[cfg(debug_assertions)]
1047 diagnostics: LexerDiagnostics,
1048}
1049
1050impl ReusableLexer {
1051 #[cfg_attr(not(test), allow(dead_code))]
1055 pub fn new() -> Self {
1056 Self::with_policy(ShrinkPolicy::default())
1057 }
1058
1059 pub fn with_policy(shrink_policy: ShrinkPolicy) -> Self {
1060 Self {
1061 input: String::new(),
1062 token_buffer: Vec::with_capacity(16),
1063 shrink_policy,
1064 #[cfg(debug_assertions)]
1065 diagnostics: LexerDiagnostics::default(),
1066 }
1067 }
1068
1069 pub fn reset(&mut self, input: &str) {
1071 self.input.clear();
1072 self.input.push_str(input);
1073 self.token_buffer.clear();
1074 }
1075
1076 pub fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
1078 self.token_buffer.clear();
1079 let mut raw = RawLexer::new(self.input.as_str());
1080 raw.tokenize_into(&mut self.token_buffer)?;
1081 #[cfg(debug_assertions)]
1082 self.diagnostics.record_reuse(self.token_buffer.capacity());
1083 Ok(TokenBatch {
1084 tokens: &mut self.token_buffer,
1085 shrink_policy: self.shrink_policy,
1086 #[cfg(debug_assertions)]
1087 diagnostics: &mut self.diagnostics,
1088 })
1089 }
1090
1091 #[cfg(debug_assertions)]
1094 #[cfg_attr(not(test), allow(dead_code))]
1095 fn diagnostics(&self) -> &LexerDiagnostics {
1096 &self.diagnostics
1097 }
1098}
1099
1100pub(crate) struct TokenBatch<'a> {
1108 tokens: &'a mut Vec<Token>,
1109 shrink_policy: ShrinkPolicy,
1110 #[cfg(debug_assertions)]
1111 diagnostics: &'a mut LexerDiagnostics,
1112}
1113
1114impl TokenBatch<'_> {
1115 #[cfg_attr(not(test), allow(dead_code))]
1118 pub fn as_slice(&self) -> &[Token] {
1119 self.tokens.as_slice()
1120 }
1121
1122 #[allow(unused_mut)]
1123 pub fn into_vec(mut self) -> Vec<Token> {
1124 let result = self.tokens.drain(..).collect();
1125 #[cfg(debug_assertions)]
1126 let _ = &mut *self.diagnostics; result
1128 }
1129}
1130
1131impl Drop for TokenBatch<'_> {
1132 fn drop(&mut self) {
1133 if !self.tokens.is_empty() {
1134 self.tokens.clear();
1135 }
1136
1137 let shrink_threshold = self
1138 .shrink_policy
1139 .max_capacity
1140 .saturating_mul(self.shrink_policy.shrink_ratio);
1141 if shrink_threshold > 0 && self.tokens.capacity() > shrink_threshold {
1142 self.tokens.shrink_to(self.shrink_policy.max_capacity);
1143 #[cfg(debug_assertions)]
1144 self.diagnostics.record_shrink();
1145 }
1146 }
1147}
1148
1149#[cfg(test)]
1150mod tests {
1151 use super::*;
1152 use std::panic::{AssertUnwindSafe, catch_unwind};
1153 use std::sync::{Mutex, OnceLock};
1154
1155 #[cfg(feature = "dhat-heap")]
1156 use dhat::{HeapStats, Profiler};
1157
1158 fn reset_pool_from_env() {
1159 let config = PoolConfig::from_environment();
1160 LEXER_POOL.with(|cell| {
1161 cell.borrow_mut().reset(config);
1162 });
1163 }
1164
1165 fn reset_pool_default() {
1166 unsafe {
1167 std::env::remove_var(ENV_POOL_MAX);
1168 std::env::remove_var(ENV_POOL_MAX_CAP);
1169 std::env::remove_var(ENV_POOL_SHRINK_RATIO);
1170 }
1171 reset_pool_from_env();
1172 }
1173
// Sets environment variable `var` to `value` for the whole process.
fn set_env(var: &str, value: &str) {
    // SAFETY: mutating the process environment is only sound while no other
    // thread accesses it; callers are expected to hold `env_lock()`.
    unsafe { std::env::set_var(var, value) }
}
1179
// Removes environment variable `var` from the process environment.
fn remove_env(var: &str) {
    // SAFETY: mutating the process environment is only sound while no other
    // thread accesses it; callers are expected to hold `env_lock()`.
    unsafe { std::env::remove_var(var) }
}
1185
// Process-wide mutex, created on first use, that tests lock while they touch
// the environment.
fn env_lock() -> &'static Mutex<()> {
    static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
    LOCK.get_or_init(Mutex::default)
}
1190
// Two consecutive tokenizations must hand out the same underlying buffer
// allocation (checked by comparing data pointers).
#[test]
fn reusable_lexer_reuses_buffer_across_calls() {
    let mut lexer = ReusableLexer::new();
    lexer.reset("kind:function");

    // The batch is scoped so it is dropped before the lexer is inspected.
    let first_ptr = {
        let batch = lexer.tokenize().unwrap();
        let ptr = batch.as_slice().as_ptr();
        assert!(!batch.as_slice().is_empty());
        ptr
    };
    assert_eq!(first_ptr, lexer.token_buffer.as_ptr());

    lexer.reset("name:test");
    let second_ptr = {
        let batch = lexer.tokenize().unwrap();
        let ptr = batch.as_slice().as_ptr();
        assert!(!batch.as_slice().is_empty());
        ptr
    };
    assert_eq!(second_ptr, lexer.token_buffer.as_ptr());
    assert_eq!(first_ptr, second_ptr);
    // Diagnostics only exist in debug builds.
    #[cfg(debug_assertions)]
    {
        let diagnostics = lexer.diagnostics();
        assert!(diagnostics.reuse_count >= 2);
        assert!(diagnostics.max_capacity_seen >= lexer.token_buffer.capacity());
    }
}
1220
// A panic while a batch is alive must still leave the lexer's buffer empty,
// because the batch's `Drop` runs during unwinding.
#[test]
fn reusable_lexer_clears_buffer_on_panic() {
    let mut lexer = ReusableLexer::new();
    lexer.reset("kind:function");

    let result = catch_unwind(AssertUnwindSafe(|| {
        // Keep the batch alive until the panic unwinds through it.
        let _batch = lexer.tokenize().unwrap();
        panic!("boom");
    }));

    assert!(result.is_err());
    assert_eq!(lexer.token_buffer.len(), 0);
}
1234
// `into_vec` must move every token out and leave the lexer's buffer empty.
#[test]
fn reusable_lexer_into_vec_drains_tokens() {
    let mut lexer = ReusableLexer::new();
    lexer.reset("kind:function");

    // The temporary batch is consumed by `into_vec` within this statement.
    let tokens = lexer.tokenize().unwrap().into_vec();

    assert_eq!(tokens.len(), 4);
    assert_eq!(lexer.token_buffer.len(), 0);
}
1248
// Dropping a batch whose buffer capacity exceeds `max_capacity * shrink_ratio`
// must shrink the buffer back to at most `max_capacity`.
#[test]
fn reusable_lexer_shrink_policy_applies() {
    let policy = ShrinkPolicy {
        max_capacity: 8,
        shrink_ratio: 2,
    };

    let mut lexer = ReusableLexer::with_policy(policy);
    // A query with many tokens, to grow the buffer.
    let large_query = (0..128)
        .map(|i| format!("name:value{i}"))
        .collect::<Vec<_>>()
        .join(" ");
    lexer.reset(&large_query);

    {
        let batch = lexer.tokenize().unwrap();
        let _ = batch.into_vec();
    }

    // Dropping the first batch may already have shrunk the buffer; grow it
    // again so the next drop is guaranteed to be above the threshold.
    if lexer.token_buffer.capacity() <= policy.max_capacity * policy.shrink_ratio {
        lexer
            .token_buffer
            .reserve(policy.max_capacity * policy.shrink_ratio * 2);
    }
    assert!(lexer.token_buffer.capacity() > policy.max_capacity * policy.shrink_ratio);

    lexer.reset("kind:function");
    {
        let batch = lexer.tokenize().unwrap();
        drop(batch);
    }

    assert!(lexer.token_buffer.capacity() <= policy.max_capacity);

    #[cfg(debug_assertions)]
    {
        let diagnostics = lexer.diagnostics();
        assert!(diagnostics.shrink_count >= 1);
    }
}
1289
// After one pooled tokenization the lexer must be back in the stash and
// nothing may be in flight.
#[test]
#[serial_test::serial]
fn lexer_pool_returns_lexers_to_stash() {
    let _guard = env_lock().lock().unwrap();
    reset_pool_default();

    assert_eq!(PoolConfig::from_environment().max_size, POOL_MAX_DEFAULT);

    let tokens = tokenize_with_pool("kind:function").unwrap();
    assert_eq!(tokens.len(), 4);

    let (stash_len, in_flight, max_size) = pool_stats_for_tests();
    assert_eq!(max_size, POOL_MAX_DEFAULT);
    assert_eq!(in_flight, 0);
    assert_eq!(stash_len, 1);
}
1314
// With `SQRY_LEXER_POOL_MAX=0` tokenization still works but nothing is pooled.
#[test]
#[serial_test::serial]
fn lexer_pool_respects_zero_capacity_env() {
    let _guard = env_lock().lock().unwrap();
    set_env(ENV_POOL_MAX, "0");
    reset_pool_from_env();

    let tokens = tokenize_with_pool("kind:function").unwrap();
    assert_eq!(tokens.len(), 4);

    let (stash_len, in_flight, max_size) = pool_stats_for_tests();
    assert_eq!(max_size, 0);
    assert_eq!(in_flight, 0);
    assert_eq!(stash_len, 0);

    // Restore the environment for the tests that follow.
    remove_env(ENV_POOL_MAX);
    reset_pool_default();
}
1335
// With a pool size of 1, consecutive tokenizations leave exactly one lexer in
// the stash and none in flight.
#[test]
#[serial_test::serial]
fn lexer_pool_reuses_single_slot() {
    let _guard = env_lock().lock().unwrap();
    set_env(ENV_POOL_MAX, "1");
    reset_pool_from_env();

    assert_eq!(PoolConfig::from_environment().max_size, 1);

    for query in ["kind:function", "name:test"] {
        let _ = tokenize_with_pool(query).unwrap();
    }

    let (stash_len, in_flight, max_size) = pool_stats_for_tests();
    assert_eq!(max_size, 1);
    assert_eq!(in_flight, 0);
    assert_eq!(stash_len, 1);

    // Restore the environment for the tests that follow.
    remove_env(ENV_POOL_MAX);
    reset_pool_default();
}
1359
// A `::` inside a value stays part of the word instead of being split into
// colon tokens.
#[test]
fn lexer_handles_double_colon_in_words() {
    let tokens = Lexer::new("callers:Player::takeDamage").tokenize().unwrap();
    let kinds: Vec<TokenType> = tokens.into_iter().map(|token| token.token_type).collect();

    assert_eq!(
        kinds,
        vec![
            TokenType::Identifier("callers".to_string()),
            TokenType::Colon,
            TokenType::Word("Player::takeDamage".to_string()),
            TokenType::Eof,
        ]
    );
}
1376
// Calling `with_lexer` from inside a `with_lexer` callback must work.
#[test]
#[serial_test::serial]
#[ignore = "Test depends on clean env_lock state. Run in isolation with: cargo test -p sqry-core --lib with_lexer_allows_reentrant_usage -- --ignored --test-threads=1"]
fn with_lexer_allows_reentrant_usage() {
    let _guard = env_lock().lock().unwrap();
    reset_pool_default();

    let outcome = with_lexer("kind:function", |outer| {
        assert!(!outer.as_slice().is_empty());
        // Nested use while the outer batch is still alive.
        with_lexer("name:test", |inner| {
            assert!(!inner.as_slice().is_empty());
            Ok(())
        })
    });

    assert!(outcome.is_ok());
    reset_pool_default();
}
1395
// Each of four threads tokenizes repeatedly and then checks its own
// thread-local pool: the stash never exceeds the limit and nothing is left
// in flight.
#[test]
#[serial_test::serial]
fn lexer_pool_thread_local_isolation() {
    let _guard = env_lock().lock().unwrap();
    reset_pool_default();

    let handles: Vec<_> = (0..4)
        .map(|_| {
            std::thread::spawn(|| {
                for _ in 0..50 {
                    for query in ["kind:function", "name:test", "lang:rust"] {
                        with_lexer(query, |batch| {
                            assert!(!batch.as_slice().is_empty());
                            Ok(batch.into_vec())
                        })
                        .unwrap();
                    }
                }

                // Stats are read on the spawned thread, so they describe that
                // thread's pool.
                let (stash, in_flight, max_size) = crate::query::lexer::pool_stats_for_tests();
                assert!(stash <= max_size);
                assert_eq!(in_flight, 0);
            })
        })
        .collect();

    // A failed assertion inside a worker surfaces here as a join error.
    for handle in handles {
        handle.join().unwrap();
    }

    reset_pool_default();
}
1428
// Heap-profiles five pooled tokenizations with dhat and asserts that the total
// number of allocated blocks stays within a fixed budget.
#[cfg(feature = "dhat-heap")]
#[test]
#[serial_test::serial]
#[ignore = "Heap profiling test must run in isolation. Run with: cargo test -p sqry-core --lib lexer_reuse_minimizes_heap_allocations -- --ignored --test-threads=1"]
fn lexer_reuse_minimizes_heap_allocations() {
    let _guard = env_lock().lock().unwrap();
    reset_pool_default();

    let profiler = Profiler::new_heap();

    for _ in 0..5 {
        with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
    }

    // Read the stats while the profiler is still alive, then stop profiling.
    let stats = HeapStats::get();
    drop(profiler);
    assert!(
        stats.total_blocks <= 65,
        "expected limited allocations, observed {} blocks (threshold accounts for plugin loading in integration tests)",
        stats.total_blocks
    );

    reset_pool_default();
}
1456
/// Checks that the token buffer's capacity grows for a large query and is kept for a later small one.
#[test]
fn reusable_lexer_capacity_growth_and_retention() {
    let mut reusable = ReusableLexer::new();

    // Small query first, to establish a baseline capacity.
    reusable.reset("kind:function");
    let small_len = reusable.tokenize().unwrap().as_slice().len();
    assert!(small_len > 0);
    let initial_capacity = reusable.token_buffer.capacity();

    // Fifty `name:value{i}` predicates joined with " AND ".
    let mut large_query = String::new();
    for i in 0..50 {
        if i > 0 {
            large_query.push_str(" AND ");
        }
        large_query.push_str(&format!("name:value{i}"));
    }
    reusable.reset(&large_query);
    let large_len = reusable.tokenize().unwrap().as_slice().len();
    assert!(large_len > 50);
    let grown_capacity = reusable.token_buffer.capacity();
    assert!(grown_capacity > initial_capacity);

    // Back to the small query: the capacity must be unchanged.
    reusable.reset("kind:function");
    let repeat_len = reusable.tokenize().unwrap().as_slice().len();
    assert!(repeat_len > 0);
    let retained_capacity = reusable.token_buffer.capacity();
    assert_eq!(retained_capacity, grown_capacity);

    #[cfg(debug_assertions)]
    {
        let diag = reusable.diagnostics();
        assert!(diag.reuse_count >= 3);
        assert!(diag.max_capacity_seen >= grown_capacity);
    }
}
1495
/// Tokenizes a valid query, a failing one, and a valid one again on the same `ReusableLexer`.
#[test]
fn reusable_lexer_error_recovery_clears_buffer() {
    let mut reusable = ReusableLexer::new();

    reusable.reset("kind:function");
    let len_before_error = reusable.tokenize().unwrap().as_slice().len();
    assert!(len_before_error > 0);

    // This input is expected to be rejected by the lexer.
    reusable.reset("kind@invalid");
    assert!(reusable.tokenize().is_err());

    // The lexer must still produce tokens after the failed pass.
    reusable.reset("name:test");
    let len_after_error = reusable.tokenize().unwrap().as_slice().len();
    assert!(len_after_error > 0);
}
1517
/// Panics after taking the tokens out with `into_vec` and checks the lexer's buffer is left empty.
#[test]
fn reusable_lexer_panic_after_into_vec_has_clean_buffer() {
    let mut reusable = ReusableLexer::new();
    reusable.reset("kind:function");

    let unwound = catch_unwind(AssertUnwindSafe(|| {
        // Keep the extracted tokens alive until the panic fires.
        let _tokens = reusable.tokenize().unwrap().into_vec();
        panic!("boom");
    }));

    assert!(unwound.is_err());
    assert_eq!(reusable.token_buffer.len(), 0);
}
1532
/// Lexes `kind:function` and checks the exact four-token sequence it produces.
#[test]
fn test_tokenize_simple_query() {
    let mut lx = Lexer::new("kind:function");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed.len(), 4);
    assert_eq!(
        lexed[0].token_type,
        TokenType::Identifier("kind".to_string())
    );
    assert_eq!(lexed[1].token_type, TokenType::Colon);
    assert_eq!(
        lexed[2].token_type,
        TokenType::Word("function".to_string())
    );
    assert_eq!(lexed[3].token_type, TokenType::Eof);
}
1544
/// Checks that a value containing angle brackets (`Optional<User>`) is lexed as a single word.
#[test]
fn test_tokenize_generic_type_value() {
    let mut lx = Lexer::new("returns:Optional<User>");
    let lexed = lx.tokenize().unwrap();

    let expected = [
        TokenType::Identifier("returns".to_string()),
        TokenType::Colon,
        TokenType::Word("Optional<User>".to_string()),
        TokenType::Eof,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx].token_type, want);
    }
}
1555
/// Checks that a nested generic value with a comma (`Map<String,List<Order>>`) stays one word token.
#[test]
fn test_tokenize_nested_generic_value() {
    let mut lx = Lexer::new("returns:Map<String,List<Order>>");
    let lexed = lx.tokenize().unwrap();

    let expected = [
        TokenType::Identifier("returns".to_string()),
        TokenType::Colon,
        TokenType::Word("Map<String,List<Order>>".to_string()),
        TokenType::Eof,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx].token_type, want);
    }
}
1568
/// Checks that `line>10` lexes as identifier, `Greater`, number literal, EOF.
#[test]
fn test_tokenize_numeric_comparison_after_identifier() {
    let mut lx = Lexer::new("line>10");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(
        lexed[0].token_type,
        TokenType::Identifier("line".to_string())
    );
    assert_eq!(lexed[1].token_type, TokenType::Greater);
    assert_eq!(lexed[2].token_type, TokenType::NumberLiteral(10));
    assert_eq!(lexed[3].token_type, TokenType::Eof);
}
1579
/// Checks that the boolean keywords are recognised regardless of letter case.
#[test]
fn test_tokenize_keywords_case_insensitive() {
    let mut lx = Lexer::new("AND and Or NOT not");
    let lexed = lx.tokenize().unwrap();

    let expected = [
        TokenType::And,
        TokenType::And,
        TokenType::Or,
        TokenType::Not,
        TokenType::Not,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx].token_type, want);
    }
}
1591
/// Lexes each operator separated by spaces and checks the resulting token kinds in order.
#[test]
fn test_tokenize_operators() {
    let mut lx = Lexer::new(": ~= > < >= <=");
    let lexed = lx.tokenize().unwrap();

    let expected = [
        TokenType::Colon,
        TokenType::RegexOp,
        TokenType::Greater,
        TokenType::Less,
        TokenType::GreaterEq,
        TokenType::LessEq,
    ];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(&lexed[idx].token_type, want);
    }
}
1604
/// Checks that `(` and `)` lex to `LParen` and `RParen`.
#[test]
fn test_tokenize_parentheses() {
    let mut lx = Lexer::new("( )");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[0].token_type, TokenType::LParen);
    assert_eq!(lexed[1].token_type, TokenType::RParen);
}
1613
/// Checks that a double-quoted value becomes a string literal without its surrounding quotes.
#[test]
fn test_tokenize_double_quoted_string() {
    let mut lx = Lexer::new(r#"name:"hello world""#);
    let lexed = lx.tokenize().unwrap();

    assert_eq!(
        lexed[0].token_type,
        TokenType::Identifier("name".to_string())
    );
    assert_eq!(lexed[1].token_type, TokenType::Colon);
    assert_eq!(
        lexed[2].token_type,
        TokenType::StringLiteral("hello world".to_string())
    );
}
1625
/// Checks that a single-quoted value becomes a string literal without its surrounding quotes.
#[test]
fn test_tokenize_single_quoted_string() {
    let mut lx = Lexer::new(r"name:'hello world'");
    let lexed = lx.tokenize().unwrap();

    let value = &lexed[2].token_type;
    assert_eq!(*value, TokenType::StringLiteral("hello world".to_string()));
}
1635
/// Checks that `\n`, `\t`, `\"` and `\\` inside a quoted string are decoded to the characters they name.
#[test]
fn test_string_escape_sequences() {
    let mut lx = Lexer::new(r#""line1\nline2\ttab\"quote\\backslash""#);
    let lexed = lx.tokenize().unwrap();

    match &lexed[0].token_type {
        TokenType::StringLiteral(text) => {
            assert_eq!(text, "line1\nline2\ttab\"quote\\backslash");
        }
        _ => panic!("Expected string literal"),
    }
}
1647
/// Checks that a `\u0041` escape decodes to `A` and the following characters are kept.
#[test]
fn test_unicode_escape() {
    let mut lx = Lexer::new(r#""\u0041BC""#);
    let lexed = lx.tokenize().unwrap();

    match &lexed[0].token_type {
        TokenType::StringLiteral(text) => assert_eq!(text, "ABC"),
        _ => panic!("Expected string literal"),
    }
}
1659
/// Checks that a string with no closing quote is rejected with `UnterminatedString`.
#[test]
fn test_unterminated_string() {
    let mut lx = Lexer::new(r#"name:"unclosed"#);
    let is_unterminated = matches!(lx.tokenize(), Err(LexError::UnterminatedString { .. }));
    assert!(is_unterminated);
}
1666
/// Checks that `\x` inside a string is rejected with `InvalidEscape` carrying the character `x`.
#[test]
fn test_invalid_escape() {
    let mut lx = Lexer::new(r#""\x""#);
    let is_invalid_escape = matches!(
        lx.tokenize(),
        Err(LexError::InvalidEscape { char: 'x', .. })
    );
    assert!(is_invalid_escape);
}
1676
/// Checks that backslash-escaped glob metacharacters in a string decode to the bare characters.
#[test]
fn test_glob_metacharacter_escape_sequences() {
    let mut lx = Lexer::new(r#""src/\[test\]/\*\?file\{a,b\}""#);
    let lexed = lx.tokenize().unwrap();

    match &lexed[0].token_type {
        TokenType::StringLiteral(text) => assert_eq!(text, "src/[test]/*?file{a,b}"),
        other => panic!("Expected string literal, got {:?}", other),
    }
}
1690
/// Checks a `path:` predicate whose quoted value mixes escaped brackets with an unescaped `**`.
#[test]
fn test_path_predicate_with_escaped_glob_chars() {
    let mut lx = Lexer::new(r#"path:"src/\[test\]/**""#);
    let lexed = lx.tokenize().unwrap();

    assert_eq!(
        lexed[0].token_type,
        TokenType::Identifier("path".to_string())
    );
    assert_eq!(lexed[1].token_type, TokenType::Colon);
    match &lexed[2].token_type {
        // The escapes are removed; the unescaped `**` is kept as written.
        TokenType::StringLiteral(text) => assert_eq!(text, "src/[test]/**"),
        _ => panic!("Expected string literal"),
    }
}
1706
/// Checks that `name~=/^test_/i` yields a regex literal with only the case-insensitive flag set.
#[test]
fn test_tokenize_regex() {
    let mut lx = Lexer::new(r"name~=/^test_/i");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(
        lexed[0].token_type,
        TokenType::Identifier("name".to_string())
    );
    assert_eq!(lexed[1].token_type, TokenType::RegexOp);

    match &lexed[2].token_type {
        TokenType::RegexLiteral { pattern, flags } => {
            assert_eq!(pattern, "^test_");
            assert!(flags.case_insensitive);
            assert!(!flags.multiline);
            assert!(!flags.dot_all);
        }
        _ => panic!("Expected regex literal"),
    }
}
1724
/// Checks that the flag suffix `ims` sets all three regex flags.
#[test]
fn test_regex_multiple_flags() {
    let mut lx = Lexer::new(r"/pattern/ims");
    let lexed = lx.tokenize().unwrap();

    match &lexed[0].token_type {
        TokenType::RegexLiteral { flags, .. } => {
            assert!(flags.case_insensitive);
            assert!(flags.multiline);
            assert!(flags.dot_all);
        }
        _ => panic!("Expected regex literal"),
    }
}
1738
/// Checks that `\/` does not end the regex and is kept verbatim in the pattern.
#[test]
fn test_regex_escaped_slash() {
    let mut lx = Lexer::new(r"/path\/to\/file/");
    let lexed = lx.tokenize().unwrap();

    match &lexed[0].token_type {
        TokenType::RegexLiteral { pattern, .. } => assert_eq!(pattern, r"path\/to\/file"),
        _ => panic!("Expected regex literal"),
    }
}
1750
/// Checks that a slash preceded by four backslashes still closes the regex, with the backslashes kept.
#[test]
fn test_regex_escaped_backslash_then_slash() {
    let mut lx = Lexer::new(r"/a\\\\/");
    let first = lx.next_token().unwrap();

    if let TokenType::RegexLiteral { pattern, .. } = first.token_type {
        assert_eq!(pattern, r"a\\\\");
    } else {
        panic!("Expected RegexLiteral");
    }
}
1768
/// Checks that a single `\/` inside the regex body is kept in the pattern as written.
#[test]
fn test_regex_single_escaped_slash() {
    let mut lx = Lexer::new(r"/a\/b/");
    let first = lx.next_token().unwrap();

    if let TokenType::RegexLiteral { pattern, .. } = first.token_type {
        assert_eq!(pattern, r"a\/b");
    } else {
        panic!("Expected RegexLiteral");
    }
}
1780
/// Checks that a regex with no closing slash is rejected with `UnterminatedRegex`.
#[test]
fn test_unterminated_regex() {
    let mut lx = Lexer::new(r"/unclosed");
    let is_unterminated = matches!(lx.tokenize(), Err(LexError::UnterminatedRegex { .. }));
    assert!(is_unterminated);
}
1787
/// Checks that the regex body `^[` is rejected with `InvalidRegex`.
#[test]
fn test_invalid_regex_pattern() {
    let mut lx = Lexer::new(r"/^[/");
    let is_invalid = matches!(lx.tokenize(), Err(LexError::InvalidRegex { .. }));
    assert!(is_invalid);
}
1794
/// Checks that the flag `x` produces an `InvalidRegex` error whose text mentions the unknown flag.
#[test]
fn test_regex_unknown_flag() {
    let mut lx = Lexer::new("/pattern/x");
    let failure = lx.next_token().unwrap_err();

    if let LexError::InvalidRegex { error, .. } = failure {
        assert!(error.contains("Unknown regex flag"));
    } else {
        panic!("Expected InvalidRegex error");
    }
}
1806
/// Checks that `42` after a colon lexes as `NumberLiteral(42)`.
#[test]
fn test_tokenize_positive_number() {
    let mut lx = Lexer::new("lines:42");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[2].token_type, TokenType::NumberLiteral(42));
}
1814
/// Checks that `-42` after a colon lexes as a single negative number literal.
#[test]
fn test_tokenize_negative_number() {
    let mut lx = Lexer::new("lines:-42");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[2].token_type, TokenType::NumberLiteral(-42));
}
1825
/// Checks that underscores used as digit separators are accepted and ignored in the value.
#[test]
fn test_tokenize_number_with_underscores() {
    let mut lx = Lexer::new("lines:1_000_000");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[2].token_type, TokenType::NumberLiteral(1_000_000));
}
1836
/// Checks that a 20-digit number, too large for the `i64` payload, is rejected with `NumberOverflow`.
#[test]
fn test_number_overflow() {
    let mut lx = Lexer::new("lines:99999999999999999999");
    let overflowed = matches!(lx.tokenize(), Err(LexError::NumberOverflow { .. }));
    assert!(overflowed);
}
1843
/// Checks that `true` after a colon lexes as `BooleanLiteral(true)`.
#[test]
fn test_tokenize_boolean_true() {
    let mut lx = Lexer::new("async:true");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[2].token_type, TokenType::BooleanLiteral(true));
}
1854
/// Checks that upper-case `FALSE` after a colon lexes as `BooleanLiteral(false)`.
#[test]
fn test_tokenize_boolean_false() {
    let mut lx = Lexer::new("async:FALSE");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[2].token_type, TokenType::BooleanLiteral(false));
}
1865
/// Lexes a query mixing predicates, boolean operators and a regex, and checks each token's kind in order.
#[test]
fn test_tokenize_complex_query() {
    let mut lx = Lexer::new(r"kind:function AND async:true OR name~=/^test_/i");
    let lexed = lx.tokenize().unwrap();

    // kind:function
    assert!(matches!(&lexed[0].token_type, TokenType::Identifier(_)));
    assert_eq!(lexed[1].token_type, TokenType::Colon);
    assert!(matches!(&lexed[2].token_type, TokenType::Word(_)));

    // AND async:true
    assert_eq!(lexed[3].token_type, TokenType::And);
    assert!(matches!(&lexed[4].token_type, TokenType::Identifier(_)));
    assert_eq!(lexed[5].token_type, TokenType::Colon);
    assert_eq!(lexed[6].token_type, TokenType::BooleanLiteral(true));

    // OR name~=/^test_/i
    assert_eq!(lexed[7].token_type, TokenType::Or);
    assert!(matches!(&lexed[8].token_type, TokenType::Identifier(_)));
    assert_eq!(lexed[9].token_type, TokenType::RegexOp);
    assert!(matches!(
        &lexed[10].token_type,
        TokenType::RegexLiteral { .. }
    ));

    assert_eq!(lexed[11].token_type, TokenType::Eof);
}
1891
/// Checks that whitespace around and between tokens does not change the token count.
#[test]
fn test_whitespace_handling() {
    let mut lx = Lexer::new(" kind : function ");
    let lexed = lx.tokenize().unwrap();

    // Only the count is checked here; presumably identifier, colon, word and EOF.
    let token_count = lexed.len();
    assert_eq!(token_count, 4);
}
1899
/// Checks that `@` is rejected with `UnexpectedChar` carrying that character.
#[test]
fn test_unexpected_character() {
    let mut lx = Lexer::new("kind@function");
    let rejected_at_sign = matches!(
        lx.tokenize(),
        Err(LexError::UnexpectedChar { char: '@', .. })
    );
    assert!(rejected_at_sign);
}
1909
/// Checks that `""` lexes as a string literal with empty contents.
#[test]
fn test_empty_string_literal() {
    let mut lx = Lexer::new(r#"name:"""#);
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed[2].token_type, TokenType::StringLiteral(String::new()));
}
1917
/// Checks that `//` lexes as a regex literal with an empty pattern.
#[test]
fn test_empty_regex_literal() {
    let mut lx = Lexer::new(r"name~=//");
    let lexed = lx.tokenize().unwrap();

    match &lexed[2].token_type {
        TokenType::RegexLiteral { pattern, .. } => assert_eq!(pattern, ""),
        _ => panic!("Expected regex literal"),
    }
}
1929
/// Checks the start/end offsets recorded in the spans of the three tokens of `kind:function`.
///
/// Uses `assert_eq!` rather than `assert!(a == b)` so that a failure reports the
/// offset the lexer actually produced instead of just the failed expression.
#[test]
fn test_span_tracking() {
    let mut lexer = Lexer::new("kind:function");
    let tokens = lexer.tokenize().unwrap();

    // "kind": offsets 0..4 (end is one past the last character).
    assert_eq!(tokens[0].span.start, 0);
    assert_eq!(tokens[0].span.end, 4);

    // ":": offsets 4..5.
    assert_eq!(tokens[1].span.start, 4);
    assert_eq!(tokens[1].span.end, 5);

    // "function": offsets 5..13.
    assert_eq!(tokens[2].span.start, 5);
    assert_eq!(tokens[2].span.end, 13);
}
1943
/// Checks that `value` is a `Word` both after a colon and when standing alone.
#[test]
fn test_identifier_vs_word() {
    let mut lx = Lexer::new("kind:value value");
    let lexed = lx.tokenize().unwrap();

    // Index 2 is the value after the colon, index 3 the free-standing one.
    for idx in [2usize, 3] {
        assert_eq!(lexed[idx].token_type, TokenType::Word("value".to_string()));
    }
}
1954
/// Checks that an unquoted value containing `/`, `*` and `.` is lexed as one word.
#[test]
fn test_bare_word_with_glob() {
    let mut lx = Lexer::new("path:src/*.rs");

    // Skip the two tokens in front of the value.
    for _ in 0..2 {
        lx.next_token().unwrap();
    }
    let value = lx.next_token().unwrap();

    match &value.token_type {
        TokenType::Word(text) => assert_eq!(text, "src/*.rs"),
        other => panic!("Expected Word with glob pattern, got {:?}", other),
    }
}
1969
/// Checks that an unquoted value containing a hyphen is lexed as one word.
#[test]
fn test_bare_word_with_hyphen() {
    let mut lx = Lexer::new("name:foo-bar");

    // Skip the two tokens in front of the value.
    for _ in 0..2 {
        lx.next_token().unwrap();
    }
    let value = lx.next_token().unwrap();

    match &value.token_type {
        TokenType::Word(text) => assert_eq!(text, "foo-bar"),
        other => panic!("Expected Word with hyphen, got {:?}", other),
    }
}
1981
/// Checks that an unquoted value containing a dot is lexed as one word.
#[test]
fn test_bare_word_with_dot() {
    let mut lx = Lexer::new("path:foo.rs");

    // Skip the two tokens in front of the value.
    for _ in 0..2 {
        lx.next_token().unwrap();
    }
    let value = lx.next_token().unwrap();

    match &value.token_type {
        TokenType::Word(text) => assert_eq!(text, "foo.rs"),
        other => panic!("Expected Word with dot, got {:?}", other),
    }
}
1993
/// Checks that `$name` lexes as a `Variable` token (without the `$`) followed by EOF.
#[test]
fn test_variable_token() {
    let mut lx = Lexer::new("$name");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed.len(), 2);
    assert!(matches!(
        &lexed[0].token_type,
        TokenType::Variable(name) if name == "name"
    ));
    assert_eq!(lexed[1].token_type, TokenType::Eof);
}
2005
/// Checks that underscores are allowed inside a variable name.
#[test]
fn test_variable_token_with_underscores() {
    let mut lx = Lexer::new("$my_var");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed.len(), 2);
    assert!(matches!(
        &lexed[0].token_type,
        TokenType::Variable(name) if name == "my_var"
    ));
}
2016
/// Checks that `|` lexes as a `Pipe` token and that two tokens are produced in total.
#[test]
fn test_pipe_token() {
    let mut lx = Lexer::new("|");
    let lexed = lx.tokenize().unwrap();

    assert_eq!(lexed.len(), 2);
    assert_eq!(lexed[0].token_type, TokenType::Pipe);
}
2024
/// Checks that a `$` with no name after it is rejected with `UnexpectedChar` for `$`.
#[test]
fn test_dollar_sign_alone_error() {
    let mut lx = Lexer::new("$ ");
    // The local stays named `result` because the failure message captures it inline.
    let result = lx.tokenize();

    let rejected = matches!(result, Err(LexError::UnexpectedChar { char: '$', .. }));
    assert!(
        rejected,
        "Bare '$' should produce an error, got: {result:?}"
    );
}
2034}