//! Lexer for the query language.
//!
//! Converts query text into [`Token`]s and provides a thread-local pool of
//! reusable lexers so repeated queries avoid fresh allocations.

use crate::query::error::LexError;
use crate::query::types::{RegexFlags, Span};
use log::trace;
use std::cell::RefCell;
use std::env;
use std::str::Chars;
use std::thread_local;

#[cfg(all(test, feature = "dhat-heap"))]
#[global_allocator]
static DHAT_ALLOC: dhat::Alloc = dhat::Alloc;

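/// The kinds of tokens produced by the lexer.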
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
    /// The `AND` keyword.
    And,
    /// The `OR` keyword.
    Or,
    /// The `NOT` keyword.
    Not,

    /// `:` separating a field identifier from its value.
    Colon,
    /// The `~=` regex-match operator.
    RegexOp,
    /// `>`.
    Greater,
    /// `<`.
    Less,
    /// `>=`.
    GreaterEq,
    /// `<=`.
    LessEq,
    /// `|`.
    Pipe,

    /// `(`.
    LParen,
    /// `)`.
    RParen,

    /// A field name followed by `:`, `~=`, `>`, or `<`.
    Identifier(String),
    /// A quoted string with escape sequences resolved.
    StringLiteral(String),
    /// A `/pattern/flags` regex literal.
    RegexLiteral {
        /// The pattern between the slashes.
        pattern: String,
        /// Flags such as `i`, `m`, and `s`.
        flags: RegexFlags,
    },
    /// A signed 64-bit integer literal.
    NumberLiteral(i64),
    /// `true` or `false`, case-insensitive.
    BooleanLiteral(bool),
    /// A bare word value; may contain `::`, glob characters, or generics.
    Word(String),
    /// A `$name` variable reference.
    Variable(String),

    /// End of input.
    Eof,
}

/// A token together with its source span.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub token_type: TokenType,
    pub span: Span,
}

impl Token {
    /// Creates a token from its type and span.
    #[must_use]
    pub fn new(token_type: TokenType, span: Span) -> Self {
        Self { token_type, span }
    }
}

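/// Streaming lexer over a borrowed input string.
///
/// Tracks the byte position, line, and column of the cursor so each token
/// carries an accurate [`Span`].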
pub(crate) struct RawLexer<'a> {
    /// The full input being lexed.
    input: &'a str,
    /// Iterator over the characters not yet consumed.
    chars: Chars<'a>,
    /// Byte offset of the cursor in `input`.
    position: usize,
    /// 1-based line number of the cursor.
    line: usize,
    /// 1-based column number of the cursor.
    column: usize,
    /// Single-character lookahead, if one has been peeked.
    peeked: Option<char>,
}

impl<'a> RawLexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            chars: input.chars(),
            position: 0,
            line: 1,
            column: 1,
            peeked: None,
        }
    }

    /// Rewinds the lexer to the beginning of its input.
    pub fn restart(&mut self) {
        self.chars = self.input.chars();
        self.position = 0;
        self.line = 1;
        self.column = 1;
        self.peeked = None;
    }

    /// Appends every token, including the trailing EOF token, to `tokens`.
    pub fn tokenize_into(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexError> {
        loop {
            let token = self.next_token()?;
            let is_eof = matches!(token.token_type, TokenType::Eof);
            tokens.push(token);

            if is_eof {
                break;
            }
        }

        Ok(())
    }

    /// Scans and returns the next token, producing `TokenType::Eof` at end of
    /// input.
    #[allow(clippy::too_many_lines)]
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.skip_whitespace();

        let start_pos = self.position;
        let start_line = self.line;
        let start_col = self.column;

        let Some(ch) = self.peek_char() else {
            return Ok(Token::new(
                TokenType::Eof,
                Span::with_position(self.position, self.position, self.line, self.column),
            ));
        };

        let token_type = if let Some(token) = self.read_simple_token(ch) {
            token
        } else if ch == '$' {
            self.read_variable_token(start_pos, start_line, start_col)?
        } else if ch == '~' {
            self.read_regex_operator(start_pos, start_line, start_col)?
        } else if ch == '>' || ch == '<' {
            self.read_comparison_operator(ch)
        } else if ch == '"' || ch == '\'' {
            let s = self.read_quoted_string(ch)?;
            TokenType::StringLiteral(s)
        } else if ch == '/' {
            let (pattern, flags) = self.read_regex()?;
            TokenType::RegexLiteral { pattern, flags }
        } else if self.is_number_start(ch) {
            let n = self.read_number()?;
            TokenType::NumberLiteral(n)
        } else if Self::is_word_start(ch) {
            self.read_word_token()
        } else {
            return Err(LexError::UnexpectedChar {
                char: ch,
                span: Span::with_position(
                    start_pos,
                    start_pos + ch.len_utf8(),
                    start_line,
                    start_col,
                ),
            });
        };

        Ok(Token::new(
            token_type,
            Span::with_position(start_pos, self.position, start_line, start_col),
        ))
    }

    fn read_simple_token(&mut self, ch: char) -> Option<TokenType> {
        let token = match ch {
            '(' => TokenType::LParen,
            ')' => TokenType::RParen,
            ':' => TokenType::Colon,
            '|' => TokenType::Pipe,
            _ => return None,
        };
        self.next_char();
        Some(token)
    }

    fn read_regex_operator(
        &mut self,
        start_pos: usize,
        start_line: usize,
        start_col: usize,
    ) -> Result<TokenType, LexError> {
        self.next_char();
        if self.peek_char() == Some('=') {
            self.next_char();
            Ok(TokenType::RegexOp)
        } else {
            Err(LexError::UnexpectedChar {
                char: '~',
                span: Span::with_position(start_pos, self.position, start_line, start_col),
            })
        }
    }

    fn read_comparison_operator(&mut self, ch: char) -> TokenType {
        self.next_char();
        let (equal, plain) = if ch == '>' {
            (TokenType::GreaterEq, TokenType::Greater)
        } else {
            (TokenType::LessEq, TokenType::Less)
        };
        if self.peek_char() == Some('=') {
            self.next_char();
            equal
        } else {
            plain
        }
    }

    fn is_number_start(&self, ch: char) -> bool {
        ch.is_ascii_digit() || (ch == '-' && self.peek_ahead(1).is_some_and(|c| c.is_ascii_digit()))
    }

    fn is_word_start(ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn read_variable_token(
        &mut self,
        start_pos: usize,
        start_line: usize,
        start_col: usize,
    ) -> Result<TokenType, LexError> {
        self.next_char(); // consume '$'
        let mut name = String::new();
        while let Some(c) = self.peek_char() {
            if c.is_ascii_alphanumeric() || c == '_' {
                name.push(c);
                self.next_char();
            } else {
                break;
            }
        }

        if name.is_empty() {
            return Err(LexError::UnexpectedChar {
                char: '$',
                span: Span::with_position(start_pos, self.position, start_line, start_col),
            });
        }

        Ok(TokenType::Variable(name))
    }

    fn read_word_token(&mut self) -> TokenType {
        let word = self.read_word();
        match word.to_uppercase().as_str() {
            "AND" => TokenType::And,
            "OR" => TokenType::Or,
            "NOT" => TokenType::Not,
            "TRUE" => TokenType::BooleanLiteral(true),
            "FALSE" => TokenType::BooleanLiteral(false),
            _ => {
                // A word followed by an operator is a field identifier;
                // otherwise it is a plain word value.
                self.skip_whitespace();
                match self.peek_char() {
                    Some(':' | '~' | '>' | '<') => TokenType::Identifier(word),
                    _ => TokenType::Word(word),
                }
            }
        }
    }

    /// Returns the next character without consuming it.
    fn peek_char(&mut self) -> Option<char> {
        if self.peeked.is_none() {
            self.peeked = self.chars.next();
        }
        self.peeked
    }

    /// Looks `n` characters ahead of the cursor without consuming anything;
    /// `peek_ahead(0)` matches `peek_char`.
    fn peek_ahead(&self, n: usize) -> Option<char> {
        self.input[self.position..].chars().nth(n)
    }

    /// Consumes the next character, updating position, line, and column.
    fn next_char(&mut self) -> Option<char> {
        let ch = if let Some(c) = self.peeked.take() {
            Some(c)
        } else {
            self.chars.next()
        };

        if let Some(c) = ch {
            self.position += c.len_utf8();
            if c == '\n' {
                self.line += 1;
                self.column = 1;
            } else {
                self.column += 1;
            }
        }

        ch
    }

    fn skip_whitespace(&mut self) {
        while let Some(c) = self.peek_char() {
            if c.is_whitespace() {
                self.next_char();
            } else {
                break;
            }
        }
    }

    fn read_quoted_string(&mut self, quote: char) -> Result<String, LexError> {
        let start_pos = self.position;
        let start_line = self.line;
        let start_col = self.column;
        self.next_char(); // consume the opening quote
        let mut result = String::new();

        loop {
            match self.next_char() {
                Some(c) if c == quote => {
                    return Ok(result);
                }
                Some('\\') => {
                    let escaped = self.read_escape_sequence(start_pos, start_line, start_col)?;
                    result.push(escaped);
                }
                Some(c) => result.push(c),
                None => {
                    return Err(LexError::UnterminatedString {
                        span: Span::with_position(start_pos, self.position, start_line, start_col),
                    });
                }
            }
        }
    }

    fn read_escape_sequence(
        &mut self,
        start_pos: usize,
        start_line: usize,
        start_col: usize,
    ) -> Result<char, LexError> {
        match self.next_char() {
            Some('"') => Ok('"'),
            Some('\'') => Ok('\''),
            Some('\\') => Ok('\\'),
            Some('n') => Ok('\n'),
            Some('t') => Ok('\t'),
            Some('r') => Ok('\r'),
            Some('u') => self.read_unicode_escape(),
            Some('*') => Ok('*'),
            Some('?') => Ok('?'),
            Some('[') => Ok('['),
            Some(']') => Ok(']'),
            Some('{') => Ok('{'),
            Some('}') => Ok('}'),
            Some(c) => Err(LexError::InvalidEscape {
                char: c,
                span: Span::with_position(self.position - 2, self.position, self.line, self.column),
            }),
            None => Err(LexError::UnterminatedString {
                span: Span::with_position(start_pos, self.position, start_line, start_col),
            }),
        }
    }

    fn read_unicode_escape(&mut self) -> Result<char, LexError> {
        let hex = self.read_hex_digits(4)?;
        let code_point =
            u32::from_str_radix(&hex, 16).map_err(|_| LexError::InvalidUnicodeEscape {
                got: hex.chars().next().unwrap_or('?'),
                span: Span::with_position(
                    self.position - hex.len() - 2,
                    self.position,
                    self.line,
                    self.column,
                ),
            })?;
        let ch = char::from_u32(code_point).ok_or_else(|| LexError::InvalidUnicodeEscape {
            got: hex.chars().next().unwrap_or('?'),
            span: Span::with_position(
                self.position - hex.len() - 2,
                self.position,
                self.line,
                self.column,
            ),
        })?;
        Ok(ch)
    }

    fn read_regex(&mut self) -> Result<(String, RegexFlags), LexError> {
        let start_pos = self.position;
        let start_line = self.line;
        let start_col = self.column;
        self.next_char(); // consume the opening '/'
        let pattern = self.read_regex_pattern(start_pos, start_line, start_col)?;
        let flags = self.read_regex_flags(start_pos, start_line, start_col, &pattern)?;
        self.validate_regex_pattern(&pattern, &flags, start_pos, start_line, start_col)?;
        Ok((pattern, flags))
    }

    fn read_regex_pattern(
        &mut self,
        start_pos: usize,
        start_line: usize,
        start_col: usize,
    ) -> Result<String, LexError> {
        let mut pattern = String::new();

        loop {
            match self.next_char() {
                Some('/') => {
                    // A '/' preceded by an odd number of backslashes is
                    // escaped and belongs to the pattern.
                    let trailing_backslashes =
                        pattern.chars().rev().take_while(|&c| c == '\\').count();

                    if trailing_backslashes % 2 == 1 {
                        pattern.push('/');
                        continue;
                    }
                    break;
                }
                Some(c) => pattern.push(c),
                None => {
                    return Err(LexError::UnterminatedRegex {
                        span: Span::with_position(start_pos, self.position, start_line, start_col),
                    });
                }
            }
        }

        Ok(pattern)
    }

    fn read_regex_flags(
        &mut self,
        start_pos: usize,
        start_line: usize,
        start_col: usize,
        pattern: &str,
    ) -> Result<RegexFlags, LexError> {
        let mut flags = RegexFlags::default();
        while let Some(ch) = self.peek_char() {
            match ch {
                'i' => {
                    flags.case_insensitive = true;
                    self.next_char();
                }
                'm' => {
                    flags.multiline = true;
                    self.next_char();
                }
                's' => {
                    flags.dot_all = true;
                    self.next_char();
                }
                _ if ch.is_ascii_alphabetic() => {
                    return Err(LexError::InvalidRegex {
                        pattern: pattern.to_string(),
                        error: format!("Unknown regex flag '{ch}'"),
                        span: Span::with_position(
                            start_pos,
                            self.position + 1,
                            start_line,
                            start_col,
                        ),
                    });
                }
                _ => break,
            }
        }

        Ok(flags)
    }

    fn validate_regex_pattern(
        &self,
        pattern: &str,
        flags: &RegexFlags,
        start_pos: usize,
        start_line: usize,
        start_col: usize,
    ) -> Result<(), LexError> {
        let mut builder = regex::RegexBuilder::new(pattern);
        builder
            .case_insensitive(flags.case_insensitive)
            .multi_line(flags.multiline)
            .dot_matches_new_line(flags.dot_all);

        if let Err(e) = builder.build() {
            return Err(LexError::InvalidRegex {
                pattern: pattern.to_string(),
                error: e.to_string(),
                span: Span::with_position(start_pos, self.position, start_line, start_col),
            });
        }

        Ok(())
    }

    fn read_hex_digits(&mut self, count: usize) -> Result<String, LexError> {
        let mut hex = String::new();

        for _ in 0..count {
            match self.next_char() {
                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
                Some(c) => {
                    return Err(LexError::InvalidUnicodeEscape {
                        got: c,
                        span: Span::with_position(
                            self.position - 1,
                            self.position,
                            self.line,
                            self.column.saturating_sub(1),
                        ),
                    });
                }
                None => {
                    return Err(LexError::InvalidUnicodeEscape {
                        got: '?',
                        span: Span::with_position(
                            self.position,
                            self.position,
                            self.line,
                            self.column,
                        ),
                    });
                }
            }
        }

        Ok(hex)
    }

    fn read_number(&mut self) -> Result<i64, LexError> {
        let start_pos = self.position;
        let start_line = self.line;
        let start_col = self.column;
        let mut num_str = String::new();

        if self.peek_char() == Some('-') {
            num_str.push('-');
            self.next_char();
        }

        while let Some(c) = self.peek_char() {
            if c.is_ascii_digit() {
                num_str.push(c);
                self.next_char();
            } else if c == '_' {
                // Underscore separators are allowed and ignored.
                self.next_char();
            } else {
                break;
            }
        }

        num_str
            .parse::<i64>()
            .map_err(|e| LexError::NumberOverflow {
                text: num_str.clone(),
                error: e.to_string(),
                span: Span::with_position(start_pos, self.position, start_line, start_col),
            })
    }

    /// Reads a bare word, allowing `::` path separators, glob characters, and
    /// balanced generic segments such as `Map<String,List<Order>>`.
    fn read_word(&mut self) -> String {
        let mut word = String::new();

        while let Some(c) = self.peek_char() {
            match self.classify_word_char(c) {
                WordCharType::Basic => {
                    word.push(c);
                    self.next_char();
                }
                WordCharType::DoubleColon => {
                    word.push_str("::");
                    self.next_char();
                    self.next_char();
                }
                WordCharType::GenericStart => {
                    self.consume_generic_segment(&mut word);
                }
                WordCharType::End => break,
            }
        }

        word
    }

    fn classify_word_char(&self, c: char) -> WordCharType {
        if c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '*' | '?' | '/' | '-' | '[' | ']') {
            WordCharType::Basic
        } else if c == ':' && self.peek_ahead(1) == Some(':') {
            WordCharType::DoubleColon
        } else if c == '<' && self.has_generic_closing_angle() {
            WordCharType::GenericStart
        } else {
            WordCharType::End
        }
    }

    /// Consumes a `<...>` segment, tracking nesting depth until the angle
    /// brackets balance or whitespace intervenes.
    fn consume_generic_segment(&mut self, word: &mut String) {
        word.push('<');
        self.next_char();

        let mut depth = 1usize;
        while let Some(ch) = self.peek_char() {
            if ch.is_whitespace() {
                break;
            }
            depth = match ch {
                '<' => depth.saturating_add(1),
                '>' => depth.saturating_sub(1),
                _ => depth,
            };
            word.push(ch);
            self.next_char();
            if depth == 0 {
                break;
            }
        }
    }

    /// Returns true if the upcoming `<` opens a generic segment whose closing
    /// `>` appears before the next whitespace.
    fn has_generic_closing_angle(&self) -> bool {
        let mut depth = 0usize;

        for ch in self.input[self.position..].chars() {
            if ch.is_whitespace() {
                return false;
            }
            match ch {
                '<' => depth = depth.saturating_add(1),
                '>' => {
                    if depth == 0 {
                        return false;
                    }
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        return true;
                    }
                }
                _ => {}
            }
        }

        false
    }
}

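/// Classification of the next character while reading a bare word.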
enum WordCharType {
    /// An ordinary word character.
    Basic,
    /// A `::` path separator.
    DoubleColon,
    /// A `<` that opens a balanced generic segment.
    GenericStart,
    /// Any character that ends the word.
    End,
}

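/// Convenience wrapper that owns a [`RawLexer`] and tokenizes in one call.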
pub struct Lexer<'a> {
    raw: RawLexer<'a>,
}

impl<'a> Lexer<'a> {
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            raw: RawLexer::new(input),
        }
    }

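    /// Tokenizes the entire input, returning every token including the
    /// trailing [`TokenType::Eof`].
    ///
    /// # Errors
    ///
    /// Returns a [`LexError`] if the input contains an invalid token.
    ///
    /// A minimal usage sketch (the `kind` field name is illustrative):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new("kind:function");
    /// let tokens = lexer.tokenize()?;
    /// assert!(matches!(tokens.last().unwrap().token_type, TokenType::Eof));
    /// ```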
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
        let mut tokens = Vec::with_capacity(16);
        self.raw.restart();
        self.raw.tokenize_into(&mut tokens)?;
        Ok(tokens)
    }

    /// Scans and returns the next token.
    ///
    /// # Errors
    ///
    /// Returns a [`LexError`] if the next token is invalid.
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.raw.next_token()
    }
}

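/// Controls when a reused token buffer is shrunk: once its capacity exceeds
/// `max_capacity * shrink_ratio`, it is shrunk back down to `max_capacity`.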
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct ShrinkPolicy {
    pub max_capacity: usize,
    pub shrink_ratio: usize,
}

impl Default for ShrinkPolicy {
    fn default() -> Self {
        Self {
            max_capacity: 256,
            shrink_ratio: 8,
        }
    }
}

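// Pool tuning knobs, re-read from the environment on each `with_lexer` call:
// `SQRY_LEXER_POOL_MAX` caps the per-thread pool size, `SQRY_LEXER_POOL_MAX_CAP`
// sets the capacity retained after shrinking, and `SQRY_LEXER_POOL_SHRINK_RATIO`
// sets the capacity multiple that triggers a shrink.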
const POOL_MAX_DEFAULT: usize = 4;
const ENV_POOL_MAX: &str = "SQRY_LEXER_POOL_MAX";
const ENV_POOL_MAX_CAP: &str = "SQRY_LEXER_POOL_MAX_CAP";
const ENV_POOL_SHRINK_RATIO: &str = "SQRY_LEXER_POOL_SHRINK_RATIO";

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct PoolConfig {
    max_size: usize,
    shrink_policy: ShrinkPolicy,
}

impl PoolConfig {
    fn default() -> Self {
        Self {
            max_size: POOL_MAX_DEFAULT,
            shrink_policy: ShrinkPolicy::default(),
        }
    }

    fn from_environment() -> Self {
        let mut config = Self::default();

        if let Some(value) = read_env_usize(ENV_POOL_MAX) {
            config.max_size = value;
        }

        if let Some(value) = read_env_usize(ENV_POOL_MAX_CAP) {
            config.shrink_policy.max_capacity = value.max(1);
        }

        if let Some(value) = read_env_usize(ENV_POOL_SHRINK_RATIO) {
            config.shrink_policy.shrink_ratio = value.max(1);
        }

        config
    }
}

fn read_env_usize(var: &str) -> Option<usize> {
    match env::var(var) {
        Ok(value) => match value.parse::<usize>() {
            Ok(parsed) => Some(parsed),
            Err(err) => {
                trace!("Ignoring invalid value for {var}: {err}");
                None
            }
        },
        Err(std::env::VarError::NotPresent) => None,
        Err(std::env::VarError::NotUnicode(_)) => {
            trace!("Ignoring non-unicode value for {var}");
            None
        }
    }
}

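// Per-thread pool of reusable lexers; `with_lexer` is the entry point.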
thread_local! {
    static LEXER_POOL: RefCell<LexerPool> = RefCell::new(LexerPool::new(PoolConfig::default()));
}

struct LexerPool {
    stash: Vec<ReusableLexer>,
    in_flight: usize,
    config: PoolConfig,
}

impl LexerPool {
    fn new(config: PoolConfig) -> Self {
        Self {
            stash: Vec::new(),
            in_flight: 0,
            config,
        }
    }

    fn apply_config(&mut self, config: PoolConfig) {
        if self.config == config {
            return;
        }

        trace!(
            "sqry::query::lexer: updating pool config -> max_size={}, max_capacity={}, shrink_ratio={}",
            config.max_size, config.shrink_policy.max_capacity, config.shrink_policy.shrink_ratio
        );

        self.config = config;
        self.stash.clear();
        self.in_flight = 0;
    }

    fn acquire(&mut self) -> LexerHandle {
        if let Some(lexer) = self.stash.pop() {
            self.in_flight += 1;
            return LexerHandle::pooled(lexer);
        }

        if self.in_flight < self.config.max_size {
            self.in_flight += 1;
            let lexer = ReusableLexer::with_policy(self.config.shrink_policy);
            return LexerHandle::pooled(lexer);
        }

        LexerHandle::temporary(ReusableLexer::with_policy(self.config.shrink_policy))
    }

    fn release(&mut self, lexer: ReusableLexer) {
        if self.config.max_size == 0 {
            self.in_flight = self.in_flight.saturating_sub(1);
            return;
        }

        self.in_flight = self.in_flight.saturating_sub(1);
        if self.stash.len() < self.config.max_size {
            self.stash.push(lexer);
        }
    }

    #[cfg(test)]
    fn stats(&self) -> (usize, usize, PoolConfig) {
        (self.stash.len(), self.in_flight, self.config)
    }

    #[cfg(test)]
    fn reset(&mut self, config: PoolConfig) {
        self.stash.clear();
        self.in_flight = 0;
        self.config = config;
    }
}

struct LexerHandle {
    lexer: Option<ReusableLexer>,
    pooled: bool,
}

impl LexerHandle {
    fn pooled(lexer: ReusableLexer) -> Self {
        Self {
            lexer: Some(lexer),
            pooled: true,
        }
    }

    fn temporary(lexer: ReusableLexer) -> Self {
        Self {
            lexer: Some(lexer),
            pooled: false,
        }
    }

    fn lexer_mut(&mut self) -> &mut ReusableLexer {
        self.lexer.as_mut().expect("lexer handle missing lexer")
    }

    fn reset(&mut self, input: &str) {
        self.lexer_mut().reset(input);
    }

    fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
        self.lexer_mut().tokenize()
    }
}

impl Drop for LexerHandle {
    fn drop(&mut self) {
        if !self.pooled {
            return;
        }

        if let Some(lexer) = self.lexer.take() {
            LEXER_POOL.with(|cell| {
                cell.borrow_mut().release(lexer);
            });
        }
    }
}

#[cfg(test)]
pub(crate) fn configure_pool_for_tests(max_size: usize, shrink_policy: ShrinkPolicy) {
    LEXER_POOL.with(|cell| {
        cell.borrow_mut().reset(PoolConfig {
            max_size,
            shrink_policy,
        });
    });
}

#[cfg(test)]
pub(crate) fn reset_pool_to_default_for_tests() {
    configure_pool_for_tests(POOL_MAX_DEFAULT, ShrinkPolicy::default());
}

#[cfg(test)]
pub(crate) fn pool_stats_for_tests() -> (usize, usize, usize) {
    LEXER_POOL.with(|cell| {
        let (stash, in_flight, config) = cell.borrow().stats();
        (stash, in_flight, config.max_size)
    })
}

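/// Runs `f` over the tokens of `input`, borrowing a lexer from the
/// thread-local pool (or building a temporary one when the pool is exhausted
/// or disabled with `SQRY_LEXER_POOL_MAX=0`).
///
/// A minimal sketch of the call pattern:
///
/// ```ignore
/// let count = with_lexer("kind:function", |batch| Ok(batch.as_slice().len()))?;
/// ```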
pub(crate) fn with_lexer<F, T>(input: &str, f: F) -> Result<T, LexError>
where
    F: FnOnce(TokenBatch<'_>) -> Result<T, LexError>,
{
    let config = PoolConfig::from_environment();

    if config.max_size == 0 {
        LEXER_POOL.with(|cell| {
            cell.borrow_mut().apply_config(config);
        });
        let mut lexer = ReusableLexer::with_policy(config.shrink_policy);
        lexer.reset(input);
        let batch = lexer.tokenize()?;
        return f(batch);
    }

    let mut handle = LEXER_POOL.with(|cell| {
        let mut pool = cell.borrow_mut();
        pool.apply_config(config);
        pool.acquire()
    });

    handle.reset(input);
    let batch = handle.tokenize()?;
    let result = f(batch);
    drop(handle);
    result
}

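/// Tokenizes `input` using the thread-local lexer pool and returns an owned
/// token vector.
///
/// # Errors
///
/// Returns a [`LexError`] if the input contains an invalid token.
///
/// A minimal sketch:
///
/// ```ignore
/// let tokens = tokenize_with_pool("name:test")?;
/// assert!(!tokens.is_empty());
/// ```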
pub fn tokenize_with_pool(input: &str) -> Result<Vec<Token>, LexError> {
    with_lexer(input, |batch| Ok(batch.into_vec()))
}

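/// Debug-only counters for observing buffer reuse and shrinking.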
#[cfg(debug_assertions)]
#[allow(dead_code)]
#[derive(Debug, Default, Clone, Copy)]
struct LexerDiagnostics {
    reuse_count: usize,
    max_capacity_seen: usize,
    shrink_count: usize,
}

#[cfg(debug_assertions)]
#[allow(dead_code)]
impl LexerDiagnostics {
    fn record_reuse(&mut self, capacity: usize) {
        self.reuse_count += 1;
        if capacity > self.max_capacity_seen {
            self.max_capacity_seen = capacity;
        }
    }

    fn record_shrink(&mut self) {
        self.shrink_count += 1;
    }
}

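/// A lexer that owns its input and token buffers so both can be reused
/// across calls.
///
/// A minimal sketch of the reuse cycle:
///
/// ```ignore
/// let mut lexer = ReusableLexer::new();
/// lexer.reset("kind:function");
/// let tokens = lexer.tokenize()?.into_vec();
/// lexer.reset("name:test"); // reuses the same buffers
/// ```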
#[allow(dead_code)]
pub(crate) struct ReusableLexer {
    input: String,
    token_buffer: Vec<Token>,
    shrink_policy: ShrinkPolicy,
    #[cfg(debug_assertions)]
    diagnostics: LexerDiagnostics,
}

#[allow(dead_code)]
impl ReusableLexer {
    pub fn new() -> Self {
        Self::with_policy(ShrinkPolicy::default())
    }

    pub fn with_policy(shrink_policy: ShrinkPolicy) -> Self {
        Self {
            input: String::new(),
            token_buffer: Vec::with_capacity(16),
            shrink_policy,
            #[cfg(debug_assertions)]
            diagnostics: LexerDiagnostics::default(),
        }
    }

    /// Replaces the input, keeping the capacity of both buffers.
    pub fn reset(&mut self, input: &str) {
        self.input.clear();
        self.input.push_str(input);
        self.token_buffer.clear();
    }

    /// Tokenizes the current input into the reused buffer and returns a
    /// borrowed [`TokenBatch`] over it.
    pub fn tokenize(&mut self) -> Result<TokenBatch<'_>, LexError> {
        self.token_buffer.clear();
        let mut raw = RawLexer::new(self.input.as_str());
        raw.tokenize_into(&mut self.token_buffer)?;
        #[cfg(debug_assertions)]
        self.diagnostics.record_reuse(self.token_buffer.capacity());
        Ok(TokenBatch {
            tokens: &mut self.token_buffer,
            shrink_policy: self.shrink_policy,
            #[cfg(debug_assertions)]
            diagnostics: &mut self.diagnostics,
        })
    }

    #[cfg(debug_assertions)]
    fn diagnostics(&self) -> &LexerDiagnostics {
        &self.diagnostics
    }
}

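/// A borrowed view of the tokens from one `ReusableLexer::tokenize` call.
///
/// Dropping the batch clears the underlying buffer and shrinks it according
/// to the [`ShrinkPolicy`]. A minimal sketch:
///
/// ```ignore
/// let batch = lexer.tokenize()?;
/// let n = batch.as_slice().len(); // borrow without copying
/// let owned = batch.into_vec(); // or drain into an owned Vec
/// ```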
1095#[allow(dead_code)]
1103pub(crate) struct TokenBatch<'a> {
1104 tokens: &'a mut Vec<Token>,
1105 shrink_policy: ShrinkPolicy,
1106 #[cfg(debug_assertions)]
1107 diagnostics: &'a mut LexerDiagnostics,
1108}
1109
1110#[allow(dead_code)]
1111impl TokenBatch<'_> {
1112 pub fn as_slice(&self) -> &[Token] {
1113 self.tokens.as_slice()
1114 }
1115
1116 #[allow(unused_mut)]
1117 pub fn into_vec(mut self) -> Vec<Token> {
1118 let result = self.tokens.drain(..).collect();
1119 #[cfg(debug_assertions)]
1120 let _ = &mut *self.diagnostics; result
1122 }
1123}
1124
1125impl Drop for TokenBatch<'_> {
1126 fn drop(&mut self) {
1127 if !self.tokens.is_empty() {
1128 self.tokens.clear();
1129 }
1130
1131 let shrink_threshold = self
1132 .shrink_policy
1133 .max_capacity
1134 .saturating_mul(self.shrink_policy.shrink_ratio);
1135 if shrink_threshold > 0 && self.tokens.capacity() > shrink_threshold {
1136 self.tokens.shrink_to(self.shrink_policy.max_capacity);
1137 #[cfg(debug_assertions)]
1138 self.diagnostics.record_shrink();
1139 }
1140 }
1141}
1142
#[cfg(test)]
mod tests {
    use super::*;
    use std::panic::{AssertUnwindSafe, catch_unwind};
    use std::sync::{Mutex, OnceLock};

    #[cfg(feature = "dhat-heap")]
    use dhat::{HeapStats, Profiler};

    fn reset_pool_from_env() {
        let config = PoolConfig::from_environment();
        LEXER_POOL.with(|cell| {
            cell.borrow_mut().reset(config);
        });
    }

    fn reset_pool_default() {
        unsafe {
            std::env::remove_var(ENV_POOL_MAX);
            std::env::remove_var(ENV_POOL_MAX_CAP);
            std::env::remove_var(ENV_POOL_SHRINK_RATIO);
        }
        reset_pool_from_env();
    }

    fn set_env(var: &str, value: &str) {
        unsafe {
            std::env::set_var(var, value);
        }
    }

    fn remove_env(var: &str) {
        unsafe {
            std::env::remove_var(var);
        }
    }

    fn env_lock() -> &'static Mutex<()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(()))
    }

    #[test]
    fn reusable_lexer_reuses_buffer_across_calls() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let first_ptr = {
            let batch = lexer.tokenize().unwrap();
            let ptr = batch.as_slice().as_ptr();
            assert!(!batch.as_slice().is_empty());
            ptr
        };
        assert_eq!(first_ptr, lexer.token_buffer.as_ptr());

        lexer.reset("name:test");
        let second_ptr = {
            let batch = lexer.tokenize().unwrap();
            let ptr = batch.as_slice().as_ptr();
            assert!(!batch.as_slice().is_empty());
            ptr
        };
        assert_eq!(second_ptr, lexer.token_buffer.as_ptr());
        assert_eq!(first_ptr, second_ptr);
        #[cfg(debug_assertions)]
        {
            let diagnostics = lexer.diagnostics();
            assert!(diagnostics.reuse_count >= 2);
            assert!(diagnostics.max_capacity_seen >= lexer.token_buffer.capacity());
        }
    }

    #[test]
    fn reusable_lexer_clears_buffer_on_panic() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let result = catch_unwind(AssertUnwindSafe(|| {
            let _batch = lexer.tokenize().unwrap();
            panic!("boom");
        }));

        assert!(result.is_err());
        assert_eq!(lexer.token_buffer.len(), 0);
    }

    #[test]
    fn reusable_lexer_into_vec_drains_tokens() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let tokens = {
            let batch = lexer.tokenize().unwrap();
            batch.into_vec()
        };

        assert_eq!(tokens.len(), 4);
        assert_eq!(lexer.token_buffer.len(), 0);
    }

    #[test]
    fn reusable_lexer_shrink_policy_applies() {
        let policy = ShrinkPolicy {
            max_capacity: 8,
            shrink_ratio: 2,
        };

        let mut lexer = ReusableLexer::with_policy(policy);
        let large_query = (0..128)
            .map(|i| format!("name:value{i}"))
            .collect::<Vec<_>>()
            .join(" ");
        lexer.reset(&large_query);

        {
            let batch = lexer.tokenize().unwrap();
            let _ = batch.into_vec();
        }

        if lexer.token_buffer.capacity() <= policy.max_capacity * policy.shrink_ratio {
            lexer
                .token_buffer
                .reserve(policy.max_capacity * policy.shrink_ratio * 2);
        }
        assert!(lexer.token_buffer.capacity() > policy.max_capacity * policy.shrink_ratio);

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            drop(batch);
        }

        assert!(lexer.token_buffer.capacity() <= policy.max_capacity);

        #[cfg(debug_assertions)]
        {
            let diagnostics = lexer.diagnostics();
            assert!(diagnostics.shrink_count >= 1);
        }
    }

    #[test]
    fn lexer_pool_returns_lexers_to_stash() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        assert_eq!(PoolConfig::from_environment().max_size, POOL_MAX_DEFAULT);

        let tokens = with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
        assert_eq!(tokens.len(), 4);

        LEXER_POOL.with(|cell| {
            let (stash_len, in_flight, config) = cell.borrow().stats();
            assert_eq!(config.max_size, POOL_MAX_DEFAULT);
            assert_eq!(in_flight, 0);
            assert_eq!(stash_len, 1);
        });
    }

    #[test]
    fn lexer_pool_respects_zero_capacity_env() {
        let _guard = env_lock().lock().unwrap();
        set_env(ENV_POOL_MAX, "0");
        reset_pool_from_env();

        let tokens = with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
        assert_eq!(tokens.len(), 4);

        LEXER_POOL.with(|cell| {
            let (stash_len, in_flight, config) = cell.borrow().stats();
            assert_eq!(config.max_size, 0);
            assert_eq!(in_flight, 0);
            assert_eq!(stash_len, 0);
        });

        remove_env(ENV_POOL_MAX);
        reset_pool_default();
    }

    #[test]
    fn lexer_pool_reuses_single_slot() {
        let _guard = env_lock().lock().unwrap();
        set_env(ENV_POOL_MAX, "1");
        reset_pool_from_env();

        assert_eq!(PoolConfig::from_environment().max_size, 1);

        for query in ["kind:function", "name:test"] {
            let _ = with_lexer(query, |batch| Ok(batch.into_vec())).unwrap();
        }

        LEXER_POOL.with(|cell| {
            let (stash_len, in_flight, config) = cell.borrow().stats();
            assert_eq!(config.max_size, 1);
            assert_eq!(in_flight, 0);
            assert_eq!(stash_len, 1);
        });

        remove_env(ENV_POOL_MAX);
        reset_pool_default();
    }

    #[test]
    fn lexer_handles_double_colon_in_words() {
        let mut lexer = Lexer::new("callers:Player::takeDamage");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 4); // identifier, colon, word, EOF
        assert_eq!(
            tokens[0].token_type,
            TokenType::Identifier("callers".to_string())
        );
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert_eq!(
            tokens[2].token_type,
            TokenType::Word("Player::takeDamage".to_string())
        );
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    #[test]
    #[ignore = "Test depends on clean env_lock state. Run in isolation with: cargo test -p sqry-core --lib with_lexer_allows_reentrant_usage -- --ignored --test-threads=1"]
    fn with_lexer_allows_reentrant_usage() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        let result = with_lexer("kind:function", |batch| {
            assert!(!batch.as_slice().is_empty());
            with_lexer("name:test", |inner_batch| {
                assert!(!inner_batch.as_slice().is_empty());
                Ok(())
            })
        });

        assert!(result.is_ok());
        reset_pool_default();
    }

    #[test]
    fn lexer_pool_thread_local_isolation() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        let handles: Vec<_> = (0..4)
            .map(|_| {
                std::thread::spawn(|| {
                    for _ in 0..50 {
                        for query in ["kind:function", "name:test", "lang:rust"] {
                            with_lexer(query, |batch| {
                                assert!(!batch.as_slice().is_empty());
                                Ok(batch.into_vec())
                            })
                            .unwrap();
                        }
                    }

                    let (stash, in_flight, max_size) = crate::query::lexer::pool_stats_for_tests();
                    assert!(stash <= max_size);
                    assert_eq!(in_flight, 0);
                })
            })
            .collect();

        for handle in handles {
            handle.join().unwrap();
        }

        reset_pool_default();
    }

    #[cfg(feature = "dhat-heap")]
    #[test]
    #[ignore = "Heap profiling test must run in isolation. Run with: cargo test -p sqry-core --lib lexer_reuse_minimizes_heap_allocations -- --ignored --test-threads=1"]
    fn lexer_reuse_minimizes_heap_allocations() {
        let _guard = env_lock().lock().unwrap();
        reset_pool_default();

        let profiler = Profiler::new_heap();

        for _ in 0..5 {
            with_lexer("kind:function", |batch| Ok(batch.into_vec())).unwrap();
        }

        let stats = HeapStats::get();
        drop(profiler);
        assert!(
            stats.total_blocks <= 65,
            "expected limited allocations, observed {} blocks (threshold accounts for plugin loading in integration tests)",
            stats.total_blocks
        );

        reset_pool_default();
    }

    #[test]
    fn reusable_lexer_capacity_growth_and_retention() {
        let mut lexer = ReusableLexer::new();

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }
        let initial_capacity = lexer.token_buffer.capacity();

        let large_query = (0..50)
            .map(|i| format!("name:value{i}"))
            .collect::<Vec<_>>()
            .join(" AND ");
        lexer.reset(&large_query);
        {
            let batch = lexer.tokenize().unwrap();
            assert!(batch.as_slice().len() > 50);
        }
        let grown_capacity = lexer.token_buffer.capacity();
        assert!(grown_capacity > initial_capacity);

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }
        let retained_capacity = lexer.token_buffer.capacity();
        assert_eq!(retained_capacity, grown_capacity);

        #[cfg(debug_assertions)]
        {
            let diagnostics = lexer.diagnostics();
            assert!(diagnostics.reuse_count >= 3);
            assert!(diagnostics.max_capacity_seen >= grown_capacity);
        }
    }

    #[test]
    fn reusable_lexer_error_recovery_clears_buffer() {
        let mut lexer = ReusableLexer::new();

        lexer.reset("kind:function");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }

        lexer.reset("kind@invalid");
        let result = lexer.tokenize();
        assert!(result.is_err());
        drop(result);

        lexer.reset("name:test");
        {
            let batch = lexer.tokenize().unwrap();
            assert!(!batch.as_slice().is_empty());
        }
    }

    #[test]
    fn reusable_lexer_panic_after_into_vec_has_clean_buffer() {
        let mut lexer = ReusableLexer::new();
        lexer.reset("kind:function");

        let result = catch_unwind(AssertUnwindSafe(|| {
            let batch = lexer.tokenize().unwrap();
            let _tokens = batch.into_vec();
            panic!("boom");
        }));

        assert!(result.is_err());
        assert_eq!(lexer.token_buffer.len(), 0);
    }

    #[test]
    fn test_tokenize_simple_query() {
        let mut lexer = Lexer::new("kind:function");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "kind"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "function"));
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    #[test]
    fn test_tokenize_generic_type_value() {
        let mut lexer = Lexer::new("returns:Optional<User>");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "returns"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "Optional<User>"));
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    #[test]
    fn test_tokenize_nested_generic_value() {
        let mut lexer = Lexer::new("returns:Map<String,List<Order>>");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "returns"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(
            matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "Map<String,List<Order>>")
        );
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    #[test]
    fn test_tokenize_numeric_comparison_after_identifier() {
        let mut lexer = Lexer::new("line>10");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "line"));
        assert!(matches!(tokens[1].token_type, TokenType::Greater));
        assert!(matches!(tokens[2].token_type, TokenType::NumberLiteral(10)));
        assert!(matches!(tokens[3].token_type, TokenType::Eof));
    }

    #[test]
    fn test_tokenize_keywords_case_insensitive() {
        let mut lexer = Lexer::new("AND and Or NOT not");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::And));
        assert!(matches!(tokens[1].token_type, TokenType::And));
        assert!(matches!(tokens[2].token_type, TokenType::Or));
        assert!(matches!(tokens[3].token_type, TokenType::Not));
        assert!(matches!(tokens[4].token_type, TokenType::Not));
    }

    #[test]
    fn test_tokenize_operators() {
        let mut lexer = Lexer::new(": ~= > < >= <=");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Colon));
        assert!(matches!(tokens[1].token_type, TokenType::RegexOp));
        assert!(matches!(tokens[2].token_type, TokenType::Greater));
        assert!(matches!(tokens[3].token_type, TokenType::Less));
        assert!(matches!(tokens[4].token_type, TokenType::GreaterEq));
        assert!(matches!(tokens[5].token_type, TokenType::LessEq));
    }

    #[test]
    fn test_tokenize_parentheses() {
        let mut lexer = Lexer::new("( )");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::LParen));
        assert!(matches!(tokens[1].token_type, TokenType::RParen));
    }

    #[test]
    fn test_tokenize_double_quoted_string() {
        let mut lexer = Lexer::new(r#"name:"hello world""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "name"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(
            matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s == "hello world")
        );
    }

    #[test]
    fn test_tokenize_single_quoted_string() {
        let mut lexer = Lexer::new(r"name:'hello world'");
        let tokens = lexer.tokenize().unwrap();

        assert!(
            matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s == "hello world")
        );
    }

    #[test]
    fn test_string_escape_sequences() {
        let mut lexer = Lexer::new(r#""line1\nline2\ttab\"quote\\backslash""#);
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
            assert_eq!(s, "line1\nline2\ttab\"quote\\backslash");
        } else {
            panic!("Expected string literal");
        }
    }

    #[test]
    fn test_unicode_escape() {
        let mut lexer = Lexer::new(r#""\u0041BC""#);
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
            assert_eq!(s, "ABC");
        } else {
            panic!("Expected string literal");
        }
    }

    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#"name:"unclosed"#);
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::UnterminatedString { .. })));
    }

    #[test]
    fn test_invalid_escape() {
        let mut lexer = Lexer::new(r#""\x""#);
        let result = lexer.tokenize();
        assert!(matches!(
            result,
            Err(LexError::InvalidEscape { char: 'x', .. })
        ));
    }

    #[test]
    fn test_glob_metacharacter_escape_sequences() {
        let mut lexer = Lexer::new(r#""src/\[test\]/\*\?file\{a,b\}""#);
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::StringLiteral(s) = &tokens[0].token_type {
            assert_eq!(s, "src/[test]/*?file{a,b}");
        } else {
            panic!("Expected string literal, got {:?}", tokens[0].token_type);
        }
    }

    #[test]
    fn test_path_predicate_with_escaped_glob_chars() {
        let mut lexer = Lexer::new(r#"path:"src/\[test\]/**""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "path"));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        if let TokenType::StringLiteral(s) = &tokens[2].token_type {
            assert_eq!(s, "src/[test]/**");
        } else {
            panic!("Expected string literal");
        }
    }

    #[test]
    fn test_tokenize_regex() {
        let mut lexer = Lexer::new(r"name~=/^test_/i");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(ref s) if s == "name"));
        assert!(matches!(tokens[1].token_type, TokenType::RegexOp));

        if let TokenType::RegexLiteral { pattern, flags } = &tokens[2].token_type {
            assert_eq!(pattern, "^test_");
            assert!(flags.case_insensitive);
            assert!(!flags.multiline);
            assert!(!flags.dot_all);
        } else {
            panic!("Expected regex literal");
        }
    }

    #[test]
    fn test_regex_multiple_flags() {
        let mut lexer = Lexer::new(r"/pattern/ims");
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::RegexLiteral { flags, .. } = &tokens[0].token_type {
            assert!(flags.case_insensitive);
            assert!(flags.multiline);
            assert!(flags.dot_all);
        } else {
            panic!("Expected regex literal");
        }
    }

    #[test]
    fn test_regex_escaped_slash() {
        let mut lexer = Lexer::new(r"/path\/to\/file/");
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::RegexLiteral { pattern, .. } = &tokens[0].token_type {
            assert_eq!(pattern, r"path\/to\/file");
        } else {
            panic!("Expected regex literal");
        }
    }

    #[test]
    fn test_regex_escaped_backslash_then_slash() {
        // Four trailing backslashes form two escaped backslashes, so the
        // final '/' is unescaped and terminates the literal.
        let mut lexer = Lexer::new(r"/a\\\\/");
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::RegexLiteral { pattern, .. } => {
                assert_eq!(pattern, r"a\\\\");
            }
            _ => panic!("Expected RegexLiteral"),
        }
    }

    #[test]
    fn test_regex_single_escaped_slash() {
        let mut lexer = Lexer::new(r"/a\/b/");
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::RegexLiteral { pattern, .. } => {
                assert_eq!(pattern, r"a\/b");
            }
            _ => panic!("Expected RegexLiteral"),
        }
    }

    #[test]
    fn test_unterminated_regex() {
        let mut lexer = Lexer::new(r"/unclosed");
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::UnterminatedRegex { .. })));
    }

    #[test]
    fn test_invalid_regex_pattern() {
        let mut lexer = Lexer::new(r"/^[/");
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::InvalidRegex { .. })));
    }

    #[test]
    fn test_regex_unknown_flag() {
        let mut lexer = Lexer::new("/pattern/x");
        let err = lexer.next_token().unwrap_err();
        match err {
            LexError::InvalidRegex { error, .. } => {
                assert!(error.contains("Unknown regex flag"));
            }
            _ => panic!("Expected InvalidRegex error"),
        }
    }

    #[test]
    fn test_tokenize_positive_number() {
        let mut lexer = Lexer::new("lines:42");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[2].token_type, TokenType::NumberLiteral(42)));
    }

    #[test]
    fn test_tokenize_negative_number() {
        let mut lexer = Lexer::new("lines:-42");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::NumberLiteral(-42)
        ));
    }

    #[test]
    fn test_tokenize_number_with_underscores() {
        let mut lexer = Lexer::new("lines:1_000_000");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::NumberLiteral(1_000_000)
        ));
    }

    #[test]
    fn test_number_overflow() {
        let mut lexer = Lexer::new("lines:99999999999999999999");
        let result = lexer.tokenize();
        assert!(matches!(result, Err(LexError::NumberOverflow { .. })));
    }

    #[test]
    fn test_tokenize_boolean_true() {
        let mut lexer = Lexer::new("async:true");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::BooleanLiteral(true)
        ));
    }

    #[test]
    fn test_tokenize_boolean_false() {
        let mut lexer = Lexer::new("async:FALSE");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(
            tokens[2].token_type,
            TokenType::BooleanLiteral(false)
        ));
    }

    #[test]
    fn test_tokenize_complex_query() {
        let mut lexer = Lexer::new(r"kind:function AND async:true OR name~=/^test_/i");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[0].token_type, TokenType::Identifier(_)));
        assert!(matches!(tokens[1].token_type, TokenType::Colon));
        assert!(matches!(tokens[2].token_type, TokenType::Word(_)));
        assert!(matches!(tokens[3].token_type, TokenType::And));
        assert!(matches!(tokens[4].token_type, TokenType::Identifier(_)));
        assert!(matches!(tokens[5].token_type, TokenType::Colon));
        assert!(matches!(
            tokens[6].token_type,
            TokenType::BooleanLiteral(true)
        ));
        assert!(matches!(tokens[7].token_type, TokenType::Or));
        assert!(matches!(tokens[8].token_type, TokenType::Identifier(_)));
        assert!(matches!(tokens[9].token_type, TokenType::RegexOp));
        assert!(matches!(
            tokens[10].token_type,
            TokenType::RegexLiteral { .. }
        ));
        assert!(matches!(tokens[11].token_type, TokenType::Eof));
    }

    #[test]
    fn test_whitespace_handling() {
        let mut lexer = Lexer::new(" kind : function ");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens.len(), 4); // identifier, colon, word, EOF
    }

    #[test]
    fn test_unexpected_character() {
        let mut lexer = Lexer::new("kind@function");
        let result = lexer.tokenize();
        assert!(matches!(
            result,
            Err(LexError::UnexpectedChar { char: '@', .. })
        ));
    }

    #[test]
    fn test_empty_string_literal() {
        let mut lexer = Lexer::new(r#"name:"""#);
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[2].token_type, TokenType::StringLiteral(ref s) if s.is_empty()));
    }

    #[test]
    fn test_empty_regex_literal() {
        let mut lexer = Lexer::new(r"name~=//");
        let tokens = lexer.tokenize().unwrap();

        if let TokenType::RegexLiteral { pattern, .. } = &tokens[2].token_type {
            assert_eq!(pattern, "");
        } else {
            panic!("Expected regex literal");
        }
    }

    #[test]
    fn test_span_tracking() {
        let mut lexer = Lexer::new("kind:function");
        let tokens = lexer.tokenize().unwrap();

        assert!(tokens[0].span.start == 0);
        assert!(tokens[0].span.end == 4); // "kind"
        assert!(tokens[1].span.start == 4);
        assert!(tokens[1].span.end == 5); // ":"
        assert!(tokens[2].span.start == 5);
        assert!(tokens[2].span.end == 13); // "function"
    }

    #[test]
    fn test_identifier_vs_word() {
        let mut lexer = Lexer::new("kind:value value");
        let tokens = lexer.tokenize().unwrap();

        assert!(matches!(tokens[2].token_type, TokenType::Word(ref s) if s == "value"));
        assert!(matches!(tokens[3].token_type, TokenType::Word(ref s) if s == "value"));
    }

    #[test]
    fn test_bare_word_with_glob() {
        let mut lexer = Lexer::new("path:src/*.rs");
        lexer.next_token().unwrap(); // path
        lexer.next_token().unwrap(); // :
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::Word(s) => assert_eq!(s, "src/*.rs"),
            _ => panic!(
                "Expected Word with glob pattern, got {:?}",
                token.token_type
            ),
        }
    }

    #[test]
    fn test_bare_word_with_hyphen() {
        let mut lexer = Lexer::new("name:foo-bar");
        lexer.next_token().unwrap(); // name
        lexer.next_token().unwrap(); // :
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::Word(s) => assert_eq!(s, "foo-bar"),
            _ => panic!("Expected Word with hyphen, got {:?}", token.token_type),
        }
    }

    #[test]
    fn test_bare_word_with_dot() {
        let mut lexer = Lexer::new("path:foo.rs");
        lexer.next_token().unwrap(); // path
        lexer.next_token().unwrap(); // :
        let token = lexer.next_token().unwrap();
        match token.token_type {
            TokenType::Word(s) => assert_eq!(s, "foo.rs"),
            _ => panic!("Expected Word with dot, got {:?}", token.token_type),
        }
    }

    #[test]
    fn test_variable_token() {
        let mut lexer = Lexer::new("$name");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2); // variable + EOF
        assert_eq!(
            tokens[0].token_type,
            TokenType::Variable("name".to_string())
        );
        assert!(matches!(tokens[1].token_type, TokenType::Eof));
    }

    #[test]
    fn test_variable_token_with_underscores() {
        let mut lexer = Lexer::new("$my_var");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2); // variable + EOF
        assert_eq!(
            tokens[0].token_type,
            TokenType::Variable("my_var".to_string())
        );
    }

    #[test]
    fn test_pipe_token() {
        let mut lexer = Lexer::new("|");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 2); // pipe + EOF
        assert!(matches!(tokens[0].token_type, TokenType::Pipe));
    }

    #[test]
    fn test_dollar_sign_alone_error() {
        let mut lexer = Lexer::new("$ ");
        let result = lexer.tokenize();
        assert!(
            matches!(result, Err(LexError::UnexpectedChar { char: '$', .. })),
            "Bare '$' should produce an error, got: {result:?}"
        );
    }
}