1use regex::Regex;
2use std::sync::LazyLock;
3
4static CAMEL_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z][A-Z]").unwrap());
5static SNAKE_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z]_[a-z]").unwrap());
6static PASCAL_CASE_RE: LazyLock<Regex> =
7 LazyLock::new(|| Regex::new(r"^[A-Z][a-z]+[A-Z]").unwrap());
8static ACRONYM_PASCAL_RE: LazyLock<Regex> =
9 LazyLock::new(|| Regex::new(r"\b[A-Z]{2,}[A-Z][a-z]").unwrap());
10static DOT_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-zA-Z]\.[a-zA-Z]").unwrap());
11static FILE_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[/\\].*\.\w{1,5}$").unwrap());
12static HEX_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"0x[A-Fa-f0-9]+").unwrap());
13static ERROR_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bERR_\w+").unwrap());
14static NUMERIC_ERROR_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bE\d{4,}").unwrap());
15static TYPESCRIPT_ERROR_RE: LazyLock<Regex> =
16 LazyLock::new(|| Regex::new(r"\bTS\d{4,}\b").unwrap());
17static HTTP_STATUS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[1-5]\d{2}\b").unwrap());
18static IDENTIFIER_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
19 Regex::new(r"\b[A-Za-z_$][A-Za-z0-9_$]*(?:\.[A-Za-z_$][A-Za-z0-9_$]*)*\b").unwrap()
20});
21
22static WINDOWS_ABS_PATH_RE: LazyLock<Regex> =
23 LazyLock::new(|| Regex::new(r"^[A-Za-z]:[\\/][A-Za-z0-9_.\-+?\\/' ]+$").unwrap());
24static WINDOWS_REL_PATH_RE: LazyLock<Regex> =
25 LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(\\[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
26static POSIX_ABS_PATH_RE: LazyLock<Regex> =
27 LazyLock::new(|| Regex::new(r"^/[A-Za-z0-9_.\-+?/' ]+$").unwrap());
28static POSIX_REL_PATH_RE: LazyLock<Regex> =
29 LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(/[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
30static UNC_PATH_RE: LazyLock<Regex> =
31 LazyLock::new(|| Regex::new(r"^\\\\[A-Za-z0-9_.\-+?\\']+$").unwrap());
32static FILENAME_EXEMPTION_RE: LazyLock<Regex> =
33 LazyLock::new(|| Regex::new(r"^[A-Za-z_][A-Za-z0-9_.\-+'? ]*\.[A-Za-z0-9]{1,8}$").unwrap());
34static BRACE_QUANTIFIER_RE: LazyLock<Regex> =
35 LazyLock::new(|| Regex::new(r"\{\d+(?:,\d*)?\}").unwrap());
36static NAMED_CAPTURE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(\?P<[^>]+>").unwrap());
37static CHAR_RANGE_RE: LazyLock<Regex> =
38 LazyLock::new(|| Regex::new(r"[A-Za-z0-9]-[A-Za-z0-9]").unwrap());
39
40const QUESTION_WORDS: &[&str] = &[
41 "how", "what", "where", "why", "when", "which", "who", "does",
42];
43
/// The coarse category `classify` assigns to a query.
///
/// `Hash` is derived alongside `Eq` so the kind can key hash-based
/// collections (for example per-kind counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// A code-style identifier, or a query of at most two words with no
    /// natural-language signal.
    Identifier,
    /// An error code or code identifier combined with natural-language signals.
    Mixed,
    /// An error code with no natural-language signal around it.
    ErrorCode,
    /// An exempted URL, path or file name, or text ending like a file path.
    Path,
    /// Text that looks like a regular expression.
    Regex,
    /// Everything else, including the empty query.
    NaturalLanguage,
}
53
/// Per-kind weighting produced by `weights_for`.
///
/// NOTE(review): the consumer of these numbers is not in this file; they are
/// presumably blend factors for semantic vs. lexical retrieval — confirm.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ShapeWeights {
    /// Weight given to the semantic side.
    pub semantic: f32,
    /// Weight given to the lexical side.
    pub lexical: f32,
    /// Flag set per kind by `weights_for`; its exact use is decided by callers.
    pub should_use_lexical: bool,
}
60
61#[derive(Debug, Clone, Copy, PartialEq)]
62pub struct QueryShape {
63 pub kind: QueryKind,
64 pub weights: ShapeWeights,
65}
66
67pub fn classify(query: &str) -> QueryShape {
68 let trimmed = query.trim();
69 if trimmed.is_empty() {
70 return shape(QueryKind::NaturalLanguage);
71 }
72
73 if pre_tier_exempt(trimmed).is_some() {
74 return shape(QueryKind::Path);
75 }
76
77 if looks_like_regex(trimmed) {
78 return shape(QueryKind::Regex);
79 }
80
81 let words: Vec<&str> = trimmed.split_whitespace().collect();
82 let word_count = words.len();
83 let first_word_lower = words[0].to_ascii_lowercase();
84
85 if FILE_PATH_RE.is_match(trimmed) {
86 return shape(QueryKind::Path);
87 }
88
89 let has_question_word = QUESTION_WORDS.contains(&first_word_lower.as_str());
90 let is_long_phrase = word_count > 2;
91 let is_two_word_concept = is_two_word_lowercase_concept(&words);
92 let has_natural_language_signals = has_question_word || is_long_phrase || is_two_word_concept;
93 let has_error_code = contains_error_code(trimmed, word_count);
94
95 if has_error_code && has_natural_language_signals {
96 return shape(QueryKind::Mixed);
97 }
98
99 if has_error_code {
100 return shape(QueryKind::ErrorCode);
101 }
102
103 let has_code_identifier = CAMEL_CASE_RE.is_match(trimmed)
104 || SNAKE_CASE_RE.is_match(trimmed)
105 || PASCAL_CASE_RE.is_match(trimmed)
106 || ACRONYM_PASCAL_RE.is_match(trimmed)
107 || DOT_PATH_RE.is_match(trimmed);
108
109 if has_code_identifier && has_natural_language_signals {
110 return shape(QueryKind::Mixed);
111 }
112
113 if has_code_identifier || (word_count <= 2 && !has_natural_language_signals) {
114 return shape(QueryKind::Identifier);
115 }
116
117 shape(QueryKind::NaturalLanguage)
118}
119
120pub fn extract_tokens(query: &str, shape: &QueryShape) -> Vec<String> {
121 match shape.kind {
122 QueryKind::NaturalLanguage | QueryKind::Regex => Vec::new(),
123 QueryKind::Path => extract_path_tokens(query),
124 QueryKind::ErrorCode => extract_error_code_tokens(query),
125 QueryKind::Identifier => extract_identifier_tokens(query, false),
126 QueryKind::Mixed => extract_identifier_tokens(query, true),
127 }
128}
129
/// Returns the whitespace-separated words of `query` that are at least three
/// characters long, in their original order (duplicates are kept).
pub fn extract_short_nl_lexical_tokens(query: &str) -> Vec<String> {
    let mut kept = Vec::new();
    for word in query.split_whitespace() {
        // A third character exists exactly when the word has three or more chars.
        if word.chars().nth(2).is_some() {
            kept.push(word.to_owned());
        }
    }
    kept
}
142
143pub fn pre_tier_exempt(query: &str) -> Option<&'static str> {
144 if let Some(kind) = check_url_exemption(query) {
145 return Some(kind);
146 }
147 check_path_exemption(query)
148}
149
150pub fn looks_like_regex(query: &str) -> bool {
151 crate::pattern_compile::detect_unsupported_features(query).is_some()
152 || tier_a_regex_signal(query)
153 || tier_b_character_class(query)
154 || tier_c_adjacent_meta(query)
155}
156
157fn check_url_exemption(query: &str) -> Option<&'static str> {
158 let parsed = url::Url::parse(query).ok()?;
159 if !matches!(parsed.scheme(), "http" | "https" | "file" | "ftp" | "ssh") {
160 return None;
161 }
162 if has_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
163 return None;
164 }
165 Some("url")
166}
167
168fn check_path_exemption(query: &str) -> Option<&'static str> {
169 let kind = if WINDOWS_ABS_PATH_RE.is_match(query) {
170 "windows_abs"
171 } else if WINDOWS_REL_PATH_RE.is_match(query) {
172 "windows_rel"
173 } else if POSIX_ABS_PATH_RE.is_match(query) {
174 "posix_abs"
175 } else if POSIX_REL_PATH_RE.is_match(query) {
176 "posix_rel"
177 } else if UNC_PATH_RE.is_match(query) {
178 "unc"
179 } else if FILENAME_EXEMPTION_RE.is_match(query) {
180 "filename"
181 } else {
182 return None;
183 };
184 if has_path_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
185 return None;
186 }
187 Some(kind)
188}
189
190fn contains_error_code(query: &str, word_count: usize) -> bool {
191 HEX_CODE_RE.is_match(query)
192 || ERROR_PREFIX_RE.is_match(query)
193 || NUMERIC_ERROR_RE.is_match(query)
194 || TYPESCRIPT_ERROR_RE.is_match(query)
195 || has_http_status(query, word_count)
196}
197
198fn has_http_status(query: &str, word_count: usize) -> bool {
199 HTTP_STATUS_RE.is_match(query)
200 && (word_count <= 3 || query.to_ascii_lowercase().contains("http"))
201}
202
203fn is_two_word_lowercase_concept(words: &[&str]) -> bool {
204 words.len() == 2
205 && words
206 .iter()
207 .all(|word| is_dictionary_style_lowercase_word(word))
208}
209
/// True when `word` is at least three bytes long and consists solely of
/// ASCII lowercase letters.
fn is_dictionary_style_lowercase_word(word: &str) -> bool {
    if word.len() < 3 {
        return false;
    }
    word.bytes().all(|byte| matches!(byte, b'a'..=b'z'))
}
213
214fn has_regex_meta_sequences(query: &str) -> bool {
215 query.contains(".+")
216 || query.contains(".*")
217 || query.contains(".?")
218 || query.contains(r"\n")
219 || query.contains(r"\t")
220 || query.contains(r"\r")
221 || query.contains(r"\b")
222 || query.contains(r"\B")
223 || query.contains(r"\w")
224 || query.contains(r"\W")
225 || query.contains(r"\d")
226 || query.contains(r"\D")
227 || query.contains(r"\s")
228 || query.contains(r"\S")
229 || query.contains(r"\p{")
230 || query.contains(r"\x")
231 || query.contains(r"\u{")
232 || has_escaped_regex_metachar(query)
233}
234
235fn has_path_regex_meta_sequences(query: &str) -> bool {
236 query.contains(".+")
237 || query.contains(".*")
238 || query.contains(".?")
239 || query.contains(r"\p{")
240 || query.contains(r"\x")
241 || query.contains(r"\u{")
242 || has_path_context_regex_escape(query)
243 || has_escaped_regex_metachar(query)
244}
245
246fn has_path_context_regex_escape(query: &str) -> bool {
247 let chars = query.char_indices().collect::<Vec<_>>();
248 for index in 0..chars.len().saturating_sub(1) {
249 if chars[index].1 != '\\' {
250 continue;
251 }
252 let escaped = chars[index + 1].1;
253 if matches!(escaped, 'b' | 'B' | 'w' | 'W' | 'd' | 'D' | 's' | 'S')
254 && path_escape_looks_like_regex(&chars, index + 1)
255 {
256 return true;
257 }
258 }
259 false
260}
261
/// Decides whether the escape letter at `escaped_index` reads as regex syntax.
///
/// It does when it is the last character, or when the next character is a
/// quantifier, group/class opener, alternation, anchor, backslash or slash.
fn path_escape_looks_like_regex(chars: &[(usize, char)], escaped_index: usize) -> bool {
    match chars.get(escaped_index + 1) {
        None => true,
        Some(&(_, follower)) => "*+?{([|^$\\/".contains(follower),
    }
}
272
273fn has_escaped_regex_metachar(query: &str) -> bool {
274 let mut escaped = false;
275 for ch in query.chars() {
276 if escaped {
277 if is_escaped_metachar(ch) {
278 return true;
279 }
280 escaped = false;
281 continue;
282 }
283 escaped = ch == '\\';
284 }
285 false
286}
287
/// True when `query` contains any of `* [ ] ( ) | { }`.
fn has_obvious_regex_chars(query: &str) -> bool {
    query
        .chars()
        .any(|ch| matches!(ch, '*' | '[' | ']' | '(' | ')' | '|' | '{' | '}'))
}
298
299fn tier_a_regex_signal(query: &str) -> bool {
300 query.contains("(?:")
301 || NAMED_CAPTURE_RE.is_match(query)
302 || ["(?i)", "(?m)", "(?s)", "(?x)"]
303 .iter()
304 .any(|signal| query.contains(signal))
305 || [
306 r"\b", r"\B", r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\p{", r"\x", r"\u{", r"\n",
307 r"\t", r"\r",
308 ]
309 .iter()
310 .any(|signal| query.contains(signal))
311 || has_brace_quantifier(query)
312 || has_anchored_identifier(query)
313 || has_contextual_escaped_metachar(query)
314}
315
316fn has_brace_quantifier(query: &str) -> bool {
317 for matched in BRACE_QUANTIFIER_RE.find_iter(query) {
318 if matched.start() > 0
319 && query[..matched.start()]
320 .chars()
321 .last()
322 .is_some_and(|ch| !ch.is_whitespace())
323 {
324 return true;
325 }
326 }
327 false
328}
329
330fn has_anchored_identifier(query: &str) -> bool {
331 let trimmed = query.trim();
332 if let Some(rest) = trimmed.strip_prefix('^') {
333 if leading_identifier_len(rest) >= 3 {
334 return true;
335 }
336 }
337 if let Some(rest) = trimmed.strip_suffix('$') {
338 if trailing_identifier_len(rest) >= 3 {
339 return true;
340 }
341 }
342 false
343}
344
/// Number of characters in the run of ASCII alphanumerics and underscores at
/// the start of `text`.
fn leading_identifier_len(text: &str) -> usize {
    let rest = text.trim_start_matches(|ch: char| ch.is_ascii_alphanumeric() || ch == '_');
    // Every trimmed character is ASCII, so the byte difference is the character count.
    text.len() - rest.len()
}
350
/// Number of characters in the run of ASCII alphanumerics and underscores at
/// the end of `text`.
fn trailing_identifier_len(text: &str) -> usize {
    let rest = text.trim_end_matches(|ch: char| ch.is_ascii_alphanumeric() || ch == '_');
    // Every trimmed character is ASCII, so the byte difference is the character count.
    text.len() - rest.len()
}
357
358fn has_contextual_escaped_metachar(query: &str) -> bool {
359 let chars: Vec<char> = query.chars().collect();
360 let mut index = 0usize;
361 while index + 1 < chars.len() {
362 if chars[index] == '\\' && is_escaped_metachar(chars[index + 1]) {
363 let literal_after = chars[index + 2..]
364 .iter()
365 .filter(|ch| ch.is_ascii_alphanumeric() || **ch == '_')
366 .count();
367 if literal_after >= 2 {
368 return true;
369 }
370 index += 2;
371 } else {
372 index += 1;
373 }
374 }
375 false
376}
377
/// True for the regex metacharacters that are commonly backslash-escaped:
/// `. * + ? ( ) [ ] { } | ^ $`.
fn is_escaped_metachar(ch: char) -> bool {
    ".*+?()[]{}|^$".contains(ch)
}
384
385fn tier_b_character_class(query: &str) -> bool {
386 for content in bracket_contents(query) {
387 if content.starts_with('^')
388 || CHAR_RANGE_RE.is_match(&content)
389 || [r"\w", r"\d", r"\s", r"\W", r"\D", r"\S"]
390 .iter()
391 .any(|signal| content.contains(signal))
392 || multi_char_non_identifier_class(&content)
393 {
394 return true;
395 }
396 }
397 false
398}
399
/// Collects the text between each unescaped `[` and the next unescaped `]`.
///
/// A backslash hides the character after it. A `[` seen while a span is
/// already open is kept as part of that span, and a `]` with no open span is
/// ignored. An unterminated span yields nothing.
fn bracket_contents(query: &str) -> Vec<String> {
    let mut spans = Vec::new();
    let mut open_at: Option<usize> = None;
    let mut cursor = query.char_indices();
    while let Some((offset, ch)) = cursor.next() {
        match (ch, open_at) {
            ('\\', _) => {
                // Skip whatever the backslash escapes.
                cursor.next();
            }
            ('[', None) => open_at = Some(offset + '['.len_utf8()),
            (']', Some(begin)) => {
                spans.push(query[begin..offset].to_owned());
                open_at = None;
            }
            _ => {}
        }
    }
    spans
}
425
/// True when `content` has at least two characters and none of them is an
/// ASCII alphanumeric, underscore, double quote, single quote or semicolon.
fn multi_char_non_identifier_class(content: &str) -> bool {
    let has_two_chars = content.chars().nth(1).is_some();
    has_two_chars
        && content
            .chars()
            .all(|ch| !(ch.is_ascii_alphanumeric() || matches!(ch, '_' | '"' | '\'' | ';')))
}
433
434fn tier_c_adjacent_meta(query: &str) -> bool {
435 has_dot_quantifier(query)
436 || has_literal_atom_quantifier(query)
437 || has_regex_pipe(query)
438 || escaped_paren_count(query) >= 2
439}
440
/// True when `query` contains `.*`, `.+` or `.?` and its trimmed form is
/// longer than that two-character pair alone.
fn has_dot_quantifier(query: &str) -> bool {
    if query.trim().len() <= 2 {
        return false;
    }
    // Both halves of each pair are ASCII, so scanning byte pairs finds exactly
    // the same occurrences as a substring search.
    query
        .as_bytes()
        .windows(2)
        .any(|pair| pair[0] == b'.' && matches!(pair[1], b'*' | b'+' | b'?'))
}
446
447fn has_literal_atom_quantifier(query: &str) -> bool {
448 let chars = query.char_indices().collect::<Vec<_>>();
449 for (index, (byte_index, ch)) in chars.iter().copied().enumerate() {
450 if !is_bare_quantifier(ch) || is_escaped_at(query, byte_index) {
451 continue;
452 }
453 if chars
454 .get(index + 1)
455 .is_some_and(|(_, next)| is_bare_quantifier(*next))
456 {
457 continue;
458 }
459 if ch == '?'
460 && (sentence_final_question_mark_in_phrase(query, byte_index)
461 || question_mark_is_code_shape(&chars, index))
462 {
463 continue;
464 }
465 if previous_is_literal_atom(&chars, index) {
466 return true;
467 }
468 }
469 false
470}
471
/// True when the `?` at `byte_index` is followed only by whitespace and is
/// preceded by more than one word, i.e. it ends a phrase.
fn sentence_final_question_mark_in_phrase(query: &str, byte_index: usize) -> bool {
    let (before, from_mark) = query.split_at(byte_index);
    let after = &from_mark['?'.len_utf8()..];
    after.trim().is_empty() && before.split_whitespace().nth(1).is_some()
}
476
477fn question_mark_is_code_shape(chars: &[(usize, char)], question_index: usize) -> bool {
478 question_mark_is_optional_chain(chars, question_index)
479 || question_mark_after_empty_call(chars, question_index)
480 || question_mark_after_index_expression(chars, question_index)
481 || question_mark_is_typescript_optional(chars, question_index)
482}
483
484fn question_mark_is_optional_chain(chars: &[(usize, char)], question_index: usize) -> bool {
485 chars
486 .get(question_index + 1)
487 .is_some_and(|(_, next)| *next == '.')
488 && question_index
489 .checked_sub(1)
490 .and_then(|previous_index| chars.get(previous_index))
491 .is_some_and(|(_, previous)| is_code_expression_tail(*previous))
492}
493
494fn question_mark_after_empty_call(chars: &[(usize, char)], question_index: usize) -> bool {
495 let Some(call_open_index) = question_index.checked_sub(2) else {
496 return false;
497 };
498 chars
499 .get(question_index - 1)
500 .is_some_and(|(_, previous)| *previous == ')')
501 && chars
502 .get(call_open_index)
503 .is_some_and(|(_, open)| *open == '(')
504 && call_open_index
505 .checked_sub(1)
506 .and_then(|callee_index| chars.get(callee_index))
507 .is_some_and(|(_, callee_tail)| is_code_expression_tail(*callee_tail))
508}
509
510fn question_mark_after_index_expression(chars: &[(usize, char)], question_index: usize) -> bool {
511 if chars
512 .get(question_index.checked_sub(1).unwrap_or(usize::MAX))
513 .is_none_or(|(_, previous)| *previous != ']')
514 {
515 return false;
516 }
517
518 let mut depth = 0usize;
519 for index in (0..question_index).rev() {
520 match chars[index].1 {
521 ']' => depth += 1,
522 '[' => {
523 depth = depth.saturating_sub(1);
524 if depth == 0 {
525 return index
526 .checked_sub(1)
527 .and_then(|target_index| chars.get(target_index))
528 .is_some_and(|(_, target_tail)| is_code_expression_tail(*target_tail));
529 }
530 }
531 _ => {}
532 }
533 }
534 false
535}
536
537fn question_mark_is_typescript_optional(chars: &[(usize, char)], question_index: usize) -> bool {
538 let previous_is_identifier = question_index
539 .checked_sub(1)
540 .and_then(|previous_index| chars.get(previous_index))
541 .is_some_and(|(_, previous)| is_identifier_tail(*previous));
542 if !previous_is_identifier {
543 return false;
544 }
545 if chars
546 .get(question_index + 1)
547 .is_none_or(|(_, next)| *next != ':')
548 {
549 return false;
550 }
551
552 chars
553 .get(question_index + 2)
554 .is_none_or(|(_, after_colon)| {
555 after_colon.is_whitespace()
556 || after_colon.is_ascii_alphabetic()
557 || matches!(*after_colon, '_' | '{' | '[' | '(' | '"' | '\'')
558 })
559}
560
561fn is_code_expression_tail(ch: char) -> bool {
562 is_identifier_tail(ch) || matches!(ch, ')' | ']')
563}
564
/// True when `ch` is an ASCII letter or digit, `_`, or `$`.
fn is_identifier_tail(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '$')
}
568
/// True when the character just before `quantifier_index` is something a
/// quantifier can apply to: an ASCII alphanumeric, `_`, `)` or `]`.
fn previous_is_literal_atom(chars: &[(usize, char)], quantifier_index: usize) -> bool {
    quantifier_index
        .checked_sub(1)
        .and_then(|before| chars.get(before))
        .is_some_and(|&(_, atom)| atom.is_ascii_alphanumeric() || matches!(atom, '_' | ')' | ']'))
}
579
/// True for the single-character regex quantifiers `*`, `+` and `?`.
fn is_bare_quantifier(ch: char) -> bool {
    "*+?".contains(ch)
}
583
/// True when the character at `byte_index` is escaped, meaning an odd number
/// of consecutive backslashes sits directly before it.
fn is_escaped_at(query: &str, byte_index: usize) -> bool {
    let prefix = &query[..byte_index];
    // A backslash is one byte, so the length lost by trimming is the run length.
    let preceding_backslashes = prefix.len() - prefix.trim_end_matches('\\').len();
    preceding_backslashes % 2 == 1
}
592
593fn has_regex_pipe(query: &str) -> bool {
594 for (index, ch) in query.char_indices() {
595 if ch != '|' {
596 continue;
597 }
598 let left = trailing_identifier_len(&query[..index]);
599 let right = leading_identifier_len(&query[index + ch.len_utf8()..]);
600 if left >= 3 && right >= 3 {
601 return true;
602 }
603 }
604 false
605}
606
/// Counts the parentheses in `query` that are escaped by a backslash.
///
/// A backslash always consumes the character after it, so in `\\(` the
/// parenthesis is not counted.
fn escaped_paren_count(query: &str) -> usize {
    let mut total = 0usize;
    let mut remaining = query.chars();
    while let Some(ch) = remaining.next() {
        if ch != '\\' {
            continue;
        }
        if let Some('(' | ')') = remaining.next() {
            total += 1;
        }
    }
    total
}
622
623fn extract_path_tokens(query: &str) -> Vec<String> {
624 let mut tokens = Vec::new();
625 for segment in query
626 .split(['/', '\\'])
627 .filter(|segment| !segment.is_empty())
628 {
629 if segment.contains('.') {
630 if let Some(stem) = segment.rsplit_once('.').map(|(stem, _)| stem) {
631 push_unique(&mut tokens, stem);
632 }
633 }
634 push_unique(&mut tokens, segment);
635 }
636 tokens
637}
638
639fn extract_error_code_tokens(query: &str) -> Vec<String> {
640 let mut tokens = Vec::new();
641 for regex in [
642 &*HEX_CODE_RE,
643 &*ERROR_PREFIX_RE,
644 &*NUMERIC_ERROR_RE,
645 &*TYPESCRIPT_ERROR_RE,
646 &*HTTP_STATUS_RE,
647 ] {
648 for mat in regex.find_iter(query) {
649 push_unique(&mut tokens, mat.as_str());
650 }
651 }
652 if tokens.is_empty() && !query.trim().is_empty() {
653 push_unique(&mut tokens, query.trim());
654 }
655 tokens
656}
657
658fn extract_identifier_tokens(query: &str, require_code_shape: bool) -> Vec<String> {
659 let mut tokens = Vec::new();
660 for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
661 let token = mat.as_str();
662 if require_code_shape && !is_code_identifier_token(token) {
663 continue;
664 }
665 push_unique(&mut tokens, token);
666 }
667 tokens
668}
669
670fn is_code_identifier_token(token: &str) -> bool {
671 CAMEL_CASE_RE.is_match(token)
672 || SNAKE_CASE_RE.is_match(token)
673 || PASCAL_CASE_RE.is_match(token)
674 || ACRONYM_PASCAL_RE.is_match(token)
675 || DOT_PATH_RE.is_match(token)
676 || ERROR_PREFIX_RE.is_match(token)
677 || NUMERIC_ERROR_RE.is_match(token)
678 || TYPESCRIPT_ERROR_RE.is_match(token)
679}
680
/// Appends `token` to `tokens` unless it is empty or already present.
fn push_unique(tokens: &mut Vec<String>, token: &str) {
    if token.is_empty() {
        return;
    }
    let already_present = tokens.iter().any(|existing| existing.as_str() == token);
    if !already_present {
        tokens.push(token.to_owned());
    }
}
686
687fn shape(kind: QueryKind) -> QueryShape {
688 QueryShape {
689 kind,
690 weights: weights_for(kind),
691 }
692}
693
694fn weights_for(kind: QueryKind) -> ShapeWeights {
695 match kind {
696 QueryKind::Identifier => ShapeWeights {
697 semantic: 0.2,
698 lexical: 0.8,
699 should_use_lexical: true,
700 },
701 QueryKind::Path | QueryKind::ErrorCode => ShapeWeights {
702 semantic: 0.1,
703 lexical: 0.9,
704 should_use_lexical: true,
705 },
706 QueryKind::Regex => ShapeWeights {
707 semantic: 0.0,
708 lexical: 1.0,
709 should_use_lexical: false,
710 },
711 QueryKind::NaturalLanguage => ShapeWeights {
712 semantic: 0.6,
713 lexical: 0.4,
714 should_use_lexical: false,
715 },
716 QueryKind::Mixed => ShapeWeights {
717 semantic: 0.4,
718 lexical: 0.6,
719 should_use_lexical: true,
720 },
721 }
722}
723
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the kind `classify` assigns to `query`.
    fn kind(query: &str) -> QueryKind {
        classify(query).kind
    }

    /// Asserts that every query classifies as `expected`, naming the offender on failure.
    #[track_caller]
    fn assert_each_is(expected: QueryKind, queries: &[&str]) {
        for query in queries {
            assert_eq!(kind(query), expected, "{query}");
        }
    }

    /// Asserts that no query classifies as a regex, naming the offender on failure.
    #[track_caller]
    fn assert_none_is_regex(queries: &[&str]) {
        for query in queries {
            assert_ne!(kind(query), QueryKind::Regex, "{query}");
        }
    }

    #[test]
    fn url_exemptions_allow_common_literal_url_punctuation() {
        let urls = [
            "https://api.io/path",
            "https://api.io/foo?q=test",
            "https://api.io/foo+bar",
            "https://api.io/foo@bar",
            "https://api.io/foo#anchor",
        ];
        for query in urls {
            assert_eq!(pre_tier_exempt(query), Some("url"), "{query}");
        }
        assert_none_is_regex(&urls);
    }

    #[test]
    fn url_exemptions_reject_regex_sequences() {
        assert_each_is(
            QueryKind::Regex,
            &[
                "https://.*",
                "https://api.io/.+",
                "file://[^ ]+",
                "file:///tmp/.+",
                r"https://api.io/users/\w+",
            ],
        );
    }

    #[test]
    fn path_and_filename_exemptions_allow_literal_punctuation() {
        let cases = [
            (r"C:\new\test", "windows_abs"),
            (r"src\bin\main.rs", "windows_rel"),
            (r"src\tab\main.ts", "windows_rel"),
            (r"packages\opencode-plugin\src", "windows_rel"),
            ("/usr/local/bin", "posix_abs"),
            ("/Users/John Doe/Documents", "posix_abs"),
            ("/home/user/.gitignore", "posix_abs"),
            ("v1/release/notes.md", "posix_rel"),
            ("/home/user/jeff's-folder", "posix_abs"),
            ("C++/parser/main.cpp", "posix_rel"),
            ("foo+bar/baz.ts", "posix_rel"),
            ("is_valid?.ts", "filename"),
            ("Cargo.lock", "filename"),
            ("tsconfig.json", "filename"),
        ];
        for (query, expected) in cases {
            assert_eq!(pre_tier_exempt(query), Some(expected), "{query}");
            assert_eq!(kind(query), QueryKind::Path, "{query}");
        }
        assert_eq!(pre_tier_exempt("foo?"), None);
    }

    #[test]
    fn path_exemptions_reject_regex_sequences() {
        assert_each_is(
            QueryKind::Regex,
            &[
                "src/.*",
                "src/.+",
                r"C:\bin\foo*.exe",
                r"C:\Users\\w+",
                r"src\w+\main.ts",
            ],
        );
    }

    #[test]
    fn tier_a_and_c_regex_signals_route_to_regex() {
        assert_each_is(
            QueryKind::Regex,
            &[
                "^export",
                "foo$",
                "^main$",
                r"foo\.bar",
                r"\(method\)",
                r"\bTODO\b",
                ".*foo",
                "foo|bar",
                "(?:foo)",
                "(?P<n>foo)",
                "(?i)Todo",
                r"\p{Lu}",
                r"\xFF",
                r"\u{1F600}",
                "a{3}",
                r"\n",
                r"\t",
                r"\r",
                r"\tindent",
            ],
        );
    }

    #[test]
    fn character_classes_route_only_when_they_look_like_classes() {
        assert_each_is(QueryKind::Regex, &["[a-z]+", "[^abc]", r"[\w]+"]);
        assert_none_is_regex(&[
            "arr[0]",
            "obj[key]",
            "config[\"key\"]",
            "#[derive]",
            "Vec<[u8; 32]>",
        ]);
    }

    #[test]
    fn unsupported_regex_syntax_still_routes_to_regex_for_compile_error() {
        assert_each_is(
            QueryKind::Regex,
            &[
                "(?=foo)",
                "(?!foo)",
                "(?<=foo)",
                "(?<!foo)",
                "(?P=name)",
                r"\1",
                "foo*+",
                "(?>foo)",
            ],
        );
    }

    #[test]
    fn two_word_lowercase_concepts_route_to_natural_language() {
        assert_each_is(
            QueryKind::NaturalLanguage,
            &["retry logic", "auth flow", "cache invalidation"],
        );
    }

    #[test]
    fn identifierish_short_queries_stay_identifier() {
        assert_each_is(
            QueryKind::Identifier,
            &["useState hook", "parseConfig", "parse_config option"],
        );
    }

    #[test]
    fn question_mark_code_shapes_do_not_route_to_regex() {
        assert_none_is_regex(&["foo()?", "optional?.length", "user?.name", "arr[0]?"]);
    }

    #[test]
    fn question_mark_regex_quantifiers_still_route_to_regex() {
        assert_each_is(QueryKind::Regex, &["colou?r", "https?"]);
    }

    #[test]
    fn weak_regex_like_punctuation_does_not_route_to_regex() {
        assert_none_is_regex(&[
            "^id",
            "id$",
            "^",
            "$",
            "$HOME",
            r"\.",
            "array.length",
            "foo()",
            "map.get(key)",
            "a|b",
        ]);
    }
}