1use regex::Regex;
2use std::sync::LazyLock;
3
4static CAMEL_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z][A-Z]").unwrap());
5static SNAKE_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z]_[a-z]").unwrap());
6static PASCAL_CASE_RE: LazyLock<Regex> =
7 LazyLock::new(|| Regex::new(r"^[A-Z][a-z]+[A-Z]").unwrap());
8static ACRONYM_PASCAL_RE: LazyLock<Regex> =
9 LazyLock::new(|| Regex::new(r"\b[A-Z]{2,}[A-Z][a-z]").unwrap());
10static DOT_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-zA-Z]\.[a-zA-Z]").unwrap());
11static FILE_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[/\\].*\.\w{1,5}$").unwrap());
12static HEX_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"0x[A-Fa-f0-9]+").unwrap());
13static ERROR_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bERR_\w+").unwrap());
14static NUMERIC_ERROR_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bE\d{4,}").unwrap());
15static TYPESCRIPT_ERROR_RE: LazyLock<Regex> =
16 LazyLock::new(|| Regex::new(r"\bTS\d{4,}\b").unwrap());
17static HTTP_STATUS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[1-5]\d{2}\b").unwrap());
18static IDENTIFIER_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
19 Regex::new(r"\b[A-Za-z_$][A-Za-z0-9_$]*(?:\.[A-Za-z_$][A-Za-z0-9_$]*)*\b").unwrap()
20});
21
22static WINDOWS_ABS_PATH_RE: LazyLock<Regex> =
23 LazyLock::new(|| Regex::new(r"^[A-Za-z]:[\\/][A-Za-z0-9_.\-+?\\/' ]+$").unwrap());
24static WINDOWS_REL_PATH_RE: LazyLock<Regex> =
25 LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(\\[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
26static POSIX_ABS_PATH_RE: LazyLock<Regex> =
27 LazyLock::new(|| Regex::new(r"^/[A-Za-z0-9_.\-+?/' ]+$").unwrap());
28static POSIX_REL_PATH_RE: LazyLock<Regex> =
29 LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(/[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
30static UNC_PATH_RE: LazyLock<Regex> =
31 LazyLock::new(|| Regex::new(r"^\\\\[A-Za-z0-9_.\-+?\\']+$").unwrap());
32static FILENAME_EXEMPTION_RE: LazyLock<Regex> =
33 LazyLock::new(|| Regex::new(r"^[A-Za-z_][A-Za-z0-9_.\-+'? ]*\.[A-Za-z0-9]{1,8}$").unwrap());
34static BRACE_QUANTIFIER_RE: LazyLock<Regex> =
35 LazyLock::new(|| Regex::new(r"\{\d+(?:,\d*)?\}").unwrap());
36static NAMED_CAPTURE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(\?P<[^>]+>").unwrap());
37static CHAR_RANGE_RE: LazyLock<Regex> =
38 LazyLock::new(|| Regex::new(r"[A-Za-z0-9]-[A-Za-z0-9]").unwrap());
39
/// Lowercase words that count as a natural-language signal when they are the first word of a
/// query (see `classify`).
const QUESTION_WORDS: &[&str] = &[
    "how",
    "what",
    "where",
    "why",
    "when",
    "which",
    "who",
    "does",
];
43
/// Category that `classify` assigns to a query.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryKind {
    /// Contains a code-style identifier, or is at most two words with no natural-language signal.
    Identifier,
    /// Contains an error code or code identifier together with natural-language signals.
    Mixed,
    /// Contains an error code and no natural-language signals.
    ErrorCode,
    /// Looks like a URL, file-system path or file name.
    Path,
    /// Looks like a regular expression.
    Regex,
    /// Everything else, including the empty query.
    NaturalLanguage,
}
53
/// Per-kind weighting produced by `weights_for`.
///
/// NOTE(review): how consumers combine these values is not visible in this file.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ShapeWeights {
    /// Weight given to the semantic side.
    pub semantic: f32,
    /// Weight given to the lexical side.
    pub lexical: f32,
    /// Flag telling callers whether lexical tokens should be used for this kind.
    pub should_use_lexical: bool,
}
60
61#[derive(Debug, Clone, Copy, PartialEq)]
62pub struct QueryShape {
63 pub kind: QueryKind,
64 pub weights: ShapeWeights,
65}
66
67pub fn classify(query: &str) -> QueryShape {
68 let trimmed = query.trim();
69 if trimmed.is_empty() {
70 return shape(QueryKind::NaturalLanguage);
71 }
72
73 if pre_tier_exempt(trimmed).is_some() {
74 return shape(QueryKind::Path);
75 }
76
77 if looks_like_regex(trimmed) {
78 return shape(QueryKind::Regex);
79 }
80
81 let words: Vec<&str> = trimmed.split_whitespace().collect();
82 let word_count = words.len();
83 let first_word_lower = words[0].to_ascii_lowercase();
84
85 if FILE_PATH_RE.is_match(trimmed) {
86 return shape(QueryKind::Path);
87 }
88
89 let has_question_word = QUESTION_WORDS.contains(&first_word_lower.as_str());
90 let is_long_phrase = word_count > 2;
91 let is_two_word_concept = is_two_word_lowercase_concept(&words);
92 let has_natural_language_signals = has_question_word || is_long_phrase || is_two_word_concept;
93 let has_error_code = contains_error_code(trimmed, word_count);
94
95 if has_error_code && has_natural_language_signals {
96 return shape(QueryKind::Mixed);
97 }
98
99 if has_error_code {
100 return shape(QueryKind::ErrorCode);
101 }
102
103 let has_code_identifier = CAMEL_CASE_RE.is_match(trimmed)
104 || SNAKE_CASE_RE.is_match(trimmed)
105 || PASCAL_CASE_RE.is_match(trimmed)
106 || ACRONYM_PASCAL_RE.is_match(trimmed)
107 || DOT_PATH_RE.is_match(trimmed);
108
109 if has_code_identifier && has_natural_language_signals {
110 return shape(QueryKind::Mixed);
111 }
112
113 if has_code_identifier || (word_count <= 2 && !has_natural_language_signals) {
114 return shape(QueryKind::Identifier);
115 }
116
117 shape(QueryKind::NaturalLanguage)
118}
119
120pub fn extract_tokens(query: &str, shape: &QueryShape) -> Vec<String> {
121 match shape.kind {
122 QueryKind::NaturalLanguage | QueryKind::Regex => Vec::new(),
123 QueryKind::Path => extract_path_tokens(query),
124 QueryKind::ErrorCode => extract_error_code_tokens(query),
125 QueryKind::Identifier => extract_identifier_tokens(query, false),
126 QueryKind::Mixed => extract_identifier_tokens(query, true),
127 }
128}
129
/// Returns the whitespace-separated words of `query` that are at least three characters long,
/// in their original order.
pub fn extract_short_nl_lexical_tokens(query: &str) -> Vec<String> {
    let mut kept = Vec::new();
    for word in query.split_whitespace() {
        // `nth(2)` succeeds exactly when the word has three or more characters.
        if word.chars().nth(2).is_some() {
            kept.push(word.to_owned());
        }
    }
    kept
}
142
143pub(crate) fn is_type_concept_identifier_query(query: &str, shape: &QueryShape) -> bool {
144 if shape.kind != QueryKind::Identifier {
145 return false;
146 }
147
148 let mut identifier_token_count = 0;
149 let mut has_type_token = false;
150 let mut has_lowercase_concept_word = false;
151
152 for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
153 let token = mat.as_str();
154 identifier_token_count += 1;
155 has_type_token |= is_type_concept_type_token(token);
156 has_lowercase_concept_word |= is_dictionary_style_lowercase_word(token);
157 }
158
159 identifier_token_count >= 2 && has_type_token && has_lowercase_concept_word
160}
161
162fn is_type_concept_type_token(token: &str) -> bool {
163 token
164 .chars()
165 .next()
166 .is_some_and(|first| first.is_ascii_uppercase())
167 && (is_titlecase_word(token)
168 || PASCAL_CASE_RE.is_match(token)
169 || ACRONYM_PASCAL_RE.is_match(token))
170}
171
172pub(crate) fn extract_explicit_code_tokens(query: &str) -> Vec<String> {
177 let mut tokens = Vec::new();
178
179 push_quoted_code_tokens(query, &mut tokens);
180 let title_spans = push_adjacent_titlecase_tokens(query, 0, &mut tokens);
181 for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
182 if span_is_covered(&title_spans, mat.start(), mat.end()) {
183 continue;
184 }
185 let token = mat.as_str();
186 if is_code_identifier_token(token) {
187 push_unique(&mut tokens, token);
188 }
189 }
190
191 tokens
192}
193
194pub fn pre_tier_exempt(query: &str) -> Option<&'static str> {
195 if let Some(kind) = check_url_exemption(query) {
196 return Some(kind);
197 }
198 check_path_exemption(query)
199}
200
201pub fn looks_like_regex(query: &str) -> bool {
202 crate::pattern_compile::detect_unsupported_features(query).is_some()
203 || tier_a_regex_signal(query)
204 || tier_b_character_class(query)
205 || tier_c_adjacent_meta(query)
206}
207
208fn check_url_exemption(query: &str) -> Option<&'static str> {
209 let parsed = url::Url::parse(query).ok()?;
210 if !matches!(parsed.scheme(), "http" | "https" | "file" | "ftp" | "ssh") {
211 return None;
212 }
213 if has_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
214 return None;
215 }
216 Some("url")
217}
218
219fn check_path_exemption(query: &str) -> Option<&'static str> {
220 let kind = if WINDOWS_ABS_PATH_RE.is_match(query) {
221 "windows_abs"
222 } else if WINDOWS_REL_PATH_RE.is_match(query) {
223 "windows_rel"
224 } else if POSIX_ABS_PATH_RE.is_match(query) {
225 "posix_abs"
226 } else if POSIX_REL_PATH_RE.is_match(query) {
227 "posix_rel"
228 } else if UNC_PATH_RE.is_match(query) {
229 "unc"
230 } else if FILENAME_EXEMPTION_RE.is_match(query) {
231 "filename"
232 } else {
233 return None;
234 };
235 if has_path_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
236 return None;
237 }
238 Some(kind)
239}
240
241fn contains_error_code(query: &str, word_count: usize) -> bool {
242 HEX_CODE_RE.is_match(query)
243 || ERROR_PREFIX_RE.is_match(query)
244 || NUMERIC_ERROR_RE.is_match(query)
245 || TYPESCRIPT_ERROR_RE.is_match(query)
246 || has_http_status(query, word_count)
247}
248
249fn has_http_status(query: &str, word_count: usize) -> bool {
250 HTTP_STATUS_RE.is_match(query)
251 && (word_count <= 3 || query.to_ascii_lowercase().contains("http"))
252}
253
254fn is_two_word_lowercase_concept(words: &[&str]) -> bool {
255 words.len() == 2
256 && words
257 .iter()
258 .all(|word| is_dictionary_style_lowercase_word(word))
259}
260
/// Reports whether `word` is at least three bytes long and consists solely of ASCII lowercase
/// letters.
fn is_dictionary_style_lowercase_word(word: &str) -> bool {
    let bytes = word.as_bytes();
    bytes.len() >= 3 && bytes.iter().all(u8::is_ascii_lowercase)
}
264
265fn has_regex_meta_sequences(query: &str) -> bool {
266 query.contains(".+")
267 || query.contains(".*")
268 || query.contains(".?")
269 || query.contains(r"\n")
270 || query.contains(r"\t")
271 || query.contains(r"\r")
272 || query.contains(r"\b")
273 || query.contains(r"\B")
274 || query.contains(r"\w")
275 || query.contains(r"\W")
276 || query.contains(r"\d")
277 || query.contains(r"\D")
278 || query.contains(r"\s")
279 || query.contains(r"\S")
280 || query.contains(r"\p{")
281 || query.contains(r"\x")
282 || query.contains(r"\u{")
283 || has_escaped_regex_metachar(query)
284}
285
286fn has_path_regex_meta_sequences(query: &str) -> bool {
287 query.contains(".+")
288 || query.contains(".*")
289 || query.contains(".?")
290 || query.contains(r"\p{")
291 || query.contains(r"\x")
292 || query.contains(r"\u{")
293 || has_path_context_regex_escape(query)
294 || has_escaped_regex_metachar(query)
295}
296
297fn has_path_context_regex_escape(query: &str) -> bool {
298 let chars = query.char_indices().collect::<Vec<_>>();
299 for index in 0..chars.len().saturating_sub(1) {
300 if chars[index].1 != '\\' {
301 continue;
302 }
303 let escaped = chars[index + 1].1;
304 if matches!(escaped, 'b' | 'B' | 'w' | 'W' | 'd' | 'D' | 's' | 'S')
305 && path_escape_looks_like_regex(&chars, index + 1)
306 {
307 return true;
308 }
309 }
310 false
311}
312
/// Decides whether the escape letter at `escaped_index` reads as a regex escape.
///
/// It does when nothing follows it, or when the following character is a quantifier, an opening
/// group / class / brace, an alternation or anchor, or a path separator.
fn path_escape_looks_like_regex(chars: &[(usize, char)], escaped_index: usize) -> bool {
    match chars.get(escaped_index + 1) {
        None => true,
        Some(&(_, follower)) => matches!(
            follower,
            '*' | '+' | '?' | '{' | '(' | '[' | '|' | '^' | '$' | '\\' | '/'
        ),
    }
}
323
324fn has_escaped_regex_metachar(query: &str) -> bool {
325 let mut escaped = false;
326 for ch in query.chars() {
327 if escaped {
328 if is_escaped_metachar(ch) {
329 return true;
330 }
331 escaped = false;
332 continue;
333 }
334 escaped = ch == '\\';
335 }
336 false
337}
338
/// Reports whether `query` contains any of `*`, `[`, `]`, `(`, `)`, `|`, `{` or `}`.
fn has_obvious_regex_chars(query: &str) -> bool {
    query.contains(['*', '[', ']', '(', ')', '|', '{', '}'])
}
349
350fn tier_a_regex_signal(query: &str) -> bool {
351 query.contains("(?:")
352 || NAMED_CAPTURE_RE.is_match(query)
353 || ["(?i)", "(?m)", "(?s)", "(?x)"]
354 .iter()
355 .any(|signal| query.contains(signal))
356 || [
357 r"\b", r"\B", r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\p{", r"\x", r"\u{", r"\n",
358 r"\t", r"\r",
359 ]
360 .iter()
361 .any(|signal| query.contains(signal))
362 || has_brace_quantifier(query)
363 || has_anchored_identifier(query)
364 || has_contextual_escaped_metachar(query)
365}
366
367fn has_brace_quantifier(query: &str) -> bool {
368 for matched in BRACE_QUANTIFIER_RE.find_iter(query) {
369 if matched.start() > 0
370 && query[..matched.start()]
371 .chars()
372 .last()
373 .is_some_and(|ch| !ch.is_whitespace())
374 {
375 return true;
376 }
377 }
378 false
379}
380
381fn has_anchored_identifier(query: &str) -> bool {
382 let trimmed = query.trim();
383 if let Some(rest) = trimmed.strip_prefix('^') {
384 if leading_identifier_len(rest) >= 3 {
385 return true;
386 }
387 }
388 if let Some(rest) = trimmed.strip_suffix('$') {
389 if trailing_identifier_len(rest) >= 3 {
390 return true;
391 }
392 }
393 false
394}
395
/// Counts the ASCII alphanumeric / underscore characters at the start of `text`.
fn leading_identifier_len(text: &str) -> usize {
    let mut length = 0;
    for ch in text.chars() {
        if !(ch.is_ascii_alphanumeric() || ch == '_') {
            break;
        }
        length += 1;
    }
    length
}
401
/// Counts the ASCII alphanumeric / underscore characters at the end of `text`.
fn trailing_identifier_len(text: &str) -> usize {
    let mut length = 0;
    for ch in text.chars().rev() {
        if !(ch.is_ascii_alphanumeric() || ch == '_') {
            break;
        }
        length += 1;
    }
    length
}
408
409fn has_contextual_escaped_metachar(query: &str) -> bool {
410 let chars: Vec<char> = query.chars().collect();
411 let mut index = 0usize;
412 while index + 1 < chars.len() {
413 if chars[index] == '\\' && is_escaped_metachar(chars[index + 1]) {
414 let literal_after = chars[index + 2..]
415 .iter()
416 .filter(|ch| ch.is_ascii_alphanumeric() || **ch == '_')
417 .count();
418 if literal_after >= 2 {
419 return true;
420 }
421 index += 2;
422 } else {
423 index += 1;
424 }
425 }
426 false
427}
428
/// Reports whether `ch` is one of the regex metacharacters `. * + ? ( ) [ ] { } | ^ $`.
fn is_escaped_metachar(ch: char) -> bool {
    ".*+?()[]{}|^$".contains(ch)
}
435
436fn tier_b_character_class(query: &str) -> bool {
437 for content in bracket_contents(query) {
438 if content.starts_with('^')
439 || CHAR_RANGE_RE.is_match(&content)
440 || [r"\w", r"\d", r"\s", r"\W", r"\D", r"\S"]
441 .iter()
442 .any(|signal| content.contains(signal))
443 || multi_char_non_identifier_class(&content)
444 {
445 return true;
446 }
447 }
448 false
449}
450
/// Returns the text between each unescaped `[` and the next unescaped `]`.
///
/// A backslash hides the character after it from bracket matching. A `[` seen while a bracket is
/// already open is kept as part of that bracket's content, and a `]` without an open bracket is
/// ignored.
fn bracket_contents(query: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut open_at: Option<usize> = None;

    let mut chars = query.char_indices();
    while let Some((position, ch)) = chars.next() {
        match (ch, open_at) {
            ('\\', _) => {
                // Skip whatever the backslash escapes.
                chars.next();
            }
            ('[', None) => open_at = Some(position + ch.len_utf8()),
            (']', Some(content_start)) => {
                found.push(query[content_start..position].to_owned());
                open_at = None;
            }
            _ => {}
        }
    }
    found
}
476
/// Reports whether `content` has at least two characters and none of them is an ASCII
/// alphanumeric, `_`, a quote character or `;`.
fn multi_char_non_identifier_class(content: &str) -> bool {
    let is_identifier_like =
        |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '"' | '\'' | ';');

    // `nth(1)` succeeds exactly when there are two or more characters.
    content.chars().nth(1).is_some() && content.chars().all(|ch| !is_identifier_like(ch))
}
484
485fn tier_c_adjacent_meta(query: &str) -> bool {
486 has_dot_quantifier(query)
487 || has_literal_atom_quantifier(query)
488 || has_regex_pipe(query)
489 || escaped_paren_count(query) >= 2
490}
491
/// Reports whether `query` contains `.*`, `.+` or `.?` and its trimmed form is longer than that
/// sequence alone.
fn has_dot_quantifier(query: &str) -> bool {
    let trimmed_len = query.trim().len();
    for signal in [".*", ".+", ".?"] {
        if trimmed_len > signal.len() && query.contains(signal) {
            return true;
        }
    }
    false
}
497
498fn has_literal_atom_quantifier(query: &str) -> bool {
499 let chars = query.char_indices().collect::<Vec<_>>();
500 for (index, (byte_index, ch)) in chars.iter().copied().enumerate() {
501 if !is_bare_quantifier(ch) || is_escaped_at(query, byte_index) {
502 continue;
503 }
504 if chars
505 .get(index + 1)
506 .is_some_and(|(_, next)| is_bare_quantifier(*next))
507 {
508 continue;
509 }
510 if ch == '?'
511 && (sentence_final_question_mark_in_phrase(query, byte_index)
512 || question_mark_is_code_shape(&chars, index))
513 {
514 continue;
515 }
516 if previous_is_literal_atom(&chars, index) {
517 return true;
518 }
519 }
520 false
521}
522
/// Reports whether the `?` at `byte_index` is followed only by whitespace and preceded by at
/// least two whitespace-separated words.
fn sentence_final_question_mark_in_phrase(query: &str, byte_index: usize) -> bool {
    let (before, from_mark) = query.split_at(byte_index);
    let after = &from_mark['?'.len_utf8()..];

    after.trim().is_empty() && before.split_whitespace().nth(1).is_some()
}
527
528fn question_mark_is_code_shape(chars: &[(usize, char)], question_index: usize) -> bool {
529 question_mark_is_optional_chain(chars, question_index)
530 || question_mark_after_empty_call(chars, question_index)
531 || question_mark_after_index_expression(chars, question_index)
532 || question_mark_is_typescript_optional(chars, question_index)
533}
534
535fn question_mark_is_optional_chain(chars: &[(usize, char)], question_index: usize) -> bool {
536 chars
537 .get(question_index + 1)
538 .is_some_and(|(_, next)| *next == '.')
539 && question_index
540 .checked_sub(1)
541 .and_then(|previous_index| chars.get(previous_index))
542 .is_some_and(|(_, previous)| is_code_expression_tail(*previous))
543}
544
545fn question_mark_after_empty_call(chars: &[(usize, char)], question_index: usize) -> bool {
546 let Some(call_open_index) = question_index.checked_sub(2) else {
547 return false;
548 };
549 chars
550 .get(question_index - 1)
551 .is_some_and(|(_, previous)| *previous == ')')
552 && chars
553 .get(call_open_index)
554 .is_some_and(|(_, open)| *open == '(')
555 && call_open_index
556 .checked_sub(1)
557 .and_then(|callee_index| chars.get(callee_index))
558 .is_some_and(|(_, callee_tail)| is_code_expression_tail(*callee_tail))
559}
560
561fn question_mark_after_index_expression(chars: &[(usize, char)], question_index: usize) -> bool {
562 if chars
563 .get(question_index.checked_sub(1).unwrap_or(usize::MAX))
564 .is_none_or(|(_, previous)| *previous != ']')
565 {
566 return false;
567 }
568
569 let mut depth = 0usize;
570 for index in (0..question_index).rev() {
571 match chars[index].1 {
572 ']' => depth += 1,
573 '[' => {
574 depth = depth.saturating_sub(1);
575 if depth == 0 {
576 return index
577 .checked_sub(1)
578 .and_then(|target_index| chars.get(target_index))
579 .is_some_and(|(_, target_tail)| is_code_expression_tail(*target_tail));
580 }
581 }
582 _ => {}
583 }
584 }
585 false
586}
587
588fn question_mark_is_typescript_optional(chars: &[(usize, char)], question_index: usize) -> bool {
589 let previous_is_identifier = question_index
590 .checked_sub(1)
591 .and_then(|previous_index| chars.get(previous_index))
592 .is_some_and(|(_, previous)| is_identifier_tail(*previous));
593 if !previous_is_identifier {
594 return false;
595 }
596 if chars
597 .get(question_index + 1)
598 .is_none_or(|(_, next)| *next != ':')
599 {
600 return false;
601 }
602
603 chars
604 .get(question_index + 2)
605 .is_none_or(|(_, after_colon)| {
606 after_colon.is_whitespace()
607 || after_colon.is_ascii_alphabetic()
608 || matches!(*after_colon, '_' | '{' | '[' | '(' | '"' | '\'')
609 })
610}
611
612fn is_code_expression_tail(ch: char) -> bool {
613 is_identifier_tail(ch) || matches!(ch, ')' | ']')
614}
615
/// Reports whether `ch` is an ASCII alphanumeric, `_` or `$`.
fn is_identifier_tail(ch: char) -> bool {
    ch == '_' || ch == '$' || ch.is_ascii_alphanumeric()
}
619
/// Reports whether the character just before `quantifier_index` is something a quantifier can
/// apply to: an ASCII alphanumeric, `_`, `)` or `]`. Returns `false` when nothing precedes it.
fn previous_is_literal_atom(chars: &[(usize, char)], quantifier_index: usize) -> bool {
    match quantifier_index
        .checked_sub(1)
        .and_then(|atom_index| chars.get(atom_index))
    {
        Some(&(_, atom)) => atom.is_ascii_alphanumeric() || matches!(atom, '_' | ')' | ']'),
        None => false,
    }
}
630
/// Reports whether `ch` is one of the single-character quantifiers `*`, `+` or `?`.
fn is_bare_quantifier(ch: char) -> bool {
    "*+?".contains(ch)
}
634
/// Reports whether the character at `byte_index` is escaped, i.e. directly preceded by an odd
/// number of backslashes.
fn is_escaped_at(query: &str, byte_index: usize) -> bool {
    let before = &query[..byte_index];
    // A backslash is one byte, so the byte difference equals the number of trailing backslashes.
    let preceding_backslashes = before.len() - before.trim_end_matches('\\').len();
    preceding_backslashes % 2 == 1
}
643
644fn has_regex_pipe(query: &str) -> bool {
645 for (index, ch) in query.char_indices() {
646 if ch != '|' {
647 continue;
648 }
649 let left = trailing_identifier_len(&query[..index]);
650 let right = leading_identifier_len(&query[index + ch.len_utf8()..]);
651 if left >= 3 && right >= 3 {
652 return true;
653 }
654 }
655 false
656}
657
/// Counts the parentheses in `query` that are escaped with a backslash.
///
/// Each backslash consumes the character after it, so `\\(` contains no escaped parenthesis.
fn escaped_paren_count(query: &str) -> usize {
    let mut total = 0usize;
    let mut chars = query.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            continue;
        }
        if matches!(chars.next(), Some('(' | ')')) {
            total += 1;
        }
    }
    total
}
673
674fn push_quoted_code_tokens(query: &str, tokens: &mut Vec<String>) {
675 let mut open: Option<(char, usize)> = None;
676 let mut escaped = false;
677
678 for (index, ch) in query.char_indices() {
679 if escaped {
680 escaped = false;
681 continue;
682 }
683 if ch == '\\' {
684 escaped = true;
685 continue;
686 }
687
688 if let Some((delimiter, content_start)) = open {
689 if ch == delimiter {
690 let content = &query[content_start..index];
691 let title_spans = push_adjacent_titlecase_tokens(content, content_start, tokens);
692 for mat in IDENTIFIER_TOKEN_RE.find_iter(content) {
693 let start = content_start + mat.start();
694 let end = content_start + mat.end();
695 if !span_is_covered(&title_spans, start, end) {
696 push_unique(tokens, mat.as_str());
697 }
698 }
699 open = None;
700 }
701 } else if matches!(ch, '"' | '\'' | '`') {
702 open = Some((ch, index + ch.len_utf8()));
703 }
704 }
705}
706
707fn push_adjacent_titlecase_tokens(
708 text: &str,
709 base_offset: usize,
710 tokens: &mut Vec<String>,
711) -> Vec<(usize, usize)> {
712 let mut covered_spans = Vec::new();
713 let mut current: Vec<(usize, usize, &str)> = Vec::new();
714 let mut previous_end: Option<usize> = None;
715
716 for mat in IDENTIFIER_TOKEN_RE.find_iter(text) {
717 let token = mat.as_str();
718 let adjacent_to_current = previous_end.is_some_and(|end| {
719 !current.is_empty() && text[end..mat.start()].chars().all(|ch| ch.is_whitespace())
720 });
721 if is_titlecase_word(token) && (current.is_empty() || adjacent_to_current) {
722 current.push((base_offset + mat.start(), base_offset + mat.end(), token));
723 } else {
724 flush_titlecase_sequence(&mut current, &mut covered_spans, tokens);
725 if is_titlecase_word(token) {
726 current.push((base_offset + mat.start(), base_offset + mat.end(), token));
727 }
728 }
729 previous_end = Some(mat.end());
730 }
731
732 flush_titlecase_sequence(&mut current, &mut covered_spans, tokens);
733 covered_spans
734}
735
736fn flush_titlecase_sequence(
737 current: &mut Vec<(usize, usize, &str)>,
738 covered_spans: &mut Vec<(usize, usize)>,
739 tokens: &mut Vec<String>,
740) {
741 if current.len() >= 2 {
742 let qualified = current
743 .iter()
744 .map(|(_, _, token)| *token)
745 .collect::<Vec<_>>()
746 .join(".");
747 push_unique(tokens, &qualified);
748 covered_spans.extend(current.iter().map(|(start, end, _)| (*start, *end)));
749 }
750 current.clear();
751}
752
/// Reports whether the range `start..end` lies entirely inside at least one of `spans`.
fn span_is_covered(spans: &[(usize, usize)], start: usize, end: usize) -> bool {
    for &(span_start, span_end) in spans {
        if span_start <= start && end <= span_end {
            return true;
        }
    }
    false
}
758
/// Reports whether `token` is a capitalised word.
///
/// The token must start with an ASCII uppercase letter, every following character must be ASCII
/// alphanumeric, and at least one following character must be a letter. Tokens containing `.`,
/// `_` or `$` are therefore rejected, and both `Engine` and all-caps words such as `HTTP` are
/// accepted.
fn is_titlecase_word(token: &str) -> bool {
    let mut chars = token.chars();
    if !chars.next().is_some_and(|first| first.is_ascii_uppercase()) {
        return false;
    }

    let rest = chars.as_str();
    rest.chars().all(|ch| ch.is_ascii_alphanumeric())
        && rest.chars().any(|ch| ch.is_ascii_alphabetic())
}
787
788fn extract_path_tokens(query: &str) -> Vec<String> {
789 let mut tokens = Vec::new();
790 for segment in query
791 .split(['/', '\\'])
792 .filter(|segment| !segment.is_empty())
793 {
794 if segment.contains('.') {
795 if let Some(stem) = segment.rsplit_once('.').map(|(stem, _)| stem) {
796 push_unique(&mut tokens, stem);
797 }
798 }
799 push_unique(&mut tokens, segment);
800 }
801 tokens
802}
803
804fn extract_error_code_tokens(query: &str) -> Vec<String> {
805 let mut tokens = Vec::new();
806 for regex in [
807 &*HEX_CODE_RE,
808 &*ERROR_PREFIX_RE,
809 &*NUMERIC_ERROR_RE,
810 &*TYPESCRIPT_ERROR_RE,
811 &*HTTP_STATUS_RE,
812 ] {
813 for mat in regex.find_iter(query) {
814 push_unique(&mut tokens, mat.as_str());
815 }
816 }
817 if tokens.is_empty() && !query.trim().is_empty() {
818 push_unique(&mut tokens, query.trim());
819 }
820 tokens
821}
822
823fn extract_identifier_tokens(query: &str, require_code_shape: bool) -> Vec<String> {
824 let mut tokens = Vec::new();
825 for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
826 let token = mat.as_str();
827 if require_code_shape && !is_code_identifier_token(token) {
828 continue;
829 }
830 push_unique(&mut tokens, token);
831 }
832 tokens
833}
834
835fn is_code_identifier_token(token: &str) -> bool {
836 CAMEL_CASE_RE.is_match(token)
837 || SNAKE_CASE_RE.is_match(token)
838 || PASCAL_CASE_RE.is_match(token)
839 || ACRONYM_PASCAL_RE.is_match(token)
840 || DOT_PATH_RE.is_match(token)
841 || ERROR_PREFIX_RE.is_match(token)
842 || NUMERIC_ERROR_RE.is_match(token)
843 || TYPESCRIPT_ERROR_RE.is_match(token)
844}
845
/// Appends `token` to `tokens` unless it is empty or already present.
fn push_unique(tokens: &mut Vec<String>, token: &str) {
    if token.is_empty() {
        return;
    }

    let already_present = tokens.iter().any(|existing| existing.as_str() == token);
    if !already_present {
        tokens.push(token.to_owned());
    }
}
851
852fn shape(kind: QueryKind) -> QueryShape {
853 QueryShape {
854 kind,
855 weights: weights_for(kind),
856 }
857}
858
859fn weights_for(kind: QueryKind) -> ShapeWeights {
860 match kind {
861 QueryKind::Identifier => ShapeWeights {
862 semantic: 0.2,
863 lexical: 0.8,
864 should_use_lexical: true,
865 },
866 QueryKind::Path | QueryKind::ErrorCode => ShapeWeights {
867 semantic: 0.1,
868 lexical: 0.9,
869 should_use_lexical: true,
870 },
871 QueryKind::Regex => ShapeWeights {
872 semantic: 0.0,
873 lexical: 1.0,
874 should_use_lexical: false,
875 },
876 QueryKind::NaturalLanguage => ShapeWeights {
877 semantic: 0.6,
878 lexical: 0.4,
879 should_use_lexical: false,
880 },
881 QueryKind::Mixed => ShapeWeights {
882 semantic: 0.4,
883 lexical: 0.6,
884 should_use_lexical: true,
885 },
886 }
887}
888
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies `query` and returns only its kind.
    fn kind(query: &str) -> QueryKind {
        classify(query).kind
    }

    /// Asserts that every query classifies as `expected`.
    #[track_caller]
    fn assert_all_kind(queries: &[&str], expected: QueryKind) {
        for query in queries {
            assert_eq!(kind(query), expected, "{query}");
        }
    }

    /// Asserts that none of the queries classifies as a regex.
    #[track_caller]
    fn assert_none_regex(queries: &[&str]) {
        for query in queries {
            assert_ne!(kind(query), QueryKind::Regex, "{query}");
        }
    }

    #[test]
    fn url_exemptions_allow_common_literal_url_punctuation() {
        let urls = [
            "https://api.io/path",
            "https://api.io/foo?q=test",
            "https://api.io/foo+bar",
            "https://api.io/foo@bar",
            "https://api.io/foo#anchor",
        ];
        for query in urls {
            assert_eq!(pre_tier_exempt(query), Some("url"), "{query}");
        }
        assert_none_regex(&urls);
    }

    #[test]
    fn url_exemptions_reject_regex_sequences() {
        assert_all_kind(
            &[
                "https://.*",
                "https://api.io/.+",
                "file://[^ ]+",
                "file:///tmp/.+",
                r"https://api.io/users/\w+",
            ],
            QueryKind::Regex,
        );
    }

    #[test]
    fn path_and_filename_exemptions_allow_literal_punctuation() {
        let cases = [
            (r"C:\new\test", "windows_abs"),
            (r"src\bin\main.rs", "windows_rel"),
            (r"src\tab\main.ts", "windows_rel"),
            (r"packages\opencode-plugin\src", "windows_rel"),
            ("/usr/local/bin", "posix_abs"),
            ("/Users/John Doe/Documents", "posix_abs"),
            ("/home/user/.gitignore", "posix_abs"),
            ("v1/release/notes.md", "posix_rel"),
            ("/home/user/jeff's-folder", "posix_abs"),
            ("C++/parser/main.cpp", "posix_rel"),
            ("foo+bar/baz.ts", "posix_rel"),
            ("is_valid?.ts", "filename"),
            ("Cargo.lock", "filename"),
            ("tsconfig.json", "filename"),
        ];
        for (query, expected) in cases {
            assert_eq!(pre_tier_exempt(query), Some(expected), "{query}");
            assert_eq!(kind(query), QueryKind::Path, "{query}");
        }
        assert_eq!(pre_tier_exempt("foo?"), None);
    }

    #[test]
    fn path_exemptions_reject_regex_sequences() {
        assert_all_kind(
            &[
                "src/.*",
                "src/.+",
                r"C:\bin\foo*.exe",
                r"C:\Users\\w+",
                r"src\w+\main.ts",
            ],
            QueryKind::Regex,
        );
    }

    #[test]
    fn tier_a_and_c_regex_signals_route_to_regex() {
        assert_all_kind(
            &[
                "^export",
                "foo$",
                "^main$",
                r"foo\.bar",
                r"\(method\)",
                r"\bTODO\b",
                ".*foo",
                "foo|bar",
                "(?:foo)",
                "(?P<n>foo)",
                "(?i)Todo",
                r"\p{Lu}",
                r"\xFF",
                r"\u{1F600}",
                "a{3}",
                r"\n",
                r"\t",
                r"\r",
                r"\tindent",
            ],
            QueryKind::Regex,
        );
    }

    #[test]
    fn character_classes_route_only_when_they_look_like_classes() {
        assert_all_kind(&["[a-z]+", "[^abc]", r"[\w]+"], QueryKind::Regex);
        assert_none_regex(&[
            "arr[0]",
            "obj[key]",
            "config[\"key\"]",
            "#[derive]",
            "Vec<[u8; 32]>",
        ]);
    }

    #[test]
    fn unsupported_regex_syntax_still_routes_to_regex_for_compile_error() {
        assert_all_kind(
            &[
                "(?=foo)",
                "(?!foo)",
                "(?<=foo)",
                "(?<!foo)",
                "(?P=name)",
                r"\1",
                "foo*+",
                "(?>foo)",
            ],
            QueryKind::Regex,
        );
    }

    #[test]
    fn explicit_code_tokens_for_natural_language_skip_bare_capitalized_words() {
        assert!(extract_explicit_code_tokens("Engine implementations").is_empty());
        assert_eq!(
            extract_explicit_code_tokens("find `Engine` and engine_factory"),
            vec!["Engine".to_string(), "engine_factory".to_string()]
        );
        assert_eq!(
            extract_explicit_code_tokens("Engine Index"),
            vec!["Engine.Index".to_string()]
        );
        assert_eq!(
            extract_explicit_code_tokens("use Engine.Index and AllocationService"),
            vec!["Engine.Index".to_string(), "AllocationService".to_string()]
        );
    }

    #[test]
    fn two_word_lowercase_concepts_route_to_natural_language() {
        assert_all_kind(
            &["retry logic", "auth flow", "cache invalidation"],
            QueryKind::NaturalLanguage,
        );
    }

    #[test]
    fn identifierish_short_queries_stay_identifier() {
        assert_all_kind(
            &["useState hook", "parseConfig", "parse_config option"],
            QueryKind::Identifier,
        );
    }

    #[test]
    fn question_mark_code_shapes_do_not_route_to_regex() {
        assert_none_regex(&["foo()?", "optional?.length", "user?.name", "arr[0]?"]);
    }

    #[test]
    fn question_mark_regex_quantifiers_still_route_to_regex() {
        assert_all_kind(&["colou?r", "https?"], QueryKind::Regex);
    }

    #[test]
    fn weak_regex_like_punctuation_does_not_route_to_regex() {
        assert_none_regex(&[
            "^id",
            "id$",
            "^",
            "$",
            "$HOME",
            r"\.",
            "array.length",
            "foo()",
            "map.get(key)",
            "a|b",
        ]);
    }
}