Skip to main content

aft/
query_shape.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4static CAMEL_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z][A-Z]").unwrap());
5static SNAKE_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z]_[a-z]").unwrap());
6static PASCAL_CASE_RE: LazyLock<Regex> =
7    LazyLock::new(|| Regex::new(r"^[A-Z][a-z]+[A-Z]").unwrap());
8static ACRONYM_PASCAL_RE: LazyLock<Regex> =
9    LazyLock::new(|| Regex::new(r"\b[A-Z]{2,}[A-Z][a-z]").unwrap());
10static DOT_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-zA-Z]\.[a-zA-Z]").unwrap());
11static FILE_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[/\\].*\.\w{1,5}$").unwrap());
12static HEX_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"0x[A-Fa-f0-9]+").unwrap());
13static ERROR_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bERR_\w+").unwrap());
14static NUMERIC_ERROR_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bE\d{4,}").unwrap());
15static TYPESCRIPT_ERROR_RE: LazyLock<Regex> =
16    LazyLock::new(|| Regex::new(r"\bTS\d{4,}\b").unwrap());
17static HTTP_STATUS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[1-5]\d{2}\b").unwrap());
18static IDENTIFIER_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
19    Regex::new(r"\b[A-Za-z_$][A-Za-z0-9_$]*(?:\.[A-Za-z_$][A-Za-z0-9_$]*)*\b").unwrap()
20});
21
22static WINDOWS_ABS_PATH_RE: LazyLock<Regex> =
23    LazyLock::new(|| Regex::new(r"^[A-Za-z]:[\\/][A-Za-z0-9_.\-+?\\/' ]+$").unwrap());
24static WINDOWS_REL_PATH_RE: LazyLock<Regex> =
25    LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(\\[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
26static POSIX_ABS_PATH_RE: LazyLock<Regex> =
27    LazyLock::new(|| Regex::new(r"^/[A-Za-z0-9_.\-+?/' ]+$").unwrap());
28static POSIX_REL_PATH_RE: LazyLock<Regex> =
29    LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(/[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
30static UNC_PATH_RE: LazyLock<Regex> =
31    LazyLock::new(|| Regex::new(r"^\\\\[A-Za-z0-9_.\-+?\\']+$").unwrap());
32static FILENAME_EXEMPTION_RE: LazyLock<Regex> =
33    LazyLock::new(|| Regex::new(r"^[A-Za-z_][A-Za-z0-9_.\-+'? ]*\.[A-Za-z0-9]{1,8}$").unwrap());
34static BRACE_QUANTIFIER_RE: LazyLock<Regex> =
35    LazyLock::new(|| Regex::new(r"\{\d+(?:,\d*)?\}").unwrap());
36static NAMED_CAPTURE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(\?P<[^>]+>").unwrap());
37static CHAR_RANGE_RE: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r"[A-Za-z0-9]-[A-Za-z0-9]").unwrap());
39
/// Lowercase words that, when they open a query, count as a natural-language
/// signal in `classify`.
const QUESTION_WORDS: &[&str] = &[
    "how",
    "what",
    "where",
    "why",
    "when",
    "which",
    "who",
    "does",
];
43
/// The shape assigned to a query by `classify`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryKind {
    /// Code-identifier-like input without natural-language signals.
    Identifier,
    /// An identifier or error code combined with natural-language signals.
    Mixed,
    /// An error code without natural-language signals.
    ErrorCode,
    /// A URL, file-system path, or file name.
    Path,
    /// Input that reads as a regular expression.
    Regex,
    /// Everything else, including the empty query.
    NaturalLanguage,
}
53
/// Per-kind weighting attached to a classified query.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ShapeWeights {
    /// Weight given to the semantic side.
    pub semantic: f32,
    /// Weight given to the lexical side.
    pub lexical: f32,
    /// Flag stored alongside the weights; set per kind by `weights_for`.
    pub should_use_lexical: bool,
}
60
61#[derive(Debug, Clone, Copy, PartialEq)]
62pub struct QueryShape {
63    pub kind: QueryKind,
64    pub weights: ShapeWeights,
65}
66
67pub fn classify(query: &str) -> QueryShape {
68    let trimmed = query.trim();
69    if trimmed.is_empty() {
70        return shape(QueryKind::NaturalLanguage);
71    }
72
73    if pre_tier_exempt(trimmed).is_some() {
74        return shape(QueryKind::Path);
75    }
76
77    if looks_like_regex(trimmed) {
78        return shape(QueryKind::Regex);
79    }
80
81    let words: Vec<&str> = trimmed.split_whitespace().collect();
82    let word_count = words.len();
83    let first_word_lower = words[0].to_ascii_lowercase();
84
85    if FILE_PATH_RE.is_match(trimmed) {
86        return shape(QueryKind::Path);
87    }
88
89    let has_question_word = QUESTION_WORDS.contains(&first_word_lower.as_str());
90    let is_long_phrase = word_count > 2;
91    let is_two_word_concept = is_two_word_lowercase_concept(&words);
92    let has_natural_language_signals = has_question_word || is_long_phrase || is_two_word_concept;
93    let has_error_code = contains_error_code(trimmed, word_count);
94
95    if has_error_code && has_natural_language_signals {
96        return shape(QueryKind::Mixed);
97    }
98
99    if has_error_code {
100        return shape(QueryKind::ErrorCode);
101    }
102
103    let has_code_identifier = CAMEL_CASE_RE.is_match(trimmed)
104        || SNAKE_CASE_RE.is_match(trimmed)
105        || PASCAL_CASE_RE.is_match(trimmed)
106        || ACRONYM_PASCAL_RE.is_match(trimmed)
107        || DOT_PATH_RE.is_match(trimmed);
108
109    if has_code_identifier && has_natural_language_signals {
110        return shape(QueryKind::Mixed);
111    }
112
113    if has_code_identifier || (word_count <= 2 && !has_natural_language_signals) {
114        return shape(QueryKind::Identifier);
115    }
116
117    shape(QueryKind::NaturalLanguage)
118}
119
120pub fn extract_tokens(query: &str, shape: &QueryShape) -> Vec<String> {
121    match shape.kind {
122        QueryKind::NaturalLanguage | QueryKind::Regex => Vec::new(),
123        QueryKind::Path => extract_path_tokens(query),
124        QueryKind::ErrorCode => extract_error_code_tokens(query),
125        QueryKind::Identifier => extract_identifier_tokens(query, false),
126        QueryKind::Mixed => extract_identifier_tokens(query, true),
127    }
128}
129
/// Lexical tokens for a short natural-language concept routed to Hybrid, such
/// as "parse imports".
///
/// `extract_tokens` yields nothing for natural-language queries because their
/// words are not code identifiers, yet a two-word concept is often a literal
/// code phrase that the trigram lane can match. The query is split on
/// whitespace and every word of at least three characters (the trigram floor)
/// is kept, in order.
pub fn extract_short_nl_lexical_tokens(query: &str) -> Vec<String> {
    let mut kept = Vec::new();
    for word in query.split_whitespace() {
        // Count characters, not bytes, so multi-byte words are measured fairly.
        if word.chars().count() >= 3 {
            kept.push(word.to_owned());
        }
    }
    kept
}
142
143pub(crate) fn is_type_concept_identifier_query(query: &str, shape: &QueryShape) -> bool {
144    if shape.kind != QueryKind::Identifier {
145        return false;
146    }
147
148    let mut identifier_token_count = 0;
149    let mut has_type_token = false;
150    let mut has_lowercase_concept_word = false;
151
152    for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
153        let token = mat.as_str();
154        identifier_token_count += 1;
155        has_type_token |= is_type_concept_type_token(token);
156        has_lowercase_concept_word |= is_dictionary_style_lowercase_word(token);
157    }
158
159    identifier_token_count >= 2 && has_type_token && has_lowercase_concept_word
160}
161
162fn is_type_concept_type_token(token: &str) -> bool {
163    token
164        .chars()
165        .next()
166        .is_some_and(|first| first.is_ascii_uppercase())
167        && (is_titlecase_word(token)
168            || PASCAL_CASE_RE.is_match(token)
169            || ACRONYM_PASCAL_RE.is_match(token))
170}
171
172/// Code-shaped tokens that are explicit enough to use for semantic name priors
173/// inside natural-language queries. Bare prose words are excluded: a lone
174/// capitalized word like "Engine" is too ambiguous unless quoted, while adjacent
175/// TitleCase words are treated as one qualified name such as "Engine.Index".
176pub(crate) fn extract_explicit_code_tokens(query: &str) -> Vec<String> {
177    let mut tokens = Vec::new();
178
179    push_quoted_code_tokens(query, &mut tokens);
180    let title_spans = push_adjacent_titlecase_tokens(query, 0, &mut tokens);
181    for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
182        if span_is_covered(&title_spans, mat.start(), mat.end()) {
183            continue;
184        }
185        let token = mat.as_str();
186        if is_code_identifier_token(token) {
187            push_unique(&mut tokens, token);
188        }
189    }
190
191    tokens
192}
193
194pub fn pre_tier_exempt(query: &str) -> Option<&'static str> {
195    if let Some(kind) = check_url_exemption(query) {
196        return Some(kind);
197    }
198    check_path_exemption(query)
199}
200
201pub fn looks_like_regex(query: &str) -> bool {
202    crate::pattern_compile::detect_unsupported_features(query).is_some()
203        || tier_a_regex_signal(query)
204        || tier_b_character_class(query)
205        || tier_c_adjacent_meta(query)
206}
207
208fn check_url_exemption(query: &str) -> Option<&'static str> {
209    let parsed = url::Url::parse(query).ok()?;
210    if !matches!(parsed.scheme(), "http" | "https" | "file" | "ftp" | "ssh") {
211        return None;
212    }
213    if has_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
214        return None;
215    }
216    Some("url")
217}
218
219fn check_path_exemption(query: &str) -> Option<&'static str> {
220    let kind = if WINDOWS_ABS_PATH_RE.is_match(query) {
221        "windows_abs"
222    } else if WINDOWS_REL_PATH_RE.is_match(query) {
223        "windows_rel"
224    } else if POSIX_ABS_PATH_RE.is_match(query) {
225        "posix_abs"
226    } else if POSIX_REL_PATH_RE.is_match(query) {
227        "posix_rel"
228    } else if UNC_PATH_RE.is_match(query) {
229        "unc"
230    } else if FILENAME_EXEMPTION_RE.is_match(query) {
231        "filename"
232    } else {
233        return None;
234    };
235    if has_path_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
236        return None;
237    }
238    Some(kind)
239}
240
241fn contains_error_code(query: &str, word_count: usize) -> bool {
242    HEX_CODE_RE.is_match(query)
243        || ERROR_PREFIX_RE.is_match(query)
244        || NUMERIC_ERROR_RE.is_match(query)
245        || TYPESCRIPT_ERROR_RE.is_match(query)
246        || has_http_status(query, word_count)
247}
248
249fn has_http_status(query: &str, word_count: usize) -> bool {
250    HTTP_STATUS_RE.is_match(query)
251        && (word_count <= 3 || query.to_ascii_lowercase().contains("http"))
252}
253
254fn is_two_word_lowercase_concept(words: &[&str]) -> bool {
255    words.len() == 2
256        && words
257            .iter()
258            .all(|word| is_dictionary_style_lowercase_word(word))
259}
260
/// Reports whether `word` is at least three bytes long and consists solely of
/// ASCII lowercase letters.
fn is_dictionary_style_lowercase_word(word: &str) -> bool {
    if word.len() < 3 {
        return false;
    }
    word.chars().all(|ch| matches!(ch, 'a'..='z'))
}
264
265fn has_regex_meta_sequences(query: &str) -> bool {
266    query.contains(".+")
267        || query.contains(".*")
268        || query.contains(".?")
269        || query.contains(r"\n")
270        || query.contains(r"\t")
271        || query.contains(r"\r")
272        || query.contains(r"\b")
273        || query.contains(r"\B")
274        || query.contains(r"\w")
275        || query.contains(r"\W")
276        || query.contains(r"\d")
277        || query.contains(r"\D")
278        || query.contains(r"\s")
279        || query.contains(r"\S")
280        || query.contains(r"\p{")
281        || query.contains(r"\x")
282        || query.contains(r"\u{")
283        || has_escaped_regex_metachar(query)
284}
285
286fn has_path_regex_meta_sequences(query: &str) -> bool {
287    query.contains(".+")
288        || query.contains(".*")
289        || query.contains(".?")
290        || query.contains(r"\p{")
291        || query.contains(r"\x")
292        || query.contains(r"\u{")
293        || has_path_context_regex_escape(query)
294        || has_escaped_regex_metachar(query)
295}
296
297fn has_path_context_regex_escape(query: &str) -> bool {
298    let chars = query.char_indices().collect::<Vec<_>>();
299    for index in 0..chars.len().saturating_sub(1) {
300        if chars[index].1 != '\\' {
301            continue;
302        }
303        let escaped = chars[index + 1].1;
304        if matches!(escaped, 'b' | 'B' | 'w' | 'W' | 'd' | 'D' | 's' | 'S')
305            && path_escape_looks_like_regex(&chars, index + 1)
306        {
307            return true;
308        }
309    }
310    false
311}
312
/// Decides whether the escape whose letter sits at `escaped_index` reads as
/// regex syntax, based on the character that follows it.
///
/// An escape at the very end of the input counts as regex. Otherwise it
/// counts only when followed by a quantifier, group or class opener,
/// alternation, anchor, backslash, or slash.
fn path_escape_looks_like_regex(chars: &[(usize, char)], escaped_index: usize) -> bool {
    match chars.get(escaped_index + 1) {
        None => true,
        Some(&(_, following)) => matches!(
            following,
            '*' | '+' | '?' | '{' | '(' | '[' | '|' | '^' | '$' | '\\' | '/'
        ),
    }
}
323
324fn has_escaped_regex_metachar(query: &str) -> bool {
325    let mut escaped = false;
326    for ch in query.chars() {
327        if escaped {
328            if is_escaped_metachar(ch) {
329                return true;
330            }
331            escaped = false;
332            continue;
333        }
334        escaped = ch == '\\';
335    }
336    false
337}
338
/// Reports whether `query` contains any of `*`, `[`, `]`, `(`, `)`, `|`, `{`,
/// or `}`.
fn has_obvious_regex_chars(query: &str) -> bool {
    query
        .chars()
        .any(|ch| matches!(ch, '*' | '[' | ']' | '(' | ')' | '|' | '{' | '}'))
}
349
350fn tier_a_regex_signal(query: &str) -> bool {
351    query.contains("(?:")
352        || NAMED_CAPTURE_RE.is_match(query)
353        || ["(?i)", "(?m)", "(?s)", "(?x)"]
354            .iter()
355            .any(|signal| query.contains(signal))
356        || [
357            r"\b", r"\B", r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\p{", r"\x", r"\u{", r"\n",
358            r"\t", r"\r",
359        ]
360        .iter()
361        .any(|signal| query.contains(signal))
362        || has_brace_quantifier(query)
363        || has_anchored_identifier(query)
364        || has_contextual_escaped_metachar(query)
365}
366
367fn has_brace_quantifier(query: &str) -> bool {
368    for matched in BRACE_QUANTIFIER_RE.find_iter(query) {
369        if matched.start() > 0
370            && query[..matched.start()]
371                .chars()
372                .last()
373                .is_some_and(|ch| !ch.is_whitespace())
374        {
375            return true;
376        }
377    }
378    false
379}
380
381fn has_anchored_identifier(query: &str) -> bool {
382    let trimmed = query.trim();
383    if let Some(rest) = trimmed.strip_prefix('^') {
384        if leading_identifier_len(rest) >= 3 {
385            return true;
386        }
387    }
388    if let Some(rest) = trimmed.strip_suffix('$') {
389        if trailing_identifier_len(rest) >= 3 {
390            return true;
391        }
392    }
393    false
394}
395
/// Counts the ASCII alphanumeric or underscore characters at the start of
/// `text`, stopping at the first character that is neither.
fn leading_identifier_len(text: &str) -> usize {
    let mut length = 0;
    for ch in text.chars() {
        if !(ch.is_ascii_alphanumeric() || ch == '_') {
            break;
        }
        length += 1;
    }
    length
}
401
/// Counts the ASCII alphanumeric or underscore characters at the end of
/// `text`, scanning backwards until the first character that is neither.
fn trailing_identifier_len(text: &str) -> usize {
    let mut length = 0;
    for ch in text.chars().rev() {
        if !(ch.is_ascii_alphanumeric() || ch == '_') {
            break;
        }
        length += 1;
    }
    length
}
408
409fn has_contextual_escaped_metachar(query: &str) -> bool {
410    let chars: Vec<char> = query.chars().collect();
411    let mut index = 0usize;
412    while index + 1 < chars.len() {
413        if chars[index] == '\\' && is_escaped_metachar(chars[index + 1]) {
414            let literal_after = chars[index + 2..]
415                .iter()
416                .filter(|ch| ch.is_ascii_alphanumeric() || **ch == '_')
417                .count();
418            if literal_after >= 2 {
419                return true;
420            }
421            index += 2;
422        } else {
423            index += 1;
424        }
425    }
426    false
427}
428
/// Reports whether `ch` is a regex metacharacter that is commonly escaped:
/// one of `. * + ? ( ) [ ] { } | ^ $`.
fn is_escaped_metachar(ch: char) -> bool {
    const METACHARS: &str = ".*+?()[]{}|^$";
    METACHARS.contains(ch)
}
435
436fn tier_b_character_class(query: &str) -> bool {
437    for content in bracket_contents(query) {
438        if content.starts_with('^')
439            || CHAR_RANGE_RE.is_match(&content)
440            || [r"\w", r"\d", r"\s", r"\W", r"\D", r"\S"]
441                .iter()
442                .any(|signal| content.contains(signal))
443            || multi_char_non_identifier_class(&content)
444        {
445            return true;
446        }
447    }
448    false
449}
450
/// Returns the text between each unescaped `[` and the next unescaped `]`.
///
/// A backslash hides the character after it from bracket matching. A `[`
/// seen while a group is already open is kept as part of that group's
/// content, and an unclosed group is dropped.
fn bracket_contents(query: &str) -> Vec<String> {
    let mut groups = Vec::new();
    let mut content_start: Option<usize> = None;
    let mut cursor = query.char_indices();

    while let Some((position, ch)) = cursor.next() {
        match ch {
            '\\' => {
                // Skip whatever the backslash escapes.
                cursor.next();
            }
            '[' if content_start.is_none() => {
                content_start = Some(position + ch.len_utf8());
            }
            ']' => {
                if let Some(start) = content_start.take() {
                    groups.push(query[start..position].to_owned());
                }
            }
            _ => {}
        }
    }

    groups
}
476
/// Reports whether `content` has at least two characters and none of them is
/// an ASCII alphanumeric, underscore, double quote, single quote, or
/// semicolon.
fn multi_char_non_identifier_class(content: &str) -> bool {
    if content.chars().count() < 2 {
        return false;
    }
    content
        .chars()
        .all(|ch| !(ch.is_ascii_alphanumeric() || matches!(ch, '_' | '"' | '\'' | ';')))
}
484
485fn tier_c_adjacent_meta(query: &str) -> bool {
486    has_dot_quantifier(query)
487        || has_literal_atom_quantifier(query)
488        || has_regex_pipe(query)
489        || escaped_paren_count(query) >= 2
490}
491
/// Reports whether `query` contains `.*`, `.+`, or `.?` and the trimmed query
/// is longer than that sequence alone.
fn has_dot_quantifier(query: &str) -> bool {
    let trimmed_len = query.trim().len();
    for signal in [".*", ".+", ".?"] {
        if trimmed_len > signal.len() && query.contains(signal) {
            return true;
        }
    }
    false
}
497
498fn has_literal_atom_quantifier(query: &str) -> bool {
499    let chars = query.char_indices().collect::<Vec<_>>();
500    for (index, (byte_index, ch)) in chars.iter().copied().enumerate() {
501        if !is_bare_quantifier(ch) || is_escaped_at(query, byte_index) {
502            continue;
503        }
504        if chars
505            .get(index + 1)
506            .is_some_and(|(_, next)| is_bare_quantifier(*next))
507        {
508            continue;
509        }
510        if ch == '?'
511            && (sentence_final_question_mark_in_phrase(query, byte_index)
512                || question_mark_is_code_shape(&chars, index))
513        {
514            continue;
515        }
516        if previous_is_literal_atom(&chars, index) {
517            return true;
518        }
519    }
520    false
521}
522
/// Reports whether the `?` at `byte_index` ends the query (ignoring trailing
/// whitespace) and is preceded by more than one whitespace-separated word.
fn sentence_final_question_mark_in_phrase(query: &str, byte_index: usize) -> bool {
    let before = &query[..byte_index];
    let after = &query[byte_index + '?'.len_utf8()..];
    after.trim().is_empty() && before.split_whitespace().nth(1).is_some()
}
527
528fn question_mark_is_code_shape(chars: &[(usize, char)], question_index: usize) -> bool {
529    question_mark_is_optional_chain(chars, question_index)
530        || question_mark_after_empty_call(chars, question_index)
531        || question_mark_after_index_expression(chars, question_index)
532        || question_mark_is_typescript_optional(chars, question_index)
533}
534
535fn question_mark_is_optional_chain(chars: &[(usize, char)], question_index: usize) -> bool {
536    chars
537        .get(question_index + 1)
538        .is_some_and(|(_, next)| *next == '.')
539        && question_index
540            .checked_sub(1)
541            .and_then(|previous_index| chars.get(previous_index))
542            .is_some_and(|(_, previous)| is_code_expression_tail(*previous))
543}
544
545fn question_mark_after_empty_call(chars: &[(usize, char)], question_index: usize) -> bool {
546    let Some(call_open_index) = question_index.checked_sub(2) else {
547        return false;
548    };
549    chars
550        .get(question_index - 1)
551        .is_some_and(|(_, previous)| *previous == ')')
552        && chars
553            .get(call_open_index)
554            .is_some_and(|(_, open)| *open == '(')
555        && call_open_index
556            .checked_sub(1)
557            .and_then(|callee_index| chars.get(callee_index))
558            .is_some_and(|(_, callee_tail)| is_code_expression_tail(*callee_tail))
559}
560
561fn question_mark_after_index_expression(chars: &[(usize, char)], question_index: usize) -> bool {
562    if chars
563        .get(question_index.checked_sub(1).unwrap_or(usize::MAX))
564        .is_none_or(|(_, previous)| *previous != ']')
565    {
566        return false;
567    }
568
569    let mut depth = 0usize;
570    for index in (0..question_index).rev() {
571        match chars[index].1 {
572            ']' => depth += 1,
573            '[' => {
574                depth = depth.saturating_sub(1);
575                if depth == 0 {
576                    return index
577                        .checked_sub(1)
578                        .and_then(|target_index| chars.get(target_index))
579                        .is_some_and(|(_, target_tail)| is_code_expression_tail(*target_tail));
580                }
581            }
582            _ => {}
583        }
584    }
585    false
586}
587
588fn question_mark_is_typescript_optional(chars: &[(usize, char)], question_index: usize) -> bool {
589    let previous_is_identifier = question_index
590        .checked_sub(1)
591        .and_then(|previous_index| chars.get(previous_index))
592        .is_some_and(|(_, previous)| is_identifier_tail(*previous));
593    if !previous_is_identifier {
594        return false;
595    }
596    if chars
597        .get(question_index + 1)
598        .is_none_or(|(_, next)| *next != ':')
599    {
600        return false;
601    }
602
603    chars
604        .get(question_index + 2)
605        .is_none_or(|(_, after_colon)| {
606            after_colon.is_whitespace()
607                || after_colon.is_ascii_alphabetic()
608                || matches!(*after_colon, '_' | '{' | '[' | '(' | '"' | '\'')
609        })
610}
611
612fn is_code_expression_tail(ch: char) -> bool {
613    is_identifier_tail(ch) || matches!(ch, ')' | ']')
614}
615
/// Reports whether `ch` is an ASCII alphanumeric, `_`, or `$`.
fn is_identifier_tail(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '$')
}
619
/// Reports whether the character just before `quantifier_index` is an ASCII
/// alphanumeric, `_`, `)`, or `]`. Returns `false` when there is no such
/// character.
fn previous_is_literal_atom(chars: &[(usize, char)], quantifier_index: usize) -> bool {
    quantifier_index
        .checked_sub(1)
        .and_then(|previous_index| chars.get(previous_index))
        .is_some_and(|&(_, previous)| {
            previous.is_ascii_alphanumeric() || matches!(previous, '_' | ')' | ']')
        })
}
630
/// Reports whether `ch` is one of the single-character quantifiers `*`, `+`,
/// or `?`.
fn is_bare_quantifier(ch: char) -> bool {
    ch == '*' || ch == '+' || ch == '?'
}
634
/// Reports whether the character at `byte_index` is escaped, meaning it is
/// preceded by an odd number of consecutive backslashes.
fn is_escaped_at(query: &str, byte_index: usize) -> bool {
    let prefix = &query[..byte_index];
    // The length removed by trimming is the size of the trailing backslash
    // run; a backslash is a single byte.
    let backslash_run = prefix.len() - prefix.trim_end_matches('\\').len();
    backslash_run % 2 == 1
}
643
644fn has_regex_pipe(query: &str) -> bool {
645    for (index, ch) in query.char_indices() {
646        if ch != '|' {
647            continue;
648        }
649        let left = trailing_identifier_len(&query[..index]);
650        let right = leading_identifier_len(&query[index + ch.len_utf8()..]);
651        if left >= 3 && right >= 3 {
652            return true;
653        }
654    }
655    false
656}
657
/// Counts the parentheses in `query` that are escaped by a backslash.
///
/// Each backslash consumes the character after it, so the second backslash of
/// `\\` never escapes a following parenthesis.
fn escaped_paren_count(query: &str) -> usize {
    let mut total = 0usize;
    let mut remaining = query.chars();
    while let Some(ch) = remaining.next() {
        if ch != '\\' {
            continue;
        }
        if matches!(remaining.next(), Some('(' | ')')) {
            total += 1;
        }
    }
    total
}
673
674fn push_quoted_code_tokens(query: &str, tokens: &mut Vec<String>) {
675    let mut open: Option<(char, usize)> = None;
676    let mut escaped = false;
677
678    for (index, ch) in query.char_indices() {
679        if escaped {
680            escaped = false;
681            continue;
682        }
683        if ch == '\\' {
684            escaped = true;
685            continue;
686        }
687
688        if let Some((delimiter, content_start)) = open {
689            if ch == delimiter {
690                let content = &query[content_start..index];
691                let title_spans = push_adjacent_titlecase_tokens(content, content_start, tokens);
692                for mat in IDENTIFIER_TOKEN_RE.find_iter(content) {
693                    let start = content_start + mat.start();
694                    let end = content_start + mat.end();
695                    if !span_is_covered(&title_spans, start, end) {
696                        push_unique(tokens, mat.as_str());
697                    }
698                }
699                open = None;
700            }
701        } else if matches!(ch, '"' | '\'' | '`') {
702            open = Some((ch, index + ch.len_utf8()));
703        }
704    }
705}
706
707fn push_adjacent_titlecase_tokens(
708    text: &str,
709    base_offset: usize,
710    tokens: &mut Vec<String>,
711) -> Vec<(usize, usize)> {
712    let mut covered_spans = Vec::new();
713    let mut current: Vec<(usize, usize, &str)> = Vec::new();
714    let mut previous_end: Option<usize> = None;
715
716    for mat in IDENTIFIER_TOKEN_RE.find_iter(text) {
717        let token = mat.as_str();
718        let adjacent_to_current = previous_end.is_some_and(|end| {
719            !current.is_empty() && text[end..mat.start()].chars().all(|ch| ch.is_whitespace())
720        });
721        if is_titlecase_word(token) && (current.is_empty() || adjacent_to_current) {
722            current.push((base_offset + mat.start(), base_offset + mat.end(), token));
723        } else {
724            flush_titlecase_sequence(&mut current, &mut covered_spans, tokens);
725            if is_titlecase_word(token) {
726                current.push((base_offset + mat.start(), base_offset + mat.end(), token));
727            }
728        }
729        previous_end = Some(mat.end());
730    }
731
732    flush_titlecase_sequence(&mut current, &mut covered_spans, tokens);
733    covered_spans
734}
735
736fn flush_titlecase_sequence(
737    current: &mut Vec<(usize, usize, &str)>,
738    covered_spans: &mut Vec<(usize, usize)>,
739    tokens: &mut Vec<String>,
740) {
741    if current.len() >= 2 {
742        let qualified = current
743            .iter()
744            .map(|(_, _, token)| *token)
745            .collect::<Vec<_>>()
746            .join(".");
747        push_unique(tokens, &qualified);
748        covered_spans.extend(current.iter().map(|(start, end, _)| (*start, *end)));
749    }
750    current.clear();
751}
752
/// Reports whether the range `start..end` lies entirely within any one of
/// `spans`.
fn span_is_covered(spans: &[(usize, usize)], start: usize, end: usize) -> bool {
    for &(span_start, span_end) in spans {
        if span_start <= start && end <= span_end {
            return true;
        }
    }
    false
}
758
/// Reports whether `token` is a single capitalized word.
///
/// The first character must be an ASCII uppercase letter, every following
/// character must be an ASCII alphanumeric, and at least one of the following
/// characters must be a letter. All-caps words such as `ENGINE` qualify;
/// a lone letter (`E`) or a letter followed only by digits (`E1`) does not.
/// Tokens containing `.`, `_`, or `$` are rejected because those characters
/// are not alphanumeric.
fn is_titlecase_word(token: &str) -> bool {
    let mut chars = token.chars();
    if !chars.next().is_some_and(|first| first.is_ascii_uppercase()) {
        return false;
    }

    let mut has_letter_after_first = false;
    for ch in chars {
        if !ch.is_ascii_alphanumeric() {
            return false;
        }
        has_letter_after_first |= ch.is_ascii_alphabetic();
    }

    // No separate lowercase / all-caps test is needed: once the first letter
    // is uppercase, a word either has a lowercase letter or has none at all,
    // and both cases are accepted.
    has_letter_after_first
}
787
788fn extract_path_tokens(query: &str) -> Vec<String> {
789    let mut tokens = Vec::new();
790    for segment in query
791        .split(['/', '\\'])
792        .filter(|segment| !segment.is_empty())
793    {
794        if segment.contains('.') {
795            if let Some(stem) = segment.rsplit_once('.').map(|(stem, _)| stem) {
796                push_unique(&mut tokens, stem);
797            }
798        }
799        push_unique(&mut tokens, segment);
800    }
801    tokens
802}
803
804fn extract_error_code_tokens(query: &str) -> Vec<String> {
805    let mut tokens = Vec::new();
806    for regex in [
807        &*HEX_CODE_RE,
808        &*ERROR_PREFIX_RE,
809        &*NUMERIC_ERROR_RE,
810        &*TYPESCRIPT_ERROR_RE,
811        &*HTTP_STATUS_RE,
812    ] {
813        for mat in regex.find_iter(query) {
814            push_unique(&mut tokens, mat.as_str());
815        }
816    }
817    if tokens.is_empty() && !query.trim().is_empty() {
818        push_unique(&mut tokens, query.trim());
819    }
820    tokens
821}
822
823fn extract_identifier_tokens(query: &str, require_code_shape: bool) -> Vec<String> {
824    let mut tokens = Vec::new();
825    for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
826        let token = mat.as_str();
827        if require_code_shape && !is_code_identifier_token(token) {
828            continue;
829        }
830        push_unique(&mut tokens, token);
831    }
832    tokens
833}
834
835fn is_code_identifier_token(token: &str) -> bool {
836    CAMEL_CASE_RE.is_match(token)
837        || SNAKE_CASE_RE.is_match(token)
838        || PASCAL_CASE_RE.is_match(token)
839        || ACRONYM_PASCAL_RE.is_match(token)
840        || DOT_PATH_RE.is_match(token)
841        || ERROR_PREFIX_RE.is_match(token)
842        || NUMERIC_ERROR_RE.is_match(token)
843        || TYPESCRIPT_ERROR_RE.is_match(token)
844}
845
/// Appends `token` to `tokens` unless it is empty or already present.
fn push_unique(tokens: &mut Vec<String>, token: &str) {
    if token.is_empty() {
        return;
    }
    let already_present = tokens.iter().any(|existing| existing.as_str() == token);
    if !already_present {
        tokens.push(token.to_owned());
    }
}
851
852fn shape(kind: QueryKind) -> QueryShape {
853    QueryShape {
854        kind,
855        weights: weights_for(kind),
856    }
857}
858
859fn weights_for(kind: QueryKind) -> ShapeWeights {
860    match kind {
861        QueryKind::Identifier => ShapeWeights {
862            semantic: 0.2,
863            lexical: 0.8,
864            should_use_lexical: true,
865        },
866        QueryKind::Path | QueryKind::ErrorCode => ShapeWeights {
867            semantic: 0.1,
868            lexical: 0.9,
869            should_use_lexical: true,
870        },
871        QueryKind::Regex => ShapeWeights {
872            semantic: 0.0,
873            lexical: 1.0,
874            should_use_lexical: false,
875        },
876        QueryKind::NaturalLanguage => ShapeWeights {
877            semantic: 0.6,
878            lexical: 0.4,
879            should_use_lexical: false,
880        },
881        QueryKind::Mixed => ShapeWeights {
882            semantic: 0.4,
883            lexical: 0.6,
884            should_use_lexical: true,
885        },
886    }
887}
888
889#[cfg(test)]
890mod tests {
891    use super::*;
892
893    fn kind(query: &str) -> QueryKind {
894        classify(query).kind
895    }
896
897    #[test]
898    fn url_exemptions_allow_common_literal_url_punctuation() {
899        for query in [
900            "https://api.io/path",
901            "https://api.io/foo?q=test",
902            "https://api.io/foo+bar",
903            "https://api.io/foo@bar",
904            "https://api.io/foo#anchor",
905        ] {
906            assert_eq!(pre_tier_exempt(query), Some("url"), "{query}");
907            assert_ne!(kind(query), QueryKind::Regex, "{query}");
908        }
909    }
910
911    #[test]
912    fn url_exemptions_reject_regex_sequences() {
913        for query in [
914            "https://.*",
915            "https://api.io/.+",
916            "file://[^ ]+",
917            "file:///tmp/.+",
918            r"https://api.io/users/\w+",
919        ] {
920            assert_eq!(kind(query), QueryKind::Regex, "{query}");
921        }
922    }
923
924    #[test]
925    fn path_and_filename_exemptions_allow_literal_punctuation() {
926        for (query, expected) in [
927            (r"C:\new\test", "windows_abs"),
928            (r"src\bin\main.rs", "windows_rel"),
929            (r"src\tab\main.ts", "windows_rel"),
930            (r"packages\opencode-plugin\src", "windows_rel"),
931            ("/usr/local/bin", "posix_abs"),
932            ("/Users/John Doe/Documents", "posix_abs"),
933            ("/home/user/.gitignore", "posix_abs"),
934            ("v1/release/notes.md", "posix_rel"),
935            ("/home/user/jeff's-folder", "posix_abs"),
936            ("C++/parser/main.cpp", "posix_rel"),
937            ("foo+bar/baz.ts", "posix_rel"),
938            ("is_valid?.ts", "filename"),
939            ("Cargo.lock", "filename"),
940            ("tsconfig.json", "filename"),
941        ] {
942            assert_eq!(pre_tier_exempt(query), Some(expected), "{query}");
943            assert_eq!(kind(query), QueryKind::Path, "{query}");
944        }
945        assert_eq!(pre_tier_exempt("foo?"), None);
946    }
947
948    #[test]
949    fn path_exemptions_reject_regex_sequences() {
950        for query in [
951            "src/.*",
952            "src/.+",
953            r"C:\bin\foo*.exe",
954            r"C:\Users\\w+",
955            r"src\w+\main.ts",
956        ] {
957            assert_eq!(kind(query), QueryKind::Regex, "{query}");
958        }
959    }
960
961    #[test]
962    fn tier_a_and_c_regex_signals_route_to_regex() {
963        for query in [
964            "^export",
965            "foo$",
966            "^main$",
967            r"foo\.bar",
968            r"\(method\)",
969            r"\bTODO\b",
970            ".*foo",
971            "foo|bar",
972            "(?:foo)",
973            "(?P<n>foo)",
974            "(?i)Todo",
975            r"\p{Lu}",
976            r"\xFF",
977            r"\u{1F600}",
978            "a{3}",
979            // Bare escape sequences route to regex via Tier A. Caveat: `foo\n`
980            // and similar single-backslash-escape after literal text are
981            // genuinely ambiguous with Windows path segments (e.g., file `n`
982            // in directory `foo`) and stay on the path/exemption path.
983            r"\n",
984            r"\t",
985            r"\r",
986            r"\tindent",
987        ] {
988            assert_eq!(kind(query), QueryKind::Regex, "{query}");
989        }
990    }
991
992    #[test]
993    fn character_classes_route_only_when_they_look_like_classes() {
994        for query in ["[a-z]+", "[^abc]", r"[\w]+"] {
995            assert_eq!(kind(query), QueryKind::Regex, "{query}");
996        }
997        for query in [
998            "arr[0]",
999            "obj[key]",
1000            "config[\"key\"]",
1001            "#[derive]",
1002            "Vec<[u8; 32]>",
1003        ] {
1004            assert_ne!(kind(query), QueryKind::Regex, "{query}");
1005        }
1006    }
1007
1008    #[test]
1009    fn unsupported_regex_syntax_still_routes_to_regex_for_compile_error() {
1010        for query in [
1011            "(?=foo)",
1012            "(?!foo)",
1013            "(?<=foo)",
1014            "(?<!foo)",
1015            "(?P=name)",
1016            r"\1",
1017            "foo*+",
1018            "(?>foo)",
1019        ] {
1020            assert_eq!(kind(query), QueryKind::Regex, "{query}");
1021        }
1022    }
1023
1024    #[test]
1025    fn explicit_code_tokens_for_natural_language_skip_bare_capitalized_words() {
1026        assert!(extract_explicit_code_tokens("Engine implementations").is_empty());
1027        assert_eq!(
1028            extract_explicit_code_tokens("find `Engine` and engine_factory"),
1029            vec!["Engine".to_string(), "engine_factory".to_string()]
1030        );
1031        assert_eq!(
1032            extract_explicit_code_tokens("Engine Index"),
1033            vec!["Engine.Index".to_string()]
1034        );
1035        assert_eq!(
1036            extract_explicit_code_tokens("use Engine.Index and AllocationService"),
1037            vec!["Engine.Index".to_string(), "AllocationService".to_string()]
1038        );
1039    }
1040
1041    #[test]
1042    fn two_word_lowercase_concepts_route_to_natural_language() {
1043        for query in ["retry logic", "auth flow", "cache invalidation"] {
1044            assert_eq!(kind(query), QueryKind::NaturalLanguage, "{query}");
1045        }
1046    }
1047
1048    #[test]
1049    fn identifierish_short_queries_stay_identifier() {
1050        for query in ["useState hook", "parseConfig", "parse_config option"] {
1051            assert_eq!(kind(query), QueryKind::Identifier, "{query}");
1052        }
1053    }
1054
1055    #[test]
1056    fn question_mark_code_shapes_do_not_route_to_regex() {
1057        for query in ["foo()?", "optional?.length", "user?.name", "arr[0]?"] {
1058            assert_ne!(kind(query), QueryKind::Regex, "{query}");
1059        }
1060    }
1061
1062    #[test]
1063    fn question_mark_regex_quantifiers_still_route_to_regex() {
1064        for query in ["colou?r", "https?"] {
1065            assert_eq!(kind(query), QueryKind::Regex, "{query}");
1066        }
1067    }
1068
1069    #[test]
1070    fn weak_regex_like_punctuation_does_not_route_to_regex() {
1071        for query in [
1072            "^id",
1073            "id$",
1074            "^",
1075            "$",
1076            "$HOME",
1077            r"\.",
1078            "array.length",
1079            "foo()",
1080            "map.get(key)",
1081            "a|b",
1082        ] {
1083            assert_ne!(kind(query), QueryKind::Regex, "{query}");
1084        }
1085    }
1086}