Skip to main content

aft/
query_shape.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
4static CAMEL_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z][A-Z]").unwrap());
5static SNAKE_CASE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-z]_[a-z]").unwrap());
6static PASCAL_CASE_RE: LazyLock<Regex> =
7    LazyLock::new(|| Regex::new(r"^[A-Z][a-z]+[A-Z]").unwrap());
8static ACRONYM_PASCAL_RE: LazyLock<Regex> =
9    LazyLock::new(|| Regex::new(r"\b[A-Z]{2,}[A-Z][a-z]").unwrap());
10static DOT_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[a-zA-Z]\.[a-zA-Z]").unwrap());
11static FILE_PATH_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[/\\].*\.\w{1,5}$").unwrap());
12static HEX_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"0x[A-Fa-f0-9]+").unwrap());
13static ERROR_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bERR_\w+").unwrap());
14static NUMERIC_ERROR_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bE\d{4,}").unwrap());
15static TYPESCRIPT_ERROR_RE: LazyLock<Regex> =
16    LazyLock::new(|| Regex::new(r"\bTS\d{4,}\b").unwrap());
17static HTTP_STATUS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b[1-5]\d{2}\b").unwrap());
18static IDENTIFIER_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
19    Regex::new(r"\b[A-Za-z_$][A-Za-z0-9_$]*(?:\.[A-Za-z_$][A-Za-z0-9_$]*)*\b").unwrap()
20});
21
22static WINDOWS_ABS_PATH_RE: LazyLock<Regex> =
23    LazyLock::new(|| Regex::new(r"^[A-Za-z]:[\\/][A-Za-z0-9_.\-+?\\/' ]+$").unwrap());
24static WINDOWS_REL_PATH_RE: LazyLock<Regex> =
25    LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(\\[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
26static POSIX_ABS_PATH_RE: LazyLock<Regex> =
27    LazyLock::new(|| Regex::new(r"^/[A-Za-z0-9_.\-+?/' ]+$").unwrap());
28static POSIX_REL_PATH_RE: LazyLock<Regex> =
29    LazyLock::new(|| Regex::new(r"^[A-Za-z0-9_.\-+?' ]+(/[A-Za-z0-9_.\-+?' ]+)+$").unwrap());
30static UNC_PATH_RE: LazyLock<Regex> =
31    LazyLock::new(|| Regex::new(r"^\\\\[A-Za-z0-9_.\-+?\\']+$").unwrap());
32static FILENAME_EXEMPTION_RE: LazyLock<Regex> =
33    LazyLock::new(|| Regex::new(r"^[A-Za-z_][A-Za-z0-9_.\-+'? ]*\.[A-Za-z0-9]{1,8}$").unwrap());
34static BRACE_QUANTIFIER_RE: LazyLock<Regex> =
35    LazyLock::new(|| Regex::new(r"\{\d+(?:,\d*)?\}").unwrap());
36static NAMED_CAPTURE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(\?P<[^>]+>").unwrap());
37static CHAR_RANGE_RE: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r"[A-Za-z0-9]-[A-Za-z0-9]").unwrap());
39
/// Lowercase words that, when they open a query, mark it as carrying a
/// natural-language signal. Compared against the ASCII-lowercased first word.
const QUESTION_WORDS: &[&str] = &[
    "how",
    "what",
    "where",
    "why",
    "when",
    "which",
    "who",
    "does",
];
43
/// The shape `classify` assigns to a search query.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryKind {
    /// A code identifier (camelCase, snake_case, dotted path, ...) or any
    /// other query of at most two words that shows no natural-language
    /// signal.
    Identifier,
    /// A query that combines natural-language signals with an error code or
    /// a code identifier.
    Mixed,
    /// A query containing an error code and no natural-language signal.
    ErrorCode,
    /// A URL, file-system path or file name.
    Path,
    /// A query that looks like a regular expression.
    Regex,
    /// Everything else, including the empty query.
    NaturalLanguage,
}
53
/// Ranking weights attached to a classified query.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ShapeWeights {
    /// Weight given to the semantic score.
    pub semantic: f32,
    /// Weight given to the lexical score.
    pub lexical: f32,
    /// Flag telling callers whether the lexical lane should be used for this
    /// query shape.
    pub should_use_lexical: bool,
}
60
61#[derive(Debug, Clone, Copy, PartialEq)]
62pub struct QueryShape {
63    pub kind: QueryKind,
64    pub weights: ShapeWeights,
65}
66
67pub fn classify(query: &str) -> QueryShape {
68    let trimmed = query.trim();
69    if trimmed.is_empty() {
70        return shape(QueryKind::NaturalLanguage);
71    }
72
73    if pre_tier_exempt(trimmed).is_some() {
74        return shape(QueryKind::Path);
75    }
76
77    if looks_like_regex(trimmed) {
78        return shape(QueryKind::Regex);
79    }
80
81    let words: Vec<&str> = trimmed.split_whitespace().collect();
82    let word_count = words.len();
83    let first_word_lower = words[0].to_ascii_lowercase();
84
85    if FILE_PATH_RE.is_match(trimmed) {
86        return shape(QueryKind::Path);
87    }
88
89    let has_question_word = QUESTION_WORDS.contains(&first_word_lower.as_str());
90    let is_long_phrase = word_count > 2;
91    let is_two_word_concept = is_two_word_lowercase_concept(&words);
92    let has_natural_language_signals = has_question_word || is_long_phrase || is_two_word_concept;
93    let has_error_code = contains_error_code(trimmed, word_count);
94
95    if has_error_code && has_natural_language_signals {
96        return shape(QueryKind::Mixed);
97    }
98
99    if has_error_code {
100        return shape(QueryKind::ErrorCode);
101    }
102
103    let has_code_identifier = CAMEL_CASE_RE.is_match(trimmed)
104        || SNAKE_CASE_RE.is_match(trimmed)
105        || PASCAL_CASE_RE.is_match(trimmed)
106        || ACRONYM_PASCAL_RE.is_match(trimmed)
107        || DOT_PATH_RE.is_match(trimmed);
108
109    if has_code_identifier && has_natural_language_signals {
110        return shape(QueryKind::Mixed);
111    }
112
113    if has_code_identifier || (word_count <= 2 && !has_natural_language_signals) {
114        return shape(QueryKind::Identifier);
115    }
116
117    shape(QueryKind::NaturalLanguage)
118}
119
120pub fn extract_tokens(query: &str, shape: &QueryShape) -> Vec<String> {
121    match shape.kind {
122        QueryKind::NaturalLanguage | QueryKind::Regex => Vec::new(),
123        QueryKind::Path => extract_path_tokens(query),
124        QueryKind::ErrorCode => extract_error_code_tokens(query),
125        QueryKind::Identifier => extract_identifier_tokens(query, false),
126        QueryKind::Mixed => extract_identifier_tokens(query, true),
127    }
128}
129
/// Lexical tokens for a short natural-language concept routed to Hybrid, for
/// example "parse imports".
///
/// `extract_tokens` yields nothing for natural-language queries because
/// their words are not code identifiers, yet a short two-word concept is
/// often a literal code phrase the trigram lane can match. The query is split
/// on whitespace and only words of three or more characters (the trigram
/// floor) are kept, in their original order.
pub fn extract_short_nl_lexical_tokens(query: &str) -> Vec<String> {
    let mut kept = Vec::new();
    for word in query.split_whitespace() {
        if word.chars().count() >= 3 {
            kept.push(word.to_owned());
        }
    }
    kept
}
142
143pub fn pre_tier_exempt(query: &str) -> Option<&'static str> {
144    if let Some(kind) = check_url_exemption(query) {
145        return Some(kind);
146    }
147    check_path_exemption(query)
148}
149
150pub fn looks_like_regex(query: &str) -> bool {
151    crate::pattern_compile::detect_unsupported_features(query).is_some()
152        || tier_a_regex_signal(query)
153        || tier_b_character_class(query)
154        || tier_c_adjacent_meta(query)
155}
156
157fn check_url_exemption(query: &str) -> Option<&'static str> {
158    let parsed = url::Url::parse(query).ok()?;
159    if !matches!(parsed.scheme(), "http" | "https" | "file" | "ftp" | "ssh") {
160        return None;
161    }
162    if has_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
163        return None;
164    }
165    Some("url")
166}
167
168fn check_path_exemption(query: &str) -> Option<&'static str> {
169    let kind = if WINDOWS_ABS_PATH_RE.is_match(query) {
170        "windows_abs"
171    } else if WINDOWS_REL_PATH_RE.is_match(query) {
172        "windows_rel"
173    } else if POSIX_ABS_PATH_RE.is_match(query) {
174        "posix_abs"
175    } else if POSIX_REL_PATH_RE.is_match(query) {
176        "posix_rel"
177    } else if UNC_PATH_RE.is_match(query) {
178        "unc"
179    } else if FILENAME_EXEMPTION_RE.is_match(query) {
180        "filename"
181    } else {
182        return None;
183    };
184    if has_path_regex_meta_sequences(query) || has_obvious_regex_chars(query) {
185        return None;
186    }
187    Some(kind)
188}
189
190fn contains_error_code(query: &str, word_count: usize) -> bool {
191    HEX_CODE_RE.is_match(query)
192        || ERROR_PREFIX_RE.is_match(query)
193        || NUMERIC_ERROR_RE.is_match(query)
194        || TYPESCRIPT_ERROR_RE.is_match(query)
195        || has_http_status(query, word_count)
196}
197
198fn has_http_status(query: &str, word_count: usize) -> bool {
199    HTTP_STATUS_RE.is_match(query)
200        && (word_count <= 3 || query.to_ascii_lowercase().contains("http"))
201}
202
203fn is_two_word_lowercase_concept(words: &[&str]) -> bool {
204    words.len() == 2
205        && words
206            .iter()
207            .all(|word| is_dictionary_style_lowercase_word(word))
208}
209
/// Reports whether `word` is at least three bytes long and consists solely
/// of ASCII lowercase letters.
fn is_dictionary_style_lowercase_word(word: &str) -> bool {
    if word.len() < 3 {
        return false;
    }
    !word.bytes().any(|byte| !byte.is_ascii_lowercase())
}
213
214fn has_regex_meta_sequences(query: &str) -> bool {
215    query.contains(".+")
216        || query.contains(".*")
217        || query.contains(".?")
218        || query.contains(r"\n")
219        || query.contains(r"\t")
220        || query.contains(r"\r")
221        || query.contains(r"\b")
222        || query.contains(r"\B")
223        || query.contains(r"\w")
224        || query.contains(r"\W")
225        || query.contains(r"\d")
226        || query.contains(r"\D")
227        || query.contains(r"\s")
228        || query.contains(r"\S")
229        || query.contains(r"\p{")
230        || query.contains(r"\x")
231        || query.contains(r"\u{")
232        || has_escaped_regex_metachar(query)
233}
234
235fn has_path_regex_meta_sequences(query: &str) -> bool {
236    query.contains(".+")
237        || query.contains(".*")
238        || query.contains(".?")
239        || query.contains(r"\p{")
240        || query.contains(r"\x")
241        || query.contains(r"\u{")
242        || has_path_context_regex_escape(query)
243        || has_escaped_regex_metachar(query)
244}
245
246fn has_path_context_regex_escape(query: &str) -> bool {
247    let chars = query.char_indices().collect::<Vec<_>>();
248    for index in 0..chars.len().saturating_sub(1) {
249        if chars[index].1 != '\\' {
250            continue;
251        }
252        let escaped = chars[index + 1].1;
253        if matches!(escaped, 'b' | 'B' | 'w' | 'W' | 'd' | 'D' | 's' | 'S')
254            && path_escape_looks_like_regex(&chars, index + 1)
255        {
256            return true;
257        }
258    }
259    false
260}
261
/// Decides whether the escape whose escaped character sits at
/// `escaped_index` reads as regex syntax.
///
/// It does when nothing follows the escaped character, or when the next
/// character is a quantifier, group/class opener, alternation, anchor,
/// backslash or forward slash.
fn path_escape_looks_like_regex(chars: &[(usize, char)], escaped_index: usize) -> bool {
    match chars.get(escaped_index + 1) {
        None => true,
        Some(&(_, following)) => matches!(
            following,
            '*' | '+' | '?' | '{' | '(' | '[' | '|' | '^' | '$' | '\\' | '/'
        ),
    }
}
272
273fn has_escaped_regex_metachar(query: &str) -> bool {
274    let mut escaped = false;
275    for ch in query.chars() {
276        if escaped {
277            if is_escaped_metachar(ch) {
278                return true;
279            }
280            escaped = false;
281            continue;
282        }
283        escaped = ch == '\\';
284    }
285    false
286}
287
/// Reports whether `query` contains any of `* [ ] ( ) | { }`.
fn has_obvious_regex_chars(query: &str) -> bool {
    query
        .chars()
        .any(|ch| matches!(ch, '*' | '[' | ']' | '(' | ')' | '|' | '{' | '}'))
}
298
299fn tier_a_regex_signal(query: &str) -> bool {
300    query.contains("(?:")
301        || NAMED_CAPTURE_RE.is_match(query)
302        || ["(?i)", "(?m)", "(?s)", "(?x)"]
303            .iter()
304            .any(|signal| query.contains(signal))
305        || [
306            r"\b", r"\B", r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\p{", r"\x", r"\u{", r"\n",
307            r"\t", r"\r",
308        ]
309        .iter()
310        .any(|signal| query.contains(signal))
311        || has_brace_quantifier(query)
312        || has_anchored_identifier(query)
313        || has_contextual_escaped_metachar(query)
314}
315
316fn has_brace_quantifier(query: &str) -> bool {
317    for matched in BRACE_QUANTIFIER_RE.find_iter(query) {
318        if matched.start() > 0
319            && query[..matched.start()]
320                .chars()
321                .last()
322                .is_some_and(|ch| !ch.is_whitespace())
323        {
324            return true;
325        }
326    }
327    false
328}
329
330fn has_anchored_identifier(query: &str) -> bool {
331    let trimmed = query.trim();
332    if let Some(rest) = trimmed.strip_prefix('^') {
333        if leading_identifier_len(rest) >= 3 {
334            return true;
335        }
336    }
337    if let Some(rest) = trimmed.strip_suffix('$') {
338        if trailing_identifier_len(rest) >= 3 {
339            return true;
340        }
341    }
342    false
343}
344
/// Counts the ASCII alphanumeric or `_` characters at the start of `text`,
/// stopping at the first character of any other kind.
fn leading_identifier_len(text: &str) -> usize {
    let mut length = 0usize;
    for ch in text.chars() {
        if !(ch.is_ascii_alphanumeric() || ch == '_') {
            break;
        }
        length += 1;
    }
    length
}
350
/// Counts the ASCII alphanumeric or `_` characters at the end of `text`,
/// scanning backwards until a character of any other kind is met.
fn trailing_identifier_len(text: &str) -> usize {
    let mut length = 0usize;
    for ch in text.chars().rev() {
        if !(ch.is_ascii_alphanumeric() || ch == '_') {
            break;
        }
        length += 1;
    }
    length
}
357
358fn has_contextual_escaped_metachar(query: &str) -> bool {
359    let chars: Vec<char> = query.chars().collect();
360    let mut index = 0usize;
361    while index + 1 < chars.len() {
362        if chars[index] == '\\' && is_escaped_metachar(chars[index + 1]) {
363            let literal_after = chars[index + 2..]
364                .iter()
365                .filter(|ch| ch.is_ascii_alphanumeric() || **ch == '_')
366                .count();
367            if literal_after >= 2 {
368                return true;
369            }
370            index += 2;
371        } else {
372            index += 1;
373        }
374    }
375    false
376}
377
/// Reports whether `ch` is a regex metacharacter that is meaningful to
/// escape with a backslash: one of `. * + ? ( ) [ ] { } | ^ $`.
fn is_escaped_metachar(ch: char) -> bool {
    const METACHARS: &str = ".*+?()[]{}|^$";
    METACHARS.contains(ch)
}
384
385fn tier_b_character_class(query: &str) -> bool {
386    for content in bracket_contents(query) {
387        if content.starts_with('^')
388            || CHAR_RANGE_RE.is_match(&content)
389            || [r"\w", r"\d", r"\s", r"\W", r"\D", r"\S"]
390                .iter()
391                .any(|signal| content.contains(signal))
392            || multi_char_non_identifier_class(&content)
393        {
394            return true;
395        }
396    }
397    false
398}
399
/// Collects the text found between each unescaped `[` and the next
/// unescaped `]`.
///
/// A backslash escapes the character that follows it, so escaped brackets
/// neither open nor close a span. A `[` met while a span is already open is
/// treated as content. Unterminated spans are dropped.
fn bracket_contents(query: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut open_at: Option<usize> = None;
    let mut skip_next = false;
    for (offset, ch) in query.char_indices() {
        if skip_next {
            skip_next = false;
        } else if ch == '\\' {
            skip_next = true;
        } else if ch == '[' {
            if open_at.is_none() {
                // Content starts right after the one-byte `[`.
                open_at = Some(offset + ch.len_utf8());
            }
        } else if ch == ']' {
            if let Some(start) = open_at.take() {
                found.push(query[start..offset].to_owned());
            }
        }
    }
    found
}
425
/// Reports whether `content` has at least two characters and none of them
/// is an ASCII alphanumeric, `_`, `"`, `'` or `;`.
fn multi_char_non_identifier_class(content: &str) -> bool {
    if content.chars().count() < 2 {
        return false;
    }
    content
        .chars()
        .all(|ch| !(ch.is_ascii_alphanumeric() || matches!(ch, '_' | '"' | '\'' | ';')))
}
433
434fn tier_c_adjacent_meta(query: &str) -> bool {
435    has_dot_quantifier(query)
436        || has_literal_atom_quantifier(query)
437        || has_regex_pipe(query)
438        || escaped_paren_count(query) >= 2
439}
440
/// Reports whether `query` contains `.*`, `.+` or `.?` together with other
/// content: the trimmed query must be longer than the two-character signal
/// itself, so a bare `.*` does not count.
fn has_dot_quantifier(query: &str) -> bool {
    let trimmed_len = query.trim().len();
    for signal in [".*", ".+", ".?"] {
        if query.contains(signal) && trimmed_len > signal.len() {
            return true;
        }
    }
    false
}
446
447fn has_literal_atom_quantifier(query: &str) -> bool {
448    let chars = query.char_indices().collect::<Vec<_>>();
449    for (index, (byte_index, ch)) in chars.iter().copied().enumerate() {
450        if !is_bare_quantifier(ch) || is_escaped_at(query, byte_index) {
451            continue;
452        }
453        if chars
454            .get(index + 1)
455            .is_some_and(|(_, next)| is_bare_quantifier(*next))
456        {
457            continue;
458        }
459        if ch == '?'
460            && (sentence_final_question_mark_in_phrase(query, byte_index)
461                || question_mark_is_code_shape(&chars, index))
462        {
463            continue;
464        }
465        if previous_is_literal_atom(&chars, index) {
466            return true;
467        }
468    }
469    false
470}
471
/// Reports whether the `?` at `byte_index` closes a multi-word phrase: only
/// whitespace follows it and more than one whitespace-separated word
/// precedes it.
fn sentence_final_question_mark_in_phrase(query: &str, byte_index: usize) -> bool {
    let tail = &query[byte_index + '?'.len_utf8()..];
    if !tail.trim().is_empty() {
        return false;
    }
    let words_before = query[..byte_index].split_whitespace().count();
    words_before > 1
}
476
477fn question_mark_is_code_shape(chars: &[(usize, char)], question_index: usize) -> bool {
478    question_mark_is_optional_chain(chars, question_index)
479        || question_mark_after_empty_call(chars, question_index)
480        || question_mark_after_index_expression(chars, question_index)
481        || question_mark_is_typescript_optional(chars, question_index)
482}
483
484fn question_mark_is_optional_chain(chars: &[(usize, char)], question_index: usize) -> bool {
485    chars
486        .get(question_index + 1)
487        .is_some_and(|(_, next)| *next == '.')
488        && question_index
489            .checked_sub(1)
490            .and_then(|previous_index| chars.get(previous_index))
491            .is_some_and(|(_, previous)| is_code_expression_tail(*previous))
492}
493
494fn question_mark_after_empty_call(chars: &[(usize, char)], question_index: usize) -> bool {
495    let Some(call_open_index) = question_index.checked_sub(2) else {
496        return false;
497    };
498    chars
499        .get(question_index - 1)
500        .is_some_and(|(_, previous)| *previous == ')')
501        && chars
502            .get(call_open_index)
503            .is_some_and(|(_, open)| *open == '(')
504        && call_open_index
505            .checked_sub(1)
506            .and_then(|callee_index| chars.get(callee_index))
507            .is_some_and(|(_, callee_tail)| is_code_expression_tail(*callee_tail))
508}
509
510fn question_mark_after_index_expression(chars: &[(usize, char)], question_index: usize) -> bool {
511    if chars
512        .get(question_index.checked_sub(1).unwrap_or(usize::MAX))
513        .is_none_or(|(_, previous)| *previous != ']')
514    {
515        return false;
516    }
517
518    let mut depth = 0usize;
519    for index in (0..question_index).rev() {
520        match chars[index].1 {
521            ']' => depth += 1,
522            '[' => {
523                depth = depth.saturating_sub(1);
524                if depth == 0 {
525                    return index
526                        .checked_sub(1)
527                        .and_then(|target_index| chars.get(target_index))
528                        .is_some_and(|(_, target_tail)| is_code_expression_tail(*target_tail));
529                }
530            }
531            _ => {}
532        }
533    }
534    false
535}
536
537fn question_mark_is_typescript_optional(chars: &[(usize, char)], question_index: usize) -> bool {
538    let previous_is_identifier = question_index
539        .checked_sub(1)
540        .and_then(|previous_index| chars.get(previous_index))
541        .is_some_and(|(_, previous)| is_identifier_tail(*previous));
542    if !previous_is_identifier {
543        return false;
544    }
545    if chars
546        .get(question_index + 1)
547        .is_none_or(|(_, next)| *next != ':')
548    {
549        return false;
550    }
551
552    chars
553        .get(question_index + 2)
554        .is_none_or(|(_, after_colon)| {
555            after_colon.is_whitespace()
556                || after_colon.is_ascii_alphabetic()
557                || matches!(*after_colon, '_' | '{' | '[' | '(' | '"' | '\'')
558        })
559}
560
561fn is_code_expression_tail(ch: char) -> bool {
562    is_identifier_tail(ch) || matches!(ch, ')' | ']')
563}
564
/// Reports whether `ch` can end an identifier: an ASCII letter or digit,
/// `_` or `$`.
fn is_identifier_tail(ch: char) -> bool {
    ch == '_' || ch == '$' || ch.is_ascii_alphanumeric()
}
568
/// Reports whether the character just before `quantifier_index` is a literal
/// atom a quantifier could apply to: an ASCII letter or digit, `_`, `)` or
/// `]`. Returns `false` when there is no previous character.
fn previous_is_literal_atom(chars: &[(usize, char)], quantifier_index: usize) -> bool {
    if quantifier_index == 0 {
        return false;
    }
    match chars.get(quantifier_index - 1) {
        Some(&(_, previous)) => {
            previous.is_ascii_alphanumeric() || matches!(previous, '_' | ')' | ']')
        }
        None => false,
    }
}
579
/// Reports whether `ch` is one of the single-character quantifiers `*`, `+`
/// or `?`.
fn is_bare_quantifier(ch: char) -> bool {
    ['*', '+', '?'].contains(&ch)
}
583
/// Reports whether the character starting at `byte_index` is escaped, i.e.
/// preceded by an odd number of consecutive backslashes.
fn is_escaped_at(query: &str, byte_index: usize) -> bool {
    let prefix = &query[..byte_index];
    // A backslash is one byte, so the byte difference equals the length of
    // the trailing backslash run.
    let backslash_run = prefix.len() - prefix.trim_end_matches('\\').len();
    backslash_run % 2 == 1
}
592
593fn has_regex_pipe(query: &str) -> bool {
594    for (index, ch) in query.char_indices() {
595        if ch != '|' {
596            continue;
597        }
598        let left = trailing_identifier_len(&query[..index]);
599        let right = leading_identifier_len(&query[index + ch.len_utf8()..]);
600        if left >= 3 && right >= 3 {
601            return true;
602        }
603    }
604    false
605}
606
/// Counts backslash-escaped parentheses (`\(` and `\)`) in `query`.
///
/// Characters are consumed in escape pairs, so the parenthesis in `\\(` is
/// not counted: the second backslash is the escaped character.
fn escaped_paren_count(query: &str) -> usize {
    let mut total = 0usize;
    let mut stream = query.chars();
    while let Some(ch) = stream.next() {
        if ch != '\\' {
            continue;
        }
        // The character after a backslash is always consumed.
        if let Some('(' | ')') = stream.next() {
            total += 1;
        }
    }
    total
}
622
623fn extract_path_tokens(query: &str) -> Vec<String> {
624    let mut tokens = Vec::new();
625    for segment in query
626        .split(['/', '\\'])
627        .filter(|segment| !segment.is_empty())
628    {
629        if segment.contains('.') {
630            if let Some(stem) = segment.rsplit_once('.').map(|(stem, _)| stem) {
631                push_unique(&mut tokens, stem);
632            }
633        }
634        push_unique(&mut tokens, segment);
635    }
636    tokens
637}
638
639fn extract_error_code_tokens(query: &str) -> Vec<String> {
640    let mut tokens = Vec::new();
641    for regex in [
642        &*HEX_CODE_RE,
643        &*ERROR_PREFIX_RE,
644        &*NUMERIC_ERROR_RE,
645        &*TYPESCRIPT_ERROR_RE,
646        &*HTTP_STATUS_RE,
647    ] {
648        for mat in regex.find_iter(query) {
649            push_unique(&mut tokens, mat.as_str());
650        }
651    }
652    if tokens.is_empty() && !query.trim().is_empty() {
653        push_unique(&mut tokens, query.trim());
654    }
655    tokens
656}
657
658fn extract_identifier_tokens(query: &str, require_code_shape: bool) -> Vec<String> {
659    let mut tokens = Vec::new();
660    for mat in IDENTIFIER_TOKEN_RE.find_iter(query) {
661        let token = mat.as_str();
662        if require_code_shape && !is_code_identifier_token(token) {
663            continue;
664        }
665        push_unique(&mut tokens, token);
666    }
667    tokens
668}
669
670fn is_code_identifier_token(token: &str) -> bool {
671    CAMEL_CASE_RE.is_match(token)
672        || SNAKE_CASE_RE.is_match(token)
673        || PASCAL_CASE_RE.is_match(token)
674        || ACRONYM_PASCAL_RE.is_match(token)
675        || DOT_PATH_RE.is_match(token)
676        || ERROR_PREFIX_RE.is_match(token)
677        || NUMERIC_ERROR_RE.is_match(token)
678        || TYPESCRIPT_ERROR_RE.is_match(token)
679}
680
/// Appends `token` to `tokens` unless it is empty or already present.
fn push_unique(tokens: &mut Vec<String>, token: &str) {
    if token.is_empty() {
        return;
    }
    let is_new = tokens.iter().all(|existing| existing != token);
    if is_new {
        tokens.push(token.to_owned());
    }
}
686
687fn shape(kind: QueryKind) -> QueryShape {
688    QueryShape {
689        kind,
690        weights: weights_for(kind),
691    }
692}
693
694fn weights_for(kind: QueryKind) -> ShapeWeights {
695    match kind {
696        QueryKind::Identifier => ShapeWeights {
697            semantic: 0.2,
698            lexical: 0.8,
699            should_use_lexical: true,
700        },
701        QueryKind::Path | QueryKind::ErrorCode => ShapeWeights {
702            semantic: 0.1,
703            lexical: 0.9,
704            should_use_lexical: true,
705        },
706        QueryKind::Regex => ShapeWeights {
707            semantic: 0.0,
708            lexical: 1.0,
709            should_use_lexical: false,
710        },
711        QueryKind::NaturalLanguage => ShapeWeights {
712            semantic: 0.6,
713            lexical: 0.4,
714            should_use_lexical: false,
715        },
716        QueryKind::Mixed => ShapeWeights {
717            semantic: 0.4,
718            lexical: 0.6,
719            should_use_lexical: true,
720        },
721    }
722}
723
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the kind `classify` assigns to `query`.
    fn kind_of(query: &str) -> QueryKind {
        classify(query).kind
    }

    /// Asserts that every query classifies as `expected`.
    fn assert_all_kind(queries: &[&str], expected: QueryKind) {
        for query in queries {
            assert_eq!(kind_of(query), expected, "{query}");
        }
    }

    /// Asserts that none of the queries classifies as `QueryKind::Regex`.
    fn assert_none_regex(queries: &[&str]) {
        for query in queries {
            assert_ne!(kind_of(query), QueryKind::Regex, "{query}");
        }
    }

    /// Literal URLs keep their exemption even with `?`, `+`, `@` or `#`.
    #[test]
    fn url_exemptions_allow_common_literal_url_punctuation() {
        let urls = [
            "https://api.io/path",
            "https://api.io/foo?q=test",
            "https://api.io/foo+bar",
            "https://api.io/foo@bar",
            "https://api.io/foo#anchor",
        ];
        for query in urls {
            assert_eq!(pre_tier_exempt(query), Some("url"), "{query}");
            assert_ne!(kind_of(query), QueryKind::Regex, "{query}");
        }
    }

    /// URLs that embed regex syntax lose the exemption and route to regex.
    #[test]
    fn url_exemptions_reject_regex_sequences() {
        assert_all_kind(
            &[
                "https://.*",
                "https://api.io/.+",
                "file://[^ ]+",
                "file:///tmp/.+",
                r"https://api.io/users/\w+",
            ],
            QueryKind::Regex,
        );
    }

    /// Paths and file names are exempt, each under the expected label.
    #[test]
    fn path_and_filename_exemptions_allow_literal_punctuation() {
        let cases = [
            (r"C:\new\test", "windows_abs"),
            (r"src\bin\main.rs", "windows_rel"),
            (r"src\tab\main.ts", "windows_rel"),
            (r"packages\opencode-plugin\src", "windows_rel"),
            ("/usr/local/bin", "posix_abs"),
            ("/Users/John Doe/Documents", "posix_abs"),
            ("/home/user/.gitignore", "posix_abs"),
            ("v1/release/notes.md", "posix_rel"),
            ("/home/user/jeff's-folder", "posix_abs"),
            ("C++/parser/main.cpp", "posix_rel"),
            ("foo+bar/baz.ts", "posix_rel"),
            ("is_valid?.ts", "filename"),
            ("Cargo.lock", "filename"),
            ("tsconfig.json", "filename"),
        ];
        for (query, expected) in cases {
            assert_eq!(pre_tier_exempt(query), Some(expected), "{query}");
            assert_eq!(kind_of(query), QueryKind::Path, "{query}");
        }
        assert_eq!(pre_tier_exempt("foo?"), None);
    }

    /// Path-shaped queries that embed regex syntax route to regex.
    #[test]
    fn path_exemptions_reject_regex_sequences() {
        assert_all_kind(
            &[
                "src/.*",
                "src/.+",
                r"C:\bin\foo*.exe",
                r"C:\Users\\w+",
                r"src\w+\main.ts",
            ],
            QueryKind::Regex,
        );
    }

    /// Strong (tier A) and adjacency-based (tier C) signals route to regex.
    #[test]
    fn tier_a_and_c_regex_signals_route_to_regex() {
        assert_all_kind(
            &[
                "^export",
                "foo$",
                "^main$",
                r"foo\.bar",
                r"\(method\)",
                r"\bTODO\b",
                ".*foo",
                "foo|bar",
                "(?:foo)",
                "(?P<n>foo)",
                "(?i)Todo",
                r"\p{Lu}",
                r"\xFF",
                r"\u{1F600}",
                "a{3}",
                // Bare escape sequences route to regex via Tier A. Caveat:
                // `foo\n` and similar single-backslash escapes after literal
                // text are genuinely ambiguous with Windows path segments
                // (e.g. a file `n` in directory `foo`) and keep going through
                // the path exemption instead.
                r"\n",
                r"\t",
                r"\r",
                r"\tindent",
            ],
            QueryKind::Regex,
        );
    }

    /// Brackets route to regex only when their content looks like a class.
    #[test]
    fn character_classes_route_only_when_they_look_like_classes() {
        assert_all_kind(&["[a-z]+", "[^abc]", r"[\w]+"], QueryKind::Regex);
        assert_none_regex(&[
            "arr[0]",
            "obj[key]",
            "config[\"key\"]",
            "#[derive]",
            "Vec<[u8; 32]>",
        ]);
    }

    /// Unsupported regex syntax still routes to regex.
    #[test]
    fn unsupported_regex_syntax_still_routes_to_regex_for_compile_error() {
        assert_all_kind(
            &[
                "(?=foo)",
                "(?!foo)",
                "(?<=foo)",
                "(?<!foo)",
                "(?P=name)",
                r"\1",
                "foo*+",
                "(?>foo)",
            ],
            QueryKind::Regex,
        );
    }

    /// Two plain lowercase words read as a natural-language concept.
    #[test]
    fn two_word_lowercase_concepts_route_to_natural_language() {
        assert_all_kind(
            &["retry logic", "auth flow", "cache invalidation"],
            QueryKind::NaturalLanguage,
        );
    }

    /// Short queries containing a code-shaped word stay identifiers.
    #[test]
    fn identifierish_short_queries_stay_identifier() {
        assert_all_kind(
            &["useState hook", "parseConfig", "parse_config option"],
            QueryKind::Identifier,
        );
    }

    /// `?` used as code syntax is not mistaken for a quantifier.
    #[test]
    fn question_mark_code_shapes_do_not_route_to_regex() {
        assert_none_regex(&["foo()?", "optional?.length", "user?.name", "arr[0]?"]);
    }

    /// `?` applied to a literal atom is still a regex quantifier.
    #[test]
    fn question_mark_regex_quantifiers_still_route_to_regex() {
        assert_all_kind(&["colou?r", "https?"], QueryKind::Regex);
    }

    /// Punctuation that merely resembles regex syntax does not route to regex.
    #[test]
    fn weak_regex_like_punctuation_does_not_route_to_regex() {
        assert_none_regex(&[
            "^id",
            "id$",
            "^",
            "$",
            "$HOME",
            r"\.",
            "array.length",
            "foo()",
            "map.get(key)",
            "a|b",
        ]);
    }
}
904}