cqs 1.26.0 - Docs.rs

//! Query classifier and adaptive search strategy router.
//!
//! Classifies incoming queries by intent (identifier lookup, structural search,
//! behavioral search, etc.) and routes to the best retrieval strategy.
//! Pure logic — no I/O, no store access, infallible.

use crate::language::{ChunkType, REGISTRY};
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use std::sync::LazyLock;

/// Intent bucket assigned to a query by the router.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryCategory {
    /// The query names one specific function or type
    /// ("search_filtered", "HashMap::new").
    IdentifierLookup,
    /// The query describes code shape
    /// ("functions that return Result", "structs with Display").
    Structural,
    /// The query describes what code does
    /// ("validates user input", "retries with backoff").
    Behavioral,
    /// The query names an abstract idea
    /// ("dependency injection", "observer pattern").
    Conceptual,
    /// The query combines several signals
    /// ("find where errors are logged and retried").
    MultiStep,
    /// The query contains a negation
    /// ("sort without allocating", "parse but not validate").
    Negation,
    /// The query is restricted to a chunk type
    /// ("all test functions", "every enum").
    TypeFiltered,
    /// The query spans several languages
    /// ("Python equivalent of map in Rust").
    CrossLanguage,
    /// Nothing matched — the default strategy applies.
    Unknown,
}

impl std::fmt::Display for QueryCategory {
    /// Writes the stable snake_case label of the category.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::IdentifierLookup => "identifier_lookup",
            Self::Structural => "structural",
            Self::Behavioral => "behavioral",
            Self::Conceptual => "conceptual",
            Self::MultiStep => "multi_step",
            Self::Negation => "negation",
            Self::TypeFiltered => "type_filtered",
            Self::CrossLanguage => "cross_language",
            Self::Unknown => "unknown",
        };
        f.write_str(label)
    }
}

/// How strongly the classifier believes in its category choice.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Confidence {
    /// Clear signal: one strategy is the right answer.
    High,
    /// Signals disagree: an ensemble may do better.
    Medium,
    /// Nothing decisive was found: fall back to the default.
    Low,
}

impl std::fmt::Display for Confidence {
    /// Writes the lowercase label of the confidence level.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::High => "high",
            Self::Medium => "medium",
            Self::Low => "low",
        };
        f.write_str(label)
    }
}

/// Retrieval path the router recommends for a query.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchStrategy {
    /// FTS5 name search; the embedding step is skipped entirely (~1ms).
    NameOnly,
    /// The standard dense embedding search (current default path,
    /// enriched HNSW).
    DenseDefault,
    /// Dense search that boosts chunks of matching types (enriched HNSW).
    DenseWithTypeHints,
    /// Phase 5: dense search over the base (non-enriched) HNSW. LLM
    /// summaries tend to hurt conceptual/behavioral/negation signal because
    /// the canonical vocabulary they inject drowns out query semantics.
    /// Falls back to [`Self::DenseDefault`] when the base index is missing.
    DenseBase,
}

impl std::fmt::Display for SearchStrategy {
    /// Writes the stable snake_case label of the strategy.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::NameOnly => "name_only",
            Self::DenseDefault => "dense",
            Self::DenseWithTypeHints => "dense_type_hints",
            Self::DenseBase => "dense_base",
        };
        f.write_str(label)
    }
}

/// Classification result from the query router.
///
/// Produced by `classify_query`; bundles the detected category, how
/// confident the classifier is, and the retrieval strategy to use.
#[derive(Debug, Clone)]
pub struct Classification {
    /// Detected intent of the query.
    pub category: QueryCategory,
    /// Strength of the signal behind `category`.
    pub confidence: Confidence,
    /// Retrieval path recommended for this query.
    pub strategy: SearchStrategy,
    /// Extracted type hints for DenseWithTypeHints strategy.
    /// In `classify_query` this is `Some` only on the `TypeFiltered` path.
    pub type_hints: Option<Vec<ChunkType>>,
}

// ── Common word lists ────────────────────────────────────────────────

/// Words that indicate natural language (not an identifier).
///
/// All entries are lowercase and are matched through [`NL_INDICATORS_AC`];
/// `classify_query` passes the lowercased query to the helpers that use it.
const NL_INDICATORS: &[&str] = &[
    "the",
    "a",
    "an",
    "that",
    "which",
    "how",
    "what",
    "where",
    "when",
    "find",
    "get",
    "all",
    "every",
    "each",
    "with",
    "without",
    "for",
    "from",
    "into",
    "this",
    "does",
    "code",
    "function",
    "method",
    "implement",
    "using",
];

/// Aho-Corasick automaton over [`NL_INDICATORS`] for whole-query scans.
/// Match ids are not used — only the presence of a whole-word match matters.
/// Built once on first use; the `expect` can only fire if the static
/// pattern list itself is rejected by the builder.
static NL_INDICATORS_AC: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasick::new(NL_INDICATORS).expect("NL_INDICATORS is a valid pattern set (static)")
});

/// Behavioral verbs suggesting a behavioral search.
///
/// Only the third-person singular form is listed ("validates", not
/// "validate"), and matching is exact per word, so other inflections do
/// not trigger the behavioral route.
const BEHAVIORAL_VERBS: &[&str] = &[
    "validates",
    "processes",
    "handles",
    "manages",
    "computes",
    "parses",
    "converts",
    "transforms",
    "filters",
    "sorts",
    "checks",
    "verifies",
    "sends",
    "receives",
    "reads",
    "writes",
    "creates",
    "deletes",
    "updates",
    "serializes",
    "deserializes",
    "encodes",
    "decodes",
    "authenticates",
    "authorizes",
    "logs",
    "retries",
    "caches",
    "renders",
];

/// Aho-Corasick automaton over [`BEHAVIORAL_VERBS`].
/// Only whole-word matches trigger behavioral classification.
/// Built once on first use from the static list above.
static BEHAVIORAL_VERBS_AC: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasick::new(BEHAVIORAL_VERBS).expect("BEHAVIORAL_VERBS is a valid pattern set (static)")
});

/// Abstract nouns suggesting conceptual search.
///
/// Singular forms only; a plural such as "patterns" is a different word
/// and does not match.
const CONCEPTUAL_NOUNS: &[&str] = &[
    "pattern",
    "architecture",
    "design",
    "approach",
    "strategy",
    "algorithm",
    "principle",
    "abstraction",
    "convention",
    "idiom",
    "paradigm",
    "concept",
    "technique",
    "methodology",
];

/// Aho-Corasick automaton over [`CONCEPTUAL_NOUNS`].
/// Only whole-word matches trigger conceptual classification.
/// Built once on first use from the static list above.
static CONCEPTUAL_NOUNS_AC: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasick::new(CONCEPTUAL_NOUNS).expect("CONCEPTUAL_NOUNS is a valid pattern set (static)")
});

/// Negation tokens matched against word-split query tokens (not substrings).
///
/// v1.22.0 audit AC-2: the previous pattern used trailing-space substring
/// matching (`query.contains("not ")`) which false-fired on words like
/// `cannot`, `piano`, `nano`, `volcano`, `casino`. Switched to exact
/// word-token matching against the `words` vec already computed upstream.
///
/// Because the comparison is exact per token, a token carrying attached
/// punctuation (e.g. `not,`) does not match. The contractions use the
/// ASCII apostrophe (`'`).
const NEGATION_TOKENS: &[&str] = &[
    "not",
    "without",
    "except",
    "never",
    "avoid",
    "no",
    "don't",
    "doesn't",
    "shouldn't",
    "exclude",
];

/// Structural keywords from programming languages.
///
/// `is_structural_query` looks for these as space-delimited words in the
/// query (keyword followed by a space, and either preceded by a space or
/// at the very start). Plurals ("structs") are not listed and so do not
/// match through this table.
const STRUCTURAL_KEYWORDS: &[&str] = &[
    "struct",
    "enum",
    "trait",
    "impl",
    "interface",
    "class",
    "module",
    "namespace",
    "protocol",
    "type",
];

/// Common aliases that users type but don't match registry names.
/// Registry names cover the canonical forms ("cpp", "csharp", etc.);
/// these add the human-written variants.
/// Appended to the registry names when `LANGUAGE_NAMES` is built, skipping
/// any alias the registry already contains.
const LANGUAGE_ALIASES: &[&str] = &["c++", "c#"];

/// Language names recognised by cross-language detection.
///
/// The list is every `name` yielded by `REGISTRY.all()`, followed by each
/// entry of [`LANGUAGE_ALIASES`] that the registry did not already supply.
///
/// It is built lazily on first access and then reused: the registry is
/// immutable and the alias list is a compile-time constant, so later
/// callers just borrow the same `Vec`. An earlier version allocated a new
/// `Vec<&'static str>` on every `classify_query` call.
static LANGUAGE_NAMES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    let mut names = REGISTRY
        .all()
        .map(|def| def.name)
        .collect::<Vec<&'static str>>();
    for &alias in LANGUAGE_ALIASES {
        // Keep the list duplicate-free if the registry ever adopts an alias.
        let already_listed = names.iter().any(|known| *known == alias);
        if !already_listed {
            names.push(alias);
        }
    }
    names
});

/// Borrow the lazily-built [`LANGUAGE_NAMES`] list as a `'static` slice.
fn language_names() -> &'static [&'static str] {
    &LANGUAGE_NAMES[..]
}

/// Structural query patterns.
///
/// Phrases that describe code shape ("functions that …", "that return …").
/// They are matched as raw substrings via [`STRUCTURAL_PATTERNS_AC`], so a
/// pattern also fires inside a longer word (e.g. "implementing" inside
/// "reimplementing").
const STRUCTURAL_PATTERNS: &[&str] = &[
    "functions that",
    "methods that",
    "types that",
    "structs that",
    "that return",
    "that take",
    "that accept",
    "with signature",
    "implementing",
    "extending",
    "deriving",
];

/// Aho-Corasick automaton over [`STRUCTURAL_PATTERNS`]. These are matched as
/// raw substrings in the query (same as the previous `query.contains(pat)`),
/// so any match — word-bounded or not — triggers structural classification.
static STRUCTURAL_PATTERNS_AC: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasick::new(STRUCTURAL_PATTERNS)
        .expect("STRUCTURAL_PATTERNS is a valid pattern set (static)")
});

/// Multi-step conjunction patterns.
///
/// AC-V1.25-10: bare " and " / " or " were removed because they fired on
/// any conjunction in a query ("find foo and bar"), sweeping near-every
/// multi-word NL query into `QueryCategory::MultiStep`. The remaining
/// patterns require explicit sequencing / enumeration phrasing
/// ("first do X then do Y") so the category actually captures multi-step
/// intent, not any coordinated phrase.
///
/// NOTE(review): most entries carry only a trailing space, so they are
/// bounded on the right but not on the left — "after " also matches inside
/// "thereafter ", and a pattern word at the very end of the query (no
/// trailing space) does not match. Confirm this is intended.
const MULTISTEP_PATTERNS: &[&str] = &[
    "and then",
    "before ",
    "after ",
    " or also ",
    "first ",
    "then ",
    "both ",
    "between ",
];

/// Aho-Corasick automaton over [`MULTISTEP_PATTERNS`]. Raw substring match —
/// the pattern strings already carry their own trailing / leading space
/// where word-boundary semantics are needed.
static MULTISTEP_PATTERNS_AC: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasick::new(MULTISTEP_PATTERNS)
        .expect("MULTISTEP_PATTERNS is a valid pattern set (static)")
});

// ── Classification ───────────────────────────────────────────────────

/// Resolve the SPLADE fusion alpha for a query category.
///
/// Precedence: per-category env (`CQS_SPLADE_ALPHA_{CATEGORY}`) > global env
/// (`CQS_SPLADE_ALPHA`) > hardcoded per-category default.
///
/// `{CATEGORY}` is the upper-cased [`QueryCategory`] display label, e.g.
/// `CQS_SPLADE_ALPHA_CONCEPTUAL` or `CQS_SPLADE_ALPHA_MULTI_STEP`.
///
/// An env value that does not parse as `f32`, or parses to a non-finite
/// number, is reported with `tracing::warn!` and skipped; resolution then
/// continues with the next source. Finite values are clamped to [0.0, 1.0].
///
/// Returns a value in [0.0, 1.0] where 1.0 means pure dense and < 1.0 activates
/// SPLADE with that fusion weight.
///
/// OB-NEW-1: emits a single structured `tracing::info!` recording the
/// resolved alpha, its source (`per_cat_env` / `global_env` / `default`),
/// and the category. Callers no longer need to log the decision themselves;
/// rooting the log inside this function makes the env-precedence visible and
/// eliminates the drift that existed between the CLI and batch-handler logs.
pub fn resolve_splade_alpha(category: &QueryCategory) -> f32 {
    let _span = tracing::debug_span!("resolve_splade_alpha", category = %category).entered();

    // Env overrides, highest precedence first. The per-category key is built
    // from the Display label: CQS_SPLADE_ALPHA_CONCEPTUAL etc.
    let cat_key = format!("CQS_SPLADE_ALPHA_{}", category.to_string().to_uppercase());
    let env_sources = [
        (cat_key.as_str(), "per_cat_env"),
        ("CQS_SPLADE_ALPHA", "global_env"),
    ];
    for (var, source) in env_sources {
        // Unset (or non-UTF-8) variable: try the next source.
        let Ok(val) = std::env::var(var) else {
            continue;
        };
        match val.parse::<f32>() {
            Ok(alpha) if alpha.is_finite() => {
                let alpha = alpha.clamp(0.0, 1.0);
                tracing::info!(
                    category = %category,
                    alpha,
                    source,
                    "SPLADE routing"
                );
                return alpha;
            }
            // "NaN" / "inf" parse successfully but cannot be used as a weight.
            Ok(_) => {
                tracing::warn!(var = %var, value = %val, "Non-finite alpha, using default");
            }
            Err(_) => {
                tracing::warn!(var = %var, value = %val, "Invalid alpha, using default");
            }
        }
    }

    // Per-category defaults from the 21-point alpha sweep on the genuinely
    // clean index (2026-04-15). 265 queries × 8 categories, 14,882 chunks
    // post-GC + worktree-duplicate purge.
    //
    // History: the 2026-04-14 "clean" sweep was actually run on a 96k-chunk
    // index polluted by auto-indexed `.claude/worktrees/` copies (daemon
    // watch ignored .gitignore, fixed in #1003). The dirty-tuned alphas
    // drove SPLADE-enabled R@1 to 26.8% (vs 35.8% dense-only) until
    // re-measured. The values here reflect the real clean-index optima —
    // overall R@1 41% projected (vs 37.7% for global α=0.90).
    //
    // Run artifacts: /home/user001/.cache/cqs/evals/run_20260415_1[4-5]*/
    //
    // The match is exhaustive on purpose: adding a `QueryCategory` variant
    // must force an explicit alpha decision here instead of silently
    // inheriting 1.0 (which is how Negation was missed before).
    let alpha = match category {
        // Peak at α=1.00 (94%). Previous default 0.90 gave 90% — the extra
        // 10% SPLADE cost 4pp. Identifier queries are dominated by exact
        // name matches which the dense embedder resolves unambiguously.
        QueryCategory::IdentifierLookup => 1.00,
        // Peak at α=0.90 (44.4%). Previous default 0.60 was from the
        // dirty-index sweep and gave only 29.6% — 14.8pp miscalibration.
        // Structural queries (e.g. "recursive function", "mutex usage")
        // match on code idioms the dense embedder learned well.
        QueryCategory::Structural => 0.90,
        // Peak at α=0.70 (33.3%). Previous default 0.85 gave 30.6%.
        // Conceptual queries benefit from some SPLADE lexical grounding
        // (noun-token matches) without over-weighting it.
        QueryCategory::Conceptual => 0.70,
        // Peak at α=0.00 (25.0%). Previous default 0.05 gave 22.7% —
        // the gain is within noise (N=44) but directionally consistent:
        // behavioral queries match action verbs that SPLADE captures
        // lexically and the dense embedder does not.
        QueryCategory::Behavioral => 0.00,
        // Peak at α=0.80 (20.7%), up from 13.8% at α=1.0 — a real 6.9pp
        // gain. Previously missed because the arm fell through to the
        // `_ => 1.0` default. Negation queries need lexical SPLADE to
        // suppress candidates that match the negated term.
        QueryCategory::Negation => 0.80,
        // multi_step, cross_language, type_filtered, unknown: flat curves
        // within noise (N=21-34); pick α=1.0. SPLADE still contributes to
        // the candidate pool (always-on), α just weights the scoring.
        QueryCategory::MultiStep
        | QueryCategory::CrossLanguage
        | QueryCategory::TypeFiltered
        | QueryCategory::Unknown => 1.0,
    };

    tracing::info!(
        category = %category,
        alpha,
        source = "default",
        "SPLADE routing"
    );
    alpha
}

/// Classify a query into a category with confidence level and recommended strategy.
///
/// Pure function — no I/O, cannot fail, completes in <1ms.
/// Priority order: Negation > Identifier > CrossLanguage > TypeFiltered >
/// Structural > Behavioral > Conceptual > MultiStep > Unknown.
///
/// All checks run against a lowercased copy of `query` and its
/// whitespace-split tokens. An empty or whitespace-only query yields
/// `Unknown` / `Low` / `DenseDefault`. The first matching rule wins, so the
/// order of the numbered steps below is part of the behavior.
pub fn classify_query(query: &str) -> Classification {
    let query_lower = query.to_lowercase();
    let words: Vec<&str> = query_lower.split_whitespace().collect();

    if words.is_empty() {
        return Classification {
            category: QueryCategory::Unknown,
            confidence: Confidence::Low,
            strategy: SearchStrategy::DenseDefault,
            type_hints: None,
        };
    }

    // 1. Negation trumps everything — "sort without allocating".
    //    Phase 5: enriched summaries inject positive vocabulary ("allocates",
    //    "uses heap") that fights the negation, so route to the base index.
    if words.iter().any(|w| NEGATION_TOKENS.contains(w)) {
        return Classification {
            category: QueryCategory::Negation,
            confidence: Confidence::High,
            strategy: SearchStrategy::DenseBase,
            type_hints: None,
        };
    }

    // 2. Identifier lookup — all tokens look like identifiers
    if is_identifier_query(&query_lower, &words) {
        return Classification {
            category: QueryCategory::IdentifierLookup,
            confidence: Confidence::High,
            strategy: SearchStrategy::NameOnly,
            type_hints: None,
        };
    }

    // 3. Cross-language — mentions 2+ language names or "equivalent"/"translate".
    //    These benefit from the enriched index (summaries add canonical
    //    vocabulary that bridges language-specific syntax).
    if is_cross_language_query(&query_lower, &words) {
        return Classification {
            category: QueryCategory::CrossLanguage,
            confidence: Confidence::High,
            strategy: SearchStrategy::DenseDefault,
            type_hints: None,
        };
    }

    // 4. Type-filtered — "all structs", "every enum", "test functions"
    //    2026-04-13: route to base. Enrichment ablation at 78% summary coverage
    //    showed +8.4pp R@1 on base vs enriched (41.7% vs 33.3%, N=24).
    //    Summaries add generic vocabulary that dilutes the specific type signal.
    let type_hints = extract_type_hints(&query_lower);
    if type_hints.is_some() {
        return Classification {
            category: QueryCategory::TypeFiltered,
            confidence: Confidence::Medium,
            strategy: SearchStrategy::DenseBase,
            type_hints,
        };
    }

    // 5. Structural — type keywords + "functions that" patterns
    if is_structural_query(&query_lower) {
        return Classification {
            category: QueryCategory::Structural,
            confidence: Confidence::Medium,
            strategy: SearchStrategy::DenseWithTypeHints,
            type_hints: None,
        };
    }

    // 6. Behavioral — action verbs, "code that does X".
    //    Phase 5: behavioral queries use verbs the query author chose; enriched
    //    summaries standardize those verbs ("handles" → "processes"), which
    //    washes out the specific verb the user asked about. Route to base.
    //
    //    2026-04-10 update: same-corpus A/B at 50% summary coverage shows
    //    behavioral routing produces 0pp delta — the routing fires but the
    //    affected queries' gold answers are mostly callable types where
    //    base ≈ enriched after enrichment_hash dedupe. Keeping the route on
    //    base because the historical research data still says behavioral
    //    is hurt by summaries; we just can't measure the effect on this
    //    corpus shape. See research/enrichment.md for the data.
    if is_behavioral_query(&query_lower, &words) {
        return Classification {
            category: QueryCategory::Behavioral,
            confidence: Confidence::Medium,
            strategy: SearchStrategy::DenseBase,
            type_hints: None,
        };
    }

    // 7. Conceptual — abstract nouns, short non-identifier queries.
    //
    //    2026-04-10 update: ROUTING REVERSED. Phase 5 originally routed
    //    conceptual to DenseBase based on the historical research finding
    //    "summaries hurt conceptual −15pp". That finding was measured on a
    //    corpus where only callable types were summarized.
    //
    //    After the eligibility expansion in PR #878 (summaries now cover
    //    structs / enums / impls / traits / classes / etc.), conceptual
    //    queries' gold answers are mostly type definitions where the
    //    summary actively helps bridge code → concept ("a service container
    //    that resolves dependencies" → "dependency injection"). Routing
    //    those queries to the base index strips the helpful signal.
    //
    //    Same-corpus A/B at 50% coverage measured −3.7pp R@1 on conceptual
    //    when routing was on. Keeping conceptual on the enriched index
    //    until / unless the summary coverage shape changes again.
    //
    //    The lesson: routing rules are coupled to corpus shape, not to
    //    a category-intrinsic property. They need to be re-validated any
    //    time summary coverage changes meaningfully.
    if is_conceptual_query(&query_lower, &words) {
        return Classification {
            category: QueryCategory::Conceptual,
            confidence: Confidence::Medium,
            strategy: SearchStrategy::DenseDefault,
            type_hints: None,
        };
    }

    // 8. Multi-step — conjunctions
    //    2026-04-13: route to base. Enrichment ablation at 78% summary coverage
    //    showed +2.9pp R@1 on base vs enriched (23.5% vs 20.6%, N=34).
    //    Summaries inject vocabulary that displaces the conjunction terms.
    if MULTISTEP_PATTERNS_AC.is_match(&query_lower) {
        return Classification {
            category: QueryCategory::MultiStep,
            confidence: Confidence::Low,
            strategy: SearchStrategy::DenseBase,
            type_hints: None,
        };
    }

    // 9. Unknown — default
    Classification {
        category: QueryCategory::Unknown,
        confidence: Confidence::Low,
        strategy: SearchStrategy::DenseDefault,
        type_hints: None,
    }
}

// ── Helpers ──────────────────────────────────────────────────────────

/// Decide whether the query is an identifier lookup rather than prose.
///
/// * One token: it must contain a letter, must not be a natural-language
///   indicator word, and may only use alphanumerics plus `_`, `:`, `.`
///   and `/`.
/// * Up to three tokens: no indicator word may appear, every token may only
///   use alphanumerics plus `_`, `:` and `.` (no `/` here), and at least
///   one token must carry a strong identifier signal (`_`, `::`, `.`, or
///   mixed upper/lower case).
/// * More than three tokens: never an identifier query.
///
/// `query` is the full query text scanned for indicator words; `words` are
/// its whitespace-split tokens.
///
/// NOTE(review): `classify_query` passes lowercased text, so the mixed-case
/// signal cannot fire on that path — confirm whether that is intended.
fn is_identifier_query(query: &str, words: &[&str]) -> bool {
    // Characters accepted in every identifier token.
    let core_char = |c: char| c.is_alphanumeric() || matches!(c, '_' | ':' | '.');

    match words.len() {
        1 => {
            let token = words[0];
            // The lone token is the whole query, so a whole-word indicator
            // match means the token itself is an indicator word.
            token.chars().any(char::is_alphabetic)
                && !ac_has_word_bounded_match(&NL_INDICATORS_AC, query)
                && token.chars().all(|c| core_char(c) || c == '/')
        }
        len if len <= 3 => {
            if ac_has_word_bounded_match(&NL_INDICATORS_AC, query) {
                return false;
            }
            // Plain words alone are not enough; something must look like code.
            let strong_signal = words.iter().any(|w| {
                let mixed_case =
                    w.chars().any(char::is_uppercase) && w.chars().any(char::is_lowercase);
                mixed_case || w.contains('_') || w.contains("::") || w.contains('.')
            });
            let only_identifier_chars = words.iter().all(|w| w.chars().all(core_char));
            strong_signal && only_identifier_chars
        }
        _ => false,
    }
}

/// Check if query mentions multiple programming languages or translation.
///
/// Returns `true` when either
/// * at least two distinct entries of [`language_names`] appear as whole
///   tokens in `words`, or
/// * at least one language name appears together with a translation cue.
///
/// Translation cues:
/// * `"equivalent"` / `"translate"` — substring probes on `query`, so
///   inflections such as "equivalents" or "translated" also count.
/// * `"convert"` / `"port"` / `"ports"` — exact word tokens.
///
/// AC-V1.25-9: "port" moved to word-token matching because the old
/// `"port "` substring probe false-fired on "report", "reports", "airport".
/// "convert" is matched the same way: the former `"convert "` substring
/// probe missed the verb at the end of the query (no trailing space) and
/// fired inside longer words that end in "convert".
///
/// `query` and `words` are expected to be the lowercased query and its
/// whitespace-split tokens.
fn is_cross_language_query(query: &str, words: &[&str]) -> bool {
    let lang_count = language_names()
        .iter()
        .filter(|lang| words.contains(*lang))
        .count();
    if lang_count >= 2 {
        return true;
    }
    if lang_count == 0 {
        // A translation verb alone says nothing about languages.
        return false;
    }
    // Exactly one language named: require an explicit translation cue.
    query.contains("equivalent")
        || query.contains("translate")
        || words
            .iter()
            .any(|w| matches!(*w, "convert" | "port" | "ports"))
}

/// Check if query is structural (about code structure, not behavior).
///
/// A query is structural when it contains any [`STRUCTURAL_PATTERNS`]
/// phrase as a substring, or when a [`STRUCTURAL_KEYWORDS`] entry appears
/// as a space-delimited word: followed by a space, and either preceded by
/// a space or located at the very start of the query.
///
/// The keyword scan inspects the bytes around each occurrence instead of
/// building `" kw "` / `"kw "` strings with `format!`, so it performs no
/// allocation; the accepted inputs are exactly the same.
///
/// NOTE(review): a keyword at the very end of the query ("implements
/// trait") or in plural form ("structs") does not match — confirm that
/// this is intended.
fn is_structural_query(query: &str) -> bool {
    // Structural patterns like "functions that return"
    if STRUCTURAL_PATTERNS_AC.is_match(query) {
        return true;
    }
    // Contains structural keywords as NL words (not identifiers)
    // e.g., "find struct definitions" but not "mystruct"
    let bytes = query.as_bytes();
    STRUCTURAL_KEYWORDS.iter().any(|kw| {
        query.match_indices(*kw).any(|(start, _)| {
            let preceded = start == 0 || bytes[start - 1] == b' ';
            let followed = bytes.get(start + kw.len()) == Some(&b' ');
            preceded && followed
        })
    })
}

/// Decide whether the query describes what code *does*.
///
/// True when a [`BEHAVIORAL_VERBS`] entry appears as a whole word, or when
/// the phrase "code that" / "function that" appears on word boundaries.
///
/// AC-V1.25-15: the two phrases go through [`contains_phrase`] rather than
/// a raw substring probe so hyphenated identifiers such as
/// `code-that-was-deleted-yesterday` do not false-fire.
///
/// `_words` is accepted for signature parity with the other helpers and is
/// not read.
fn is_behavioral_query(query: &str, _words: &[&str]) -> bool {
    // "how does" / "what does" were dropped on 2026-04-14: they captured
    // 100% of the multi_step eval queries ("how does X trace callers to
    // find tests") and routed them to the Behavioral alpha (0.05 at the
    // time) instead of α=1.0 (MultiStep / Unknown), a net loss of ~3
    // queries / 265. The two phrases below stay because genuine behavioral
    // queries use them ("function that embeds a batch of text documents").
    const BEHAVIORAL_PHRASES: [&str; 2] = ["code that", "function that"];

    ac_has_word_bounded_match(&BEHAVIORAL_VERBS_AC, query)
        || BEHAVIORAL_PHRASES
            .iter()
            .any(|phrase| contains_phrase(query, phrase))
}

/// Check whether `phrase` appears in `query` surrounded by whitespace or
/// string boundaries — a word-boundary check without regex overhead.
///
/// Used by [`is_behavioral_query`] so hyphenated or compounded identifiers
/// that happen to contain the phrase as a substring don't false-fire.
///
/// "Whitespace" is Unicode whitespace ([`char::is_whitespace`]), the same
/// definition `str::split_whitespace` uses to tokenize the query, so a
/// phrase delimited by e.g. a no-break space (U+00A0) is recognised too.
/// An earlier version only accepted ASCII whitespace as a boundary.
///
/// Returns `false` for an empty `phrase` and when `phrase` is longer than
/// `query`. Every occurrence is examined, including overlapping ones.
fn contains_phrase(query: &str, phrase: &str) -> bool {
    let needle = phrase.as_bytes();
    if needle.is_empty() {
        return false;
    }
    query
        .as_bytes()
        .windows(needle.len())
        .enumerate()
        .any(|(start, window)| {
            if window != needle {
                return false;
            }
            // A byte-wise match of valid UTF-8 inside valid UTF-8 begins and
            // ends on char boundaries; `get` keeps this panic-free anyway.
            let end = start + needle.len();
            let (Some(before), Some(after)) = (query.get(..start), query.get(end..)) else {
                return false;
            };
            let left_ok = before.chars().next_back().map_or(true, char::is_whitespace);
            let right_ok = after.chars().next().map_or(true, char::is_whitespace);
            left_ok && right_ok
        })
}

/// Check whether any pattern in `ac` has at least one whole-word match in
/// `query`. A match is whole-word iff both sides of the match are either
/// a string boundary or whitespace.
///
/// Used in place of the previous `words.iter().any(|w| SET.contains(w))`
/// check. `words` comes from `str::split_whitespace`, which splits on
/// Unicode whitespace, so the boundary test uses [`char::is_whitespace`]:
/// a match with whitespace (or the string edge) on both sides is exactly a
/// token that equals one of the patterns. An earlier version tested ASCII
/// whitespace only and therefore disagreed with the tokenizer on input
/// containing e.g. a no-break space (U+00A0). No regex, no allocation,
/// single pass over `query`.
///
/// Uses [`AhoCorasick::find_overlapping_iter`] so shared-prefix patterns
/// (e.g. `"a"` / `"all"` / `"an"` in [`NL_INDICATORS`]) all get a chance
/// to fire: a leftmost-first `find_iter` would return the first pattern
/// that matches at position 0, and if that pattern is not word-bounded
/// the helper would wrongly report "no match" even when a longer sibling
/// pattern *is* word-bounded at the same start. Requires
/// [`MatchKind::Standard`], which is the default for [`AhoCorasick::new`].
fn ac_has_word_bounded_match(ac: &AhoCorasick, query: &str) -> bool {
    ac.find_overlapping_iter(query).any(|m| {
        // Matches of UTF-8 patterns in a UTF-8 haystack sit on char
        // boundaries; `get` keeps the slicing panic-free regardless.
        let (Some(before), Some(after)) = (query.get(..m.start()), query.get(m.end()..)) else {
            return false;
        };
        let left_ok = before.chars().next_back().map_or(true, char::is_whitespace);
        let right_ok = after.chars().next().map_or(true, char::is_whitespace);
        left_ok && right_ok
    })
}

/// Decide whether the query is about an abstract concept.
///
/// True when a [`CONCEPTUAL_NOUNS`] entry appears as a whole word. As a
/// fallback, a query of at most three tokens also counts when it contains
/// a natural-language indicator word and is not structural.
fn is_conceptual_query(query: &str, words: &[&str]) -> bool {
    let names_concept = ac_has_word_bounded_match(&CONCEPTUAL_NOUNS_AC, query);
    if names_concept {
        return true;
    }
    if words.len() > 3 {
        // The fallback only applies to short queries.
        return false;
    }
    ac_has_word_bounded_match(&NL_INDICATORS_AC, query) && !is_structural_query(query)
}

/// Patterns for [`extract_type_hints`] — the pattern string and the
/// [`ChunkType`] it maps to. Order matters: output hints preserve this
/// declaration order so tests that assert on hint ordering keep passing.
///
/// Patterns are matched as raw substrings of the query. Single-word
/// entries ("constructor", "endpoint", "middleware", "macro_rules")
/// therefore fire wherever the word occurs, and a query matching both a
/// short pattern and its longer sibling ("constructor" / "all
/// constructors") yields the mapped `ChunkType` once per matching pattern.
const TYPE_HINT_PATTERNS: &[(&str, ChunkType)] = &[
    // Test
    ("test function", ChunkType::Test),
    ("test method", ChunkType::Test),
    ("all tests", ChunkType::Test),
    ("every test", ChunkType::Test),
    // Function / Method
    ("all functions", ChunkType::Function),
    ("every function", ChunkType::Function),
    ("all methods", ChunkType::Method),
    ("every method", ChunkType::Method),
    // Type definitions
    ("all structs", ChunkType::Struct),
    ("every struct", ChunkType::Struct),
    ("all enums", ChunkType::Enum),
    ("every enum", ChunkType::Enum),
    ("all traits", ChunkType::Trait),
    ("every trait", ChunkType::Trait),
    ("all interfaces", ChunkType::Interface),
    ("every interface", ChunkType::Interface),
    ("all classes", ChunkType::Class),
    ("every class", ChunkType::Class),
    ("type alias", ChunkType::TypeAlias),
    ("all type aliases", ChunkType::TypeAlias),
    // OOP / module constructs
    ("all modules", ChunkType::Module),
    ("every module", ChunkType::Module),
    ("all objects", ChunkType::Object),
    ("every object", ChunkType::Object),
    ("all namespaces", ChunkType::Namespace),
    ("every namespace", ChunkType::Namespace),
    ("all impl blocks", ChunkType::Impl),
    ("implementation block", ChunkType::Impl),
    ("extension method", ChunkType::Extension),
    ("all extensions", ChunkType::Extension),
    // Members
    ("all constants", ChunkType::Constant),
    ("every constant", ChunkType::Constant),
    ("all variables", ChunkType::Variable),
    ("every variable", ChunkType::Variable),
    ("all properties", ChunkType::Property),
    ("every property", ChunkType::Property),
    ("constructor", ChunkType::Constructor),
    ("all constructors", ChunkType::Constructor),
    // C# specific
    ("all delegates", ChunkType::Delegate),
    ("every delegate", ChunkType::Delegate),
    ("all events", ChunkType::Event),
    ("every event", ChunkType::Event),
    // Macros
    ("all macros", ChunkType::Macro),
    ("every macro", ChunkType::Macro),
    ("macro_rules", ChunkType::Macro),
    // Web / API
    ("endpoint", ChunkType::Endpoint),
    ("all endpoints", ChunkType::Endpoint),
    ("all services", ChunkType::Service),
    ("every service", ChunkType::Service),
    ("middleware", ChunkType::Middleware),
    ("all middleware", ChunkType::Middleware),
    // Database / FFI / config
    ("stored procedure", ChunkType::StoredProc),
    ("all stored procedures", ChunkType::StoredProc),
    ("extern function", ChunkType::Extern),
    ("all externs", ChunkType::Extern),
    ("ffi declaration", ChunkType::Extern),
    ("config key", ChunkType::ConfigKey),
    ("all config keys", ChunkType::ConfigKey),
    // Docs / Solidity
    ("all sections", ChunkType::Section),
    ("every section", ChunkType::Section),
    ("all modifiers", ChunkType::Modifier),
    ("every modifier", ChunkType::Modifier),
];

/// Aho-Corasick automaton over [`TYPE_HINT_PATTERNS`] — one pass over
/// `query` finds every matching pattern id.
///
/// Uses [`MatchKind::Standard`] because [`AhoCorasick::find_overlapping_iter`]
/// (which we need: sibling patterns like `"constructor"` / `"all constructors"`
/// overlap in the haystack, and both must fire to match the previous
/// `for (pat, _) in patterns { if query.contains(pat) {..} }` semantics)
/// is only valid under the Standard match kind.
///
/// Patterns are fed to the builder in table order, so a match's pattern id
/// is the index of its row in [`TYPE_HINT_PATTERNS`].
static TYPE_HINT_AC: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasickBuilder::new()
        .match_kind(MatchKind::Standard)
        .build(TYPE_HINT_PATTERNS.iter().map(|(p, _)| *p))
        .expect("TYPE_HINT_PATTERNS is a valid pattern set (static input)")
});

/// Derive chunk-type hints from the text of `query`.
///
/// The returned types are meant to be boosted (not used as a hard filter)
/// in search results. Hints are only produced when a known phrase from
/// [`TYPE_HINT_PATTERNS`] occurs in the query, which keeps false positives
/// down.
///
/// An earlier version probed each of the ~72 patterns with its own
/// `query.contains(p)` call; this version runs one overlapping
/// Aho-Corasick scan via [`TYPE_HINT_AC`] instead.
///
/// # Returns
///
/// * `None` when no pattern occurs in `query`.
/// * `Some(types)` otherwise, with one entry per matched pattern, listed in
///   the declaration order of [`TYPE_HINT_PATTERNS`]. Entries are not
///   de-duplicated: two matched patterns that both map to `Test` yield
///   `[Test, Test]`, exactly like the previous loop.
pub fn extract_type_hints(query: &str) -> Option<Vec<ChunkType>> {
    // One flag per pattern id; a pattern matching several times still
    // counts once.
    let mut hit = [false; TYPE_HINT_PATTERNS.len()];
    TYPE_HINT_AC
        .find_overlapping_iter(query)
        .for_each(|found| hit[found.pattern().as_usize()] = true);

    // Walk the table (not the match stream) so output follows declaration
    // order rather than position in the query.
    let types: Vec<ChunkType> = TYPE_HINT_PATTERNS
        .iter()
        .zip(hit.iter())
        .filter_map(|(&(_, chunk_type), &seen)| seen.then_some(chunk_type))
        .collect();

    (!types.is_empty()).then_some(types)
}

/// Unit tests for the query classifier and type-hint extraction.
///
/// Organised into happy-path cases, adversarial inputs, regression tests
/// for individual classifier tightenings, and an opt-in micro-benchmark.
#[cfg(test)]
mod tests {
    use super::*;

    // ── Happy path (16 tests) ────────────────────────────────────────

    #[test]
    fn test_classify_identifier_snake_case() {
        let c = classify_query("search_filtered");
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
        assert_eq!(c.confidence, Confidence::High);
        assert_eq!(c.strategy, SearchStrategy::NameOnly);
    }

    #[test]
    fn test_classify_identifier_qualified() {
        let c = classify_query("HashMap::new");
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
        assert_eq!(c.confidence, Confidence::High);
    }

    #[test]
    fn test_classify_identifier_camel() {
        let c = classify_query("SearchFilter");
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
        assert_eq!(c.confidence, Confidence::High);
    }

    #[test]
    fn test_classify_behavioral() {
        let c = classify_query("validates user input");
        assert_eq!(c.category, QueryCategory::Behavioral);
        assert_eq!(c.confidence, Confidence::Medium);
        // Phase 5: behavioral routes to the base (non-enriched) index because
        // LLM summaries flatten the specific verbs users ask about.
        assert_eq!(c.strategy, SearchStrategy::DenseBase);
    }

    #[test]
    fn test_classify_negation() {
        let c = classify_query("sort without allocating");
        assert_eq!(c.category, QueryCategory::Negation);
        assert_eq!(c.confidence, Confidence::High);
        // Phase 5: negation routes to base — summaries inject positive
        // vocabulary that fights the "without" clause.
        assert_eq!(c.strategy, SearchStrategy::DenseBase);
    }

    #[test]
    fn test_classify_conceptual_routes_to_enriched() {
        // 2026-04-10: ROUTING REVERSED. Originally Phase 5 routed conceptual
        // to DenseBase based on the historical research finding "summaries
        // hurt conceptual −15pp". Same-corpus A/B at 50% summary coverage
        // measured −3.7pp R@1 from that routing — the historical finding
        // was for a different corpus shape (only callables summarized).
        // After PR #878 expanded summaries to type definitions, conceptual
        // queries' gold answers benefit from the enrichment (the summary
        // bridges code → concept on struct/enum chunks).
        //
        // See research/enrichment.md "Same-corpus A/B/C/D matrix (50% coverage)"
        // for the data that drove this revision.
        let c = classify_query("dependency injection pattern");
        assert_eq!(c.category, QueryCategory::Conceptual);
        assert_eq!(c.strategy, SearchStrategy::DenseDefault);
    }

    #[test]
    fn test_classify_structural_stays_on_enriched() {
        // Phase 5 regression: structural queries benefit from enrichment,
        // so they keep the DenseWithTypeHints (enriched HNSW) strategy.
        let c = classify_query("functions that return Result");
        assert_eq!(c.category, QueryCategory::Structural);
        assert_eq!(c.strategy, SearchStrategy::DenseWithTypeHints);
    }

    #[test]
    fn test_classify_cross_language_stays_on_enriched() {
        // Phase 5 regression: cross-language queries rely on canonical
        // vocabulary that summaries provide, so they stay on enriched.
        let c = classify_query("Python equivalent of map in Rust");
        assert_eq!(c.category, QueryCategory::CrossLanguage);
        assert_eq!(c.strategy, SearchStrategy::DenseDefault);
    }

    #[test]
    fn test_classify_structural() {
        let c = classify_query("functions that return Result");
        assert_eq!(c.category, QueryCategory::Structural);
        assert_eq!(c.confidence, Confidence::Medium);
    }

    #[test]
    fn test_classify_type_filtered() {
        let c = classify_query("all test functions");
        assert_eq!(c.category, QueryCategory::TypeFiltered);
        // 2026-04-13: type_filtered routes to base — summaries dilute type signal (+8.4pp).
        assert_eq!(c.strategy, SearchStrategy::DenseBase);
        // `expect` instead of `is_some()` + `unwrap()`: a missing hint set
        // fails with a message that names the actual problem.
        let hints = c
            .type_hints
            .expect("type-filtered query should carry type hints");
        assert!(hints.contains(&ChunkType::Test));
    }

    #[test]
    fn test_classify_cross_language() {
        let c = classify_query("Python equivalent of map in Rust");
        assert_eq!(c.category, QueryCategory::CrossLanguage);
        assert_eq!(c.confidence, Confidence::High);
    }

    #[test]
    fn test_classify_conceptual() {
        let c = classify_query("dependency injection pattern");
        assert_eq!(c.category, QueryCategory::Conceptual);
        assert_eq!(c.confidence, Confidence::Medium);
    }

    #[test]
    fn test_classify_multi_step() {
        let c = classify_query("find errors and then retry them");
        assert_eq!(c.category, QueryCategory::MultiStep);
        assert_eq!(c.confidence, Confidence::Low);
        // 2026-04-13: multi_step routes to base — summaries displace conjunction terms (+2.9pp).
        assert_eq!(c.strategy, SearchStrategy::DenseBase);
    }

    #[test]
    fn test_classify_unknown() {
        let c = classify_query("asdf jkl qwerty");
        assert_eq!(c.category, QueryCategory::Unknown);
        assert_eq!(c.confidence, Confidence::Low);
    }

    #[test]
    fn test_extract_type_hints_struct() {
        let hints = extract_type_hints("find all structs")
            .expect("\"all structs\" should produce at least one type hint");
        assert!(hints.contains(&ChunkType::Struct));
    }

    #[test]
    fn test_extract_type_hints_none() {
        let hints = extract_type_hints("handle errors gracefully");
        assert!(hints.is_none());
    }

    // ── Adversarial (15 tests) ───────────────────────────────────────

    #[test]
    fn test_classify_empty() {
        let c = classify_query("");
        assert_eq!(c.category, QueryCategory::Unknown);
        assert_eq!(c.confidence, Confidence::Low);
    }

    #[test]
    fn test_classify_single_char() {
        let c = classify_query("a");
        // "a" is an NL indicator, not an identifier
        assert_ne!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_very_long() {
        // Guards against pathological blow-up on a 10 000-byte query. The
        // bound is wall-clock, so it is deliberately generous.
        let long = "a ".repeat(5000);
        let start = std::time::Instant::now();
        let c = classify_query(&long);
        let elapsed = start.elapsed();
        assert!(elapsed.as_millis() < 100, "Should complete in <100ms");
        assert_eq!(c.confidence, Confidence::Low);
    }

    #[test]
    fn test_classify_unicode_identifier() {
        let c = classify_query("日本語_関数");
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_path_like() {
        let c = classify_query("src/store/mod.rs");
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_only_stopwords() {
        let c = classify_query("the a an of");
        assert_ne!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_special_chars() {
        let c = classify_query("fn<T: Hash>()");
        // Contains "fn" which triggers structural, but also looks like code
        // Key: doesn't panic
        let _ = c;
    }

    #[test]
    fn test_classify_all_caps() {
        let c = classify_query("WHERE IS THE ERROR HANDLER");
        // Contains NL words, should not be identifier
        assert_ne!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_numbers() {
        let c = classify_query("404");
        // Pure number — has no alphabetic chars
        assert_eq!(c.category, QueryCategory::Unknown);
    }

    #[test]
    fn test_classify_hex() {
        let c = classify_query("0xFF");
        // Starts with digit, has alpha — could be identifier
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_mixed_signals() {
        let c = classify_query("not struct");
        // Negation trumps structural
        assert_eq!(c.category, QueryCategory::Negation);
    }

    #[test]
    fn test_classify_sql_injection() {
        let c = classify_query("'; DROP TABLE--");
        // Should not panic, should not be identifier
        assert_ne!(c.category, QueryCategory::IdentifierLookup);
    }

    #[test]
    fn test_classify_null_bytes() {
        let c = classify_query("foo\0bar");
        // Should handle gracefully — no panic
        let _ = c;
    }

    #[test]
    fn test_classify_type_hint_wrong_extraction() {
        // "error handling" should NOT extract Enum type hint
        // even though "error" could be confused with an error enum
        let hints = extract_type_hints("error handling");
        assert!(hints.is_none());
    }

    #[test]
    fn test_classify_identifier_common_word() {
        // "error" alone is ambiguous — could be identifier or concept.
        // It should classify as an identifier (single word, valid
        // identifier chars). Only the category is pinned here; the
        // confidence level for common words is intentionally not asserted.
        let c = classify_query("error");
        assert_eq!(c.category, QueryCategory::IdentifierLookup);
    }

    // ── AC-V1.25-10 MultiStep pattern tightening ─────────────────────

    #[test]
    fn test_classify_plain_and_is_not_multistep() {
        // "find foo and bar" is a single search intent, not a multi-step
        // query. Previously " and " alone pushed this into MultiStep.
        let c = classify_query("find foo and bar");
        assert_ne!(
            c.category,
            QueryCategory::MultiStep,
            "plain conjunction should not classify as MultiStep"
        );
    }

    #[test]
    fn test_classify_plain_or_is_not_multistep() {
        // "find foo or bar" is a single search intent with alternation,
        // not a multi-step query.
        let c = classify_query("find foo or bar");
        assert_ne!(
            c.category,
            QueryCategory::MultiStep,
            "plain disjunction should not classify as MultiStep"
        );
    }

    #[test]
    fn test_classify_first_then_is_multistep() {
        // Explicit sequencing must still classify as MultiStep.
        let c = classify_query("first do X then do Y");
        assert_eq!(c.category, QueryCategory::MultiStep);
    }

    #[test]
    fn test_classify_and_then_is_multistep() {
        // "and then" explicitly chains two steps.
        let c = classify_query("find errors and then retry them");
        assert_eq!(c.category, QueryCategory::MultiStep);
    }

    // ── AC-V1.25-9 cross-language classifier word-boundary ──────────

    #[test]
    fn test_classify_report_is_not_cross_language() {
        // Previously "port " substring probe matched "report" and
        // classified any language + "report" query as CrossLanguage.
        let c = classify_query("show the error report in python");
        assert_ne!(
            c.category,
            QueryCategory::CrossLanguage,
            "'report' should not trigger cross-language via 'port ' substring"
        );
    }

    #[test]
    fn test_classify_port_verb_stays_cross_language() {
        // "port X to Y" with a language name is still CrossLanguage.
        let c = classify_query("port the logging module to rust");
        assert_eq!(c.category, QueryCategory::CrossLanguage);
    }

    // ── AC-V1.25-15 behavioral classifier word-boundary ─────────────

    #[test]
    fn test_classify_word_bounded_code_that_not_behavioral() {
        // A token-attached "code that" like `barcode that1` contains the
        // literal "code that" substring but is not the phrase "code that"
        // as a word. Previously the substring probe classified this as
        // Behavioral; the word-boundary check should not.
        let c = classify_query("barcode that1 lives forever");
        assert_ne!(
            c.category,
            QueryCategory::Behavioral,
            "token-attached 'code that' should not classify as Behavioral via substring"
        );
    }

    #[test]
    fn test_classify_word_bounded_function_that_not_behavioral() {
        // "malfunction that" attaches "function that" to "mal"; should
        // not match the word-bounded phrase check.
        let c = classify_query("malfunction that3 happened");
        assert_ne!(
            c.category,
            QueryCategory::Behavioral,
            "token-attached 'function that' should not classify as Behavioral"
        );
    }

    #[test]
    fn test_classify_behavioral_code_that_still_fires() {
        // "code that ..." as a real NL phrase still classifies as
        // Behavioral after word-boundary tightening.
        let c = classify_query("code that handles retries");
        assert_eq!(c.category, QueryCategory::Behavioral);
    }

    // ── Micro-benchmark (#964) ───────────────────────────────────────
    //
    // Sanity check for the Aho-Corasick + LazyLock rewrite. Runs
    // classify_query on a mix of query shapes and prints per-call
    // timing. Does not assert on timing — CI machines have wildly
    // different performance envelopes.
    //
    // Marked #[ignore] so the default `cargo test` run does not pay
    // the timing cost; invoke in release for a realistic number:
    //   cargo test --release --features gpu-index --lib -- \
    //     search::router::tests::bench_classify_query_throughput \
    //     --ignored --nocapture
    #[test]
    #[ignore]
    fn bench_classify_query_throughput() {
        // Four query shapes that exercise different branches of classify_query:
        //   1. Type-filtered — runs the full 72-pattern extract_type_hints
        //      table, which was the heaviest contributor before the AC rewrite.
        //   2. Behavioral — fires on a BEHAVIORAL_VERBS word.
        //   3. Cross-language — two language names, full language_names scan.
        //   4. Unknown — walks every branch (no early return) so the whole
        //      classifier is stressed.
        let queries: &[(&str, &str)] = &[
            (
                "type_filtered",
                "find all test functions and every interface and all traits in the codebase shown",
            ),
            (
                "behavioral",
                "find the function that validates user input in the python module and logs it",
            ),
            (
                "cross_language",
                "port the python logging and tracing module into a rust crate with serde",
            ),
            (
                "unknown",
                "zephyr quartz wonder blooming river sunset gentle breeze stormy afternoon light",
            ),
        ];

        // Warm the LazyLocks so construction cost isn't folded into timing.
        for (_, q) in queries {
            let _ = classify_query(q);
        }

        const ITERATIONS: usize = 10_000;
        for (label, query) in queries {
            assert!(
                query.len() >= 60 && query.len() <= 95,
                "keep bench queries near the 80-char target ({} = {} chars)",
                label,
                query.len()
            );
            let start = std::time::Instant::now();
            let mut sink = 0u32;
            for _ in 0..ITERATIONS {
                // The query is loop-invariant, so hide it (and the result)
                // behind `black_box`: otherwise the optimizer is free to
                // hoist or fold the call and the loop would time nothing.
                let c = std::hint::black_box(classify_query(std::hint::black_box(*query)));
                // Fold the result into a printed value as a second guard
                // against dead-code elimination.
                sink = sink.wrapping_add(c.category as u32);
            }
            let elapsed = start.elapsed();
            let per_call_ns = elapsed.as_nanos() / ITERATIONS as u128;
            eprintln!(
                "classify_query bench [{:<14}]: {} iters in {:>9.3?} ({:>5} ns/call, sink={})",
                label, ITERATIONS, elapsed, per_call_ns, sink
            );
        }
    }
}