ripvec-core 1.0.3

//! Alpha auto-detection and query-driven boosting.
//!
//! Port of `~/src/semble/src/semble/ranking/weighting.py` and
//! `~/src/semble/src/semble/ranking/boosting.py`.
//!
//! Four public entry points:
//!
//! - [`resolve_alpha`] — picks the semantic/BM25 blend weight from the
//!   query shape: 0.3 for bare-symbol queries (lean BM25), 0.5 for
//!   natural-language queries (balanced).
//! - [`is_symbol_query`] — reports whether a query is shaped like a bare
//!   symbol or namespace-qualified identifier; [`resolve_alpha`] and
//!   [`apply_query_boost`] both branch on it.
//! - [`apply_query_boost`] — adds query-type boosts on top of a score
//!   map. Returns a new map; callers re-rank afterwards.
//! - [`boost_multi_chunk_files`] — file-coherence boost; promotes the
//!   top chunk of files whose chunks collectively score high.
//!
//! Where Python keys `combined_scores` by `Chunk`, this port uses
//! `HashMap<usize, f32>` (chunk index → score) plus `&[CodeChunk]` for
//! lookups. Same shape as [`crate::encoder::ripvec::penalties::rerank_topk`].

use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::sync::OnceLock;

use regex::{Regex, RegexBuilder};

use crate::chunk::CodeChunk;
use crate::encoder::ripvec::tokens::split_identifier;

// ---------------------------------------------------------------------------
// Alpha selection (weighting.py).
// ---------------------------------------------------------------------------

/// Semantic blend weight that [`resolve_alpha`] returns when the query is
/// symbol-shaped (see [`is_symbol_query`]). Leans toward BM25.
pub const ALPHA_SYMBOL: f32 = 0.3;
/// Semantic blend weight that [`resolve_alpha`] returns for every other
/// (natural-language) query. Balanced between semantic and BM25.
pub const ALPHA_NL: f32 = 0.5;

/// Choose the semantic blend weight for `query`.
///
/// An explicit `alpha` (`Some(w)`) is handed back unchanged. With
/// `None` the weight is auto-detected from the query shape:
/// [`ALPHA_SYMBOL`] when [`is_symbol_query`] accepts the query,
/// [`ALPHA_NL`] otherwise.
#[must_use]
pub fn resolve_alpha(query: &str, alpha: Option<f32>) -> f32 {
    alpha.unwrap_or_else(|| {
        if is_symbol_query(query) {
            ALPHA_SYMBOL
        } else {
            ALPHA_NL
        }
    })
}

// ---------------------------------------------------------------------------
// Symbol-query detection (boosting.py:11).
// ---------------------------------------------------------------------------

/// Lazily compiled, fully anchored regex recognising symbol-lookup
/// queries.
///
/// The whole string must be one of: a namespace-qualified path
/// (`::`, `\`, `->` or `.` separators), an identifier with a leading
/// underscore, an identifier with an uppercase letter or underscore
/// after its first character, or an identifier starting with an
/// uppercase letter. Plain lowercase words match none of these and are
/// therefore treated as natural language.
fn symbol_query_re() -> &'static Regex {
    // Alternatives, in order:
    //   1. namespace-qualified
    //   2. leading underscore
    //   3. uppercase or underscore in the body
    //   4. starts with uppercase
    const PATTERN: &str = concat!(
        r"^(?:",
        r"[A-Za-z_][A-Za-z0-9_]*(?:(?:::|\\|->|\.)[A-Za-z_][A-Za-z0-9_]*)+",
        r"|_[A-Za-z0-9_]*",
        r"|[A-Za-z][A-Za-z0-9]*[A-Z_][A-Za-z0-9_]*",
        r"|[A-Z][A-Za-z0-9]*",
        r")$",
    );
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("symbol-query regex compiles"))
}

/// Lazily compiled regex that finds CamelCase / camelCase identifiers
/// embedded in a natural-language query.
///
/// A match sits between word boundaries and must contain an uppercase
/// letter after its first character, so plain words are excluded.
/// Pure acronyms are excluded too: the PascalCase branch requires a
/// lowercase second character and the camelCase branch a lowercase
/// first character.
fn embedded_symbol_re() -> &'static Regex {
    // Branch 1 — PascalCase: upper, one lower, any mix, an upper, any mix.
    // Branch 2 — camelCase: lower, any mix, an upper, at least one more char.
    const PATTERN: &str = concat!(
        r"\b(?:",
        r"[A-Z][a-z][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]*",
        r"|[a-z][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]+",
        r")\b",
    );
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("embedded-symbol regex compiles"))
}

/// Report whether `query`, ignoring surrounding whitespace, has the
/// shape of a bare symbol or namespace-qualified identifier
/// (`Foo::Bar`, `module.Class`, `_x`, `getX`, `XMLParser`, etc.).
#[must_use]
pub fn is_symbol_query(query: &str) -> bool {
    let candidate = query.trim();
    symbol_query_re().is_match(candidate)
}

// ---------------------------------------------------------------------------
// Definition-keyword scan (boosting.py:36).
// ---------------------------------------------------------------------------

/// Language-agnostic definition keywords (Python, JS, Go, Rust, Kotlin,
/// Elixir, Swift, etc.). Case-sensitive matching avoids false positives
/// like "Module" appearing in Python docstrings.
///
/// `definition_pattern` regex-escapes each entry and joins them into a
/// single alternation for its case-sensitive regex.
const DEFINITION_KEYWORDS: &[&str] = &[
    "class",
    "module",
    "defmodule", // Elixir
    "def",
    "interface",
    "struct",
    "enum",
    "trait",
    "type",
    "func",
    "function",
    "object",
    // Two-word forms; the single space is matched literally.
    "abstract class",
    "data class",
    "fn",
    "fun", // Kotlin
    "package",
    "namespace",
    "protocol", // Swift
    "record",   // C# 9+, Java 16+
    "typedef",  // C/C++/Dart
];

/// SQL DDL statement prefixes that introduce a named object.
///
/// SQL DDL is conventionally all-caps or all-lowercase; both are
/// matched because `definition_pattern` compiles the regex built from
/// this list with `case_insensitive(true)`.
const SQL_DEFINITION_KEYWORDS: &[&str] = &[
    "CREATE TABLE",
    "CREATE VIEW",
    "CREATE PROCEDURE",
    "CREATE FUNCTION",
];

/// Multiplied by the maximum candidate score to obtain the boost unit
/// for chunks that define the queried symbol.
const DEFINITION_BOOST_MULTIPLIER: f32 = 3.0;
/// Multiplied by the maximum candidate score to obtain the file-path
/// keyword boost for NL queries; `boost_stem_matches` further scales it
/// by the fraction of query keywords that matched.
const STEM_BOOST_MULTIPLIER: f32 = 1.0;
/// Fraction of the maximum score used as the boost unit in
/// [`boost_multi_chunk_files`].
const FILE_COHERENCE_BOOST_FRAC: f32 = 0.2;
/// Extra scale applied on top of [`DEFINITION_BOOST_MULTIPLIER`] for
/// symbols embedded in an NL query (half strength).
const EMBEDDED_SYMBOL_BOOST_SCALE: f32 = 0.5;
/// Minimum length, in bytes, a file stem must have before
/// `boost_embedded_symbols` lets it match an embedded symbol by prefix.
const EMBEDDED_STEM_MIN_LEN: usize = 4;

/// Common English stopwords excluded from file-stem matching for NL
/// queries. Mirrors `_STOPWORDS` from boosting.py:82.
///
/// Entries are lowercase: `boost_stem_matches` lowercases each query
/// token before checking membership in this list.
const STOPWORDS: &[&str] = &[
    "a", "an", "and", "are", "as", "at", "be", "by", "do", "does", "for", "from", "has", "have",
    "how", "if", "in", "is", "it", "not", "of", "on", "or", "the", "to", "was", "what", "when",
    "where", "which", "who", "why", "with",
];

/// Build the general-keyword definition regex for a given symbol name.
///
/// Mirrors Python's `_definition_pattern`. Returns the compiled regex
/// each call (Python uses `functools.lru_cache(256)` — we forgo the
/// cache here; the hot path in `apply_query_boost` calls this a
/// constant number of times per query).
fn definition_pattern(symbol_name: &str) -> (Regex, Regex) {
    let escaped = regex::escape(symbol_name);
    let ns_prefix = r"(?:[A-Za-z_][A-Za-z0-9_]*(?:\.|::))*";
    // Python's pattern uses `(?:^|(?<=\s))(?:keywords)` — keyword at start-of-line
    // or preceded by whitespace. Rust's RE2 has no lookbehind, so we use
    // `(?:^|\s)(?:keywords)` and let the whitespace be consumed; semantically
    // equivalent for definition-keyword detection.
    let def_body = DEFINITION_KEYWORDS
        .iter()
        .map(|k| regex::escape(k))
        .collect::<Vec<_>>()
        .join("|");
    let sql_body = SQL_DEFINITION_KEYWORDS
        .iter()
        .map(|k| regex::escape(k))
        .collect::<Vec<_>>()
        .join("|");
    let suffix = format!(r")\s+{ns_prefix}{escaped}(?:\s|[<({{:\[;]|$)");
    // The Rust `regex` crate uses RE2 and does not support `(?<=\s)` (lookbehind).
    // Python's pattern is `(?:^|(?<=\s))(?:keywords)` — keywords appear at start of line
    // or after whitespace. Equivalent without lookbehind: anchor on `(?:^|\s)` and include
    // the whitespace character in the match (rather than as a lookbehind boundary). The
    // semantic is identical for definition detection.
    let no_lookbehind_prefix = r"(?:^|\s)(?:";
    let general_pat = format!("{no_lookbehind_prefix}{def_body}{suffix}");
    let sql_pat = format!("{no_lookbehind_prefix}{sql_body}{suffix}");
    let general = RegexBuilder::new(&general_pat)
        .multi_line(true)
        .build()
        .expect("general definition regex compiles");
    let sql = RegexBuilder::new(&sql_pat)
        .multi_line(true)
        .case_insensitive(true)
        .build()
        .expect("SQL definition regex compiles");
    (general, sql)
}

/// Report whether `content` contains a definition of `symbol_name`.
///
/// Mirrors Python's `_chunk_defines_symbol`. The general-keyword regex
/// (case-sensitive) is tried first, then the SQL DDL regex
/// (case-insensitive). Namespace-qualified definitions such as
/// `defmodule Phoenix.Router` count as defining `Router` because the
/// pattern permits an optional namespace prefix before the symbol.
fn chunk_defines_symbol(content: &str, symbol_name: &str) -> bool {
    let (general_re, sql_re) = definition_pattern(symbol_name);
    if general_re.is_match(content) {
        return true;
    }
    sql_re.is_match(content)
}

/// Report whether file stem `stem` corresponds to `name`.
///
/// Mirrors Python's `_stem_matches`. Two forms of the stem are tried —
/// as given, and with every underscore removed — and each form matches
/// either verbatim or after all trailing `s` characters are stripped
/// (a simple plural rule).
fn stem_matches(stem: &str, name: &str) -> bool {
    let collapsed: String = stem.chars().filter(|&c| c != '_').collect();
    [stem, collapsed.as_str()]
        .iter()
        .any(|form| *form == name || form.trim_end_matches('s') == name)
}

// ---------------------------------------------------------------------------
// Symbol extraction (boosting.py:137).
// ---------------------------------------------------------------------------

/// Extract the final identifier from a possibly namespace-qualified
/// query. Mirrors `_extract_symbol_name`.
///
/// The query is trimmed first, then everything after the right-most
/// namespace separator (`::`, `\`, `->` or `.`) is returned. When
/// several separator kinds are mixed, the one that ends last wins, so
/// the result is always the trailing path segment. A query without any
/// separator is returned trimmed.
///
/// Examples: `Sinatra::Base` → `Base`, `Client` → `Client`,
/// `foo::bar.baz` → `baz`.
fn extract_symbol_name(query: &str) -> String {
    let query = query.trim();
    // Byte offset just past the right-most separator of any kind; 0 when
    // the query has none. All separators are ASCII, so the offset is
    // always on a char boundary.
    let tail_start = ["::", "\\", "->", "."]
        .iter()
        .filter_map(|separator| query.rfind(separator).map(|idx| idx + separator.len()))
        .max()
        .unwrap_or(0);
    query[tail_start..].to_string()
}

/// Compute the boost for `chunk` with respect to the symbol `names`.
/// Mirrors `_definition_tier`.
///
/// Returns `0.0` when the chunk's content defines none of the names.
/// Otherwise returns `boost_unit`, multiplied by 1.5 when the chunk's
/// lowercased file stem also matches one of the (lowercased) names.
fn definition_tier(chunk: &CodeChunk, names: &HashSet<String>, boost_unit: f32) -> f32 {
    let defines_any = names
        .iter()
        .any(|candidate| chunk_defines_symbol(&chunk.content, candidate));
    if defines_any {
        // Only definitions pay for the stem lookup.
        let file_stem = Path::new(&chunk.file_path)
            .file_stem()
            .and_then(|s| s.to_str())
            .map(str::to_ascii_lowercase)
            .unwrap_or_default();
        let stem_hit = names
            .iter()
            .any(|candidate| stem_matches(&file_stem, &candidate.to_ascii_lowercase()));
        let multiplier = if stem_hit { 1.5 } else { 1.0 };
        boost_unit * multiplier
    } else {
        0.0
    }
}

/// Add definition boosts for chunks that are not yet candidates.
/// Mirrors `_scan_non_candidates`.
///
/// Every chunk whose index is absent from `boosted` and whose
/// lowercased file stem satisfies `stem_ok` is checked with
/// [`definition_tier`]; a positive tier is inserted as that chunk's
/// score.
fn scan_non_candidates(
    boosted: &mut HashMap<usize, f32>,
    names: &HashSet<String>,
    boost_unit: f32,
    all_chunks: &[CodeChunk],
    stem_ok: &dyn Fn(&str) -> bool,
) {
    let newcomers: Vec<(usize, f32)> = all_chunks
        .iter()
        .enumerate()
        .filter(|(idx, _)| !boosted.contains_key(idx))
        .filter(|(_, chunk)| {
            let file_stem = Path::new(&chunk.file_path)
                .file_stem()
                .and_then(|s| s.to_str())
                .unwrap_or_default()
                .to_ascii_lowercase();
            stem_ok(&file_stem)
        })
        .filter_map(|(idx, chunk)| {
            let tier = definition_tier(chunk, names, boost_unit);
            (tier > 0.0).then_some((idx, tier))
        })
        .collect();
    // Indices are distinct and were all absent, so inserting afterwards
    // is the same as inserting while scanning.
    boosted.extend(newcomers);
}

/// Symbol-query branch of the boost. Mirrors
/// `_boost_symbol_definitions`.
///
/// Existing candidates that define the queried symbol receive a
/// definition tier on top of their score; afterwards, non-candidate
/// chunks whose file stem matches the symbol name are scanned and added
/// when they define it. Both the extracted final identifier and the
/// full trimmed query are accepted as names.
fn boost_symbol_definitions(
    boosted: &mut HashMap<usize, f32>,
    query: &str,
    max_score: f32,
    all_chunks: &[CodeChunk],
) {
    let symbol_name = extract_symbol_name(query);
    let symbol_lower = symbol_name.to_ascii_lowercase();
    // The set collapses the two entries when the query is unqualified.
    let names: HashSet<String> = HashSet::from([symbol_name, query.trim().to_string()]);
    let boost_unit = max_score * DEFINITION_BOOST_MULTIPLIER;

    // Pass 1: current candidates gain a tier when they define the symbol.
    for (&idx, score) in boosted.iter_mut() {
        let tier = definition_tier(&all_chunks[idx], &names, boost_unit);
        if tier > 0.0 {
            *score += tier;
        }
    }

    // Pass 2: non-candidates whose stem matches the symbol name.
    let stem_is_symbol = |stem: &str| stem_matches(stem, &symbol_lower);
    scan_non_candidates(boosted, &names, boost_unit, all_chunks, &stem_is_symbol);
}

/// NL-query branch: boost definitions of CamelCase / camelCase
/// identifiers embedded in the query, at half the strength of a pure
/// symbol query. Mirrors `_boost_embedded_symbols`.
///
/// Does nothing when the query contains no embedded identifier.
/// Otherwise existing candidates that define one of the identifiers
/// gain a definition tier, and non-candidate chunks are scanned when
/// their lowercased file stem equals an identifier (with or without
/// underscores) or is a prefix of it of at least
/// [`EMBEDDED_STEM_MIN_LEN`] bytes.
fn boost_embedded_symbols(
    boosted: &mut HashMap<usize, f32>,
    query: &str,
    max_score: f32,
    all_chunks: &[CodeChunk],
) {
    let names: HashSet<String> = embedded_symbol_re()
        .find_iter(query)
        .map(|m| m.as_str().to_string())
        .collect();
    if names.is_empty() {
        return;
    }

    let boost_unit = max_score * DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE;

    // Pass 1: candidates that define the embedded symbol(s). Every key is
    // already present, so the scores can be updated in place.
    for (&idx, score) in boosted.iter_mut() {
        let tier = definition_tier(&all_chunks[idx], &names, boost_unit);
        if tier > 0.0 {
            *score += tier;
        }
    }

    // Pass 2: non-candidate stem-prefix scan. The closure only needs to
    // borrow the lowercased names, so no copy of the vector is made.
    let symbols_lower: Vec<String> = names.iter().map(|n| n.to_ascii_lowercase()).collect();
    scan_non_candidates(
        boosted,
        &names,
        boost_unit,
        all_chunks,
        &|stem: &str| {
            let stem_norm = stem.replace('_', "");
            symbols_lower.iter().any(|sym_lower| {
                stem == sym_lower
                    || stem_norm == *sym_lower
                    || (stem.len() >= EMBEDDED_STEM_MIN_LEN && sym_lower.starts_with(stem))
                    || (stem_norm.len() >= EMBEDDED_STEM_MIN_LEN
                        && sym_lower.starts_with(stem_norm.as_str()))
            })
        },
    );
}

/// Count how many of `keywords` are matched by some entry of `parts`.
/// Mirrors `_count_keyword_matches`.
///
/// A keyword matches when it appears in `parts` verbatim, or when it
/// and some part are prefix-related (one starts with the other) and the
/// shorter of the two is at least 3 bytes long. Each keyword is counted
/// at most once.
fn count_keyword_matches(keywords: &HashSet<String>, parts: &HashSet<String>) -> usize {
    /// `true` when the shorter string (>= 3 bytes) is a prefix of the longer.
    fn prefix_overlap(a: &str, b: &str) -> bool {
        let (short, long) = if a.len() <= b.len() { (a, b) } else { (b, a) };
        short.len() >= 3 && long.starts_with(short)
    }

    keywords
        .iter()
        .filter(|keyword| {
            parts.contains(*keyword)
                || parts
                    .iter()
                    .any(|part| prefix_overlap(keyword.as_str(), part.as_str()))
        })
        .count()
}

/// Boost chunks whose file paths match NL query keywords. Mirrors
/// `_boost_stem_matches`.
///
/// Keywords are the identifier-like tokens of `query`, lowercased, at
/// least 3 bytes long and not in [`STOPWORDS`]. For each current
/// candidate, the file stem and the immediate parent directory name are
/// split with `split_identifier` and compared against the keywords via
/// [`count_keyword_matches`], which also accepts prefix overlap
/// (e.g. "parse" matches "parser"). When at least 10% of the keywords
/// match, the candidate gains `max_score * STEM_BOOST_MULTIPLIER`
/// scaled by the matched fraction.
///
/// Only existing candidates are touched; no entries are added.
fn boost_stem_matches(
    boosted: &mut HashMap<usize, f32>,
    query: &str,
    max_score: f32,
    chunks: &[CodeChunk],
) {
    /// Smallest fraction of keywords that must match for a boost.
    const MIN_MATCH_RATIO: f32 = 0.10;

    static KEYWORD_RE: OnceLock<Regex> = OnceLock::new();
    let keyword_re =
        KEYWORD_RE.get_or_init(|| Regex::new(r"[a-zA-Z_][a-zA-Z0-9_]*").expect("keyword regex"));
    let keywords: HashSet<String> = keyword_re
        .find_iter(query)
        .map(|m| m.as_str().to_ascii_lowercase())
        .filter(|w| w.len() > 2 && !STOPWORDS.contains(&w.as_str()))
        .collect();
    if keywords.is_empty() {
        return;
    }

    let boost = max_score * STEM_BOOST_MULTIPLIER;
    // Path parts are computed once per distinct file. Keys borrow from
    // `chunks`, so neither the path nor the parts set is copied per
    // candidate.
    let mut path_cache: HashMap<&str, HashSet<String>> = HashMap::new();
    for (&idx, score) in boosted.iter_mut() {
        let path = chunks[idx].file_path.as_str();
        let parts = path_cache.entry(path).or_insert_with(|| {
            let mut parts: HashSet<String> = HashSet::new();
            let p = Path::new(path);
            if let Some(stem) = p.file_stem().and_then(|s| s.to_str()) {
                parts.extend(split_identifier(stem));
            }
            let parent_name = p
                .parent()
                .and_then(Path::file_name)
                .and_then(|s| s.to_str())
                .filter(|name| !name.is_empty() && *name != "." && *name != "..");
            if let Some(parent_name) = parent_name {
                parts.extend(split_identifier(parent_name));
            }
            parts
        });
        let n_matches = count_keyword_matches(&keywords, parts);
        if n_matches == 0 {
            continue;
        }
        let match_ratio = n_matches as f32 / keywords.len() as f32;
        if match_ratio >= MIN_MATCH_RATIO {
            *score += boost * match_ratio;
        }
    }
}

// ---------------------------------------------------------------------------
// Public entry: apply_query_boost.
// ---------------------------------------------------------------------------

/// Layer query-type boosts on top of the candidate scores.
///
/// Mirrors `apply_query_boost`. The input map is left untouched and a
/// boosted copy is returned; an empty input yields an empty map.
///
/// The branch is chosen by [`is_symbol_query`]:
/// - Symbol-shaped query → [`boost_symbol_definitions`] (×3 base,
///   ×1.5 on stem match), which also scans non-candidate stem-matched
///   chunks.
/// - NL query → [`boost_stem_matches`] (file-stem keyword overlap)
///   followed by [`boost_embedded_symbols`] (half-strength CamelCase
///   scan).
///
/// # Panics
///
/// May panic if a key of `combined_scores` is not a valid index into
/// `all_chunks`, because the helpers index the slice directly.
#[expect(
    clippy::implicit_hasher,
    reason = "internal API; callers in the semble pipeline use the default RandomState"
)]
#[must_use]
pub fn apply_query_boost(
    combined_scores: &HashMap<usize, f32>,
    query: &str,
    all_chunks: &[CodeChunk],
) -> HashMap<usize, f32> {
    let mut boosted = combined_scores.clone();
    if boosted.is_empty() {
        return boosted;
    }

    // Boost sizes are expressed relative to the best pre-boost score.
    let max_score = combined_scores
        .values()
        .fold(f32::NEG_INFINITY, |best, &score| best.max(score));

    if !is_symbol_query(query) {
        boost_stem_matches(&mut boosted, query, max_score, all_chunks);
        boost_embedded_symbols(&mut boosted, query, max_score, all_chunks);
    } else {
        boost_symbol_definitions(&mut boosted, query, max_score, all_chunks);
    }
    boosted
}

/// Promote files with multiple high-scoring chunks by boosting their
/// top chunk in place. Mirrors `boost_multi_chunk_files`.
///
/// For every file, the scores of its chunks are summed and its
/// best-scoring chunk receives
/// `max_score * FILE_COHERENCE_BOOST_FRAC * file_sum / max_file_sum`.
/// Nothing happens when `scores` is empty, when the maximum score is
/// zero or non-finite, or when the largest file sum is non-positive or
/// non-finite.
///
/// Scores are visited in ascending chunk-index order, so the result does
/// not depend on the map's iteration order: when two chunks of a file
/// tie for best, the one with the lower index is boosted, and the
/// per-file sums are always accumulated in the same order.
///
/// # Panics
///
/// Panics if a key of `scores` is not a valid index into `chunks`.
#[expect(
    clippy::implicit_hasher,
    reason = "internal API; callers in the semble pipeline use the default RandomState"
)]
pub fn boost_multi_chunk_files(scores: &mut HashMap<usize, f32>, chunks: &[CodeChunk]) {
    /// Running aggregate for one file.
    struct FileStats {
        sum: f32,
        best_idx: usize,
        best_score: f32,
    }

    if scores.is_empty() {
        return;
    }
    let max_score = scores.values().copied().fold(f32::NEG_INFINITY, f32::max);
    if max_score == 0.0 || !max_score.is_finite() {
        return;
    }

    // Snapshot in index order: `HashMap` iteration order is arbitrary and
    // would otherwise leak into tie-breaking and float summation order.
    let mut ordered: Vec<(usize, f32)> = scores.iter().map(|(&idx, &score)| (idx, score)).collect();
    ordered.sort_unstable_by_key(|&(idx, _)| idx);

    // Keys borrow the paths from `chunks`; no per-score String copies.
    let mut per_file: HashMap<&str, FileStats> = HashMap::new();
    for (idx, score) in ordered {
        let path = chunks[idx].file_path.as_str();
        per_file
            .entry(path)
            .and_modify(|stats| {
                stats.sum += score;
                // The incumbent stays on `>=`, so earlier (lower) indices
                // win ties.
                let incumbent_holds = stats.best_score >= score;
                if !incumbent_holds {
                    stats.best_idx = idx;
                    stats.best_score = score;
                }
            })
            .or_insert(FileStats {
                sum: score,
                best_idx: idx,
                best_score: score,
            });
    }

    let max_file_sum = per_file
        .values()
        .map(|stats| stats.sum)
        .fold(f32::NEG_INFINITY, f32::max);
    if max_file_sum <= 0.0 || !max_file_sum.is_finite() {
        return;
    }

    let boost_unit = max_score * FILE_COHERENCE_BOOST_FRAC;
    for stats in per_file.values() {
        if let Some(score) = scores.get_mut(&stats.best_idx) {
            *score += boost_unit * stats.sum / max_file_sum;
        }
    }
}

#[cfg(test)]
mod tests {
    // Unit tests for alpha selection, symbol-query detection, definition
    // matching and the individual boost passes. Expected values in the
    // comments are derived from the boost constants defined in the
    // parent module.
    use super::*;

    // Build a minimal `CodeChunk` with only the path and content set;
    // the remaining fields get placeholder values.
    fn chunk(path: &str, content: &str) -> CodeChunk {
        CodeChunk {
            file_path: path.to_string(),
            name: String::new(),
            kind: String::new(),
            start_line: 1,
            end_line: 1,
            content: content.to_string(),
            enriched_content: content.to_string(),
        }
    }

    // ----- is_symbol_query (boosting.py:132) -----

    #[test]
    fn is_symbol_query_namespace() {
        // One case per supported namespace separator: `::`, `.`, `->`, `\`.
        assert!(is_symbol_query("Sinatra::Base"));
        assert!(is_symbol_query("module.Class"));
        assert!(is_symbol_query("a->b->c"));
        assert!(is_symbol_query(r"Foo\Bar"));
    }

    #[test]
    fn is_symbol_query_pascal() {
        assert!(is_symbol_query("Client"));
        assert!(is_symbol_query("HTTPHandler"));
        assert!(is_symbol_query("XMLParser"));
    }

    #[test]
    fn is_symbol_query_plain_word_rejected() {
        // All-lowercase single words are natural language, not symbols.
        assert!(!is_symbol_query("session"));
        assert!(!is_symbol_query("retry"));
        assert!(!is_symbol_query("authentication"));
    }

    #[test]
    fn is_symbol_query_leading_underscore_accepted() {
        assert!(is_symbol_query("_private"));
        assert!(is_symbol_query("__init__"));
    }

    // ----- resolve_alpha -----

    #[test]
    fn resolve_alpha_symbol_0_3() {
        assert!((resolve_alpha("Client", None) - ALPHA_SYMBOL).abs() < 1e-6);
        assert!((resolve_alpha("foo.Bar", None) - ALPHA_SYMBOL).abs() < 1e-6);
    }

    #[test]
    fn resolve_alpha_nl_0_5() {
        assert!((resolve_alpha("how does retry work", None) - ALPHA_NL).abs() < 1e-6);
        assert!((resolve_alpha("authentication handling", None) - ALPHA_NL).abs() < 1e-6);
    }

    #[test]
    fn resolve_alpha_explicit_override_wins() {
        // An explicit alpha bypasses auto-detection even for a symbol query.
        assert!((resolve_alpha("Client", Some(0.7)) - 0.7).abs() < 1e-6);
    }

    // ----- definition_pattern + chunk_defines_symbol -----

    #[test]
    fn chunk_defines_symbol_class() {
        let content = "class Client:\n    pass";
        assert!(chunk_defines_symbol(content, "Client"));
    }

    #[test]
    fn chunk_defines_symbol_def() {
        let content = "def handle_request():\n    pass";
        assert!(chunk_defines_symbol(content, "handle_request"));
    }

    #[test]
    fn chunk_defines_symbol_namespace_qualified() {
        // Elixir defmodule with namespace prefix should match the bare symbol.
        let content = " defmodule Phoenix.Router do\n";
        assert!(chunk_defines_symbol(content, "Router"));
    }

    #[test]
    fn chunk_defines_symbol_sql_case_insensitive() {
        // The SQL keyword is matched case-insensitively in both spellings.
        assert!(chunk_defines_symbol(
            " create table users (id int)",
            "users"
        ));
        assert!(chunk_defines_symbol(
            " CREATE TABLE Users (id int)",
            "Users"
        ));
    }

    #[test]
    fn chunk_defines_symbol_negative() {
        // A usage without a definition keyword is not a definition.
        assert!(!chunk_defines_symbol("client.do_thing()", "Client"));
    }

    // ----- boost_symbol_definitions stem multiplier -----

    #[test]
    fn boost_symbol_definitions_stem_multiplier() {
        // Two chunks both define `Client`; one is in client.rs (stem match),
        // the other in unrelated.rs. The stem-matched chunk gets the 1.5x
        // bonus on top of the base ×3 multiplier.
        let chunks = vec![
            chunk("src/client.rs", "struct Client { /* ... */ }"),
            chunk("src/unrelated.rs", "struct Client { /* ... */ }"),
        ];
        let mut boosted: HashMap<usize, f32> = HashMap::from([(0, 1.0), (1, 1.0)]);
        boost_symbol_definitions(&mut boosted, "Client", 1.0, &chunks);
        // boost_unit = max_score (1.0) * 3.0 = 3.0
        // stem-matched chunk 0: 1.0 + 3.0 * 1.5 = 5.5
        // non-matched chunk 1: 1.0 + 3.0 = 4.0
        assert!((boosted[&0] - 5.5).abs() < 1e-6);
        assert!((boosted[&1] - 4.0).abs() < 1e-6);
    }

    // ----- boost_stem_matches -----

    #[test]
    fn boost_stem_matches_prefix() {
        // Query keyword "parse" should boost a chunk in "parser.rs" via
        // prefix overlap (shorter="parse", longer="parser", min-length 3
        // satisfied). This is the real morphological-variant shape;
        // Python semble's docstring example ("dependency" ↔ "dependencies")
        // is misleading because the two diverge at char 9.
        let chunks = vec![chunk("src/parser.rs", "fn run() {}")];
        let mut boosted: HashMap<usize, f32> = HashMap::from([(0, 1.0_f32)]);
        boost_stem_matches(&mut boosted, "parse json structure", 1.0, &chunks);
        assert!(boosted[&0] > 1.0, "expected stem-match boost on parser.rs");
    }

    // ----- boost_embedded_symbols half-strength -----

    #[test]
    fn boost_embedded_symbols_half_strength() {
        // Embedded symbol "MyClass" in an NL query boosts at half strength
        // vs a pure symbol query.
        let chunks = vec![chunk("src/myclass.rs", "struct MyClass {}")];
        let mut boosted: HashMap<usize, f32> = HashMap::from([(0, 1.0_f32)]);
        boost_embedded_symbols(&mut boosted, "how does MyClass handle errors", 1.0, &chunks);
        // boost_unit = 1.0 * 3.0 * 0.5 = 1.5
        // stem-matched chunk 0: 1.0 + 1.5 * 1.5 = 3.25
        assert!((boosted[&0] - 3.25).abs() < 1e-6, "got {}", boosted[&0]);
    }

    // ----- boost_multi_chunk_files (file coherence) -----

    // This test shares its name with the function under test, which is
    // why the call below is qualified with `super::`.
    #[test]
    fn boost_multi_chunk_files() {
        // Three chunks: two in foo.rs (sum 1.5), one in bar.rs (sum 1.0).
        // boost_unit = max_score (1.0) * 0.2 = 0.2
        // foo.rs file_sum 1.5; boost to its TOP chunk (idx 0) =
        //   0.2 * 1.5 / 1.5 = 0.2 → final 1.2.
        // bar.rs file_sum 1.0; boost to chunk 2 = 0.2 * 1.0/1.5 ≈ 0.133 → ~1.133.
        let chunks = vec![
            chunk("src/foo.rs", ""),
            chunk("src/foo.rs", ""),
            chunk("src/bar.rs", ""),
        ];
        let mut scores: HashMap<usize, f32> = HashMap::from([(0, 1.0), (1, 0.5), (2, 1.0)]);
        super::boost_multi_chunk_files(&mut scores, &chunks);
        assert!((scores[&0] - 1.2).abs() < 1e-6, "got {}", scores[&0]);
        assert!((scores[&1] - 0.5).abs() < 1e-6, "non-best chunk unchanged");
        let expected_bar = 1.0 + 0.2 * (1.0 / 1.5);
        assert!(
            (scores[&2] - expected_bar).abs() < 1e-6,
            "got {}, expected {}",
            scores[&2],
            expected_bar
        );
    }

    // ----- property test (symbol regex parity) -----

    #[test]
    fn property_symbol_regex_parity_python() {
        // These should be detected as symbol queries (matches Python).
        let symbols = &[
            "Client",
            "handle_request",
            "_private",
            "getX",
            "XMLParser",
            "foo::bar",
            "foo.bar.baz",
            "a->b",
            r"Foo\Bar",
            "__init__",
            "snake_case",
        ];
        for q in symbols {
            assert!(is_symbol_query(q), "expected symbol query: {q:?}");
        }
        // These should NOT be detected (NL).
        let non_symbols = &[
            "session",
            "retry",
            "authentication",
            "how does retry work",
            "user authentication flow",
            "hi",
        ];
        for q in non_symbols {
            assert!(!is_symbol_query(q), "expected NL: {q:?}");
        }
    }
}