//! codelens_engine/symbols/scoring.rs — symbol relevance scoring and
//! Phase 2e sparse term-weighting helpers.
1use super::types::SymbolInfo;
2
3// ── Zero-allocation ASCII case-insensitive helpers ──────────────────
4
/// ASCII case-insensitive substring search. Returns true if `needle`
/// appears anywhere in `haystack` ignoring ASCII case differences.
///
/// Replaces the earlier `haystack.to_lowercase().contains(...)` pattern,
/// which paid one `String` allocation per call. Identifiers in all 25
/// supported tree-sitter languages are ASCII, so ASCII-only folding is
/// both correct and cheaper than Unicode `to_lowercase`.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let hay = haystack.as_bytes();
    let pat = needle.as_bytes();
    // Empty needle matches anything, including the empty haystack.
    if pat.is_empty() {
        return true;
    }
    if hay.len() < pat.len() {
        return false;
    }
    // Compare the needle against every candidate window, case-folded
    // bytewise; no intermediate String is ever built.
    (0..=hay.len() - pat.len()).any(|i| hay[i..i + pat.len()].eq_ignore_ascii_case(pat))
}
25
/// ASCII case-insensitive full-string equality (no allocation).
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
30
/// Check if any query token is a common programming action verb.
///
/// `ACTION_VERBS` is kept lexicographically sorted so each lookup is a
/// `binary_search` (O(log n)) instead of the previous linear scan over
/// all 48 entries; a debug assertion guards the sort invariant so new
/// entries added out of order fail fast in test builds.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    // NOTE: must stay sorted — binary_search below depends on it.
    const ACTION_VERBS: &[&str] = &[
        "analyze", "apply", "build", "calculate", "call", "change", "check", "close",
        "compute", "connect", "convert", "create", "delete", "detect", "embed", "execute",
        "export", "extract", "fetch", "find", "get", "handle", "import", "index", "inline",
        "invoke", "listen", "load", "move", "open", "parse", "process", "read", "refactor",
        "rename", "replace", "route", "run", "save", "search", "send", "start", "transform",
        "update", "validate", "verify", "watch", "write",
    ];
    debug_assert!(
        ACTION_VERBS.windows(2).all(|w| w[0] < w[1]),
        "ACTION_VERBS must stay sorted for binary_search"
    );
    tokens.iter().any(|t| ACTION_VERBS.binary_search(t).is_ok())
}
85
/// Score a symbol's relevance to a query string.
/// Returns `None` if no match, `Some(1..=100)` for match strength.
///
/// Convenience entry point: lower-cases the query and derives its
/// snake_case form (whitespace/hyphens → `_`) on every call. In hot
/// loops that score many candidates against one query, pre-compute both
/// once and call `score_symbol_with_lower` directly — that saves two
/// String allocations per candidate.
pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
    let lower = query.to_lowercase();
    // "rename symbol" → "rename_symbol": the joined form used for
    // snake_case identifier matching.
    let snake = lower.replace(|c: char| c.is_whitespace() || c == '-', "_");
    score_symbol_with_lower(query, &lower, &snake, symbol)
}
96
97/// Inner scoring with pre-lowercased query and pre-computed joined-snake
98/// form — call this from hot loops where both are invariant across
99/// candidates.
100///
101/// `joined_snake` is the query with whitespace/hyphens replaced by
102/// underscores, used for snake_case identifier matching (e.g.
103/// "rename symbol" → "rename_symbol"). It is query-derived and
104/// identical for every candidate, so computing it once in the caller
105/// eliminates one String allocation per candidate in the hot loop.
106pub(crate) fn score_symbol_with_lower(
107    query: &str,
108    query_lower: &str,
109    joined_snake: &str,
110    symbol: &SymbolInfo,
111) -> Option<i32> {
112    // Exact full-query match (no allocation needed)
113    if symbol.name.eq_ignore_ascii_case(query) {
114        return Some(100);
115    }
116
117    // ── Zero-alloc substring checks (replaces 4 × to_lowercase()) ──
118    // All checks below use contains_ascii_ci / eq_ascii_ci instead of
119    // allocating lowered Strings. Code identifiers are ASCII, so
120    // ASCII case folding is correct and avoids one String per field.
121
122    if contains_ascii_ci(&symbol.name, query_lower) {
123        return Some(60);
124    }
125    if contains_ascii_ci(&symbol.signature, query_lower) {
126        return Some(30);
127    }
128    if contains_ascii_ci(&symbol.name_path, query_lower) {
129        return Some(20);
130    }
131
132    // Check if query tokens form the symbol name when joined with underscore
133    // e.g. "rename symbol" → "rename_symbol" → exact match bonus
134    // `joined_snake` is pre-computed by the caller to avoid one String
135    // allocation per candidate in the hot loop.
136    if eq_ascii_ci(&symbol.name, joined_snake) {
137        return Some(80);
138    }
139    // Partial: symbol name is a subset of joined tokens
140    // e.g. "move symbol to file" → joined = "move_symbol_to_file", contains "move_symbol" → 70
141    if contains_ascii_ci(joined_snake, &symbol.name) && symbol.name.contains('_') {
142        return Some(70);
143    }
144    // Reverse: symbol name contains the joined tokens
145    // e.g. "extract function" → "refactor_extract_function" contains "extract_function" → 65
146    if contains_ascii_ci(&symbol.name, joined_snake) && joined_snake.contains('_') {
147        return Some(65);
148    }
149
150    // Token-level matching: split query into words, score by hit ratio
151    let tokens: Vec<&str> = query_lower
152        .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
153        .filter(|t| t.len() >= 2)
154        .collect();
155    if tokens.is_empty() {
156        return None;
157    }
158
159    // Token-level name matching: contains_ascii_ci already covers
160    // CamelCase segments because every CamelCase segment is a contiguous
161    // substring of the original name. The old split_camel_case() call
162    // allocated Vec<char> + Vec<String> per candidate but could never
163    // produce a hit that contains_ascii_ci missed. Removed in v1.6.2+
164    // to eliminate the last per-candidate allocation in this function.
165
166    let mut name_hits = 0i32;
167    let mut sig_hits = 0i32;
168    let mut path_hits = 0i32;
169    for token in &tokens {
170        if contains_ascii_ci(&symbol.name, token) {
171            name_hits += 1;
172        }
173        if contains_ascii_ci(&symbol.signature, token) {
174            sig_hits += 1;
175        }
176        if contains_ascii_ci(&symbol.file_path, token) {
177            path_hits += 1;
178        }
179    }
180
181    let total_tokens = tokens.len() as i32;
182    if name_hits == 0 && sig_hits == 0 && path_hits == 0 {
183        return None;
184    }
185
186    // Score formula: name hits dominate, sig/path are secondary
187    // name_ratio: 0.0-1.0 portion of query tokens found in name
188    // Boost for high name coverage (most tokens match the symbol name)
189    let name_ratio = name_hits as f64 / total_tokens as f64;
190    let sig_ratio = sig_hits as f64 / total_tokens as f64;
191
192    let base_score = if name_hits > 0 {
193        let base = (15.0 + name_ratio * 40.0) as i32;
194        let sig_bonus = (sig_ratio * 5.0) as i32;
195        (base + sig_bonus).min(55)
196    } else if sig_hits > 0 {
197        (5.0 + sig_ratio * 20.0) as i32
198    } else {
199        // Path-only: very weak signal, 1-5
200        let path_ratio = path_hits as f64 / total_tokens as f64;
201        (1.0 + path_ratio * 4.0).max(1.0) as i32
202    };
203
204    // Kind-aware boost: action queries prefer functions, noun queries prefer types.
205    // Detects action intent by checking if any query token is a common verb.
206    let kind_boost = if query_has_action_verb(&tokens) {
207        match symbol.kind {
208            super::types::SymbolKind::Function | super::types::SymbolKind::Method => 8,
209            _ => 0,
210        }
211    } else {
212        match symbol.kind {
213            super::types::SymbolKind::Class
214            | super::types::SymbolKind::Interface
215            | super::types::SymbolKind::Enum => 5,
216            _ => 0,
217        }
218    };
219
220    Some(base_score + kind_boost)
221}
222
223/// Return true when v1.5 Phase 2e sparse term weighting is enabled via
224/// `CODELENS_RANK_SPARSE_TERM_WEIGHT=1` (or `true`/`yes`/`on`).
225///
226/// Default OFF, mirroring the Phase 2b/2c opt-in policy. Projects that
227/// already opt into the Phase 2b/2c embedding hints can stack this knob
228/// to tighten top-1 ordering without another index rebuild — the sparse
229/// pass reads `SymbolInfo` fields that are already populated on the
230/// ranking path.
231///
232/// v1.5 Phase 2j: when no explicit env var is set, fall through to
233/// `crate::embedding::auto_sparse_should_enable()` for language-gated
234/// defaults. This intentionally diverges from `nl_tokens_enabled` and
235/// `api_calls_enabled`: Phase 2m keeps JS/TS auto-enabled for Phase 2b/2c
236/// but auto-disables sparse weighting there because recent JS/TS
237/// measurements were negative-or-inert. Explicit env always wins.
238pub fn sparse_weighting_enabled() -> bool {
239    if let Ok(raw) = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
240        let lowered = raw.trim().to_ascii_lowercase();
241        return matches!(lowered.as_str(), "1" | "true" | "yes" | "on");
242    }
243    crate::embedding::auto_sparse_should_enable()
244}
245
/// Maximum sparse coverage bonus added to the blended score when a query
/// reaches 100% term coverage against a symbol's corpus. Override via
/// `CODELENS_RANK_SPARSE_MAX` (clamped to 5..=50); unset or unparsable
/// values fall back to the default of 20.
///
/// Deliberately modest: the lexical score in `score_symbol_with_lower`
/// already reaches 55 for signature hits, so the sparse bonus acts as a
/// tie-breaker that re-orders the top-K — not a replacement signal.
pub fn sparse_max_bonus() -> f64 {
    let configured = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        .and_then(|raw| raw.parse::<u32>().ok());
    match configured {
        Some(n) => f64::from(n.clamp(5, 50)),
        None => 20.0,
    }
}
263
/// Minimum query-term coverage (expressed here as a 0.0..=1.0 fraction,
/// configured as a percentage 10..=90) a symbol must reach before it
/// receives any sparse bonus. Below the threshold the bonus is `0.0`;
/// between the threshold and 100% it rises linearly up to
/// `sparse_max_bonus()`.
///
/// The default of 60 was a conservative first guess. An initial 4-arm
/// A/B on the 89-query self dataset found the bonus never fired at 60
/// because most NL queries share only 1–2 discriminative tokens with
/// their target symbol's corpus. Override via
/// `CODELENS_RANK_SPARSE_THRESHOLD` for tuning experiments.
pub fn sparse_threshold() -> f64 {
    let percent = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        .and_then(|raw| raw.parse::<u32>().ok())
        .map_or(60, |n| n.clamp(10, 90));
    f64::from(percent) / 100.0
}
282
/// English/pseudo-stopwords that add no discriminative signal when used
/// as query tokens. Intentionally short — real NL stopwords lists contain
/// ~150 entries, but most of them never show up in code-search queries.
/// We only need the ones that regularly dilute query coverage ("find the
/// function that opens a file" — `the` and `that` are the problem).
///
/// NOTE(review): `set`/`sets`/`gets` are listed but bare `get` is not —
/// presumably deliberate since `get` is a strong action verb in
/// `query_has_action_verb`, but confirm the asymmetry is intentional.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this", "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have", "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use", "using", "used", "gets", "set", "sets", "new", "let",
];
293
/// Return true when `token` is found in `corpus` as a whole word — that is,
/// the characters surrounding each occurrence are NOT alphanumeric or `_`.
///
/// Phase 2e uses this instead of `str::contains` so that a query token like
/// `"parse"` matches `parse_json` (snake separator) but not `parser` or
/// `parseRequest` (would already be caught by the lexical `contains` path,
/// which is where we want them scored — not via the sparse bonus).
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    if token.is_empty() || corpus.len() < token.len() {
        return false;
    }
    let corpus_bytes = corpus.as_bytes();
    let token_len = token.len();
    let mut start = 0;
    while start + token_len <= corpus_bytes.len() {
        // Find the next occurrence at or after `start`.
        let remaining = &corpus[start..];
        let Some(local_idx) = remaining.find(token) else {
            return false;
        };
        let abs = start + local_idx;
        let end = abs + token_len;
        let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
        let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
        if before_ok && after_ok {
            return true;
        }
        // Advance past the whole first char of this occurrence. The old
        // `abs + 1` could land inside a multi-byte UTF-8 char whenever
        // `token` starts with one (query tokens are split on Unicode
        // `is_alphanumeric`, so non-ASCII tokens are possible), and the
        // subsequent `&corpus[start..]` slice would panic on the
        // non-boundary index.
        let first_char_len = corpus[abs..].chars().next().map_or(1, char::len_utf8);
        start = abs + first_char_len;
    }
    false
}

/// Byte-level helper: true when the byte is part of an ASCII word
/// ([A-Za-z0-9]). `_` is deliberately excluded so that snake_case
/// separators count as word boundaries — e.g. `"parse"` should match
/// `"parse_json_body"` but not `"parser"`. Non-ASCII bytes (UTF-8
/// lead/continuation) default to "word" so multi-byte identifiers stay
/// conservative (no false positives from partial UTF-8 matches).
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || (b & 0x80) != 0
}
335
336/// Tokenize `query_lower` into distinct discriminative terms for the
337/// Phase 2e sparse pass:
338/// - split on any non-alphanumeric character
339/// - drop tokens shorter than 3 characters
340/// - drop tokens in `SPARSE_STOPWORDS`
341/// - deduplicate while preserving order
342///
343/// Returns `Vec<String>` (not `Vec<&str>`) so callers can own the tokens
344/// independently of the query lifetime — the rank loop already has to
345/// outlive the borrow anyway.
346pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
347    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
348    let mut out: Vec<String> = Vec::new();
349    for raw in query_lower.split(|c: char| !c.is_alphanumeric()) {
350        if raw.len() < 3 {
351            continue;
352        }
353        if SPARSE_STOPWORDS.contains(&raw) {
354            continue;
355        }
356        if seen.insert(raw.to_string()) {
357            out.push(raw.to_string());
358        }
359    }
360    out
361}
362
363/// Text-first variant of the Phase 2e sparse coverage bonus. Does NOT
364/// take a `SymbolInfo` so that callers outside the engine crate (notably
365/// the MCP `get_ranked_context` post-process) can feed it whatever fields
366/// are actually available on their entry type.
367///
368/// `query_lower` MUST already be lower-cased — the function does not
369/// re-lowercase so that callers with a long query can amortise the
370/// allocation outside the loop. Pass the *original user query*, not the
371/// MCP-expanded retrieval string: the expansion adds dozens of
372/// derivative tokens (snake_case, CamelCase, alias groups) that dilute
373/// the coverage ratio below any reasonable threshold — that dilution
374/// was the exact reason the first 4-arm pilot measured zero effect.
375///
376/// Returns `0.0` whenever:
377/// - the query has fewer than 2 discriminative tokens after stopword
378///   filtering (single-token queries already resolve well via the
379///   lexical path — `sparse_query_tokens` deduplicates + drops <3 chars),
380/// - the coverage ratio is below `sparse_threshold()` (default 0.6).
381///
382/// Between the threshold and 100% coverage the bonus rises linearly
383/// from 0 to `sparse_max_bonus()`. The caller is responsible for
384/// gating the whole call with `sparse_weighting_enabled()` so test
385/// code can run the inner logic deterministically.
386pub fn sparse_coverage_bonus_from_fields(
387    query_lower: &str,
388    name: &str,
389    name_path: &str,
390    signature: &str,
391    file_path: &str,
392) -> f64 {
393    let tokens = sparse_query_tokens(query_lower);
394    if tokens.len() < 2 {
395        return 0.0;
396    }
397    // Build the corpus directly as lowercase to avoid a second String
398    // allocation. Previously this was corpus + corpus.to_lowercase() =
399    // 2 allocations per candidate; now it's 1.
400    let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
401    let mut corpus_lower = String::with_capacity(cap);
402    for field in [name, name_path, signature, file_path] {
403        if !corpus_lower.is_empty() {
404            corpus_lower.push(' ');
405        }
406        for ch in field.chars() {
407            corpus_lower.push(ch.to_ascii_lowercase());
408        }
409    }
410
411    let matched = tokens
412        .iter()
413        .filter(|t| has_whole_word(&corpus_lower, t))
414        .count() as f64;
415    let total = tokens.len() as f64;
416    let coverage = matched / total;
417
418    let threshold = sparse_threshold();
419    if coverage < threshold {
420        return 0.0;
421    }
422    // threshold → 0, 100% → sparse_max_bonus(), linear between. Guard
423    // against threshold == 1.0 (would divide by zero) by clamping.
424    let span = (1.0 - threshold).max(0.01);
425    (coverage - threshold) / span * sparse_max_bonus()
426}
427
/// Back-compat wrapper kept for the existing `SymbolInfo`-based unit
/// tests; forwards the four corpus fields (name, name_path, signature,
/// file_path) unchanged. New call sites should prefer
/// `sparse_coverage_bonus_from_fields` directly.
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    sparse_coverage_bonus_from_fields(
        query_lower,
        &symbol.name,
        &symbol.name_path,
        &symbol.signature,
        &symbol.file_path,
    )
}
440
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::Mutex;

    // Serializes the env-mutating tests below: the process environment
    // is global state, so concurrent test threads would race on
    // set_var/remove_var and observe each other's values.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    // Minimal Function-kind fixture: `name_path` mirrors `name`, the
    // file path is a fixed placeholder, and all remaining fields are
    // inert defaults.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        // `into_inner()` on poison: a previously panicked env test must
        // not wedge every later one behind a poisoned lock.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // Snapshot all three gate variables so they can be restored
        // exactly — tests must not leak env state into each other.
        let previous_explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
        let previous_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let previous_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        let enabled = sparse_weighting_enabled();
        // Restore BEFORE asserting, so a failing assert doesn't skip
        // cleanup.
        unsafe {
            match previous_explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            match previous_auto {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match previous_lang {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
        assert!(!enabled, "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous_explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
        let previous_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let previous_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        // Arm 1: no explicit gate, auto-enabled for Rust → ON.
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            sparse_weighting_enabled(),
            "auto+rust should enable sparse weighting"
        );

        // Arm 2: no explicit gate, auto for TypeScript → OFF (Phase 2m
        // split: JS/TS measurements were negative-or-inert).
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        // Arm 3: explicit ON beats the JS/TS auto-off.
        unsafe {
            std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        // Arm 4: explicit OFF beats the Rust auto-on.
        unsafe {
            std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );

        // Restore the pre-test environment.
        unsafe {
            match previous_explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            match previous_auto {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match previous_lang {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        // "find", "function", "opens", "file" survive. "the", "that", "a" dropped.
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        // First-seen order is preserved; repeats collapse.
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        // snake_case separator counts as non-word → match
        assert!(has_whole_word("parse_json_body", "parse"));
        // substring inside a larger identifier → no match
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        // leading/trailing whitespace
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        // empty token / short corpus
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Single token after stopword filtering — short-circuit to 0.
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Two query tokens: "parse", "rename". Only "parse" matches → 50% coverage.
        // 50% < 60% threshold → bonus 0.
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        // Tokens: "parse", "json", "body". All three match.
        // coverage = 1.0 → bonus = (1.0 - 0.6) / 0.4 * 20 = 20
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        // Allow small float tolerance for default max = 20
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        // "parser" should NOT match token "parse" via the sparse path —
        // word-boundary precision is the whole point of Phase 2e.
        // Two tokens ("parse", "json"), only "json" matches via the
        // signature → 50% coverage → 0 bonus (below threshold).
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}