Skip to main content

codelens_engine/symbols/
scoring.rs

1use super::types::SymbolInfo;
2
3// ── Zero-allocation ASCII case-insensitive helpers ──────────────────
4
/// ASCII case-insensitive substring search: true when `needle` occurs
/// anywhere in `haystack`, comparing bytes with ASCII case folding.
///
/// Deliberately allocation-free — the previous pattern lowered both
/// strings (`haystack.to_lowercase()` + `contains`) and paid one
/// `String` allocation per call. Identifiers in all 25 supported
/// tree-sitter languages are ASCII, so byte-wise ASCII folding is both
/// correct and faster than Unicode `to_lowercase`.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let needle_bytes = needle.as_bytes();
    // An empty needle matches everything, including an empty haystack.
    if needle_bytes.is_empty() {
        return true;
    }
    let hay_bytes = haystack.as_bytes();
    if hay_bytes.len() < needle_bytes.len() {
        return false;
    }
    // Compare every needle-sized slice of the haystack with ASCII folding.
    let last_start = hay_bytes.len() - needle_bytes.len();
    (0..=last_start)
        .any(|i| hay_bytes[i..i + needle_bytes.len()].eq_ignore_ascii_case(needle_bytes))
}
25
/// Full-string equality ignoring ASCII case differences.
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    // Length check first so the byte-wise zip cannot terminate early on
    // a shorter string and report a false positive.
    a.len() == b.len() && a.bytes().zip(b.bytes()).all(|(x, y)| x.eq_ignore_ascii_case(&y))
}
30
/// True when any query token is a common programming action verb.
///
/// Used by the scorer to detect "action intent" queries ("find the
/// function that…") so function/method symbols can be boosted over
/// types for those queries.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    // Flat membership table; lookup is linear but the list is tiny and
    // the tokens slice is typically 2-6 entries.
    const ACTION_VERBS: &[&str] = &[
        "find", "get", "search", "detect", "start", "run", "read", "write",
        "move", "change", "rename", "replace", "extract", "route", "embed",
        "build", "create", "delete", "update", "compute", "calculate",
        "apply", "handle", "parse", "index", "watch", "listen", "fetch",
        "send", "load", "save", "open", "close", "connect", "check",
        "validate", "verify", "transform", "convert", "process", "execute",
        "call", "invoke", "inline", "refactor", "analyze", "import", "export",
    ];
    tokens
        .iter()
        .any(|token| ACTION_VERBS.iter().any(|verb| verb == token))
}
85
/// Score a symbol's relevance to a query string.
/// Returns None if no match, Some(1..=100) for match strength.
///
/// Convenience wrapper: lowercases the query and derives the
/// underscore-joined form here, then delegates to
/// `score_symbol_with_lower`. When scoring many symbols against the
/// same query, call `score_symbol_with_lower` directly with the
/// pre-computed forms to avoid these two String allocations per call.
pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
    let lower = query.to_lowercase();
    // Join query words with '_' so "rename symbol" can match `rename_symbol`.
    let snake = lower.replace(|c: char| c.is_whitespace() || c == '-', "_");
    score_symbol_with_lower(query, &lower, &snake, symbol)
}
96
/// Inner scoring with pre-lowercased query and pre-computed joined-snake
/// form — call this from hot loops where both are invariant across
/// candidates.
///
/// `joined_snake` is the query with whitespace/hyphens replaced by
/// underscores, used for snake_case identifier matching (e.g.
/// "rename symbol" → "rename_symbol"). It is query-derived and
/// identical for every candidate, so computing it once in the caller
/// eliminates one String allocation per candidate in the hot loop.
///
/// Score tiers, checked strictly in this order (first hit wins):
/// - 100: symbol name equals the raw query (ASCII case-insensitive)
/// - 60 / 30 / 20: query is a substring of name / signature / name_path
/// - 80: name equals the underscore-joined query
/// - 70: joined query contains the (snake_case) symbol name
/// - 65: symbol name contains the (multi-word) joined query
/// - otherwise: token-hit-ratio formula (≤ 55 base) plus a kind boost
/// - None: no token hits anywhere
pub(crate) fn score_symbol_with_lower(
    query: &str,
    query_lower: &str,
    joined_snake: &str,
    symbol: &SymbolInfo,
) -> Option<i32> {
    // Exact full-query match (no allocation needed)
    if symbol.name.eq_ignore_ascii_case(query) {
        return Some(100);
    }

    // ── Zero-alloc substring checks (replaces 4 × to_lowercase()) ──
    // All checks below use contains_ascii_ci / eq_ascii_ci instead of
    // allocating lowered Strings. Code identifiers are ASCII, so
    // ASCII case folding is correct and avoids one String per field.

    if contains_ascii_ci(&symbol.name, query_lower) {
        return Some(60);
    }
    if contains_ascii_ci(&symbol.signature, query_lower) {
        return Some(30);
    }
    if contains_ascii_ci(&symbol.name_path, query_lower) {
        return Some(20);
    }

    // Check if query tokens form the symbol name when joined with underscore
    // e.g. "rename symbol" → "rename_symbol" → exact match bonus.
    // This can only fire after the substring tiers above because the
    // whitespace form never substring-matches a snake_case identifier.
    // `joined_snake` is pre-computed by the caller to avoid one String
    // allocation per candidate in the hot loop.
    if eq_ascii_ci(&symbol.name, joined_snake) {
        return Some(80);
    }
    // Partial: symbol name is a subset of joined tokens
    // e.g. "move symbol to file" → joined = "move_symbol_to_file", contains "move_symbol" → 70
    // The `contains('_')` guard keeps single-word names (too weak a
    // signal) from matching here.
    if contains_ascii_ci(joined_snake, &symbol.name) && symbol.name.contains('_') {
        return Some(70);
    }
    // Reverse: symbol name contains the joined tokens
    // e.g. "extract function" → "refactor_extract_function" contains "extract_function" → 65
    if contains_ascii_ci(&symbol.name, joined_snake) && joined_snake.contains('_') {
        return Some(65);
    }

    // Token-level matching: split query into words, score by hit ratio.
    // Tokens shorter than 2 chars carry no signal and are dropped.
    let tokens: Vec<&str> = query_lower
        .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
        .filter(|t| t.len() >= 2)
        .collect();
    if tokens.is_empty() {
        return None;
    }

    // Token-level name matching: contains_ascii_ci already covers
    // CamelCase segments because every CamelCase segment is a contiguous
    // substring of the original name. The old split_camel_case() call
    // allocated Vec<char> + Vec<String> per candidate but could never
    // produce a hit that contains_ascii_ci missed. Removed in v1.6.2+
    // to eliminate the last per-candidate allocation in this function.

    let mut name_hits = 0i32;
    let mut sig_hits = 0i32;
    let mut path_hits = 0i32;
    for token in &tokens {
        if contains_ascii_ci(&symbol.name, token) {
            name_hits += 1;
        }
        if contains_ascii_ci(&symbol.signature, token) {
            sig_hits += 1;
        }
        if contains_ascii_ci(&symbol.file_path, token) {
            path_hits += 1;
        }
    }

    let total_tokens = tokens.len() as i32;
    if name_hits == 0 && sig_hits == 0 && path_hits == 0 {
        return None;
    }

    // Score formula: name hits dominate, sig/path are secondary
    // name_ratio: 0.0-1.0 portion of query tokens found in name
    // Boost for high name coverage (most tokens match the symbol name)
    let name_ratio = name_hits as f64 / total_tokens as f64;
    let sig_ratio = sig_hits as f64 / total_tokens as f64;

    let base_score = if name_hits > 0 {
        // 15..=55: floor of 15 for any name hit, capped below the 60
        // full-substring tier so token matches never outrank it.
        let base = (15.0 + name_ratio * 40.0) as i32;
        let sig_bonus = (sig_ratio * 5.0) as i32;
        (base + sig_bonus).min(55)
    } else if sig_hits > 0 {
        // Signature-only: 5..=25.
        (5.0 + sig_ratio * 20.0) as i32
    } else {
        // Path-only: very weak signal, 1-5
        let path_ratio = path_hits as f64 / total_tokens as f64;
        (1.0 + path_ratio * 4.0).max(1.0) as i32
    };

    // Kind-aware boost: action queries prefer functions, noun queries prefer types.
    // Detects action intent by checking if any query token is a common verb.
    let kind_boost = if query_has_action_verb(&tokens) {
        match symbol.kind {
            super::types::SymbolKind::Function | super::types::SymbolKind::Method => 8,
            _ => 0,
        }
    } else {
        match symbol.kind {
            super::types::SymbolKind::Class
            | super::types::SymbolKind::Interface
            | super::types::SymbolKind::Enum => 5,
            _ => 0,
        }
    };

    Some(base_score + kind_boost)
}
222
/// Return true when v1.5 Phase 2e sparse term weighting is enabled via
/// `CODELENS_RANK_SPARSE_TERM_WEIGHT=1` (or `true`/`yes`/`on`).
///
/// Default OFF, mirroring the Phase 2b/2c opt-in policy. Projects that
/// already opt into the Phase 2b/2c embedding hints can stack this knob
/// to tighten top-1 ordering without another index rebuild — the sparse
/// pass reads `SymbolInfo` fields that are already populated on the
/// ranking path.
///
/// v1.5 Phase 2j: with no explicit env var set, fall through to
/// `crate::embedding::auto_sparse_should_enable()` for language-gated
/// defaults. This intentionally diverges from `nl_tokens_enabled` and
/// `api_calls_enabled`: Phase 2m keeps JS/TS auto-enabled for Phase
/// 2b/2c but auto-disables sparse weighting there because recent JS/TS
/// measurements were negative-or-inert. An explicit env var always wins.
pub fn sparse_weighting_enabled() -> bool {
    match std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
        // Explicit setting always decides, truthy or not.
        Ok(raw) => matches!(
            raw.trim().to_ascii_lowercase().as_str(),
            "1" | "true" | "yes" | "on"
        ),
        Err(_) => {
            // The auto-on heuristic lives under the semantic feature because
            // it keys off the embedding runtime config. When semantic is
            // disabled the ranker falls back to pure lexical scoring, which
            // has no sparse weighting path, so the default is `false`.
            #[cfg(feature = "semantic")]
            {
                crate::embedding::auto_sparse_should_enable()
            }
            #[cfg(not(feature = "semantic"))]
            {
                false
            }
        }
    }
}
256
/// Maximum sparse coverage bonus added to the blended score when a query
/// reaches 100% term coverage against a symbol's `name + name_path +
/// signature` corpus. Override via `CODELENS_RANK_SPARSE_MAX` (clamped
/// to 5..=50).
///
/// Kept deliberately modest (default 20) because the existing lexical
/// score in `score_symbol_with_lower` already reaches 55 for signature
/// hits. The sparse bonus is a *tie-breaker* — it re-orders the top-K
/// after the main scoring has selected them, not a replacement for the
/// lexical signal.
pub fn sparse_max_bonus() -> f64 {
    let configured = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        .and_then(|raw| raw.parse::<u32>().ok());
    // Unparseable or missing values fall back to the default of 20.
    f64::from(match configured {
        Some(n) => n.clamp(5, 50),
        None => 20,
    })
}
274
/// Minimum query-term coverage (as a percentage, 10..=90) a symbol must
/// reach before it receives any sparse bonus. Below this threshold the
/// bonus is `0.0`. Between the threshold and 100% the bonus rises
/// linearly from `0.0` to `sparse_max_bonus()`.
///
/// The default of 60 was a conservative first guess. An initial 4-arm
/// A/B on the 89-query self dataset found that the bonus never fired at
/// 60 because most NL queries only share 1–2 discriminative tokens with
/// their target symbol's `name + name_path + signature` corpus.
/// Override via `CODELENS_RANK_SPARSE_THRESHOLD` for tuning experiments.
pub fn sparse_threshold() -> f64 {
    // Percentage from env (clamped), falling back to 60 when unset or
    // unparseable; returned as a 0.0-1.0 ratio.
    let pct = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        .and_then(|raw| raw.parse::<u32>().ok())
        .map_or(60, |n| n.clamp(10, 90));
    f64::from(pct) / 100.0
}
293
/// English/pseudo-stopwords that add no discriminative signal when used
/// as query tokens. Intentionally short — real NL stopwords lists contain
/// ~150 entries, but most of them never show up in code-search queries.
/// We only need the ones that regularly dilute query coverage ("find the
/// function that opens a file" — `the` and `that` are the problem).
///
/// NOTE(review): all entries are ≥3 chars because `sparse_query_tokens`
/// already drops shorter tokens before consulting this list; keep any
/// future additions ≥3 chars or they will be dead entries.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this", "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have", "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use", "using", "used", "gets", "set", "sets", "new", "let",
];
304
/// Return true when `token` is found in `corpus` as a whole word — that is,
/// the characters surrounding each occurrence are NOT alphanumeric or `_`.
///
/// Phase 2e uses this instead of `str::contains` so that a query token like
/// `"parse"` matches `parse_json` (snake separator) but not `parser` or
/// `parseRequest` (would already be caught by the lexical `contains` path,
/// which is where we want them scored — not via the sparse bonus).
///
/// Both arguments are expected pre-lowercased by the caller; the search
/// itself is case-sensitive.
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    if token.is_empty() || corpus.len() < token.len() {
        return false;
    }
    let corpus_bytes = corpus.as_bytes();
    let token_len = token.len();
    let mut start = 0;
    while start + token_len <= corpus_bytes.len() {
        // Find next occurrence from `start`
        let Some(local_idx) = corpus[start..].find(token) else {
            return false;
        };
        let abs = start + local_idx;
        let end = abs + token_len;
        let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
        let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
        if before_ok && after_ok {
            return true;
        }
        // Advance past this occurrence. BUGFIX: the previous `start = abs + 1`
        // could land mid-way through a multi-byte UTF-8 char whenever the
        // token starts with a non-ASCII char (sparse tokens keep any
        // alphanumeric chars, including CJK), and the next `&corpus[start..]`
        // slice would then panic. Snap forward to the next char boundary.
        start = abs + 1;
        while start < corpus_bytes.len() && !corpus.is_char_boundary(start) {
            start += 1;
        }
    }
    false
}

/// Byte-level helper: true when the byte is part of an ASCII word
/// ([A-Za-z0-9]). `_` is deliberately excluded so that snake_case
/// separators count as word boundaries — e.g. `"parse"` should match
/// `"parse_json_body"` but not `"parser"`. Non-ASCII bytes (UTF-8
/// continuation) default to "word" so multi-byte identifiers stay
/// conservative (no false positives from partial UTF-8 matches).
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || (b & 0x80) != 0
}
346
347/// Tokenize `query_lower` into distinct discriminative terms for the
348/// Phase 2e sparse pass:
349/// - split on any non-alphanumeric character
350/// - drop tokens shorter than 3 characters
351/// - drop tokens in `SPARSE_STOPWORDS`
352/// - deduplicate while preserving order
353///
354/// Returns `Vec<String>` (not `Vec<&str>`) so callers can own the tokens
355/// independently of the query lifetime — the rank loop already has to
356/// outlive the borrow anyway.
357pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
358    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
359    let mut out: Vec<String> = Vec::new();
360    for raw in query_lower.split(|c: char| !c.is_alphanumeric()) {
361        if raw.len() < 3 {
362            continue;
363        }
364        if SPARSE_STOPWORDS.contains(&raw) {
365            continue;
366        }
367        if seen.insert(raw.to_string()) {
368            out.push(raw.to_string());
369        }
370    }
371    out
372}
373
374/// Text-first variant of the Phase 2e sparse coverage bonus. Does NOT
375/// take a `SymbolInfo` so that callers outside the engine crate (notably
376/// the MCP `get_ranked_context` post-process) can feed it whatever fields
377/// are actually available on their entry type.
378///
379/// `query_lower` MUST already be lower-cased — the function does not
380/// re-lowercase so that callers with a long query can amortise the
381/// allocation outside the loop. Pass the *original user query*, not the
382/// MCP-expanded retrieval string: the expansion adds dozens of
383/// derivative tokens (snake_case, CamelCase, alias groups) that dilute
384/// the coverage ratio below any reasonable threshold — that dilution
385/// was the exact reason the first 4-arm pilot measured zero effect.
386///
387/// Returns `0.0` whenever:
388/// - the query has fewer than 2 discriminative tokens after stopword
389///   filtering (single-token queries already resolve well via the
390///   lexical path — `sparse_query_tokens` deduplicates + drops <3 chars),
391/// - the coverage ratio is below `sparse_threshold()` (default 0.6).
392///
393/// Between the threshold and 100% coverage the bonus rises linearly
394/// from 0 to `sparse_max_bonus()`. The caller is responsible for
395/// gating the whole call with `sparse_weighting_enabled()` so test
396/// code can run the inner logic deterministically.
397pub fn sparse_coverage_bonus_from_fields(
398    query_lower: &str,
399    name: &str,
400    name_path: &str,
401    signature: &str,
402    file_path: &str,
403) -> f64 {
404    let tokens = sparse_query_tokens(query_lower);
405    if tokens.len() < 2 {
406        return 0.0;
407    }
408    // Build the corpus directly as lowercase to avoid a second String
409    // allocation. Previously this was corpus + corpus.to_lowercase() =
410    // 2 allocations per candidate; now it's 1.
411    let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
412    let mut corpus_lower = String::with_capacity(cap);
413    for field in [name, name_path, signature, file_path] {
414        if !corpus_lower.is_empty() {
415            corpus_lower.push(' ');
416        }
417        for ch in field.chars() {
418            corpus_lower.push(ch.to_ascii_lowercase());
419        }
420    }
421
422    let matched = tokens
423        .iter()
424        .filter(|t| has_whole_word(&corpus_lower, t))
425        .count() as f64;
426    let total = tokens.len() as f64;
427    let coverage = matched / total;
428
429    let threshold = sparse_threshold();
430    if coverage < threshold {
431        return 0.0;
432    }
433    // threshold → 0, 100% → sparse_max_bonus(), linear between. Guard
434    // against threshold == 1.0 (would divide by zero) by clamping.
435    let span = (1.0 - threshold).max(0.01);
436    (coverage - threshold) / span * sparse_max_bonus()
437}
438
/// Back-compat wrapper kept for the existing `SymbolInfo`-based unit
/// tests. New call sites should prefer `sparse_coverage_bonus_from_fields`.
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    // Field order matters: it fixes the corpus layout the whole-word
    // matcher scans (name, name_path, signature, file_path).
    sparse_coverage_bonus_from_fields(
        query_lower,
        &symbol.name,
        &symbol.name_path,
        &symbol.signature,
        &symbol.file_path,
    )
}
451
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::Mutex;

    // Serializes env-var-mutating tests: the gate functions read process
    // environment, so concurrent test threads would race each other.
    // Poisoned locks are recovered (`into_inner`) so one failing test
    // doesn't cascade into spurious failures elsewhere.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    // Minimal SymbolInfo fixture: a Function whose name_path mirrors its
    // name, with every field the sparse corpus reads populated.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // Save-and-restore every env var the gate reads so this test
        // leaves the ambient environment untouched for other tests.
        let previous_explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
        let previous_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let previous_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        let enabled = sparse_weighting_enabled();
        // Restore BEFORE asserting so a failure cannot leak env state.
        unsafe {
            match previous_explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            match previous_auto {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match previous_lang {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
        assert!(!enabled, "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous_explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
        let previous_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let previous_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        // Auto gate, rust: Phase 2j language default should enable.
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            sparse_weighting_enabled(),
            "auto+rust should enable sparse weighting"
        );

        // Auto gate, typescript: Phase 2m split auto-disables JS/TS.
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        // Explicit opt-in beats the auto-off language default.
        unsafe {
            std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        // Explicit opt-out beats the auto-on language default.
        unsafe {
            std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );

        unsafe {
            match previous_explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            match previous_auto {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match previous_lang {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        // "find", "function", "opens", "file" survive. "the", "that", "a" dropped.
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        // First-occurrence order is preserved across duplicates.
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        // snake_case separator counts as non-word → match
        assert!(has_whole_word("parse_json_body", "parse"));
        // substring inside a larger identifier → no match
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        // leading/trailing whitespace
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        // empty token / short corpus
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Single token after stopword filtering — short-circuit to 0.
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Two query tokens: "parse", "rename". Only "parse" matches → 50% coverage.
        // 50% < 60% threshold → bonus 0.
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        // Tokens: "parse", "json", "body". All three match.
        // coverage = 1.0 → bonus = (1.0 - 0.6) / 0.4 * 20 = 20
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        // Allow small float tolerance for default max = 20
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        // "parser" should NOT match token "parse" via the sparse path —
        // word-boundary precision is the whole point of Phase 2e.
        // Two tokens ("parse", "json"), only "json" matches via the
        // signature → 50% coverage → 0 bonus (below threshold).
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}