//! codelens_engine/symbols/scoring.rs
//!
//! Symbol-relevance scoring for query matching, plus the v1.5 Phase 2e
//! sparse term-weighting helpers (env-gated coverage bonus).
1use super::types::SymbolInfo;
2
3// ── Zero-allocation ASCII case-insensitive helpers ──────────────────
4
/// ASCII case-insensitive substring search: true when `needle` occurs
/// anywhere inside `haystack`, ignoring ASCII case.
///
/// Deliberately allocation-free. The older implementation lowered the
/// haystack with `to_lowercase()` (one `String` per call); since code
/// identifiers in all 25 supported tree-sitter languages are ASCII,
/// byte-level ASCII folding is both correct and cheaper than Unicode
/// case mapping.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let hay = haystack.as_bytes();
    let pat = needle.as_bytes();
    match pat.len() {
        // Empty needle matches everything (mirrors `str::contains("")`).
        0 => true,
        n if n > hay.len() => false,
        // Slide a fixed-width window over every candidate start offset.
        n => (0..=hay.len() - n).any(|i| hay[i..i + n].eq_ignore_ascii_case(pat)),
    }
}
25
/// ASCII case-insensitive whole-string comparison (allocation-free).
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    a.len() == b.len()
        && a.bytes()
            .zip(b.bytes())
            .all(|(x, y)| x.eq_ignore_ascii_case(&y))
}
30
/// True when any query token is a common programming action verb.
///
/// Tokens arrive already lowercased (they come from `query_lower`), so
/// plain equality against the lowercase verb list is sufficient.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    const ACTION_VERBS: &[&str] = &[
        "find", "get", "search", "detect", "start", "run", "read", "write", "move", "change",
        "rename", "replace", "extract", "route", "embed", "build", "create", "delete", "update",
        "compute", "calculate", "apply", "handle", "parse", "index", "watch", "listen", "fetch",
        "send", "load", "save", "open", "close", "connect", "check", "validate", "verify",
        "transform", "convert", "process", "execute", "call", "invoke", "inline", "refactor",
        "analyze", "import", "export",
    ];
    // Non-empty intersection check; iteration direction doesn't matter.
    ACTION_VERBS.iter().any(|verb| tokens.contains(verb))
}
85
86/// Score a symbol's relevance to a query string.
87/// Returns None if no match, Some(1..=100) for match strength.
88///
89/// Accepts pre-computed `query_lower` to avoid repeated allocation
90/// when scoring many symbols against the same query.
91pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
92    let lower = query.to_lowercase();
93    let snake = lower.replace(|c: char| c.is_whitespace() || c == '-', "_");
94    score_symbol_with_lower(query, &lower, &snake, symbol)
95}
96
/// Inner scoring with pre-lowercased query and pre-computed joined-snake
/// form — call this from hot loops where both are invariant across
/// candidates.
///
/// Score tiers (checks run top to bottom; the first hit wins):
///   100  — name equals the full query (ASCII case-insensitive)
///    60  — name contains the full query
///    30  — signature contains the full query
///    20  — name_path contains the full query
///    80  — name equals the underscore-joined query
///    70  — joined query contains the (snake_case) name
///    65  — name contains the (multi-word) joined query
///  1..=63 — token hit-ratio score (max base 55) + kind boost (max 8)
///
/// `joined_snake` is the query with whitespace/hyphens replaced by
/// underscores, used for snake_case identifier matching (e.g.
/// "rename symbol" → "rename_symbol"). It is query-derived and
/// identical for every candidate, so computing it once in the caller
/// eliminates one String allocation per candidate in the hot loop.
pub(crate) fn score_symbol_with_lower(
    query: &str,
    query_lower: &str,
    joined_snake: &str,
    symbol: &SymbolInfo,
) -> Option<i32> {
    // Exact full-query match (no allocation needed)
    if symbol.name.eq_ignore_ascii_case(query) {
        return Some(100);
    }

    // ── Zero-alloc substring checks (replaces 4 × to_lowercase()) ──
    // All checks below use contains_ascii_ci / eq_ascii_ci instead of
    // allocating lowered Strings. Code identifiers are ASCII, so
    // ASCII case folding is correct and avoids one String per field.

    if contains_ascii_ci(&symbol.name, query_lower) {
        return Some(60);
    }
    if contains_ascii_ci(&symbol.signature, query_lower) {
        return Some(30);
    }
    if contains_ascii_ci(&symbol.name_path, query_lower) {
        return Some(20);
    }

    // Check if query tokens form the symbol name when joined with underscore
    // e.g. "rename symbol" → "rename_symbol" → exact match bonus
    // `joined_snake` is pre-computed by the caller to avoid one String
    // allocation per candidate in the hot loop.
    if eq_ascii_ci(&symbol.name, joined_snake) {
        return Some(80);
    }
    // Partial: symbol name is a subset of joined tokens
    // e.g. "move symbol to file" → joined = "move_symbol_to_file", contains "move_symbol" → 70
    // The `contains('_')` guard keeps single-word names from matching trivially.
    if contains_ascii_ci(joined_snake, &symbol.name) && symbol.name.contains('_') {
        return Some(70);
    }
    // Reverse: symbol name contains the joined tokens
    // e.g. "extract function" → "refactor_extract_function" contains "extract_function" → 65
    // Guard: only multi-word queries (joined form has an underscore) qualify.
    if contains_ascii_ci(&symbol.name, joined_snake) && joined_snake.contains('_') {
        return Some(65);
    }

    // Token-level matching: split query into words, score by hit ratio.
    // Single-character tokens (articles, stray letters) are dropped.
    let tokens: Vec<&str> = query_lower
        .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
        .filter(|t| t.len() >= 2)
        .collect();
    if tokens.is_empty() {
        return None;
    }

    // Token-level name matching: contains_ascii_ci already covers
    // CamelCase segments because every CamelCase segment is a contiguous
    // substring of the original name. The old split_camel_case() call
    // allocated Vec<char> + Vec<String> per candidate but could never
    // produce a hit that contains_ascii_ci missed. Removed in v1.6.2+
    // to eliminate the last per-candidate allocation in this function.

    // Count, per field, how many query tokens appear as substrings.
    let mut name_hits = 0i32;
    let mut sig_hits = 0i32;
    let mut path_hits = 0i32;
    for token in &tokens {
        if contains_ascii_ci(&symbol.name, token) {
            name_hits += 1;
        }
        if contains_ascii_ci(&symbol.signature, token) {
            sig_hits += 1;
        }
        if contains_ascii_ci(&symbol.file_path, token) {
            path_hits += 1;
        }
    }

    let total_tokens = tokens.len() as i32;
    if name_hits == 0 && sig_hits == 0 && path_hits == 0 {
        return None;
    }

    // Score formula: name hits dominate, sig/path are secondary
    // name_ratio: 0.0-1.0 portion of query tokens found in name
    // Boost for high name coverage (most tokens match the symbol name)
    let name_ratio = name_hits as f64 / total_tokens as f64;
    let sig_ratio = sig_hits as f64 / total_tokens as f64;

    let base_score = if name_hits > 0 {
        // Name hits: 15-55 (capped so token scoring stays below the
        // 60-point whole-query substring tier above).
        let base = (15.0 + name_ratio * 40.0) as i32;
        let sig_bonus = (sig_ratio * 5.0) as i32;
        (base + sig_bonus).min(55)
    } else if sig_hits > 0 {
        // Signature-only hits: 5-25.
        (5.0 + sig_ratio * 20.0) as i32
    } else {
        // Path-only: very weak signal, 1-5
        let path_ratio = path_hits as f64 / total_tokens as f64;
        (1.0 + path_ratio * 4.0).max(1.0) as i32
    };

    // Kind-aware boost: action queries prefer functions, noun queries prefer types.
    // Detects action intent by checking if any query token is a common verb.
    let kind_boost = if query_has_action_verb(&tokens) {
        match symbol.kind {
            super::types::SymbolKind::Function | super::types::SymbolKind::Method => 8,
            _ => 0,
        }
    } else {
        match symbol.kind {
            super::types::SymbolKind::Class
            | super::types::SymbolKind::Interface
            | super::types::SymbolKind::Enum => 5,
            _ => 0,
        }
    };

    Some(base_score + kind_boost)
}
222
/// Gate for the v1.5 Phase 2e sparse term weighting pass.
///
/// Resolution order:
/// 1. An explicit `CODELENS_RANK_SPARSE_TERM_WEIGHT` env var always
///    wins — truthy values are `1` / `true` / `yes` / `on`
///    (trimmed, ASCII case-insensitive); anything else is off.
/// 2. Otherwise, when built with the `semantic` feature, defer to
///    `crate::embedding::auto_sparse_should_enable()` for the
///    language-gated Phase 2j defaults. Note the Phase 2m split:
///    JS/TS stays auto-enabled for the Phase 2b/2c embedding hints but
///    auto-disables sparse weighting there, because recent JS/TS
///    measurements were negative-or-inert.
/// 3. Without the `semantic` feature the ranker is purely lexical and
///    has no sparse weighting path, so the default is `false`.
pub fn sparse_weighting_enabled() -> bool {
    match std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT") {
        Ok(raw) => {
            let flag = raw.trim().to_ascii_lowercase();
            flag == "1" || flag == "true" || flag == "yes" || flag == "on"
        }
        Err(_) => {
            // No explicit setting: fall through to the auto heuristic,
            // which lives behind the semantic feature because it keys
            // off the embedding runtime config.
            #[cfg(feature = "semantic")]
            {
                crate::embedding::auto_sparse_should_enable()
            }
            #[cfg(not(feature = "semantic"))]
            {
                false
            }
        }
    }
}
256
/// Maximum sparse coverage bonus added to the blended score when a query
/// reaches 100% term coverage against a symbol's `name + name_path +
/// signature` corpus. Override via `CODELENS_RANK_SPARSE_MAX` (clamped
/// to 5..=50); unset or unparsable values fall back to the default of 20.
///
/// Kept deliberately modest because the lexical score in
/// `score_symbol_with_lower` already reaches 55 for signature hits. The
/// sparse bonus is a *tie-breaker* — it re-orders the top-K after the
/// main scoring has selected them, not a replacement for the lexical
/// signal.
pub fn sparse_max_bonus() -> f64 {
    let configured = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        .and_then(|raw| raw.parse::<u32>().ok());
    f64::from(match configured {
        Some(n) => n.clamp(5, 50),
        None => 20,
    })
}
274
/// Minimum query-term coverage (a fraction, derived from a percentage
/// clamped to 10..=90) a symbol must reach before it receives any sparse
/// bonus. Below the threshold the bonus is `0.0`; between the threshold
/// and 100% it rises linearly to `sparse_max_bonus()`.
///
/// The default of 60% was a conservative first guess. An initial 4-arm
/// A/B on the 89-query self dataset found the bonus never fired at 60
/// because most NL queries only share 1-2 discriminative tokens with
/// their target symbol's `name + name_path + signature` corpus.
/// Override via `CODELENS_RANK_SPARSE_THRESHOLD` for tuning experiments.
pub fn sparse_threshold() -> f64 {
    let pct = match std::env::var("CODELENS_RANK_SPARSE_THRESHOLD") {
        Ok(raw) => raw.parse::<u32>().map_or(60, |n| n.clamp(10, 90)),
        Err(_) => 60,
    };
    f64::from(pct) / 100.0
}
293
/// English/pseudo-stopwords that add no discriminative signal when used
/// as query tokens. Intentionally short — real NL stopwords lists contain
/// ~150 entries, but most of them never show up in code-search queries.
/// We only need the ones that regularly dilute query coverage ("find the
/// function that opens a file" — `the` and `that` are the problem).
/// Entries are lowercase; `sparse_query_tokens` receives pre-lowercased
/// queries, so plain equality is sufficient.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this", "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have", "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use", "using", "used", "gets", "set", "sets", "new", "let",
];

/// Return true when `token` is found in `corpus` as a whole word — that is,
/// the characters surrounding each occurrence are NOT alphanumeric or `_`.
///
/// Phase 2e uses this instead of `str::contains` so that a query token like
/// `"parse"` matches `parse_json` (snake separator) but not `parser` or
/// `parseRequest` (those are already caught by the lexical `contains` path,
/// which is where we want them scored — not via the sparse bonus).
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    if token.is_empty() || corpus.len() < token.len() {
        return false;
    }
    let corpus_bytes = corpus.as_bytes();
    let token_len = token.len();
    let mut start = 0;
    while start + token_len <= corpus_bytes.len() {
        // Find next occurrence from `start`
        let remaining = &corpus[start..];
        let Some(local_idx) = remaining.find(token) else {
            return false;
        };
        let abs = start + local_idx;
        let end = abs + token_len;
        let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
        let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
        if before_ok && after_ok {
            return true;
        }
        // BUG FIX: advance past the *whole* first character of the
        // rejected match. The previous `start = abs + 1` could land in
        // the middle of a multi-byte UTF-8 character whenever `token`
        // starts with one (sparse tokens keep non-ASCII alphanumerics),
        // making `&corpus[start..]` panic on a non-char-boundary index.
        start = abs + corpus[abs..].chars().next().map_or(1, char::len_utf8);
    }
    false
}

/// Byte-level helper: true when the byte is part of an ASCII word
/// ([A-Za-z0-9]). `_` is deliberately excluded so that snake_case
/// separators count as word boundaries — e.g. `"parse"` should match
/// `"parse_json_body"` but not `"parser"`. Non-ASCII bytes (UTF-8
/// lead/continuation) default to "word" so multi-byte identifiers stay
/// conservative (no false positives from partial UTF-8 matches).
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || (b & 0x80) != 0
}

/// Tokenize `query_lower` into distinct discriminative terms for the
/// Phase 2e sparse pass:
/// - split on any non-alphanumeric character
/// - drop tokens shorter than 3 characters
/// - drop tokens in `SPARSE_STOPWORDS`
/// - deduplicate while preserving order
///
/// Returns `Vec<String>` (not `Vec<&str>`) so callers can own the tokens
/// independently of the query lifetime.
///
/// Dedup is a linear scan over `out` rather than a `HashSet`: queries
/// yield a handful of tokens at most, and this allocates exactly one
/// `String` per unique token (the old HashSet version allocated every
/// surviving token twice — once for the set, once for the output).
pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    for raw in query_lower.split(|c: char| !c.is_alphanumeric()) {
        if raw.len() < 3 || SPARSE_STOPWORDS.contains(&raw) {
            continue;
        }
        if !out.iter().any(|seen| seen == raw) {
            out.push(raw.to_string());
        }
    }
    out
}
373
374/// Text-first variant of the Phase 2e sparse coverage bonus. Does NOT
375/// take a `SymbolInfo` so that callers outside the engine crate (notably
376/// the MCP `get_ranked_context` post-process) can feed it whatever fields
377/// are actually available on their entry type.
378///
379/// `query_lower` MUST already be lower-cased — the function does not
380/// re-lowercase so that callers with a long query can amortise the
381/// allocation outside the loop. Pass the *original user query*, not the
382/// MCP-expanded retrieval string: the expansion adds dozens of
383/// derivative tokens (snake_case, CamelCase, alias groups) that dilute
384/// the coverage ratio below any reasonable threshold — that dilution
385/// was the exact reason the first 4-arm pilot measured zero effect.
386///
387/// Returns `0.0` whenever:
388/// - the query has fewer than 2 discriminative tokens after stopword
389///   filtering (single-token queries already resolve well via the
390///   lexical path — `sparse_query_tokens` deduplicates + drops <3 chars),
391/// - the coverage ratio is below `sparse_threshold()` (default 0.6).
392///
393/// Between the threshold and 100% coverage the bonus rises linearly
394/// from 0 to `sparse_max_bonus()`. The caller is responsible for
395/// gating the whole call with `sparse_weighting_enabled()` so test
396/// code can run the inner logic deterministically.
397pub fn sparse_coverage_bonus_from_fields(
398    query_lower: &str,
399    name: &str,
400    name_path: &str,
401    signature: &str,
402    file_path: &str,
403) -> f64 {
404    let tokens = sparse_query_tokens(query_lower);
405    if tokens.len() < 2 {
406        return 0.0;
407    }
408    // Build the corpus directly as lowercase to avoid a second String
409    // allocation. Previously this was corpus + corpus.to_lowercase() =
410    // 2 allocations per candidate; now it's 1.
411    let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
412    let mut corpus_lower = String::with_capacity(cap);
413    for field in [name, name_path, signature, file_path] {
414        if !corpus_lower.is_empty() {
415            corpus_lower.push(' ');
416        }
417        for ch in field.chars() {
418            corpus_lower.push(ch.to_ascii_lowercase());
419        }
420    }
421
422    let matched = tokens
423        .iter()
424        .filter(|t| has_whole_word(&corpus_lower, t))
425        .count() as f64;
426    let total = tokens.len() as f64;
427    let coverage = matched / total;
428
429    let threshold = sparse_threshold();
430    if coverage < threshold {
431        return 0.0;
432    }
433    // threshold → 0, 100% → sparse_max_bonus(), linear between. Guard
434    // against threshold == 1.0 (would divide by zero) by clamping.
435    let span = (1.0 - threshold).max(0.01);
436    (coverage - threshold) / span * sparse_max_bonus()
437}
438
/// `SymbolInfo`-based convenience wrapper kept for the existing unit
/// tests. New call sites should call `sparse_coverage_bonus_from_fields`
/// directly with whatever fields they have.
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    let SymbolInfo {
        name,
        name_path,
        signature,
        file_path,
        ..
    } = symbol;
    sparse_coverage_bonus_from_fields(query_lower, name, name_path, signature, file_path)
}
451
#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::Mutex;

    // Serialises the tests that mutate process-global env vars so they
    // cannot race each other under the default multi-threaded test runner.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    // Minimal SymbolInfo fixture: Function kind, name_path mirroring the
    // name, everything else zeroed/defaulted.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
            end_line: 0,
        }
    }

    /// With no explicit or auto env overrides, the gate must be OFF.
    #[test]
    fn sparse_weighting_gated_off_by_default() {
        // `into_inner` recovers the guard even when a previous test
        // panicked while holding the lock (poisoned mutex).
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous_explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
        let previous_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let previous_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        let enabled = sparse_weighting_enabled();
        // Restore prior env BEFORE asserting so a failure doesn't leak
        // mutated state into other tests.
        unsafe {
            match previous_explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            match previous_auto {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match previous_lang {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
        assert!(!enabled, "sparse weighting gate leaked");
    }

    /// Phase 2m split: the auto gate enables sparse weighting for rust
    /// but disables it for JS/TS — and an explicit env setting must
    /// override the auto heuristic in both directions.
    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous_explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
        let previous_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let previous_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        // Arm 1: no explicit setting, auto + rust → enabled.
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            sparse_weighting_enabled(),
            "auto+rust should enable sparse weighting"
        );

        // Arm 2: no explicit setting, auto + typescript → disabled.
        unsafe {
            std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            !sparse_weighting_enabled(),
            "auto+typescript should disable sparse weighting after Phase 2m split"
        );

        // Arm 3: explicit on beats the JS/TS auto-off.
        unsafe {
            std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        // Arm 4: explicit off beats the rust auto-on.
        unsafe {
            std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );

        // Restore whatever was set before the test ran.
        unsafe {
            match previous_explicit {
                Some(value) => std::env::set_var("CODELENS_RANK_SPARSE_TERM_WEIGHT", value),
                None => std::env::remove_var("CODELENS_RANK_SPARSE_TERM_WEIGHT"),
            }
            match previous_auto {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match previous_lang {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }

    /// Tokenizer drops stopwords and tokens shorter than 3 chars.
    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        // "find", "function", "opens", "file" survive. "the", "that", "a" dropped.
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    /// Tokenizer dedupes while preserving first-seen order.
    #[test]
    fn sparse_query_tokens_deduplicates() {
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    /// Word-boundary semantics: snake separators and whitespace count
    /// as boundaries; embedded substrings do not match.
    #[test]
    fn has_whole_word_respects_word_boundaries() {
        // snake_case separator counts as non-word → match
        assert!(has_whole_word("parse_json_body", "parse"));
        // substring inside a larger identifier → no match
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        // leading/trailing whitespace
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        // empty token / short corpus
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    /// Single-token queries short-circuit to a zero bonus — they are
    /// already well served by the lexical path.
    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Single token after stopword filtering — short-circuit to 0.
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    /// Coverage below the default 60% threshold yields no bonus.
    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Two query tokens: "parse", "rename". Only "parse" matches → 50% coverage.
        // 50% < 60% threshold → bonus 0.
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    /// Full coverage reaches the default maximum bonus of 20.
    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        // Tokens: "parse", "json", "body". All three match.
        // coverage = 1.0 → bonus = (1.0 - 0.6) / 0.4 * 20 = 20
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        // Allow small float tolerance for default max = 20
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    /// Whole-word matching must not award bonuses for embedded
    /// substrings — that precision is the whole point of Phase 2e.
    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        // "parser" should NOT match token "parse" via the sparse path —
        // word-boundary precision is the whole point of Phase 2e.
        // Two tokens ("parse", "json"), only "json" matches via the
        // signature → 50% coverage → 0 bonus (below threshold).
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}