codelens-engine 1.9.45

Harness-native Rust MCP server for code intelligence with generated surface governance, hybrid retrieval, and mutation-gated workflows
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
use super::types::SymbolInfo;

// ── Zero-allocation ASCII case-insensitive helpers ──────────────────

/// Reports whether `needle` occurs anywhere inside `haystack`, treating
/// ASCII letters that differ only in case as equal.
///
/// The comparison runs over the raw bytes of both strings and performs no
/// allocation, unlike the `haystack.to_lowercase().contains(..)` pattern
/// it replaces (one `String` per call). Only ASCII letters are
/// case-folded; every other byte — including the bytes of non-ASCII
/// characters — has to match exactly. That is sufficient for the ASCII
/// identifiers this ranker is designed around.
///
/// An empty `needle` matches every haystack; a `needle` longer than
/// `haystack` never matches.
pub(crate) fn contains_ascii_ci(haystack: &str, needle: &str) -> bool {
    let hay = haystack.as_bytes();
    let pat = needle.as_bytes();
    match pat.len() {
        // The empty pattern is a substring of everything.
        0 => true,
        // A pattern longer than the text cannot fit anywhere.
        len if len > hay.len() => false,
        // Slide a pattern-sized window across the text.
        len => hay
            .windows(len)
            .any(|candidate| candidate.eq_ignore_ascii_case(pat)),
    }
}

/// Whole-string equality that ignores ASCII case.
///
/// Two strings are equal when they have the same byte length and every
/// pair of bytes matches after ASCII case folding. Non-ASCII bytes must
/// be identical.
fn eq_ascii_ci(a: &str, b: &str) -> bool {
    a.len() == b.len()
        && a.bytes()
            .zip(b.bytes())
            .all(|(left, right)| left.eq_ignore_ascii_case(&right))
}

/// Returns `true` when at least one of `tokens` is a common programming
/// action verb (for example `find`, `rename`, `parse`).
///
/// The comparison is exact and case-sensitive, so callers are expected to
/// pass lower-cased tokens. An empty slice yields `false`.
fn query_has_action_verb(tokens: &[&str]) -> bool {
    tokens.iter().copied().any(|word| {
        matches!(
            word,
            "find"
                | "get"
                | "search"
                | "detect"
                | "start"
                | "run"
                | "read"
                | "write"
                | "move"
                | "change"
                | "rename"
                | "replace"
                | "extract"
                | "route"
                | "embed"
                | "build"
                | "create"
                | "delete"
                | "update"
                | "compute"
                | "calculate"
                | "apply"
                | "handle"
                | "parse"
                | "index"
                | "watch"
                | "listen"
                | "fetch"
                | "send"
                | "load"
                | "save"
                | "open"
                | "close"
                | "connect"
                | "check"
                | "validate"
                | "verify"
                | "transform"
                | "convert"
                | "process"
                | "execute"
                | "call"
                | "invoke"
                | "inline"
                | "refactor"
                | "analyze"
                | "import"
                | "export"
        )
    })
}

/// Scores how relevant `symbol` is to a free-form `query`.
///
/// Convenience entry point for one-off scoring: it lower-cases the query,
/// derives the snake-joined form (whitespace and `-` replaced by `_`) and
/// delegates to `score_symbol_with_lower`. When many symbols are scored
/// against the same query, compute both strings once and call
/// `score_symbol_with_lower` directly to avoid repeating the allocations.
///
/// Returns `None` when nothing matches, otherwise `Some(1..=100)` where
/// 100 is an exact (ASCII case-insensitive) name match.
pub(crate) fn score_symbol(query: &str, symbol: &SymbolInfo) -> Option<i32> {
    let query_lower = query.to_lowercase();
    let joined_snake: String = query_lower
        .chars()
        .map(|c| if c.is_whitespace() || c == '-' { '_' } else { c })
        .collect();
    score_symbol_with_lower(query, &query_lower, &joined_snake, symbol)
}

/// Core relevance scorer, intended for hot loops where the query-derived
/// strings are computed once and reused for every candidate.
///
/// * `query` — the query as typed by the user.
/// * `query_lower` — `query` lower-cased.
/// * `joined_snake` — `query_lower` with whitespace and hyphens replaced
///   by underscores (e.g. "rename symbol" → "rename_symbol"), used for
///   snake_case identifier matching.
/// * `symbol` — the candidate being scored.
///
/// Tiers are evaluated in this order and the first one that applies wins:
///
/// | score | condition |
/// |-------|-----------|
/// | 100 | name equals `query` |
/// | 60 | name contains `query_lower` |
/// | 30 | signature contains `query_lower` |
/// | 20 | name path contains `query_lower` |
/// | 80 | name equals `joined_snake` |
/// | 70 | `joined_snake` contains the name and the name has a `_` |
/// | 65 | name contains `joined_snake` and `joined_snake` has a `_` |
/// | token score | per-token hit ratio plus a kind boost, see below |
///
/// All comparisons are ASCII case-insensitive and allocation-free apart
/// from the token `Vec`. Returns `None` when no tier matches.
pub(crate) fn score_symbol_with_lower(
    query: &str,
    query_lower: &str,
    joined_snake: &str,
    symbol: &SymbolInfo,
) -> Option<i32> {
    let name: &str = &symbol.name;
    let signature: &str = &symbol.signature;
    let name_path: &str = &symbol.name_path;
    let file_path: &str = &symbol.file_path;

    // Exact full-query match on the name.
    if name.eq_ignore_ascii_case(query) {
        return Some(100);
    }

    // Whole-query substring tiers: the strongest field that contains the
    // query decides the score.
    let substring_tiers: [(&str, i32); 3] = [(name, 60), (signature, 30), (name_path, 20)];
    for (field, score) in substring_tiers.iter() {
        if contains_ascii_ci(field, query_lower) {
            return Some(*score);
        }
    }

    // Snake-joined tiers.
    // "rename symbol" → "rename_symbol" equals the name.
    if eq_ascii_ci(name, joined_snake) {
        return Some(80);
    }
    // The name is a snake_case fragment of the joined query, e.g.
    // "move symbol to file" → "move_symbol_to_file" contains "move_symbol".
    if name.contains('_') && contains_ascii_ci(joined_snake, name) {
        return Some(70);
    }
    // The joined query is a fragment of the name, e.g.
    // "extract function" → "refactor_extract_function".
    if joined_snake.contains('_') && contains_ascii_ci(name, joined_snake) {
        return Some(65);
    }

    // Token-level matching: split the query into words of at least two
    // bytes and score by how many of them hit each field.
    let tokens: Vec<&str> = query_lower
        .split(|c: char| c == '_' || c == '-' || c.is_whitespace())
        .filter(|tok| tok.len() > 1)
        .collect();
    if tokens.is_empty() {
        return None;
    }

    // A plain substring test also covers CamelCase segments, because each
    // segment is a contiguous slice of the name; no separate camel-case
    // split (and its per-candidate allocations) is required.
    let count_hits = |field: &str| -> i32 {
        tokens
            .iter()
            .filter(|tok| contains_ascii_ci(field, tok))
            .count() as i32
    };
    let name_hits = count_hits(name);
    let sig_hits = count_hits(signature);
    let path_hits = count_hits(file_path);

    if [name_hits, sig_hits, path_hits].iter().all(|&hits| hits == 0) {
        return None;
    }

    let token_count = tokens.len() as i32;
    let ratio_of = |hits: i32| f64::from(hits) / f64::from(token_count);
    let name_ratio = ratio_of(name_hits);
    let sig_ratio = ratio_of(sig_hits);

    // Name hits dominate; signature and path hits are secondary signals.
    let base_score = match (name_hits > 0, sig_hits > 0) {
        (true, _) => {
            // 15..=55 from name coverage, plus up to 5 from the
            // signature, capped at 55 overall.
            let from_name = (15.0 + name_ratio * 40.0) as i32;
            let from_sig = (sig_ratio * 5.0) as i32;
            std::cmp::min(from_name + from_sig, 55)
        }
        (false, true) => (5.0 + sig_ratio * 20.0) as i32,
        (false, false) => {
            // Path-only: very weak signal, 1-5.
            let path_ratio = ratio_of(path_hits);
            (1.0 + path_ratio * 4.0).max(1.0) as i32
        }
    };

    // Kind-aware boost: queries containing an action verb favour
    // callables, all other queries favour type-like symbols.
    let wants_callable = query_has_action_verb(&tokens);
    let kind_boost = match symbol.kind {
        super::types::SymbolKind::Function | super::types::SymbolKind::Method
            if wants_callable =>
        {
            8
        }
        super::types::SymbolKind::Class
        | super::types::SymbolKind::Interface
        | super::types::SymbolKind::Enum
            if !wants_callable =>
        {
            5
        }
        _ => 0,
    };

    Some(base_score + kind_boost)
}

/// Tells the ranker whether the v1.5 Phase 2e sparse term weighting pass
/// should run.
///
/// Resolution order:
///
/// 1. If `CODELENS_RANK_SPARSE_TERM_WEIGHT` is set (and valid Unicode),
///    it decides alone: after trimming, `1`, `true`, `yes` or `on` (any
///    ASCII case) mean enabled, every other value means disabled.
/// 2. Otherwise, with the `semantic` feature compiled in, the answer comes
///    from `crate::embedding::auto_sparse_should_enable()` (v1.5 Phase 2j
///    language-gated defaults). This intentionally diverges from
///    `nl_tokens_enabled` and `api_calls_enabled`: Phase 2m keeps JS/TS
///    auto-enabled for Phase 2b/2c but auto-disables sparse weighting
///    there because recent JS/TS measurements were negative-or-inert.
/// 3. Without the `semantic` feature the default is `false`.
///
/// Projects that already opt into the Phase 2b/2c embedding hints can
/// stack this knob without another index rebuild — the sparse pass reads
/// `SymbolInfo` fields that are already populated on the ranking path.
pub fn sparse_weighting_enabled() -> bool {
    let explicit = std::env::var("CODELENS_RANK_SPARSE_TERM_WEIGHT").ok();
    if let Some(raw) = explicit {
        // An explicit setting always wins over the automatic heuristic.
        let flag = raw.trim();
        return ["1", "true", "yes", "on"]
            .iter()
            .any(|accepted| flag.eq_ignore_ascii_case(accepted));
    }
    // The automatic heuristic is tied to the embedding runtime
    // configuration and therefore only exists under the semantic feature.
    // Without it the ranker is purely lexical and has no sparse weighting
    // path, hence `false`.
    #[cfg(feature = "semantic")]
    {
        crate::embedding::auto_sparse_should_enable()
    }
    #[cfg(not(feature = "semantic"))]
    {
        false
    }
}

/// Maximum sparse coverage bonus added to the blended score when a query
/// reaches 100% term coverage against a symbol's `name + name_path +
/// signature + file_path` corpus.
///
/// Override via `CODELENS_RANK_SPARSE_MAX`. The value is trimmed, parsed
/// as an unsigned integer and clamped to `5..=50`. An unset, non-Unicode
/// or unparsable value falls back to the default of 20. Trimming matches
/// the sibling gate `sparse_weighting_enabled`, so a value such as
/// `" 30 "` is honoured rather than silently ignored.
///
/// Kept deliberately modest because the token-level lexical score in
/// `score_symbol_with_lower` already reaches 55 on name hits. The sparse
/// bonus is a *tie-breaker* — it re-orders the top-K after the main
/// scoring has selected them, not a replacement for the lexical signal.
pub fn sparse_max_bonus() -> f64 {
    const DEFAULT_MAX: u32 = 20;
    let max = std::env::var("CODELENS_RANK_SPARSE_MAX")
        .ok()
        // Tolerate surrounding whitespace/newlines from shells and .env files.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(DEFAULT_MAX, |n| n.clamp(5, 50));
    f64::from(max)
}

/// Minimum query-term coverage, returned as a fraction (`0.10..=0.90`), a
/// symbol must reach before it receives any sparse bonus. Below this
/// threshold the bonus is `0.0`. Between the threshold and 100% the bonus
/// rises linearly from `0.0` to `sparse_max_bonus()`.
///
/// Override via `CODELENS_RANK_SPARSE_THRESHOLD` for tuning experiments.
/// The value is a whole-number percentage: it is trimmed, parsed as an
/// unsigned integer and clamped to `10..=90`. An unset, non-Unicode or
/// unparsable value falls back to the default of 60. Trimming matches
/// the sibling gate `sparse_weighting_enabled`.
///
/// The default of 60 was a conservative first guess. An initial 4-arm
/// A/B on the 89-query self dataset found that the bonus never fired at
/// 60 because most NL queries only share 1–2 discriminative tokens with
/// their target symbol's `name + name_path + signature + file_path`
/// corpus.
pub fn sparse_threshold() -> f64 {
    const DEFAULT_PERCENT: u32 = 60;
    let percent = std::env::var("CODELENS_RANK_SPARSE_THRESHOLD")
        .ok()
        // Tolerate surrounding whitespace/newlines from shells and .env files.
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .map_or(DEFAULT_PERCENT, |n| n.clamp(10, 90));
    f64::from(percent) / 100.0
}

/// English/pseudo-stopwords that add no discriminative signal when used
/// as query tokens. Intentionally short — real NL stopword lists contain
/// ~150 entries, but most of them never show up in code-search queries.
/// We only need the ones that regularly dilute query coverage ("find the
/// function that opens a file" — `the` and `that` are the problem).
///
/// Entries are lower-case and are compared by exact equality against the
/// tokens produced in `sparse_query_tokens`, which already discards
/// tokens shorter than three bytes — hence no one- or two-letter words
/// are listed here.
///
/// NOTE(review): `gets`, `set` and `sets` are listed while `get` is not;
/// presumably deliberate, confirm before extending the list.
const SPARSE_STOPWORDS: &[&str] = &[
    "the", "for", "with", "from", "that", "this", "into", "onto", "over", "not", "and", "any",
    "all", "are", "was", "were", "has", "have", "had", "how", "what", "when", "where", "which",
    "who", "why", "but", "its", "can", "use", "using", "used", "gets", "set", "sets", "new", "let",
];

/// Return true when `token` is found in `corpus` as a whole word — that
/// is, the bytes immediately before and after at least one occurrence are
/// not word bytes as defined by `is_word_byte` (ASCII alphanumerics and
/// any non-ASCII byte). The start and end of `corpus` count as
/// boundaries, and so does `_`.
///
/// Phase 2e uses this instead of `str::contains` so that a query token like
/// `"parse"` matches `parse_json` (snake separator) but not `parser` or
/// `parseRequest` (would already be caught by the lexical `contains` path,
/// which is where we want them scored — not via the sparse bonus).
///
/// The comparison is case-sensitive; callers lower-case both sides first.
/// An empty `token`, or one longer than `corpus`, never matches.
pub fn has_whole_word(corpus: &str, token: &str) -> bool {
    if token.is_empty() || corpus.len() < token.len() {
        return false;
    }
    let corpus_bytes = corpus.as_bytes();
    // After a rejected occurrence the scan resumes one *character* past
    // its start. Stepping by a single byte would land inside a multi-byte
    // UTF-8 sequence whenever the token begins with a non-ASCII character,
    // and slicing `corpus` there panics.
    let step = token.chars().next().map_or(1, char::len_utf8);
    let mut start = 0;
    // `start` is always a char boundary no greater than `corpus.len()`:
    // it is either 0 or the start of a match plus the length of the
    // match's first character.
    while let Some(local_idx) = corpus[start..].find(token) {
        let abs = start + local_idx;
        let end = abs + token.len();
        let before_ok = abs == 0 || !is_word_byte(corpus_bytes[abs - 1]);
        let after_ok = end == corpus_bytes.len() || !is_word_byte(corpus_bytes[end]);
        if before_ok && after_ok {
            return true;
        }
        start = abs + step;
    }
    false
}

/// Byte-level helper: true when the byte is part of an ASCII word
/// ([A-Za-z0-9]). `_` is deliberately excluded so that snake_case
/// separators count as word boundaries — e.g. `"parse"` should match
/// `"parse_json_body"` but not `"parser"`. Non-ASCII bytes (any byte with
/// the high bit set, i.e. every byte of a multi-byte UTF-8 sequence)
/// default to "word" so multi-byte identifiers stay conservative (no
/// false positives from matches adjacent to non-ASCII characters).
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || (b & 0x80) != 0
}

/// Breaks `query_lower` into the distinct, discriminative terms used by
/// the Phase 2e sparse pass.
///
/// Processing steps, in order:
/// - split on every character that is not alphanumeric,
/// - discard pieces shorter than 3 bytes (3 characters for ASCII input),
/// - discard pieces listed in `SPARSE_STOPWORDS`,
/// - keep only the first occurrence of each remaining piece, preserving
///   the order in which they appear in the query.
///
/// The result is `Vec<String>` rather than `Vec<&str>` so callers can own
/// the tokens independently of the query's lifetime.
pub fn sparse_query_tokens(query_lower: &str) -> Vec<String> {
    // Tracks which terms were already emitted; borrowing from the query
    // is enough here because the set does not outlive this call.
    let mut seen = std::collections::HashSet::new();
    query_lower
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| word.len() >= 3 && !SPARSE_STOPWORDS.contains(word))
        .filter(|word| seen.insert(*word))
        .map(str::to_string)
        .collect()
}

/// Text-first variant of the Phase 2e sparse coverage bonus. Does NOT
/// take a `SymbolInfo` so that callers outside the engine crate (notably
/// the MCP `get_ranked_context` post-process) can feed it whatever fields
/// are actually available on their entry type.
///
/// `query_lower` MUST already be lower-cased — the function does not
/// re-lowercase so that callers with a long query can amortise the
/// allocation outside the loop. Pass the *original user query*, not the
/// MCP-expanded retrieval string: the expansion adds dozens of
/// derivative tokens (snake_case, CamelCase, alias groups) that dilute
/// the coverage ratio below any reasonable threshold — that dilution
/// was the exact reason the first 4-arm pilot measured zero effect.
///
/// Returns `0.0` whenever:
/// - the query has fewer than 2 discriminative tokens after stopword
///   filtering (single-token queries already resolve well via the
///   lexical path — `sparse_query_tokens` deduplicates + drops <3 chars),
/// - the coverage ratio is below `sparse_threshold()` (default 0.6).
///
/// Between the threshold and 100% coverage the bonus rises linearly
/// from 0 to `sparse_max_bonus()`. The caller is responsible for
/// gating the whole call with `sparse_weighting_enabled()` so test
/// code can run the inner logic deterministically.
pub fn sparse_coverage_bonus_from_fields(
    query_lower: &str,
    name: &str,
    name_path: &str,
    signature: &str,
    file_path: &str,
) -> f64 {
    let tokens = sparse_query_tokens(query_lower);
    if tokens.len() < 2 {
        return 0.0;
    }
    // Build the corpus directly as lowercase to avoid a second String
    // allocation. Previously this was corpus + corpus.to_lowercase() =
    // 2 allocations per candidate; now it's 1.
    let cap = name.len() + name_path.len() + signature.len() + file_path.len() + 3;
    let mut corpus_lower = String::with_capacity(cap);
    for field in [name, name_path, signature, file_path] {
        if !corpus_lower.is_empty() {
            corpus_lower.push(' ');
        }
        for ch in field.chars() {
            corpus_lower.push(ch.to_ascii_lowercase());
        }
    }

    let matched = tokens
        .iter()
        .filter(|t| has_whole_word(&corpus_lower, t))
        .count() as f64;
    let total = tokens.len() as f64;
    let coverage = matched / total;

    let threshold = sparse_threshold();
    if coverage < threshold {
        return 0.0;
    }
    // threshold → 0, 100% → sparse_max_bonus(), linear between. Guard
    // against threshold == 1.0 (would divide by zero) by clamping.
    let span = (1.0 - threshold).max(0.01);
    (coverage - threshold) / span * sparse_max_bonus()
}

/// Test-only adapter that feeds a `SymbolInfo`'s `name`, `name_path`,
/// `signature` and `file_path` into `sparse_coverage_bonus_from_fields`.
///
/// Retained for the existing `SymbolInfo`-based unit tests; new call
/// sites should call `sparse_coverage_bonus_from_fields` directly.
#[cfg(test)]
pub(crate) fn sparse_coverage_bonus(query_lower: &str, symbol: &SymbolInfo) -> f64 {
    let SymbolInfo {
        name,
        name_path,
        signature,
        file_path,
        ..
    } = symbol;
    sparse_coverage_bonus_from_fields(query_lower, name, name_path, signature, file_path)
}

#[cfg(test)]
mod tests {
    use super::super::types::{SymbolInfo, SymbolKind, SymbolProvenance};
    use super::*;
    use std::sync::Mutex;

    /// Serialises the tests that mutate process-wide environment
    /// variables; the test harness runs tests on multiple threads.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Variables the gate tests manipulate: the explicit switch read by
    /// `sparse_weighting_enabled`, plus the two auto-hint variables
    /// (presumably consulted by the embedding auto heuristic — that code
    /// is outside this module).
    const GATE_ENV_KEYS: [&str; 3] = [
        "CODELENS_RANK_SPARSE_TERM_WEIGHT",
        "CODELENS_EMBED_HINT_AUTO",
        "CODELENS_EMBED_HINT_AUTO_LANG",
    ];

    /// Sets `key` to `value`, or removes it when `value` is `None`.
    /// Callers must hold `ENV_LOCK`.
    fn set_or_clear(key: &str, value: Option<&str>) {
        // SAFETY: every caller holds `ENV_LOCK`, so environment writes
        // issued by this module never run concurrently with each other.
        unsafe {
            match value {
                Some(v) => std::env::set_var(key, v),
                None => std::env::remove_var(key),
            }
        }
    }

    /// Applies one value per entry of `GATE_ENV_KEYS`, in the order
    /// explicit switch, auto flag, auto language.
    fn configure_gate(explicit: Option<&str>, auto: Option<&str>, lang: Option<&str>) {
        for (key, value) in GATE_ENV_KEYS.iter().zip([explicit, auto, lang].iter()) {
            set_or_clear(key, *value);
        }
    }

    /// Snapshot of environment variables that is written back on drop.
    ///
    /// Restoring from `Drop` means the environment is reset even when an
    /// assertion panics half-way through a test, so one failing test
    /// cannot leak its settings into the others.
    struct EnvRestore {
        saved: Vec<(&'static str, Option<String>)>,
    }

    impl EnvRestore {
        /// Records the current value of each key. Declare the returned
        /// guard *after* taking `ENV_LOCK` so it is dropped (and the
        /// environment restored) while the lock is still held.
        fn capture(keys: &[&'static str]) -> Self {
            Self {
                saved: keys
                    .iter()
                    .map(|&key| (key, std::env::var(key).ok()))
                    .collect(),
            }
        }
    }

    impl Drop for EnvRestore {
        fn drop(&mut self) {
            for (key, value) in &self.saved {
                set_or_clear(key, value.as_deref());
            }
        }
    }

    /// Builds a minimal function symbol located in `test.rs`.
    fn mk_symbol(name: &str, signature: &str) -> SymbolInfo {
        SymbolInfo {
            name: name.to_string(),
            kind: SymbolKind::Function,
            file_path: "test.rs".into(),
            line: 1,
            column: 0,
            signature: signature.to_string(),
            name_path: name.to_string(),
            id: format!("test.rs#function:{name}"),
            body: None,
            children: Vec::new(),
            start_byte: 0,
            end_byte: 0,
            provenance: SymbolProvenance::default(),
        }
    }

    #[test]
    fn sparse_weighting_gated_off_by_default() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let _restore = EnvRestore::capture(&GATE_ENV_KEYS);

        configure_gate(None, None, None);
        assert!(!sparse_weighting_enabled(), "sparse weighting gate leaked");
    }

    #[test]
    fn sparse_weighting_auto_gate_disables_for_js_ts_but_explicit_env_still_wins() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let _restore = EnvRestore::capture(&GATE_ENV_KEYS);

        // The auto path only exists with the `semantic` feature; without
        // it `sparse_weighting_enabled` returns `false` whenever the
        // explicit variable is unset, so these assertions would not hold.
        #[cfg(feature = "semantic")]
        {
            configure_gate(None, Some("1"), Some("rust"));
            assert!(
                sparse_weighting_enabled(),
                "auto+rust should enable sparse weighting"
            );

            configure_gate(None, Some("1"), Some("typescript"));
            assert!(
                !sparse_weighting_enabled(),
                "auto+typescript should disable sparse weighting after Phase 2m split"
            );
        }

        configure_gate(Some("1"), Some("1"), Some("typescript"));
        assert!(
            sparse_weighting_enabled(),
            "explicit sparse=1 must still win over JS/TS auto-off"
        );

        configure_gate(Some("0"), Some("1"), Some("rust"));
        assert!(
            !sparse_weighting_enabled(),
            "explicit sparse=0 must still win over rust auto-on"
        );
    }

    #[test]
    fn sparse_query_tokens_drops_stopwords_and_short_tokens() {
        let tokens = sparse_query_tokens("find the function that opens a file");
        // "find", "function", "opens", "file" survive. "the", "that", "a" dropped.
        assert_eq!(tokens, vec!["find", "function", "opens", "file"]);
    }

    #[test]
    fn sparse_query_tokens_deduplicates() {
        let tokens = sparse_query_tokens("parse json parse xml parse");
        assert_eq!(tokens, vec!["parse", "json", "xml"]);
    }

    #[test]
    fn has_whole_word_respects_word_boundaries() {
        // snake_case separator counts as non-word → match
        assert!(has_whole_word("parse_json_body", "parse"));
        // substring inside a larger identifier → no match
        assert!(!has_whole_word("parser", "parse"));
        assert!(!has_whole_word("parserequest", "parse"));
        // leading/trailing whitespace
        assert!(has_whole_word("parse the file", "parse"));
        assert!(has_whole_word("open file", "file"));
        // empty token / short corpus
        assert!(!has_whole_word("xyz", ""));
        assert!(!has_whole_word("ab", "abc"));
    }

    #[test]
    fn sparse_coverage_bonus_zero_for_single_token_query() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Single token after stopword filtering — short-circuit to 0.
        let bonus = sparse_coverage_bonus("parse", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_zero_below_threshold() {
        let sym = mk_symbol("parse_json", "fn parse_json(input: &str) -> Value");
        // Two query tokens: "parse", "rename". Only "parse" matches → 50% coverage.
        // 50% < 60% threshold → bonus 0.
        let bonus = sparse_coverage_bonus("parse rename", &sym);
        assert_eq!(bonus, 0.0);
    }

    #[test]
    fn sparse_coverage_bonus_full_match_reaches_max() {
        let sym = mk_symbol(
            "parse_json_body",
            "fn parse_json_body(input: &str) -> Value",
        );
        // Tokens: "parse", "json", "body". All three match.
        // coverage = 1.0 → bonus = (1.0 - 0.6) / 0.4 * 20 = 20
        let bonus = sparse_coverage_bonus("parse json body", &sym);
        // Allow small float tolerance for default max = 20
        assert!((bonus - 20.0).abs() < 0.01, "expected ~20, got {bonus}");
    }

    #[test]
    fn sparse_coverage_bonus_ignores_whole_word_false_positives() {
        // "parser" should NOT match token "parse" via the sparse path —
        // word-boundary precision is the whole point of Phase 2e.
        // Two tokens ("parse", "json"), only "json" matches via the
        // signature → 50% coverage → 0 bonus (below threshold).
        let sym = mk_symbol("parser", "fn parser(input: &str) -> Json");
        let bonus = sparse_coverage_bonus("parse json", &sym);
        assert_eq!(bonus, 0.0);
    }
}