Skip to main content

codelens_engine/embedding/
prompt.rs

1use super::runtime::parse_bool_env;
2
/// Split CamelCase/snake_case identifiers into space-separated words so the
/// embedding model can match natural-language queries against them.
///
/// "getDonationRankings" → "get Donation Rankings"
/// "build_non_code_ranges" → "build non code ranges"
///
/// ALL_CAPS runs (e.g. "HTTP") stay together; names with no case or
/// underscore boundary are returned unchanged.
pub fn split_identifier(name: &str) -> String {
    // Fast path: nothing to split when there is no underscore and no
    // uppercase letter anywhere in the name.
    let has_boundary = name.contains('_') || name.chars().any(char::is_uppercase);
    if !has_boundary {
        return name.to_string();
    }

    let chars: Vec<char> = name.chars().collect();
    let mut segments: Vec<String> = Vec::new();
    let mut buf = String::new();

    for (idx, &c) in chars.iter().enumerate() {
        if c == '_' {
            // snake_case boundary: flush the pending segment, drop the '_'.
            if !buf.is_empty() {
                segments.push(std::mem::take(&mut buf));
            }
            continue;
        }
        // CamelCase boundary: an uppercase letter starts a new word when the
        // previous char is lowercase ("get|Donation") or the next one is
        // lowercase ("HTTP|Server"). Pure ALL_CAPS runs are not split.
        let prev_is_lower = buf.chars().last().is_some_and(char::is_lowercase);
        let next_is_lower = chars.get(idx + 1).is_some_and(|n| n.is_lowercase());
        if c.is_uppercase() && !buf.is_empty() && (prev_is_lower || next_is_lower) {
            segments.push(std::mem::take(&mut buf));
        }
        buf.push(c);
    }
    if !buf.is_empty() {
        segments.push(buf);
    }

    // A single segment means no meaningful split happened.
    if segments.len() <= 1 {
        name.to_string()
    } else {
        segments.join(" ")
    }
}
45
46pub fn is_test_only_symbol(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> bool {
47    let fp = &sym.file_path;
48
49    // ── Path-based detection (language-agnostic) ─────────────────────
50    // Rust
51    if fp.contains("/tests/") || fp.ends_with("_tests.rs") {
52        return true;
53    }
54    // JS/TS — Jest __tests__ directory
55    if fp.contains("/__tests__/") || fp.contains("\\__tests__\\") {
56        return true;
57    }
58    // Python
59    if fp.ends_with("_test.py") {
60        return true;
61    }
62    // Go
63    if fp.ends_with("_test.go") {
64        return true;
65    }
66    // JS/TS — .test.* / .spec.*
67    if fp.ends_with(".test.ts")
68        || fp.ends_with(".test.tsx")
69        || fp.ends_with(".test.js")
70        || fp.ends_with(".test.jsx")
71        || fp.ends_with(".spec.ts")
72        || fp.ends_with(".spec.js")
73    {
74        return true;
75    }
76    // Java/Kotlin — Maven src/test/ layout
77    if fp.contains("/src/test/") {
78        return true;
79    }
80    // Java — *Test.java / *Tests.java
81    if fp.ends_with("Test.java") || fp.ends_with("Tests.java") {
82        return true;
83    }
84    // Ruby
85    if fp.ends_with("_test.rb") || fp.contains("/spec/") {
86        return true;
87    }
88
89    // ── Rust name_path patterns ───────────────────────────────────────
90    if sym.name_path.starts_with("tests::")
91        || sym.name_path.contains("::tests::")
92        || sym.name_path.starts_with("test::")
93        || sym.name_path.contains("::test::")
94    {
95        return true;
96    }
97
98    let Some(source) = source else {
99        return false;
100    };
101
102    let start = usize::try_from(sym.start_byte.max(0))
103        .unwrap_or(0)
104        .min(source.len());
105
106    // ── Source-based: Rust attributes ────────────────────────────────
107    let window_start = start.saturating_sub(2048);
108    let attrs = String::from_utf8_lossy(&source.as_bytes()[window_start..start]);
109    if attrs.contains("#[test]")
110        || attrs.contains("#[tokio::test]")
111        || attrs.contains("#[cfg(test)]")
112        || attrs.contains("#[cfg(all(test")
113    {
114        return true;
115    }
116
117    // ── Source-based: Python ─────────────────────────────────────────
118    // Function names starting with `test_` or class names starting with `Test`
119    if fp.ends_with(".py") {
120        if sym.name.starts_with("test_") {
121            return true;
122        }
123        // Class whose name starts with "Test" — also matches TestCase subclasses
124        if sym.kind == "class" && sym.name.starts_with("Test") {
125            return true;
126        }
127    }
128
129    // ── Source-based: Go ─────────────────────────────────────────────
130    // func TestXxx(...) pattern; file must end with _test.go (already caught above),
131    // but guard on .go extension for any edge-case non-test files with Test* helpers.
132    if fp.ends_with(".go") && sym.name.starts_with("Test") && sym.kind == "function" {
133        return true;
134    }
135
136    // ── Source-based: Java / Kotlin ──────────────────────────────────
137    if fp.ends_with(".java") || fp.ends_with(".kt") {
138        let before = &source[..start];
139        let window = if before.len() > 200 {
140            &before[before.len() - 200..]
141        } else {
142            before
143        };
144        if window.contains("@Test")
145            || window.contains("@ParameterizedTest")
146            || window.contains("@RepeatedTest")
147        {
148            return true;
149        }
150    }
151
152    false
153}
154
/// Build the embedding text for a symbol.
///
/// Optimized for MiniLM-L12-CodeSearchNet:
/// - No "passage:" prefix (model not trained with prefixes)
/// - Include file context for disambiguation
/// - Signature-focused (body inclusion hurts quality for this model)
///
/// Leading docstrings/comments are appended **by default** (the v2 model's
/// improved NL understanding flipped the old opt-in default — see the
/// `CODELENS_EMBED_DOCSTRINGS` check below). Set
/// `CODELENS_EMBED_DOCSTRINGS=0` (or `false`) to embed signatures only.
pub fn build_embedding_text(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> String {
    // File context: use only the filename (not full path) to reduce noise.
    // Full paths like "crates/codelens-engine/src/symbols/mod.rs" add tokens
    // that dilute the semantic signal. "mod.rs" is sufficient context.
    let file_ctx = if sym.file_path.is_empty() {
        String::new()
    } else {
        let filename = sym.file_path.rsplit('/').next().unwrap_or(&sym.file_path);
        format!(" in {}", filename)
    };

    // Include split identifier words for better NL matching
    // e.g. "getDonationRankings" → "get Donation Rankings"
    let split_name = split_identifier(&sym.name);
    let name_with_split = if split_name != sym.name {
        format!("{} ({})", sym.name, split_name)
    } else {
        sym.name.clone()
    };

    // Add parent context from name_path (e.g. "UserService/get_user" → "in UserService")
    let parent_ctx = if !sym.name_path.is_empty() && sym.name_path.contains('/') {
        let parent = sym.name_path.rsplit_once('/').map(|x| x.0).unwrap_or("");
        if parent.is_empty() {
            String::new()
        } else {
            format!(" (in {})", parent)
        }
    } else {
        String::new()
    };

    // Module context: directory name provides domain signal without full path noise.
    // "embedding/mod.rs" → module "embedding", "symbols/ranking.rs" → module "symbols"
    let module_ctx = if sym.file_path.contains('/') {
        // rsplitn(3, '/') yields [filename, parent_dir, rest-of-path];
        // parts[1] is the file's immediate parent directory.
        let parts: Vec<&str> = sym.file_path.rsplitn(3, '/').collect();
        if parts.len() >= 2 {
            let dir = parts[1];
            // Skip generic dirs like "src"
            if dir != "src" && dir != "crates" {
                format!(" [{dir}]")
            } else {
                String::new()
            }
        } else {
            String::new()
        }
    } else {
        String::new()
    };

    // Base text: "<kind> <name (split)>[ (in Parent)][ [module]][ in file]: <signature>"
    let base = if sym.signature.is_empty() {
        format!(
            "{} {}{}{}{}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx
        )
    } else {
        format!(
            "{} {}{}{}{}: {}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx, sym.signature
        )
    };

    // Docstring inclusion: v2 model improved NL understanding (+45%), enabling
    // docstrings by default. Measured: ranked_context +0.020, semantic -0.003 (neutral).
    // Disable via CODELENS_EMBED_DOCSTRINGS=0 if needed.
    // NOTE(review): this parses the env var by hand instead of going through
    // `parse_bool_env` like the other gates — only "0"/"false" disable here
    // (e.g. "no"/"off" do NOT); confirm intent before unifying.
    let docstrings_disabled = std::env::var("CODELENS_EMBED_DOCSTRINGS")
        .map(|v| v == "0" || v == "false")
        .unwrap_or(false);

    if docstrings_disabled {
        return base;
    }

    let docstring = source
        .and_then(|src| extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize))
        .unwrap_or_default();

    let mut text = if docstring.is_empty() {
        // Fallback: extract the first few meaningful lines from the function
        // body. This captures key API calls (e.g. "tree_sitter::Parser",
        // "stdin()") that help the embedding model match NL queries to
        // symbols without docs.
        let body_hint = source
            .and_then(|src| extract_body_hint(src, sym.start_byte as usize, sym.end_byte as usize))
            .unwrap_or_default();
        if body_hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, body_hint)
        }
    } else {
        // Collect up to hint_line_budget() non-empty docstring lines
        // (rather than only the first) so the embedding model sees
        // multi-sentence explanations in full — up to the runtime
        // char budget via join_hint_lines.
        let line_budget = hint_line_budget();
        let lines: Vec<String> = docstring
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .take(line_budget)
            .map(str::to_string)
            .collect();
        let hint = join_hint_lines(&lines);
        if hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, hint)
        }
    };

    // v1.5 Phase 2b: optionally append NL tokens harvested from comments and
    // string literals inside the body. Gated by the explicit
    // `CODELENS_EMBED_HINT_INCLUDE_COMMENTS` env var or the Phase 2j
    // auto-detection — presumably enforced inside `extract_nl_tokens`
    // (returns None when disabled); confirm against its definition.
    if let Some(src) = source
        && let Some(nl_tokens) =
            extract_nl_tokens(src, sym.start_byte as usize, sym.end_byte as usize)
        && !nl_tokens.is_empty()
    {
        text.push_str(" · NL: ");
        text.push_str(&nl_tokens);
    }

    // v1.5 Phase 2c: optionally append `Type::method` call-site hints
    // harvested from the body. Same gating pattern as Phase 2b
    // (`CODELENS_EMBED_HINT_INCLUDE_API_CALLS` or auto mode) — orthogonal
    // to Phase 2b, both can be stacked.
    if let Some(src) = source
        && let Some(api_calls) =
            extract_api_calls(src, sym.start_byte as usize, sym.end_byte as usize)
        && !api_calls.is_empty()
    {
        text.push_str(" · API: ");
        text.push_str(&api_calls);
    }

    text
}
305
/// Maximum total characters collected from body-hint or docstring lines.
/// Kept conservative to avoid diluting signature signal for the bundled
/// MiniLM-L12-CodeSearchNet INT8 model. Override via
/// `CODELENS_EMBED_HINT_CHARS` for experiments (clamped to 60..=512).
///
/// History: a v1.5 Phase 2 PoC briefly raised this to 180 / 3 lines in an
/// attempt to close the NL query MRR gap. The 2026-04-11 A/B measurement
/// (`benchmarks/embedding-quality-v1.5-hint1` vs `-phase2`) showed
/// `hybrid -0.005`, `NL hybrid -0.008`, `NL semantic_search -0.041`, so
/// the defaults reverted to the pre-PoC values. The infrastructure
/// (`join_hint_lines`, `hint_line_budget`, env overrides) stayed so the
/// next experiment does not need a rewrite.
const DEFAULT_HINT_TOTAL_CHAR_BUDGET: usize = 60;

/// Maximum number of meaningful lines to collect from a function body.
/// Overridable via `CODELENS_EMBED_HINT_LINES` (clamped to 1..=10).
const DEFAULT_HINT_LINES: usize = 1;

/// Runtime char budget for hint text: `CODELENS_EMBED_HINT_CHARS` when it
/// parses as an integer (clamped to 60..=512), otherwise the default of
/// `DEFAULT_HINT_TOTAL_CHAR_BUDGET` (60).
pub fn hint_char_budget() -> usize {
    let override_chars = std::env::var("CODELENS_EMBED_HINT_CHARS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    match override_chars {
        Some(n) => n.clamp(60, 512),
        None => DEFAULT_HINT_TOTAL_CHAR_BUDGET,
    }
}
331
332pub fn hint_line_budget() -> usize {
333    std::env::var("CODELENS_EMBED_HINT_LINES")
334        .ok()
335        .and_then(|raw| raw.parse::<usize>().ok())
336        .map(|n| n.clamp(1, 10))
337        .unwrap_or(DEFAULT_HINT_LINES)
338}
339
340/// Join collected hint lines, capping at the runtime-configured char
341/// budget (default 60 chars; override via `CODELENS_EMBED_HINT_CHARS`).
342///
343/// Each line is separated by " · " so the embedding model sees a small
344/// structural boundary between logically distinct body snippets. The final
345/// result is truncated with a trailing "..." on char-boundaries only.
346pub fn join_hint_lines(lines: &[String]) -> String {
347    if lines.is_empty() {
348        return String::new();
349    }
350    let joined = lines
351        .iter()
352        .map(String::as_str)
353        .collect::<Vec<_>>()
354        .join(" · ");
355    let budget = hint_char_budget();
356    if joined.chars().count() > budget {
357        let truncated: String = joined.chars().take(budget).collect();
358        format!("{truncated}...")
359    } else {
360        joined
361    }
362}
363
364/// Extract up to `hint_line_budget()` meaningful lines from a function body
365/// (skipping braces, blank lines, and comments). Used as a fallback when no
366/// docstring is available so the embedding model still sees the core API
367/// calls / return values.
368///
369/// Historically this returned only the first meaningful line clipped at 60
370/// chars. The 180-char / 3-line budget was introduced in v1.5 Phase 2 to
371/// close the NL-query gap (MRR 0.528) on cases where the discriminating
372/// keyword lives in line 2 or 3 of the body.
373pub fn extract_body_hint(source: &str, start: usize, end: usize) -> Option<String> {
374    if start >= source.len() || end > source.len() || start >= end {
375        return None;
376    }
377    let safe_start = if source.is_char_boundary(start) {
378        start
379    } else {
380        source.floor_char_boundary(start)
381    };
382    let safe_end = end.min(source.len());
383    let safe_end = if source.is_char_boundary(safe_end) {
384        safe_end
385    } else {
386        source.floor_char_boundary(safe_end)
387    };
388    let body = &source[safe_start..safe_end];
389
390    let max_lines = hint_line_budget();
391    let mut collected: Vec<String> = Vec::with_capacity(max_lines);
392
393    // Skip past the signature: everything until we see a line ending with '{' or ':'
394    // (opening brace of the function body), then start looking for meaningful lines.
395    let mut past_signature = false;
396    for line in body.lines() {
397        let trimmed = line.trim();
398        if !past_signature {
399            // Keep skipping until we find the opening brace/colon
400            if trimmed.ends_with('{') || trimmed.ends_with(':') || trimmed == "{" {
401                past_signature = true;
402            }
403            continue;
404        }
405        // Skip comments, blank lines, closing braces
406        if trimmed.is_empty()
407            || trimmed.starts_with("//")
408            || trimmed.starts_with('#')
409            || trimmed.starts_with("/*")
410            || trimmed.starts_with('*')
411            || trimmed == "}"
412        {
413            continue;
414        }
415        collected.push(trimmed.to_string());
416        if collected.len() >= max_lines {
417            break;
418        }
419    }
420
421    if collected.is_empty() {
422        None
423    } else {
424        Some(join_hint_lines(&collected))
425    }
426}
427
428/// Return true when NL-token collection is enabled via
429/// `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` (or `true`/`yes`/`on`).
430///
431/// v1.5 Phase 2b infrastructure — kept off by default pending A/B
432/// measurement against the fixed 89-query dataset.
433///
434/// v1.5 Phase 2j: when no explicit env var is set, fall through to
435/// `auto_hint_should_enable()` which consults `CODELENS_EMBED_HINT_AUTO` +
436/// `CODELENS_EMBED_HINT_AUTO_LANG` for language-gated defaults.
437pub fn nl_tokens_enabled() -> bool {
438    if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_COMMENTS") {
439        return explicit;
440    }
441    auto_hint_should_enable()
442}
443
444/// Return true when v1.5 Phase 2j auto-detection mode is enabled.
445///
446/// **v1.6.0 default change (§8.14)**: this returns `true` by default.
447/// Users opt **out** with `CODELENS_EMBED_HINT_AUTO=0` (or `false` /
448/// `no` / `off`). The previous v1.5.x behaviour was the other way
449/// around — default OFF, opt in with `=1`. The flip ships as part of
450/// v1.6.0 after the five-dataset measurement (§8.7, §8.8, §8.13,
451/// §8.11, §8.12) validated:
452///
453/// 1. Rust / C / C++ / Go / Java / Kotlin / Scala / C# projects hit
454///    the §8.7 stacked arm (+2.4 % to +15.2 % hybrid MRR).
455/// 2. TypeScript / JavaScript projects validated the Phase 2b/2c
456///    embedding hints on `facebook/jest` and later `microsoft/typescript`.
457///    Subsequent app/runtime follow-ups (`vercel/next.js`,
458///    `facebook/react` production subtree) motivated splitting Phase 2e
459///    out of the JS/TS auto path, but not removing JS/TS from the
460///    embedding-hint default.
461/// 3. Python projects hit the §8.8 baseline (no change) — the
462///    §8.11 language gate + §8.12 MCP auto-set means Python is
463///    auto-detected and the stack stays OFF without user action.
464/// 4. Ruby / PHP / Lua / shell / untested-dynamic projects fall
465///    through to the conservative default-off branch (same as
466///    Python behaviour — no regression).
467///
468/// The dominant language is supplied by the MCP tool layer via the
469/// `CODELENS_EMBED_HINT_AUTO_LANG` env var, which is set
470/// automatically on startup (`main.rs`) and on MCP
471/// `activate_project` calls by `compute_dominant_language` (§8.12).
472/// The engine only reads the env var — it does not walk the
473/// filesystem itself.
474///
475/// Explicit `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` /
476/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` /
477/// `CODELENS_RANK_SPARSE_TERM_WEIGHT=1` (or their `=0` counterparts)
478/// always win over the auto decision — users who want to force a
479/// configuration still can, the auto mode is a better default, not
480/// a lock-in.
481///
482/// **Opt-out**: set `CODELENS_EMBED_HINT_AUTO=0` to restore v1.5.x
483/// behaviour (no auto-detection, all Phase 2 gates default off unless
484/// their individual env vars are set).
485pub fn auto_hint_mode_enabled() -> bool {
486    parse_bool_env("CODELENS_EMBED_HINT_AUTO").unwrap_or(true)
487}
488
/// Return the normalized (trimmed, ASCII-lowercased) language tag supplied
/// by the MCP tool layer via `CODELENS_EMBED_HINT_AUTO_LANG`, or `None`
/// when unset. Compared against `language_supports_nl_stack` to decide
/// whether the Phase 2b / 2c / 2e stack should be auto-enabled.
///
/// Accepted tags are the canonical extensions from
/// `crates/codelens-engine/src/lang_config.rs` (`rs`, `py`, `js`, `ts`,
/// `go`, `rb`, `java`, `kt`, `scala`, `cs`, `cpp`, `c`, …) plus a handful
/// of long-form aliases (`rust`, `python`, `javascript`, `typescript`,
/// `golang`) for users who set the env var by hand.
pub fn auto_hint_lang() -> Option<String> {
    let raw = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok()?;
    Some(raw.trim().to_ascii_lowercase())
}
504
/// Return true when `lang` is a language where the v1.5 embedding-hint
/// stack (Phase 2b comments + Phase 2c API-call extraction) has been
/// measured net-positive (§8.2, §8.4, §8.6, §8.7, §8.13, §8.15) or where
/// the language's static typing + comment-first culture makes the
/// mechanism behave the same way it does on Rust.
///
/// This gate is intentionally separate from the Phase 2e sparse re-ranker.
/// As of the §8.15 / §8.16 / §8.17 follow-up arc, JS/TS stays enabled here
/// (tooling/compiler repos are positive, short-file runtime repos inert)
/// but is disabled in the **sparse** auto-gate, where Phase 2e measured
/// negative-or-null on that family.
///
/// The list is intentionally conservative — additions require an actual
/// external-repo A/B following the §8.7 methodology, not a
/// language-similarity argument alone.
///
/// **Supported** (measured or by static-typing analogy):
/// - `rs`/`rust` (§8.2, §8.4, §8.6, §8.7: +2.4 %, +7.1 %, +15.2 %),
///   `cpp`/`cc`/`cxx`/`c++`, `c`, `go`/`golang`, `java`, `kt`/`kotlin`,
///   `scala`, `cs`/`csharp`, `ts`/`typescript`/`tsx` (§8.13:
///   `facebook/jest` +7.3 % hybrid MRR), `js`/`javascript`/`jsx`
///
/// **Unsupported** (measured regression or untested dynamic-typed):
/// - `py`/`python` (§8.8 regression), `rb`/`ruby`, `php`, `lua`, `r`,
///   `jl`, `sh`/`bash`, and anything else.
pub fn language_supports_nl_stack(lang: &str) -> bool {
    const SUPPORTED: &[&str] = &[
        "rs", "rust", "cpp", "cc", "cxx", "c++", "c", "go", "golang", "java",
        "kt", "kotlin", "scala", "cs", "csharp", "ts", "typescript", "tsx",
        "js", "javascript", "jsx",
    ];
    let normalized = lang.trim().to_ascii_lowercase();
    SUPPORTED.contains(&normalized.as_str())
}
565
/// Return true when `lang` is a language where the Phase 2e sparse
/// coverage re-ranker should be auto-enabled when the user has not set
/// `CODELENS_RANK_SPARSE_TERM_WEIGHT` explicitly.
///
/// Deliberately narrower than `language_supports_nl_stack`. Phase 2e
/// remains positive on Rust-style codebases, but the JS/TS measurement
/// arc says:
///
/// - `facebook/jest`: marginal positive
/// - `microsoft/typescript`: negative
/// - `vercel/next.js`: slight negative
/// - `facebook/react` production subtree: exact no-op
///
/// Hence the conservative Phase 2m policy: keep Phase 2b/2c auto-eligible
/// on JS/TS, disable **auto** Phase 2e on JS/TS, and preserve the explicit
/// env override for users who want to force it on.
pub fn language_supports_sparse_weighting(lang: &str) -> bool {
    const SUPPORTED: &[&str] = &[
        "rs", "rust", "cpp", "cc", "cxx", "c++", "c", "go", "golang",
        "java", "kt", "kotlin", "scala", "cs", "csharp",
    ];
    let normalized = lang.trim().to_ascii_lowercase();
    SUPPORTED.contains(&normalized.as_str())
}
602
603/// Combined decision: Phase 2j auto mode is enabled AND the detected
604/// language supports the Phase 2b/2c embedding-hint stack. This is the
605/// `else` branch that `nl_tokens_enabled` and `api_calls_enabled` fall
606/// through to when no explicit env var is set.
607pub fn auto_hint_should_enable() -> bool {
608    if !auto_hint_mode_enabled() {
609        return false;
610    }
611    match auto_hint_lang() {
612        Some(lang) => language_supports_nl_stack(&lang),
613        None => false, // auto mode on but no language tag → conservative OFF
614    }
615}
616
617/// Combined decision: Phase 2j auto mode is enabled AND the detected
618/// language supports auto-enabling the Phase 2e sparse re-ranker.
619///
620/// This intentionally differs from `auto_hint_should_enable()` after the
621/// §8.15 / §8.16 / §8.17 JS/TS follow-up arc: embedding hints stay
622/// auto-on for JS/TS, but sparse weighting does not.
623pub fn auto_sparse_should_enable() -> bool {
624    if !auto_hint_mode_enabled() {
625        return false;
626    }
627    match auto_hint_lang() {
628        Some(lang) => language_supports_sparse_weighting(&lang),
629        None => false,
630    }
631}
632
/// Heuristic: does this string look like natural language rather than
/// a code identifier, path, or numeric literal?
///
/// Criteria:
/// - at least 4 characters (after trimming)
/// - no path / scope separators (`/`, `\`, `::`)
/// - must contain a space (multi-word)
/// - alphabetic ratio among non-whitespace chars >= 60%
pub fn is_nl_shaped(s: &str) -> bool {
    let text = s.trim();
    let long_enough = text.chars().count() >= 4;
    let code_shaped = text.contains('/') || text.contains('\\') || text.contains("::");
    if !long_enough || code_shaped || !text.contains(' ') {
        return false;
    }
    let mut non_ws = 0usize;
    let mut alpha = 0usize;
    for c in text.chars() {
        if !c.is_whitespace() {
            non_ws += 1;
            if c.is_alphabetic() {
                alpha += 1;
            }
        }
    }
    // A trimmed, >=4-char string always starts with a non-whitespace char,
    // but keep the guard so the division stays provably safe.
    non_ws > 0 && (alpha * 100) / non_ws >= 60
}
659
/// Return true when the v1.5 Phase 2i strict comment filter is enabled
/// via `CODELENS_EMBED_HINT_STRICT_COMMENTS=1` (or `true`/`yes`/`on`).
///
/// Phase 2i extends Phase 2h (§8.9) with a comment-side analogue of the
/// literal filter. Phase 2h recovered ~8 % of the Python regression by
/// rejecting format/error/log string literals in Pass 2; Phase 2i targets
/// the remaining ~92 % by rejecting meta-annotation comments (`# TODO`,
/// `# FIXME`, `# HACK`, `# XXX`, `# BUG`, `# REVIEW`, `# REFACTOR`,
/// `# TEMP`, `# DEPRECATED`) in Pass 1. Conservative prefix list —
/// `# NOTE`, `# WARN`, `# SAFETY` are retained because they often carry
/// behaviour-descriptive content even on Rust.
///
/// Default OFF (same policy as every Phase 2 knob). Orthogonal to
/// `CODELENS_EMBED_HINT_STRICT_LITERALS` so both may be stacked.
pub fn strict_comments_enabled() -> bool {
    let Ok(raw) = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS") else {
        return false;
    };
    matches!(
        raw.to_ascii_lowercase().as_str(),
        "1" | "true" | "yes" | "on"
    )
}
682
/// Heuristic: does `body` (comment text with the `//` / `#` prefix already
/// stripped by `extract_comment_body`) look like a meta-annotation rather
/// than behaviour-descriptive prose?
///
/// Recognises these markers (case-insensitive, terminated by `:`, `(`,
/// whitespace, or any other non-letter):
/// - `TODO`, `FIXME`, `HACK`, `XXX`, `BUG`
/// - `REVIEW`, `REFACTOR`, `TEMP`, `TEMPORARY`, `DEPRECATED`
///
/// Deliberately excluded (kept as behaviour signal):
/// - `NOTE`, `NOTES`, `WARN`, `WARNING`
/// - `SAFETY` (Rust `unsafe` block justifications)
/// - `PANIC` (Rust invariant docs)
///
/// Rationale: Rust projects use `// SAFETY:` / `// NOTE:` to document
/// *why* a block behaves a certain way — exactly the NL retrieval signal
/// Phase 2b wants. The inclusion list targets "I'll fix this later" noise
/// that poisons the embedding on both languages but is especially common
/// on mature Python projects.
pub fn looks_like_meta_annotation(body: &str) -> bool {
    const MARKERS: &[&str] = &[
        "TODO", "FIXME", "HACK", "XXX", "BUG",
        "REVIEW", "REFACTOR", "TEMP", "TEMPORARY", "DEPRECATED",
    ];
    let trimmed = body.trim_start();
    // The marker is the leading run of ASCII letters; `find` returns a byte
    // index that is always a char boundary, so the slice below is safe.
    let word_end = trimmed
        .find(|c: char| !c.is_ascii_alphabetic())
        .unwrap_or(trimmed.len());
    if word_end == 0 {
        return false;
    }
    let marker = trimmed[..word_end].to_ascii_uppercase();
    MARKERS.contains(&marker.as_str())
}
729
/// Return true when the v1.5 Phase 2h strict NL literal filter is enabled
/// via `CODELENS_EMBED_HINT_STRICT_LITERALS=1` (or `true`/`yes`/`on`).
///
/// Phase 2h addresses the Phase 3b Python regression (§8.8). The default
/// Phase 2b Pass 2 scanner accepts any `is_nl_shaped` string literal from
/// the body, which on Python captures many generic error / log / format
/// strings (`raise ValueError("Invalid URL %s" % url)`, `logging.debug(...)`,
/// `fmt.format(...)`). These pass the NL-shape test but carry zero
/// behaviour-descriptive signal and pollute the embedding. The strict
/// filter rejects string literals that look like format templates or
/// common error / log prefixes, while leaving comments (Pass 1) untouched.
///
/// Default OFF (same policy as every Phase 2 knob — opt-in first,
/// measure, then consider flipping the default).
pub fn strict_literal_filter_enabled() -> bool {
    let Ok(raw) = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS") else {
        return false;
    };
    matches!(
        raw.to_ascii_lowercase().as_str(),
        "1" | "true" | "yes" | "on"
    )
}
752
/// Heuristic: does `s` contain a C / Python / Rust format specifier?
///
/// Recognises:
/// - C / Python `%` style: `%s`, `%d`, `%r`, `%f`, `%x`, `%o`, `%i`, `%u`
/// - Python `.format` / f-string style: `{name}`, `{0}`, `{:fmt}`, `{name:fmt}`
///
/// Rust `format!` / `println!` style `{}` / `{:?}` / `{name}` is caught by
/// the same `{...}` branch. Generic braces used for JSON-like content
/// (e.g. `"{name: foo, id: 1}"`) are distinguished from placeholders by
/// requiring the inside to be empty, prefix-colon (`:fmt`), a single
/// identifier, or an identifier followed by `:fmt`.
pub fn contains_format_specifier(s: &str) -> bool {
    // `%` style: a '%' byte immediately followed by a conversion char.
    let percent_hit = s.as_bytes().windows(2).any(|pair| {
        pair[0] == b'%'
            && matches!(pair[1], b's' | b'd' | b'r' | b'f' | b'x' | b'o' | b'i' | b'u')
    });
    if percent_hit {
        return true;
    }

    // `{...}` style.
    //
    // Real format placeholders never contain whitespace inside the braces:
    // `{}`, `{0}`, `{name}`, `{:?}`, `{:.2f}`, `{name:fmt}`. JSON-like
    // content such as `{name: foo, id: 1}` DOES contain whitespace, so the
    // whitespace check is the simplest robust way to distinguish the two
    // without a full format-spec parser.
    for after_brace in s.split('{').skip(1) {
        let Some(close_idx) = after_brace.find('}') else {
            continue;
        };
        let inside = &after_brace[..close_idx];
        // `{}` — Rust empty placeholder.
        if inside.is_empty() {
            return true;
        }
        // Any whitespace inside the braces → JSON-like, not a format spec.
        if inside.chars().any(char::is_whitespace) {
            continue;
        }
        // `{:fmt}` — anonymous format spec.
        if inside.starts_with(':') {
            return true;
        }
        // `{name}`, `{0}`, `{name:fmt}` — identifier (or digit), optionally
        // followed by `:fmt`. Whitespace was already rejected above, so only
        // the identifier chars need checking.
        let ident = inside.split(':').next().unwrap_or(inside);
        let ident_ok = !ident.is_empty()
            && ident
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.');
        if ident_ok {
            return true;
        }
    }
    false
}
816
/// Heuristic: does `s` look like a generic error message, log line, or
/// low-value imperative string that an NL query would never try to match?
///
/// The prefix list is intentionally short — it covers the patterns the
/// Phase 3b `psf/requests` post-mortem flagged as the largest regression
/// sources. A false negative (a real behaviour string misclassified as
/// an error) would cost retrieval quality, but since the filter only
/// runs on string literals and leaves comments alone, a missed NL string
/// in one symbol will typically have a comment covering the same
/// behaviour on the same symbol.
pub fn looks_like_error_or_log_prefix(s: &str) -> bool {
    // Case-insensitive comparison against the leading-whitespace-trimmed
    // string; the trailing spaces in most prefixes require a full word.
    let normalized = s.trim().to_lowercase();
    [
        // Error-ish prefixes
        "invalid ",
        "cannot ",
        "could not ",
        "unable to ",
        "failed to ",
        "expected ",
        "unexpected ",
        "missing ",
        "not found",
        "error: ",
        "error ",
        "warning: ",
        "warning ",
        // Log-ish imperative prefixes
        "sending ",
        "received ",
        "starting ",
        "stopping ",
        "calling ",
        "connecting ",
        "disconnecting ",
    ]
    .into_iter()
    .any(|prefix| normalized.starts_with(prefix))
}
853
/// Test-only variant: bypass the env gate so the unit tests can exercise
/// the filter logic deterministically (mirrors the `extract_nl_tokens_inner`
/// vs `extract_nl_tokens` policy). Compiled only under `#[cfg(test)]` so
/// the release binary path never calls it.
#[cfg(test)]
pub fn should_reject_literal_strict(s: &str) -> bool {
    // Either filter firing is enough to reject the literal.
    if contains_format_specifier(s) {
        return true;
    }
    looks_like_error_or_log_prefix(s)
}
862
863/// Collect natural-language tokens from a function body: line comments,
864/// block comments, and string literals that look like NL prose.
865///
866/// v1.5 Phase 2b experiment. The hypothesis is that the bundled
867/// CodeSearchNet-INT8 model struggles with NL queries (hybrid MRR 0.472)
868/// because the symbol text it sees is pure code, whereas NL queries target
869/// behavioural descriptions that live in *comments* and *string literals*.
870///
871/// Unlike `extract_body_hint` (which skips comments) this function only
872/// keeps comments + NL-shaped string literals and ignores actual code.
873///
874/// Gated by `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1`. Returns `None` when
875/// the gate is off so the default embedding text is untouched.
876pub fn extract_nl_tokens(source: &str, start: usize, end: usize) -> Option<String> {
877    if !nl_tokens_enabled() {
878        return None;
879    }
880    extract_nl_tokens_inner(source, start, end)
881}
882
/// Env-independent core of `extract_nl_tokens`, exposed to the test module
/// so unit tests can run deterministically without touching env vars
/// (which would race with the other tests that set
/// `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`).
///
/// `start` / `end` are byte offsets into `source` delimiting the symbol
/// body. Both are clamped down to `char` boundaries before slicing so a
/// slightly-off span can never panic on multi-byte UTF-8. Returns `None`
/// when the span is degenerate or when no comment / literal survives the
/// NL-shape (and optional strictness) filters.
pub fn extract_nl_tokens_inner(source: &str, start: usize, end: usize) -> Option<String> {
    // Reject out-of-range, inverted, or empty spans up front.
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Clamp both offsets to the nearest preceding char boundary so the
    // `&source[..]` slice below cannot land inside a multi-byte char.
    let safe_start = if source.is_char_boundary(start) {
        start
    } else {
        source.floor_char_boundary(start)
    };
    let safe_end = end.min(source.len());
    let safe_end = if source.is_char_boundary(safe_end) {
        safe_end
    } else {
        source.floor_char_boundary(safe_end)
    };
    let body = &source[safe_start..safe_end];

    // Accumulates surviving comment bodies (Pass 1) then literals (Pass 2),
    // in source order.
    let mut tokens: Vec<String> = Vec::new();

    // ── Pass 1: comments ─────────────────────────────────────────────
    // v1.5 Phase 2i: when CODELENS_EMBED_HINT_STRICT_COMMENTS=1 is set,
    // reject meta-annotation comments (`# TODO`, `# FIXME`, `# HACK`,
    // ...) while keeping behaviour-descriptive comments untouched. This
    // is the comment-side analogue of the Phase 2h literal filter
    // (§8.9) and targets the remaining ~92 % of the Python regression
    // that Phase 2h's literal-only filter left behind.
    let strict_comments = strict_comments_enabled();
    for line in body.lines() {
        let trimmed = line.trim();
        // A line survives if it parses as a comment, its body is
        // NL-shaped, and (in strict mode) it is not a meta-annotation.
        if let Some(cleaned) = extract_comment_body(trimmed)
            && is_nl_shaped(&cleaned)
            && (!strict_comments || !looks_like_meta_annotation(&cleaned))
        {
            tokens.push(cleaned);
        }
    }

    // ── Pass 2: double-quoted string literals ────────────────────────
    // Simplified scanner — handles escape sequences but does not track
    // multi-line strings or raw strings. Good enough for NL-shaped
    // heuristic filtering where false negatives are acceptable.
    // Only `"`-delimited literals are scanned (single-quoted Python/JS
    // strings are never collected), and a literal still open when the
    // body ends is silently dropped.
    //
    // v1.5 Phase 2h: when CODELENS_EMBED_HINT_STRICT_LITERALS=1 is set,
    // also reject format templates and generic error / log prefixes. This
    // addresses the Phase 3b Python regression documented in §8.8 —
    // comments (Pass 1) stay untouched so Rust projects keep their wins.
    let strict_literals = strict_literal_filter_enabled();
    let mut chars = body.chars().peekable();
    let mut in_string = false;
    let mut current = String::new();
    while let Some(c) = chars.next() {
        if in_string {
            if c == '\\' {
                // Skip escape sequence — consuming the next char here also
                // means an escaped `\"` is not treated as a terminator.
                let _ = chars.next();
            } else if c == '"' {
                // Closing quote: keep the literal if it is NL-shaped and
                // (in strict mode) not a format template or error/log line.
                if is_nl_shaped(&current)
                    && (!strict_literals
                        || (!contains_format_specifier(&current)
                            && !looks_like_error_or_log_prefix(&current)))
                {
                    tokens.push(current.clone());
                }
                current.clear();
                in_string = false;
            } else {
                current.push(c);
            }
        } else if c == '"' {
            in_string = true;
        }
    }

    // Nothing NL-shaped found — leave the embedding text untouched.
    if tokens.is_empty() {
        return None;
    }
    Some(join_hint_lines(&tokens))
}
965
966/// Return true when API-call extraction is enabled via
967/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` (or `true`/`yes`/`on`).
968///
969/// v1.5 Phase 2c infrastructure — kept off by default pending A/B
970/// measurement. Orthogonal to `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`
971/// so both may be stacked.
972///
973/// v1.5 Phase 2j: explicit env > auto mode, same policy as Phase 2b.
974pub fn api_calls_enabled() -> bool {
975    if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_API_CALLS") {
976        return explicit;
977    }
978    auto_hint_should_enable()
979}
980
/// Heuristic: does `ident` look like a Rust/C++ *type* (PascalCase) rather
/// than a module or free function (snake_case)?
///
/// The Phase 2c API-call extractor relies on this filter to keep the hint
/// focused on static-method call sites (`Parser::new`,
/// `HashMap::with_capacity`) and to drop module-scoped free functions
/// (`std::fs::read_to_string`). Only an ASCII uppercase first letter is
/// accepted — stricter than real PascalCase detection, and deliberately
/// so: the goal is high-precision Type filtering, not lexical accuracy.
pub fn is_static_method_ident(ident: &str) -> bool {
    // Checking the first *byte* suffices: a multi-byte first char can
    // never be ASCII uppercase.
    matches!(ident.as_bytes().first(), Some(b) if b.is_ascii_uppercase())
}
993
994/// Collect `Type::method` call sites from a function body.
995///
996/// v1.5 Phase 2c experiment. Hypothesis: exposing the Types a function
997/// interacts with (via their static-method call sites) adds a lexical
998/// bridge between NL queries ("parse json", "open database") and symbols
999/// whose body references the relevant type (`Parser::new`, `Connection::open`).
1000/// This is orthogonal to Phase 2b (comments + NL-shaped literals), which
1001/// targets *explanatory* natural language rather than *type* hints.
1002///
1003/// Gated by `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1`. Returns `None` when
1004/// the gate is off so the default embedding text is untouched.
1005pub fn extract_api_calls(source: &str, start: usize, end: usize) -> Option<String> {
1006    if !api_calls_enabled() {
1007        return None;
1008    }
1009    extract_api_calls_inner(source, start, end)
1010}
1011
/// Env-independent core of `extract_api_calls`, exposed to the test module
/// so unit tests can run deterministically without touching env vars
/// (which would race with other tests that set
/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS`).
///
/// Scans the body for `Type::method` byte patterns where:
/// - `Type` starts with an ASCII uppercase letter and consists of
///   `[A-Za-z0-9_]*` (plain ASCII — non-ASCII identifiers are skipped
///   on purpose to minimise noise).
/// - `method` is any identifier (start `[A-Za-z_]`, continue `[A-Za-z0-9_]*`).
///
/// Duplicate `Type::method` pairs collapse into a single entry to avoid
/// biasing the embedding toward repeated calls in hot loops.
///
/// `start` / `end` are byte offsets into `source`; both are clamped down
/// to char boundaries so the slice cannot panic on multi-byte UTF-8.
pub fn extract_api_calls_inner(source: &str, start: usize, end: usize) -> Option<String> {
    // Reject out-of-range, inverted, or empty spans up front.
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    let safe_start = if source.is_char_boundary(start) {
        start
    } else {
        source.floor_char_boundary(start)
    };
    let safe_end = end.min(source.len());
    let safe_end = if source.is_char_boundary(safe_end) {
        safe_end
    } else {
        source.floor_char_boundary(safe_end)
    };
    // Clamping can collapse the span; bail rather than slice emptily.
    if safe_start >= safe_end {
        return None;
    }
    let body = &source[safe_start..safe_end];
    let bytes = body.as_bytes();
    let len = bytes.len();

    // `calls` preserves first-seen order for the hint; `seen` dedupes.
    let mut calls: Vec<String> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    let mut i = 0usize;
    while i < len {
        let b = bytes[i];
        // Walk forward until we find the start of an ASCII identifier.
        if !(b == b'_' || b.is_ascii_alphabetic()) {
            i += 1;
            continue;
        }
        let ident_start = i;
        // Consume the identifier's continuation chars.
        while i < len {
            let bb = bytes[i];
            if bb == b'_' || bb.is_ascii_alphanumeric() {
                i += 1;
            } else {
                break;
            }
        }
        let ident_end = i;

        // Must be immediately followed by `::`.
        // On failure `i` sits on a non-identifier byte (or `len`), so the
        // top of the loop advances — no infinite loop, no rescan.
        if i + 1 >= len || bytes[i] != b':' || bytes[i + 1] != b':' {
            continue;
        }

        let type_ident = &body[ident_start..ident_end];
        if !is_static_method_ident(type_ident) {
            // `snake_module::foo` — not a Type. Skip past the `::` so we
            // don't rescan the same characters, but keep walking.
            i += 2;
            continue;
        }

        // Skip the `::`
        let mut j = i + 2;
        // The method part must itself start like an identifier (this also
        // rejects turbofish `Type::<...>` and trailing `::` at EOF).
        if j >= len || !(bytes[j] == b'_' || bytes[j].is_ascii_alphabetic()) {
            i = j;
            continue;
        }
        let method_start = j;
        while j < len {
            let bb = bytes[j];
            if bb == b'_' || bb.is_ascii_alphanumeric() {
                j += 1;
            } else {
                break;
            }
        }
        let method_end = j;

        let method_ident = &body[method_start..method_end];
        let call = format!("{type_ident}::{method_ident}");
        // `insert` returns false for duplicates — record first sighting only.
        if seen.insert(call.clone()) {
            calls.push(call);
        }
        i = j;
    }

    if calls.is_empty() {
        return None;
    }
    Some(join_hint_lines(&calls))
}
1112
/// Peel the comment prefix off a trimmed line, returning the inner text
/// if the line is recognisably a `//`, `#`, `/* */`, or leading-`*` comment.
///
/// `#[attr]` attributes and `#!` shebangs are explicitly NOT comments,
/// and a leading-`*` continuation line is accepted only when its content
/// looks textual (rejecting e.g. `*const T` pointer types and code).
pub fn extract_comment_body(trimmed: &str) -> Option<String> {
    if trimmed.is_empty() {
        return None;
    }
    // Line comments — try the longer Rust doc markers (`///`, `//!`)
    // before plain `//` so the doc prefix is stripped in full.
    for marker in ["///", "//!", "//"] {
        if let Some(body) = trimmed.strip_prefix(marker) {
            return Some(body.trim().to_string());
        }
    }
    // `#[...]` attribute and `#!...` shebang lines are code, not comments.
    if trimmed.starts_with("#[") || trimmed.starts_with("#!") {
        return None;
    }
    // `#` line comment (Python, bash, ...).
    if let Some(body) = trimmed.strip_prefix('#') {
        return Some(body.trim().to_string());
    }
    // Block-comment openers — `/**` before `/*` so the doc marker wins.
    for marker in ["/**", "/*"] {
        if let Some(body) = trimmed.strip_prefix(marker) {
            return Some(body.trim_end_matches("*/").trim().to_string());
        }
    }
    // Block-comment continuation line (` * like this`). Only accept if
    // the remainder looks textual (avoid e.g. `*const T` pointer types
    // and obvious code continuations).
    if let Some(body) = trimmed.strip_prefix('*') {
        let body = body.trim_end_matches("*/").trim();
        if body.is_empty() || body.contains(';') || body.contains('{') {
            return None;
        }
        return Some(body.to_string());
    }
    None
}
1159
/// Extract the leading docstring or comment block from a symbol's body.
/// Supports: Python triple-quote docstrings, Rust `///` / `//!` doc
/// comments, JS/TS `/** ... */` blocks, and a generic leading `//` / `#`
/// comment run.
///
/// `start` / `end` are byte offsets into `source` spanning the symbol
/// (signature line included — the first line is skipped). Offsets are
/// clamped to char boundaries so an off-by-a-byte span cannot panic on
/// multi-byte UTF-8. Returns `None` for a degenerate span or when no
/// leading doc block is found.
pub fn extract_leading_doc(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Clamp both offsets down to the nearest char boundary to avoid
    // panicking on multi-byte UTF-8. (Stable equivalent of the unstable
    // `str::floor_char_boundary`; cannot underflow since offset 0 is
    // always a boundary.)
    let mut safe_start = start;
    while !source.is_char_boundary(safe_start) {
        safe_start -= 1;
    }
    let mut safe_end = end.min(source.len());
    while !source.is_char_boundary(safe_end) {
        safe_end -= 1;
    }
    if safe_start >= safe_end {
        return None;
    }
    let body = &source[safe_start..safe_end];
    let lines: Vec<&str> = body.lines().skip(1).collect(); // skip the signature line
    if lines.is_empty() {
        return None;
    }

    let mut doc_lines = Vec::new();

    // Python: triple-quote docstrings
    let first_trimmed = lines.first().map(|l| l.trim()).unwrap_or_default();
    if first_trimmed.starts_with("\"\"\"") || first_trimmed.starts_with("'''") {
        let quote = &first_trimmed[..3];
        for (idx, line) in lines.iter().enumerate() {
            let t = line.trim();
            doc_lines.push(t.trim_start_matches(quote).trim_end_matches(quote));
            // Stop at the closing quote. Any later line ending with the
            // quote closes the docstring; the FIRST line closes it only
            // when it is long enough to hold both the opening and closing
            // quotes (a one-liner like `"""Doc."""`). The previous
            // `doc_lines.len() > 1` condition could never fire on the
            // first line, so single-line docstrings never terminated and
            // the whole function body leaked into the doc text.
            if t.ends_with(quote) && (idx > 0 || t.len() >= 2 * quote.len()) {
                break;
            }
        }
    }
    // Rust: /// or //! doc comments (before the body, captured by tree-sitter)
    else if first_trimmed.starts_with("///") || first_trimmed.starts_with("//!") {
        for line in &lines {
            let t = line.trim();
            if t.starts_with("///") || t.starts_with("//!") {
                doc_lines.push(t.trim_start_matches("///").trim_start_matches("//!").trim());
            } else {
                break;
            }
        }
    }
    // JS/TS: /** ... */ block comments
    else if first_trimmed.starts_with("/**") {
        for line in &lines {
            let t = line.trim();
            let cleaned = t
                .trim_start_matches("/**")
                .trim_start_matches('*')
                .trim_end_matches("*/")
                .trim();
            if !cleaned.is_empty() {
                doc_lines.push(cleaned);
            }
            if t.ends_with("*/") {
                break;
            }
        }
    }
    // Generic: leading // or # comment block
    else {
        for line in &lines {
            let t = line.trim();
            if t.starts_with("//") || t.starts_with('#') {
                doc_lines.push(t.trim_start_matches("//").trim_start_matches('#').trim());
            } else {
                break;
            }
        }
    }

    if doc_lines.is_empty() {
        return None;
    }
    Some(doc_lines.join(" ").trim().to_owned())
}