// codelens_engine/embedding/prompt.rs
1use super::runtime::parse_bool_env;
2
/// Split a CamelCase / snake_case identifier into space-separated words so
/// the embedding model can match natural-language queries against it.
///
/// Examples: "getDonationRankings" → "get Donation Rankings",
/// "build_non_code_ranges" → "build non code ranges".
///
/// All-lowercase names without underscores, and names that yield a single
/// segment (e.g. "X"), are returned unchanged. ALL_CAPS runs are kept
/// together at CamelCase boundaries ("HTTPServer" → "HTTP Server").
pub fn split_identifier(name: &str) -> String {
    // Fast path: nothing to split when there is no underscore and no
    // uppercase letter anywhere in the identifier.
    let has_separator = name.contains('_');
    let has_upper = name.chars().any(char::is_uppercase);
    if !has_separator && !has_upper {
        return name.to_string();
    }

    let letters: Vec<char> = name.chars().collect();
    let mut segments: Vec<String> = Vec::new();
    let mut buf = String::new();

    for (idx, &c) in letters.iter().enumerate() {
        if c == '_' {
            // snake_case boundary: flush the pending segment, drop the '_'.
            if !buf.is_empty() {
                segments.push(std::mem::take(&mut buf));
            }
            continue;
        }
        // CamelCase boundary: an uppercase letter ends the previous segment
        // when it follows a lowercase letter, or starts a new lowercase run
        // (the latter keeps ALL_CAPS acronym runs together).
        let prev_is_lower = buf.chars().last().map_or(false, char::is_lowercase);
        let next_is_lower = letters.get(idx + 1).map_or(false, |n| n.is_lowercase());
        if c.is_uppercase() && !buf.is_empty() && (prev_is_lower || next_is_lower) {
            segments.push(std::mem::take(&mut buf));
        }
        buf.push(c);
    }
    if !buf.is_empty() {
        segments.push(buf);
    }

    // A single segment means no meaningful split happened.
    if segments.len() <= 1 {
        return name.to_string();
    }
    segments.join(" ")
}
45
46pub fn is_test_only_symbol(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> bool {
47 let fp = &sym.file_path;
48
49 // ── Path-based detection (language-agnostic) ─────────────────────
50 // Rust
51 if fp.contains("/tests/") || fp.ends_with("_tests.rs") {
52 return true;
53 }
54 // JS/TS — Jest __tests__ directory
55 if fp.contains("/__tests__/") || fp.contains("\\__tests__\\") {
56 return true;
57 }
58 // Python
59 if fp.ends_with("_test.py") {
60 return true;
61 }
62 // Go
63 if fp.ends_with("_test.go") {
64 return true;
65 }
66 // JS/TS — .test.* / .spec.*
67 if fp.ends_with(".test.ts")
68 || fp.ends_with(".test.tsx")
69 || fp.ends_with(".test.js")
70 || fp.ends_with(".test.jsx")
71 || fp.ends_with(".spec.ts")
72 || fp.ends_with(".spec.js")
73 {
74 return true;
75 }
76 // Java/Kotlin — Maven src/test/ layout
77 if fp.contains("/src/test/") {
78 return true;
79 }
80 // Java — *Test.java / *Tests.java
81 if fp.ends_with("Test.java") || fp.ends_with("Tests.java") {
82 return true;
83 }
84 // Ruby
85 if fp.ends_with("_test.rb") || fp.contains("/spec/") {
86 return true;
87 }
88
89 // ── Rust name_path patterns ───────────────────────────────────────
90 if sym.name_path.starts_with("tests::")
91 || sym.name_path.contains("::tests::")
92 || sym.name_path.starts_with("test::")
93 || sym.name_path.contains("::test::")
94 {
95 return true;
96 }
97
98 let Some(source) = source else {
99 return false;
100 };
101
102 let start = usize::try_from(sym.start_byte.max(0))
103 .unwrap_or(0)
104 .min(source.len());
105
106 // ── Source-based: Rust attributes ────────────────────────────────
107 let window_start = start.saturating_sub(2048);
108 let attrs = String::from_utf8_lossy(&source.as_bytes()[window_start..start]);
109 if attrs.contains("#[test]")
110 || attrs.contains("#[tokio::test]")
111 || attrs.contains("#[cfg(test)]")
112 || attrs.contains("#[cfg(all(test")
113 {
114 return true;
115 }
116
117 // ── Source-based: Python ─────────────────────────────────────────
118 // Function names starting with `test_` or class names starting with `Test`
119 if fp.ends_with(".py") {
120 if sym.name.starts_with("test_") {
121 return true;
122 }
123 // Class whose name starts with "Test" — also matches TestCase subclasses
124 if sym.kind == "class" && sym.name.starts_with("Test") {
125 return true;
126 }
127 }
128
129 // ── Source-based: Go ─────────────────────────────────────────────
130 // func TestXxx(...) pattern; file must end with _test.go (already caught above),
131 // but guard on .go extension for any edge-case non-test files with Test* helpers.
132 if fp.ends_with(".go") && sym.name.starts_with("Test") && sym.kind == "function" {
133 return true;
134 }
135
136 // ── Source-based: Java / Kotlin ──────────────────────────────────
137 if fp.ends_with(".java") || fp.ends_with(".kt") {
138 let before = &source[..start];
139 let window = if before.len() > 200 {
140 &before[before.len() - 200..]
141 } else {
142 before
143 };
144 if window.contains("@Test")
145 || window.contains("@ParameterizedTest")
146 || window.contains("@RepeatedTest")
147 {
148 return true;
149 }
150 }
151
152 false
153}
154
/// Build the embedding text for a symbol.
///
/// Optimized for MiniLM-L12-CodeSearchNet:
/// - No "passage:" prefix (model not trained with prefixes)
/// - Include file context for disambiguation
/// - Signature-focused (body inclusion hurts quality for this model)
///
/// Docstring / body-hint text is appended **by default** (the v2 model
/// measured neutral-to-positive on it — see the inline note below); set
/// `CODELENS_EMBED_DOCSTRINGS=0` (or `false`) to emit the bare
/// signature-only text instead. Optional NL-token (Phase 2b) and API-call
/// (Phase 2c) hints are appended on top when their gates are enabled.
pub fn build_embedding_text(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> String {
    // File context: use only the filename (not full path) to reduce noise.
    // Full paths like "crates/codelens-engine/src/symbols/mod.rs" add tokens
    // that dilute the semantic signal. "mod.rs" is sufficient context.
    let file_ctx = if sym.file_path.is_empty() {
        String::new()
    } else {
        let filename = sym.file_path.rsplit('/').next().unwrap_or(&sym.file_path);
        format!(" in {}", filename)
    };

    // Include split identifier words for better NL matching
    // e.g. "getDonationRankings" → "get Donation Rankings"
    let split_name = split_identifier(&sym.name);
    let name_with_split = if split_name != sym.name {
        format!("{} ({})", sym.name, split_name)
    } else {
        sym.name.clone()
    };

    // Add parent context from name_path (e.g. "UserService/get_user" → "in UserService")
    let parent_ctx = if !sym.name_path.is_empty() && sym.name_path.contains('/') {
        let parent = sym.name_path.rsplit_once('/').map(|x| x.0).unwrap_or("");
        if parent.is_empty() {
            String::new()
        } else {
            format!(" (in {})", parent)
        }
    } else {
        String::new()
    };

    // Module context: directory name provides domain signal without full path noise.
    // "embedding/mod.rs" → module "embedding", "symbols/ranking.rs" → module "symbols"
    let module_ctx = if sym.file_path.contains('/') {
        // rsplitn(3, '/'): parts[0] = filename, parts[1] = parent directory.
        let parts: Vec<&str> = sym.file_path.rsplitn(3, '/').collect();
        if parts.len() >= 2 {
            let dir = parts[1];
            // Skip generic dirs like "src"
            if dir != "src" && dir != "crates" {
                format!(" [{dir}]")
            } else {
                String::new()
            }
        } else {
            String::new()
        }
    } else {
        String::new()
    };

    // Base text: "<kind> <name> (<split>) (in <parent>) [<module>] in <file>",
    // with ": <signature>" appended when a signature is available.
    let base = if sym.signature.is_empty() {
        format!(
            "{} {}{}{}{}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx
        )
    } else {
        format!(
            "{} {}{}{}{}: {}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx, sym.signature
        )
    };

    // Docstring inclusion: v2 model improved NL understanding (+45%), enabling
    // docstrings by default. Measured: ranked_context +0.020, semantic -0.003 (neutral).
    // Disable via CODELENS_EMBED_DOCSTRINGS=0 if needed.
    let docstrings_disabled = std::env::var("CODELENS_EMBED_DOCSTRINGS")
        .map(|v| v == "0" || v == "false")
        .unwrap_or(false);

    if docstrings_disabled {
        return base;
    }

    // NOTE: `start_byte as usize` on a negative sentinel wraps to a huge
    // value; the extract_* helpers bounds-check against source.len() and
    // return None in that case, so the cast is safe here.
    let docstring = source
        .and_then(|src| extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize))
        .unwrap_or_default();

    let mut text = if docstring.is_empty() {
        // Fallback: extract the first few meaningful lines from the function
        // body. This captures key API calls (e.g. "tree_sitter::Parser",
        // "stdin()") that help the embedding model match NL queries to
        // symbols without docs.
        let body_hint = source
            .and_then(|src| extract_body_hint(src, sym.start_byte as usize, sym.end_byte as usize))
            .unwrap_or_default();
        if body_hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, body_hint)
        }
    } else {
        // Collect up to hint_line_budget() non-empty docstring lines
        // (rather than only the first) so the embedding model sees
        // multi-sentence explanations in full — up to the runtime
        // char budget via join_hint_lines.
        let line_budget = hint_line_budget();
        let lines: Vec<String> = docstring
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .take(line_budget)
            .map(str::to_string)
            .collect();
        let hint = join_hint_lines(&lines);
        if hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, hint)
        }
    };

    // v1.5 Phase 2b experiment: optionally append NL tokens harvested from
    // comments and string literals inside the body. Disabled by default;
    // enable with `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` to A/B.
    if let Some(src) = source
        && let Some(nl_tokens) =
            extract_nl_tokens(src, sym.start_byte as usize, sym.end_byte as usize)
        && !nl_tokens.is_empty()
    {
        text.push_str(" · NL: ");
        text.push_str(&nl_tokens);
    }

    // v1.5 Phase 2c experiment: optionally append `Type::method` call-site
    // hints harvested from the body. Disabled by default; enable with
    // `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` to A/B. Orthogonal to
    // Phase 2b — both can be stacked.
    if let Some(src) = source
        && let Some(api_calls) =
            extract_api_calls(src, sym.start_byte as usize, sym.end_byte as usize)
        && !api_calls.is_empty()
    {
        text.push_str(" · API: ");
        text.push_str(&api_calls);
    }

    text
}
305
/// Maximum total characters collected from body-hint or docstring lines.
/// Kept conservative to avoid diluting signature signal for the bundled
/// MiniLM-L12-CodeSearchNet INT8 model. Override via
/// `CODELENS_EMBED_HINT_CHARS` for experiments (clamped to 60..=512).
///
/// History: a v1.5 Phase 2 PoC briefly raised this to 180 / 3 lines in an
/// attempt to close the NL query MRR gap. The 2026-04-11 A/B measurement
/// (`benchmarks/embedding-quality-v1.5-hint1` vs `-phase2`) showed
/// `hybrid -0.005`, `NL hybrid -0.008`, `NL semantic_search -0.041`, so
/// the defaults reverted to the pre-PoC values. The infrastructure
/// (`join_hint_lines`, `hint_line_budget`, env overrides) stayed so the
/// next experiment does not need a rewrite.
const DEFAULT_HINT_TOTAL_CHAR_BUDGET: usize = 60;

/// Maximum number of meaningful lines to collect from a function body.
/// Overridable via `CODELENS_EMBED_HINT_LINES` (clamped to 1..=10).
const DEFAULT_HINT_LINES: usize = 1;

/// Runtime char budget for hint text: `CODELENS_EMBED_HINT_CHARS` when set
/// to a parseable integer (clamped to 60..=512), otherwise the default.
pub fn hint_char_budget() -> usize {
    let overridden = std::env::var("CODELENS_EMBED_HINT_CHARS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    match overridden {
        Some(n) => n.clamp(60, 512),
        None => DEFAULT_HINT_TOTAL_CHAR_BUDGET,
    }
}

/// Runtime line budget for hint collection: `CODELENS_EMBED_HINT_LINES`
/// when set to a parseable integer (clamped to 1..=10), otherwise the
/// default.
pub fn hint_line_budget() -> usize {
    let overridden = std::env::var("CODELENS_EMBED_HINT_LINES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok());
    match overridden {
        Some(n) => n.clamp(1, 10),
        None => DEFAULT_HINT_LINES,
    }
}

/// Join collected hint lines with " · " separators, truncating to the
/// runtime-configured char budget (default 60; override with
/// `CODELENS_EMBED_HINT_CHARS`).
///
/// The separator gives the embedding model a small structural boundary
/// between logically distinct snippets. Truncation counts chars (never
/// splits a UTF-8 character) and appends a trailing "..." when it fires.
pub fn join_hint_lines(lines: &[String]) -> String {
    if lines.is_empty() {
        return String::new();
    }
    let joined = lines.join(" · ");
    let budget = hint_char_budget();
    if joined.chars().count() <= budget {
        return joined;
    }
    let mut clipped: String = joined.chars().take(budget).collect();
    clipped.push_str("...");
    clipped
}

/// Extract up to `hint_line_budget()` meaningful lines from a function body
/// (skipping braces, blank lines, and comments). Used as a fallback when no
/// docstring is available so the embedding model still sees the core API
/// calls / return values.
///
/// Historically this returned only the first meaningful line clipped at 60
/// chars; the 180-char / 3-line budget arrived in v1.5 Phase 2 for cases
/// where the discriminating keyword lives in line 2 or 3 of the body.
pub fn extract_body_hint(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Walk both offsets down to the nearest UTF-8 char boundary so the
    // slice below can never panic mid-character.
    let mut lo = start;
    while lo > 0 && !source.is_char_boundary(lo) {
        lo -= 1;
    }
    let mut hi = end.min(source.len());
    while hi > 0 && !source.is_char_boundary(hi) {
        hi -= 1;
    }
    let body = &source[lo..hi];

    let line_cap = hint_line_budget();
    let mut kept: Vec<String> = Vec::with_capacity(line_cap);

    // Skip the signature: everything up to and including the first line
    // ending with '{' (or ':' for Python-style bodies), then start
    // collecting meaningful lines.
    let mut in_body = false;
    for raw in body.lines() {
        let line = raw.trim();
        if !in_body {
            if line.ends_with('{') || line.ends_with(':') || line == "{" {
                in_body = true;
            }
            continue;
        }
        // Drop noise: blanks, comments, and bare closing braces.
        let is_noise = line.is_empty()
            || line.starts_with("//")
            || line.starts_with('#')
            || line.starts_with("/*")
            || line.starts_with('*')
            || line == "}";
        if is_noise {
            continue;
        }
        kept.push(line.to_string());
        if kept.len() >= line_cap {
            break;
        }
    }

    if kept.is_empty() {
        None
    } else {
        Some(join_hint_lines(&kept))
    }
}
427
428/// Return true when NL-token collection is enabled via
429/// `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` (or `true`/`yes`/`on`).
430///
431/// v1.5 Phase 2b infrastructure — kept off by default pending A/B
432/// measurement against the fixed 89-query dataset.
433///
434/// v1.5 Phase 2j: when no explicit env var is set, fall through to
435/// `auto_hint_should_enable()` which consults `CODELENS_EMBED_HINT_AUTO` +
436/// `CODELENS_EMBED_HINT_AUTO_LANG` for language-gated defaults.
437pub fn nl_tokens_enabled() -> bool {
438 if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_COMMENTS") {
439 return explicit;
440 }
441 auto_hint_should_enable()
442}
443
444/// Return true when v1.5 Phase 2j auto-detection mode is enabled.
445///
446/// **v1.6.0 default change (§8.14)**: this returns `true` by default.
447/// Users opt **out** with `CODELENS_EMBED_HINT_AUTO=0` (or `false` /
448/// `no` / `off`). The previous v1.5.x behaviour was the other way
449/// around — default OFF, opt in with `=1`. The flip ships as part of
450/// v1.6.0 after the five-dataset measurement (§8.7, §8.8, §8.13,
451/// §8.11, §8.12) validated:
452///
453/// 1. Rust / C / C++ / Go / Java / Kotlin / Scala / C# projects hit
454/// the §8.7 stacked arm (+2.4 % to +15.2 % hybrid MRR).
455/// 2. TypeScript / JavaScript projects validated the Phase 2b/2c
456/// embedding hints on `facebook/jest` and later `microsoft/typescript`.
457/// Subsequent app/runtime follow-ups (`vercel/next.js`,
458/// `facebook/react` production subtree) motivated splitting Phase 2e
459/// out of the JS/TS auto path, but not removing JS/TS from the
460/// embedding-hint default.
461/// 3. Python projects hit the §8.8 baseline (no change) — the
462/// §8.11 language gate + §8.12 MCP auto-set means Python is
463/// auto-detected and the stack stays OFF without user action.
464/// 4. Ruby / PHP / Lua / shell / untested-dynamic projects fall
465/// through to the conservative default-off branch (same as
466/// Python behaviour — no regression).
467///
468/// The dominant language is supplied by the MCP tool layer via the
469/// `CODELENS_EMBED_HINT_AUTO_LANG` env var, which is set
470/// automatically on startup (`main.rs`) and on MCP
471/// `activate_project` calls by `compute_dominant_language` (§8.12).
472/// The engine only reads the env var — it does not walk the
473/// filesystem itself.
474///
475/// Explicit `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` /
476/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` /
477/// `CODELENS_RANK_SPARSE_TERM_WEIGHT=1` (or their `=0` counterparts)
478/// always win over the auto decision — users who want to force a
479/// configuration still can, the auto mode is a better default, not
480/// a lock-in.
481///
482/// **Opt-out**: set `CODELENS_EMBED_HINT_AUTO=0` to restore v1.5.x
483/// behaviour (no auto-detection, all Phase 2 gates default off unless
484/// their individual env vars are set).
485pub fn auto_hint_mode_enabled() -> bool {
486 parse_bool_env("CODELENS_EMBED_HINT_AUTO").unwrap_or(true)
487}
488
/// Return the language tag supplied by the MCP tool layer via
/// `CODELENS_EMBED_HINT_AUTO_LANG` (trimmed, ASCII-lowercased), or `None`
/// when unset. The tag feeds `language_supports_nl_stack` to decide
/// whether the Phase 2b / 2c / 2e stack should be auto-enabled.
///
/// Accepted tags are the canonical extensions from
/// `crates/codelens-engine/src/lang_config.rs` (`rs`, `py`, `js`, `ts`,
/// `go`, `rb`, `java`, `kt`, `scala`, `cs`, `cpp`, `c`, …) plus long-form
/// aliases (`rust`, `python`, `javascript`, `typescript`, `golang`) for
/// users who set the env var by hand.
pub fn auto_hint_lang() -> Option<String> {
    let raw = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok()?;
    Some(raw.trim().to_ascii_lowercase())
}
504
/// Return true when `lang` is a language where the v1.5 embedding-hint
/// stack (Phase 2b comments + Phase 2c API-call extraction) measured
/// net-positive (§8.2, §8.4, §8.6, §8.7, §8.13, §8.15), or where static
/// typing + snake_case naming + a comment-first culture makes the
/// mechanism behave the way it does on Rust.
///
/// Intentionally separate from the Phase 2e sparse re-ranker gate. Per the
/// §8.15 / §8.16 / §8.17 follow-up arc, JS/TS stays enabled *here* —
/// tooling/compiler repos are positive and short-file runtime repos are
/// inert — but JS/TS is excluded from the **sparse** auto-gate because
/// Phase 2e is negative-or-null on that family.
///
/// The list is intentionally conservative — additions require an actual
/// external-repo A/B following the §8.7 methodology, not a
/// language-similarity argument alone.
///
/// **Supported** (measured or by static-typing analogy):
/// - `rs`, `rust` (§8.2, §8.4, §8.6, §8.7: +2.4 %, +7.1 %, +15.2 %)
/// - `cpp`, `cc`, `cxx`, `c++`, `c`
/// - `go`, `golang`
/// - `java`, `kt`, `kotlin`, `scala`, `cs`, `csharp`
/// - `ts`, `typescript`, `tsx` (§8.13: `facebook/jest` +7.3 % hybrid MRR)
/// - `js`, `javascript`, `jsx`
///
/// **Unsupported** (measured regression or untested dynamic-typed):
/// `py`/`python` (§8.8 regression), `rb`/`ruby`, `php`, `lua`, `r`, `jl`,
/// `sh`/`bash`, and anything else.
pub fn language_supports_nl_stack(lang: &str) -> bool {
    // Canonical extensions plus long-form aliases accepted by this gate.
    const SUPPORTED: &[&str] = &[
        "rs", "rust", "cpp", "cc", "cxx", "c++", "c", "go", "golang", "java",
        "kt", "kotlin", "scala", "cs", "csharp", "ts", "typescript", "tsx",
        "js", "javascript", "jsx",
    ];
    let normalized = lang.trim().to_ascii_lowercase();
    SUPPORTED.contains(&normalized.as_str())
}
565
/// Return true when `lang` is a language where the Phase 2e sparse
/// coverage re-ranker should be auto-enabled when the user has not set
/// `CODELENS_RANK_SPARSE_TERM_WEIGHT` explicitly.
///
/// Deliberately narrower than `language_supports_nl_stack`. Phase 2e
/// remains positive on Rust-style codebases, but the JS/TS measurement
/// arc says:
///
/// - `facebook/jest`: marginal positive
/// - `microsoft/typescript`: negative
/// - `vercel/next.js`: slight negative
/// - `facebook/react` production subtree: exact no-op
///
/// Conservative Phase 2m policy: keep Phase 2b/2c auto-eligible on JS/TS,
/// disable **auto** Phase 2e on JS/TS, preserve the explicit env override
/// for users who want to force it on.
pub fn language_supports_sparse_weighting(lang: &str) -> bool {
    // Same alias scheme as the NL-stack gate, minus the JS/TS family.
    const SUPPORTED: &[&str] = &[
        "rs", "rust", "cpp", "cc", "cxx", "c++", "c", "go", "golang",
        "java", "kt", "kotlin", "scala", "cs", "csharp",
    ];
    let normalized = lang.trim().to_ascii_lowercase();
    SUPPORTED.contains(&normalized.as_str())
}
602
603/// Combined decision: Phase 2j auto mode is enabled AND the detected
604/// language supports the Phase 2b/2c embedding-hint stack. This is the
605/// `else` branch that `nl_tokens_enabled` and `api_calls_enabled` fall
606/// through to when no explicit env var is set.
607pub fn auto_hint_should_enable() -> bool {
608 if !auto_hint_mode_enabled() {
609 return false;
610 }
611 match auto_hint_lang() {
612 Some(lang) => language_supports_nl_stack(&lang),
613 None => false, // auto mode on but no language tag → conservative OFF
614 }
615}
616
617/// Combined decision: Phase 2j auto mode is enabled AND the detected
618/// language supports auto-enabling the Phase 2e sparse re-ranker.
619///
620/// This intentionally differs from `auto_hint_should_enable()` after the
621/// §8.15 / §8.16 / §8.17 JS/TS follow-up arc: embedding hints stay
622/// auto-on for JS/TS, but sparse weighting does not.
623pub fn auto_sparse_should_enable() -> bool {
624 if !auto_hint_mode_enabled() {
625 return false;
626 }
627 match auto_hint_lang() {
628 Some(lang) => language_supports_sparse_weighting(&lang),
629 None => false,
630 }
631}
632
/// Heuristic: does this string look like natural language rather than a
/// code identifier, path, or numeric literal?
///
/// Criteria (applied to the trimmed input):
/// - at least 4 characters
/// - no path / scope separators (`/`, `\`, `::`)
/// - contains a space (multi-word)
/// - alphabetic characters are >= 60% of the non-whitespace characters
pub fn is_nl_shaped(s: &str) -> bool {
    let text = s.trim();
    // Cheap structural rejections first.
    if text.chars().count() < 4
        || text.contains('/')
        || text.contains('\\')
        || text.contains("::")
        || !text.contains(' ')
    {
        return false;
    }
    let non_ws = text.chars().filter(|c| !c.is_whitespace()).count();
    if non_ws == 0 {
        return false;
    }
    let alphabetic = text.chars().filter(|c| c.is_alphabetic()).count();
    // Integer ratio check: alphabetic / non-whitespace >= 60%.
    (alphabetic * 100) / non_ws >= 60
}
659
/// Return true when the v1.5 Phase 2i strict comment filter is enabled
/// via `CODELENS_EMBED_HINT_STRICT_COMMENTS=1` (or `true`/`yes`/`on`,
/// case-insensitive). Unset or any other value → `false`.
///
/// Phase 2i extends Phase 2h (§8.9) with a comment-side analogue of the
/// literal filter: Phase 2h recovered ~8 % of the Python regression by
/// rejecting format/error/log string literals in Pass 2; Phase 2i targets
/// the remaining ~92 % by rejecting meta-annotation comments (`# TODO`,
/// `# FIXME`, `# HACK`, `# XXX`, `# BUG`, `# REVIEW`, `# REFACTOR`,
/// `# TEMP`, `# DEPRECATED`) in Pass 1. Conservative prefix list —
/// `# NOTE`, `# WARN`, `# SAFETY` are retained because they often carry
/// behaviour-descriptive content even on Rust.
///
/// Default OFF (same policy as every Phase 2 knob). Orthogonal to
/// `CODELENS_EMBED_HINT_STRICT_LITERALS` so both may be stacked.
pub fn strict_comments_enabled() -> bool {
    matches!(
        std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS")
            .map(|raw| raw.to_ascii_lowercase())
            .as_deref(),
        Ok("1") | Ok("true") | Ok("yes") | Ok("on")
    )
}
682
/// Heuristic: does `body` (the comment text *after* the `//` / `#` prefix
/// has been stripped by `extract_comment_body`) look like a meta-annotation
/// rather than behaviour-descriptive prose?
///
/// Recognised markers (case-insensitive first word, terminated by the
/// first non-ASCII-alphabetic character such as `:`, `(`, digits, or
/// whitespace):
/// - `TODO`, `FIXME`, `HACK`, `XXX`, `BUG`
/// - `REVIEW`, `REFACTOR`, `TEMP`, `TEMPORARY`, `DEPRECATED`
///
/// Deliberately excluded (kept as behaviour signal): `NOTE`/`NOTES`,
/// `WARN`/`WARNING`, `SAFETY` (Rust `unsafe` justifications), `PANIC`
/// (Rust invariant docs). Rust projects use `// SAFETY:` / `// NOTE:` to
/// document *why* a block behaves a certain way — exactly the NL retrieval
/// signal Phase 2b is after — while the included markers are the "fix this
/// later" noise that poisons the embedding, especially on mature Python
/// projects.
pub fn looks_like_meta_annotation(body: &str) -> bool {
    let text = body.trim_start();
    // First word = leading run of ASCII letters; anything else ends it.
    let marker_len = text
        .find(|c: char| !c.is_ascii_alphabetic())
        .unwrap_or(text.len());
    if marker_len == 0 {
        return false;
    }
    const MARKERS: &[&str] = &[
        "TODO", "FIXME", "HACK", "XXX", "BUG",
        "REVIEW", "REFACTOR", "TEMP", "TEMPORARY", "DEPRECATED",
    ];
    let marker = text[..marker_len].to_ascii_uppercase();
    MARKERS.contains(&marker.as_str())
}
729
/// Return true when the v1.5 Phase 2h strict NL literal filter is enabled
/// via `CODELENS_EMBED_HINT_STRICT_LITERALS=1` (or `true`/`yes`/`on`,
/// case-insensitive). Unset or any other value → `false`.
///
/// Phase 2h addresses the Phase 3b Python regression (§8.8). The default
/// Phase 2b Pass 2 scanner accepts any `is_nl_shaped` string literal from
/// the body, which on Python captures a lot of generic error / log / format
/// strings (`raise ValueError("Invalid URL %s" % url)`, `logging.debug(...)`,
/// `fmt.format(...)`). These pass the NL-shape test but carry zero
/// behaviour-descriptive signal and pollute the embedding. The strict
/// filter rejects string literals that look like format templates or
/// common error / log prefixes, while leaving comments (Pass 1) untouched.
///
/// Default OFF (same policy as every Phase 2 knob — opt-in first,
/// measure, then consider flipping the default).
pub fn strict_literal_filter_enabled() -> bool {
    matches!(
        std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS")
            .map(|raw| raw.to_ascii_lowercase())
            .as_deref(),
        Ok("1") | Ok("true") | Ok("yes") | Ok("on")
    )
}
752
/// Heuristic: does `s` contain a C / Python / Rust format specifier?
///
/// Recognises:
/// - C / Python `%` style: `%s`, `%d`, `%r`, `%f`, `%x`, `%o`, `%i`, `%u`
/// - Python `.format` / f-string style: `{name}`, `{0}`, `{:fmt}`, `{name:fmt}`
///
/// Rust `format!` / `println!` style `{}` / `{:?}` / `{name}` is caught by
/// the same `{...}` branch. Generic JSON-like braces (e.g.
/// `"{name: foo, id: 1}"`) are distinguished from placeholders by
/// requiring the inside to be empty, prefix-colon (`:fmt`), a single
/// identifier, or an identifier followed by `:fmt` — real placeholders
/// never contain whitespace inside the braces, JSON-like content does.
pub fn contains_format_specifier(s: &str) -> bool {
    // C / Python `%`-style: a '%' byte immediately followed by a
    // conversion character.
    let percent_hit = s.as_bytes().windows(2).any(|pair| {
        pair[0] == b'%'
            && matches!(pair[1], b's' | b'd' | b'r' | b'f' | b'x' | b'o' | b'i' | b'u')
    });
    if percent_hit {
        return true;
    }

    // Brace-style placeholders: inspect each `{...}` span. The whitespace
    // check is the simplest robust way to reject JSON-like content without
    // a full format-spec parser.
    for segment in s.split('{').skip(1) {
        let Some(close) = segment.find('}') else {
            continue;
        };
        let inner = &segment[..close];
        // `{}` — Rust empty placeholder.
        if inner.is_empty() {
            return true;
        }
        // Whitespace inside the braces → JSON-like, not a format spec.
        if inner.chars().any(char::is_whitespace) {
            continue;
        }
        // `{:fmt}` — anonymous format spec.
        if inner.starts_with(':') {
            return true;
        }
        // `{name}`, `{0}`, `{name:fmt}` — identifier (or digit), optionally
        // followed by `:fmt`; whitespace was already rejected above.
        let name_end = inner.find(':').unwrap_or(inner.len());
        let name = &inner[..name_end];
        let is_identifier = !name.is_empty()
            && name
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.');
        if is_identifier {
            return true;
        }
    }
    false
}
816
/// Heuristic: does `s` look like a generic error message, log line, or
/// low-value imperative string that an NL query would never try to match?
///
/// The prefix list is intentionally short — covering the patterns the
/// Phase 3b `psf/requests` post-mortem flagged as the largest regression
/// sources. False negatives (real behaviour strings misclassified as
/// errors) would cost retrieval quality, but because the filter only runs
/// on string literals and leaves comments alone, a missed NL string in one
/// symbol will typically have a comment covering the same behaviour on the
/// same symbol.
pub fn looks_like_error_or_log_prefix(s: &str) -> bool {
    // Case-insensitive prefix match against the trimmed input.
    const PREFIXES: &[&str] = &[
        "invalid ",
        "cannot ",
        "could not ",
        "unable to ",
        "failed to ",
        "expected ",
        "unexpected ",
        "missing ",
        "not found",
        "error: ",
        "error ",
        "warning: ",
        "warning ",
        "sending ",
        "received ",
        "starting ",
        "stopping ",
        "calling ",
        "connecting ",
        "disconnecting ",
    ];
    let normalized = s.trim().to_lowercase();
    PREFIXES.iter().any(|prefix| normalized.starts_with(prefix))
}
853
/// Test-only variant: bypass the env gate so the unit tests can exercise
/// the filter logic deterministically (mirrors `extract_nl_tokens_inner`
/// vs `extract_nl_tokens` policy). Inlined here instead of a `#[cfg(test)]`
/// helper so the release binary path never calls it.
///
/// A literal is rejected when *either* Phase 2h check fires: it contains a
/// format specifier, or it starts with a generic error/log prefix.
#[cfg(test)]
pub fn should_reject_literal_strict(s: &str) -> bool {
    contains_format_specifier(s) || looks_like_error_or_log_prefix(s)
}
862
863/// Collect natural-language tokens from a function body: line comments,
864/// block comments, and string literals that look like NL prose.
865///
866/// v1.5 Phase 2b experiment. The hypothesis is that the bundled
867/// CodeSearchNet-INT8 model struggles with NL queries (hybrid MRR 0.472)
868/// because the symbol text it sees is pure code, whereas NL queries target
869/// behavioural descriptions that live in *comments* and *string literals*.
870///
871/// Unlike `extract_body_hint` (which skips comments) this function only
872/// keeps comments + NL-shaped string literals and ignores actual code.
873///
874/// Gated by `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1`. Returns `None` when
875/// the gate is off so the default embedding text is untouched.
876pub fn extract_nl_tokens(source: &str, start: usize, end: usize) -> Option<String> {
877 if !nl_tokens_enabled() {
878 return None;
879 }
880 extract_nl_tokens_inner(source, start, end)
881}
882
883/// Env-independent core of `extract_nl_tokens`, exposed to the test module
884/// so unit tests can run deterministically without touching env vars
885/// (which would race with the other tests that set
886/// `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`).
887pub fn extract_nl_tokens_inner(source: &str, start: usize, end: usize) -> Option<String> {
888 if start >= source.len() || end > source.len() || start >= end {
889 return None;
890 }
891 let safe_start = if source.is_char_boundary(start) {
892 start
893 } else {
894 source.floor_char_boundary(start)
895 };
896 let safe_end = end.min(source.len());
897 let safe_end = if source.is_char_boundary(safe_end) {
898 safe_end
899 } else {
900 source.floor_char_boundary(safe_end)
901 };
902 let body = &source[safe_start..safe_end];
903
904 let mut tokens: Vec<String> = Vec::new();
905
906 // ── Pass 1: comments ─────────────────────────────────────────────
907 // v1.5 Phase 2i: when CODELENS_EMBED_HINT_STRICT_COMMENTS=1 is set,
908 // reject meta-annotation comments (`# TODO`, `# FIXME`, `# HACK`,
909 // ...) while keeping behaviour-descriptive comments untouched. This
910 // is the comment-side analogue of the Phase 2h literal filter
911 // (§8.9) and targets the remaining ~92 % of the Python regression
912 // that Phase 2h's literal-only filter left behind.
913 let strict_comments = strict_comments_enabled();
914 for line in body.lines() {
915 let trimmed = line.trim();
916 if let Some(cleaned) = extract_comment_body(trimmed)
917 && is_nl_shaped(&cleaned)
918 && (!strict_comments || !looks_like_meta_annotation(&cleaned))
919 {
920 tokens.push(cleaned);
921 }
922 }
923
924 // ── Pass 2: double-quoted string literals ────────────────────────
925 // Simplified scanner — handles escape sequences but does not track
926 // multi-line strings or raw strings. Good enough for NL-shaped
927 // heuristic filtering where false negatives are acceptable.
928 //
929 // v1.5 Phase 2h: when CODELENS_EMBED_HINT_STRICT_LITERALS=1 is set,
930 // also reject format templates and generic error / log prefixes. This
931 // addresses the Phase 3b Python regression documented in §8.8 —
932 // comments (Pass 1) stay untouched so Rust projects keep their wins.
933 let strict_literals = strict_literal_filter_enabled();
934 let mut chars = body.chars().peekable();
935 let mut in_string = false;
936 let mut current = String::new();
937 while let Some(c) = chars.next() {
938 if in_string {
939 if c == '\\' {
940 // Skip escape sequence
941 let _ = chars.next();
942 } else if c == '"' {
943 if is_nl_shaped(¤t)
944 && (!strict_literals
945 || (!contains_format_specifier(¤t)
946 && !looks_like_error_or_log_prefix(¤t)))
947 {
948 tokens.push(current.clone());
949 }
950 current.clear();
951 in_string = false;
952 } else {
953 current.push(c);
954 }
955 } else if c == '"' {
956 in_string = true;
957 }
958 }
959
960 if tokens.is_empty() {
961 return None;
962 }
963 Some(join_hint_lines(&tokens))
964}
965
966/// Return true when API-call extraction is enabled via
967/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` (or `true`/`yes`/`on`).
968///
969/// v1.5 Phase 2c infrastructure — kept off by default pending A/B
970/// measurement. Orthogonal to `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`
971/// so both may be stacked.
972///
973/// v1.5 Phase 2j: explicit env > auto mode, same policy as Phase 2b.
974pub fn api_calls_enabled() -> bool {
975 if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_API_CALLS") {
976 return explicit;
977 }
978 auto_hint_should_enable()
979}
980
981/// Heuristic: does `ident` look like a Rust/C++ *type* (PascalCase) rather
982/// than a module or free function (snake_case)?
983///
984/// Phase 2c API-call extractor relies on this filter to keep the hint
985/// focused on static-method call sites (`Parser::new`, `HashMap::with_capacity`)
986/// and drop module-scoped free functions (`std::fs::read_to_string`).
987/// We intentionally accept only an ASCII uppercase first letter; stricter
988/// than PascalCase detection but deliberate — the goal is high-precision
989/// Type filtering, not lexical accuracy.
990pub fn is_static_method_ident(ident: &str) -> bool {
991 ident.chars().next().is_some_and(|c| c.is_ascii_uppercase())
992}
993
994/// Collect `Type::method` call sites from a function body.
995///
996/// v1.5 Phase 2c experiment. Hypothesis: exposing the Types a function
997/// interacts with (via their static-method call sites) adds a lexical
998/// bridge between NL queries ("parse json", "open database") and symbols
999/// whose body references the relevant type (`Parser::new`, `Connection::open`).
1000/// This is orthogonal to Phase 2b (comments + NL-shaped literals), which
1001/// targets *explanatory* natural language rather than *type* hints.
1002///
1003/// Gated by `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1`. Returns `None` when
1004/// the gate is off so the default embedding text is untouched.
1005pub fn extract_api_calls(source: &str, start: usize, end: usize) -> Option<String> {
1006 if !api_calls_enabled() {
1007 return None;
1008 }
1009 extract_api_calls_inner(source, start, end)
1010}
1011
1012/// Env-independent core of `extract_api_calls`, exposed to the test module
1013/// so unit tests can run deterministically without touching env vars
1014/// (which would race with other tests that set
1015/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS`).
1016///
1017/// Scans the body for `Type::method` byte patterns where:
1018/// - `Type` starts with an ASCII uppercase letter and consists of
1019/// `[A-Za-z0-9_]*` (plain ASCII — non-ASCII identifiers are skipped
1020/// on purpose to minimise noise).
1021/// - `method` is any identifier (start `[A-Za-z_]`, continue `[A-Za-z0-9_]*`).
1022///
1023/// Duplicate `Type::method` pairs collapse into a single entry to avoid
1024/// biasing the embedding toward repeated calls in hot loops.
1025pub fn extract_api_calls_inner(source: &str, start: usize, end: usize) -> Option<String> {
1026 if start >= source.len() || end > source.len() || start >= end {
1027 return None;
1028 }
1029 let safe_start = if source.is_char_boundary(start) {
1030 start
1031 } else {
1032 source.floor_char_boundary(start)
1033 };
1034 let safe_end = end.min(source.len());
1035 let safe_end = if source.is_char_boundary(safe_end) {
1036 safe_end
1037 } else {
1038 source.floor_char_boundary(safe_end)
1039 };
1040 if safe_start >= safe_end {
1041 return None;
1042 }
1043 let body = &source[safe_start..safe_end];
1044 let bytes = body.as_bytes();
1045 let len = bytes.len();
1046
1047 let mut calls: Vec<String> = Vec::new();
1048 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
1049
1050 let mut i = 0usize;
1051 while i < len {
1052 let b = bytes[i];
1053 // Walk forward until we find the start of an ASCII identifier.
1054 if !(b == b'_' || b.is_ascii_alphabetic()) {
1055 i += 1;
1056 continue;
1057 }
1058 let ident_start = i;
1059 while i < len {
1060 let bb = bytes[i];
1061 if bb == b'_' || bb.is_ascii_alphanumeric() {
1062 i += 1;
1063 } else {
1064 break;
1065 }
1066 }
1067 let ident_end = i;
1068
1069 // Must be immediately followed by `::`.
1070 if i + 1 >= len || bytes[i] != b':' || bytes[i + 1] != b':' {
1071 continue;
1072 }
1073
1074 let type_ident = &body[ident_start..ident_end];
1075 if !is_static_method_ident(type_ident) {
1076 // `snake_module::foo` — not a Type. Skip past the `::` so we
1077 // don't rescan the same characters, but keep walking.
1078 i += 2;
1079 continue;
1080 }
1081
1082 // Skip the `::`
1083 let mut j = i + 2;
1084 if j >= len || !(bytes[j] == b'_' || bytes[j].is_ascii_alphabetic()) {
1085 i = j;
1086 continue;
1087 }
1088 let method_start = j;
1089 while j < len {
1090 let bb = bytes[j];
1091 if bb == b'_' || bb.is_ascii_alphanumeric() {
1092 j += 1;
1093 } else {
1094 break;
1095 }
1096 }
1097 let method_end = j;
1098
1099 let method_ident = &body[method_start..method_end];
1100 let call = format!("{type_ident}::{method_ident}");
1101 if seen.insert(call.clone()) {
1102 calls.push(call);
1103 }
1104 i = j;
1105 }
1106
1107 if calls.is_empty() {
1108 return None;
1109 }
1110 Some(join_hint_lines(&calls))
1111}
1112
1113/// Peel the comment prefix off a trimmed line, returning the inner text
1114/// if the line is recognisably a `//`, `#`, `/* */`, or leading-`*` comment.
1115pub fn extract_comment_body(trimmed: &str) -> Option<String> {
1116 if trimmed.is_empty() {
1117 return None;
1118 }
1119 // `//` and `///` and `//!` (Rust doc comments)
1120 if let Some(rest) = trimmed.strip_prefix("///") {
1121 return Some(rest.trim().to_string());
1122 }
1123 if let Some(rest) = trimmed.strip_prefix("//!") {
1124 return Some(rest.trim().to_string());
1125 }
1126 if let Some(rest) = trimmed.strip_prefix("//") {
1127 return Some(rest.trim().to_string());
1128 }
1129 // `#[...]` attribute, `#!...` shebang — NOT comments
1130 if trimmed.starts_with("#[") || trimmed.starts_with("#!") {
1131 return None;
1132 }
1133 // `#` line comment (Python, bash, ...)
1134 if let Some(rest) = trimmed.strip_prefix('#') {
1135 return Some(rest.trim().to_string());
1136 }
1137 // Block-comment line: `/**`, `/*`, or continuation `*`
1138 if let Some(rest) = trimmed.strip_prefix("/**") {
1139 return Some(rest.trim_end_matches("*/").trim().to_string());
1140 }
1141 if let Some(rest) = trimmed.strip_prefix("/*") {
1142 return Some(rest.trim_end_matches("*/").trim().to_string());
1143 }
1144 if let Some(rest) = trimmed.strip_prefix('*') {
1145 // Block-comment continuation. Only accept if the rest looks textual
1146 // (avoid e.g. `*const T` pointer types).
1147 let rest = rest.trim_end_matches("*/").trim();
1148 if rest.is_empty() {
1149 return None;
1150 }
1151 // Reject obvious code continuations
1152 if rest.contains(';') || rest.contains('{') {
1153 return None;
1154 }
1155 return Some(rest.to_string());
1156 }
1157 None
1158}
1159
1160/// Extract the leading docstring or comment block from a symbol's body.
1161/// Supports: Python triple-quote, Rust //!//// doc comments, JS/TS /** */ blocks.
1162pub fn extract_leading_doc(source: &str, start: usize, end: usize) -> Option<String> {
1163 if start >= source.len() || end > source.len() || start >= end {
1164 return None;
1165 }
1166 // Clamp to nearest char boundary to avoid panicking on multi-byte UTF-8
1167 let safe_start = if source.is_char_boundary(start) {
1168 start
1169 } else {
1170 source.floor_char_boundary(start)
1171 };
1172 let safe_end = end.min(source.len());
1173 let safe_end = if source.is_char_boundary(safe_end) {
1174 safe_end
1175 } else {
1176 source.floor_char_boundary(safe_end)
1177 };
1178 if safe_start >= safe_end {
1179 return None;
1180 }
1181 let body = &source[safe_start..safe_end];
1182 let lines: Vec<&str> = body.lines().skip(1).collect(); // skip the signature line
1183 if lines.is_empty() {
1184 return None;
1185 }
1186
1187 let mut doc_lines = Vec::new();
1188
1189 // Python: triple-quote docstrings
1190 let first_trimmed = lines.first().map(|l| l.trim()).unwrap_or_default();
1191 if first_trimmed.starts_with("\"\"\"") || first_trimmed.starts_with("'''") {
1192 let quote = &first_trimmed[..3];
1193 for line in &lines {
1194 let t = line.trim();
1195 doc_lines.push(t.trim_start_matches(quote).trim_end_matches(quote));
1196 if doc_lines.len() > 1 && t.ends_with(quote) {
1197 break;
1198 }
1199 }
1200 }
1201 // Rust: /// or //! doc comments (before the body, captured by tree-sitter)
1202 else if first_trimmed.starts_with("///") || first_trimmed.starts_with("//!") {
1203 for line in &lines {
1204 let t = line.trim();
1205 if t.starts_with("///") || t.starts_with("//!") {
1206 doc_lines.push(t.trim_start_matches("///").trim_start_matches("//!").trim());
1207 } else {
1208 break;
1209 }
1210 }
1211 }
1212 // JS/TS: /** ... */ block comments
1213 else if first_trimmed.starts_with("/**") {
1214 for line in &lines {
1215 let t = line.trim();
1216 let cleaned = t
1217 .trim_start_matches("/**")
1218 .trim_start_matches('*')
1219 .trim_end_matches("*/")
1220 .trim();
1221 if !cleaned.is_empty() {
1222 doc_lines.push(cleaned);
1223 }
1224 if t.ends_with("*/") {
1225 break;
1226 }
1227 }
1228 }
1229 // Generic: leading // or # comment block
1230 else {
1231 for line in &lines {
1232 let t = line.trim();
1233 if t.starts_with("//") || t.starts_with('#') {
1234 doc_lines.push(t.trim_start_matches("//").trim_start_matches('#').trim());
1235 } else {
1236 break;
1237 }
1238 }
1239 }
1240
1241 if doc_lines.is_empty() {
1242 return None;
1243 }
1244 Some(doc_lines.join(" ").trim().to_owned())
1245}