// codelens_engine/embedding/mod.rs
1//! Semantic search using fastembed + sqlite-vec.
2//! Gated behind the `semantic` feature flag.
3
4use crate::db::IndexDb;
5use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
6use crate::project::ProjectRoot;
7use anyhow::{Context, Result};
8#[cfg(target_os = "macos")]
9use fastembed::ExecutionProviderDispatch;
10use fastembed::{InitOptionsUserDefined, TextEmbedding, TokenizerFiles, UserDefinedEmbeddingModel};
11use rusqlite::Connection;
12use serde::Serialize;
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::sync::{Arc, Mutex, Once};
15use std::thread::available_parallelism;
16use tracing::debug;
17
18/// Isolated unsafe FFI — the only module allowed to use `unsafe`.
/// Isolated unsafe FFI — the only module allowed to use `unsafe`.
pub(super) mod ffi {
    use anyhow::Result;

    /// Register the sqlite-vec extension so every SQLite connection opened
    /// in this process afterwards has the extension available.
    ///
    /// # Errors
    /// Fails when `sqlite3_auto_extension` returns a non-OK result code.
    pub fn register_sqlite_vec() -> Result<()> {
        let rc = unsafe {
            // SAFETY: `sqlite3_vec_init` is an SQLite extension entry point;
            // the transmute only re-types the function pointer to the
            // signature `sqlite3_auto_extension` expects — it does not change
            // the callee. NOTE(review): the message parameter is spelled
            // `*mut *mut i8`, but `c_char` is `u8` on some targets (e.g.
            // aarch64 Linux) — confirm rusqlite's expected signature there.
            rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute::<
                *const (),
                unsafe extern "C" fn(
                    *mut rusqlite::ffi::sqlite3,
                    *mut *mut i8,
                    *const rusqlite::ffi::sqlite3_api_routines,
                ) -> i32,
            >(
                sqlite_vec::sqlite3_vec_init as *const ()
            )))
        };
        if rc != rusqlite::ffi::SQLITE_OK {
            anyhow::bail!("failed to register sqlite-vec extension (SQLite error code: {rc})");
        }
        Ok(())
    }

    /// Read a `c_uint`-sized sysctl value by name (macOS only).
    ///
    /// `name` must be a NUL-terminated byte string (callers pass `b"...\0"`
    /// literals). Returns `None` when the call fails or the kernel reports
    /// a value of a different size than `c_uint`.
    #[cfg(target_os = "macos")]
    pub fn sysctl_usize(name: &[u8]) -> Option<usize> {
        let mut value: libc::c_uint = 0;
        let mut size = std::mem::size_of::<libc::c_uint>();
        // SAFETY: `value` and `size` are valid for writes, and `size` starts
        // at the exact byte size of `value`, so the kernel cannot overrun it.
        let rc = unsafe {
            libc::sysctlbyname(
                name.as_ptr().cast(),
                (&mut value as *mut libc::c_uint).cast(),
                &mut size,
                std::ptr::null_mut(),
                0,
            )
        };
        (rc == 0 && size == std::mem::size_of::<libc::c_uint>()).then_some(value as usize)
    }
}
57
/// Result of a semantic search query.
#[derive(Debug, Clone, Serialize)]
pub struct SemanticMatch {
    /// Path of the file containing the matched symbol.
    pub file_path: String,
    /// Name of the matched symbol.
    pub symbol_name: String,
    /// Symbol kind, as stored in the index.
    pub kind: String,
    /// Line of the symbol (copied from the scored chunk).
    pub line: usize,
    /// Signature text of the symbol.
    pub signature: String,
    /// Name path of the symbol, as stored.
    pub name_path: String,
    /// Relevance score carried over from the scored chunk.
    pub score: f64,
}
69
70impl From<ScoredChunk> for SemanticMatch {
71    fn from(c: ScoredChunk) -> Self {
72        Self {
73            file_path: c.file_path,
74            symbol_name: c.symbol_name,
75            kind: c.kind,
76            line: c.line,
77            signature: c.signature,
78            name_path: c.name_path,
79            score: c.score,
80        }
81    }
82}
83
84mod vec_store;
85use vec_store::SqliteVecStore;
86
/// Composite key `(file_path, symbol_name, kind, signature, name_path, text)`
/// identifying an embedding that can be reused instead of recomputed.
type ReusableEmbeddingKey = (String, String, String, String, String, String);

/// Build the reuse key from a chunk's identifying parts, taking ownership
/// of each component.
fn reusable_embedding_key(
    file_path: &str,
    symbol_name: &str,
    kind: &str,
    signature: &str,
    name_path: &str,
    text: &str,
) -> ReusableEmbeddingKey {
    let parts = [file_path, symbol_name, kind, signature, name_path, text];
    let [a, b, c, d, e, f] = parts.map(|part| part.to_owned());
    (a, b, c, d, e, f)
}
106
107fn reusable_embedding_key_for_chunk(chunk: &EmbeddingChunk) -> ReusableEmbeddingKey {
108    reusable_embedding_key(
109        &chunk.file_path,
110        &chunk.symbol_name,
111        &chunk.kind,
112        &chunk.signature,
113        &chunk.name_path,
114        &chunk.text,
115    )
116}
117
118fn reusable_embedding_key_for_symbol(
119    sym: &crate::db::SymbolWithFile,
120    text: &str,
121) -> ReusableEmbeddingKey {
122    reusable_embedding_key(
123        &sym.file_path,
124        &sym.name,
125        &sym.kind,
126        &sym.signature,
127        &sym.name_path,
128        text,
129    )
130}
131
// ── EmbeddingEngine (facade) ──────────────────────────────────────────

/// Embedding batch size on non-macOS hosts (override: `CODELENS_EMBED_BATCH_SIZE`).
const DEFAULT_EMBED_BATCH_SIZE: usize = 128;
/// Embedding batch size on macOS. Currently equal to the generic default,
/// but kept as a separate knob so the platforms can be tuned independently.
const DEFAULT_MACOS_EMBED_BATCH_SIZE: usize = 128;
/// Default capacity of the text→vector LRU cache
/// (override: `CODELENS_EMBED_TEXT_CACHE_SIZE`).
const DEFAULT_TEXT_EMBED_CACHE_SIZE: usize = 256;
/// Larger cache default used on macOS.
const DEFAULT_MACOS_TEXT_EMBED_CACHE_SIZE: usize = 1024;
/// Output dimension of the bundled CodeSearchNet model.
const CODESEARCH_DIMENSION: usize = 384;
/// Cap on how many symbols get embedded (override: `CODELENS_MAX_EMBED_SYMBOLS`).
const DEFAULT_MAX_EMBED_SYMBOLS: usize = 50_000;
/// Chunk size when querying for changed files. NOTE(review): usage is
/// outside this view — confirm.
const CHANGED_FILE_QUERY_CHUNK: usize = 128;
/// Batch size for duplicate scans. NOTE(review): usage is outside this
/// view — confirm.
const DEFAULT_DUPLICATE_SCAN_BATCH_SIZE: usize = 128;
/// Guards the one-time global ORT environment initialization.
static ORT_ENV_INIT: Once = Once::new();

/// Default: CodeSearchNet (MiniLM-L12 fine-tuned on code, bundled ONNX INT8).
/// Override via `CODELENS_EMBED_MODEL` env var to use fastembed built-in models.
const CODESEARCH_MODEL_NAME: &str = "MiniLM-L12-CodeSearchNet-INT8";
147
/// Facade tying together the embedding model, the sqlite-vec store, and
/// a small text→vector cache.
pub struct EmbeddingEngine {
    /// ONNX embedding model; access is serialized through the Mutex.
    model: Mutex<TextEmbedding>,
    /// Persistent sqlite-vec store for chunk embeddings.
    store: SqliteVecStore,
    /// Name of the loaded model (default or `CODELENS_EMBED_MODEL` override).
    model_name: String,
    /// Snapshot of the runtime/provider configuration chosen at load time.
    runtime_info: EmbeddingRuntimeInfo,
    /// LRU cache of already-embedded texts (see `TextEmbeddingCache`).
    text_embed_cache: Mutex<TextEmbeddingCache>,
    /// Flag toggled around indexing runs. NOTE(review): the set/read sites
    /// are outside this view — confirm intended semantics.
    indexing: std::sync::atomic::AtomicBool,
}
156
/// Summary of the current embedding index, suitable for serialization.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct EmbeddingIndexInfo {
    /// Name of the embedding model the index was built with.
    pub model_name: String,
    /// Count of indexed symbols.
    pub indexed_symbols: usize,
}
162
/// Snapshot of how the embedding runtime is (or would be) configured.
/// The `coreml_*` fields are `Some` only when CoreML is in play (macOS with
/// a non-CPU preference); see `cpu_runtime_info` / `coreml_runtime_info`.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct EmbeddingRuntimeInfo {
    /// Requested provider: "cpu", "coreml", or "coreml_preferred".
    pub runtime_preference: String,
    /// Provider actually in use: "cpu", "coreml", or "not_loaded" when no
    /// model has been loaded yet.
    pub backend: String,
    /// Intra-op thread count configured for inference.
    pub threads: usize,
    /// Tokenizer max sequence length.
    pub max_length: usize,
    pub coreml_model_format: Option<String>,
    pub coreml_compute_units: Option<String>,
    pub coreml_static_input_shapes: Option<bool>,
    pub coreml_profile_compute_plan: Option<bool>,
    pub coreml_specialization_strategy: Option<String>,
    pub coreml_model_cache_dir: Option<String>,
    /// Why a CoreML load fell back to CPU, when it did.
    pub fallback_reason: Option<String>,
}
177
/// Small LRU cache from embedded text to its vector, used to avoid
/// re-embedding identical strings.
struct TextEmbeddingCache {
    capacity: usize,
    order: VecDeque<String>,
    entries: HashMap<String, Vec<f32>>,
}

impl TextEmbeddingCache {
    /// Create a cache keeping at most `capacity` entries; a capacity of
    /// zero disables caching entirely.
    fn new(capacity: usize) -> Self {
        TextEmbeddingCache {
            capacity,
            order: VecDeque::default(),
            entries: HashMap::default(),
        }
    }

    /// Return a copy of the cached vector for `key`, marking it
    /// most-recently-used on a hit.
    fn get(&mut self, key: &str) -> Option<Vec<f32>> {
        let hit = match self.entries.get(key) {
            Some(found) => found.clone(),
            None => return None,
        };
        self.touch(key);
        Some(hit)
    }

    /// Store `value` under `key`, then evict least-recently-used entries
    /// until the cache fits within its capacity.
    fn insert(&mut self, key: String, value: Vec<f32>) {
        if self.capacity == 0 {
            return;
        }

        self.entries.insert(key.clone(), value);
        self.touch(&key);

        while self.entries.len() > self.capacity {
            match self.order.pop_front() {
                Some(oldest) => {
                    self.entries.remove(&oldest);
                }
                None => break,
            }
        }
    }

    /// Move `key` to the most-recently-used end of the recency queue.
    /// Linear scan — O(len) per touch, acceptable at the small capacities
    /// used here.
    fn touch(&mut self, key: &str) {
        let existing = self.order.iter().position(|candidate| candidate == key);
        if let Some(index) = existing {
            self.order.remove(index);
        }
        self.order.push_back(key.to_owned());
    }
}
223
224/// Resolve the sidecar model directory.
225///
226/// Search order:
227/// 1. `$CODELENS_MODEL_DIR` env var (explicit override)
228/// 2. Next to the executable: `<exe_dir>/models/codesearch/`
229/// 3. User cache: `~/.cache/codelens/models/codesearch/`
230/// 4. Compile-time relative path (for development): `models/codesearch/` from crate root
231fn resolve_model_dir() -> Result<std::path::PathBuf> {
232    // Explicit override
233    if let Ok(dir) = std::env::var("CODELENS_MODEL_DIR") {
234        let p = std::path::PathBuf::from(dir).join("codesearch");
235        if p.join("model.onnx").exists() {
236            return Ok(p);
237        }
238    }
239
240    // Next to executable
241    if let Ok(exe) = std::env::current_exe()
242        && let Some(exe_dir) = exe.parent()
243    {
244        let p = exe_dir.join("models").join("codesearch");
245        if p.join("model.onnx").exists() {
246            return Ok(p);
247        }
248    }
249
250    // User cache
251    if let Some(home) = dirs_fallback() {
252        let p = home
253            .join(".cache")
254            .join("codelens")
255            .join("models")
256            .join("codesearch");
257        if p.join("model.onnx").exists() {
258            return Ok(p);
259        }
260    }
261
262    // Development: crate-relative path
263    let dev_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
264        .join("models")
265        .join("codesearch");
266    if dev_path.join("model.onnx").exists() {
267        return Ok(dev_path);
268    }
269
270    anyhow::bail!(
271        "CodeSearchNet model not found. Place model files in one of:\n\
272         - $CODELENS_MODEL_DIR/codesearch/\n\
273         - <executable>/models/codesearch/\n\
274         - ~/.cache/codelens/models/codesearch/\n\
275         Required files: model.onnx, tokenizer.json, config.json, special_tokens_map.json, tokenizer_config.json"
276    )
277}
278
/// Resolve the user's home directory from `$HOME`; `None` when unset.
fn dirs_fallback() -> Option<std::path::PathBuf> {
    let home = std::env::var_os("HOME")?;
    Some(std::path::PathBuf::from(home))
}
282
/// Parse a strictly positive integer from the named env var; `None` when
/// the var is unset, unparsable, or zero.
fn parse_usize_env(name: &str) -> Option<usize> {
    let raw = std::env::var(name).ok()?;
    match raw.trim().parse::<usize>() {
        Ok(value) if value > 0 => Some(value),
        _ => None,
    }
}
289
/// Parse a boolean from the named env var. Accepts 1/true/yes/on and
/// 0/false/no/off (case-insensitive, trimmed); anything else is `None`.
fn parse_bool_env(name: &str) -> Option<bool> {
    let raw = std::env::var(name).ok()?;
    match raw.trim().to_ascii_lowercase().as_str() {
        "1" | "true" | "yes" | "on" => Some(true),
        "0" | "false" | "no" | "off" => Some(false),
        _ => None,
    }
}
300
/// Number of performance cores on Apple silicon, falling back to the
/// physical-core count; `None` when neither sysctl yields a positive value.
#[cfg(target_os = "macos")]
fn apple_perf_cores() -> Option<usize> {
    let keys: [&[u8]; 2] = [b"hw.perflevel0.physicalcpu\0", b"hw.physicalcpu\0"];
    keys.iter()
        .filter_map(|key| ffi::sysctl_usize(key))
        .find(|count| *count > 0)
}

/// Non-macOS targets have no notion of Apple performance cores.
#[cfg(not(target_os = "macos"))]
fn apple_perf_cores() -> Option<usize> {
    None
}
312
/// Effective execution-provider preference, derived from
/// `CODELENS_EMBED_PROVIDER` and the target OS.
///
/// - `"cpu"`: forced CPU, or any CoreML request on a non-mac host.
/// - `"coreml"`: explicitly requested on macOS.
/// - `"coreml_preferred"`: macOS default when nothing (or an unrecognized
///   value) is requested.
pub fn configured_embedding_runtime_preference() -> String {
    let on_macos = cfg!(target_os = "macos");
    let requested = std::env::var("CODELENS_EMBED_PROVIDER")
        .map(|value| value.trim().to_ascii_lowercase())
        .ok();

    let preference = match requested.as_deref() {
        Some("cpu") => "cpu",
        Some("coreml") if on_macos => "coreml",
        Some("coreml") => "cpu",
        _ if on_macos => "coreml_preferred",
        _ => "cpu",
    };
    preference.to_string()
}
326
/// Publicly exposed embedding thread count; delegates to
/// `recommended_embed_threads`.
pub fn configured_embedding_threads() -> usize {
    recommended_embed_threads()
}
330
331fn configured_embedding_max_length() -> usize {
332    parse_usize_env("CODELENS_EMBED_MAX_LENGTH")
333        .unwrap_or(256)
334        .clamp(32, 512)
335}
336
337fn configured_embedding_text_cache_size() -> usize {
338    std::env::var("CODELENS_EMBED_TEXT_CACHE_SIZE")
339        .ok()
340        .and_then(|value| value.trim().parse::<usize>().ok())
341        .unwrap_or({
342            if cfg!(target_os = "macos") {
343                DEFAULT_MACOS_TEXT_EMBED_CACHE_SIZE
344            } else {
345                DEFAULT_TEXT_EMBED_CACHE_SIZE
346            }
347        })
348        .min(8192)
349}
350
/// Normalized CoreML compute-units setting from
/// `CODELENS_EMBED_COREML_COMPUTE_UNITS`; defaults to
/// `cpu_and_neural_engine` for anything unrecognized or unset.
#[cfg(target_os = "macos")]
fn configured_coreml_compute_units_name() -> String {
    let requested = std::env::var("CODELENS_EMBED_COREML_COMPUTE_UNITS")
        .map(|value| value.trim().to_ascii_lowercase())
        .unwrap_or_default();
    let resolved = match requested.as_str() {
        "all" => "all",
        "cpu" | "cpu_only" => "cpu_only",
        "gpu" | "cpu_and_gpu" => "cpu_and_gpu",
        _ => "cpu_and_neural_engine",
    };
    resolved.to_string()
}
367
/// Normalized CoreML model-format setting from
/// `CODELENS_EMBED_COREML_MODEL_FORMAT`; `mlprogram` unless a neural-network
/// format is explicitly requested.
#[cfg(target_os = "macos")]
fn configured_coreml_model_format_name() -> String {
    let requested = std::env::var("CODELENS_EMBED_COREML_MODEL_FORMAT")
        .map(|value| value.trim().to_ascii_lowercase())
        .unwrap_or_default();
    if matches!(requested.as_str(), "neuralnetwork" | "neural_network") {
        "neural_network".to_string()
    } else {
        "mlprogram".to_string()
    }
}
379
/// Whether to profile the CoreML compute plan
/// (`CODELENS_EMBED_COREML_PROFILE_PLAN`, off by default).
#[cfg(target_os = "macos")]
fn configured_coreml_profile_compute_plan() -> bool {
    matches!(
        parse_bool_env("CODELENS_EMBED_COREML_PROFILE_PLAN"),
        Some(true)
    )
}
384
/// Whether to use static input shapes with CoreML
/// (`CODELENS_EMBED_COREML_STATIC_INPUT_SHAPES`, on by default).
#[cfg(target_os = "macos")]
fn configured_coreml_static_input_shapes() -> bool {
    !matches!(
        parse_bool_env("CODELENS_EMBED_COREML_STATIC_INPUT_SHAPES"),
        Some(false)
    )
}
389
/// Normalized CoreML specialization strategy from
/// `CODELENS_EMBED_COREML_SPECIALIZATION`; `fast_prediction` unless
/// `default` is explicitly requested.
#[cfg(target_os = "macos")]
fn configured_coreml_specialization_strategy_name() -> String {
    let requested = std::env::var("CODELENS_EMBED_COREML_SPECIALIZATION")
        .map(|value| value.trim().to_ascii_lowercase())
        .unwrap_or_default();
    if requested == "default" {
        "default".to_string()
    } else {
        "fast_prediction".to_string()
    }
}
401
/// Directory for CoreML's compiled-model cache:
/// `~/.cache/codelens/coreml-cache/codesearch`, rooted in the temp dir
/// when `$HOME` is unavailable.
#[cfg(target_os = "macos")]
fn configured_coreml_model_cache_dir() -> std::path::PathBuf {
    let mut dir = dirs_fallback().unwrap_or_else(std::env::temp_dir);
    for part in [".cache", "codelens", "coreml-cache", "codesearch"] {
        dir.push(part);
    }
    dir
}
411
412fn recommended_embed_threads() -> usize {
413    if let Some(explicit) = parse_usize_env("CODELENS_EMBED_THREADS") {
414        return explicit.max(1);
415    }
416
417    let available = available_parallelism().map(|n| n.get()).unwrap_or(1);
418    if cfg!(target_os = "macos") {
419        apple_perf_cores()
420            .unwrap_or(available)
421            .min(available)
422            .clamp(1, 8)
423    } else {
424        available.div_ceil(2).clamp(1, 8)
425    }
426}
427
428fn embed_batch_size() -> usize {
429    parse_usize_env("CODELENS_EMBED_BATCH_SIZE").unwrap_or({
430        if cfg!(target_os = "macos") {
431            DEFAULT_MACOS_EMBED_BATCH_SIZE
432        } else {
433            DEFAULT_EMBED_BATCH_SIZE
434        }
435    })
436}
437
438fn max_embed_symbols() -> usize {
439    parse_usize_env("CODELENS_MAX_EMBED_SYMBOLS").unwrap_or(DEFAULT_MAX_EMBED_SYMBOLS)
440}
441
/// Set a process env var only when the user has not already set it, so
/// explicit user configuration always wins over our defaults.
fn set_env_if_unset(name: &str, value: impl Into<String>) {
    if std::env::var_os(name).is_none() {
        // SAFETY: we only set process-wide runtime knobs during one-time startup,
        // before the embedding session is initialized.
        unsafe {
            std::env::set_var(name, value.into());
        }
    }
}
451
/// One-time, best-effort runtime tuning for the embedding session: fills
/// in threading-related env vars (only when unset) and commits the global
/// ORT thread pool exactly once per process. Safe to call repeatedly.
fn configure_embedding_runtime() {
    let threads = recommended_embed_threads();
    let runtime_preference = configured_embedding_runtime_preference();

    // OpenMP-backed ORT builds ignore SessionBuilder::with_intra_threads, so set
    // the process knobs as well. Keep these best-effort and only fill defaults.
    set_env_if_unset("OMP_NUM_THREADS", threads.to_string());
    set_env_if_unset("OMP_WAIT_POLICY", "PASSIVE");
    set_env_if_unset("OMP_DYNAMIC", "FALSE");
    set_env_if_unset("TOKENIZERS_PARALLELISM", "false");
    if cfg!(target_os = "macos") {
        // Cap the Accelerate/vecLib thread pool to the same count on macOS.
        set_env_if_unset("VECLIB_MAXIMUM_THREADS", threads.to_string());
    }

    // The global ORT environment can only be committed once per process;
    // errors here are deliberately swallowed (best-effort tuning only).
    ORT_ENV_INIT.call_once(|| {
        let pool = ort::environment::GlobalThreadPoolOptions::default()
            .with_intra_threads(threads)
            .and_then(|pool| pool.with_inter_threads(1))
            .and_then(|pool| pool.with_spin_control(false));

        if let Ok(pool) = pool {
            let _ = ort::init()
                .with_name("codelens-embedding")
                .with_telemetry(false)
                .with_global_thread_pool(pool)
                .commit();
        }
    });

    debug!(
        threads,
        runtime_preference = %runtime_preference,
        "configured embedding runtime"
    );
}
487
488fn requested_embedding_model_override() -> Result<Option<String>> {
489    let env_model = std::env::var("CODELENS_EMBED_MODEL").ok();
490    let Some(model_id) = env_model else {
491        return Ok(None);
492    };
493    if model_id.is_empty() || model_id == CODESEARCH_MODEL_NAME {
494        return Ok(None);
495    }
496
497    #[cfg(feature = "model-bakeoff")]
498    {
499        return Ok(Some(model_id));
500    }
501
502    #[cfg(not(feature = "model-bakeoff"))]
503    {
504        anyhow::bail!(
505            "CODELENS_EMBED_MODEL={model_id} requires the `model-bakeoff` feature; \
506             rebuild the binary with `--features model-bakeoff` to run alternative model bake-offs"
507        );
508    }
509}
510
/// Describe the embedding runtime as currently configured, without
/// loading a model — `backend` is always reported as `"not_loaded"`.
pub fn configured_embedding_runtime_info() -> EmbeddingRuntimeInfo {
    let runtime_preference = configured_embedding_runtime_preference();
    let threads = configured_embedding_threads();

    #[cfg(target_os = "macos")]
    {
        // CoreML knobs are only reported when the preference allows CoreML.
        let coreml_enabled = runtime_preference != "cpu";
        EmbeddingRuntimeInfo {
            runtime_preference,
            backend: "not_loaded".to_string(),
            threads,
            max_length: configured_embedding_max_length(),
            coreml_model_format: coreml_enabled.then(configured_coreml_model_format_name),
            coreml_compute_units: coreml_enabled.then(configured_coreml_compute_units_name),
            coreml_static_input_shapes: coreml_enabled.then(configured_coreml_static_input_shapes),
            coreml_profile_compute_plan: coreml_enabled
                .then(configured_coreml_profile_compute_plan),
            coreml_specialization_strategy: coreml_enabled
                .then(configured_coreml_specialization_strategy_name),
            coreml_model_cache_dir: coreml_enabled
                .then(|| configured_coreml_model_cache_dir().display().to_string()),
            fallback_reason: None,
        }
    }

    #[cfg(not(target_os = "macos"))]
    {
        // Off macOS there is no CoreML; all CoreML fields stay absent.
        EmbeddingRuntimeInfo {
            runtime_preference,
            backend: "not_loaded".to_string(),
            threads,
            max_length: configured_embedding_max_length(),
            coreml_model_format: None,
            coreml_compute_units: None,
            coreml_static_input_shapes: None,
            coreml_profile_compute_plan: None,
            coreml_specialization_strategy: None,
            coreml_model_cache_dir: None,
            fallback_reason: None,
        }
    }
}
553
/// Build the CoreML execution provider from the `CODELENS_EMBED_COREML_*`
/// env knobs (compute units, model format, specialization strategy,
/// static input shapes, compute-plan profiling, cache dir).
///
/// `error_on_failure()` surfaces CoreML registration problems as a session
/// error so the caller can fall back to CPU and record the reason.
#[cfg(target_os = "macos")]
fn build_coreml_execution_provider() -> ExecutionProviderDispatch {
    use ort::ep::{
        CoreML,
        coreml::{ComputeUnits, ModelFormat, SpecializationStrategy},
    };

    let compute_units = match configured_coreml_compute_units_name().as_str() {
        "all" => ComputeUnits::All,
        "cpu_only" => ComputeUnits::CPUOnly,
        "cpu_and_gpu" => ComputeUnits::CPUAndGPU,
        _ => ComputeUnits::CPUAndNeuralEngine,
    };
    let model_format = match configured_coreml_model_format_name().as_str() {
        "neural_network" => ModelFormat::NeuralNetwork,
        _ => ModelFormat::MLProgram,
    };
    let specialization = match configured_coreml_specialization_strategy_name().as_str() {
        "default" => SpecializationStrategy::Default,
        _ => SpecializationStrategy::FastPrediction,
    };
    // Best-effort: failure to create the cache dir is ignored here.
    let cache_dir = configured_coreml_model_cache_dir();
    let _ = std::fs::create_dir_all(&cache_dir);

    CoreML::default()
        .with_model_format(model_format)
        .with_compute_units(compute_units)
        .with_static_input_shapes(configured_coreml_static_input_shapes())
        .with_specialization_strategy(specialization)
        .with_profile_compute_plan(configured_coreml_profile_compute_plan())
        .with_model_cache_dir(cache_dir.display().to_string())
        .build()
        .error_on_failure()
}
588
589fn cpu_runtime_info(
590    runtime_preference: String,
591    fallback_reason: Option<String>,
592) -> EmbeddingRuntimeInfo {
593    EmbeddingRuntimeInfo {
594        runtime_preference,
595        backend: "cpu".to_string(),
596        threads: configured_embedding_threads(),
597        max_length: configured_embedding_max_length(),
598        coreml_model_format: None,
599        coreml_compute_units: None,
600        coreml_static_input_shapes: None,
601        coreml_profile_compute_plan: None,
602        coreml_specialization_strategy: None,
603        coreml_model_cache_dir: None,
604        fallback_reason,
605    }
606}
607
/// Runtime-info snapshot for a CoreML-preferred session. A recorded
/// `fallback_reason` means the CoreML load failed and inference actually
/// runs on CPU.
#[cfg(target_os = "macos")]
fn coreml_runtime_info(
    runtime_preference: String,
    fallback_reason: Option<String>,
) -> EmbeddingRuntimeInfo {
    let backend = match fallback_reason.as_ref() {
        Some(_) => "cpu",
        None => "coreml",
    };
    EmbeddingRuntimeInfo {
        runtime_preference,
        backend: backend.to_string(),
        threads: configured_embedding_threads(),
        max_length: configured_embedding_max_length(),
        coreml_model_format: Some(configured_coreml_model_format_name()),
        coreml_compute_units: Some(configured_coreml_compute_units_name()),
        coreml_static_input_shapes: Some(configured_coreml_static_input_shapes()),
        coreml_profile_compute_plan: Some(configured_coreml_profile_compute_plan()),
        coreml_specialization_strategy: Some(configured_coreml_specialization_strategy_name()),
        coreml_model_cache_dir: Some(configured_coreml_model_cache_dir().display().to_string()),
        fallback_reason,
    }
}
631
/// Load a fastembed built-in model by ID (auto-downloads from HuggingFace).
/// Used for A/B model comparison via the `CODELENS_EMBED_MODEL` env var;
/// requires the `model-bakeoff` feature (enables fastembed's hf-hub support).
///
/// Returns `(model, dimension, model_name, runtime_info)`; the runtime
/// info always reports a plain CPU backend.
#[cfg(feature = "model-bakeoff")]
fn load_fastembed_builtin(
    model_id: &str,
) -> Result<(TextEmbedding, usize, String, EmbeddingRuntimeInfo)> {
    use fastembed::EmbeddingModel;

    // Match known fastembed model IDs to their enum variants (accepting
    // both the short name and the full HuggingFace repo path).
    let (model_enum, expected_dim) = match model_id {
        "all-MiniLM-L6-v2" | "sentence-transformers/all-MiniLM-L6-v2" => {
            (EmbeddingModel::AllMiniLML6V2, 384)
        }
        "all-MiniLM-L12-v2" | "sentence-transformers/all-MiniLM-L12-v2" => {
            (EmbeddingModel::AllMiniLML12V2, 384)
        }
        "bge-small-en-v1.5" | "BAAI/bge-small-en-v1.5" => (EmbeddingModel::BGESmallENV15, 384),
        "bge-base-en-v1.5" | "BAAI/bge-base-en-v1.5" => (EmbeddingModel::BGEBaseENV15, 768),
        "nomic-embed-text-v1.5" | "nomic-ai/nomic-embed-text-v1.5" => {
            (EmbeddingModel::NomicEmbedTextV15, 768)
        }
        other => {
            anyhow::bail!(
                "Unknown fastembed model: {other}. \
                 Supported: all-MiniLM-L6-v2, all-MiniLM-L12-v2, bge-small-en-v1.5, \
                 bge-base-en-v1.5, nomic-embed-text-v1.5"
            );
        }
    };

    // Downloaded weights are cached under the system temp dir.
    let init = fastembed::InitOptionsWithLength::new(model_enum)
        .with_max_length(configured_embedding_max_length())
        .with_cache_dir(std::env::temp_dir().join("codelens-fastembed-cache"))
        .with_show_download_progress(true);
    let model =
        TextEmbedding::try_new(init).with_context(|| format!("failed to load {model_id}"))?;

    let runtime_info = cpu_runtime_info("cpu".to_string(), None);

    tracing::info!(
        model = model_id,
        dimension = expected_dim,
        "loaded fastembed built-in model for A/B comparison"
    );

    Ok((model, expected_dim, model_id.to_string(), runtime_info))
}
681
/// Load the CodeSearchNet model from sidecar files (MiniLM-L12 fine-tuned, ONNX INT8).
///
/// Flow: configure the ORT runtime once, honor a `CODELENS_EMBED_MODEL`
/// override (bake-off builds only), read the five sidecar files, then on
/// macOS try a CoreML session (unless the preference is "cpu") before
/// falling back to a plain CPU session.
///
/// Returns `(model, dimension, model_name, runtime_info)`.
fn load_codesearch_model() -> Result<(TextEmbedding, usize, String, EmbeddingRuntimeInfo)> {
    configure_embedding_runtime();

    // Alternative model overrides are only valid when the bakeoff feature is enabled.
    #[allow(unused_variables)]
    if let Some(model_id) = requested_embedding_model_override()? {
        #[cfg(feature = "model-bakeoff")]
        {
            return load_fastembed_builtin(&model_id);
        }

        #[cfg(not(feature = "model-bakeoff"))]
        {
            // Without the feature, requested_embedding_model_override()
            // already bailed with an error for any real override.
            let _ = model_id;
            unreachable!("alternative embedding model override should have errored");
        }
    }

    let model_dir = resolve_model_dir()?;

    // All five sidecar files are required; each read carries a
    // file-specific error context.
    let onnx_bytes =
        std::fs::read(model_dir.join("model.onnx")).context("failed to read model.onnx")?;
    let tokenizer_bytes =
        std::fs::read(model_dir.join("tokenizer.json")).context("failed to read tokenizer.json")?;
    let config_bytes =
        std::fs::read(model_dir.join("config.json")).context("failed to read config.json")?;
    let special_tokens_bytes = std::fs::read(model_dir.join("special_tokens_map.json"))
        .context("failed to read special_tokens_map.json")?;
    let tokenizer_config_bytes = std::fs::read(model_dir.join("tokenizer_config.json"))
        .context("failed to read tokenizer_config.json")?;

    let user_model = UserDefinedEmbeddingModel::new(
        onnx_bytes,
        TokenizerFiles {
            tokenizer_file: tokenizer_bytes,
            config_file: config_bytes,
            special_tokens_map_file: special_tokens_bytes,
            tokenizer_config_file: tokenizer_config_bytes,
        },
    );

    let runtime_preference = configured_embedding_runtime_preference();

    // macOS: attempt CoreML first; on failure fall back to CPU below and
    // record the failure reason in the returned runtime info.
    #[cfg(target_os = "macos")]
    if runtime_preference != "cpu" {
        let init_opts = InitOptionsUserDefined::new()
            .with_max_length(configured_embedding_max_length())
            .with_execution_providers(vec![build_coreml_execution_provider()]);
        // `user_model` is cloned so the CPU fallback path can reuse it.
        match TextEmbedding::try_new_from_user_defined(user_model.clone(), init_opts) {
            Ok(model) => {
                let runtime_info = coreml_runtime_info(runtime_preference.clone(), None);
                debug!(
                    threads = runtime_info.threads,
                    runtime_preference = %runtime_info.runtime_preference,
                    backend = %runtime_info.backend,
                    coreml_compute_units = ?runtime_info.coreml_compute_units,
                    coreml_static_input_shapes = ?runtime_info.coreml_static_input_shapes,
                    coreml_profile_compute_plan = ?runtime_info.coreml_profile_compute_plan,
                    coreml_specialization_strategy = ?runtime_info.coreml_specialization_strategy,
                    coreml_model_cache_dir = ?runtime_info.coreml_model_cache_dir,
                    "loaded CodeSearchNet embedding model"
                );
                return Ok((
                    model,
                    CODESEARCH_DIMENSION,
                    CODESEARCH_MODEL_NAME.to_string(),
                    runtime_info,
                ));
            }
            Err(err) => {
                let reason = err.to_string();
                debug!(
                    runtime_preference = %runtime_preference,
                    fallback_reason = %reason,
                    "CoreML embedding load failed; falling back to CPU"
                );
                let model = TextEmbedding::try_new_from_user_defined(
                    user_model,
                    InitOptionsUserDefined::new()
                        .with_max_length(configured_embedding_max_length()),
                )
                .context("failed to load CodeSearchNet embedding model")?;
                let runtime_info = coreml_runtime_info(runtime_preference.clone(), Some(reason));
                debug!(
                    threads = runtime_info.threads,
                    runtime_preference = %runtime_info.runtime_preference,
                    backend = %runtime_info.backend,
                    coreml_compute_units = ?runtime_info.coreml_compute_units,
                    coreml_static_input_shapes = ?runtime_info.coreml_static_input_shapes,
                    coreml_profile_compute_plan = ?runtime_info.coreml_profile_compute_plan,
                    coreml_specialization_strategy = ?runtime_info.coreml_specialization_strategy,
                    coreml_model_cache_dir = ?runtime_info.coreml_model_cache_dir,
                    fallback_reason = ?runtime_info.fallback_reason,
                    "loaded CodeSearchNet embedding model"
                );
                return Ok((
                    model,
                    CODESEARCH_DIMENSION,
                    CODESEARCH_MODEL_NAME.to_string(),
                    runtime_info,
                ));
            }
        }
    }

    // CPU path: non-mac hosts, or an explicit "cpu" preference on macOS.
    let model = TextEmbedding::try_new_from_user_defined(
        user_model,
        InitOptionsUserDefined::new().with_max_length(configured_embedding_max_length()),
    )
    .context("failed to load CodeSearchNet embedding model")?;
    let runtime_info = cpu_runtime_info(runtime_preference.clone(), None);

    debug!(
        threads = runtime_info.threads,
        runtime_preference = %runtime_info.runtime_preference,
        backend = %runtime_info.backend,
        "loaded CodeSearchNet embedding model"
    );

    Ok((
        model,
        CODESEARCH_DIMENSION,
        CODESEARCH_MODEL_NAME.to_string(),
        runtime_info,
    ))
}
809
810pub fn configured_embedding_model_name() -> String {
811    std::env::var("CODELENS_EMBED_MODEL").unwrap_or_else(|_| CODESEARCH_MODEL_NAME.to_string())
812}
813
/// Blend factor between bi-encoder score and text overlap when reranking.
///
/// Reads `CODELENS_RERANK_BLEND` and accepts only values in `[0.0, 1.0]`;
/// anything else falls back to the default.
fn configured_rerank_blend() -> f64 {
    // default: 75% bi-encoder, 25% text overlap (sweep: self +0.006 MRR, role neutral)
    const DEFAULT_BLEND: f64 = 0.75;
    std::env::var("CODELENS_RERANK_BLEND")
        .ok()
        .and_then(|raw| raw.parse::<f64>().ok())
        .filter(|blend| (0.0..=1.0).contains(blend))
        .unwrap_or(DEFAULT_BLEND)
}
827
/// Whether the sidecar CodeSearchNet model files can be located on this
/// machine (see `resolve_model_dir` for the search order).
pub fn embedding_model_assets_available() -> bool {
    resolve_model_dir().is_ok()
}
831
832impl EmbeddingEngine {
    /// Embed every entry of `texts`, serving repeats from the LRU cache.
    ///
    /// Duplicate strings within one call are embedded only once;
    /// `missing_positions` maps each missing text to all of its positions
    /// so the single embedding is fanned back out. The cache lock is held
    /// only around lookups/inserts, never across model inference.
    ///
    /// # Errors
    /// Fails when a mutex is poisoned or the model's embed call fails.
    fn embed_texts_cached(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
        if texts.is_empty() {
            return Ok(Vec::new());
        }

        // One slot per input; filled from the cache first, then the model.
        let mut resolved: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
        // Unique missing texts in first-seen order (keeps embed output aligned).
        let mut missing_order: Vec<String> = Vec::new();
        // text -> every index in `texts` that needs this embedding.
        let mut missing_positions: HashMap<String, Vec<usize>> = HashMap::new();

        // Scope the cache lock so it is released before inference.
        {
            let mut cache = self
                .text_embed_cache
                .lock()
                .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
            for (index, text) in texts.iter().enumerate() {
                if let Some(cached) = cache.get(text) {
                    resolved[index] = Some(cached);
                } else {
                    let key = (*text).to_owned();
                    if !missing_positions.contains_key(&key) {
                        missing_order.push(key.clone());
                    }
                    missing_positions.entry(key).or_default().push(index);
                }
            }
        }

        if !missing_order.is_empty() {
            let missing_refs: Vec<&str> = missing_order.iter().map(String::as_str).collect();
            // The model guard is a temporary, dropped as soon as `embed` returns.
            let embeddings = self
                .model
                .lock()
                .map_err(|_| anyhow::anyhow!("model lock"))?
                .embed(missing_refs, None)
                .context("text embedding failed")?;

            // Re-acquire the cache lock to store fresh embeddings and fan
            // them out to every position that asked for them.
            let mut cache = self
                .text_embed_cache
                .lock()
                .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
            for (text, embedding) in missing_order.into_iter().zip(embeddings.into_iter()) {
                cache.insert(text.clone(), embedding.clone());
                if let Some(indices) = missing_positions.remove(&text) {
                    for index in indices {
                        resolved[index] = Some(embedding.clone());
                    }
                }
            }
        }

        // Any still-unfilled slot means the model returned fewer vectors
        // than requested; surface that as an error instead of panicking.
        resolved
            .into_iter()
            .map(|item| item.ok_or_else(|| anyhow::anyhow!("missing embedding cache entry")))
            .collect()
    }
888
889    pub fn new(project: &ProjectRoot) -> Result<Self> {
890        let (model, dimension, model_name, runtime_info) = load_codesearch_model()?;
891
892        let db_dir = project.as_path().join(".codelens/index");
893        std::fs::create_dir_all(&db_dir)?;
894        let db_path = db_dir.join("embeddings.db");
895
896        let store = SqliteVecStore::new(&db_path, dimension, &model_name)?;
897
898        Ok(Self {
899            model: Mutex::new(model),
900            store,
901            model_name,
902            runtime_info,
903            text_embed_cache: Mutex::new(TextEmbeddingCache::new(
904                configured_embedding_text_cache_size(),
905            )),
906            indexing: std::sync::atomic::AtomicBool::new(false),
907        })
908    }
909
    /// Name of the embedding model backing this engine.
    pub fn model_name(&self) -> &str {
        &self.model_name
    }

    /// Runtime details returned by the model loader at construction time.
    pub fn runtime_info(&self) -> &EmbeddingRuntimeInfo {
        &self.runtime_info
    }
917
    /// Returns true if a full reindex is currently in progress.
    pub fn is_indexing(&self) -> bool {
        self.indexing.load(std::sync::atomic::Ordering::Relaxed)
    }

    /// Index all symbols from the project's symbol database into the embedding index.
    ///
    /// Reconciles the embedding store file-by-file so unchanged symbols can
    /// reuse their existing vectors and only changed/new symbols are re-embedded.
    /// Caps at a configurable max to prevent runaway on huge projects.
    /// Returns the number of embedding chunks indexed (as reported by the store).
    pub fn index_from_project(&self, project: &ProjectRoot) -> Result<usize> {
        // Guard against concurrent full reindex (14s+ operation)
        if self
            .indexing
            .compare_exchange(
                false,
                true,
                std::sync::atomic::Ordering::AcqRel,
                std::sync::atomic::Ordering::Relaxed,
            )
            .is_err()
        {
            anyhow::bail!(
                "Embedding indexing already in progress — wait for the current run to complete before retrying."
            );
        }
        // RAII guard to reset the flag on any exit path
        struct IndexGuard<'a>(&'a std::sync::atomic::AtomicBool);
        impl Drop for IndexGuard<'_> {
            fn drop(&mut self) {
                self.0.store(false, std::sync::atomic::Ordering::Release);
            }
        }
        let _guard = IndexGuard(&self.indexing);

        let db_path = crate::db::index_db_path(project.as_path());
        let symbol_db = IndexDb::open(&db_path)?;
        let batch_size = embed_batch_size();
        let max_symbols = max_embed_symbols();
        let mut total_indexed = 0usize;
        let mut total_seen = 0usize;
        // Model lock is taken lazily inside reconcile_file_embeddings and then
        // held here across files so it is only acquired once per run.
        let mut model = None;
        // Snapshot of stored vectors: file -> (reusable key -> stored chunk).
        let mut existing_embeddings: HashMap<
            String,
            HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
        > = HashMap::new();
        let mut current_db_files = HashSet::new();
        let mut capped = false;

        self.store
            .for_each_file_embeddings(&mut |file_path, chunks| {
                existing_embeddings.insert(
                    file_path,
                    chunks
                        .into_iter()
                        .map(|chunk| (reusable_embedding_key_for_chunk(&chunk), chunk))
                        .collect(),
                );
                Ok(())
            })?;

        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
            // Record every file present in the symbol DB — even once capped —
            // so the stale-file sweep below does not delete valid embeddings.
            current_db_files.insert(file_path.clone());
            if capped {
                return Ok(());
            }

            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
            let relevant_symbols: Vec<_> = symbols
                .into_iter()
                .filter(|sym| !is_test_only_symbol(sym, source.as_deref()))
                .collect();

            // A file with only test symbols keeps no embeddings.
            if relevant_symbols.is_empty() {
                self.store.delete_by_file(&[file_path.as_str()])?;
                existing_embeddings.remove(&file_path);
                return Ok(());
            }

            // Stop embedding once the configured symbol budget would be exceeded.
            if total_seen + relevant_symbols.len() > max_symbols {
                capped = true;
                return Ok(());
            }
            total_seen += relevant_symbols.len();

            let existing_for_file = existing_embeddings.remove(&file_path).unwrap_or_default();
            total_indexed += self.reconcile_file_embeddings(
                &file_path,
                relevant_symbols,
                source.as_deref(),
                existing_for_file,
                batch_size,
                &mut model,
            )?;
            Ok(())
        })?;

        // Files that still have embeddings but vanished from the symbol DB.
        let removed_files: Vec<String> = existing_embeddings
            .into_keys()
            .filter(|file_path| !current_db_files.contains(file_path))
            .collect();
        if !removed_files.is_empty() {
            let removed_refs: Vec<&str> = removed_files.iter().map(String::as_str).collect();
            self.store.delete_by_file(&removed_refs)?;
        }

        Ok(total_indexed)
    }
1026
1027    /// Extract NL→code bridge candidates from indexed symbols.
1028    /// For each symbol with a docstring, produces a (docstring_first_line, symbol_name) pair.
1029    /// The caller writes these to `.codelens/bridges.json` for project-specific NL bridging.
1030    pub fn generate_bridge_candidates(
1031        &self,
1032        project: &ProjectRoot,
1033    ) -> Result<Vec<(String, String)>> {
1034        let db_path = crate::db::index_db_path(project.as_path());
1035        let symbol_db = IndexDb::open(&db_path)?;
1036        let mut bridges: Vec<(String, String)> = Vec::new();
1037        let mut seen_nl = HashSet::new();
1038
1039        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
1040            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
1041            for sym in &symbols {
1042                if is_test_only_symbol(sym, source.as_deref()) {
1043                    continue;
1044                }
1045                let doc = source.as_deref().and_then(|src| {
1046                    extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize)
1047                });
1048                let doc = match doc {
1049                    Some(d) if !d.is_empty() => d,
1050                    _ => continue,
1051                };
1052
1053                // Build code term: symbol_name + split words
1054                let split = split_identifier(&sym.name);
1055                let code_term = if split != sym.name {
1056                    format!("{} {}", sym.name, split)
1057                } else {
1058                    sym.name.clone()
1059                };
1060
1061                // Extract short NL phrases (3-6 words) from the docstring.
1062                // This produces multiple bridge entries per symbol, each matching
1063                // common NL query patterns like "render template" or "parse url".
1064                let first_line = doc.lines().next().unwrap_or("").trim().to_lowercase();
1065                // Remove trailing period/punctuation
1066                let clean = first_line.trim_end_matches(|c: char| c.is_ascii_punctuation());
1067                let words: Vec<&str> = clean.split_whitespace().collect();
1068                if words.len() < 2 {
1069                    continue;
1070                }
1071
1072                // Generate short N-gram keys (2-4 words from the start)
1073                for window in 2..=words.len().min(4) {
1074                    let key = words[..window].join(" ");
1075                    if key.len() < 5 || key.len() > 60 {
1076                        continue;
1077                    }
1078                    if seen_nl.insert(key.clone()) {
1079                        bridges.push((key, code_term.clone()));
1080                    }
1081                }
1082
1083                // Also add split_identifier words as a bridge key
1084                // so "render template" → render_template
1085                if split != sym.name && !seen_nl.contains(&split.to_lowercase()) {
1086                    let lowered = split.to_lowercase();
1087                    if lowered.split_whitespace().count() >= 2 && seen_nl.insert(lowered.clone()) {
1088                        bridges.push((lowered, code_term.clone()));
1089                    }
1090                }
1091            }
1092            Ok(())
1093        })?;
1094
1095        Ok(bridges)
1096    }
1097
    /// Rebuild the embedding rows for a single file, reusing stored vectors for
    /// symbols whose reusable key (symbol identity + embedding text) is unchanged.
    ///
    /// Symbols without a reusable vector are accumulated and embedded in batches
    /// of `batch_size`. The shared model lock is acquired lazily through `model`
    /// and retained by the caller across files, so fully-reused files never
    /// touch the model. The file's old rows are replaced wholesale; returns the
    /// number of chunks inserted.
    fn reconcile_file_embeddings<'a>(
        &'a self,
        file_path: &str,
        symbols: Vec<crate::db::SymbolWithFile>,
        source: Option<&str>,
        mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
        batch_size: usize,
        model: &mut Option<std::sync::MutexGuard<'a, TextEmbedding>>,
    ) -> Result<usize> {
        let mut reconciled_chunks = Vec::with_capacity(symbols.len());
        let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
        let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);

        for sym in symbols {
            let text = build_embedding_text(&sym, source);
            // Reuse path: identical identity + embedding text means the stored
            // vector is still valid — carry it over untouched.
            if let Some(existing) =
                existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
            {
                reconciled_chunks.push(EmbeddingChunk {
                    file_path: sym.file_path.clone(),
                    symbol_name: sym.name.clone(),
                    kind: sym.kind.clone(),
                    line: sym.line as usize,
                    signature: sym.signature.clone(),
                    name_path: sym.name_path.clone(),
                    text,
                    embedding: existing.embedding,
                    doc_embedding: existing.doc_embedding,
                });
                continue;
            }

            batch_texts.push(text);
            batch_meta.push(sym);

            if batch_texts.len() >= batch_size {
                // Take the model lock only once embedding is unavoidable.
                if model.is_none() {
                    *model = Some(
                        self.model
                            .lock()
                            .map_err(|_| anyhow::anyhow!("model lock"))?,
                    );
                }
                reconciled_chunks.extend(Self::embed_chunks(
                    model.as_mut().expect("model lock initialized"),
                    &batch_texts,
                    &batch_meta,
                )?);
                batch_texts.clear();
                batch_meta.clear();
            }
        }

        // Flush the trailing partial batch, if any.
        if !batch_texts.is_empty() {
            if model.is_none() {
                *model = Some(
                    self.model
                        .lock()
                        .map_err(|_| anyhow::anyhow!("model lock"))?,
                );
            }
            reconciled_chunks.extend(Self::embed_chunks(
                model.as_mut().expect("model lock initialized"),
                &batch_texts,
                &batch_meta,
            )?);
        }

        // Replace the file's rows wholesale: delete old, insert reconciled.
        self.store.delete_by_file(&[file_path])?;
        if reconciled_chunks.is_empty() {
            return Ok(0);
        }
        self.store.insert(&reconciled_chunks)
    }
1172
1173    fn embed_chunks(
1174        model: &mut TextEmbedding,
1175        texts: &[String],
1176        meta: &[crate::db::SymbolWithFile],
1177    ) -> Result<Vec<EmbeddingChunk>> {
1178        let batch_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
1179        let embeddings = model.embed(batch_refs, None).context("embedding failed")?;
1180
1181        Ok(meta
1182            .iter()
1183            .zip(embeddings)
1184            .zip(texts.iter())
1185            .map(|((sym, emb), text)| EmbeddingChunk {
1186                file_path: sym.file_path.clone(),
1187                symbol_name: sym.name.clone(),
1188                kind: sym.kind.clone(),
1189                line: sym.line as usize,
1190                signature: sym.signature.clone(),
1191                name_path: sym.name_path.clone(),
1192                text: text.clone(),
1193                embedding: emb,
1194                doc_embedding: None,
1195            })
1196            .collect())
1197    }
1198
1199    /// Embed one batch of texts and upsert immediately, then the caller drops the batch.
1200    fn flush_batch(
1201        model: &mut TextEmbedding,
1202        store: &SqliteVecStore,
1203        texts: &[String],
1204        meta: &[crate::db::SymbolWithFile],
1205    ) -> Result<usize> {
1206        let chunks = Self::embed_chunks(model, texts, meta)?;
1207        store.insert(&chunks)
1208    }
1209
1210    /// Search for symbols semantically similar to the query.
1211    pub fn search(&self, query: &str, max_results: usize) -> Result<Vec<SemanticMatch>> {
1212        let results = self.search_scored(query, max_results)?;
1213        Ok(results.into_iter().map(SemanticMatch::from).collect())
1214    }
1215
    /// Search returning raw ScoredChunks with optional reranking.
    ///
    /// Pipeline: bi-encoder → candidate pool (default 5× requested, see
    /// `CODELENS_RERANK_FACTOR` below) → rerank → top-N.
    /// Reranking uses query-document text overlap scoring to refine bi-encoder
    /// cosine similarity. This catches cases where embedding similarity is high
    /// but the actual text relevance is low (or vice versa).
    pub fn search_scored(&self, query: &str, max_results: usize) -> Result<Vec<ScoredChunk>> {
        let query_embedding = self.embed_texts_cached(&[query])?;

        if query_embedding.is_empty() {
            return Ok(Vec::new());
        }

        // Fetch N× candidates for reranking headroom (default 5×, override via
        // CODELENS_RERANK_FACTOR). More candidates = better rerank quality at
        // marginal latency cost (sqlite-vec scan is fast).
        let factor = std::env::var("CODELENS_RERANK_FACTOR")
            .ok()
            .and_then(|v| v.parse::<usize>().ok())
            .unwrap_or(5);
        let candidate_count = max_results.saturating_mul(factor).max(max_results);
        let mut candidates = self.store.search(&query_embedding[0], candidate_count)?;

        // Pool already fits the request — nothing to rerank away.
        if candidates.len() <= max_results {
            return Ok(candidates);
        }

        // Lightweight rerank: blend bi-encoder score with text overlap signal.
        // This is a stopgap until a proper cross-encoder is plugged in.
        let query_lower = query.to_lowercase();
        let query_tokens: Vec<&str> = query_lower
            .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
            .filter(|t| t.len() >= 2)
            .collect();

        // No usable tokens (e.g. single-char query) — skip the overlap rerank.
        if query_tokens.is_empty() {
            candidates.truncate(max_results);
            return Ok(candidates);
        }

        let blend = configured_rerank_blend();
        for chunk in &mut candidates {
            // Build searchable text: symbol_name + split identifier words +
            // name_path (parent context) + signature + file_path.
            // split_identifier turns "parseSymbols" into "parse Symbols" for
            // better NL token matching.
            let split_name = split_identifier(&chunk.symbol_name);
            let searchable = format!(
                "{} {} {} {} {}",
                chunk.symbol_name.to_lowercase(),
                split_name.to_lowercase(),
                chunk.name_path.to_lowercase(),
                chunk.signature.to_lowercase(),
                chunk.file_path.to_lowercase(),
            );
            // Fraction of query tokens that appear (as substrings) in the
            // candidate's searchable text.
            let overlap = query_tokens
                .iter()
                .filter(|t| searchable.contains(**t))
                .count() as f64;
            let overlap_ratio = overlap / query_tokens.len().max(1) as f64;
            // Blend: configurable bi-encoder + text overlap (default 75/25)
            chunk.score = chunk.score * blend + overlap_ratio * (1.0 - blend);
        }

        candidates.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        candidates.truncate(max_results);
        Ok(candidates)
    }
1288
    /// Incrementally re-index only the given files.
    ///
    /// Snapshots the files' current vectors first so symbols whose embedding
    /// text is unchanged are re-inserted without re-embedding; the remainder
    /// are embedded in batches. Returns the number of chunks written.
    pub fn index_changed_files(
        &self,
        project: &ProjectRoot,
        changed_files: &[&str],
    ) -> Result<usize> {
        if changed_files.is_empty() {
            return Ok(0);
        }
        let batch_size = embed_batch_size();
        // Snapshot existing vectors keyed by reusable identity, then clear the
        // files' rows — reusable ones are re-inserted below.
        let mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk> = HashMap::new();
        for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
            for chunk in self.store.embeddings_for_files(file_chunk)? {
                existing_embeddings.insert(reusable_embedding_key_for_chunk(&chunk), chunk);
            }
        }
        self.store.delete_by_file(changed_files)?;

        let db_path = crate::db::index_db_path(project.as_path());
        let symbol_db = IndexDb::open(&db_path)?;

        let mut total_indexed = 0usize;
        let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
        let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);
        let mut batch_reused: Vec<EmbeddingChunk> = Vec::with_capacity(batch_size);
        // Each file's source is read at most once across all its symbols.
        let mut file_cache: std::collections::HashMap<String, Option<String>> =
            std::collections::HashMap::new();
        // Model lock taken lazily: fully-reused updates never touch the model.
        let mut model = None;

        for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
            let relevant = symbol_db.symbols_for_files(file_chunk)?;
            for sym in relevant {
                let source = file_cache.entry(sym.file_path.clone()).or_insert_with(|| {
                    std::fs::read_to_string(project.as_path().join(&sym.file_path)).ok()
                });
                if is_test_only_symbol(&sym, source.as_deref()) {
                    continue;
                }
                let text = build_embedding_text(&sym, source.as_deref());
                // Reuse path: identical symbol identity + embedding text.
                if let Some(existing) =
                    existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
                {
                    batch_reused.push(EmbeddingChunk {
                        file_path: sym.file_path.clone(),
                        symbol_name: sym.name.clone(),
                        kind: sym.kind.clone(),
                        line: sym.line as usize,
                        signature: sym.signature.clone(),
                        name_path: sym.name_path.clone(),
                        text,
                        embedding: existing.embedding,
                        doc_embedding: existing.doc_embedding,
                    });
                    if batch_reused.len() >= batch_size {
                        total_indexed += self.store.insert(&batch_reused)?;
                        batch_reused.clear();
                    }
                    continue;
                }
                batch_texts.push(text);
                batch_meta.push(sym);

                if batch_texts.len() >= batch_size {
                    if model.is_none() {
                        model = Some(
                            self.model
                                .lock()
                                .map_err(|_| anyhow::anyhow!("model lock"))?,
                        );
                    }
                    total_indexed += Self::flush_batch(
                        model.as_mut().expect("model lock initialized"),
                        &self.store,
                        &batch_texts,
                        &batch_meta,
                    )?;
                    batch_texts.clear();
                    batch_meta.clear();
                }
            }
        }

        // Flush trailing partial batches: reused rows first, then fresh embeds.
        if !batch_reused.is_empty() {
            total_indexed += self.store.insert(&batch_reused)?;
        }

        if !batch_texts.is_empty() {
            if model.is_none() {
                model = Some(
                    self.model
                        .lock()
                        .map_err(|_| anyhow::anyhow!("model lock"))?,
                );
            }
            total_indexed += Self::flush_batch(
                model.as_mut().expect("model lock initialized"),
                &self.store,
                &batch_texts,
                &batch_meta,
            )?;
        }

        Ok(total_indexed)
    }
1393
1394    /// Whether the embedding index has been populated.
1395    pub fn is_indexed(&self) -> bool {
1396        self.store.count().unwrap_or(0) > 0
1397    }
1398
1399    pub fn index_info(&self) -> EmbeddingIndexInfo {
1400        EmbeddingIndexInfo {
1401            model_name: self.model_name.clone(),
1402            indexed_symbols: self.store.count().unwrap_or(0),
1403        }
1404    }
1405
    /// Read model name and symbol count from an existing embeddings DB without
    /// constructing a full engine (no model load). Returns `Ok(None)` when the
    /// DB file does not exist or has no recorded model name.
    pub fn inspect_existing_index(project: &ProjectRoot) -> Result<Option<EmbeddingIndexInfo>> {
        let db_path = project.as_path().join(".codelens/index/embeddings.db");
        if !db_path.exists() {
            return Ok(None);
        }

        // NOTE(review): the closure appears to be the fresh-open recipe the
        // recovery helper invokes (register vec extension, open, validate via
        // `PRAGMA schema_version`) — confirm semantics in crate::db.
        let conn =
            crate::db::open_derived_sqlite_with_recovery(&db_path, "embedding index", || {
                ffi::register_sqlite_vec()?;
                let conn = Connection::open(&db_path)?;
                conn.execute_batch("PRAGMA busy_timeout=5000;")?;
                conn.query_row("PRAGMA schema_version", [], |_row| Ok(()))?;
                Ok(conn)
            })?;

        // Missing meta row degrades to None (handled by the map below).
        let model_name: Option<String> = conn
            .query_row(
                "SELECT value FROM meta WHERE key = 'model' LIMIT 1",
                [],
                |row| row.get(0),
            )
            .ok();
        // A failed count degrades to 0 rather than failing the inspection.
        let indexed_symbols: usize = conn
            .query_row("SELECT COUNT(*) FROM symbols", [], |row| {
                row.get::<_, i64>(0)
            })
            .map(|count| count.max(0) as usize)
            .unwrap_or(0);

        Ok(model_name.map(|model_name| EmbeddingIndexInfo {
            model_name,
            indexed_symbols,
        }))
    }
1440
1441    // ── Embedding-powered analysis ─────────────────────────────────
1442
1443    /// Find code symbols most similar to the given symbol.
1444    pub fn find_similar_code(
1445        &self,
1446        file_path: &str,
1447        symbol_name: &str,
1448        max_results: usize,
1449    ) -> Result<Vec<SemanticMatch>> {
1450        let target = self
1451            .store
1452            .get_embedding(file_path, symbol_name)?
1453            .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?;
1454
1455        let oversample = max_results.saturating_add(8).max(1);
1456        let scored = self
1457            .store
1458            .search(&target.embedding, oversample)?
1459            .into_iter()
1460            .filter(|c| !(c.file_path == file_path && c.symbol_name == symbol_name))
1461            .take(max_results)
1462            .map(SemanticMatch::from)
1463            .collect();
1464        Ok(scored)
1465    }
1466
    /// Find near-duplicate code pairs across the codebase.
    /// Returns pairs with cosine similarity above the threshold (default 0.85).
    ///
    /// Scans stored embeddings in batches; each chunk probes the vector store
    /// for nearest neighbours, then exact similarity is recomputed from the
    /// full vectors. Collection stops once `max_pairs` pairs are gathered.
    pub fn find_duplicates(&self, threshold: f64, max_pairs: usize) -> Result<Vec<DuplicatePair>> {
        let mut pairs = Vec::new();
        // Unordered (a, b) keys already examined, so A~B and B~A emit once.
        let mut seen_pairs = HashSet::new();
        // Full candidate vectors fetched so far, keyed by chunk identity.
        let mut embedding_cache: HashMap<StoredChunkKey, Arc<EmbeddingChunk>> = HashMap::new();
        let candidate_limit = duplicate_candidate_limit(max_pairs);
        let mut done = false;

        self.store
            .for_each_embedding_batch(DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, &mut |batch| {
                if done {
                    return Ok(());
                }

                let mut candidate_lists = Vec::with_capacity(batch.len());
                let mut missing_candidates = Vec::new();
                let mut missing_keys = HashSet::new();

                // Pass 1: gather each chunk's neighbour list and note which
                // candidate vectors are not cached yet.
                for chunk in &batch {
                    if pairs.len() >= max_pairs {
                        done = true;
                        break;
                    }

                    // Drop the probe chunk itself from its own result list.
                    let filtered: Vec<ScoredChunk> = self
                        .store
                        .search(&chunk.embedding, candidate_limit)?
                        .into_iter()
                        .filter(|candidate| {
                            !(chunk.file_path == candidate.file_path
                                && chunk.symbol_name == candidate.symbol_name
                                && chunk.line == candidate.line
                                && chunk.signature == candidate.signature
                                && chunk.name_path == candidate.name_path)
                        })
                        .collect();

                    for candidate in &filtered {
                        let cache_key = stored_chunk_key_for_score(candidate);
                        if !embedding_cache.contains_key(&cache_key)
                            && missing_keys.insert(cache_key)
                        {
                            missing_candidates.push(candidate.clone());
                        }
                    }

                    candidate_lists.push(filtered);
                }

                // Fetch all missing candidate vectors in one store round-trip.
                if !missing_candidates.is_empty() {
                    for candidate_chunk in self
                        .store
                        .embeddings_for_scored_chunks(&missing_candidates)?
                    {
                        embedding_cache
                            .entry(stored_chunk_key(&candidate_chunk))
                            .or_insert_with(|| Arc::new(candidate_chunk));
                    }
                }

                // Pass 2: score each (chunk, candidate) pair with exact cosine
                // similarity over the full vectors.
                for (chunk, candidates) in batch.iter().zip(candidate_lists.iter()) {
                    if pairs.len() >= max_pairs {
                        done = true;
                        break;
                    }

                    for candidate in candidates {
                        let pair_key = duplicate_pair_key(
                            &chunk.file_path,
                            &chunk.symbol_name,
                            &candidate.file_path,
                            &candidate.symbol_name,
                        );
                        if !seen_pairs.insert(pair_key) {
                            continue;
                        }

                        // Candidate vector may be absent if its row vanished
                        // between search and bulk fetch; skip quietly.
                        let Some(candidate_chunk) =
                            embedding_cache.get(&stored_chunk_key_for_score(candidate))
                        else {
                            continue;
                        };

                        let sim = cosine_similarity(&chunk.embedding, &candidate_chunk.embedding);
                        if sim < threshold {
                            continue;
                        }

                        pairs.push(DuplicatePair {
                            symbol_a: format!("{}:{}", chunk.file_path, chunk.symbol_name),
                            symbol_b: format!(
                                "{}:{}",
                                candidate_chunk.file_path, candidate_chunk.symbol_name
                            ),
                            file_a: chunk.file_path.clone(),
                            file_b: candidate_chunk.file_path.clone(),
                            line_a: chunk.line,
                            line_b: candidate_chunk.line,
                            similarity: sim,
                        });
                        if pairs.len() >= max_pairs {
                            done = true;
                            break;
                        }
                    }
                }
                Ok(())
            })?;

        // Most-similar first for presentation.
        pairs.sort_by(|a, b| {
            b.similarity
                .partial_cmp(&a.similarity)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        Ok(pairs)
    }
1584}
1585
/// Candidate pool size per duplicate probe: 4× the requested pair budget,
/// bounded to the range [32, 128].
fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    let scaled = max_pairs.saturating_mul(4);
    scaled.max(32).min(128)
}
1589
/// Order-independent key for a duplicate pair: the lexicographically smaller
/// (file, symbol) endpoint always comes first, so (A, B) and (B, A) collide.
fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let first = (file_a.to_string(), symbol_a.to_string());
    let second = (file_b.to_string(), symbol_b.to_string());
    if second < first {
        (second, first)
    } else {
        (first, second)
    }
}
1604
1605type StoredChunkKey = (String, String, usize, String, String);
1606
1607fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
1608    (
1609        chunk.file_path.clone(),
1610        chunk.symbol_name.clone(),
1611        chunk.line,
1612        chunk.signature.clone(),
1613        chunk.name_path.clone(),
1614    )
1615}
1616
1617fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
1618    (
1619        chunk.file_path.clone(),
1620        chunk.symbol_name.clone(),
1621        chunk.line,
1622        chunk.signature.clone(),
1623        chunk.name_path.clone(),
1624    )
1625}
1626
1627impl EmbeddingEngine {
1628    /// Classify a code symbol into one of the given categories using zero-shot embedding similarity.
1629    pub fn classify_symbol(
1630        &self,
1631        file_path: &str,
1632        symbol_name: &str,
1633        categories: &[&str],
1634    ) -> Result<Vec<CategoryScore>> {
1635        let target = match self.store.get_embedding(file_path, symbol_name)? {
1636            Some(target) => target,
1637            None => self
1638                .store
1639                .all_with_embeddings()?
1640                .into_iter()
1641                .find(|c| c.file_path == file_path && c.symbol_name == symbol_name)
1642                .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?,
1643        };
1644
1645        let embeddings = self.embed_texts_cached(categories)?;
1646
1647        let mut scores: Vec<CategoryScore> = categories
1648            .iter()
1649            .zip(embeddings.iter())
1650            .map(|(cat, emb)| CategoryScore {
1651                category: cat.to_string(),
1652                score: cosine_similarity(&target.embedding, emb),
1653            })
1654            .collect();
1655
1656        scores.sort_by(|a, b| {
1657            b.score
1658                .partial_cmp(&a.score)
1659                .unwrap_or(std::cmp::Ordering::Equal)
1660        });
1661        Ok(scores)
1662    }
1663
1664    /// Find symbols that are outliers — semantically distant from their file's other symbols.
1665    pub fn find_misplaced_code(&self, max_results: usize) -> Result<Vec<OutlierSymbol>> {
1666        let mut outliers = Vec::new();
1667
1668        self.store
1669            .for_each_file_embeddings(&mut |file_path, chunks| {
1670                if chunks.len() < 2 {
1671                    return Ok(());
1672                }
1673
1674                for (idx, chunk) in chunks.iter().enumerate() {
1675                    let mut sim_sum = 0.0;
1676                    let mut count = 0;
1677                    for (other_idx, other_chunk) in chunks.iter().enumerate() {
1678                        if other_idx == idx {
1679                            continue;
1680                        }
1681                        sim_sum += cosine_similarity(&chunk.embedding, &other_chunk.embedding);
1682                        count += 1;
1683                    }
1684                    if count > 0 {
1685                        let avg_sim = sim_sum / count as f64; // Lower means more misplaced.
1686                        outliers.push(OutlierSymbol {
1687                            file_path: file_path.clone(),
1688                            symbol_name: chunk.symbol_name.clone(),
1689                            kind: chunk.kind.clone(),
1690                            line: chunk.line,
1691                            avg_similarity_to_file: avg_sim,
1692                        });
1693                    }
1694                }
1695                Ok(())
1696            })?;
1697
1698        outliers.sort_by(|a, b| {
1699            a.avg_similarity_to_file
1700                .partial_cmp(&b.avg_similarity_to_file)
1701                .unwrap_or(std::cmp::Ordering::Equal)
1702        });
1703        outliers.truncate(max_results);
1704        Ok(outliers)
1705    }
1706}
1707
1708// ── Analysis result types ────────────────────────────────────────────
1709
/// One near-duplicate symbol pair produced by the duplicate scan.
/// Pairs are returned sorted by descending `similarity`.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicatePair {
    // NOTE(review): symbol_b is built as "file_path:symbol_name" at the
    // construction site; symbol_a appears to follow the same format — confirm.
    /// Identifier of the first symbol.
    pub symbol_a: String,
    /// Identifier of the second symbol ("file_path:symbol_name").
    pub symbol_b: String,
    /// File containing the first symbol.
    pub file_a: String,
    /// File containing the second symbol.
    pub file_b: String,
    /// Line of the first symbol.
    pub line_a: usize,
    /// Line of the second symbol.
    pub line_b: usize,
    /// Similarity score between the two symbols' embeddings (higher = closer).
    pub similarity: f64,
}
1720
/// One category's result from zero-shot symbol classification
/// (`EmbeddingEngine::classify_symbol`); results are sorted highest-first.
#[derive(Debug, Clone, Serialize)]
pub struct CategoryScore {
    /// The category label exactly as supplied by the caller.
    pub category: String,
    /// Cosine similarity between the symbol's embedding and this category's
    /// text embedding.
    pub score: f64,
}
1726
/// A symbol flagged by `EmbeddingEngine::find_misplaced_code` as semantically
/// distant from the other symbols in its file.
#[derive(Debug, Clone, Serialize)]
pub struct OutlierSymbol {
    /// File the symbol lives in.
    pub file_path: String,
    /// Name of the symbol.
    pub symbol_name: String,
    /// Symbol kind as recorded in the index (e.g. "function", "class").
    pub kind: String,
    /// Line where the symbol starts.
    pub line: usize,
    /// Average cosine similarity to every other symbol in the same file;
    /// lower means more of an outlier.
    pub avg_similarity_to_file: f64,
}
1735
/// Cosine similarity between two f32 embedding vectors.
///
/// Accumulates the dot product and both squared norms in f32 in a single
/// pass (a shape LLVM can auto-vectorize), then promotes to f64 only for the
/// square roots and final division. Returns 0.0 when either vector has zero
/// norm, avoiding a division by zero.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());

    let (dot, sq_a, sq_b) = a
        .iter()
        .zip(b.iter())
        .fold((0.0f32, 0.0f32, 0.0f32), |(dot, sq_a, sq_b), (x, y)| {
            (dot + x * y, sq_a + x * x, sq_b + y * y)
        });

    let norm_a = f64::from(sq_a).sqrt();
    let norm_b = f64::from(sq_b).sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        f64::from(dot) / (norm_a * norm_b)
    }
}
1760
/// Split CamelCase/snake_case identifiers into space-separated words so the
/// embedding model can match natural-language queries against symbol names.
///
/// "getDonationRankings" → "get Donation Rankings"
/// "build_non_code_ranges" → "build non code ranges"
/// "HTTPServer" → "HTTP Server"
///
/// Names with nothing to split (no underscores and no uppercase letters, or
/// names that do not yield at least two words — e.g. a single ALL-CAPS run)
/// are returned unchanged.
///
/// NOTE: this doc replaces a stale comment block that described
/// `build_embedding_text` (and its pre-v2 default-off docstring behaviour)
/// but, by Rust doc-comment attachment rules, was attached to this function.
fn split_identifier(name: &str) -> String {
    // Fast path: plain lowercase identifiers have nothing to split.
    if !name.contains('_') && !name.chars().any(|c| c.is_uppercase()) {
        return name.to_string();
    }
    let mut words = Vec::new();
    let mut current = String::new();
    let chars: Vec<char> = name.chars().collect();
    for (i, &ch) in chars.iter().enumerate() {
        if ch == '_' {
            // snake_case boundary: flush the pending word (take avoids a clone).
            if !current.is_empty() {
                words.push(std::mem::take(&mut current));
            }
        } else if ch.is_uppercase()
            && !current.is_empty()
            && (current
                .chars()
                .last()
                .map(|c| c.is_lowercase())
                .unwrap_or(false)
                || chars.get(i + 1).map(|c| c.is_lowercase()).unwrap_or(false))
        {
            // CamelCase boundary ("aB", or the end of an acronym run like
            // "HTTPServer"), but not inside ALL_CAPS runs.
            words.push(std::mem::take(&mut current));
            current.push(ch);
        } else {
            current.push(ch);
        }
    }
    if !current.is_empty() {
        words.push(current);
    }
    if words.len() <= 1 {
        return name.to_string(); // No meaningful split
    }
    words.join(" ")
}
1814
1815fn is_test_only_symbol(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> bool {
1816    let fp = &sym.file_path;
1817
1818    // ── Path-based detection (language-agnostic) ─────────────────────
1819    // Rust
1820    if fp.contains("/tests/") || fp.ends_with("_tests.rs") {
1821        return true;
1822    }
1823    // JS/TS — Jest __tests__ directory
1824    if fp.contains("/__tests__/") || fp.contains("\\__tests__\\") {
1825        return true;
1826    }
1827    // Python
1828    if fp.ends_with("_test.py") {
1829        return true;
1830    }
1831    // Go
1832    if fp.ends_with("_test.go") {
1833        return true;
1834    }
1835    // JS/TS — .test.* / .spec.*
1836    if fp.ends_with(".test.ts")
1837        || fp.ends_with(".test.tsx")
1838        || fp.ends_with(".test.js")
1839        || fp.ends_with(".test.jsx")
1840        || fp.ends_with(".spec.ts")
1841        || fp.ends_with(".spec.js")
1842    {
1843        return true;
1844    }
1845    // Java/Kotlin — Maven src/test/ layout
1846    if fp.contains("/src/test/") {
1847        return true;
1848    }
1849    // Java — *Test.java / *Tests.java
1850    if fp.ends_with("Test.java") || fp.ends_with("Tests.java") {
1851        return true;
1852    }
1853    // Ruby
1854    if fp.ends_with("_test.rb") || fp.contains("/spec/") {
1855        return true;
1856    }
1857
1858    // ── Rust name_path patterns ───────────────────────────────────────
1859    if sym.name_path.starts_with("tests::")
1860        || sym.name_path.contains("::tests::")
1861        || sym.name_path.starts_with("test::")
1862        || sym.name_path.contains("::test::")
1863    {
1864        return true;
1865    }
1866
1867    let Some(source) = source else {
1868        return false;
1869    };
1870
1871    let start = usize::try_from(sym.start_byte.max(0))
1872        .unwrap_or(0)
1873        .min(source.len());
1874
1875    // ── Source-based: Rust attributes ────────────────────────────────
1876    let window_start = start.saturating_sub(2048);
1877    let attrs = String::from_utf8_lossy(&source.as_bytes()[window_start..start]);
1878    if attrs.contains("#[test]")
1879        || attrs.contains("#[tokio::test]")
1880        || attrs.contains("#[cfg(test)]")
1881        || attrs.contains("#[cfg(all(test")
1882    {
1883        return true;
1884    }
1885
1886    // ── Source-based: Python ─────────────────────────────────────────
1887    // Function names starting with `test_` or class names starting with `Test`
1888    if fp.ends_with(".py") {
1889        if sym.name.starts_with("test_") {
1890            return true;
1891        }
1892        // Class whose name starts with "Test" — also matches TestCase subclasses
1893        if sym.kind == "class" && sym.name.starts_with("Test") {
1894            return true;
1895        }
1896    }
1897
1898    // ── Source-based: Go ─────────────────────────────────────────────
1899    // func TestXxx(...) pattern; file must end with _test.go (already caught above),
1900    // but guard on .go extension for any edge-case non-test files with Test* helpers.
1901    if fp.ends_with(".go") && sym.name.starts_with("Test") && sym.kind == "function" {
1902        return true;
1903    }
1904
1905    // ── Source-based: Java / Kotlin ──────────────────────────────────
1906    if fp.ends_with(".java") || fp.ends_with(".kt") {
1907        let before = &source[..start];
1908        let window = if before.len() > 200 {
1909            &before[before.len() - 200..]
1910        } else {
1911            before
1912        };
1913        if window.contains("@Test")
1914            || window.contains("@ParameterizedTest")
1915            || window.contains("@RepeatedTest")
1916        {
1917            return true;
1918        }
1919    }
1920
1921    false
1922}
1923
/// Build the text that gets embedded for a symbol.
///
/// Base shape: `"{kind} {name (split words)}{ (in parent)}{ [module]}{ in filename}"`,
/// with `": {signature}"` appended when the signature is non-empty.
///
/// Hints appended after the base, in order:
/// - the leading docstring (via `extract_leading_doc`) or, when none exists,
///   the first meaningful body lines (via `extract_body_hint`), both capped
///   by `hint_line_budget()` / `join_hint_lines`;
/// - `" · NL: …"` tokens from `extract_nl_tokens` when it returns any;
/// - `" · API: …"` call-site hints from `extract_api_calls` when it returns any.
///
/// Docstring/body hints are ON by default; set `CODELENS_EMBED_DOCSTRINGS=0`
/// (or `false`) to emit only the base text.
fn build_embedding_text(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> String {
    // File context: use only the filename (not full path) to reduce noise.
    // Full paths like "crates/codelens-engine/src/symbols/mod.rs" add tokens
    // that dilute the semantic signal. "mod.rs" is sufficient context.
    let file_ctx = if sym.file_path.is_empty() {
        String::new()
    } else {
        let filename = sym.file_path.rsplit('/').next().unwrap_or(&sym.file_path);
        format!(" in {}", filename)
    };

    // Include split identifier words for better NL matching
    // e.g. "getDonationRankings" → "get Donation Rankings"
    let split_name = split_identifier(&sym.name);
    let name_with_split = if split_name != sym.name {
        format!("{} ({})", sym.name, split_name)
    } else {
        sym.name.clone()
    };

    // Add parent context from name_path (e.g. "UserService/get_user" → "in UserService")
    let parent_ctx = if !sym.name_path.is_empty() && sym.name_path.contains('/') {
        let parent = sym.name_path.rsplit_once('/').map(|x| x.0).unwrap_or("");
        if parent.is_empty() {
            String::new()
        } else {
            format!(" (in {})", parent)
        }
    } else {
        String::new()
    };

    // Module context: directory name provides domain signal without full path noise.
    // "embedding/mod.rs" → module "embedding", "symbols/ranking.rs" → module "symbols"
    let module_ctx = if sym.file_path.contains('/') {
        let parts: Vec<&str> = sym.file_path.rsplitn(3, '/').collect();
        if parts.len() >= 2 {
            let dir = parts[1];
            // Skip generic dirs like "src"
            if dir != "src" && dir != "crates" {
                format!(" [{dir}]")
            } else {
                String::new()
            }
        } else {
            String::new()
        }
    } else {
        String::new()
    };

    let base = if sym.signature.is_empty() {
        format!(
            "{} {}{}{}{}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx
        )
    } else {
        format!(
            "{} {}{}{}{}: {}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx, sym.signature
        )
    };

    // Docstring inclusion: v2 model improved NL understanding (+45%), enabling
    // docstrings by default. Measured: ranked_context +0.020, semantic -0.003 (neutral).
    // Disable via CODELENS_EMBED_DOCSTRINGS=0 if needed.
    let docstrings_disabled = std::env::var("CODELENS_EMBED_DOCSTRINGS")
        .map(|v| v == "0" || v == "false")
        .unwrap_or(false);

    if docstrings_disabled {
        return base;
    }

    let docstring = source
        .and_then(|src| extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize))
        .unwrap_or_default();

    let mut text = if docstring.is_empty() {
        // Fallback: extract the first few meaningful lines from the function
        // body. This captures key API calls (e.g. "tree_sitter::Parser",
        // "stdin()") that help the embedding model match NL queries to
        // symbols without docs.
        let body_hint = source
            .and_then(|src| extract_body_hint(src, sym.start_byte as usize, sym.end_byte as usize))
            .unwrap_or_default();
        if body_hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, body_hint)
        }
    } else {
        // Collect up to hint_line_budget() non-empty docstring lines
        // (rather than only the first) so the embedding model sees
        // multi-sentence explanations in full — up to the runtime
        // char budget via join_hint_lines.
        let line_budget = hint_line_budget();
        let lines: Vec<String> = docstring
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .take(line_budget)
            .map(str::to_string)
            .collect();
        let hint = join_hint_lines(&lines);
        if hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, hint)
        }
    };

    // v1.5 Phase 2b experiment: optionally append NL tokens harvested from
    // comments and string literals inside the body. Disabled by default;
    // enable with `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` to A/B.
    if let Some(src) = source
        && let Some(nl_tokens) =
            extract_nl_tokens(src, sym.start_byte as usize, sym.end_byte as usize)
        && !nl_tokens.is_empty()
    {
        text.push_str(" · NL: ");
        text.push_str(&nl_tokens);
    }

    // v1.5 Phase 2c experiment: optionally append `Type::method` call-site
    // hints harvested from the body. Disabled by default; enable with
    // `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` to A/B. Orthogonal to
    // Phase 2b — both can be stacked.
    if let Some(src) = source
        && let Some(api_calls) =
            extract_api_calls(src, sym.start_byte as usize, sym.end_byte as usize)
        && !api_calls.is_empty()
    {
        text.push_str(" · API: ");
        text.push_str(&api_calls);
    }

    text
}
2063
/// Default cap on total characters collected from docstring / body-hint
/// lines. Kept deliberately small so hints do not dilute the signature
/// signal for the bundled MiniLM-L12-CodeSearchNet INT8 model. Override via
/// `CODELENS_EMBED_HINT_CHARS` (clamped to 60..=512).
///
/// History: a v1.5 Phase 2 PoC briefly raised this to 180 chars / 3 lines to
/// close the NL query MRR gap, but the 2026-04-11 A/B
/// (`benchmarks/embedding-quality-v1.5-hint1` vs `-phase2`) measured
/// `hybrid -0.005`, `NL hybrid -0.008`, `NL semantic_search -0.041`, so the
/// defaults reverted to the pre-PoC values. The plumbing
/// (`join_hint_lines`, `hint_line_budget`, env overrides) stayed so the next
/// experiment does not need a rewrite.
const DEFAULT_HINT_TOTAL_CHAR_BUDGET: usize = 60;

/// Default number of meaningful lines collected from a function body.
/// Override via `CODELENS_EMBED_HINT_LINES` (clamped to 1..=10).
const DEFAULT_HINT_LINES: usize = 1;

/// Runtime char budget: `CODELENS_EMBED_HINT_CHARS` when set and parseable
/// (clamped to 60..=512), otherwise `DEFAULT_HINT_TOTAL_CHAR_BUDGET`.
fn hint_char_budget() -> usize {
    match std::env::var("CODELENS_EMBED_HINT_CHARS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
    {
        Some(n) => n.clamp(60, 512),
        None => DEFAULT_HINT_TOTAL_CHAR_BUDGET,
    }
}

/// Runtime line budget: `CODELENS_EMBED_HINT_LINES` when set and parseable
/// (clamped to 1..=10), otherwise `DEFAULT_HINT_LINES`.
fn hint_line_budget() -> usize {
    match std::env::var("CODELENS_EMBED_HINT_LINES")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
    {
        Some(n) => n.clamp(1, 10),
        None => DEFAULT_HINT_LINES,
    }
}
2097
2098/// Join collected hint lines, capping at the runtime-configured char
2099/// budget (default 60 chars; override via `CODELENS_EMBED_HINT_CHARS`).
2100///
2101/// Each line is separated by " · " so the embedding model sees a small
2102/// structural boundary between logically distinct body snippets. The final
2103/// result is truncated with a trailing "..." on char-boundaries only.
2104fn join_hint_lines(lines: &[String]) -> String {
2105    if lines.is_empty() {
2106        return String::new();
2107    }
2108    let joined = lines
2109        .iter()
2110        .map(String::as_str)
2111        .collect::<Vec<_>>()
2112        .join(" · ");
2113    let budget = hint_char_budget();
2114    if joined.chars().count() > budget {
2115        let truncated: String = joined.chars().take(budget).collect();
2116        format!("{truncated}...")
2117    } else {
2118        joined
2119    }
2120}
2121
2122/// Extract up to `hint_line_budget()` meaningful lines from a function body
2123/// (skipping braces, blank lines, and comments). Used as a fallback when no
2124/// docstring is available so the embedding model still sees the core API
2125/// calls / return values.
2126///
2127/// Historically this returned only the first meaningful line clipped at 60
2128/// chars. The 180-char / 3-line budget was introduced in v1.5 Phase 2 to
2129/// close the NL-query gap (MRR 0.528) on cases where the discriminating
2130/// keyword lives in line 2 or 3 of the body.
2131fn extract_body_hint(source: &str, start: usize, end: usize) -> Option<String> {
2132    if start >= source.len() || end > source.len() || start >= end {
2133        return None;
2134    }
2135    let safe_start = if source.is_char_boundary(start) {
2136        start
2137    } else {
2138        source.floor_char_boundary(start)
2139    };
2140    let safe_end = end.min(source.len());
2141    let safe_end = if source.is_char_boundary(safe_end) {
2142        safe_end
2143    } else {
2144        source.floor_char_boundary(safe_end)
2145    };
2146    let body = &source[safe_start..safe_end];
2147
2148    let max_lines = hint_line_budget();
2149    let mut collected: Vec<String> = Vec::with_capacity(max_lines);
2150
2151    // Skip past the signature: everything until we see a line ending with '{' or ':'
2152    // (opening brace of the function body), then start looking for meaningful lines.
2153    let mut past_signature = false;
2154    for line in body.lines() {
2155        let trimmed = line.trim();
2156        if !past_signature {
2157            // Keep skipping until we find the opening brace/colon
2158            if trimmed.ends_with('{') || trimmed.ends_with(':') || trimmed == "{" {
2159                past_signature = true;
2160            }
2161            continue;
2162        }
2163        // Skip comments, blank lines, closing braces
2164        if trimmed.is_empty()
2165            || trimmed.starts_with("//")
2166            || trimmed.starts_with('#')
2167            || trimmed.starts_with("/*")
2168            || trimmed.starts_with('*')
2169            || trimmed == "}"
2170        {
2171            continue;
2172        }
2173        collected.push(trimmed.to_string());
2174        if collected.len() >= max_lines {
2175            break;
2176        }
2177    }
2178
2179    if collected.is_empty() {
2180        None
2181    } else {
2182        Some(join_hint_lines(&collected))
2183    }
2184}
2185
2186/// Return true when NL-token collection is enabled via
2187/// `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` (or `true`/`yes`/`on`).
2188///
2189/// v1.5 Phase 2b infrastructure — kept off by default pending A/B
2190/// measurement against the fixed 89-query dataset.
2191///
2192/// v1.5 Phase 2j: when no explicit env var is set, fall through to
2193/// `auto_hint_should_enable()` which consults `CODELENS_EMBED_HINT_AUTO` +
2194/// `CODELENS_EMBED_HINT_AUTO_LANG` for language-gated defaults.
2195fn nl_tokens_enabled() -> bool {
2196    if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_COMMENTS") {
2197        return explicit;
2198    }
2199    auto_hint_should_enable()
2200}
2201
2202/// Return true when v1.5 Phase 2j auto-detection mode is enabled.
2203///
2204/// **v1.6.0 default change (§8.14)**: this returns `true` by default.
2205/// Users opt **out** with `CODELENS_EMBED_HINT_AUTO=0` (or `false` /
2206/// `no` / `off`). The previous v1.5.x behaviour was the other way
2207/// around — default OFF, opt in with `=1`. The flip ships as part of
2208/// v1.6.0 after the five-dataset measurement (§8.7, §8.8, §8.13,
2209/// §8.11, §8.12) validated:
2210///
2211/// 1. Rust / C / C++ / Go / Java / Kotlin / Scala / C# projects hit
2212///    the §8.7 stacked arm (+2.4 % to +15.2 % hybrid MRR).
2213/// 2. TypeScript / JavaScript projects validated the Phase 2b/2c
2214///    embedding hints on `facebook/jest` and later `microsoft/typescript`.
2215///    Subsequent app/runtime follow-ups (`vercel/next.js`,
2216///    `facebook/react` production subtree) motivated splitting Phase 2e
2217///    out of the JS/TS auto path, but not removing JS/TS from the
2218///    embedding-hint default.
2219/// 3. Python projects hit the §8.8 baseline (no change) — the
2220///    §8.11 language gate + §8.12 MCP auto-set means Python is
2221///    auto-detected and the stack stays OFF without user action.
2222/// 4. Ruby / PHP / Lua / shell / untested-dynamic projects fall
2223///    through to the conservative default-off branch (same as
2224///    Python behaviour — no regression).
2225///
2226/// The dominant language is supplied by the MCP tool layer via the
2227/// `CODELENS_EMBED_HINT_AUTO_LANG` env var, which is set
2228/// automatically on startup (`main.rs`) and on MCP
2229/// `activate_project` calls by `compute_dominant_language` (§8.12).
2230/// The engine only reads the env var — it does not walk the
2231/// filesystem itself.
2232///
2233/// Explicit `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1` /
2234/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` /
2235/// `CODELENS_RANK_SPARSE_TERM_WEIGHT=1` (or their `=0` counterparts)
2236/// always win over the auto decision — users who want to force a
2237/// configuration still can, the auto mode is a better default, not
2238/// a lock-in.
2239///
2240/// **Opt-out**: set `CODELENS_EMBED_HINT_AUTO=0` to restore v1.5.x
2241/// behaviour (no auto-detection, all Phase 2 gates default off unless
2242/// their individual env vars are set).
2243pub(super) fn auto_hint_mode_enabled() -> bool {
2244    parse_bool_env("CODELENS_EMBED_HINT_AUTO").unwrap_or(true)
2245}
2246
2247/// Return the language tag supplied by the MCP tool layer via
2248/// `CODELENS_EMBED_HINT_AUTO_LANG`, or `None` when unset. The tag is
2249/// compared against `language_supports_nl_stack` to decide whether
2250/// the Phase 2b / 2c / 2e stack should be auto-enabled.
2251///
2252/// Accepted tags are the canonical extensions from
2253/// `crates/codelens-engine/src/lang_config.rs` (`rs`, `py`, `js`,
2254/// `ts`, `go`, `rb`, `java`, `kt`, `scala`, `cs`, `cpp`, `c`, …) plus
2255/// a handful of long-form aliases (`rust`, `python`, `javascript`,
2256/// `typescript`, `golang`) for users who set the env var by hand.
2257pub(super) fn auto_hint_lang() -> Option<String> {
2258    std::env::var("CODELENS_EMBED_HINT_AUTO_LANG")
2259        .ok()
2260        .map(|raw| raw.trim().to_ascii_lowercase())
2261}
2262
2263/// Return true when `lang` is a language where the v1.5 embedding-hint
2264/// stack (Phase 2b comments + Phase 2c API-call extraction) has been
2265/// measured to net-positive (§8.2, §8.4, §8.6, §8.7, §8.13, §8.15) or
2266/// where the language's static typing + snake_case naming + comment-first
2267/// culture makes the mechanism behave the same way it does on Rust.
2268///
2269/// This gate is intentionally separate from the Phase 2e sparse
2270/// re-ranker. As of the §8.15 / §8.16 / §8.17 follow-up arc, JS/TS stays
2271/// enabled here because tooling/compiler repos are positive and short-file
2272/// runtime repos are inert, but JS/TS is disabled in the **sparse**
2273/// auto-gate because Phase 2e is negative-or-null on that family.
2274///
2275/// The list is intentionally conservative — additions require an actual
2276/// external-repo A/B following the §8.7 methodology, not a
2277/// language-similarity argument alone.
2278///
2279/// **Supported** (measured or by static-typing analogy):
2280/// - `rs`, `rust` (§8.2, §8.4, §8.6, §8.7: +2.4 %, +7.1 %, +15.2 %)
2281/// - `cpp`, `cc`, `cxx`, `c++`
2282/// - `c`
2283/// - `go`, `golang`
2284/// - `java`
2285/// - `kt`, `kotlin`
2286/// - `scala`
2287/// - `cs`, `csharp`
2288/// - `ts`, `typescript`, `tsx` (§8.13: `facebook/jest` +7.3 % hybrid MRR)
2289/// - `js`, `javascript`, `jsx`
2290///
2291/// **Unsupported** (measured regression or untested dynamic-typed):
2292/// - `py`, `python` (§8.8 regression)
2293/// - `rb`, `ruby`
2294/// - `php`
2295/// - `lua`, `r`, `jl`
2296/// - `sh`, `bash`
2297/// - anything else
2298pub(super) fn language_supports_nl_stack(lang: &str) -> bool {
2299    matches!(
2300        lang.trim().to_ascii_lowercase().as_str(),
2301        "rs" | "rust"
2302            | "cpp"
2303            | "cc"
2304            | "cxx"
2305            | "c++"
2306            | "c"
2307            | "go"
2308            | "golang"
2309            | "java"
2310            | "kt"
2311            | "kotlin"
2312            | "scala"
2313            | "cs"
2314            | "csharp"
2315            | "ts"
2316            | "typescript"
2317            | "tsx"
2318            | "js"
2319            | "javascript"
2320            | "jsx"
2321    )
2322}
2323
2324/// Return true when `lang` is a language where the Phase 2e sparse
2325/// coverage re-ranker should be auto-enabled when the user has not set
2326/// `CODELENS_RANK_SPARSE_TERM_WEIGHT` explicitly.
2327///
2328/// This is deliberately narrower than `language_supports_nl_stack`.
2329/// Phase 2e remains positive on Rust-style codebases, but the JS/TS
2330/// measurement arc now says:
2331///
2332/// - `facebook/jest`: marginal positive
2333/// - `microsoft/typescript`: negative
2334/// - `vercel/next.js`: slight negative
2335/// - `facebook/react` production subtree: exact no-op
2336///
2337/// So the conservative Phase 2m policy is:
2338/// - keep Phase 2b/2c auto-eligible on JS/TS
2339/// - disable **auto** Phase 2e on JS/TS
2340/// - preserve explicit env override for users who want to force it on
2341pub(super) fn language_supports_sparse_weighting(lang: &str) -> bool {
2342    matches!(
2343        lang.trim().to_ascii_lowercase().as_str(),
2344        "rs" | "rust"
2345            | "cpp"
2346            | "cc"
2347            | "cxx"
2348            | "c++"
2349            | "c"
2350            | "go"
2351            | "golang"
2352            | "java"
2353            | "kt"
2354            | "kotlin"
2355            | "scala"
2356            | "cs"
2357            | "csharp"
2358    )
2359}
2360
2361/// Combined decision: Phase 2j auto mode is enabled AND the detected
2362/// language supports the Phase 2b/2c embedding-hint stack. This is the
2363/// `else` branch that `nl_tokens_enabled` and `api_calls_enabled` fall
2364/// through to when no explicit env var is set.
2365pub(super) fn auto_hint_should_enable() -> bool {
2366    if !auto_hint_mode_enabled() {
2367        return false;
2368    }
2369    match auto_hint_lang() {
2370        Some(lang) => language_supports_nl_stack(&lang),
2371        None => false, // auto mode on but no language tag → conservative OFF
2372    }
2373}
2374
2375/// Combined decision: Phase 2j auto mode is enabled AND the detected
2376/// language supports auto-enabling the Phase 2e sparse re-ranker.
2377///
2378/// This intentionally differs from `auto_hint_should_enable()` after the
2379/// §8.15 / §8.16 / §8.17 JS/TS follow-up arc: embedding hints stay
2380/// auto-on for JS/TS, but sparse weighting does not.
2381pub(super) fn auto_sparse_should_enable() -> bool {
2382    if !auto_hint_mode_enabled() {
2383        return false;
2384    }
2385    match auto_hint_lang() {
2386        Some(lang) => language_supports_sparse_weighting(&lang),
2387        None => false,
2388    }
2389}
2390
2391/// Heuristic: does this string look like natural language rather than
2392/// a code identifier, path, or numeric literal?
2393///
2394/// Criteria:
2395/// - at least 4 characters
2396/// - no path / scope separators (`/`, `\`, `::`)
2397/// - must contain a space (multi-word)
2398/// - alphabetic character ratio >= 60%
2399pub(super) fn is_nl_shaped(s: &str) -> bool {
2400    let s = s.trim();
2401    if s.chars().count() < 4 {
2402        return false;
2403    }
2404    if s.contains('/') || s.contains('\\') || s.contains("::") {
2405        return false;
2406    }
2407    if !s.contains(' ') {
2408        return false;
2409    }
2410    let non_ws: usize = s.chars().filter(|c| !c.is_whitespace()).count();
2411    if non_ws == 0 {
2412        return false;
2413    }
2414    let alpha: usize = s.chars().filter(|c| c.is_alphabetic()).count();
2415    (alpha * 100) / non_ws >= 60
2416}
2417
/// Return true when the v1.5 Phase 2i strict comment filter is enabled via
/// `CODELENS_EMBED_HINT_STRICT_COMMENTS=1` (or `true`/`yes`/`on`).
///
/// Phase 2i is the comment-side analogue of the Phase 2h (§8.9) literal
/// filter: Phase 2h recovered ~8 % of the Python regression by rejecting
/// format/error/log string literals in Pass 2; Phase 2i targets the
/// remaining ~92 % by rejecting meta-annotation comments (`# TODO`,
/// `# FIXME`, `# HACK`, `# XXX`, `# BUG`, `# REVIEW`, `# REFACTOR`,
/// `# TEMP`, `# DEPRECATED`) in Pass 1. `# NOTE`, `# WARN`, `# SAFETY` are
/// retained because they often carry behaviour-descriptive content even on
/// Rust. Default OFF (same policy as every Phase 2 knob); orthogonal to
/// `CODELENS_EMBED_HINT_STRICT_LITERALS`, so both may be stacked.
fn strict_comments_enabled() -> bool {
    std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS")
        .ok()
        .is_some_and(|raw| matches!(raw.to_ascii_lowercase().as_str(), "1" | "true" | "yes" | "on"))
}
2440
2441/// Heuristic: does `body` (the comment text *after* the `//` / `#` prefix
2442/// has been stripped by `extract_comment_body`) look like a meta-annotation
2443/// rather than behaviour-descriptive prose?
2444///
2445/// Recognises the following prefixes (case-insensitive, followed by
2446/// `:`, `(`, or whitespace):
2447/// - `TODO`, `FIXME`, `HACK`, `XXX`, `BUG`
2448/// - `REVIEW`, `REFACTOR`, `TEMP`, `TEMPORARY`, `DEPRECATED`
2449///
2450/// Deliberately excluded (kept as behaviour signal):
2451/// - `NOTE`, `NOTES`, `WARN`, `WARNING`
2452/// - `SAFETY` (Rust `unsafe` block justifications)
2453/// - `PANIC` (Rust invariant docs)
2454///
2455/// The exclusion list is based on the observation that Rust projects
2456/// use `// SAFETY:` and `// NOTE:` to document *why* a block behaves a
2457/// certain way — that text is exactly the NL retrieval signal Phase 2b
2458/// is trying to capture. The inclusion list targets the "I'll fix this
2459/// later" noise that poisons the embedding on both languages but is
2460/// especially common on mature Python projects.
2461pub(super) fn looks_like_meta_annotation(body: &str) -> bool {
2462    let trimmed = body.trim_start();
2463    // Find the end of the first "word" (alphanumerics only — a colon,
2464    // paren, or whitespace terminates the marker).
2465    let word_end = trimmed
2466        .find(|c: char| !c.is_ascii_alphabetic())
2467        .unwrap_or(trimmed.len());
2468    if word_end == 0 {
2469        return false;
2470    }
2471    let first_word = &trimmed[..word_end];
2472    let upper = first_word.to_ascii_uppercase();
2473    matches!(
2474        upper.as_str(),
2475        "TODO"
2476            | "FIXME"
2477            | "HACK"
2478            | "XXX"
2479            | "BUG"
2480            | "REVIEW"
2481            | "REFACTOR"
2482            | "TEMP"
2483            | "TEMPORARY"
2484            | "DEPRECATED"
2485    )
2486}
2487
/// Return true when the v1.5 Phase 2h strict NL literal filter is enabled
/// via `CODELENS_EMBED_HINT_STRICT_LITERALS=1` (or `true`/`yes`/`on`,
/// case-insensitive).
///
/// Phase 2h addresses the Phase 3b Python regression (§8.8): the default
/// Phase 2b Pass 2 scanner accepts any `is_nl_shaped` string literal from
/// the body, which on Python captures many generic error / log / format
/// strings (`raise ValueError("Invalid URL %s" % url)`, `logging.debug(...)`,
/// `fmt.format(...)`). These pass the NL-shape test but carry zero
/// behaviour-descriptive signal and pollute the embedding. The strict
/// filter rejects string literals that look like format templates or
/// common error / log prefixes, while leaving comments (Pass 1) untouched.
///
/// Default OFF (same policy as every Phase 2 knob — opt-in first,
/// measure, then consider flipping the default).
fn strict_literal_filter_enabled() -> bool {
    match std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS") {
        Ok(raw) => matches!(
            raw.to_ascii_lowercase().as_str(),
            "1" | "true" | "yes" | "on"
        ),
        Err(_) => false,
    }
}
2510
2511/// Heuristic: does `s` contain a C / Python / Rust format specifier?
2512///
2513/// Recognises:
2514/// - C / Python `%` style: `%s`, `%d`, `%r`, `%f`, `%x`, `%o`, `%i`, `%u`
2515/// - Python `.format` / f-string style: `{name}`, `{0}`, `{:fmt}`, `{name:fmt}`
2516///
2517/// Rust `format!` / `println!` style `{}` / `{:?}` / `{name}` is caught by
2518/// the same `{...}` branch. Generic `{...}` braces used for JSON-like
2519/// content (e.g. `"{name: foo, id: 1}"`) are distinguished from format
2520/// placeholders by requiring the inside to be either empty, prefix-colon
2521/// (`:fmt`), a single identifier, or an identifier followed by `:fmt`.
2522pub(super) fn contains_format_specifier(s: &str) -> bool {
2523    let bytes = s.as_bytes();
2524    let len = bytes.len();
2525    let mut i = 0;
2526    while i + 1 < len {
2527        if bytes[i] == b'%' {
2528            let next = bytes[i + 1];
2529            if matches!(next, b's' | b'd' | b'r' | b'f' | b'x' | b'o' | b'i' | b'u') {
2530                return true;
2531            }
2532        }
2533        i += 1;
2534    }
2535    // Python `.format` / f-string / Rust `format!` style `{...}`
2536    //
2537    // Real format placeholders never contain whitespace inside the braces:
2538    // `{}`, `{0}`, `{name}`, `{:?}`, `{:.2f}`, `{name:fmt}`. JSON-like
2539    // content such as `{name: foo, id: 1}` DOES contain whitespace. The
2540    // whitespace check is therefore the single simplest and most robust
2541    // way to distinguish the two without a full format-spec parser.
2542    for window in s.split('{').skip(1) {
2543        let Some(close_idx) = window.find('}') else {
2544            continue;
2545        };
2546        let inside = &window[..close_idx];
2547        // `{}` — Rust empty placeholder
2548        if inside.is_empty() {
2549            return true;
2550        }
2551        // Any whitespace inside the braces → JSON-like, not a format spec.
2552        if inside.chars().any(|c| c.is_whitespace()) {
2553            continue;
2554        }
2555        // `{:fmt}` — anonymous format spec
2556        if inside.starts_with(':') {
2557            return true;
2558        }
2559        // `{name}`, `{0}`, `{name:fmt}` — identifier (or digit), optionally
2560        // followed by `:fmt`. We already rejected whitespace-containing
2561        // inputs above, so here we only need to check the identifier chars.
2562        let ident_end = inside.find(':').unwrap_or(inside.len());
2563        let ident = &inside[..ident_end];
2564        if !ident.is_empty()
2565            && ident
2566                .chars()
2567                .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.')
2568        {
2569            return true;
2570        }
2571    }
2572    false
2573}
2574
2575/// Heuristic: does `s` look like a generic error message, log line, or
2576/// low-value imperative string that an NL query would never try to match?
2577///
2578/// The prefix list is intentionally short — covering the patterns the
2579/// Phase 3b `psf/requests` post-mortem flagged as the largest regression
2580/// sources. False negatives (real behaviour strings misclassified as
2581/// errors) would cost retrieval quality, but because the filter only
2582/// runs on string literals and leaves comments alone, a missed NL string
2583/// in one symbol will typically have a comment covering the same
2584/// behaviour on the same symbol.
2585pub(super) fn looks_like_error_or_log_prefix(s: &str) -> bool {
2586    let lower = s.trim().to_lowercase();
2587    const PREFIXES: &[&str] = &[
2588        "invalid ",
2589        "cannot ",
2590        "could not ",
2591        "unable to ",
2592        "failed to ",
2593        "expected ",
2594        "unexpected ",
2595        "missing ",
2596        "not found",
2597        "error: ",
2598        "error ",
2599        "warning: ",
2600        "warning ",
2601        "sending ",
2602        "received ",
2603        "starting ",
2604        "stopping ",
2605        "calling ",
2606        "connecting ",
2607        "disconnecting ",
2608    ];
2609    PREFIXES.iter().any(|p| lower.starts_with(p))
2610}
2611
/// Test-only variant: bypass the env gate so the unit tests can exercise
/// the strict-literal filter logic deterministically (mirrors the
/// `extract_nl_tokens_inner` vs `extract_nl_tokens` policy). Compiled
/// only under `#[cfg(test)]` so the release binary path never calls it.
///
/// A literal is rejected when it looks like a format template OR a
/// generic error / log prefix.
#[cfg(test)]
pub(super) fn should_reject_literal_strict(s: &str) -> bool {
    if contains_format_specifier(s) {
        return true;
    }
    looks_like_error_or_log_prefix(s)
}
2620
2621/// Collect natural-language tokens from a function body: line comments,
2622/// block comments, and string literals that look like NL prose.
2623///
2624/// v1.5 Phase 2b experiment. The hypothesis is that the bundled
2625/// CodeSearchNet-INT8 model struggles with NL queries (hybrid MRR 0.472)
2626/// because the symbol text it sees is pure code, whereas NL queries target
2627/// behavioural descriptions that live in *comments* and *string literals*.
2628///
2629/// Unlike `extract_body_hint` (which skips comments) this function only
2630/// keeps comments + NL-shaped string literals and ignores actual code.
2631///
2632/// Gated by `CODELENS_EMBED_HINT_INCLUDE_COMMENTS=1`. Returns `None` when
2633/// the gate is off so the default embedding text is untouched.
2634fn extract_nl_tokens(source: &str, start: usize, end: usize) -> Option<String> {
2635    if !nl_tokens_enabled() {
2636        return None;
2637    }
2638    extract_nl_tokens_inner(source, start, end)
2639}
2640
/// Env-independent core of `extract_nl_tokens`, exposed to the test module
/// so unit tests can run deterministically without touching env vars
/// (which would race with the other tests that set
/// `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`).
///
/// Returns `None` for an invalid byte range or when no token survives the
/// filters; otherwise the surviving comment bodies and string literals
/// joined via `join_hint_lines`.
pub(super) fn extract_nl_tokens_inner(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Clamp both offsets down to char boundaries so the slice below can
    // never panic inside a multi-byte UTF-8 sequence.
    let safe_start = if source.is_char_boundary(start) {
        start
    } else {
        source.floor_char_boundary(start)
    };
    let safe_end = end.min(source.len());
    let safe_end = if source.is_char_boundary(safe_end) {
        safe_end
    } else {
        source.floor_char_boundary(safe_end)
    };
    let body = &source[safe_start..safe_end];

    let mut tokens: Vec<String> = Vec::new();

    // ── Pass 1: comments ─────────────────────────────────────────────
    // v1.5 Phase 2i: when CODELENS_EMBED_HINT_STRICT_COMMENTS=1 is set,
    // reject meta-annotation comments (`# TODO`, `# FIXME`, `# HACK`,
    // ...) while keeping behaviour-descriptive comments untouched. This
    // is the comment-side analogue of the Phase 2h literal filter
    // (§8.9) and targets the remaining ~92 % of the Python regression
    // that Phase 2h's literal-only filter left behind.
    let strict_comments = strict_comments_enabled();
    for line in body.lines() {
        let trimmed = line.trim();
        // Keep a comment only if it parses as one, is NL-shaped, and (in
        // strict mode) is not a meta-annotation marker.
        if let Some(cleaned) = extract_comment_body(trimmed)
            && is_nl_shaped(&cleaned)
            && (!strict_comments || !looks_like_meta_annotation(&cleaned))
        {
            tokens.push(cleaned);
        }
    }

    // ── Pass 2: double-quoted string literals ────────────────────────
    // Simplified scanner — handles escape sequences but does not track
    // multi-line strings or raw strings. Good enough for NL-shaped
    // heuristic filtering where false negatives are acceptable.
    //
    // v1.5 Phase 2h: when CODELENS_EMBED_HINT_STRICT_LITERALS=1 is set,
    // also reject format templates and generic error / log prefixes. This
    // addresses the Phase 3b Python regression documented in §8.8 —
    // comments (Pass 1) stay untouched so Rust projects keep their wins.
    let strict_literals = strict_literal_filter_enabled();
    // NOTE(review): `.peekable()` is vestigial — `peek()` is never called.
    let mut chars = body.chars().peekable();
    let mut in_string = false;
    let mut current = String::new();
    while let Some(c) = chars.next() {
        if in_string {
            if c == '\\' {
                // Skip escape sequence: unconditionally consume the next
                // char (the escaped one) so it isn't treated as a quote.
                let _ = chars.next();
            } else if c == '"' {
                // Closing quote: keep the literal only if it is NL-shaped
                // and survives the strict filters (when enabled).
                if is_nl_shaped(&current)
                    && (!strict_literals
                        || (!contains_format_specifier(&current)
                            && !looks_like_error_or_log_prefix(&current)))
                {
                    tokens.push(current.clone());
                }
                current.clear();
                in_string = false;
            } else {
                current.push(c);
            }
        } else if c == '"' {
            in_string = true;
        }
    }

    if tokens.is_empty() {
        return None;
    }
    Some(join_hint_lines(&tokens))
}
2723
2724/// Return true when API-call extraction is enabled via
2725/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1` (or `true`/`yes`/`on`).
2726///
2727/// v1.5 Phase 2c infrastructure — kept off by default pending A/B
2728/// measurement. Orthogonal to `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`
2729/// so both may be stacked.
2730///
2731/// v1.5 Phase 2j: explicit env > auto mode, same policy as Phase 2b.
2732fn api_calls_enabled() -> bool {
2733    if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_API_CALLS") {
2734        return explicit;
2735    }
2736    auto_hint_should_enable()
2737}
2738
2739/// Heuristic: does `ident` look like a Rust/C++ *type* (PascalCase) rather
2740/// than a module or free function (snake_case)?
2741///
2742/// Phase 2c API-call extractor relies on this filter to keep the hint
2743/// focused on static-method call sites (`Parser::new`, `HashMap::with_capacity`)
2744/// and drop module-scoped free functions (`std::fs::read_to_string`).
2745/// We intentionally accept only an ASCII uppercase first letter; stricter
2746/// than PascalCase detection but deliberate — the goal is high-precision
2747/// Type filtering, not lexical accuracy.
2748pub(super) fn is_static_method_ident(ident: &str) -> bool {
2749    ident.chars().next().is_some_and(|c| c.is_ascii_uppercase())
2750}
2751
2752/// Collect `Type::method` call sites from a function body.
2753///
2754/// v1.5 Phase 2c experiment. Hypothesis: exposing the Types a function
2755/// interacts with (via their static-method call sites) adds a lexical
2756/// bridge between NL queries ("parse json", "open database") and symbols
2757/// whose body references the relevant type (`Parser::new`, `Connection::open`).
2758/// This is orthogonal to Phase 2b (comments + NL-shaped literals), which
2759/// targets *explanatory* natural language rather than *type* hints.
2760///
2761/// Gated by `CODELENS_EMBED_HINT_INCLUDE_API_CALLS=1`. Returns `None` when
2762/// the gate is off so the default embedding text is untouched.
2763fn extract_api_calls(source: &str, start: usize, end: usize) -> Option<String> {
2764    if !api_calls_enabled() {
2765        return None;
2766    }
2767    extract_api_calls_inner(source, start, end)
2768}
2769
2770/// Env-independent core of `extract_api_calls`, exposed to the test module
2771/// so unit tests can run deterministically without touching env vars
2772/// (which would race with other tests that set
2773/// `CODELENS_EMBED_HINT_INCLUDE_API_CALLS`).
2774///
2775/// Scans the body for `Type::method` byte patterns where:
2776/// - `Type` starts with an ASCII uppercase letter and consists of
2777///   `[A-Za-z0-9_]*` (plain ASCII — non-ASCII identifiers are skipped
2778///   on purpose to minimise noise).
2779/// - `method` is any identifier (start `[A-Za-z_]`, continue `[A-Za-z0-9_]*`).
2780///
2781/// Duplicate `Type::method` pairs collapse into a single entry to avoid
2782/// biasing the embedding toward repeated calls in hot loops.
2783pub(super) fn extract_api_calls_inner(source: &str, start: usize, end: usize) -> Option<String> {
2784    if start >= source.len() || end > source.len() || start >= end {
2785        return None;
2786    }
2787    let safe_start = if source.is_char_boundary(start) {
2788        start
2789    } else {
2790        source.floor_char_boundary(start)
2791    };
2792    let safe_end = end.min(source.len());
2793    let safe_end = if source.is_char_boundary(safe_end) {
2794        safe_end
2795    } else {
2796        source.floor_char_boundary(safe_end)
2797    };
2798    if safe_start >= safe_end {
2799        return None;
2800    }
2801    let body = &source[safe_start..safe_end];
2802    let bytes = body.as_bytes();
2803    let len = bytes.len();
2804
2805    let mut calls: Vec<String> = Vec::new();
2806    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
2807
2808    let mut i = 0usize;
2809    while i < len {
2810        let b = bytes[i];
2811        // Walk forward until we find the start of an ASCII identifier.
2812        if !(b == b'_' || b.is_ascii_alphabetic()) {
2813            i += 1;
2814            continue;
2815        }
2816        let ident_start = i;
2817        while i < len {
2818            let bb = bytes[i];
2819            if bb == b'_' || bb.is_ascii_alphanumeric() {
2820                i += 1;
2821            } else {
2822                break;
2823            }
2824        }
2825        let ident_end = i;
2826
2827        // Must be immediately followed by `::`.
2828        if i + 1 >= len || bytes[i] != b':' || bytes[i + 1] != b':' {
2829            continue;
2830        }
2831
2832        let type_ident = &body[ident_start..ident_end];
2833        if !is_static_method_ident(type_ident) {
2834            // `snake_module::foo` — not a Type. Skip past the `::` so we
2835            // don't rescan the same characters, but keep walking.
2836            i += 2;
2837            continue;
2838        }
2839
2840        // Skip the `::`
2841        let mut j = i + 2;
2842        if j >= len || !(bytes[j] == b'_' || bytes[j].is_ascii_alphabetic()) {
2843            i = j;
2844            continue;
2845        }
2846        let method_start = j;
2847        while j < len {
2848            let bb = bytes[j];
2849            if bb == b'_' || bb.is_ascii_alphanumeric() {
2850                j += 1;
2851            } else {
2852                break;
2853            }
2854        }
2855        let method_end = j;
2856
2857        let method_ident = &body[method_start..method_end];
2858        let call = format!("{type_ident}::{method_ident}");
2859        if seen.insert(call.clone()) {
2860            calls.push(call);
2861        }
2862        i = j;
2863    }
2864
2865    if calls.is_empty() {
2866        return None;
2867    }
2868    Some(join_hint_lines(&calls))
2869}
2870
/// Peel the comment prefix off a trimmed line, returning the inner text
/// when the line is recognisably a `//`, `#`, `/* */`, or leading-`*`
/// comment.
///
/// Prefixes are tried longest-first so `///` / `//!` are not mistaken for
/// plain `//`. `#[...]` attributes and `#!...` shebangs are explicitly NOT
/// comments. A leading-`*` continuation is accepted only when its content
/// looks textual (no `;` or `{`), to avoid e.g. `*const T` pointer types.
fn extract_comment_body(trimmed: &str) -> Option<String> {
    if trimmed.is_empty() {
        return None;
    }
    // Rust doc comments first (`///`, `//!`), then plain `//`.
    for marker in ["///", "//!", "//"] {
        if let Some(rest) = trimmed.strip_prefix(marker) {
            return Some(rest.trim().to_string());
        }
    }
    // `#[...]` attribute, `#!...` shebang — NOT comments.
    if trimmed.starts_with("#[") || trimmed.starts_with("#!") {
        return None;
    }
    // `#` line comment (Python, bash, ...).
    if let Some(rest) = trimmed.strip_prefix('#') {
        return Some(rest.trim().to_string());
    }
    // Block-comment openers: `/**` before `/*`.
    for marker in ["/**", "/*"] {
        if let Some(rest) = trimmed.strip_prefix(marker) {
            return Some(rest.trim_end_matches("*/").trim().to_string());
        }
    }
    // Block-comment continuation line (`* text`).
    let rest = trimmed.strip_prefix('*')?;
    let rest = rest.trim_end_matches("*/").trim();
    if rest.is_empty() || rest.contains(';') || rest.contains('{') {
        // Empty or obviously a code continuation — reject.
        return None;
    }
    Some(rest.to_string())
}
2917
/// Extract the leading docstring or comment block from a symbol's body.
///
/// Recognised formats, selected by the first line after the signature:
/// - Python `"""` / `'''` triple-quote docstrings
/// - Rust `///` / `//!` doc comments (captured by tree-sitter before the body)
/// - JS/TS `/** ... */` block comments
/// - generic leading `//` or `#` comment runs
///
/// Returns `None` when the byte range is invalid, the body has no lines
/// beyond the signature, or no doc block is present.
fn extract_leading_doc(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Clamp both offsets down to the nearest char boundary so slicing can
    // never panic inside a multi-byte UTF-8 sequence.
    let clamp = |idx: usize| {
        let mut at = idx.min(source.len());
        while !source.is_char_boundary(at) {
            at -= 1;
        }
        at
    };
    let safe_start = clamp(start);
    let safe_end = clamp(end);
    if safe_start >= safe_end {
        return None;
    }
    let body = &source[safe_start..safe_end];
    let lines: Vec<&str> = body.lines().skip(1).collect(); // skip the signature line
    if lines.is_empty() {
        return None;
    }

    let mut doc_lines = Vec::new();

    let first_trimmed = lines.first().map(|l| l.trim()).unwrap_or_default();
    // Python: triple-quote docstrings
    if first_trimmed.starts_with("\"\"\"") || first_trimmed.starts_with("'''") {
        let quote = &first_trimmed[..3];
        for (idx, line) in lines.iter().enumerate() {
            let t = line.trim();
            doc_lines.push(t.trim_start_matches(quote).trim_end_matches(quote));
            // Stop at the closing quote. The opening line only closes the
            // docstring when it holds BOTH quote runs (i.e. a single-line
            // docstring, at least 6 quote chars long). BUGFIX: the old
            // `doc_lines.len() > 1` guard never terminated on a
            // single-line docstring, so trailing body code leaked into
            // the extracted doc text.
            let closes = if idx == 0 {
                t.len() >= 6 && t.ends_with(quote)
            } else {
                t.ends_with(quote)
            };
            if closes {
                break;
            }
        }
    }
    // Rust: /// or //! doc comments (before the body, captured by tree-sitter)
    else if first_trimmed.starts_with("///") || first_trimmed.starts_with("//!") {
        for line in &lines {
            let t = line.trim();
            if t.starts_with("///") || t.starts_with("//!") {
                doc_lines.push(t.trim_start_matches("///").trim_start_matches("//!").trim());
            } else {
                break;
            }
        }
    }
    // JS/TS: /** ... */ block comments
    else if first_trimmed.starts_with("/**") {
        for line in &lines {
            let t = line.trim();
            let cleaned = t
                .trim_start_matches("/**")
                .trim_start_matches('*')
                .trim_end_matches("*/")
                .trim();
            if !cleaned.is_empty() {
                doc_lines.push(cleaned);
            }
            if t.ends_with("*/") {
                break;
            }
        }
    }
    // Generic: leading // or # comment block
    else {
        for line in &lines {
            let t = line.trim();
            if t.starts_with("//") || t.starts_with('#') {
                doc_lines.push(t.trim_start_matches("//").trim_start_matches('#').trim());
            } else {
                break;
            }
        }
    }

    if doc_lines.is_empty() {
        return None;
    }
    Some(doc_lines.join(" ").trim().to_owned())
}
3004
3005pub(super) fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
3006    embedding.iter().flat_map(|f| f.to_le_bytes()).collect()
3007}
3008
3009#[cfg(test)]
3010mod tests {
3011    use super::*;
3012    use crate::db::{IndexDb, NewSymbol};
3013    use std::sync::Mutex;
3014
    /// Serialize tests that load the fastembed ONNX model to avoid file lock contention.
    /// The guarded value is `()` — the mutex exists purely for mutual exclusion.
    static MODEL_LOCK: Mutex<()> = Mutex::new(());

    /// Serialize tests that mutate `CODELENS_EMBED_HINT_*` env vars.
    /// The v1.6.0 default flip (§8.14) exposed a pre-existing race where
    /// parallel env-var mutating tests interfere with each other — the
    /// old `unwrap_or(false)` default happened to mask the race most of
    /// the time, but `unwrap_or(true)` no longer does. All tests that
    /// read or mutate `CODELENS_EMBED_HINT_*` should take this lock.
    static ENV_LOCK: Mutex<()> = Mutex::new(());
3025
    /// Early-return from the enclosing test when the bundled CodeSearchNet
    /// model assets are unavailable on disk, logging the skip to stderr.
    /// Expands to an `if !available { eprintln!(...); return; }` guard, so
    /// it must be invoked at the top of a test fn body.
    macro_rules! skip_without_embedding_model {
        () => {
            if !super::embedding_model_assets_available() {
                eprintln!("skipping embedding test: CodeSearchNet model assets unavailable");
                return;
            }
        };
    }
3034
    /// Helper: create a temp project with seeded symbols.
    ///
    /// Writes a two-function Python file (`hello`, `world`) into a fresh
    /// temp dir and registers both symbols in the index DB so body
    /// extraction has real byte offsets to work with.
    ///
    /// Returns the `TempDir` guard (keep it alive — dropping it removes
    /// the directory) together with the opened `ProjectRoot`.
    fn make_project_with_source() -> (tempfile::TempDir, ProjectRoot) {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        // Write a source file so body extraction works
        let source = "def hello():\n    print('hi')\n\ndef world():\n    return 42\n";
        write_python_file_with_symbols(
            root,
            "main.py",
            source,
            "hash1",
            &[
                ("hello", "def hello():", "hello"),
                ("world", "def world():", "world"),
            ],
        );

        let project = ProjectRoot::new_exact(root).unwrap();
        (dir, project)
    }
3056
    /// Write `source` to `root/relative_path` and seed the index DB with
    /// one symbol row per `(name, signature, name_path)` triple.
    ///
    /// Byte offsets are derived by locating each signature inside
    /// `source`; a symbol's body is assumed to end at the next `\n\ndef `
    /// boundary (or end of file). `hash` is stored as the file's content
    /// hash; the literal `100` is passed as `upsert_file`'s second
    /// argument (presumably an mtime — confirm against `IndexDb::upsert_file`).
    fn write_python_file_with_symbols(
        root: &std::path::Path,
        relative_path: &str,
        source: &str,
        hash: &str,
        symbols: &[(&str, &str, &str)],
    ) {
        std::fs::write(root.join(relative_path), source).unwrap();
        let db_path = crate::db::index_db_path(root);
        let db = IndexDb::open(&db_path).unwrap();
        let file_id = db
            .upsert_file(relative_path, 100, hash, source.len() as i64, Some("py"))
            .unwrap();

        let new_symbols: Vec<NewSymbol<'_>> = symbols
            .iter()
            .map(|(name, signature, name_path)| {
                // Start of the symbol = where its signature appears in source.
                let start = source.find(signature).unwrap() as i64;
                // End = next blank-line-plus-def boundary, or end of file.
                let end = source[start as usize..]
                    .find("\n\ndef ")
                    .map(|offset| start + offset as i64)
                    .unwrap_or(source.len() as i64);
                // 1-based line number: count newlines before the start offset.
                let line = source[..start as usize]
                    .bytes()
                    .filter(|&b| b == b'\n')
                    .count() as i64
                    + 1;
                NewSymbol {
                    name,
                    kind: "function",
                    line,
                    column_num: 0,
                    start_byte: start,
                    end_byte: end,
                    signature,
                    name_path,
                    parent_id: None,
                }
            })
            .collect();
        db.insert_symbols(file_id, &new_symbols).unwrap();
    }
3099
    /// Overwrite stored embeddings for `file_path` so every chunk named in
    /// `sentinels` gets a constant-valued vector (`vec![value; dim]`).
    ///
    /// Lets similarity tests force a known ranking without running the
    /// real model: the file's chunks are read back, matching ones are
    /// rewritten in place, then the file's rows are deleted and
    /// re-inserted wholesale.
    fn replace_file_embeddings_with_sentinels(
        engine: &EmbeddingEngine,
        file_path: &str,
        sentinels: &[(&str, f32)],
    ) {
        let mut chunks = engine.store.embeddings_for_files(&[file_path]).unwrap();
        for chunk in &mut chunks {
            if let Some((_, value)) = sentinels
                .iter()
                .find(|(symbol_name, _)| *symbol_name == chunk.symbol_name)
            {
                // Same dimensionality as the original, constant value.
                chunk.embedding = vec![*value; chunk.embedding.len()];
            }
        }
        engine.store.delete_by_file(&[file_path]).unwrap();
        engine.store.insert(&chunks).unwrap();
    }
3117
3118    #[test]
3119    fn build_embedding_text_with_signature() {
3120        let sym = crate::db::SymbolWithFile {
3121            name: "hello".into(),
3122            kind: "function".into(),
3123            file_path: "main.py".into(),
3124            line: 1,
3125            signature: "def hello():".into(),
3126            name_path: "hello".into(),
3127            start_byte: 0,
3128            end_byte: 10,
3129        };
3130        let text = build_embedding_text(&sym, Some("def hello(): pass"));
3131        assert_eq!(text, "function hello in main.py: def hello():");
3132    }
3133
3134    #[test]
3135    fn build_embedding_text_without_source() {
3136        let sym = crate::db::SymbolWithFile {
3137            name: "MyClass".into(),
3138            kind: "class".into(),
3139            file_path: "app.py".into(),
3140            line: 5,
3141            signature: "class MyClass:".into(),
3142            name_path: "MyClass".into(),
3143            start_byte: 0,
3144            end_byte: 50,
3145        };
3146        let text = build_embedding_text(&sym, None);
3147        assert_eq!(text, "class MyClass (My Class) in app.py: class MyClass:");
3148    }
3149
3150    #[test]
3151    fn build_embedding_text_empty_signature() {
3152        let sym = crate::db::SymbolWithFile {
3153            name: "CONFIG".into(),
3154            kind: "variable".into(),
3155            file_path: "config.py".into(),
3156            line: 1,
3157            signature: String::new(),
3158            name_path: "CONFIG".into(),
3159            start_byte: 0,
3160            end_byte: 0,
3161        };
3162        let text = build_embedding_text(&sym, None);
3163        assert_eq!(text, "variable CONFIG in config.py");
3164    }
3165
3166    #[test]
3167    fn filters_direct_test_symbols_from_embedding_index() {
3168        let source = "#[test]\nfn alias_case() {}\n";
3169        let sym = crate::db::SymbolWithFile {
3170            name: "alias_case".into(),
3171            kind: "function".into(),
3172            file_path: "src/lib.rs".into(),
3173            line: 2,
3174            signature: "fn alias_case() {}".into(),
3175            name_path: "alias_case".into(),
3176            start_byte: source.find("fn alias_case").unwrap() as i64,
3177            end_byte: source.len() as i64,
3178        };
3179
3180        assert!(is_test_only_symbol(&sym, Some(source)));
3181    }
3182
3183    #[test]
3184    fn filters_cfg_test_module_symbols_from_embedding_index() {
3185        let source = "#[cfg(all(test, feature = \"semantic\"))]\nmod semantic_tests {\n    fn helper_case() {}\n}\n";
3186        let sym = crate::db::SymbolWithFile {
3187            name: "helper_case".into(),
3188            kind: "function".into(),
3189            file_path: "src/lib.rs".into(),
3190            line: 3,
3191            signature: "fn helper_case() {}".into(),
3192            name_path: "helper_case".into(),
3193            start_byte: source.find("fn helper_case").unwrap() as i64,
3194            end_byte: source.len() as i64,
3195        };
3196
3197        assert!(is_test_only_symbol(&sym, Some(source)));
3198    }
3199
3200    #[test]
3201    fn extract_python_docstring() {
3202        let source =
3203            "def greet(name):\n    \"\"\"Say hello to a person.\"\"\"\n    print(f'hi {name}')\n";
3204        let doc = extract_leading_doc(source, 0, source.len()).unwrap();
3205        assert!(doc.contains("Say hello to a person"));
3206    }
3207
3208    #[test]
3209    fn extract_rust_doc_comment() {
3210        let source = "fn dispatch_tool() {\n    /// Route incoming tool requests.\n    /// Handles all MCP methods.\n    let x = 1;\n}\n";
3211        let doc = extract_leading_doc(source, 0, source.len()).unwrap();
3212        assert!(doc.contains("Route incoming tool requests"));
3213        assert!(doc.contains("Handles all MCP methods"));
3214    }
3215
3216    #[test]
3217    fn extract_leading_doc_returns_none_for_no_doc() {
3218        let source = "def f():\n    return 1\n";
3219        assert!(extract_leading_doc(source, 0, source.len()).is_none());
3220    }
3221
3222    #[test]
3223    fn extract_body_hint_finds_first_meaningful_line() {
3224        let source = "pub fn parse_symbols(\n    project: &ProjectRoot,\n) -> Vec<SymbolInfo> {\n    let mut parser = tree_sitter::Parser::new();\n    parser.set_language(lang);\n}\n";
3225        let hint = extract_body_hint(source, 0, source.len());
3226        assert!(hint.is_some());
3227        assert!(hint.unwrap().contains("tree_sitter::Parser"));
3228    }
3229
3230    #[test]
3231    fn extract_body_hint_skips_comments() {
3232        let source = "fn foo() {\n    // setup\n    let x = bar();\n}\n";
3233        let hint = extract_body_hint(source, 0, source.len());
3234        assert_eq!(hint.unwrap(), "let x = bar();");
3235    }
3236
3237    #[test]
3238    fn extract_body_hint_returns_none_for_empty() {
3239        let source = "fn empty() {\n}\n";
3240        let hint = extract_body_hint(source, 0, source.len());
3241        assert!(hint.is_none());
3242    }
3243
3244    #[test]
3245    fn extract_body_hint_multi_line_collection_via_env_override() {
3246        // Default is 1 line / 60 chars (v1.4.0 parity after the v1.5 Phase 2
3247        // PoC revert). Override the line budget via env to confirm the
3248        // multi-line path still works — this is the knob future experiments
3249        // will use without recompiling.
3250        let previous_lines = std::env::var("CODELENS_EMBED_HINT_LINES").ok();
3251        let previous_chars = std::env::var("CODELENS_EMBED_HINT_CHARS").ok();
3252        unsafe {
3253            std::env::set_var("CODELENS_EMBED_HINT_LINES", "3");
3254            std::env::set_var("CODELENS_EMBED_HINT_CHARS", "200");
3255        }
3256
3257        let source = "\
3258fn route_request() {
3259    let kind = detect_request_kind();
3260    let target = dispatch_table.get(&kind);
3261    return target.handle();
3262}
3263";
3264        let hint = extract_body_hint(source, 0, source.len()).expect("hint present");
3265
3266        let env_restore = || unsafe {
3267            match &previous_lines {
3268                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_LINES", value),
3269                None => std::env::remove_var("CODELENS_EMBED_HINT_LINES"),
3270            }
3271            match &previous_chars {
3272                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_CHARS", value),
3273                None => std::env::remove_var("CODELENS_EMBED_HINT_CHARS"),
3274            }
3275        };
3276
3277        let all_three = hint.contains("detect_request_kind")
3278            && hint.contains("dispatch_table")
3279            && hint.contains("target.handle");
3280        let has_separator = hint.contains(" · ");
3281        env_restore();
3282
3283        assert!(all_three, "missing one of three body lines: {hint}");
3284        assert!(has_separator, "missing · separator: {hint}");
3285    }
3286
    // Note: we intentionally do NOT have a test that verifies the "default"
    // 60-char / 1-line behaviour via `extract_body_hint`. Such a test is
    // flaky because `cargo test` runs tests in parallel and the sibling
    // env-overriding tests in this module (`CODELENS_EMBED_HINT_CHARS`,
    // `CODELENS_EMBED_HINT_LINES`) can leak their variables into it. The
    // default constants themselves are compile-time checked and covered by
    // `extract_body_hint_finds_first_meaningful_line` /
    // `extract_body_hint_skips_comments`, which assert on the exact
    // single-line shape and implicitly depend on the default budget.
3296
3297    #[test]
3298    fn hint_line_budget_respects_env_override() {
3299        // SAFETY: test block is serialized by crate-level test harness; we
3300        // restore the variable on exit.
3301        let previous = std::env::var("CODELENS_EMBED_HINT_LINES").ok();
3302        unsafe {
3303            std::env::set_var("CODELENS_EMBED_HINT_LINES", "5");
3304        }
3305        let budget = super::hint_line_budget();
3306        unsafe {
3307            match previous {
3308                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_LINES", value),
3309                None => std::env::remove_var("CODELENS_EMBED_HINT_LINES"),
3310            }
3311        }
3312        assert_eq!(budget, 5);
3313    }
3314
3315    #[test]
3316    fn is_nl_shaped_accepts_multi_word_prose() {
3317        assert!(super::is_nl_shaped("skip comments and string literals"));
3318        assert!(super::is_nl_shaped("failed to open database"));
3319        assert!(super::is_nl_shaped("detect client version"));
3320    }
3321
3322    #[test]
3323    fn is_nl_shaped_rejects_code_and_paths() {
3324        // Path-like tokens (both slash flavors)
3325        assert!(!super::is_nl_shaped("crates/codelens-engine/src"));
3326        assert!(!super::is_nl_shaped("C:\\Users\\foo"));
3327        // Module-path-like
3328        assert!(!super::is_nl_shaped("std::sync::Mutex"));
3329        // Single-word identifier
3330        assert!(!super::is_nl_shaped("detect_client"));
3331        // Too short
3332        assert!(!super::is_nl_shaped("ok"));
3333        assert!(!super::is_nl_shaped(""));
3334        // High non-alphabetic ratio
3335        assert!(!super::is_nl_shaped("1 2 3 4 5"));
3336    }
3337
3338    #[test]
3339    fn extract_comment_body_strips_comment_markers() {
3340        assert_eq!(
3341            super::extract_comment_body("/// rust doc comment"),
3342            Some("rust doc comment".to_string())
3343        );
3344        assert_eq!(
3345            super::extract_comment_body("// regular line comment"),
3346            Some("regular line comment".to_string())
3347        );
3348        assert_eq!(
3349            super::extract_comment_body("# python line comment"),
3350            Some("python line comment".to_string())
3351        );
3352        assert_eq!(
3353            super::extract_comment_body("/* inline block */"),
3354            Some("inline block".to_string())
3355        );
3356        assert_eq!(
3357            super::extract_comment_body("* continuation line"),
3358            Some("continuation line".to_string())
3359        );
3360    }
3361
3362    #[test]
3363    fn extract_comment_body_rejects_rust_attributes_and_shebangs() {
3364        assert!(super::extract_comment_body("#[derive(Debug)]").is_none());
3365        assert!(super::extract_comment_body("#[test]").is_none());
3366        assert!(super::extract_comment_body("#!/usr/bin/env python").is_none());
3367    }
3368
3369    #[test]
3370    fn extract_nl_tokens_gated_off_by_default() {
3371        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
3372        // Default: no env, no NL tokens regardless of body content.
3373        let previous = std::env::var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS").ok();
3374        unsafe {
3375            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS");
3376        }
3377        let source = "\
3378fn skip_things() {
3379    // skip comments and string literals during search
3380    let lit = \"scan for matching tokens\";
3381}
3382";
3383        let result = extract_nl_tokens(source, 0, source.len());
3384        unsafe {
3385            if let Some(value) = previous {
3386                std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", value);
3387            }
3388        }
3389        assert!(result.is_none(), "gate leaked: {result:?}");
3390    }
3391
    #[test]
    fn auto_hint_mode_defaults_on_unless_explicit_off() {
        // Exercises all three states of the CODELENS_EMBED_HINT_AUTO gate
        // under the ENV_LOCK (env vars are process-global and cargo runs
        // tests in parallel).
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // v1.6.0 flip (§8.14): default-ON semantics.
        //
        // Case 1: env var unset → default ON (the v1.6.0 flip).
        // Case 2: env var="0" (or "false"/"no"/"off") → explicit OFF
        //   (opt-out preserved).
        // Case 3: env var="1" (or "true"/"yes"/"on") → explicit ON
        //   (still works — explicit always wins).
        let previous = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();

        // Case 1: unset → ON (flip)
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO");
        }
        let default_enabled = super::auto_hint_mode_enabled();
        assert!(
            default_enabled,
            "v1.6.0 default flip: auto hint mode should be ON when env unset"
        );

        // Case 2: explicit OFF
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "0");
        }
        let explicit_off = super::auto_hint_mode_enabled();
        assert!(
            !explicit_off,
            "explicit CODELENS_EMBED_HINT_AUTO=0 must still disable (opt-out escape hatch)"
        );

        // Case 3: explicit ON
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
        }
        let explicit_on = super::auto_hint_mode_enabled();
        assert!(
            explicit_on,
            "explicit CODELENS_EMBED_HINT_AUTO=1 must still enable"
        );

        // Restore
        // NOTE(review): a failing assert above skips this restore and can
        // leak the override into later tests in the same process; the
        // restore-before-assert pattern used elsewhere in this module would
        // be safer — consider adopting it here.
        unsafe {
            match previous {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
        }
    }
3442
3443    #[test]
3444    fn language_supports_nl_stack_classifies_correctly() {
3445        // Supported — measured or static-typed analogue
3446        assert!(super::language_supports_nl_stack("rs"));
3447        assert!(super::language_supports_nl_stack("rust"));
3448        assert!(super::language_supports_nl_stack("cpp"));
3449        assert!(super::language_supports_nl_stack("c++"));
3450        assert!(super::language_supports_nl_stack("c"));
3451        assert!(super::language_supports_nl_stack("go"));
3452        assert!(super::language_supports_nl_stack("golang"));
3453        assert!(super::language_supports_nl_stack("java"));
3454        assert!(super::language_supports_nl_stack("kt"));
3455        assert!(super::language_supports_nl_stack("kotlin"));
3456        assert!(super::language_supports_nl_stack("scala"));
3457        assert!(super::language_supports_nl_stack("cs"));
3458        assert!(super::language_supports_nl_stack("csharp"));
3459        // §8.13 Phase 3c: TypeScript / JavaScript added after
3460        // facebook/jest external-repo A/B (+7.3 % hybrid MRR).
3461        assert!(super::language_supports_nl_stack("ts"));
3462        assert!(super::language_supports_nl_stack("typescript"));
3463        assert!(super::language_supports_nl_stack("tsx"));
3464        assert!(super::language_supports_nl_stack("js"));
3465        assert!(super::language_supports_nl_stack("javascript"));
3466        assert!(super::language_supports_nl_stack("jsx"));
3467        // Case-insensitive
3468        assert!(super::language_supports_nl_stack("Rust"));
3469        assert!(super::language_supports_nl_stack("RUST"));
3470        assert!(super::language_supports_nl_stack("TypeScript"));
3471        // Leading/trailing whitespace is tolerated
3472        assert!(super::language_supports_nl_stack("  rust  "));
3473        assert!(super::language_supports_nl_stack("  ts  "));
3474
3475        // Unsupported — measured regression or untested dynamic
3476        assert!(!super::language_supports_nl_stack("py"));
3477        assert!(!super::language_supports_nl_stack("python"));
3478        assert!(!super::language_supports_nl_stack("rb"));
3479        assert!(!super::language_supports_nl_stack("ruby"));
3480        assert!(!super::language_supports_nl_stack("php"));
3481        assert!(!super::language_supports_nl_stack("lua"));
3482        assert!(!super::language_supports_nl_stack("sh"));
3483        // Unknown defaults to unsupported
3484        assert!(!super::language_supports_nl_stack("klingon"));
3485        assert!(!super::language_supports_nl_stack(""));
3486    }
3487
3488    #[test]
3489    fn language_supports_sparse_weighting_classifies_correctly() {
3490        assert!(super::language_supports_sparse_weighting("rs"));
3491        assert!(super::language_supports_sparse_weighting("rust"));
3492        assert!(super::language_supports_sparse_weighting("cpp"));
3493        assert!(super::language_supports_sparse_weighting("go"));
3494        assert!(super::language_supports_sparse_weighting("java"));
3495        assert!(super::language_supports_sparse_weighting("kotlin"));
3496        assert!(super::language_supports_sparse_weighting("csharp"));
3497
3498        assert!(!super::language_supports_sparse_weighting("ts"));
3499        assert!(!super::language_supports_sparse_weighting("typescript"));
3500        assert!(!super::language_supports_sparse_weighting("tsx"));
3501        assert!(!super::language_supports_sparse_weighting("js"));
3502        assert!(!super::language_supports_sparse_weighting("javascript"));
3503        assert!(!super::language_supports_sparse_weighting("jsx"));
3504        assert!(!super::language_supports_sparse_weighting("py"));
3505        assert!(!super::language_supports_sparse_weighting("python"));
3506        assert!(!super::language_supports_sparse_weighting("klingon"));
3507        assert!(!super::language_supports_sparse_weighting(""));
3508    }
3509
    #[test]
    fn auto_hint_should_enable_requires_both_gate_and_supported_lang() {
        // The auto hint gate is a conjunction: CODELENS_EMBED_HINT_AUTO must
        // be on AND CODELENS_EMBED_HINT_AUTO_LANG must name a language in
        // the NL-stack allow-list.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let prev_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let prev_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        // Case 1: gate explicitly off → never enable, regardless of language.
        // v1.6.0 flip (§8.14): `unset` now means default-ON, so to test
        // "gate off" we must set the env var to an explicit "0".
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !super::auto_hint_should_enable(),
            "gate-off (explicit =0) with lang=rust must stay disabled"
        );

        // Case 2: gate on, supported language → enable
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            super::auto_hint_should_enable(),
            "gate-on + lang=rust must enable"
        );

        // Case 2b: TypeScript is also in the allow-list (Phase 2b/2c).
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            super::auto_hint_should_enable(),
            "gate-on + lang=typescript must keep Phase 2b/2c enabled"
        );

        // Case 3: gate on, unsupported language → disable
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            !super::auto_hint_should_enable(),
            "gate-on + lang=python must stay disabled"
        );

        // Case 4: gate on, no language tag → conservative disable
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        assert!(
            !super::auto_hint_should_enable(),
            "gate-on + no lang tag must stay disabled"
        );

        // Restore
        // NOTE(review): a failing assert above skips this restore and can
        // leak the overrides into later tests — restore-before-assert would
        // be safer; consider it when touching this test.
        unsafe {
            match prev_auto {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match prev_lang {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }
3579
    #[test]
    fn auto_sparse_should_enable_requires_both_gate_and_sparse_supported_lang() {
        // The sparse auto gate needs BOTH the auto gate ON and a language in
        // the (narrower) sparse-weighting allow-list — note typescript is
        // NL-supported but sparse-unsupported.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let prev_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let prev_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        // Gate explicitly off → disabled even for a sparse-supported lang.
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-off (explicit =0) must disable sparse auto gate"
        );

        // Gate on + rust (sparse-supported) → enabled.
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            super::auto_sparse_should_enable(),
            "gate-on + lang=rust must enable sparse auto gate"
        );

        // Gate on + typescript (NL-supported, NOT sparse-supported) → disabled.
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-on + lang=typescript must keep sparse auto gate disabled"
        );

        // Gate on + python (unsupported) → disabled.
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-on + lang=python must keep sparse auto gate disabled"
        );

        // Gate on + no language tag → conservative disable.
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-on + no lang tag must keep sparse auto gate disabled"
        );

        // Restore the saved env state.
        unsafe {
            match prev_auto {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match prev_lang {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }
3642
    #[test]
    fn nl_tokens_enabled_explicit_env_wins_over_auto() {
        // Precedence test: the explicit CODELENS_EMBED_HINT_INCLUDE_COMMENTS
        // setting always beats the auto gate; only when the explicit var is
        // absent does the auto gate + language fallback decide.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let prev_explicit = std::env::var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS").ok();
        let prev_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let prev_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        // Explicit ON beats auto-OFF-for-python
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            super::nl_tokens_enabled(),
            "explicit=1 must win over auto+python=off"
        );

        // Explicit OFF beats auto-ON-for-rust
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !super::nl_tokens_enabled(),
            "explicit=0 must win over auto+rust=on"
        );

        // No explicit, auto+rust → on via fallback
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            super::nl_tokens_enabled(),
            "no explicit + auto+rust must enable"
        );

        // No explicit, auto+python → off via fallback
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            !super::nl_tokens_enabled(),
            "no explicit + auto+python must stay disabled"
        );

        // Restore
        // NOTE(review): restore is skipped if an assert above fails —
        // restore-before-assert (as used elsewhere in this module) would be
        // more robust against env leakage between tests.
        unsafe {
            match prev_explicit {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS"),
            }
            match prev_auto {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match prev_lang {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }
3710
3711    #[test]
3712    fn strict_comments_gated_off_by_default() {
3713        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
3714        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS").ok();
3715        unsafe {
3716            std::env::remove_var("CODELENS_EMBED_HINT_STRICT_COMMENTS");
3717        }
3718        let enabled = super::strict_comments_enabled();
3719        unsafe {
3720            if let Some(value) = previous {
3721                std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", value);
3722            }
3723        }
3724        assert!(!enabled, "strict comments gate leaked");
3725    }
3726
3727    #[test]
3728    fn looks_like_meta_annotation_detects_rejected_prefixes() {
3729        // All case variants of the rejected prefix list must match.
3730        assert!(super::looks_like_meta_annotation("TODO: fix later"));
3731        assert!(super::looks_like_meta_annotation("todo handle edge case"));
3732        assert!(super::looks_like_meta_annotation("FIXME this is broken"));
3733        assert!(super::looks_like_meta_annotation(
3734            "HACK: workaround for bug"
3735        ));
3736        assert!(super::looks_like_meta_annotation("XXX not implemented yet"));
3737        assert!(super::looks_like_meta_annotation(
3738            "BUG in the upstream crate"
3739        ));
3740        assert!(super::looks_like_meta_annotation("REVIEW before merging"));
3741        assert!(super::looks_like_meta_annotation(
3742            "REFACTOR this block later"
3743        ));
3744        assert!(super::looks_like_meta_annotation("TEMP: remove before v2"));
3745        assert!(super::looks_like_meta_annotation(
3746            "DEPRECATED use new_api instead"
3747        ));
3748        // Leading whitespace inside the comment body is handled.
3749        assert!(super::looks_like_meta_annotation(
3750            "   TODO: with leading ws"
3751        ));
3752    }
3753
3754    #[test]
3755    fn looks_like_meta_annotation_preserves_behaviour_prefixes() {
3756        // Explicitly-excluded prefixes — kept as behaviour signal.
3757        assert!(!super::looks_like_meta_annotation(
3758            "NOTE: this branch handles empty input"
3759        ));
3760        assert!(!super::looks_like_meta_annotation(
3761            "WARN: overflow is possible"
3762        ));
3763        assert!(!super::looks_like_meta_annotation(
3764            "SAFETY: caller must hold the lock"
3765        ));
3766        assert!(!super::looks_like_meta_annotation(
3767            "PANIC: unreachable by construction"
3768        ));
3769        // Behaviour-descriptive prose must pass through.
3770        assert!(!super::looks_like_meta_annotation(
3771            "parse json body from request"
3772        ));
3773        assert!(!super::looks_like_meta_annotation(
3774            "walk directory respecting gitignore"
3775        ));
3776        assert!(!super::looks_like_meta_annotation(
3777            "compute cosine similarity between vectors"
3778        ));
3779        // Empty / edge inputs
3780        assert!(!super::looks_like_meta_annotation(""));
3781        assert!(!super::looks_like_meta_annotation("   "));
3782        assert!(!super::looks_like_meta_annotation("123 numeric prefix"));
3783    }
3784
    #[test]
    fn strict_comments_filters_meta_annotations_during_extraction() {
        // End-to-end: with strict comments ON, TODO/FIXME annotations are
        // dropped during extraction while behaviour comments survive.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", "1");
        }
        let source = "\
fn handle_request() {
    // TODO: handle the error path properly
    // parse json body from the incoming request
    // FIXME: this can panic on empty input
    // walk directory respecting the gitignore rules
    let _x = 1;
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore before the assertions so a failure cannot leak the
        // strict-comments override into sibling tests.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_COMMENTS"),
            }
        }
        let hint = result.expect("behaviour comments must survive");
        // The first real behaviour comment must appear. The hint is capped
        // by the default 60-char budget, so we only assert on a short
        // substring that's guaranteed to fit.
        assert!(
            hint.contains("parse json body"),
            "behaviour comment dropped: {hint}"
        );
        // TODO / FIXME must NOT appear anywhere in the hint (they were
        // rejected before join, so they cannot be there even partially).
        assert!(!hint.contains("TODO"), "TODO annotation leaked: {hint}");
        assert!(!hint.contains("FIXME"), "FIXME annotation leaked: {hint}");
    }
3821
    #[test]
    fn strict_comments_is_orthogonal_to_strict_literals() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // Enabling strict_comments must NOT affect the Pass-2 literal path.
        // A format-specifier literal should still pass through Pass 2
        // when the literal filter is off, regardless of the comment gate.
        let prev_c = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS").ok();
        let prev_l = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        // Comments strict ON, literals strict OFF — the orthogonality case.
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", "1");
            std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS");
        }
        // Source kept short so the 60-char hint budget does not truncate
        // either of the two substrings we assert on.
        let source = "\
fn handle() {
    // handles real behaviour
    let fmt = \"format error string\";
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore both gates before the assertions so a failure cannot
        // leak overrides into sibling tests.
        unsafe {
            match prev_c {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_COMMENTS"),
            }
            match prev_l {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS"),
            }
        }
        let hint = result.expect("tokens must exist");
        // Comment survives (not a meta-annotation).
        assert!(hint.contains("handles real"), "comment dropped: {hint}");
        // String literal still appears — strict_literals was OFF, so the
        // Pass-2 filter is inactive for this test.
        assert!(
            hint.contains("format error string"),
            "literal dropped: {hint}"
        );
    }
3863
3864    #[test]
3865    fn strict_literal_filter_gated_off_by_default() {
3866        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
3867        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
3868        unsafe {
3869            std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS");
3870        }
3871        let enabled = super::strict_literal_filter_enabled();
3872        unsafe {
3873            if let Some(value) = previous {
3874                std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", value);
3875            }
3876        }
3877        assert!(!enabled, "strict literal filter gate leaked");
3878    }
3879
3880    #[test]
3881    fn contains_format_specifier_detects_c_and_python_style() {
3882        // C / Python `%` style
3883        assert!(super::contains_format_specifier("Invalid URL %s"));
3884        assert!(super::contains_format_specifier("got %d matches"));
3885        assert!(super::contains_format_specifier("value=%r"));
3886        assert!(super::contains_format_specifier("size=%f"));
3887        // Python `.format` / f-string / Rust `format!` style
3888        assert!(super::contains_format_specifier("sending request to {url}"));
3889        assert!(super::contains_format_specifier("got {0} items"));
3890        assert!(super::contains_format_specifier("{:?}"));
3891        assert!(super::contains_format_specifier("value: {x:.2f}"));
3892        assert!(super::contains_format_specifier("{}"));
3893        // Plain prose with no format specifier
3894        assert!(!super::contains_format_specifier(
3895            "skip comments and string literals"
3896        ));
3897        assert!(!super::contains_format_specifier("failed to open database"));
3898        // JSON-like brace content should not count as a format specifier
3899        // (multi-word content inside braces)
3900        assert!(!super::contains_format_specifier("{name: foo, id: 1}"));
3901    }
3902
3903    #[test]
3904    fn looks_like_error_or_log_prefix_rejects_common_patterns() {
3905        assert!(super::looks_like_error_or_log_prefix("Invalid URL format"));
3906        assert!(super::looks_like_error_or_log_prefix(
3907            "Cannot decode response"
3908        ));
3909        assert!(super::looks_like_error_or_log_prefix("could not open file"));
3910        assert!(super::looks_like_error_or_log_prefix(
3911            "Failed to send request"
3912        ));
3913        assert!(super::looks_like_error_or_log_prefix(
3914            "Expected int, got str"
3915        ));
3916        assert!(super::looks_like_error_or_log_prefix(
3917            "sending request to server"
3918        ));
3919        assert!(super::looks_like_error_or_log_prefix(
3920            "received response headers"
3921        ));
3922        assert!(super::looks_like_error_or_log_prefix(
3923            "starting worker pool"
3924        ));
3925        // Real behaviour strings must pass
3926        assert!(!super::looks_like_error_or_log_prefix(
3927            "parse json body from request"
3928        ));
3929        assert!(!super::looks_like_error_or_log_prefix(
3930            "compute cosine similarity between vectors"
3931        ));
3932        assert!(!super::looks_like_error_or_log_prefix(
3933            "walk directory tree respecting gitignore"
3934        ));
3935    }
3936
    #[test]
    fn strict_mode_rejects_format_and_error_literals_during_extraction() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // The env gate is bypassed by calling the inner function directly,
        // BUT the inner function still reads the strict-literal env var.
        // So we have to set it explicitly for this test.
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", "1");
        }
        // One format-specifier literal, one log-prefix literal, one f-string
        // style literal, and one genuine behaviour description.
        let source = "\
fn handle_request() {
    let err = \"Invalid URL %s\";
    let log = \"sending request to the upstream server\";
    let fmt = \"received {count} items in batch\";
    let real = \"parse json body from the incoming request\";
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore the env var before asserting so a failure cannot leak it.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS"),
            }
        }
        let hint = result.expect("some token should survive");
        // The one real behaviour-descriptive literal must land in the hint.
        assert!(
            hint.contains("parse json body"),
            "real literal was filtered out: {hint}"
        );
        // None of the three low-value literals should appear.
        assert!(
            !hint.contains("Invalid URL"),
            "format-specifier literal leaked: {hint}"
        );
        assert!(
            !hint.contains("sending request"),
            "log-prefix literal leaked: {hint}"
        );
        assert!(
            !hint.contains("received {count}"),
            "python fstring literal leaked: {hint}"
        );
    }
3982
    #[test]
    fn strict_mode_leaves_comments_untouched() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // Comments (Pass 1) should NOT be filtered by the strict flag —
        // the §8.8 post-mortem identified string literals as the
        // regression source, not comments.
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", "1");
        }
        // Both comments deliberately start with error/log-style prefixes.
        let source = "\
fn do_work() {
    // Invalid inputs are rejected by this guard clause.
    // sending requests in parallel across worker threads.
    let _lit = \"format spec %s\";
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore before asserting so a failure cannot leak env state.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS"),
            }
        }
        let hint = result.expect("comments should survive strict mode");
        // Both comments should land in the hint even though they start with
        // error/log-style prefixes — the filter only touches string literals.
        assert!(
            hint.contains("Invalid inputs") || hint.contains("rejected by this guard"),
            "strict mode swallowed a comment: {hint}"
        );
        // And the low-value string literal should NOT be in the hint.
        assert!(
            !hint.contains("format spec"),
            "format-specifier literal leaked under strict mode: {hint}"
        );
    }
4020
4021    #[test]
4022    fn should_reject_literal_strict_composes_format_and_prefix() {
4023        // The test-only helper must mirror the production filter logic:
4024        // a literal is rejected iff it is a format specifier OR an error/log
4025        // prefix (the production filter uses exactly this disjunction).
4026        assert!(super::should_reject_literal_strict("Invalid URL %s"));
4027        assert!(super::should_reject_literal_strict(
4028            "sending request to server"
4029        ));
4030        assert!(super::should_reject_literal_strict("value: {x:.2f}"));
4031        // Real behaviour strings pass through.
4032        assert!(!super::should_reject_literal_strict(
4033            "parse json body from the incoming request"
4034        ));
4035        assert!(!super::should_reject_literal_strict(
4036            "compute cosine similarity between vectors"
4037        ));
4038    }
4039
4040    #[test]
4041    fn is_static_method_ident_accepts_pascal_and_rejects_snake() {
4042        assert!(super::is_static_method_ident("HashMap"));
4043        assert!(super::is_static_method_ident("Parser"));
4044        assert!(super::is_static_method_ident("A"));
4045        // snake_case / module-like — the filter must reject these so
4046        // `std::fs::read_to_string` does not leak into API hints.
4047        assert!(!super::is_static_method_ident("std"));
4048        assert!(!super::is_static_method_ident("fs"));
4049        assert!(!super::is_static_method_ident("_private"));
4050        assert!(!super::is_static_method_ident(""));
4051    }
4052
    #[test]
    fn extract_api_calls_gated_off_by_default() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // Default: no env, no API-call hint regardless of body content.
        let previous = std::env::var("CODELENS_EMBED_HINT_INCLUDE_API_CALLS").ok();
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_API_CALLS");
        }
        // The body contains Type::method calls that WOULD produce a hint if
        // the gate were on.
        let source = "\
fn make_parser() {
    let p = Parser::new();
    let _ = HashMap::with_capacity(8);
}
";
        let result = extract_api_calls(source, 0, source.len());
        // Restore before asserting so a failure cannot leak env state.
        // (No `None` arm needed: the var was already removed above.)
        unsafe {
            if let Some(value) = previous {
                std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_API_CALLS", value);
            }
        }
        assert!(result.is_none(), "gate leaked: {result:?}");
    }
4075
4076    #[test]
4077    fn extract_api_calls_captures_type_method_patterns() {
4078        // Uses the env-independent inner to avoid racing with other tests.
4079        let source = "\
4080fn open_db() {
4081    let p = Parser::new();
4082    let map = HashMap::with_capacity(16);
4083    let _ = tree_sitter::Parser::new();
4084}
4085";
4086        let hint = super::extract_api_calls_inner(source, 0, source.len())
4087            .expect("api calls should be produced");
4088        assert!(hint.contains("Parser::new"), "missing Parser::new: {hint}");
4089        assert!(
4090            hint.contains("HashMap::with_capacity"),
4091            "missing HashMap::with_capacity: {hint}"
4092        );
4093    }
4094
4095    #[test]
4096    fn extract_api_calls_rejects_module_prefixed_free_functions() {
4097        // Pure module paths must not surface as Type hints — the whole
4098        // point of `is_static_method_ident` is to drop these.
4099        let source = "\
4100fn read_config() {
4101    let _ = std::fs::read_to_string(\"foo\");
4102    let _ = crate::util::parse();
4103}
4104";
4105        let hint = super::extract_api_calls_inner(source, 0, source.len());
4106        // If any API hint is produced, it must not contain the snake_case
4107        // module prefixes; otherwise `None` is acceptable too.
4108        if let Some(hint) = hint {
4109            assert!(!hint.contains("std::fs"), "lowercase module leaked: {hint}");
4110            assert!(
4111                !hint.contains("fs::read_to_string"),
4112                "module-prefixed free function leaked: {hint}"
4113            );
4114            assert!(!hint.contains("crate::util"), "crate path leaked: {hint}");
4115        }
4116    }
4117
4118    #[test]
4119    fn extract_api_calls_deduplicates_repeated_calls() {
4120        let source = "\
4121fn hot_loop() {
4122    for _ in 0..10 {
4123        let _ = Parser::new();
4124        let _ = Parser::new();
4125    }
4126    let _ = Parser::new();
4127}
4128";
4129        let hint = super::extract_api_calls_inner(source, 0, source.len())
4130            .expect("api calls should be produced");
4131        let first = hint.find("Parser::new").expect("hit");
4132        let rest = &hint[first + "Parser::new".len()..];
4133        assert!(
4134            !rest.contains("Parser::new"),
4135            "duplicate not deduplicated: {hint}"
4136        );
4137    }
4138
4139    #[test]
4140    fn extract_api_calls_returns_none_when_body_has_no_type_calls() {
4141        let source = "\
4142fn plain() {
4143    let x = 1;
4144    let y = x + 2;
4145}
4146";
4147        assert!(super::extract_api_calls_inner(source, 0, source.len()).is_none());
4148    }
4149
4150    #[test]
4151    fn extract_nl_tokens_collects_comments_and_string_literals() {
4152        // Calls the env-independent inner to avoid racing with other tests
4153        // that mutate `CODELENS_EMBED_HINT_INCLUDE_COMMENTS`. The gate is
4154        // covered by `extract_nl_tokens_gated_off_by_default` above.
4155        let source = "\
4156fn search_for_matches() {
4157    // skip comments and string literals during search
4158    let error = \"failed to open database\";
4159    let single = \"tok\";
4160    let path = \"src/foo/bar\";
4161    let keyword = match kind {
4162        Kind::Ident => \"detect client version\",
4163        _ => \"\",
4164    };
4165}
4166";
4167        // Override the char budget locally so long hints are not truncated
4168        // before the assertions read them. We use the inner function which
4169        // still reads `CODELENS_EMBED_HINT_CHARS`, but we do NOT set it —
4170        // the default 60-char budget is enough for at least the first
4171        // discriminator to land in the output.
4172        let hint = super::extract_nl_tokens_inner(source, 0, source.len())
4173            .expect("nl tokens should be produced");
4174        // At least one NL-shaped token must land in the hint. The default
4175        // 60-char budget may truncate later ones; we assert on the first
4176        // few discriminators only.
4177        let has_first_nl_signal = hint.contains("skip comments")
4178            || hint.contains("failed to open")
4179            || hint.contains("detect client");
4180        assert!(has_first_nl_signal, "no NL signal produced: {hint}");
4181        // Short single-token literals must never leak in.
4182        assert!(!hint.contains(" tok "), "short literal leaked: {hint}");
4183        // Path literals must never leak in.
4184        assert!(!hint.contains("src/foo/bar"), "path literal leaked: {hint}");
4185    }
4186
4187    #[test]
4188    fn hint_char_budget_respects_env_override() {
4189        let previous = std::env::var("CODELENS_EMBED_HINT_CHARS").ok();
4190        unsafe {
4191            std::env::set_var("CODELENS_EMBED_HINT_CHARS", "120");
4192        }
4193        let budget = super::hint_char_budget();
4194        unsafe {
4195            match previous {
4196                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_CHARS", value),
4197                None => std::env::remove_var("CODELENS_EMBED_HINT_CHARS"),
4198            }
4199        }
4200        assert_eq!(budget, 120);
4201    }
4202
4203    #[test]
4204    fn embedding_to_bytes_roundtrip() {
4205        let floats = vec![1.0f32, -0.5, 0.0, 3.25];
4206        let bytes = embedding_to_bytes(&floats);
4207        assert_eq!(bytes.len(), 4 * 4);
4208        // Verify roundtrip
4209        let recovered: Vec<f32> = bytes
4210            .chunks_exact(4)
4211            .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
4212            .collect();
4213        assert_eq!(floats, recovered);
4214    }
4215
4216    #[test]
4217    fn duplicate_pair_key_is_order_independent() {
4218        let a = duplicate_pair_key("a.py", "foo", "b.py", "bar");
4219        let b = duplicate_pair_key("b.py", "bar", "a.py", "foo");
4220        assert_eq!(a, b);
4221    }
4222
4223    #[test]
4224    fn text_embedding_cache_updates_recency() {
4225        let mut cache = TextEmbeddingCache::new(2);
4226        cache.insert("a".into(), vec![1.0]);
4227        cache.insert("b".into(), vec![2.0]);
4228        assert_eq!(cache.get("a"), Some(vec![1.0]));
4229        cache.insert("c".into(), vec![3.0]);
4230
4231        assert_eq!(cache.get("a"), Some(vec![1.0]));
4232        assert_eq!(cache.get("b"), None);
4233        assert_eq!(cache.get("c"), Some(vec![3.0]));
4234    }
4235
4236    #[test]
4237    fn text_embedding_cache_can_be_disabled() {
4238        let mut cache = TextEmbeddingCache::new(0);
4239        cache.insert("a".into(), vec![1.0]);
4240        assert_eq!(cache.get("a"), None);
4241    }
4242
    #[test]
    fn engine_new_and_index() {
        // MODEL_LOCK serializes tests that load the embedding model.
        let _lock = MODEL_LOCK.lock().unwrap();
        // Skips (rather than fails) when the model files are unavailable.
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).expect("engine should load");
        assert!(!engine.is_indexed());

        // The fixture project's main.py carries two symbols (hello, world).
        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2, "should index 2 symbols");
        assert!(engine.is_indexed());
    }
4255
4256    #[test]
4257    fn engine_search_returns_results() {
4258        let _lock = MODEL_LOCK.lock().unwrap();
4259        skip_without_embedding_model!();
4260        let (_dir, project) = make_project_with_source();
4261        let engine = EmbeddingEngine::new(&project).unwrap();
4262        engine.index_from_project(&project).unwrap();
4263
4264        let results = engine.search("hello function", 10).unwrap();
4265        assert!(!results.is_empty(), "search should return results");
4266        for r in &results {
4267            assert!(
4268                r.score >= -1.0 && r.score <= 1.0,
4269                "score should be in [-1,1]: {}",
4270                r.score
4271            );
4272        }
4273    }
4274
    #[test]
    fn engine_incremental_index() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();
        assert_eq!(engine.store.count().unwrap(), 2);

        // Re-index only main.py — should replace its embeddings; the total
        // row count stays at 2, i.e. no duplicate rows are created.
        let count = engine.index_changed_files(&project, &["main.py"]).unwrap();
        assert_eq!(count, 2);
        assert_eq!(engine.store.count().unwrap(), 2);
    }
4289
    #[test]
    fn engine_reindex_preserves_symbol_count() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();
        assert_eq!(engine.store.count().unwrap(), 2);

        // A second full index over unchanged sources must be idempotent:
        // same reported count, same number of stored rows.
        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);
        assert_eq!(engine.store.count().unwrap(), 2);
    }
4303
    #[test]
    fn full_reindex_reuses_unchanged_embeddings() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Overwrite the stored vectors with recognisable sentinel values; a
        // reindex that re-embeds would replace them with real model output.
        replace_file_embeddings_with_sentinels(
            &engine,
            "main.py",
            &[("hello", 11.0), ("world", 22.0)],
        );

        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);

        // Unchanged symbols must keep the sentinels, proving the reindex
        // reused the stored embeddings instead of recomputing them.
        let hello = engine
            .store
            .get_embedding("main.py", "hello")
            .unwrap()
            .expect("hello should exist");
        let world = engine
            .store
            .get_embedding("main.py", "world")
            .unwrap()
            .expect("world should exist");
        assert!(hello.embedding.iter().all(|value| *value == 11.0));
        assert!(world.embedding.iter().all(|value| *value == 22.0));
    }
4334
    #[test]
    fn full_reindex_reuses_unchanged_sibling_after_edit() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Sentinel vectors distinguish "reused" (sentinel survives) from
        // "re-embedded" (sentinel replaced by real model output).
        replace_file_embeddings_with_sentinels(
            &engine,
            "main.py",
            &[("hello", 11.0), ("world", 22.0)],
        );

        // Rewrite main.py so only `world` changes; `hello` stays identical.
        let updated_source =
            "def hello():\n    print('hi')\n\ndef world(name):\n    return name.upper()\n";
        write_python_file_with_symbols(
            dir.path(),
            "main.py",
            updated_source,
            "hash2",
            &[
                ("hello", "def hello():", "hello"),
                ("world", "def world(name):", "world"),
            ],
        );

        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);

        // `hello` must keep its sentinel (reused); `world` must have been
        // re-embedded (at least one component differs from the sentinel).
        let hello = engine
            .store
            .get_embedding("main.py", "hello")
            .unwrap()
            .expect("hello should exist");
        let world = engine
            .store
            .get_embedding("main.py", "world")
            .unwrap()
            .expect("world should exist");
        assert!(hello.embedding.iter().all(|value| *value == 11.0));
        assert!(world.embedding.iter().any(|value| *value != 22.0));
        assert_eq!(engine.store.count().unwrap(), 2);
    }
4379
    #[test]
    fn full_reindex_removes_deleted_files() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (dir, project) = make_project_with_source();
        // Add a third symbol in a second file so a deletion is observable.
        write_python_file_with_symbols(
            dir.path(),
            "extra.py",
            "def bonus():\n    return 7\n",
            "hash-extra",
            &[("bonus", "def bonus():", "bonus")],
        );

        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();
        assert_eq!(engine.store.count().unwrap(), 3);

        // Remove the file on disk AND from the symbol index db, then
        // reindex; the stale embeddings for extra.py must be purged.
        std::fs::remove_file(dir.path().join("extra.py")).unwrap();
        let db_path = crate::db::index_db_path(dir.path());
        let db = IndexDb::open(&db_path).unwrap();
        db.delete_file("extra.py").unwrap();

        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);
        assert_eq!(engine.store.count().unwrap(), 2);
        assert!(
            engine
                .store
                .embeddings_for_files(&["extra.py"])
                .unwrap()
                .is_empty()
        );
    }
4413
    #[test]
    fn engine_model_change_recreates_db() {
        // NOTE(review): despite the name, this test only exercises the
        // same-model path (data survives an engine restart); the
        // model-change/recreate path is not exercised here — confirm it is
        // covered elsewhere or consider renaming.
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();

        // First engine with default model
        let engine1 = EmbeddingEngine::new(&project).unwrap();
        engine1.index_from_project(&project).unwrap();
        assert_eq!(engine1.store.count().unwrap(), 2);
        drop(engine1);

        // Second engine with same model should preserve data
        let engine2 = EmbeddingEngine::new(&project).unwrap();
        assert!(engine2.store.count().unwrap() >= 2);
    }
4430
    #[test]
    fn inspect_existing_index_returns_model_and_count() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Inspection is a static call (no new engine instance) and must
        // report the indexing engine's model name plus the symbol count.
        let info = EmbeddingEngine::inspect_existing_index(&project)
            .unwrap()
            .expect("index info should exist");
        assert_eq!(info.model_name, engine.model_name());
        assert_eq!(info.indexed_symbols, 2);
    }
4445
    #[test]
    fn inspect_existing_index_recovers_from_corrupt_db() {
        let (_dir, project) = make_project_with_source();
        let index_dir = project.as_path().join(".codelens/index");
        let db_path = index_dir.join("embeddings.db");
        let wal_path = index_dir.join("embeddings.db-wal");
        let shm_path = index_dir.join("embeddings.db-shm");

        // Corrupt the database and its WAL/SHM sidecar files.
        std::fs::write(&db_path, b"not a sqlite database").unwrap();
        std::fs::write(&wal_path, b"bad wal").unwrap();
        std::fs::write(&shm_path, b"bad shm").unwrap();

        // Inspection must not error out: it reports "no index" ...
        let info = EmbeddingEngine::inspect_existing_index(&project).unwrap();
        assert!(info.is_none());

        // ... and a db file still exists at the original path.
        assert!(db_path.is_file());

        // The corrupt database must be quarantined, not silently deleted.
        let backup_names: Vec<String> = std::fs::read_dir(&index_dir)
            .unwrap()
            .map(|entry| entry.unwrap().file_name().to_string_lossy().into_owned())
            .filter(|name| name.contains(".corrupt-"))
            .collect();

        assert!(
            backup_names
                .iter()
                .any(|name| name.starts_with("embeddings.db.corrupt-")),
            "expected quarantined embedding db, found {backup_names:?}"
        );
    }
4476
    #[test]
    fn store_can_fetch_single_embedding_without_loading_all() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Point lookup by (file, symbol) must return exactly that chunk,
        // with a populated vector.
        let chunk = engine
            .store
            .get_embedding("main.py", "hello")
            .unwrap()
            .expect("embedding should exist");
        assert_eq!(chunk.file_path, "main.py");
        assert_eq!(chunk.symbol_name, "hello");
        assert!(!chunk.embedding.is_empty());
    }
4494
    #[test]
    fn find_similar_code_uses_index_and_excludes_target_symbol() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Similarity results must never include the query symbol itself.
        let matches = engine.find_similar_code("main.py", "hello", 5).unwrap();
        assert!(!matches.is_empty());
        assert!(
            matches
                .iter()
                .all(|m| !(m.file_path == "main.py" && m.symbol_name == "hello"))
        );
    }
4511
    #[test]
    fn delete_by_file_removes_rows_in_one_batch() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Deleting the only indexed file removes both of its rows and
        // reports how many rows were removed.
        let deleted = engine.store.delete_by_file(&["main.py"]).unwrap();
        assert_eq!(deleted, 2);
        assert_eq!(engine.store.count().unwrap(), 0);
    }
4524
    #[test]
    fn store_streams_embeddings_grouped_by_file() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // The streaming visitor must be invoked once per file, handing over
        // all of that file's chunks in a single call.
        let mut groups = Vec::new();
        engine
            .store
            .for_each_file_embeddings(&mut |file_path, chunks| {
                groups.push((file_path, chunks.len()));
                Ok(())
            })
            .unwrap();

        assert_eq!(groups, vec![("main.py".to_string(), 2)]);
    }
4544
    #[test]
    fn store_fetches_embeddings_for_specific_files() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // A per-file fetch must return only chunks belonging to that file.
        let chunks = engine.store.embeddings_for_files(&["main.py"]).unwrap();
        assert_eq!(chunks.len(), 2);
        assert!(chunks.iter().all(|chunk| chunk.file_path == "main.py"));
    }
4557
    #[test]
    fn store_fetches_embeddings_for_scored_chunks() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        let scored = engine.search_scored("hello world function", 2).unwrap();
        let chunks = engine.store.embeddings_for_scored_chunks(&scored).unwrap();

        // Every scored candidate must map to a fetched chunk that agrees on
        // all identifying fields, not just (file, symbol).
        assert_eq!(chunks.len(), scored.len());
        assert!(scored.iter().all(|candidate| chunks.iter().any(|chunk| {
            chunk.file_path == candidate.file_path
                && chunk.symbol_name == candidate.symbol_name
                && chunk.line == candidate.line
                && chunk.signature == candidate.signature
                && chunk.name_path == candidate.name_path
        })));
    }
4578
    #[test]
    fn find_misplaced_code_returns_per_file_outliers() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // With a single two-symbol file, both symbols come back as that
        // file's outlier candidates.
        let outliers = engine.find_misplaced_code(5).unwrap();
        assert_eq!(outliers.len(), 2);
        assert!(outliers.iter().all(|item| item.file_path == "main.py"));
    }
4591
    #[test]
    fn find_duplicates_uses_batched_candidate_embeddings() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Force both symbols onto identical sentinel vectors so their
        // similarity clears the 0.99 threshold deterministically.
        replace_file_embeddings_with_sentinels(
            &engine,
            "main.py",
            &[("hello", 5.0), ("world", 5.0)],
        );

        let duplicates = engine.find_duplicates(0.99, 4).unwrap();
        assert!(!duplicates.is_empty());
        // The hello/world pair must be reported, in either order.
        assert!(duplicates.iter().any(|pair| {
            (pair.symbol_a == "main.py:hello" && pair.symbol_b == "main.py:world")
                || (pair.symbol_a == "main.py:world" && pair.symbol_b == "main.py:hello")
        }));
    }
4613
4614    #[test]
4615    fn search_scored_returns_raw_chunks() {
4616        let _lock = MODEL_LOCK.lock().unwrap();
4617        skip_without_embedding_model!();
4618        let (_dir, project) = make_project_with_source();
4619        let engine = EmbeddingEngine::new(&project).unwrap();
4620        engine.index_from_project(&project).unwrap();
4621
4622        let chunks = engine.search_scored("world function", 5).unwrap();
4623        assert!(!chunks.is_empty());
4624        for c in &chunks {
4625            assert!(!c.file_path.is_empty());
4626            assert!(!c.symbol_name.is_empty());
4627        }
4628    }
4629
4630    #[test]
4631    fn configured_embedding_model_name_defaults_to_codesearchnet() {
4632        assert_eq!(configured_embedding_model_name(), CODESEARCH_MODEL_NAME);
4633    }
4634
    #[test]
    fn requested_embedding_model_override_ignores_default_model_name() {
        // MODEL_LOCK also serializes tests mutating CODELENS_EMBED_MODEL.
        let _lock = MODEL_LOCK.lock().unwrap();
        // Save the caller's env value so it can be restored below.
        let previous = std::env::var("CODELENS_EMBED_MODEL").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_MODEL", CODESEARCH_MODEL_NAME);
        }

        let result = requested_embedding_model_override().unwrap();

        // Restore before asserting so a failure cannot leak env state.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_MODEL", value),
                None => std::env::remove_var("CODELENS_EMBED_MODEL"),
            }
        }

        // Requesting the default model explicitly is a no-op (None).
        assert_eq!(result, None);
    }
4654
    #[cfg(not(feature = "model-bakeoff"))]
    #[test]
    fn requested_embedding_model_override_requires_bakeoff_feature() {
        let _lock = MODEL_LOCK.lock().unwrap();
        let previous = std::env::var("CODELENS_EMBED_MODEL").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_MODEL", "all-MiniLM-L12-v2");
        }

        let err = requested_embedding_model_override().unwrap_err();

        // Restore before asserting so a failure cannot leak env state.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_MODEL", value),
                None => std::env::remove_var("CODELENS_EMBED_MODEL"),
            }
        }

        // Without the bakeoff feature, a non-default override is an error
        // whose message points at the missing feature flag.
        assert!(err.to_string().contains("model-bakeoff"));
    }
4675
    #[cfg(feature = "model-bakeoff")]
    #[test]
    fn requested_embedding_model_override_accepts_alternative_model() {
        let _lock = MODEL_LOCK.lock().unwrap();
        let previous = std::env::var("CODELENS_EMBED_MODEL").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_MODEL", "all-MiniLM-L12-v2");
        }

        let result = requested_embedding_model_override().unwrap();

        // Restore before asserting so a failure cannot leak env state.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_MODEL", value),
                None => std::env::remove_var("CODELENS_EMBED_MODEL"),
            }
        }

        // With the bakeoff feature enabled, the override is honoured.
        assert_eq!(result.as_deref(), Some("all-MiniLM-L12-v2"));
    }
4696
4697    #[test]
4698    fn recommended_embed_threads_caps_macos_style_load() {
4699        let threads = recommended_embed_threads();
4700        assert!(threads >= 1);
4701        assert!(threads <= 8);
4702    }
4703
4704    #[test]
4705    fn embed_batch_size_has_safe_default_floor() {
4706        assert!(embed_batch_size() >= 1);
4707        if cfg!(target_os = "macos") {
4708            assert!(embed_batch_size() <= DEFAULT_MACOS_EMBED_BATCH_SIZE);
4709        }
4710    }
4711}