1use crate::db::IndexDb;
5use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
6use crate::project::ProjectRoot;
7use anyhow::{Context, Result};
8#[cfg(target_os = "macos")]
9use fastembed::ExecutionProviderDispatch;
10use fastembed::{InitOptionsUserDefined, TextEmbedding, TokenizerFiles, UserDefinedEmbeddingModel};
11use rusqlite::Connection;
12use serde::Serialize;
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::sync::{Arc, Mutex, Once};
15use std::thread::available_parallelism;
16use tracing::debug;
17
/// Low-level FFI shims: sqlite-vec extension registration and macOS sysctl reads.
pub(super) mod ffi {
    use anyhow::Result;

    /// Registers the sqlite-vec extension via SQLite's auto-extension hook so
    /// every subsequently opened connection gains its virtual tables.
    ///
    /// # Errors
    /// Fails when SQLite rejects the registration (non-`SQLITE_OK` code).
    pub fn register_sqlite_vec() -> Result<()> {
        // SAFETY: the transmute only re-types the `sqlite3_vec_init` function
        // pointer to the exact entry-point signature `sqlite3_auto_extension`
        // expects; the pointer itself is a valid extension init function.
        let rc = unsafe {
            rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute::<
                *const (),
                unsafe extern "C" fn(
                    *mut rusqlite::ffi::sqlite3,
                    *mut *mut i8,
                    *const rusqlite::ffi::sqlite3_api_routines,
                ) -> i32,
            >(
                sqlite_vec::sqlite3_vec_init as *const ()
            )))
        };
        if rc != rusqlite::ffi::SQLITE_OK {
            anyhow::bail!("failed to register sqlite-vec extension (SQLite error code: {rc})");
        }
        Ok(())
    }

    /// Reads an unsigned-integer kernel value by sysctl name (callers pass a
    /// NUL-terminated byte string, e.g. `b"hw.physicalcpu\0"`). Returns `None`
    /// when the call fails or the reported size is not `c_uint`-sized.
    #[cfg(target_os = "macos")]
    pub fn sysctl_usize(name: &[u8]) -> Option<usize> {
        let mut value: libc::c_uint = 0;
        let mut size = std::mem::size_of::<libc::c_uint>();
        // SAFETY: out-pointer and size argument describe the live `value`
        // buffer; no new value is being set (null newp, zero newlen).
        let rc = unsafe {
            libc::sysctlbyname(
                name.as_ptr().cast(),
                (&mut value as *mut libc::c_uint).cast(),
                &mut size,
                std::ptr::null_mut(),
                0,
            )
        };
        (rc == 0 && size == std::mem::size_of::<libc::c_uint>()).then_some(value as usize)
    }
}
57
/// One semantic-search result row, serialised to clients as JSON.
#[derive(Debug, Clone, Serialize)]
pub struct SemanticMatch {
    pub file_path: String,
    pub symbol_name: String,
    // Symbol kind string as recorded by the indexer.
    pub kind: String,
    pub line: usize,
    pub signature: String,
    // Fully-qualified name path (e.g. module/class chain) of the symbol.
    pub name_path: String,
    // Similarity score copied from the scored chunk — presumably higher is
    // better; TODO confirm scale against the vector store's scoring.
    pub score: f64,
}
69
70impl From<ScoredChunk> for SemanticMatch {
71 fn from(c: ScoredChunk) -> Self {
72 Self {
73 file_path: c.file_path,
74 symbol_name: c.symbol_name,
75 kind: c.kind,
76 line: c.line,
77 signature: c.signature,
78 name_path: c.name_path,
79 score: c.score,
80 }
81 }
82}
83
84mod vec_store;
85use vec_store::SqliteVecStore;
86
/// Identity key for an embedded chunk; a change to any component means the
/// stored embedding can no longer be reused.
type ReusableEmbeddingKey = (String, String, String, String, String, String);

/// Builds the reuse key from the six identifying fields of a chunk.
fn reusable_embedding_key(
    file_path: &str,
    symbol_name: &str,
    kind: &str,
    signature: &str,
    name_path: &str,
    text: &str,
) -> ReusableEmbeddingKey {
    (
        String::from(file_path),
        String::from(symbol_name),
        String::from(kind),
        String::from(signature),
        String::from(name_path),
        String::from(text),
    )
}
106
107fn reusable_embedding_key_for_chunk(chunk: &EmbeddingChunk) -> ReusableEmbeddingKey {
108 reusable_embedding_key(
109 &chunk.file_path,
110 &chunk.symbol_name,
111 &chunk.kind,
112 &chunk.signature,
113 &chunk.name_path,
114 &chunk.text,
115 )
116}
117
118fn reusable_embedding_key_for_symbol(
119 sym: &crate::db::SymbolWithFile,
120 text: &str,
121) -> ReusableEmbeddingKey {
122 reusable_embedding_key(
123 &sym.file_path,
124 &sym.name,
125 &sym.kind,
126 &sym.signature,
127 &sym.name_path,
128 text,
129 )
130}
131
// Per-call embedding batch sizes (overridable via CODELENS_EMBED_BATCH_SIZE).
const DEFAULT_EMBED_BATCH_SIZE: usize = 128;
const DEFAULT_MACOS_EMBED_BATCH_SIZE: usize = 128;
// Query-text LRU cache capacities (overridable via CODELENS_EMBED_TEXT_CACHE_SIZE).
const DEFAULT_TEXT_EMBED_CACHE_SIZE: usize = 256;
const DEFAULT_MACOS_TEXT_EMBED_CACHE_SIZE: usize = 1024;
// Output vector dimension of the bundled CodeSearchNet model.
const CODESEARCH_DIMENSION: usize = 384;
// Global cap on symbols embedded per run (overridable via CODELENS_MAX_EMBED_SYMBOLS).
const DEFAULT_MAX_EMBED_SYMBOLS: usize = 50_000;
// NOTE(review): the two constants below are not referenced in this chunk —
// presumably consumed elsewhere in the file; verify before removing.
const CHANGED_FILE_QUERY_CHUNK: usize = 128;
const DEFAULT_DUPLICATE_SCAN_BATCH_SIZE: usize = 128;
// Guards one-time initialisation of the global ONNX Runtime environment.
static ORT_ENV_INIT: Once = Once::new();

// Identifier of the bundled, quantised CodeSearchNet embedding model.
const CODESEARCH_MODEL_NAME: &str = "MiniLM-L12-CodeSearchNet-INT8";
147
/// Semantic-search engine: an ONNX text-embedding model plus a sqlite-vec
/// store of per-symbol embedding chunks for one project.
pub struct EmbeddingEngine {
    // Mutex serialises embed calls on the shared model instance.
    model: Mutex<TextEmbedding>,
    store: SqliteVecStore,
    model_name: String,
    runtime_info: EmbeddingRuntimeInfo,
    // LRU cache for ad-hoc text embeddings (see embed_texts_cached).
    text_embed_cache: Mutex<TextEmbeddingCache>,
    // Set while index_from_project is running; rejects concurrent runs.
    indexing: std::sync::atomic::AtomicBool,
}
156
/// Summary of the on-disk embedding index, used for status reporting.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct EmbeddingIndexInfo {
    pub model_name: String,
    // Number of symbol chunks currently stored in the vector index.
    pub indexed_symbols: usize,
}
162
/// Describes how the embedding runtime is (or would be) configured: thread
/// counts, backend, and — on macOS — the CoreML knobs. The `coreml_*` fields
/// are `None` whenever CoreML is not in play.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct EmbeddingRuntimeInfo {
    // "cpu", "coreml", or "coreml_preferred" (see configured_embedding_runtime_preference).
    pub runtime_preference: String,
    // Actual backend: "cpu", "coreml", or "not_loaded" before model load.
    pub backend: String,
    pub threads: usize,
    // Tokenizer max sequence length.
    pub max_length: usize,
    pub coreml_model_format: Option<String>,
    pub coreml_compute_units: Option<String>,
    pub coreml_static_input_shapes: Option<bool>,
    pub coreml_profile_compute_plan: Option<bool>,
    pub coreml_specialization_strategy: Option<String>,
    pub coreml_model_cache_dir: Option<String>,
    // Populated when a preferred backend failed and CPU was used instead.
    pub fallback_reason: Option<String>,
}
177
/// Small LRU cache mapping embedding-input text to its embedding vector.
struct TextEmbeddingCache {
    capacity: usize,
    // Keys ordered least- to most-recently-used; mirrors `entries` keys.
    order: VecDeque<String>,
    entries: HashMap<String, Vec<f32>>,
}

impl TextEmbeddingCache {
    /// Creates an empty cache holding at most `capacity` embeddings.
    fn new(capacity: usize) -> Self {
        Self {
            capacity,
            order: VecDeque::new(),
            entries: HashMap::new(),
        }
    }

    /// Returns a clone of the cached embedding, marking the key as
    /// most-recently-used on a hit.
    fn get(&mut self, key: &str) -> Option<Vec<f32>> {
        let hit = self.entries.get(key).cloned()?;
        self.touch(key);
        Some(hit)
    }

    /// Inserts (or replaces) an entry, then evicts least-recently-used
    /// entries until the cache fits `capacity`. Zero capacity disables
    /// caching entirely.
    fn insert(&mut self, key: String, value: Vec<f32>) {
        if self.capacity == 0 {
            return;
        }

        self.entries.insert(key.clone(), value);
        self.touch(&key);

        while self.entries.len() > self.capacity {
            let Some(oldest) = self.order.pop_front() else {
                break;
            };
            self.entries.remove(&oldest);
        }
    }

    /// Moves `key` to the most-recently-used end of the order queue.
    fn touch(&mut self, key: &str) {
        self.order.retain(|existing| existing != key);
        self.order.push_back(key.to_owned());
    }
}
223
224fn resolve_model_dir() -> Result<std::path::PathBuf> {
232 if let Ok(dir) = std::env::var("CODELENS_MODEL_DIR") {
234 let p = std::path::PathBuf::from(dir).join("codesearch");
235 if p.join("model.onnx").exists() {
236 return Ok(p);
237 }
238 }
239
240 if let Ok(exe) = std::env::current_exe()
242 && let Some(exe_dir) = exe.parent()
243 {
244 let p = exe_dir.join("models").join("codesearch");
245 if p.join("model.onnx").exists() {
246 return Ok(p);
247 }
248 }
249
250 if let Some(home) = dirs_fallback() {
252 let p = home
253 .join(".cache")
254 .join("codelens")
255 .join("models")
256 .join("codesearch");
257 if p.join("model.onnx").exists() {
258 return Ok(p);
259 }
260 }
261
262 let dev_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
264 .join("models")
265 .join("codesearch");
266 if dev_path.join("model.onnx").exists() {
267 return Ok(dev_path);
268 }
269
270 anyhow::bail!(
271 "CodeSearchNet model not found. Place model files in one of:\n\
272 - $CODELENS_MODEL_DIR/codesearch/\n\
273 - <executable>/models/codesearch/\n\
274 - ~/.cache/codelens/models/codesearch/\n\
275 Required files: model.onnx, tokenizer.json, config.json, special_tokens_map.json, tokenizer_config.json"
276 )
277}
278
/// Minimal home-directory lookup: `$HOME`, or `None` when unset.
fn dirs_fallback() -> Option<std::path::PathBuf> {
    let home = std::env::var_os("HOME")?;
    Some(std::path::PathBuf::from(home))
}
282
/// Parses the named env var as a strictly positive integer; whitespace is
/// trimmed, and unset/unparseable/zero values all yield `None`.
fn parse_usize_env(name: &str) -> Option<usize> {
    let raw = std::env::var(name).ok()?;
    let parsed = raw.trim().parse::<usize>().ok()?;
    (parsed > 0).then_some(parsed)
}
289
/// Parses the named env var as a boolean flag. Accepts the usual spellings
/// (1/true/yes/on, 0/false/no/off, case-insensitive); anything else — or an
/// unset variable — yields `None`.
fn parse_bool_env(name: &str) -> Option<bool> {
    let raw = std::env::var(name).ok()?;
    match raw.trim().to_ascii_lowercase().as_str() {
        "1" | "true" | "yes" | "on" => Some(true),
        "0" | "false" | "no" | "off" => Some(false),
        _ => None,
    }
}
300
301#[cfg(target_os = "macos")]
302fn apple_perf_cores() -> Option<usize> {
303 ffi::sysctl_usize(b"hw.perflevel0.physicalcpu\0")
304 .filter(|value| *value > 0)
305 .or_else(|| ffi::sysctl_usize(b"hw.physicalcpu\0").filter(|value| *value > 0))
306}
307
308#[cfg(not(target_os = "macos"))]
309fn apple_perf_cores() -> Option<usize> {
310 None
311}
312
/// Resolves the embedding backend preference from `CODELENS_EMBED_PROVIDER`:
/// explicit "cpu" or "coreml" is honoured (CoreML only on macOS, otherwise
/// degraded to "cpu"); with no explicit choice, macOS reports
/// "coreml_preferred" and everything else "cpu".
pub fn configured_embedding_runtime_preference() -> String {
    let requested = std::env::var("CODELENS_EMBED_PROVIDER")
        .ok()
        .map(|value| value.trim().to_ascii_lowercase());
    let on_macos = cfg!(target_os = "macos");

    let preference = match requested.as_deref() {
        Some("cpu") => "cpu",
        Some("coreml") => {
            if on_macos {
                "coreml"
            } else {
                "cpu"
            }
        }
        _ => {
            if on_macos {
                "coreml_preferred"
            } else {
                "cpu"
            }
        }
    };
    preference.to_string()
}
326
/// Public accessor for the embedding thread count (honours
/// `CODELENS_EMBED_THREADS`; see `recommended_embed_threads`).
pub fn configured_embedding_threads() -> usize {
    recommended_embed_threads()
}
330
/// Tokenizer max sequence length: `CODELENS_EMBED_MAX_LENGTH` (positive
/// integers only), defaulting to 256 and clamped into `[32, 512]`.
fn configured_embedding_max_length() -> usize {
    // Inlined env parse: mirror parse_usize_env (trim, positive-only).
    let configured = std::env::var("CODELENS_EMBED_MAX_LENGTH")
        .ok()
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|value| *value > 0);
    configured.unwrap_or(256).clamp(32, 512)
}
336
337fn configured_embedding_text_cache_size() -> usize {
338 std::env::var("CODELENS_EMBED_TEXT_CACHE_SIZE")
339 .ok()
340 .and_then(|value| value.trim().parse::<usize>().ok())
341 .unwrap_or({
342 if cfg!(target_os = "macos") {
343 DEFAULT_MACOS_TEXT_EMBED_CACHE_SIZE
344 } else {
345 DEFAULT_TEXT_EMBED_CACHE_SIZE
346 }
347 })
348 .min(8192)
349}
350
351#[cfg(target_os = "macos")]
352fn configured_coreml_compute_units_name() -> String {
353 match std::env::var("CODELENS_EMBED_COREML_COMPUTE_UNITS")
354 .ok()
355 .map(|value| value.trim().to_ascii_lowercase())
356 .as_deref()
357 {
358 Some("all") => "all".to_string(),
359 Some("cpu") | Some("cpu_only") => "cpu_only".to_string(),
360 Some("gpu") | Some("cpu_and_gpu") => "cpu_and_gpu".to_string(),
361 Some("ane") | Some("neural_engine") | Some("cpu_and_neural_engine") => {
362 "cpu_and_neural_engine".to_string()
363 }
364 _ => "cpu_and_neural_engine".to_string(),
365 }
366}
367
368#[cfg(target_os = "macos")]
369fn configured_coreml_model_format_name() -> String {
370 match std::env::var("CODELENS_EMBED_COREML_MODEL_FORMAT")
371 .ok()
372 .map(|value| value.trim().to_ascii_lowercase())
373 .as_deref()
374 {
375 Some("neuralnetwork") | Some("neural_network") => "neural_network".to_string(),
376 _ => "mlprogram".to_string(),
377 }
378}
379
380#[cfg(target_os = "macos")]
381fn configured_coreml_profile_compute_plan() -> bool {
382 parse_bool_env("CODELENS_EMBED_COREML_PROFILE_PLAN").unwrap_or(false)
383}
384
385#[cfg(target_os = "macos")]
386fn configured_coreml_static_input_shapes() -> bool {
387 parse_bool_env("CODELENS_EMBED_COREML_STATIC_INPUT_SHAPES").unwrap_or(true)
388}
389
390#[cfg(target_os = "macos")]
391fn configured_coreml_specialization_strategy_name() -> String {
392 match std::env::var("CODELENS_EMBED_COREML_SPECIALIZATION")
393 .ok()
394 .map(|value| value.trim().to_ascii_lowercase())
395 .as_deref()
396 {
397 Some("default") => "default".to_string(),
398 _ => "fast_prediction".to_string(),
399 }
400}
401
402#[cfg(target_os = "macos")]
403fn configured_coreml_model_cache_dir() -> std::path::PathBuf {
404 dirs_fallback()
405 .unwrap_or_else(std::env::temp_dir)
406 .join(".cache")
407 .join("codelens")
408 .join("coreml-cache")
409 .join("codesearch")
410}
411
412fn recommended_embed_threads() -> usize {
413 if let Some(explicit) = parse_usize_env("CODELENS_EMBED_THREADS") {
414 return explicit.max(1);
415 }
416
417 let available = available_parallelism().map(|n| n.get()).unwrap_or(1);
418 if cfg!(target_os = "macos") {
419 apple_perf_cores()
420 .unwrap_or(available)
421 .min(available)
422 .clamp(1, 8)
423 } else {
424 available.div_ceil(2).clamp(1, 8)
425 }
426}
427
428fn embed_batch_size() -> usize {
429 parse_usize_env("CODELENS_EMBED_BATCH_SIZE").unwrap_or({
430 if cfg!(target_os = "macos") {
431 DEFAULT_MACOS_EMBED_BATCH_SIZE
432 } else {
433 DEFAULT_EMBED_BATCH_SIZE
434 }
435 })
436}
437
/// Upper bound on symbols embedded per indexing run
/// (`CODELENS_MAX_EMBED_SYMBOLS`, default 50 000).
fn max_embed_symbols() -> usize {
    parse_usize_env("CODELENS_MAX_EMBED_SYMBOLS").unwrap_or(DEFAULT_MAX_EMBED_SYMBOLS)
}
441
/// Sets a process environment variable only when the user has not already
/// exported one, so explicit user configuration always wins.
fn set_env_if_unset(name: &str, value: impl Into<String>) {
    if std::env::var_os(name).is_some() {
        return;
    }
    // SAFETY: mutating the process environment; callers appear to invoke
    // this during runtime setup — NOTE(review): confirm no other threads
    // read the environment concurrently at that point.
    unsafe {
        std::env::set_var(name, value.into());
    }
}
451
/// Process-wide runtime setup before model load: threading-related env vars
/// for OpenMP / Accelerate / tokenizers, plus a single global ONNX Runtime
/// thread pool. Safe to call repeatedly — env defaults never override user
/// values and the ORT init runs exactly once via `ORT_ENV_INIT`.
fn configure_embedding_runtime() {
    let threads = recommended_embed_threads();
    let runtime_preference = configured_embedding_runtime_preference();

    // Fill in threading defaults only where the user exported nothing.
    set_env_if_unset("OMP_NUM_THREADS", threads.to_string());
    set_env_if_unset("OMP_WAIT_POLICY", "PASSIVE");
    set_env_if_unset("OMP_DYNAMIC", "FALSE");
    set_env_if_unset("TOKENIZERS_PARALLELISM", "false");
    if cfg!(target_os = "macos") {
        set_env_if_unset("VECLIB_MAXIMUM_THREADS", threads.to_string());
    }

    ORT_ENV_INIT.call_once(|| {
        // Shared ORT pool: `threads` intra-op workers, one inter-op thread,
        // spin-waiting disabled to keep idle CPU usage low.
        let pool = ort::environment::GlobalThreadPoolOptions::default()
            .with_intra_threads(threads)
            .and_then(|pool| pool.with_inter_threads(1))
            .and_then(|pool| pool.with_spin_control(false));

        // Best-effort: if pool construction or commit fails, ORT falls back
        // to its own defaults (errors are deliberately ignored).
        if let Ok(pool) = pool {
            let _ = ort::init()
                .with_name("codelens-embedding")
                .with_telemetry(false)
                .with_global_thread_pool(pool)
                .commit();
        }
    });

    debug!(
        threads,
        runtime_preference = %runtime_preference,
        "configured embedding runtime"
    );
}
487
488fn requested_embedding_model_override() -> Result<Option<String>> {
489 let env_model = std::env::var("CODELENS_EMBED_MODEL").ok();
490 let Some(model_id) = env_model else {
491 return Ok(None);
492 };
493 if model_id.is_empty() || model_id == CODESEARCH_MODEL_NAME {
494 return Ok(None);
495 }
496
497 #[cfg(feature = "model-bakeoff")]
498 {
499 return Ok(Some(model_id));
500 }
501
502 #[cfg(not(feature = "model-bakeoff"))]
503 {
504 anyhow::bail!(
505 "CODELENS_EMBED_MODEL={model_id} requires the `model-bakeoff` feature; \
506 rebuild the binary with `--features model-bakeoff` to run alternative model bake-offs"
507 );
508 }
509}
510
/// Snapshot of the runtime configuration *before* any model is loaded
/// (`backend` reports "not_loaded"). On macOS the CoreML knobs are included
/// unless the preference forces CPU; elsewhere they are always `None`.
pub fn configured_embedding_runtime_info() -> EmbeddingRuntimeInfo {
    let runtime_preference = configured_embedding_runtime_preference();
    let threads = configured_embedding_threads();

    #[cfg(target_os = "macos")]
    {
        // A "cpu" preference disables CoreML entirely; any other preference
        // reports the CoreML settings that would be used at load time.
        let coreml_enabled = runtime_preference != "cpu";
        EmbeddingRuntimeInfo {
            runtime_preference,
            backend: "not_loaded".to_string(),
            threads,
            max_length: configured_embedding_max_length(),
            coreml_model_format: coreml_enabled.then(configured_coreml_model_format_name),
            coreml_compute_units: coreml_enabled.then(configured_coreml_compute_units_name),
            coreml_static_input_shapes: coreml_enabled.then(configured_coreml_static_input_shapes),
            coreml_profile_compute_plan: coreml_enabled
                .then(configured_coreml_profile_compute_plan),
            coreml_specialization_strategy: coreml_enabled
                .then(configured_coreml_specialization_strategy_name),
            coreml_model_cache_dir: coreml_enabled
                .then(|| configured_coreml_model_cache_dir().display().to_string()),
            fallback_reason: None,
        }
    }

    #[cfg(not(target_os = "macos"))]
    {
        EmbeddingRuntimeInfo {
            runtime_preference,
            backend: "not_loaded".to_string(),
            threads,
            max_length: configured_embedding_max_length(),
            coreml_model_format: None,
            coreml_compute_units: None,
            coreml_static_input_shapes: None,
            coreml_profile_compute_plan: None,
            coreml_specialization_strategy: None,
            coreml_model_cache_dir: None,
            fallback_reason: None,
        }
    }
}
553
554#[cfg(target_os = "macos")]
555fn build_coreml_execution_provider() -> ExecutionProviderDispatch {
556 use ort::ep::{
557 CoreML,
558 coreml::{ComputeUnits, ModelFormat, SpecializationStrategy},
559 };
560
561 let compute_units = match configured_coreml_compute_units_name().as_str() {
562 "all" => ComputeUnits::All,
563 "cpu_only" => ComputeUnits::CPUOnly,
564 "cpu_and_gpu" => ComputeUnits::CPUAndGPU,
565 _ => ComputeUnits::CPUAndNeuralEngine,
566 };
567 let model_format = match configured_coreml_model_format_name().as_str() {
568 "neural_network" => ModelFormat::NeuralNetwork,
569 _ => ModelFormat::MLProgram,
570 };
571 let specialization = match configured_coreml_specialization_strategy_name().as_str() {
572 "default" => SpecializationStrategy::Default,
573 _ => SpecializationStrategy::FastPrediction,
574 };
575 let cache_dir = configured_coreml_model_cache_dir();
576 let _ = std::fs::create_dir_all(&cache_dir);
577
578 CoreML::default()
579 .with_model_format(model_format)
580 .with_compute_units(compute_units)
581 .with_static_input_shapes(configured_coreml_static_input_shapes())
582 .with_specialization_strategy(specialization)
583 .with_profile_compute_plan(configured_coreml_profile_compute_plan())
584 .with_model_cache_dir(cache_dir.display().to_string())
585 .build()
586 .error_on_failure()
587}
588
/// Runtime info for a CPU-backed model. `fallback_reason` carries the error
/// text when CPU was not the first choice; all CoreML fields are cleared.
fn cpu_runtime_info(
    runtime_preference: String,
    fallback_reason: Option<String>,
) -> EmbeddingRuntimeInfo {
    EmbeddingRuntimeInfo {
        runtime_preference,
        backend: "cpu".to_string(),
        threads: configured_embedding_threads(),
        max_length: configured_embedding_max_length(),
        coreml_model_format: None,
        coreml_compute_units: None,
        coreml_static_input_shapes: None,
        coreml_profile_compute_plan: None,
        coreml_specialization_strategy: None,
        coreml_model_cache_dir: None,
        fallback_reason,
    }
}
607
608#[cfg(target_os = "macos")]
609fn coreml_runtime_info(
610 runtime_preference: String,
611 fallback_reason: Option<String>,
612) -> EmbeddingRuntimeInfo {
613 EmbeddingRuntimeInfo {
614 runtime_preference,
615 backend: if fallback_reason.is_some() {
616 "cpu".to_string()
617 } else {
618 "coreml".to_string()
619 },
620 threads: configured_embedding_threads(),
621 max_length: configured_embedding_max_length(),
622 coreml_model_format: Some(configured_coreml_model_format_name()),
623 coreml_compute_units: Some(configured_coreml_compute_units_name()),
624 coreml_static_input_shapes: Some(configured_coreml_static_input_shapes()),
625 coreml_profile_compute_plan: Some(configured_coreml_profile_compute_plan()),
626 coreml_specialization_strategy: Some(configured_coreml_specialization_strategy_name()),
627 coreml_model_cache_dir: Some(configured_coreml_model_cache_dir().display().to_string()),
628 fallback_reason,
629 }
630}
631
632#[cfg(feature = "model-bakeoff")]
637fn load_fastembed_builtin(
638 model_id: &str,
639) -> Result<(TextEmbedding, usize, String, EmbeddingRuntimeInfo)> {
640 use fastembed::EmbeddingModel;
641
642 let (model_enum, expected_dim) = match model_id {
644 "all-MiniLM-L6-v2" | "sentence-transformers/all-MiniLM-L6-v2" => {
645 (EmbeddingModel::AllMiniLML6V2, 384)
646 }
647 "all-MiniLM-L12-v2" | "sentence-transformers/all-MiniLM-L12-v2" => {
648 (EmbeddingModel::AllMiniLML12V2, 384)
649 }
650 "bge-small-en-v1.5" | "BAAI/bge-small-en-v1.5" => (EmbeddingModel::BGESmallENV15, 384),
651 "bge-base-en-v1.5" | "BAAI/bge-base-en-v1.5" => (EmbeddingModel::BGEBaseENV15, 768),
652 "nomic-embed-text-v1.5" | "nomic-ai/nomic-embed-text-v1.5" => {
653 (EmbeddingModel::NomicEmbedTextV15, 768)
654 }
655 other => {
656 anyhow::bail!(
657 "Unknown fastembed model: {other}. \
658 Supported: all-MiniLM-L6-v2, all-MiniLM-L12-v2, bge-small-en-v1.5, \
659 bge-base-en-v1.5, nomic-embed-text-v1.5"
660 );
661 }
662 };
663
664 let init = fastembed::InitOptionsWithLength::new(model_enum)
665 .with_max_length(configured_embedding_max_length())
666 .with_cache_dir(std::env::temp_dir().join("codelens-fastembed-cache"))
667 .with_show_download_progress(true);
668 let model =
669 TextEmbedding::try_new(init).with_context(|| format!("failed to load {model_id}"))?;
670
671 let runtime_info = cpu_runtime_info("cpu".to_string(), None);
672
673 tracing::info!(
674 model = model_id,
675 dimension = expected_dim,
676 "loaded fastembed built-in model for A/B comparison"
677 );
678
679 Ok((model, expected_dim, model_id.to_string(), runtime_info))
680}
681
/// Loads the bundled CodeSearchNet model: configures the runtime, honours a
/// bake-off model override when present, reads the five model asset files
/// from disk, then tries CoreML first on macOS (falling back to CPU on
/// failure) and plain CPU everywhere else.
/// Returns `(model, dimension, model name, runtime info)`.
///
/// # Errors
/// Fails when model assets are missing/unreadable, the model cannot be
/// loaded on any backend, or an override is requested without the
/// `model-bakeoff` feature.
fn load_codesearch_model() -> Result<(TextEmbedding, usize, String, EmbeddingRuntimeInfo)> {
    configure_embedding_runtime();

    // Bake-off override: only reachable with the model-bakeoff feature; in
    // the default build the override check itself errors first.
    #[allow(unused_variables)]
    if let Some(model_id) = requested_embedding_model_override()? {
        #[cfg(feature = "model-bakeoff")]
        {
            return load_fastembed_builtin(&model_id);
        }

        #[cfg(not(feature = "model-bakeoff"))]
        {
            let _ = model_id;
            unreachable!("alternative embedding model override should have errored");
        }
    }

    let model_dir = resolve_model_dir()?;

    // All five asset files are required; each read error names its file.
    let onnx_bytes =
        std::fs::read(model_dir.join("model.onnx")).context("failed to read model.onnx")?;
    let tokenizer_bytes =
        std::fs::read(model_dir.join("tokenizer.json")).context("failed to read tokenizer.json")?;
    let config_bytes =
        std::fs::read(model_dir.join("config.json")).context("failed to read config.json")?;
    let special_tokens_bytes = std::fs::read(model_dir.join("special_tokens_map.json"))
        .context("failed to read special_tokens_map.json")?;
    let tokenizer_config_bytes = std::fs::read(model_dir.join("tokenizer_config.json"))
        .context("failed to read tokenizer_config.json")?;

    let user_model = UserDefinedEmbeddingModel::new(
        onnx_bytes,
        TokenizerFiles {
            tokenizer_file: tokenizer_bytes,
            config_file: config_bytes,
            special_tokens_map_file: special_tokens_bytes,
            tokenizer_config_file: tokenizer_config_bytes,
        },
    );

    let runtime_preference = configured_embedding_runtime_preference();

    // macOS: attempt CoreML unless the preference forces CPU; a failed
    // CoreML load falls back to CPU and records the reason.
    #[cfg(target_os = "macos")]
    if runtime_preference != "cpu" {
        let init_opts = InitOptionsUserDefined::new()
            .with_max_length(configured_embedding_max_length())
            .with_execution_providers(vec![build_coreml_execution_provider()]);
        match TextEmbedding::try_new_from_user_defined(user_model.clone(), init_opts) {
            Ok(model) => {
                let runtime_info = coreml_runtime_info(runtime_preference.clone(), None);
                debug!(
                    threads = runtime_info.threads,
                    runtime_preference = %runtime_info.runtime_preference,
                    backend = %runtime_info.backend,
                    coreml_compute_units = ?runtime_info.coreml_compute_units,
                    coreml_static_input_shapes = ?runtime_info.coreml_static_input_shapes,
                    coreml_profile_compute_plan = ?runtime_info.coreml_profile_compute_plan,
                    coreml_specialization_strategy = ?runtime_info.coreml_specialization_strategy,
                    coreml_model_cache_dir = ?runtime_info.coreml_model_cache_dir,
                    "loaded CodeSearchNet embedding model"
                );
                return Ok((
                    model,
                    CODESEARCH_DIMENSION,
                    CODESEARCH_MODEL_NAME.to_string(),
                    runtime_info,
                ));
            }
            Err(err) => {
                // CPU fallback: retry without execution providers and keep
                // the CoreML failure text in the runtime info.
                let reason = err.to_string();
                debug!(
                    runtime_preference = %runtime_preference,
                    fallback_reason = %reason,
                    "CoreML embedding load failed; falling back to CPU"
                );
                let model = TextEmbedding::try_new_from_user_defined(
                    user_model,
                    InitOptionsUserDefined::new()
                        .with_max_length(configured_embedding_max_length()),
                )
                .context("failed to load CodeSearchNet embedding model")?;
                let runtime_info = coreml_runtime_info(runtime_preference.clone(), Some(reason));
                debug!(
                    threads = runtime_info.threads,
                    runtime_preference = %runtime_info.runtime_preference,
                    backend = %runtime_info.backend,
                    coreml_compute_units = ?runtime_info.coreml_compute_units,
                    coreml_static_input_shapes = ?runtime_info.coreml_static_input_shapes,
                    coreml_profile_compute_plan = ?runtime_info.coreml_profile_compute_plan,
                    coreml_specialization_strategy = ?runtime_info.coreml_specialization_strategy,
                    coreml_model_cache_dir = ?runtime_info.coreml_model_cache_dir,
                    fallback_reason = ?runtime_info.fallback_reason,
                    "loaded CodeSearchNet embedding model"
                );
                return Ok((
                    model,
                    CODESEARCH_DIMENSION,
                    CODESEARCH_MODEL_NAME.to_string(),
                    runtime_info,
                ));
            }
        }
    }

    // Plain CPU path (non-macOS, or an explicit "cpu" preference).
    let model = TextEmbedding::try_new_from_user_defined(
        user_model,
        InitOptionsUserDefined::new().with_max_length(configured_embedding_max_length()),
    )
    .context("failed to load CodeSearchNet embedding model")?;
    let runtime_info = cpu_runtime_info(runtime_preference.clone(), None);

    debug!(
        threads = runtime_info.threads,
        runtime_preference = %runtime_info.runtime_preference,
        backend = %runtime_info.backend,
        "loaded CodeSearchNet embedding model"
    );

    Ok((
        model,
        CODESEARCH_DIMENSION,
        CODESEARCH_MODEL_NAME.to_string(),
        runtime_info,
    ))
}
809
810pub fn configured_embedding_model_name() -> String {
811 std::env::var("CODELENS_EMBED_MODEL").unwrap_or_else(|_| CODESEARCH_MODEL_NAME.to_string())
812}
813
/// Blend factor for reranking, read from `CODELENS_RERANK_BLEND`.
/// Values outside `[0.0, 1.0]` (or unparseable input) are rejected and the
/// default of 0.75 is used instead.
fn configured_rerank_blend() -> f64 {
    std::env::var("CODELENS_RERANK_BLEND")
        .ok()
        .and_then(|v| v.parse::<f64>().ok())
        // Option::filter replaces the former and_then + if/else and keeps
        // only in-range values.
        .filter(|v| (0.0..=1.0).contains(v))
        .unwrap_or(0.75)
}
827
/// Cheap probe: true when the CodeSearchNet model files can be located on
/// disk (used to gate embedding features without loading the model).
pub fn embedding_model_assets_available() -> bool {
    resolve_model_dir().is_ok()
}
831
832impl EmbeddingEngine {
    /// Embeds `texts`, serving repeats from the LRU cache and batching all
    /// cache misses through a single model call. Duplicate misses within one
    /// call are embedded once and fanned out to every requesting position;
    /// the returned vector is position-aligned with `texts`.
    ///
    /// # Errors
    /// Fails when a lock is poisoned or the model's embed call fails.
    fn embed_texts_cached(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
        if texts.is_empty() {
            return Ok(Vec::new());
        }

        // One slot per input, filled from the cache first, then the model.
        let mut resolved: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
        // Unique miss texts in first-seen order — must match the order of
        // the embeddings the model returns below.
        let mut missing_order: Vec<String> = Vec::new();
        // Miss text -> every input position waiting for its embedding.
        let mut missing_positions: HashMap<String, Vec<usize>> = HashMap::new();

        {
            // Scope the cache lock so it is released before the model call.
            let mut cache = self
                .text_embed_cache
                .lock()
                .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
            for (index, text) in texts.iter().enumerate() {
                if let Some(cached) = cache.get(text) {
                    resolved[index] = Some(cached);
                } else {
                    let key = (*text).to_owned();
                    if !missing_positions.contains_key(&key) {
                        missing_order.push(key.clone());
                    }
                    missing_positions.entry(key).or_default().push(index);
                }
            }
        }

        if !missing_order.is_empty() {
            let missing_refs: Vec<&str> = missing_order.iter().map(String::as_str).collect();
            let embeddings = self
                .model
                .lock()
                .map_err(|_| anyhow::anyhow!("model lock"))?
                .embed(missing_refs, None)
                .context("text embedding failed")?;

            // Re-take the cache lock to store fresh embeddings, then fan
            // each one out to all positions that requested it.
            let mut cache = self
                .text_embed_cache
                .lock()
                .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
            for (text, embedding) in missing_order.into_iter().zip(embeddings.into_iter()) {
                cache.insert(text.clone(), embedding.clone());
                if let Some(indices) = missing_positions.remove(&text) {
                    for index in indices {
                        resolved[index] = Some(embedding.clone());
                    }
                }
            }
        }

        // Every slot must be filled by now; a gap indicates a logic bug
        // (e.g. model returned fewer embeddings than requested).
        resolved
            .into_iter()
            .map(|item| item.ok_or_else(|| anyhow::anyhow!("missing embedding cache entry")))
            .collect()
    }
888
    /// Loads the embedding model and opens (creating if needed) the
    /// project's vector store at `.codelens/index/embeddings.db`.
    ///
    /// # Errors
    /// Fails when model assets are missing/unloadable, the index directory
    /// cannot be created, or the store cannot be opened.
    pub fn new(project: &ProjectRoot) -> Result<Self> {
        let (model, dimension, model_name, runtime_info) = load_codesearch_model()?;

        let db_dir = project.as_path().join(".codelens/index");
        std::fs::create_dir_all(&db_dir)?;
        let db_path = db_dir.join("embeddings.db");

        // Store is keyed to the model name/dimension so stale vectors from
        // a different model are not mixed in.
        let store = SqliteVecStore::new(&db_path, dimension, &model_name)?;

        Ok(Self {
            model: Mutex::new(model),
            store,
            model_name,
            runtime_info,
            text_embed_cache: Mutex::new(TextEmbeddingCache::new(
                configured_embedding_text_cache_size(),
            )),
            indexing: std::sync::atomic::AtomicBool::new(false),
        })
    }
909
    /// Name of the loaded embedding model.
    pub fn model_name(&self) -> &str {
        &self.model_name
    }
913
    /// Runtime configuration the model was actually loaded with.
    pub fn runtime_info(&self) -> &EmbeddingRuntimeInfo {
        &self.runtime_info
    }
917
    /// True while an `index_from_project` run is in flight.
    pub fn is_indexing(&self) -> bool {
        self.indexing.load(std::sync::atomic::Ordering::Relaxed)
    }
927
    /// Rebuilds the vector index from the project's symbol database, reusing
    /// stored embeddings whose identity key is unchanged and re-embedding
    /// only new or modified symbols. Embeddings for files that vanished from
    /// the symbol DB are purged. Returns the number of chunks written.
    ///
    /// Only one run may be active at a time; a concurrent call fails fast
    /// instead of queueing.
    ///
    /// # Errors
    /// Fails when indexing is already in progress, or on database /
    /// embedding-store / model errors.
    pub fn index_from_project(&self, project: &ProjectRoot) -> Result<usize> {
        // Claim the indexing flag atomically; losing the race is an error.
        if self
            .indexing
            .compare_exchange(
                false,
                true,
                std::sync::atomic::Ordering::AcqRel,
                std::sync::atomic::Ordering::Relaxed,
            )
            .is_err()
        {
            anyhow::bail!(
                "Embedding indexing already in progress — wait for the current run to complete before retrying."
            );
        }
        // RAII guard clears the flag on every exit path, including errors.
        struct IndexGuard<'a>(&'a std::sync::atomic::AtomicBool);
        impl Drop for IndexGuard<'_> {
            fn drop(&mut self) {
                self.0.store(false, std::sync::atomic::Ordering::Release);
            }
        }
        let _guard = IndexGuard(&self.indexing);

        let db_path = crate::db::index_db_path(project.as_path());
        let symbol_db = IndexDb::open(&db_path)?;
        let batch_size = embed_batch_size();
        let max_symbols = max_embed_symbols();
        let mut total_indexed = 0usize;
        let mut total_seen = 0usize;
        // Model lock acquired lazily (first batch) and then shared across
        // all per-file reconciliation calls.
        let mut model = None;
        // file path -> (identity key -> stored chunk), for embedding reuse.
        let mut existing_embeddings: HashMap<
            String,
            HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
        > = HashMap::new();
        let mut current_db_files = HashSet::new();
        // Set once the symbol budget is exhausted; later files are skipped
        // (but still recorded in current_db_files so they are not purged).
        let mut capped = false;

        // Load every stored embedding keyed by identity for reuse checks.
        self.store
            .for_each_file_embeddings(&mut |file_path, chunks| {
                existing_embeddings.insert(
                    file_path,
                    chunks
                        .into_iter()
                        .map(|chunk| (reusable_embedding_key_for_chunk(&chunk), chunk))
                        .collect(),
                );
                Ok(())
            })?;

        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
            current_db_files.insert(file_path.clone());
            if capped {
                return Ok(());
            }

            // Source text is best-effort; it feeds test-only filtering and
            // embedding-text construction.
            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
            let relevant_symbols: Vec<_> = symbols
                .into_iter()
                .filter(|sym| !is_test_only_symbol(sym, source.as_deref()))
                .collect();

            // No indexable symbols left: drop any stale stored embeddings.
            if relevant_symbols.is_empty() {
                self.store.delete_by_file(&[file_path.as_str()])?;
                existing_embeddings.remove(&file_path);
                return Ok(());
            }

            // Enforce the global symbol budget before embedding this file.
            if total_seen + relevant_symbols.len() > max_symbols {
                capped = true;
                return Ok(());
            }
            total_seen += relevant_symbols.len();

            let existing_for_file = existing_embeddings.remove(&file_path).unwrap_or_default();
            total_indexed += self.reconcile_file_embeddings(
                &file_path,
                relevant_symbols,
                source.as_deref(),
                existing_for_file,
                batch_size,
                &mut model,
            )?;
            Ok(())
        })?;

        // Purge embeddings for files no longer present in the symbol DB.
        let removed_files: Vec<String> = existing_embeddings
            .into_keys()
            .filter(|file_path| !current_db_files.contains(file_path))
            .collect();
        if !removed_files.is_empty() {
            let removed_refs: Vec<&str> = removed_files.iter().map(String::as_str).collect();
            self.store.delete_by_file(&removed_refs)?;
        }

        Ok(total_indexed)
    }
1026
    /// Harvests (natural-language phrase, code term) pairs that bridge doc
    /// comments to identifiers: for each documented non-test symbol, 2–4-word
    /// prefixes of the doc's first line are paired with the symbol name (plus
    /// its word-split form, when different). Phrases are de-duplicated
    /// globally across the whole project.
    ///
    /// # Errors
    /// Fails on symbol-database errors.
    pub fn generate_bridge_candidates(
        &self,
        project: &ProjectRoot,
    ) -> Result<Vec<(String, String)>> {
        let db_path = crate::db::index_db_path(project.as_path());
        let symbol_db = IndexDb::open(&db_path)?;
        let mut bridges: Vec<(String, String)> = Vec::new();
        // De-duplicates natural-language keys across all files and symbols.
        let mut seen_nl = HashSet::new();

        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
            for sym in &symbols {
                if is_test_only_symbol(sym, source.as_deref()) {
                    continue;
                }
                // Only symbols with a non-empty leading doc comment qualify.
                let doc = source.as_deref().and_then(|src| {
                    extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize)
                });
                let doc = match doc {
                    Some(d) if !d.is_empty() => d,
                    _ => continue,
                };

                // Code term: the identifier, plus its word-split form when
                // splitting changed it (e.g. "parseFile" -> "parse file").
                let split = split_identifier(&sym.name);
                let code_term = if split != sym.name {
                    format!("{} {}", sym.name, split)
                } else {
                    sym.name.clone()
                };

                let first_line = doc.lines().next().unwrap_or("").trim().to_lowercase();
                let clean = first_line.trim_end_matches(|c: char| c.is_ascii_punctuation());
                let words: Vec<&str> = clean.split_whitespace().collect();
                if words.len() < 2 {
                    continue;
                }

                // Emit 2- to 4-word prefixes of the doc summary, keeping keys
                // within a sensible length band.
                for window in 2..=words.len().min(4) {
                    let key = words[..window].join(" ");
                    if key.len() < 5 || key.len() > 60 {
                        continue;
                    }
                    if seen_nl.insert(key.clone()) {
                        bridges.push((key, code_term.clone()));
                    }
                }

                // Also map the lowercased split identifier itself when it is
                // a multi-word phrase not yet seen.
                if split != sym.name && !seen_nl.contains(&split.to_lowercase()) {
                    let lowered = split.to_lowercase();
                    if lowered.split_whitespace().count() >= 2 && seen_nl.insert(lowered.clone()) {
                        bridges.push((lowered, code_term.clone()));
                    }
                }
            }
            Ok(())
        })?;

        Ok(bridges)
    }
1097
    /// Reconciles one file's embeddings: reuses stored vectors whose identity
    /// key still matches, embeds the remainder in `batch_size` batches, then
    /// replaces the file's rows in the store. Returns the number of chunks
    /// inserted (0 when the file ends up empty).
    ///
    /// `model` is a lazily-initialised mutex guard owned by the caller, so
    /// the model lock is taken at most once per indexing run and only when
    /// something actually needs embedding.
    fn reconcile_file_embeddings<'a>(
        &'a self,
        file_path: &str,
        symbols: Vec<crate::db::SymbolWithFile>,
        source: Option<&str>,
        mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
        batch_size: usize,
        model: &mut Option<std::sync::MutexGuard<'a, TextEmbedding>>,
    ) -> Result<usize> {
        let mut reconciled_chunks = Vec::with_capacity(symbols.len());
        let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
        let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);

        for sym in symbols {
            let text = build_embedding_text(&sym, source);
            // Identity key unchanged: reuse the stored embedding verbatim
            // (remove() also prevents double-reuse within the file).
            if let Some(existing) =
                existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
            {
                reconciled_chunks.push(EmbeddingChunk {
                    file_path: sym.file_path.clone(),
                    symbol_name: sym.name.clone(),
                    kind: sym.kind.clone(),
                    line: sym.line as usize,
                    signature: sym.signature.clone(),
                    name_path: sym.name_path.clone(),
                    text,
                    embedding: existing.embedding,
                    doc_embedding: existing.doc_embedding,
                });
                continue;
            }

            batch_texts.push(text);
            batch_meta.push(sym);

            // Flush a full batch through the model, taking the lock lazily.
            if batch_texts.len() >= batch_size {
                if model.is_none() {
                    *model = Some(
                        self.model
                            .lock()
                            .map_err(|_| anyhow::anyhow!("model lock"))?,
                    );
                }
                reconciled_chunks.extend(Self::embed_chunks(
                    model.as_mut().expect("model lock initialized"),
                    &batch_texts,
                    &batch_meta,
                )?);
                batch_texts.clear();
                batch_meta.clear();
            }
        }

        // Flush the final partial batch, if any.
        if !batch_texts.is_empty() {
            if model.is_none() {
                *model = Some(
                    self.model
                        .lock()
                        .map_err(|_| anyhow::anyhow!("model lock"))?,
                );
            }
            reconciled_chunks.extend(Self::embed_chunks(
                model.as_mut().expect("model lock initialized"),
                &batch_texts,
                &batch_meta,
            )?);
        }

        // Delete-then-insert keeps the store aligned with the symbol DB for
        // this file even when every chunk was reused.
        self.store.delete_by_file(&[file_path])?;
        if reconciled_chunks.is_empty() {
            return Ok(0);
        }
        self.store.insert(&reconciled_chunks)
    }
1172
1173 fn embed_chunks(
1174 model: &mut TextEmbedding,
1175 texts: &[String],
1176 meta: &[crate::db::SymbolWithFile],
1177 ) -> Result<Vec<EmbeddingChunk>> {
1178 let batch_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
1179 let embeddings = model.embed(batch_refs, None).context("embedding failed")?;
1180
1181 Ok(meta
1182 .iter()
1183 .zip(embeddings)
1184 .zip(texts.iter())
1185 .map(|((sym, emb), text)| EmbeddingChunk {
1186 file_path: sym.file_path.clone(),
1187 symbol_name: sym.name.clone(),
1188 kind: sym.kind.clone(),
1189 line: sym.line as usize,
1190 signature: sym.signature.clone(),
1191 name_path: sym.name_path.clone(),
1192 text: text.clone(),
1193 embedding: emb,
1194 doc_embedding: None,
1195 })
1196 .collect())
1197 }
1198
1199 fn flush_batch(
1201 model: &mut TextEmbedding,
1202 store: &SqliteVecStore,
1203 texts: &[String],
1204 meta: &[crate::db::SymbolWithFile],
1205 ) -> Result<usize> {
1206 let chunks = Self::embed_chunks(model, texts, meta)?;
1207 store.insert(&chunks)
1208 }
1209
1210 pub fn search(&self, query: &str, max_results: usize) -> Result<Vec<SemanticMatch>> {
1212 let results = self.search_scored(query, max_results)?;
1213 Ok(results.into_iter().map(SemanticMatch::from).collect())
1214 }
1215
1216 pub fn search_scored(&self, query: &str, max_results: usize) -> Result<Vec<ScoredChunk>> {
1223 let query_embedding = self.embed_texts_cached(&[query])?;
1224
1225 if query_embedding.is_empty() {
1226 return Ok(Vec::new());
1227 }
1228
1229 let factor = std::env::var("CODELENS_RERANK_FACTOR")
1233 .ok()
1234 .and_then(|v| v.parse::<usize>().ok())
1235 .unwrap_or(5);
1236 let candidate_count = max_results.saturating_mul(factor).max(max_results);
1237 let mut candidates = self.store.search(&query_embedding[0], candidate_count)?;
1238
1239 if candidates.len() <= max_results {
1240 return Ok(candidates);
1241 }
1242
1243 let query_lower = query.to_lowercase();
1246 let query_tokens: Vec<&str> = query_lower
1247 .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
1248 .filter(|t| t.len() >= 2)
1249 .collect();
1250
1251 if query_tokens.is_empty() {
1252 candidates.truncate(max_results);
1253 return Ok(candidates);
1254 }
1255
1256 let blend = configured_rerank_blend();
1257 for chunk in &mut candidates {
1258 let split_name = split_identifier(&chunk.symbol_name);
1263 let searchable = format!(
1264 "{} {} {} {} {}",
1265 chunk.symbol_name.to_lowercase(),
1266 split_name.to_lowercase(),
1267 chunk.name_path.to_lowercase(),
1268 chunk.signature.to_lowercase(),
1269 chunk.file_path.to_lowercase(),
1270 );
1271 let overlap = query_tokens
1272 .iter()
1273 .filter(|t| searchable.contains(**t))
1274 .count() as f64;
1275 let overlap_ratio = overlap / query_tokens.len().max(1) as f64;
1276 chunk.score = chunk.score * blend + overlap_ratio * (1.0 - blend);
1278 }
1279
1280 candidates.sort_by(|a, b| {
1281 b.score
1282 .partial_cmp(&a.score)
1283 .unwrap_or(std::cmp::Ordering::Equal)
1284 });
1285 candidates.truncate(max_results);
1286 Ok(candidates)
1287 }
1288
    /// Incrementally re-index only `changed_files`.
    ///
    /// Previously stored embeddings for the changed files are snapshotted
    /// first so that symbols whose embedding text is unchanged can be
    /// reinserted without re-running the model; everything else is embedded
    /// in batches. The model mutex is only taken once a batch actually needs
    /// it. Returns the total number of chunks written to the store.
    pub fn index_changed_files(
        &self,
        project: &ProjectRoot,
        changed_files: &[&str],
    ) -> Result<usize> {
        if changed_files.is_empty() {
            return Ok(0);
        }
        let batch_size = embed_batch_size();
        // Snapshot reusable embeddings before deleting the files' rows.
        let mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk> = HashMap::new();
        for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
            for chunk in self.store.embeddings_for_files(file_chunk)? {
                existing_embeddings.insert(reusable_embedding_key_for_chunk(&chunk), chunk);
            }
        }
        self.store.delete_by_file(changed_files)?;

        let db_path = crate::db::index_db_path(project.as_path());
        let symbol_db = IndexDb::open(&db_path)?;

        let mut total_indexed = 0usize;
        let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
        let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);
        let mut batch_reused: Vec<EmbeddingChunk> = Vec::with_capacity(batch_size);
        // Memoize file contents (None = unreadable) so each source file is
        // read at most once even when it contributes many symbols.
        let mut file_cache: std::collections::HashMap<String, Option<String>> =
            std::collections::HashMap::new();
        let mut model = None;

        for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
            let relevant = symbol_db.symbols_for_files(file_chunk)?;
            for sym in relevant {
                let source = file_cache.entry(sym.file_path.clone()).or_insert_with(|| {
                    std::fs::read_to_string(project.as_path().join(&sym.file_path)).ok()
                });
                if is_test_only_symbol(&sym, source.as_deref()) {
                    continue;
                }
                let text = build_embedding_text(&sym, source.as_deref());
                // Reuse path: identical text means the stored vector is valid.
                if let Some(existing) =
                    existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
                {
                    batch_reused.push(EmbeddingChunk {
                        file_path: sym.file_path.clone(),
                        symbol_name: sym.name.clone(),
                        kind: sym.kind.clone(),
                        line: sym.line as usize,
                        signature: sym.signature.clone(),
                        name_path: sym.name_path.clone(),
                        text,
                        embedding: existing.embedding,
                        doc_embedding: existing.doc_embedding,
                    });
                    if batch_reused.len() >= batch_size {
                        total_indexed += self.store.insert(&batch_reused)?;
                        batch_reused.clear();
                    }
                    continue;
                }
                batch_texts.push(text);
                batch_meta.push(sym);

                // Embed path: flush a full batch through the model.
                if batch_texts.len() >= batch_size {
                    if model.is_none() {
                        model = Some(
                            self.model
                                .lock()
                                .map_err(|_| anyhow::anyhow!("model lock"))?,
                        );
                    }
                    total_indexed += Self::flush_batch(
                        model.as_mut().expect("model lock initialized"),
                        &self.store,
                        &batch_texts,
                        &batch_meta,
                    )?;
                    batch_texts.clear();
                    batch_meta.clear();
                }
            }
        }

        // Flush whatever is left in either partial batch.
        if !batch_reused.is_empty() {
            total_indexed += self.store.insert(&batch_reused)?;
        }

        if !batch_texts.is_empty() {
            if model.is_none() {
                model = Some(
                    self.model
                        .lock()
                        .map_err(|_| anyhow::anyhow!("model lock"))?,
                );
            }
            total_indexed += Self::flush_batch(
                model.as_mut().expect("model lock initialized"),
                &self.store,
                &batch_texts,
                &batch_meta,
            )?;
        }

        Ok(total_indexed)
    }
1393
1394 pub fn is_indexed(&self) -> bool {
1396 self.store.count().unwrap_or(0) > 0
1397 }
1398
1399 pub fn index_info(&self) -> EmbeddingIndexInfo {
1400 EmbeddingIndexInfo {
1401 model_name: self.model_name.clone(),
1402 indexed_symbols: self.store.count().unwrap_or(0),
1403 }
1404 }
1405
    /// Report model name and symbol count for an on-disk embedding index
    /// without constructing an engine (and without loading the model).
    ///
    /// Returns `Ok(None)` when no index database exists or it records no
    /// model. The database is opened through the shared recovery helper so a
    /// corrupted file is handled the same way as elsewhere in the crate. A
    /// failing `COUNT(*)` degrades to `0` rather than erroring.
    pub fn inspect_existing_index(project: &ProjectRoot) -> Result<Option<EmbeddingIndexInfo>> {
        let db_path = project.as_path().join(".codelens/index/embeddings.db");
        if !db_path.exists() {
            return Ok(None);
        }

        let conn =
            crate::db::open_derived_sqlite_with_recovery(&db_path, "embedding index", || {
                // sqlite-vec must be registered before opening: the schema
                // references virtual tables provided by the extension.
                ffi::register_sqlite_vec()?;
                let conn = Connection::open(&db_path)?;
                conn.execute_batch("PRAGMA busy_timeout=5000;")?;
                // Probe the schema so corruption surfaces here, inside the
                // recovery wrapper, instead of on a later query.
                conn.query_row("PRAGMA schema_version", [], |_row| Ok(()))?;
                Ok(conn)
            })?;

        let model_name: Option<String> = conn
            .query_row(
                "SELECT value FROM meta WHERE key = 'model' LIMIT 1",
                [],
                |row| row.get(0),
            )
            .ok();
        let indexed_symbols: usize = conn
            .query_row("SELECT COUNT(*) FROM symbols", [], |row| {
                row.get::<_, i64>(0)
            })
            .map(|count| count.max(0) as usize)
            .unwrap_or(0);

        // No meta row means there is nothing meaningful to report.
        Ok(model_name.map(|model_name| EmbeddingIndexInfo {
            model_name,
            indexed_symbols,
        }))
    }
1440
1441 pub fn find_similar_code(
1445 &self,
1446 file_path: &str,
1447 symbol_name: &str,
1448 max_results: usize,
1449 ) -> Result<Vec<SemanticMatch>> {
1450 let target = self
1451 .store
1452 .get_embedding(file_path, symbol_name)?
1453 .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?;
1454
1455 let oversample = max_results.saturating_add(8).max(1);
1456 let scored = self
1457 .store
1458 .search(&target.embedding, oversample)?
1459 .into_iter()
1460 .filter(|c| !(c.file_path == file_path && c.symbol_name == symbol_name))
1461 .take(max_results)
1462 .map(SemanticMatch::from)
1463 .collect();
1464 Ok(scored)
1465 }
1466
    /// Find pairs of symbols whose cosine similarity exceeds `threshold`, up
    /// to `max_pairs` pairs, sorted most-similar first.
    ///
    /// Scans the store in batches; for each chunk it runs a nearest-neighbour
    /// search, bulk-fetches the candidates' full embeddings (cached across
    /// batches in `embedding_cache`), then scores exact cosine similarity.
    /// `done` short-circuits the remaining batches once `max_pairs` is
    /// reached, because the store callback cannot abort iteration itself.
    pub fn find_duplicates(&self, threshold: f64, max_pairs: usize) -> Result<Vec<DuplicatePair>> {
        let mut pairs = Vec::new();
        // Unordered (file, symbol) keys so A/B and B/A dedupe to one pair.
        let mut seen_pairs = HashSet::new();
        let mut embedding_cache: HashMap<StoredChunkKey, Arc<EmbeddingChunk>> = HashMap::new();
        let candidate_limit = duplicate_candidate_limit(max_pairs);
        let mut done = false;

        self.store
            .for_each_embedding_batch(DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, &mut |batch| {
                if done {
                    return Ok(());
                }

                // Phase 1: collect nearest-neighbour candidates per chunk and
                // note which candidate embeddings are not cached yet.
                let mut candidate_lists = Vec::with_capacity(batch.len());
                let mut missing_candidates = Vec::new();
                let mut missing_keys = HashSet::new();

                for chunk in &batch {
                    if pairs.len() >= max_pairs {
                        done = true;
                        break;
                    }

                    // Drop the chunk itself from its own candidate list.
                    let filtered: Vec<ScoredChunk> = self
                        .store
                        .search(&chunk.embedding, candidate_limit)?
                        .into_iter()
                        .filter(|candidate| {
                            !(chunk.file_path == candidate.file_path
                                && chunk.symbol_name == candidate.symbol_name
                                && chunk.line == candidate.line
                                && chunk.signature == candidate.signature
                                && chunk.name_path == candidate.name_path)
                        })
                        .collect();

                    for candidate in &filtered {
                        let cache_key = stored_chunk_key_for_score(candidate);
                        if !embedding_cache.contains_key(&cache_key)
                            && missing_keys.insert(cache_key)
                        {
                            missing_candidates.push(candidate.clone());
                        }
                    }

                    candidate_lists.push(filtered);
                }

                // Phase 2: one bulk fetch for all uncached candidate vectors.
                if !missing_candidates.is_empty() {
                    for candidate_chunk in self
                        .store
                        .embeddings_for_scored_chunks(&missing_candidates)?
                    {
                        embedding_cache
                            .entry(stored_chunk_key(&candidate_chunk))
                            .or_insert_with(|| Arc::new(candidate_chunk));
                    }
                }

                // Phase 3: score each (chunk, candidate) pair exactly.
                for (chunk, candidates) in batch.iter().zip(candidate_lists.iter()) {
                    if pairs.len() >= max_pairs {
                        done = true;
                        break;
                    }

                    for candidate in candidates {
                        let pair_key = duplicate_pair_key(
                            &chunk.file_path,
                            &chunk.symbol_name,
                            &candidate.file_path,
                            &candidate.symbol_name,
                        );
                        if !seen_pairs.insert(pair_key) {
                            continue;
                        }

                        // Cache miss (fetch failed/raced): skip this pair.
                        let Some(candidate_chunk) =
                            embedding_cache.get(&stored_chunk_key_for_score(candidate))
                        else {
                            continue;
                        };

                        let sim = cosine_similarity(&chunk.embedding, &candidate_chunk.embedding);
                        if sim < threshold {
                            continue;
                        }

                        pairs.push(DuplicatePair {
                            symbol_a: format!("{}:{}", chunk.file_path, chunk.symbol_name),
                            symbol_b: format!(
                                "{}:{}",
                                candidate_chunk.file_path, candidate_chunk.symbol_name
                            ),
                            file_a: chunk.file_path.clone(),
                            file_b: candidate_chunk.file_path.clone(),
                            line_a: chunk.line,
                            line_b: candidate_chunk.line,
                            similarity: sim,
                        });
                        if pairs.len() >= max_pairs {
                            done = true;
                            break;
                        }
                    }
                }
                Ok(())
            })?;

        pairs.sort_by(|a, b| {
            b.similarity
                .partial_cmp(&a.similarity)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        Ok(pairs)
    }
1584}
1585
/// Size of each nearest-neighbour candidate pool during duplicate scanning:
/// four candidates per requested pair, bounded to the 32..=128 range.
fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    let proportional = max_pairs.saturating_mul(4);
    proportional.max(32).min(128)
}
1589
/// Canonical, order-insensitive key for a duplicate pair: the two
/// (file, symbol) tuples sorted ascending, so (A, B) and (B, A) collide.
fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let mut endpoints = [
        (file_a.to_owned(), symbol_a.to_owned()),
        (file_b.to_owned(), symbol_b.to_owned()),
    ];
    endpoints.sort();
    let [low, high] = endpoints;
    (low, high)
}
1604
1605type StoredChunkKey = (String, String, usize, String, String);
1606
1607fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
1608 (
1609 chunk.file_path.clone(),
1610 chunk.symbol_name.clone(),
1611 chunk.line,
1612 chunk.signature.clone(),
1613 chunk.name_path.clone(),
1614 )
1615}
1616
1617fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
1618 (
1619 chunk.file_path.clone(),
1620 chunk.symbol_name.clone(),
1621 chunk.line,
1622 chunk.signature.clone(),
1623 chunk.name_path.clone(),
1624 )
1625}
1626
1627impl EmbeddingEngine {
1628 pub fn classify_symbol(
1630 &self,
1631 file_path: &str,
1632 symbol_name: &str,
1633 categories: &[&str],
1634 ) -> Result<Vec<CategoryScore>> {
1635 let target = match self.store.get_embedding(file_path, symbol_name)? {
1636 Some(target) => target,
1637 None => self
1638 .store
1639 .all_with_embeddings()?
1640 .into_iter()
1641 .find(|c| c.file_path == file_path && c.symbol_name == symbol_name)
1642 .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?,
1643 };
1644
1645 let embeddings = self.embed_texts_cached(categories)?;
1646
1647 let mut scores: Vec<CategoryScore> = categories
1648 .iter()
1649 .zip(embeddings.iter())
1650 .map(|(cat, emb)| CategoryScore {
1651 category: cat.to_string(),
1652 score: cosine_similarity(&target.embedding, emb),
1653 })
1654 .collect();
1655
1656 scores.sort_by(|a, b| {
1657 b.score
1658 .partial_cmp(&a.score)
1659 .unwrap_or(std::cmp::Ordering::Equal)
1660 });
1661 Ok(scores)
1662 }
1663
1664 pub fn find_misplaced_code(&self, max_results: usize) -> Result<Vec<OutlierSymbol>> {
1666 let mut outliers = Vec::new();
1667
1668 self.store
1669 .for_each_file_embeddings(&mut |file_path, chunks| {
1670 if chunks.len() < 2 {
1671 return Ok(());
1672 }
1673
1674 for (idx, chunk) in chunks.iter().enumerate() {
1675 let mut sim_sum = 0.0;
1676 let mut count = 0;
1677 for (other_idx, other_chunk) in chunks.iter().enumerate() {
1678 if other_idx == idx {
1679 continue;
1680 }
1681 sim_sum += cosine_similarity(&chunk.embedding, &other_chunk.embedding);
1682 count += 1;
1683 }
1684 if count > 0 {
1685 let avg_sim = sim_sum / count as f64; outliers.push(OutlierSymbol {
1687 file_path: file_path.clone(),
1688 symbol_name: chunk.symbol_name.clone(),
1689 kind: chunk.kind.clone(),
1690 line: chunk.line,
1691 avg_similarity_to_file: avg_sim,
1692 });
1693 }
1694 }
1695 Ok(())
1696 })?;
1697
1698 outliers.sort_by(|a, b| {
1699 a.avg_similarity_to_file
1700 .partial_cmp(&b.avg_similarity_to_file)
1701 .unwrap_or(std::cmp::Ordering::Equal)
1702 });
1703 outliers.truncate(max_results);
1704 Ok(outliers)
1705 }
1706}
1707
/// A pair of symbols whose embedding cosine similarity met or exceeded the
/// duplicate-detection threshold. Produced by `find_duplicates`.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicatePair {
    /// "file_path:symbol_name" label for the first symbol.
    pub symbol_a: String,
    /// "file_path:symbol_name" label for the second symbol.
    pub symbol_b: String,
    pub file_a: String,
    pub file_b: String,
    /// Source lines, mirroring `EmbeddingChunk::line`.
    pub line_a: usize,
    pub line_b: usize,
    /// Cosine similarity of the two embeddings (>= the caller's threshold).
    pub similarity: f64,
}
1720
/// Similarity of one candidate category label to a target symbol, produced
/// by `classify_symbol`.
#[derive(Debug, Clone, Serialize)]
pub struct CategoryScore {
    pub category: String,
    /// Cosine similarity between the symbol and the category embedding.
    pub score: f64,
}
1726
/// A symbol whose embedding is unusually dissimilar from the rest of its
/// file — a "misplaced code" candidate from `find_misplaced_code`.
#[derive(Debug, Clone, Serialize)]
pub struct OutlierSymbol {
    pub file_path: String,
    pub symbol_name: String,
    pub kind: String,
    pub line: usize,
    /// Mean cosine similarity to every other indexed symbol in the file.
    pub avg_similarity_to_file: f64,
}
1735
/// Cosine similarity between two equal-length vectors, as `f64`.
///
/// Accumulates the dot product and norms in `f64` rather than `f32`: running
/// `f32` sums lose precision (and can overflow to infinity) on long or
/// large-magnitude embedding vectors, which skews similarity rankings.
/// Returns 0.0 when either vector has zero norm, avoiding a division by zero.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());

    let (mut dot, mut norm_a, mut norm_b) = (0.0f64, 0.0f64, 0.0f64);
    for (&x, &y) in a.iter().zip(b.iter()) {
        let (x, y) = (f64::from(x), f64::from(y));
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let norm_a = norm_a.sqrt();
    let norm_b = norm_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        dot / (norm_a * norm_b)
    }
}
1760
1761fn split_identifier(name: &str) -> String {
1776 if !name.contains('_') && !name.chars().any(|c| c.is_uppercase()) {
1778 return name.to_string();
1779 }
1780 let mut words = Vec::new();
1781 let mut current = String::new();
1782 let chars: Vec<char> = name.chars().collect();
1783 for (i, &ch) in chars.iter().enumerate() {
1784 if ch == '_' {
1785 if !current.is_empty() {
1786 words.push(current.clone());
1787 current.clear();
1788 }
1789 } else if ch.is_uppercase()
1790 && !current.is_empty()
1791 && (current
1792 .chars()
1793 .last()
1794 .map(|c| c.is_lowercase())
1795 .unwrap_or(false)
1796 || chars.get(i + 1).map(|c| c.is_lowercase()).unwrap_or(false))
1797 {
1798 words.push(current.clone());
1800 current.clear();
1801 current.push(ch);
1802 } else {
1803 current.push(ch);
1804 }
1805 }
1806 if !current.is_empty() {
1807 words.push(current);
1808 }
1809 if words.len() <= 1 {
1810 return name.to_string(); }
1812 words.join(" ")
1813}
1814
1815fn is_test_only_symbol(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> bool {
1816 let fp = &sym.file_path;
1817
1818 if fp.contains("/tests/") || fp.ends_with("_tests.rs") {
1821 return true;
1822 }
1823 if fp.contains("/__tests__/") || fp.contains("\\__tests__\\") {
1825 return true;
1826 }
1827 if fp.ends_with("_test.py") {
1829 return true;
1830 }
1831 if fp.ends_with("_test.go") {
1833 return true;
1834 }
1835 if fp.ends_with(".test.ts")
1837 || fp.ends_with(".test.tsx")
1838 || fp.ends_with(".test.js")
1839 || fp.ends_with(".test.jsx")
1840 || fp.ends_with(".spec.ts")
1841 || fp.ends_with(".spec.js")
1842 {
1843 return true;
1844 }
1845 if fp.contains("/src/test/") {
1847 return true;
1848 }
1849 if fp.ends_with("Test.java") || fp.ends_with("Tests.java") {
1851 return true;
1852 }
1853 if fp.ends_with("_test.rb") || fp.contains("/spec/") {
1855 return true;
1856 }
1857
1858 if sym.name_path.starts_with("tests::")
1860 || sym.name_path.contains("::tests::")
1861 || sym.name_path.starts_with("test::")
1862 || sym.name_path.contains("::test::")
1863 {
1864 return true;
1865 }
1866
1867 let Some(source) = source else {
1868 return false;
1869 };
1870
1871 let start = usize::try_from(sym.start_byte.max(0))
1872 .unwrap_or(0)
1873 .min(source.len());
1874
1875 let window_start = start.saturating_sub(2048);
1877 let attrs = String::from_utf8_lossy(&source.as_bytes()[window_start..start]);
1878 if attrs.contains("#[test]")
1879 || attrs.contains("#[tokio::test]")
1880 || attrs.contains("#[cfg(test)]")
1881 || attrs.contains("#[cfg(all(test")
1882 {
1883 return true;
1884 }
1885
1886 if fp.ends_with(".py") {
1889 if sym.name.starts_with("test_") {
1890 return true;
1891 }
1892 if sym.kind == "class" && sym.name.starts_with("Test") {
1894 return true;
1895 }
1896 }
1897
1898 if fp.ends_with(".go") && sym.name.starts_with("Test") && sym.kind == "function" {
1902 return true;
1903 }
1904
1905 if fp.ends_with(".java") || fp.ends_with(".kt") {
1907 let before = &source[..start];
1908 let window = if before.len() > 200 {
1909 &before[before.len() - 200..]
1910 } else {
1911 before
1912 };
1913 if window.contains("@Test")
1914 || window.contains("@ParameterizedTest")
1915 || window.contains("@RepeatedTest")
1916 {
1917 return true;
1918 }
1919 }
1920
1921 false
1922}
1923
/// Assemble the text that gets embedded for a symbol.
///
/// The base is "<kind> <name (split words)> (in parent) [dir] in file[: sig]".
/// Unless `CODELENS_EMBED_DOCSTRINGS` is explicitly "0"/"false", a hint is
/// appended: the leading docstring if present, otherwise the first meaningful
/// body lines. Env-gated NL-comment tokens and API-call hints are appended
/// last. All offsets passed to the extractors come from the symbol index.
fn build_embedding_text(sym: &crate::db::SymbolWithFile, source: Option<&str>) -> String {
    // " in <filename>" — just the basename, not the whole path.
    let file_ctx = if sym.file_path.is_empty() {
        String::new()
    } else {
        let filename = sym.file_path.rsplit('/').next().unwrap_or(&sym.file_path);
        format!(" in {}", filename)
    };

    // Include the word-split form so camelCase/snake_case names match
    // natural-language queries.
    let split_name = split_identifier(&sym.name);
    let name_with_split = if split_name != sym.name {
        format!("{} ({})", sym.name, split_name)
    } else {
        sym.name.clone()
    };

    // " (in <parent>)" from the '/'-separated name path, when nested.
    let parent_ctx = if !sym.name_path.is_empty() && sym.name_path.contains('/') {
        let parent = sym.name_path.rsplit_once('/').map(|x| x.0).unwrap_or("");
        if parent.is_empty() {
            String::new()
        } else {
            format!(" (in {})", parent)
        }
    } else {
        String::new()
    };

    // " [<dir>]" — the immediate parent directory, skipping generic names.
    let module_ctx = if sym.file_path.contains('/') {
        let parts: Vec<&str> = sym.file_path.rsplitn(3, '/').collect();
        if parts.len() >= 2 {
            let dir = parts[1];
            if dir != "src" && dir != "crates" {
                format!(" [{dir}]")
            } else {
                String::new()
            }
        } else {
            String::new()
        }
    } else {
        String::new()
    };

    let base = if sym.signature.is_empty() {
        format!(
            "{} {}{}{}{}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx
        )
    } else {
        format!(
            "{} {}{}{}{}: {}",
            sym.kind, name_with_split, parent_ctx, module_ctx, file_ctx, sym.signature
        )
    };

    // Docstring/body hints are on by default; only an explicit "0"/"false"
    // disables them.
    let docstrings_disabled = std::env::var("CODELENS_EMBED_DOCSTRINGS")
        .map(|v| v == "0" || v == "false")
        .unwrap_or(false);

    if docstrings_disabled {
        return base;
    }

    let docstring = source
        .and_then(|src| extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize))
        .unwrap_or_default();

    let mut text = if docstring.is_empty() {
        // No docstring: fall back to the first meaningful body lines.
        let body_hint = source
            .and_then(|src| extract_body_hint(src, sym.start_byte as usize, sym.end_byte as usize))
            .unwrap_or_default();
        if body_hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, body_hint)
        }
    } else {
        // Docstring present: keep the first non-empty lines within budget.
        let line_budget = hint_line_budget();
        let lines: Vec<String> = docstring
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .take(line_budget)
            .map(str::to_string)
            .collect();
        let hint = join_hint_lines(&lines);
        if hint.is_empty() {
            base
        } else {
            format!("{} — {}", base, hint)
        }
    };

    // Optional: natural-language tokens mined from comments/string literals.
    if let Some(src) = source
        && let Some(nl_tokens) =
            extract_nl_tokens(src, sym.start_byte as usize, sym.end_byte as usize)
        && !nl_tokens.is_empty()
    {
        text.push_str(" · NL: ");
        text.push_str(&nl_tokens);
    }

    // Optional: names of APIs the symbol body calls.
    if let Some(src) = source
        && let Some(api_calls) =
            extract_api_calls(src, sym.start_byte as usize, sym.end_byte as usize)
        && !api_calls.is_empty()
    {
        text.push_str(" · API: ");
        text.push_str(&api_calls);
    }

    text
}
2063
/// Default character budget for the doc/body hint appended to embedding text.
const DEFAULT_HINT_TOTAL_CHAR_BUDGET: usize = 60;

/// Default number of doc/body lines folded into the hint.
const DEFAULT_HINT_LINES: usize = 1;
2081
2082fn hint_char_budget() -> usize {
2083 std::env::var("CODELENS_EMBED_HINT_CHARS")
2084 .ok()
2085 .and_then(|raw| raw.parse::<usize>().ok())
2086 .map(|n| n.clamp(60, 512))
2087 .unwrap_or(DEFAULT_HINT_TOTAL_CHAR_BUDGET)
2088}
2089
2090fn hint_line_budget() -> usize {
2091 std::env::var("CODELENS_EMBED_HINT_LINES")
2092 .ok()
2093 .and_then(|raw| raw.parse::<usize>().ok())
2094 .map(|n| n.clamp(1, 10))
2095 .unwrap_or(DEFAULT_HINT_LINES)
2096}
2097
2098fn join_hint_lines(lines: &[String]) -> String {
2105 if lines.is_empty() {
2106 return String::new();
2107 }
2108 let joined = lines
2109 .iter()
2110 .map(String::as_str)
2111 .collect::<Vec<_>>()
2112 .join(" · ");
2113 let budget = hint_char_budget();
2114 if joined.chars().count() > budget {
2115 let truncated: String = joined.chars().take(budget).collect();
2116 format!("{truncated}...")
2117 } else {
2118 joined
2119 }
2120}
2121
/// Pull the first few meaningful statements out of a symbol body to serve as
/// a semantic hint when no docstring is available.
///
/// `start`/`end` are byte offsets into `source`; both are clamped to char
/// boundaries (via `floor_char_boundary`) so slicing never panics. Lines are
/// skipped until the signature line (ending in `{` or `:`) has passed; after
/// that, blank lines, comment lines, and bare closing braces are ignored.
/// Returns `None` when nothing usable remains.
fn extract_body_hint(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    let safe_start = if source.is_char_boundary(start) {
        start
    } else {
        source.floor_char_boundary(start)
    };
    let safe_end = end.min(source.len());
    let safe_end = if source.is_char_boundary(safe_end) {
        safe_end
    } else {
        source.floor_char_boundary(safe_end)
    };
    let body = &source[safe_start..safe_end];

    let max_lines = hint_line_budget();
    let mut collected: Vec<String> = Vec::with_capacity(max_lines);

    // Skip everything up to and including the line that opens the body.
    let mut past_signature = false;
    for line in body.lines() {
        let trimmed = line.trim();
        if !past_signature {
            if trimmed.ends_with('{') || trimmed.ends_with(':') || trimmed == "{" {
                past_signature = true;
            }
            continue;
        }
        // Ignore noise: blanks, comments, and lone closing braces.
        if trimmed.is_empty()
            || trimmed.starts_with("//")
            || trimmed.starts_with('#')
            || trimmed.starts_with("/*")
            || trimmed.starts_with('*')
            || trimmed == "}"
        {
            continue;
        }
        collected.push(trimmed.to_string());
        if collected.len() >= max_lines {
            break;
        }
    }

    if collected.is_empty() {
        None
    } else {
        Some(join_hint_lines(&collected))
    }
}
2185
2186fn nl_tokens_enabled() -> bool {
2196 if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_COMMENTS") {
2197 return explicit;
2198 }
2199 auto_hint_should_enable()
2200}
2201
2202pub(super) fn auto_hint_mode_enabled() -> bool {
2244 parse_bool_env("CODELENS_EMBED_HINT_AUTO").unwrap_or(true)
2245}
2246
2247pub(super) fn auto_hint_lang() -> Option<String> {
2258 std::env::var("CODELENS_EMBED_HINT_AUTO_LANG")
2259 .ok()
2260 .map(|raw| raw.trim().to_ascii_lowercase())
2261}
2262
2263pub(super) fn language_supports_nl_stack(lang: &str) -> bool {
2299 matches!(
2300 lang.trim().to_ascii_lowercase().as_str(),
2301 "rs" | "rust"
2302 | "cpp"
2303 | "cc"
2304 | "cxx"
2305 | "c++"
2306 | "c"
2307 | "go"
2308 | "golang"
2309 | "java"
2310 | "kt"
2311 | "kotlin"
2312 | "scala"
2313 | "cs"
2314 | "csharp"
2315 | "ts"
2316 | "typescript"
2317 | "tsx"
2318 | "js"
2319 | "javascript"
2320 | "jsx"
2321 )
2322}
2323
2324pub(super) fn language_supports_sparse_weighting(lang: &str) -> bool {
2342 matches!(
2343 lang.trim().to_ascii_lowercase().as_str(),
2344 "rs" | "rust"
2345 | "cpp"
2346 | "cc"
2347 | "cxx"
2348 | "c++"
2349 | "c"
2350 | "go"
2351 | "golang"
2352 | "java"
2353 | "kt"
2354 | "kotlin"
2355 | "scala"
2356 | "cs"
2357 | "csharp"
2358 )
2359}
2360
2361pub(super) fn auto_hint_should_enable() -> bool {
2366 if !auto_hint_mode_enabled() {
2367 return false;
2368 }
2369 match auto_hint_lang() {
2370 Some(lang) => language_supports_nl_stack(&lang),
2371 None => false, }
2373}
2374
2375pub(super) fn auto_sparse_should_enable() -> bool {
2382 if !auto_hint_mode_enabled() {
2383 return false;
2384 }
2385 match auto_hint_lang() {
2386 Some(lang) => language_supports_sparse_weighting(&lang),
2387 None => false,
2388 }
2389}
2390
2391pub(super) fn is_nl_shaped(s: &str) -> bool {
2400 let s = s.trim();
2401 if s.chars().count() < 4 {
2402 return false;
2403 }
2404 if s.contains('/') || s.contains('\\') || s.contains("::") {
2405 return false;
2406 }
2407 if !s.contains(' ') {
2408 return false;
2409 }
2410 let non_ws: usize = s.chars().filter(|c| !c.is_whitespace()).count();
2411 if non_ws == 0 {
2412 return false;
2413 }
2414 let alpha: usize = s.chars().filter(|c| c.is_alphabetic()).count();
2415 (alpha * 100) / non_ws >= 60
2416}
2417
/// True when `CODELENS_EMBED_HINT_STRICT_COMMENTS` is set to a truthy value
/// ("1"/"true"/"yes"/"on", case-insensitive). Unset or anything else: false.
fn strict_comments_enabled() -> bool {
    match std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS") {
        Ok(raw) => matches!(
            raw.to_ascii_lowercase().as_str(),
            "1" | "true" | "yes" | "on"
        ),
        Err(_) => false,
    }
}
2440
2441pub(super) fn looks_like_meta_annotation(body: &str) -> bool {
2462 let trimmed = body.trim_start();
2463 let word_end = trimmed
2466 .find(|c: char| !c.is_ascii_alphabetic())
2467 .unwrap_or(trimmed.len());
2468 if word_end == 0 {
2469 return false;
2470 }
2471 let first_word = &trimmed[..word_end];
2472 let upper = first_word.to_ascii_uppercase();
2473 matches!(
2474 upper.as_str(),
2475 "TODO"
2476 | "FIXME"
2477 | "HACK"
2478 | "XXX"
2479 | "BUG"
2480 | "REVIEW"
2481 | "REFACTOR"
2482 | "TEMP"
2483 | "TEMPORARY"
2484 | "DEPRECATED"
2485 )
2486}
2487
/// True when `CODELENS_EMBED_HINT_STRICT_LITERALS` is set to a truthy value
/// ("1"/"true"/"yes"/"on", case-insensitive). Unset or anything else: false.
fn strict_literal_filter_enabled() -> bool {
    match std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS") {
        Ok(raw) => matches!(
            raw.to_ascii_lowercase().as_str(),
            "1" | "true" | "yes" | "on"
        ),
        Err(_) => false,
    }
}
2510
2511pub(super) fn contains_format_specifier(s: &str) -> bool {
2523 let bytes = s.as_bytes();
2524 let len = bytes.len();
2525 let mut i = 0;
2526 while i + 1 < len {
2527 if bytes[i] == b'%' {
2528 let next = bytes[i + 1];
2529 if matches!(next, b's' | b'd' | b'r' | b'f' | b'x' | b'o' | b'i' | b'u') {
2530 return true;
2531 }
2532 }
2533 i += 1;
2534 }
2535 for window in s.split('{').skip(1) {
2543 let Some(close_idx) = window.find('}') else {
2544 continue;
2545 };
2546 let inside = &window[..close_idx];
2547 if inside.is_empty() {
2549 return true;
2550 }
2551 if inside.chars().any(|c| c.is_whitespace()) {
2553 continue;
2554 }
2555 if inside.starts_with(':') {
2557 return true;
2558 }
2559 let ident_end = inside.find(':').unwrap_or(inside.len());
2563 let ident = &inside[..ident_end];
2564 if !ident.is_empty()
2565 && ident
2566 .chars()
2567 .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '.')
2568 {
2569 return true;
2570 }
2571 }
2572 false
2573}
2574
2575pub(super) fn looks_like_error_or_log_prefix(s: &str) -> bool {
2586 let lower = s.trim().to_lowercase();
2587 const PREFIXES: &[&str] = &[
2588 "invalid ",
2589 "cannot ",
2590 "could not ",
2591 "unable to ",
2592 "failed to ",
2593 "expected ",
2594 "unexpected ",
2595 "missing ",
2596 "not found",
2597 "error: ",
2598 "error ",
2599 "warning: ",
2600 "warning ",
2601 "sending ",
2602 "received ",
2603 "starting ",
2604 "stopping ",
2605 "calling ",
2606 "connecting ",
2607 "disconnecting ",
2608 ];
2609 PREFIXES.iter().any(|p| lower.starts_with(p))
2610}
2611
/// Test-only helper combining both strict literal filters.
#[cfg(test)]
pub(super) fn should_reject_literal_strict(s: &str) -> bool {
    if contains_format_specifier(s) {
        return true;
    }
    looks_like_error_or_log_prefix(s)
}
2620
2621fn extract_nl_tokens(source: &str, start: usize, end: usize) -> Option<String> {
2635 if !nl_tokens_enabled() {
2636 return None;
2637 }
2638 extract_nl_tokens_inner(source, start, end)
2639}
2640
/// Collect natural-language tokens (comments and string literals) from a
/// symbol body, without consulting the env-var gate (that lives in
/// `extract_nl_tokens`).
///
/// Byte offsets are clamped to char boundaries so slicing cannot panic. Two
/// passes over the body: (1) line comments whose text is prose-shaped; (2) a
/// small scanner over `"`-delimited string literals (honoring `\` escapes)
/// keeping the prose-shaped ones. Strictness filters for both passes are
/// env-controlled. Returns `None` when nothing qualifies.
pub(super) fn extract_nl_tokens_inner(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    let safe_start = if source.is_char_boundary(start) {
        start
    } else {
        source.floor_char_boundary(start)
    };
    let safe_end = end.min(source.len());
    let safe_end = if source.is_char_boundary(safe_end) {
        safe_end
    } else {
        source.floor_char_boundary(safe_end)
    };
    let body = &source[safe_start..safe_end];

    let mut tokens: Vec<String> = Vec::new();

    // Pass 1: prose-shaped comment lines (optionally dropping TODO/FIXME
    // style meta annotations in strict mode).
    let strict_comments = strict_comments_enabled();
    for line in body.lines() {
        let trimmed = line.trim();
        if let Some(cleaned) = extract_comment_body(trimmed)
            && is_nl_shaped(&cleaned)
            && (!strict_comments || !looks_like_meta_annotation(&cleaned))
        {
            tokens.push(cleaned);
        }
    }

    // Pass 2: prose-shaped string literals. Minimal state machine: toggle on
    // unescaped '"', skip the char after a backslash while inside a string.
    let strict_literals = strict_literal_filter_enabled();
    let mut chars = body.chars().peekable();
    let mut in_string = false;
    let mut current = String::new();
    while let Some(c) = chars.next() {
        if in_string {
            if c == '\\' {
                // Consume the escaped character without recording it.
                let _ = chars.next();
            } else if c == '"' {
                // Closing quote: keep the literal if it reads like prose and
                // (in strict mode) is not a format/log template.
                if is_nl_shaped(&current)
                    && (!strict_literals
                        || (!contains_format_specifier(&current)
                            && !looks_like_error_or_log_prefix(&current)))
                {
                    tokens.push(current.clone());
                }
                current.clear();
                in_string = false;
            } else {
                current.push(c);
            }
        } else if c == '"' {
            in_string = true;
        }
    }

    if tokens.is_empty() {
        return None;
    }
    Some(join_hint_lines(&tokens))
}
2723
2724fn api_calls_enabled() -> bool {
2733 if let Some(explicit) = parse_bool_env("CODELENS_EMBED_HINT_INCLUDE_API_CALLS") {
2734 return explicit;
2735 }
2736 auto_hint_should_enable()
2737}
2738
2739pub(super) fn is_static_method_ident(ident: &str) -> bool {
2749 ident.chars().next().is_some_and(|c| c.is_ascii_uppercase())
2750}
2751
2752fn extract_api_calls(source: &str, start: usize, end: usize) -> Option<String> {
2764 if !api_calls_enabled() {
2765 return None;
2766 }
2767 extract_api_calls_inner(source, start, end)
2768}
2769
/// Scan `source[start..end]` for `Type::method` call sites and return the
/// de-duplicated, first-seen-order list joined into one hint string
/// (`None` when the span is invalid or nothing is found).
///
/// Operates on raw bytes: identifiers here are ASCII `[A-Za-z0-9_]`, so
/// byte tests are sufficient and any multi-byte UTF-8 char simply fails the
/// identifier test and is skipped.
pub(super) fn extract_api_calls_inner(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Clamp both offsets to char boundaries so slicing cannot panic.
    let safe_start = if source.is_char_boundary(start) {
        start
    } else {
        source.floor_char_boundary(start)
    };
    let safe_end = end.min(source.len());
    let safe_end = if source.is_char_boundary(safe_end) {
        safe_end
    } else {
        source.floor_char_boundary(safe_end)
    };
    if safe_start >= safe_end {
        return None;
    }
    let body = &source[safe_start..safe_end];
    let bytes = body.as_bytes();
    let len = bytes.len();

    // `calls` preserves first-seen order; `seen` de-duplicates.
    let mut calls: Vec<String> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    let mut i = 0usize;
    while i < len {
        let b = bytes[i];
        // Advance to the next identifier start ([A-Za-z_]).
        if !(b == b'_' || b.is_ascii_alphabetic()) {
            i += 1;
            continue;
        }
        let ident_start = i;
        while i < len {
            let bb = bytes[i];
            if bb == b'_' || bb.is_ascii_alphanumeric() {
                i += 1;
            } else {
                break;
            }
        }
        let ident_end = i;

        // Require a literal `::` immediately after the identifier. `i` is
        // already past the identifier, so `continue` cannot loop forever.
        if i + 1 >= len || bytes[i] != b':' || bytes[i + 1] != b':' {
            continue;
        }

        // Left side must look like a type name (leading ASCII uppercase);
        // otherwise skip the `::` and keep scanning.
        let type_ident = &body[ident_start..ident_end];
        if !is_static_method_ident(type_ident) {
            i += 2;
            continue;
        }

        // Right side must be an identifier as well.
        let mut j = i + 2;
        if j >= len || !(bytes[j] == b'_' || bytes[j].is_ascii_alphabetic()) {
            i = j;
            continue;
        }
        let method_start = j;
        while j < len {
            let bb = bytes[j];
            if bb == b'_' || bb.is_ascii_alphanumeric() {
                j += 1;
            } else {
                break;
            }
        }
        let method_end = j;

        let method_ident = &body[method_start..method_end];
        let call = format!("{type_ident}::{method_ident}");
        // insert() returns true only for a new entry.
        if seen.insert(call.clone()) {
            calls.push(call);
        }
        i = j;
    }

    if calls.is_empty() {
        return None;
    }
    Some(join_hint_lines(&calls))
}
2870
/// Strip the comment marker from an already-trimmed line and return the
/// remaining comment text, or `None` when the line is not a comment (or is a
/// Rust attribute / shebang, which carry no prose).
fn extract_comment_body(trimmed: &str) -> Option<String> {
    if trimmed.is_empty() {
        return None;
    }
    // Order matters: the doc markers are string-prefixes of the plain ones.
    for marker in ["///", "//!", "//"] {
        if let Some(rest) = trimmed.strip_prefix(marker) {
            return Some(rest.trim().to_string());
        }
    }
    // `#[attr]` and `#!/shebang` are not comments.
    if trimmed.starts_with("#[") || trimmed.starts_with("#!") {
        return None;
    }
    if let Some(rest) = trimmed.strip_prefix('#') {
        return Some(rest.trim().to_string());
    }
    for marker in ["/**", "/*"] {
        if let Some(rest) = trimmed.strip_prefix(marker) {
            return Some(rest.trim_end_matches("*/").trim().to_string());
        }
    }
    // `* ...` continuation line inside a block comment; reject lines that
    // look like code rather than prose.
    if let Some(rest) = trimmed.strip_prefix('*') {
        let rest = rest.trim_end_matches("*/").trim();
        if rest.is_empty() || rest.contains(';') || rest.contains('{') {
            return None;
        }
        return Some(rest.to_string());
    }
    None
}
2917
/// Pull the leading documentation block that appears on the lines
/// immediately below the definition line in `source[start..end]`.
///
/// Supports Python triple-quoted docstrings, Rust `///` / `//!` doc
/// comments, JSDoc/Javadoc `/** ... */` blocks, and plain `//` / `#`
/// comment runs. Returns the collected lines joined with single spaces, or
/// `None` for an invalid span or when no documentation is present.
///
/// Fixes: a one-line Python docstring (`"""doc"""`) previously never
/// satisfied the `doc_lines.len() > 1` break condition, so subsequent *code*
/// lines leaked into the extracted doc.
fn extract_leading_doc(source: &str, start: usize, end: usize) -> Option<String> {
    if start >= source.len() || end > source.len() || start >= end {
        return None;
    }
    // Clamp both offsets down to char boundaries so slicing cannot panic on
    // multi-byte UTF-8 (stable equivalent of `floor_char_boundary`).
    let mut safe_start = start;
    while safe_start > 0 && !source.is_char_boundary(safe_start) {
        safe_start -= 1;
    }
    let mut safe_end = end.min(source.len());
    while safe_end > 0 && !source.is_char_boundary(safe_end) {
        safe_end -= 1;
    }
    if safe_start >= safe_end {
        return None;
    }
    let body = &source[safe_start..safe_end];
    // Skip the definition line itself; docs start on the following line.
    let lines: Vec<&str> = body.lines().skip(1).collect();
    if lines.is_empty() {
        return None;
    }

    let mut doc_lines = Vec::new();

    let first_trimmed = lines.first().map(|l| l.trim()).unwrap_or_default();
    if first_trimmed.starts_with("\"\"\"") || first_trimmed.starts_with("'''") {
        // Python docstring: collect until the closing triple quote.
        let quote = &first_trimmed[..3];
        for (idx, line) in lines.iter().enumerate() {
            let t = line.trim();
            doc_lines.push(t.trim_start_matches(quote).trim_end_matches(quote));
            // The docstring closes here when the line ends with the quote
            // marker — except on the opening line, where a short token such
            // as a bare `"""` is only the opener. `len() >= 6` means the
            // opening line holds both opener and closer (one-line docstring).
            if t.ends_with(quote) && (idx > 0 || t.len() >= 6) {
                break;
            }
        }
    } else if first_trimmed.starts_with("///") || first_trimmed.starts_with("//!") {
        // Rust doc comments: take the contiguous run.
        for line in &lines {
            let t = line.trim();
            if t.starts_with("///") || t.starts_with("//!") {
                doc_lines.push(t.trim_start_matches("///").trim_start_matches("//!").trim());
            } else {
                break;
            }
        }
    } else if first_trimmed.starts_with("/**") {
        // JSDoc/Javadoc block: strip frame characters, stop at `*/`.
        for line in &lines {
            let t = line.trim();
            let cleaned = t
                .trim_start_matches("/**")
                .trim_start_matches('*')
                .trim_end_matches("*/")
                .trim();
            if !cleaned.is_empty() {
                doc_lines.push(cleaned);
            }
            if t.ends_with("*/") {
                break;
            }
        }
    } else {
        // Plain `//` or `#` comment run.
        for line in &lines {
            let t = line.trim();
            if t.starts_with("//") || t.starts_with('#') {
                doc_lines.push(t.trim_start_matches("//").trim_start_matches('#').trim());
            } else {
                break;
            }
        }
    }

    if doc_lines.is_empty() {
        return None;
    }
    Some(doc_lines.join(" ").trim().to_owned())
}
3004
3005pub(super) fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
3006 embedding.iter().flat_map(|f| f.to_le_bytes()).collect()
3007}
3008
3009#[cfg(test)]
3010mod tests {
3011 use super::*;
3012 use crate::db::{IndexDb, NewSymbol};
3013 use std::sync::Mutex;
3014
    // Serializes tests that share the (heavy) embedding model.
    static MODEL_LOCK: Mutex<()> = Mutex::new(());

    // Serializes tests that mutate process-wide environment variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    // Early-returns from a test when the embedding model assets are not
    // available locally (e.g. offline CI) instead of failing it.
    macro_rules! skip_without_embedding_model {
        () => {
            if !super::embedding_model_assets_available() {
                eprintln!("skipping embedding test: CodeSearchNet model assets unavailable");
                return;
            }
        };
    }
3034
    // Builds a temp project containing one indexed Python file with two
    // functions; returns the TempDir guard (keeps the dir alive) and the
    // ProjectRoot pointing at it.
    fn make_project_with_source() -> (tempfile::TempDir, ProjectRoot) {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        let source = "def hello():\n    print('hi')\n\ndef world():\n    return 42\n";
        write_python_file_with_symbols(
            root,
            "main.py",
            source,
            "hash1",
            &[
                ("hello", "def hello():", "hello"),
                ("world", "def world():", "world"),
            ],
        );

        let project = ProjectRoot::new_exact(root).unwrap();
        (dir, project)
    }
3056
    // Writes `source` under `root/relative_path` and registers the file plus
    // the given (name, signature, name_path) symbols in a fresh index DB.
    fn write_python_file_with_symbols(
        root: &std::path::Path,
        relative_path: &str,
        source: &str,
        hash: &str,
        symbols: &[(&str, &str, &str)],
    ) {
        std::fs::write(root.join(relative_path), source).unwrap();
        let db_path = crate::db::index_db_path(root);
        let db = IndexDb::open(&db_path).unwrap();
        let file_id = db
            .upsert_file(relative_path, 100, hash, source.len() as i64, Some("py"))
            .unwrap();

        let new_symbols: Vec<NewSymbol<'_>> = symbols
            .iter()
            .map(|(name, signature, name_path)| {
                // Byte span: from the signature to the next "\n\ndef " (or EOF).
                let start = source.find(signature).unwrap() as i64;
                let end = source[start as usize..]
                    .find("\n\ndef ")
                    .map(|offset| start + offset as i64)
                    .unwrap_or(source.len() as i64);
                // 1-based line number = newlines before `start` + 1.
                let line = source[..start as usize]
                    .bytes()
                    .filter(|&b| b == b'\n')
                    .count() as i64
                    + 1;
                NewSymbol {
                    name,
                    kind: "function",
                    line,
                    column_num: 0,
                    start_byte: start,
                    end_byte: end,
                    signature,
                    name_path,
                    parent_id: None,
                }
            })
            .collect();
        db.insert_symbols(file_id, &new_symbols).unwrap();
    }
3099
    // Overwrites the stored embeddings of selected symbols in `file_path`
    // with constant sentinel vectors (same dimension) so that similarity
    // scores become predictable in tests.
    fn replace_file_embeddings_with_sentinels(
        engine: &EmbeddingEngine,
        file_path: &str,
        sentinels: &[(&str, f32)],
    ) {
        let mut chunks = engine.store.embeddings_for_files(&[file_path]).unwrap();
        for chunk in &mut chunks {
            if let Some((_, value)) = sentinels
                .iter()
                .find(|(symbol_name, _)| *symbol_name == chunk.symbol_name)
            {
                chunk.embedding = vec![*value; chunk.embedding.len()];
            }
        }
        // Re-insert the whole file's chunks with the sentinel values applied.
        engine.store.delete_by_file(&[file_path]).unwrap();
        engine.store.insert(&chunks).unwrap();
    }
3117
    // With source available, the embedding text is "kind name in file: signature".
    #[test]
    fn build_embedding_text_with_signature() {
        let sym = crate::db::SymbolWithFile {
            name: "hello".into(),
            kind: "function".into(),
            file_path: "main.py".into(),
            line: 1,
            signature: "def hello():".into(),
            name_path: "hello".into(),
            start_byte: 0,
            end_byte: 10,
        };
        let text = build_embedding_text(&sym, Some("def hello(): pass"));
        assert_eq!(text, "function hello in main.py: def hello():");
    }
3133
    // Without source, CamelCase names additionally get a split-word alias
    // ("My Class") in the embedding text.
    #[test]
    fn build_embedding_text_without_source() {
        let sym = crate::db::SymbolWithFile {
            name: "MyClass".into(),
            kind: "class".into(),
            file_path: "app.py".into(),
            line: 5,
            signature: "class MyClass:".into(),
            name_path: "MyClass".into(),
            start_byte: 0,
            end_byte: 50,
        };
        let text = build_embedding_text(&sym, None);
        assert_eq!(text, "class MyClass (My Class) in app.py: class MyClass:");
    }
3149
    // An empty signature drops the trailing ": <signature>" segment.
    #[test]
    fn build_embedding_text_empty_signature() {
        let sym = crate::db::SymbolWithFile {
            name: "CONFIG".into(),
            kind: "variable".into(),
            file_path: "config.py".into(),
            line: 1,
            signature: String::new(),
            name_path: "CONFIG".into(),
            start_byte: 0,
            end_byte: 0,
        };
        let text = build_embedding_text(&sym, None);
        assert_eq!(text, "variable CONFIG in config.py");
    }
3165
    // A function annotated with #[test] is classified as test-only and
    // excluded from the embedding index.
    #[test]
    fn filters_direct_test_symbols_from_embedding_index() {
        let source = "#[test]\nfn alias_case() {}\n";
        let sym = crate::db::SymbolWithFile {
            name: "alias_case".into(),
            kind: "function".into(),
            file_path: "src/lib.rs".into(),
            line: 2,
            signature: "fn alias_case() {}".into(),
            name_path: "alias_case".into(),
            start_byte: source.find("fn alias_case").unwrap() as i64,
            end_byte: source.len() as i64,
        };

        assert!(is_test_only_symbol(&sym, Some(source)));
    }
3182
    // A symbol inside a #[cfg(...test...)] module is also classified as
    // test-only, even when the attribute is a compound cfg expression.
    #[test]
    fn filters_cfg_test_module_symbols_from_embedding_index() {
        let source = "#[cfg(all(test, feature = \"semantic\"))]\nmod semantic_tests {\n    fn helper_case() {}\n}\n";
        let sym = crate::db::SymbolWithFile {
            name: "helper_case".into(),
            kind: "function".into(),
            file_path: "src/lib.rs".into(),
            line: 3,
            signature: "fn helper_case() {}".into(),
            name_path: "helper_case".into(),
            start_byte: source.find("fn helper_case").unwrap() as i64,
            end_byte: source.len() as i64,
        };

        assert!(is_test_only_symbol(&sym, Some(source)));
    }
3199
    // A Python docstring below the def line is picked up by extract_leading_doc.
    #[test]
    fn extract_python_docstring() {
        let source =
            "def greet(name):\n    \"\"\"Say hello to a person.\"\"\"\n    print(f'hi {name}')\n";
        let doc = extract_leading_doc(source, 0, source.len()).unwrap();
        assert!(doc.contains("Say hello to a person"));
    }
3207
    // A contiguous run of /// comments is joined into one doc string.
    #[test]
    fn extract_rust_doc_comment() {
        let source = "fn dispatch_tool() {\n    /// Route incoming tool requests.\n    /// Handles all MCP methods.\n    let x = 1;\n}\n";
        let doc = extract_leading_doc(source, 0, source.len()).unwrap();
        assert!(doc.contains("Route incoming tool requests"));
        assert!(doc.contains("Handles all MCP methods"));
    }
3215
    // No documentation below the definition line yields None.
    #[test]
    fn extract_leading_doc_returns_none_for_no_doc() {
        let source = "def f():\n    return 1\n";
        assert!(extract_leading_doc(source, 0, source.len()).is_none());
    }
3221
    // The body hint skips the (multi-line) signature and surfaces the first
    // meaningful statement of the body.
    #[test]
    fn extract_body_hint_finds_first_meaningful_line() {
        let source = "pub fn parse_symbols(\n    project: &ProjectRoot,\n) -> Vec<SymbolInfo> {\n    let mut parser = tree_sitter::Parser::new();\n    parser.set_language(lang);\n}\n";
        let hint = extract_body_hint(source, 0, source.len());
        assert!(hint.is_some());
        assert!(hint.unwrap().contains("tree_sitter::Parser"));
    }
3229
    // Comment lines are not eligible as the body hint.
    #[test]
    fn extract_body_hint_skips_comments() {
        let source = "fn foo() {\n    // setup\n    let x = bar();\n}\n";
        let hint = extract_body_hint(source, 0, source.len());
        assert_eq!(hint.unwrap(), "let x = bar();");
    }
3236
    // An empty function body produces no hint.
    #[test]
    fn extract_body_hint_returns_none_for_empty() {
        let source = "fn empty() {\n}\n";
        let hint = extract_body_hint(source, 0, source.len());
        assert!(hint.is_none());
    }
3243
3244 #[test]
3245 fn extract_body_hint_multi_line_collection_via_env_override() {
3246 let previous_lines = std::env::var("CODELENS_EMBED_HINT_LINES").ok();
3251 let previous_chars = std::env::var("CODELENS_EMBED_HINT_CHARS").ok();
3252 unsafe {
3253 std::env::set_var("CODELENS_EMBED_HINT_LINES", "3");
3254 std::env::set_var("CODELENS_EMBED_HINT_CHARS", "200");
3255 }
3256
3257 let source = "\
3258fn route_request() {
3259 let kind = detect_request_kind();
3260 let target = dispatch_table.get(&kind);
3261 return target.handle();
3262}
3263";
3264 let hint = extract_body_hint(source, 0, source.len()).expect("hint present");
3265
3266 let env_restore = || unsafe {
3267 match &previous_lines {
3268 Some(value) => std::env::set_var("CODELENS_EMBED_HINT_LINES", value),
3269 None => std::env::remove_var("CODELENS_EMBED_HINT_LINES"),
3270 }
3271 match &previous_chars {
3272 Some(value) => std::env::set_var("CODELENS_EMBED_HINT_CHARS", value),
3273 None => std::env::remove_var("CODELENS_EMBED_HINT_CHARS"),
3274 }
3275 };
3276
3277 let all_three = hint.contains("detect_request_kind")
3278 && hint.contains("dispatch_table")
3279 && hint.contains("target.handle");
3280 let has_separator = hint.contains(" · ");
3281 env_restore();
3282
3283 assert!(all_three, "missing one of three body lines: {hint}");
3284 assert!(has_separator, "missing · separator: {hint}");
3285 }
3286
3287 #[test]
3298 fn hint_line_budget_respects_env_override() {
3299 let previous = std::env::var("CODELENS_EMBED_HINT_LINES").ok();
3302 unsafe {
3303 std::env::set_var("CODELENS_EMBED_HINT_LINES", "5");
3304 }
3305 let budget = super::hint_line_budget();
3306 unsafe {
3307 match previous {
3308 Some(value) => std::env::set_var("CODELENS_EMBED_HINT_LINES", value),
3309 None => std::env::remove_var("CODELENS_EMBED_HINT_LINES"),
3310 }
3311 }
3312 assert_eq!(budget, 5);
3313 }
3314
    // Multi-word prose passes the natural-language shape check.
    #[test]
    fn is_nl_shaped_accepts_multi_word_prose() {
        assert!(super::is_nl_shaped("skip comments and string literals"));
        assert!(super::is_nl_shaped("failed to open database"));
        assert!(super::is_nl_shaped("detect client version"));
    }
3321
    // Paths, qualified identifiers, snake_case, too-short strings, and
    // all-numeric strings fail the natural-language shape check.
    #[test]
    fn is_nl_shaped_rejects_code_and_paths() {
        assert!(!super::is_nl_shaped("crates/codelens-engine/src"));
        assert!(!super::is_nl_shaped("C:\\Users\\foo"));
        assert!(!super::is_nl_shaped("std::sync::Mutex"));
        assert!(!super::is_nl_shaped("detect_client"));
        assert!(!super::is_nl_shaped("ok"));
        assert!(!super::is_nl_shaped(""));
        assert!(!super::is_nl_shaped("1 2 3 4 5"));
    }
3337
    // Each supported comment marker is stripped, leaving trimmed prose.
    #[test]
    fn extract_comment_body_strips_comment_markers() {
        assert_eq!(
            super::extract_comment_body("/// rust doc comment"),
            Some("rust doc comment".to_string())
        );
        assert_eq!(
            super::extract_comment_body("// regular line comment"),
            Some("regular line comment".to_string())
        );
        assert_eq!(
            super::extract_comment_body("# python line comment"),
            Some("python line comment".to_string())
        );
        assert_eq!(
            super::extract_comment_body("/* inline block */"),
            Some("inline block".to_string())
        );
        assert_eq!(
            super::extract_comment_body("* continuation line"),
            Some("continuation line".to_string())
        );
    }
3361
    // Attributes and shebang lines are not treated as comments.
    #[test]
    fn extract_comment_body_rejects_rust_attributes_and_shebangs() {
        assert!(super::extract_comment_body("#[derive(Debug)]").is_none());
        assert!(super::extract_comment_body("#[test]").is_none());
        assert!(super::extract_comment_body("#!/usr/bin/env python").is_none());
    }
3368
    // With the explicit comment-hint env var unset, extract_nl_tokens stays
    // gated off and returns None.
    #[test]
    fn extract_nl_tokens_gated_off_by_default() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS").ok();
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS");
        }
        let source = "\
fn skip_things() {
    // skip comments and string literals during search
    let lit = \"scan for matching tokens\";
}
";
        let result = extract_nl_tokens(source, 0, source.len());
        unsafe {
            // Only restore when there was a previous value; otherwise the
            // removed state already matches the original environment.
            if let Some(value) = previous {
                std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", value);
            }
        }
        assert!(result.is_none(), "gate leaked: {result:?}");
    }
3391
    // Auto-hint mode defaults ON when the env var is unset, while explicit
    // "0" / "1" values still override in both directions.
    #[test]
    fn auto_hint_mode_defaults_on_unless_explicit_off() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();

        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO");
        }
        let default_enabled = super::auto_hint_mode_enabled();
        assert!(
            default_enabled,
            "v1.6.0 default flip: auto hint mode should be ON when env unset"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "0");
        }
        let explicit_off = super::auto_hint_mode_enabled();
        assert!(
            !explicit_off,
            "explicit CODELENS_EMBED_HINT_AUTO=0 must still disable (opt-out escape hatch)"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
        }
        let explicit_on = super::auto_hint_mode_enabled();
        assert!(
            explicit_on,
            "explicit CODELENS_EMBED_HINT_AUTO=1 must still enable"
        );

        unsafe {
            match previous {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
        }
    }
3442
    // The NL-stack allow-list accepts curly-brace languages (with aliases,
    // mixed case, surrounding whitespace) and rejects everything else.
    #[test]
    fn language_supports_nl_stack_classifies_correctly() {
        assert!(super::language_supports_nl_stack("rs"));
        assert!(super::language_supports_nl_stack("rust"));
        assert!(super::language_supports_nl_stack("cpp"));
        assert!(super::language_supports_nl_stack("c++"));
        assert!(super::language_supports_nl_stack("c"));
        assert!(super::language_supports_nl_stack("go"));
        assert!(super::language_supports_nl_stack("golang"));
        assert!(super::language_supports_nl_stack("java"));
        assert!(super::language_supports_nl_stack("kt"));
        assert!(super::language_supports_nl_stack("kotlin"));
        assert!(super::language_supports_nl_stack("scala"));
        assert!(super::language_supports_nl_stack("cs"));
        assert!(super::language_supports_nl_stack("csharp"));
        assert!(super::language_supports_nl_stack("ts"));
        assert!(super::language_supports_nl_stack("typescript"));
        assert!(super::language_supports_nl_stack("tsx"));
        assert!(super::language_supports_nl_stack("js"));
        assert!(super::language_supports_nl_stack("javascript"));
        assert!(super::language_supports_nl_stack("jsx"));
        assert!(super::language_supports_nl_stack("Rust"));
        assert!(super::language_supports_nl_stack("RUST"));
        assert!(super::language_supports_nl_stack("TypeScript"));
        assert!(super::language_supports_nl_stack(" rust "));
        assert!(super::language_supports_nl_stack(" ts "));

        assert!(!super::language_supports_nl_stack("py"));
        assert!(!super::language_supports_nl_stack("python"));
        assert!(!super::language_supports_nl_stack("rb"));
        assert!(!super::language_supports_nl_stack("ruby"));
        assert!(!super::language_supports_nl_stack("php"));
        assert!(!super::language_supports_nl_stack("lua"));
        assert!(!super::language_supports_nl_stack("sh"));
        assert!(!super::language_supports_nl_stack("klingon"));
        assert!(!super::language_supports_nl_stack(""));
    }
3487
    // Sparse weighting is a narrower allow-list than the NL stack: the
    // JS/TS family and dynamic languages are excluded.
    #[test]
    fn language_supports_sparse_weighting_classifies_correctly() {
        assert!(super::language_supports_sparse_weighting("rs"));
        assert!(super::language_supports_sparse_weighting("rust"));
        assert!(super::language_supports_sparse_weighting("cpp"));
        assert!(super::language_supports_sparse_weighting("go"));
        assert!(super::language_supports_sparse_weighting("java"));
        assert!(super::language_supports_sparse_weighting("kotlin"));
        assert!(super::language_supports_sparse_weighting("csharp"));

        assert!(!super::language_supports_sparse_weighting("ts"));
        assert!(!super::language_supports_sparse_weighting("typescript"));
        assert!(!super::language_supports_sparse_weighting("tsx"));
        assert!(!super::language_supports_sparse_weighting("js"));
        assert!(!super::language_supports_sparse_weighting("javascript"));
        assert!(!super::language_supports_sparse_weighting("jsx"));
        assert!(!super::language_supports_sparse_weighting("py"));
        assert!(!super::language_supports_sparse_weighting("python"));
        assert!(!super::language_supports_sparse_weighting("klingon"));
        assert!(!super::language_supports_sparse_weighting(""));
    }
3509
    // auto_hint_should_enable requires the AUTO gate to be on AND the
    // project language tag to be in the NL-stack allow-list.
    #[test]
    fn auto_hint_should_enable_requires_both_gate_and_supported_lang() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let prev_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let prev_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !super::auto_hint_should_enable(),
            "gate-off (explicit =0) with lang=rust must stay disabled"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            super::auto_hint_should_enable(),
            "gate-on + lang=rust must enable"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            super::auto_hint_should_enable(),
            "gate-on + lang=typescript must keep Phase 2b/2c enabled"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            !super::auto_hint_should_enable(),
            "gate-on + lang=python must stay disabled"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        assert!(
            !super::auto_hint_should_enable(),
            "gate-on + no lang tag must stay disabled"
        );

        unsafe {
            match prev_auto {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match prev_lang {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }
3579
    // auto_sparse_should_enable uses the narrower sparse allow-list: unlike
    // the hint gate, lang=typescript stays disabled.
    #[test]
    fn auto_sparse_should_enable_requires_both_gate_and_sparse_supported_lang() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let prev_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let prev_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-off (explicit =0) must disable sparse auto gate"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            super::auto_sparse_should_enable(),
            "gate-on + lang=rust must enable sparse auto gate"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "typescript");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-on + lang=typescript must keep sparse auto gate disabled"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-on + lang=python must keep sparse auto gate disabled"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG");
        }
        assert!(
            !super::auto_sparse_should_enable(),
            "gate-on + no lang tag must keep sparse auto gate disabled"
        );

        unsafe {
            match prev_auto {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match prev_lang {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }
3642
    // Precedence for comment extraction: an explicit INCLUDE_COMMENTS value
    // always beats the auto gate; with no explicit value, auto + language
    // decide.
    #[test]
    fn nl_tokens_enabled_explicit_env_wins_over_auto() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let prev_explicit = std::env::var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS").ok();
        let prev_auto = std::env::var("CODELENS_EMBED_HINT_AUTO").ok();
        let prev_lang = std::env::var("CODELENS_EMBED_HINT_AUTO_LANG").ok();

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            super::nl_tokens_enabled(),
            "explicit=1 must win over auto+python=off"
        );

        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", "0");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            !super::nl_tokens_enabled(),
            "explicit=0 must win over auto+rust=on"
        );

        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "rust");
        }
        assert!(
            super::nl_tokens_enabled(),
            "no explicit + auto+rust must enable"
        );

        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO", "1");
            std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", "python");
        }
        assert!(
            !super::nl_tokens_enabled(),
            "no explicit + auto+python must stay disabled"
        );

        unsafe {
            match prev_explicit {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_COMMENTS"),
            }
            match prev_auto {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO"),
            }
            match prev_lang {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_AUTO_LANG", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_AUTO_LANG"),
            }
        }
    }
3710
    // Strict comment filtering defaults to off when its env var is unset.
    #[test]
    fn strict_comments_gated_off_by_default() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS").ok();
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_STRICT_COMMENTS");
        }
        let enabled = super::strict_comments_enabled();
        unsafe {
            if let Some(value) = previous {
                std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", value);
            }
        }
        assert!(!enabled, "strict comments gate leaked");
    }
3726
    // Workflow annotations (TODO/FIXME/HACK/...) are detected regardless of
    // case, trailing colon, or leading whitespace.
    #[test]
    fn looks_like_meta_annotation_detects_rejected_prefixes() {
        assert!(super::looks_like_meta_annotation("TODO: fix later"));
        assert!(super::looks_like_meta_annotation("todo handle edge case"));
        assert!(super::looks_like_meta_annotation("FIXME this is broken"));
        assert!(super::looks_like_meta_annotation(
            "HACK: workaround for bug"
        ));
        assert!(super::looks_like_meta_annotation("XXX not implemented yet"));
        assert!(super::looks_like_meta_annotation(
            "BUG in the upstream crate"
        ));
        assert!(super::looks_like_meta_annotation("REVIEW before merging"));
        assert!(super::looks_like_meta_annotation(
            "REFACTOR this block later"
        ));
        assert!(super::looks_like_meta_annotation("TEMP: remove before v2"));
        assert!(super::looks_like_meta_annotation(
            "DEPRECATED use new_api instead"
        ));
        assert!(super::looks_like_meta_annotation(
            "  TODO: with leading ws"
        ));
    }
3753
    // Behaviour-describing prefixes (NOTE/WARN/SAFETY/PANIC) and ordinary
    // prose are NOT classified as meta annotations.
    #[test]
    fn looks_like_meta_annotation_preserves_behaviour_prefixes() {
        assert!(!super::looks_like_meta_annotation(
            "NOTE: this branch handles empty input"
        ));
        assert!(!super::looks_like_meta_annotation(
            "WARN: overflow is possible"
        ));
        assert!(!super::looks_like_meta_annotation(
            "SAFETY: caller must hold the lock"
        ));
        assert!(!super::looks_like_meta_annotation(
            "PANIC: unreachable by construction"
        ));
        assert!(!super::looks_like_meta_annotation(
            "parse json body from request"
        ));
        assert!(!super::looks_like_meta_annotation(
            "walk directory respecting gitignore"
        ));
        assert!(!super::looks_like_meta_annotation(
            "compute cosine similarity between vectors"
        ));
        assert!(!super::looks_like_meta_annotation(""));
        assert!(!super::looks_like_meta_annotation("   "));
        assert!(!super::looks_like_meta_annotation("123 numeric prefix"));
    }
3784
    // With strict comments ON, TODO/FIXME lines are dropped during token
    // extraction while behaviour-describing comments survive.
    #[test]
    fn strict_comments_filters_meta_annotations_during_extraction() {
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", "1");
        }
        let source = "\
fn handle_request() {
    // TODO: handle the error path properly
    // parse json body from the incoming request
    // FIXME: this can panic on empty input
    // walk directory respecting the gitignore rules
    let _x = 1;
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_COMMENTS"),
            }
        }
        let hint = result.expect("behaviour comments must survive");
        assert!(
            hint.contains("parse json body"),
            "behaviour comment dropped: {hint}"
        );
        assert!(!hint.contains("TODO"), "TODO annotation leaked: {hint}");
        assert!(!hint.contains("FIXME"), "FIXME annotation leaked: {hint}");
    }
3821
    #[test]
    fn strict_comments_is_orthogonal_to_strict_literals() {
        // Enabling strict comments while strict literals is unset must only
        // affect comment filtering: the string literal still reaches the hint.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        // Save both gates so they can be restored exactly as found.
        let prev_c = std::env::var("CODELENS_EMBED_HINT_STRICT_COMMENTS").ok();
        let prev_l = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", "1");
            std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS");
        }
        let source = "\
fn handle() {
    // handles real behaviour
    let fmt = \"format error string\";
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore both vars before asserting so failures cannot leak state.
        unsafe {
            match prev_c {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_COMMENTS", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_COMMENTS"),
            }
            match prev_l {
                Some(v) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", v),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS"),
            }
        }
        let hint = result.expect("tokens must exist");
        assert!(hint.contains("handles real"), "comment dropped: {hint}");
        assert!(
            hint.contains("format error string"),
            "literal dropped: {hint}"
        );
    }
3863
    #[test]
    fn strict_literal_filter_gated_off_by_default() {
        // With CODELENS_EMBED_HINT_STRICT_LITERALS unset, the strict literal
        // filter must report itself as disabled.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        // Env mutation happens under ENV_LOCK, matching the module's other
        // env-mutating tests.
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS");
        }
        let enabled = super::strict_literal_filter_enabled();
        // Restore before asserting so a failure cannot leak env state.
        unsafe {
            if let Some(value) = previous {
                std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", value);
            }
        }
        assert!(!enabled, "strict literal filter gate leaked");
    }
3879
3880 #[test]
3881 fn contains_format_specifier_detects_c_and_python_style() {
3882 assert!(super::contains_format_specifier("Invalid URL %s"));
3884 assert!(super::contains_format_specifier("got %d matches"));
3885 assert!(super::contains_format_specifier("value=%r"));
3886 assert!(super::contains_format_specifier("size=%f"));
3887 assert!(super::contains_format_specifier("sending request to {url}"));
3889 assert!(super::contains_format_specifier("got {0} items"));
3890 assert!(super::contains_format_specifier("{:?}"));
3891 assert!(super::contains_format_specifier("value: {x:.2f}"));
3892 assert!(super::contains_format_specifier("{}"));
3893 assert!(!super::contains_format_specifier(
3895 "skip comments and string literals"
3896 ));
3897 assert!(!super::contains_format_specifier("failed to open database"));
3898 assert!(!super::contains_format_specifier("{name: foo, id: 1}"));
3901 }
3902
3903 #[test]
3904 fn looks_like_error_or_log_prefix_rejects_common_patterns() {
3905 assert!(super::looks_like_error_or_log_prefix("Invalid URL format"));
3906 assert!(super::looks_like_error_or_log_prefix(
3907 "Cannot decode response"
3908 ));
3909 assert!(super::looks_like_error_or_log_prefix("could not open file"));
3910 assert!(super::looks_like_error_or_log_prefix(
3911 "Failed to send request"
3912 ));
3913 assert!(super::looks_like_error_or_log_prefix(
3914 "Expected int, got str"
3915 ));
3916 assert!(super::looks_like_error_or_log_prefix(
3917 "sending request to server"
3918 ));
3919 assert!(super::looks_like_error_or_log_prefix(
3920 "received response headers"
3921 ));
3922 assert!(super::looks_like_error_or_log_prefix(
3923 "starting worker pool"
3924 ));
3925 assert!(!super::looks_like_error_or_log_prefix(
3927 "parse json body from request"
3928 ));
3929 assert!(!super::looks_like_error_or_log_prefix(
3930 "compute cosine similarity between vectors"
3931 ));
3932 assert!(!super::looks_like_error_or_log_prefix(
3933 "walk directory tree respecting gitignore"
3934 ));
3935 }
3936
    #[test]
    fn strict_mode_rejects_format_and_error_literals_during_extraction() {
        // With CODELENS_EMBED_HINT_STRICT_LITERALS=1, format-specifier,
        // log-prefix and f-string style literals are filtered out of the
        // hint, while plain behavioural literals survive.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", "1");
        }
        let source = "\
fn handle_request() {
    let err = \"Invalid URL %s\";
    let log = \"sending request to the upstream server\";
    let fmt = \"received {count} items in batch\";
    let real = \"parse json body from the incoming request\";
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore env BEFORE asserting so a failure cannot leak strict mode.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS"),
            }
        }
        let hint = result.expect("some token should survive");
        assert!(
            hint.contains("parse json body"),
            "real literal was filtered out: {hint}"
        );
        assert!(
            !hint.contains("Invalid URL"),
            "format-specifier literal leaked: {hint}"
        );
        assert!(
            !hint.contains("sending request"),
            "log-prefix literal leaked: {hint}"
        );
        assert!(
            !hint.contains("received {count}"),
            "python fstring literal leaked: {hint}"
        );
    }
3982
    #[test]
    fn strict_mode_leaves_comments_untouched() {
        // Strict literals must only filter string literals: comments that
        // happen to start with error-like words still reach the hint.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_STRICT_LITERALS").ok();
        unsafe {
            std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", "1");
        }
        let source = "\
fn do_work() {
    // Invalid inputs are rejected by this guard clause.
    // sending requests in parallel across worker threads.
    let _lit = \"format spec %s\";
}
";
        let result = super::extract_nl_tokens_inner(source, 0, source.len());
        // Restore env BEFORE asserting so a failure cannot leak strict mode.
        unsafe {
            match previous {
                Some(value) => std::env::set_var("CODELENS_EMBED_HINT_STRICT_LITERALS", value),
                None => std::env::remove_var("CODELENS_EMBED_HINT_STRICT_LITERALS"),
            }
        }
        let hint = result.expect("comments should survive strict mode");
        assert!(
            hint.contains("Invalid inputs") || hint.contains("rejected by this guard"),
            "strict mode swallowed a comment: {hint}"
        );
        assert!(
            !hint.contains("format spec"),
            "format-specifier literal leaked under strict mode: {hint}"
        );
    }
4020
4021 #[test]
4022 fn should_reject_literal_strict_composes_format_and_prefix() {
4023 assert!(super::should_reject_literal_strict("Invalid URL %s"));
4027 assert!(super::should_reject_literal_strict(
4028 "sending request to server"
4029 ));
4030 assert!(super::should_reject_literal_strict("value: {x:.2f}"));
4031 assert!(!super::should_reject_literal_strict(
4033 "parse json body from the incoming request"
4034 ));
4035 assert!(!super::should_reject_literal_strict(
4036 "compute cosine similarity between vectors"
4037 ));
4038 }
4039
4040 #[test]
4041 fn is_static_method_ident_accepts_pascal_and_rejects_snake() {
4042 assert!(super::is_static_method_ident("HashMap"));
4043 assert!(super::is_static_method_ident("Parser"));
4044 assert!(super::is_static_method_ident("A"));
4045 assert!(!super::is_static_method_ident("std"));
4048 assert!(!super::is_static_method_ident("fs"));
4049 assert!(!super::is_static_method_ident("_private"));
4050 assert!(!super::is_static_method_ident(""));
4051 }
4052
    #[test]
    fn extract_api_calls_gated_off_by_default() {
        // With CODELENS_EMBED_HINT_INCLUDE_API_CALLS unset, the public
        // extract_api_calls entry point must return None even though the
        // body contains obvious Type::method call sites.
        let _env_guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
        let previous = std::env::var("CODELENS_EMBED_HINT_INCLUDE_API_CALLS").ok();
        unsafe {
            std::env::remove_var("CODELENS_EMBED_HINT_INCLUDE_API_CALLS");
        }
        let source = "\
fn make_parser() {
    let p = Parser::new();
    let _ = HashMap::with_capacity(8);
}
";
        let result = extract_api_calls(source, 0, source.len());
        // Restore before asserting so a failure cannot leak env state.
        unsafe {
            if let Some(value) = previous {
                std::env::set_var("CODELENS_EMBED_HINT_INCLUDE_API_CALLS", value);
            }
        }
        assert!(result.is_none(), "gate leaked: {result:?}");
    }
4075
4076 #[test]
4077 fn extract_api_calls_captures_type_method_patterns() {
4078 let source = "\
4080fn open_db() {
4081 let p = Parser::new();
4082 let map = HashMap::with_capacity(16);
4083 let _ = tree_sitter::Parser::new();
4084}
4085";
4086 let hint = super::extract_api_calls_inner(source, 0, source.len())
4087 .expect("api calls should be produced");
4088 assert!(hint.contains("Parser::new"), "missing Parser::new: {hint}");
4089 assert!(
4090 hint.contains("HashMap::with_capacity"),
4091 "missing HashMap::with_capacity: {hint}"
4092 );
4093 }
4094
4095 #[test]
4096 fn extract_api_calls_rejects_module_prefixed_free_functions() {
4097 let source = "\
4100fn read_config() {
4101 let _ = std::fs::read_to_string(\"foo\");
4102 let _ = crate::util::parse();
4103}
4104";
4105 let hint = super::extract_api_calls_inner(source, 0, source.len());
4106 if let Some(hint) = hint {
4109 assert!(!hint.contains("std::fs"), "lowercase module leaked: {hint}");
4110 assert!(
4111 !hint.contains("fs::read_to_string"),
4112 "module-prefixed free function leaked: {hint}"
4113 );
4114 assert!(!hint.contains("crate::util"), "crate path leaked: {hint}");
4115 }
4116 }
4117
4118 #[test]
4119 fn extract_api_calls_deduplicates_repeated_calls() {
4120 let source = "\
4121fn hot_loop() {
4122 for _ in 0..10 {
4123 let _ = Parser::new();
4124 let _ = Parser::new();
4125 }
4126 let _ = Parser::new();
4127}
4128";
4129 let hint = super::extract_api_calls_inner(source, 0, source.len())
4130 .expect("api calls should be produced");
4131 let first = hint.find("Parser::new").expect("hit");
4132 let rest = &hint[first + "Parser::new".len()..];
4133 assert!(
4134 !rest.contains("Parser::new"),
4135 "duplicate not deduplicated: {hint}"
4136 );
4137 }
4138
4139 #[test]
4140 fn extract_api_calls_returns_none_when_body_has_no_type_calls() {
4141 let source = "\
4142fn plain() {
4143 let x = 1;
4144 let y = x + 2;
4145}
4146";
4147 assert!(super::extract_api_calls_inner(source, 0, source.len()).is_none());
4148 }
4149
4150 #[test]
4151 fn extract_nl_tokens_collects_comments_and_string_literals() {
4152 let source = "\
4156fn search_for_matches() {
4157 // skip comments and string literals during search
4158 let error = \"failed to open database\";
4159 let single = \"tok\";
4160 let path = \"src/foo/bar\";
4161 let keyword = match kind {
4162 Kind::Ident => \"detect client version\",
4163 _ => \"\",
4164 };
4165}
4166";
4167 let hint = super::extract_nl_tokens_inner(source, 0, source.len())
4173 .expect("nl tokens should be produced");
4174 let has_first_nl_signal = hint.contains("skip comments")
4178 || hint.contains("failed to open")
4179 || hint.contains("detect client");
4180 assert!(has_first_nl_signal, "no NL signal produced: {hint}");
4181 assert!(!hint.contains(" tok "), "short literal leaked: {hint}");
4183 assert!(!hint.contains("src/foo/bar"), "path literal leaked: {hint}");
4185 }
4186
4187 #[test]
4188 fn hint_char_budget_respects_env_override() {
4189 let previous = std::env::var("CODELENS_EMBED_HINT_CHARS").ok();
4190 unsafe {
4191 std::env::set_var("CODELENS_EMBED_HINT_CHARS", "120");
4192 }
4193 let budget = super::hint_char_budget();
4194 unsafe {
4195 match previous {
4196 Some(value) => std::env::set_var("CODELENS_EMBED_HINT_CHARS", value),
4197 None => std::env::remove_var("CODELENS_EMBED_HINT_CHARS"),
4198 }
4199 }
4200 assert_eq!(budget, 120);
4201 }
4202
4203 #[test]
4204 fn embedding_to_bytes_roundtrip() {
4205 let floats = vec![1.0f32, -0.5, 0.0, 3.25];
4206 let bytes = embedding_to_bytes(&floats);
4207 assert_eq!(bytes.len(), 4 * 4);
4208 let recovered: Vec<f32> = bytes
4210 .chunks_exact(4)
4211 .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
4212 .collect();
4213 assert_eq!(floats, recovered);
4214 }
4215
4216 #[test]
4217 fn duplicate_pair_key_is_order_independent() {
4218 let a = duplicate_pair_key("a.py", "foo", "b.py", "bar");
4219 let b = duplicate_pair_key("b.py", "bar", "a.py", "foo");
4220 assert_eq!(a, b);
4221 }
4222
4223 #[test]
4224 fn text_embedding_cache_updates_recency() {
4225 let mut cache = TextEmbeddingCache::new(2);
4226 cache.insert("a".into(), vec![1.0]);
4227 cache.insert("b".into(), vec![2.0]);
4228 assert_eq!(cache.get("a"), Some(vec![1.0]));
4229 cache.insert("c".into(), vec![3.0]);
4230
4231 assert_eq!(cache.get("a"), Some(vec![1.0]));
4232 assert_eq!(cache.get("b"), None);
4233 assert_eq!(cache.get("c"), Some(vec![3.0]));
4234 }
4235
4236 #[test]
4237 fn text_embedding_cache_can_be_disabled() {
4238 let mut cache = TextEmbeddingCache::new(0);
4239 cache.insert("a".into(), vec![1.0]);
4240 assert_eq!(cache.get("a"), None);
4241 }
4242
4243 #[test]
4244 fn engine_new_and_index() {
4245 let _lock = MODEL_LOCK.lock().unwrap();
4246 skip_without_embedding_model!();
4247 let (_dir, project) = make_project_with_source();
4248 let engine = EmbeddingEngine::new(&project).expect("engine should load");
4249 assert!(!engine.is_indexed());
4250
4251 let count = engine.index_from_project(&project).unwrap();
4252 assert_eq!(count, 2, "should index 2 symbols");
4253 assert!(engine.is_indexed());
4254 }
4255
4256 #[test]
4257 fn engine_search_returns_results() {
4258 let _lock = MODEL_LOCK.lock().unwrap();
4259 skip_without_embedding_model!();
4260 let (_dir, project) = make_project_with_source();
4261 let engine = EmbeddingEngine::new(&project).unwrap();
4262 engine.index_from_project(&project).unwrap();
4263
4264 let results = engine.search("hello function", 10).unwrap();
4265 assert!(!results.is_empty(), "search should return results");
4266 for r in &results {
4267 assert!(
4268 r.score >= -1.0 && r.score <= 1.0,
4269 "score should be in [-1,1]: {}",
4270 r.score
4271 );
4272 }
4273 }
4274
4275 #[test]
4276 fn engine_incremental_index() {
4277 let _lock = MODEL_LOCK.lock().unwrap();
4278 skip_without_embedding_model!();
4279 let (_dir, project) = make_project_with_source();
4280 let engine = EmbeddingEngine::new(&project).unwrap();
4281 engine.index_from_project(&project).unwrap();
4282 assert_eq!(engine.store.count().unwrap(), 2);
4283
4284 let count = engine.index_changed_files(&project, &["main.py"]).unwrap();
4286 assert_eq!(count, 2);
4287 assert_eq!(engine.store.count().unwrap(), 2);
4288 }
4289
4290 #[test]
4291 fn engine_reindex_preserves_symbol_count() {
4292 let _lock = MODEL_LOCK.lock().unwrap();
4293 skip_without_embedding_model!();
4294 let (_dir, project) = make_project_with_source();
4295 let engine = EmbeddingEngine::new(&project).unwrap();
4296 engine.index_from_project(&project).unwrap();
4297 assert_eq!(engine.store.count().unwrap(), 2);
4298
4299 let count = engine.index_from_project(&project).unwrap();
4300 assert_eq!(count, 2);
4301 assert_eq!(engine.store.count().unwrap(), 2);
4302 }
4303
    #[test]
    fn full_reindex_reuses_unchanged_embeddings() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (_dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Plant recognisable sentinel vectors so we can tell reuse
        // (sentinel survives) apart from re-embedding (sentinel replaced).
        replace_file_embeddings_with_sentinels(
            &engine,
            "main.py",
            &[("hello", 11.0), ("world", 22.0)],
        );

        // Nothing on disk changed between the two passes.
        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);

        let hello = engine
            .store
            .get_embedding("main.py", "hello")
            .unwrap()
            .expect("hello should exist");
        let world = engine
            .store
            .get_embedding("main.py", "world")
            .unwrap()
            .expect("world should exist");
        // Both sentinels intact => the full reindex reused the stored rows
        // instead of recomputing embeddings for unchanged symbols.
        assert!(hello.embedding.iter().all(|value| *value == 11.0));
        assert!(world.embedding.iter().all(|value| *value == 22.0));
    }
4334
    #[test]
    fn full_reindex_reuses_unchanged_sibling_after_edit() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (dir, project) = make_project_with_source();
        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();

        // Plant sentinel vectors so reuse vs. re-embedding is observable.
        replace_file_embeddings_with_sentinels(
            &engine,
            "main.py",
            &[("hello", 11.0), ("world", 22.0)],
        );

        // Rewrite the file under a new hash; the asserts below encode the
        // expectation that only `world`'s text differs from the original
        // fixture while `hello`'s body is unchanged.
        let updated_source =
            "def hello():\n print('hi')\n\ndef world(name):\n return name.upper()\n";
        write_python_file_with_symbols(
            dir.path(),
            "main.py",
            updated_source,
            "hash2",
            &[
                ("hello", "def hello():", "hello"),
                ("world", "def world(name):", "world"),
            ],
        );

        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);

        let hello = engine
            .store
            .get_embedding("main.py", "hello")
            .unwrap()
            .expect("hello should exist");
        let world = engine
            .store
            .get_embedding("main.py", "world")
            .unwrap()
            .expect("world should exist");
        // `hello` kept its sentinel (row reused); `world` was re-embedded
        // (sentinel replaced); total row count is unchanged.
        assert!(hello.embedding.iter().all(|value| *value == 11.0));
        assert!(world.embedding.iter().any(|value| *value != 22.0));
        assert_eq!(engine.store.count().unwrap(), 2);
    }
4379
    #[test]
    fn full_reindex_removes_deleted_files() {
        let _lock = MODEL_LOCK.lock().unwrap();
        skip_without_embedding_model!();
        let (dir, project) = make_project_with_source();
        // Add a second indexed file so there is something to delete later.
        write_python_file_with_symbols(
            dir.path(),
            "extra.py",
            "def bonus():\n return 7\n",
            "hash-extra",
            &[("bonus", "def bonus():", "bonus")],
        );

        let engine = EmbeddingEngine::new(&project).unwrap();
        engine.index_from_project(&project).unwrap();
        assert_eq!(engine.store.count().unwrap(), 3);

        // Remove the file from disk AND from the symbol index DB, mirroring
        // what a real deletion looks like to the embedding engine.
        std::fs::remove_file(dir.path().join("extra.py")).unwrap();
        let db_path = crate::db::index_db_path(dir.path());
        let db = IndexDb::open(&db_path).unwrap();
        db.delete_file("extra.py").unwrap();

        let count = engine.index_from_project(&project).unwrap();
        assert_eq!(count, 2);
        assert_eq!(engine.store.count().unwrap(), 2);
        // No stale embeddings may remain for the deleted file.
        assert!(
            engine
                .store
                .embeddings_for_files(&["extra.py"])
                .unwrap()
                .is_empty()
        );
    }
4413
4414 #[test]
4415 fn engine_model_change_recreates_db() {
4416 let _lock = MODEL_LOCK.lock().unwrap();
4417 skip_without_embedding_model!();
4418 let (_dir, project) = make_project_with_source();
4419
4420 let engine1 = EmbeddingEngine::new(&project).unwrap();
4422 engine1.index_from_project(&project).unwrap();
4423 assert_eq!(engine1.store.count().unwrap(), 2);
4424 drop(engine1);
4425
4426 let engine2 = EmbeddingEngine::new(&project).unwrap();
4428 assert!(engine2.store.count().unwrap() >= 2);
4429 }
4430
4431 #[test]
4432 fn inspect_existing_index_returns_model_and_count() {
4433 let _lock = MODEL_LOCK.lock().unwrap();
4434 skip_without_embedding_model!();
4435 let (_dir, project) = make_project_with_source();
4436 let engine = EmbeddingEngine::new(&project).unwrap();
4437 engine.index_from_project(&project).unwrap();
4438
4439 let info = EmbeddingEngine::inspect_existing_index(&project)
4440 .unwrap()
4441 .expect("index info should exist");
4442 assert_eq!(info.model_name, engine.model_name());
4443 assert_eq!(info.indexed_symbols, 2);
4444 }
4445
    #[test]
    fn inspect_existing_index_recovers_from_corrupt_db() {
        let (_dir, project) = make_project_with_source();
        let index_dir = project.as_path().join(".codelens/index");
        let db_path = index_dir.join("embeddings.db");
        let wal_path = index_dir.join("embeddings.db-wal");
        let shm_path = index_dir.join("embeddings.db-shm");

        // Clobber the database and its WAL/SHM side files with garbage bytes.
        std::fs::write(&db_path, b"not a sqlite database").unwrap();
        std::fs::write(&wal_path, b"bad wal").unwrap();
        std::fs::write(&shm_path, b"bad shm").unwrap();

        // Inspection must treat the corrupt DB as "no index", not an error.
        let info = EmbeddingEngine::inspect_existing_index(&project).unwrap();
        assert!(info.is_none());

        // The db path still points at a file after recovery.
        assert!(db_path.is_file());

        // The corrupt copy is quarantined under an `embeddings.db.corrupt-*`
        // name in the same directory rather than silently deleted.
        let backup_names: Vec<String> = std::fs::read_dir(&index_dir)
            .unwrap()
            .map(|entry| entry.unwrap().file_name().to_string_lossy().into_owned())
            .filter(|name| name.contains(".corrupt-"))
            .collect();

        assert!(
            backup_names
                .iter()
                .any(|name| name.starts_with("embeddings.db.corrupt-")),
            "expected quarantined embedding db, found {backup_names:?}"
        );
    }
4476
4477 #[test]
4478 fn store_can_fetch_single_embedding_without_loading_all() {
4479 let _lock = MODEL_LOCK.lock().unwrap();
4480 skip_without_embedding_model!();
4481 let (_dir, project) = make_project_with_source();
4482 let engine = EmbeddingEngine::new(&project).unwrap();
4483 engine.index_from_project(&project).unwrap();
4484
4485 let chunk = engine
4486 .store
4487 .get_embedding("main.py", "hello")
4488 .unwrap()
4489 .expect("embedding should exist");
4490 assert_eq!(chunk.file_path, "main.py");
4491 assert_eq!(chunk.symbol_name, "hello");
4492 assert!(!chunk.embedding.is_empty());
4493 }
4494
4495 #[test]
4496 fn find_similar_code_uses_index_and_excludes_target_symbol() {
4497 let _lock = MODEL_LOCK.lock().unwrap();
4498 skip_without_embedding_model!();
4499 let (_dir, project) = make_project_with_source();
4500 let engine = EmbeddingEngine::new(&project).unwrap();
4501 engine.index_from_project(&project).unwrap();
4502
4503 let matches = engine.find_similar_code("main.py", "hello", 5).unwrap();
4504 assert!(!matches.is_empty());
4505 assert!(
4506 matches
4507 .iter()
4508 .all(|m| !(m.file_path == "main.py" && m.symbol_name == "hello"))
4509 );
4510 }
4511
4512 #[test]
4513 fn delete_by_file_removes_rows_in_one_batch() {
4514 let _lock = MODEL_LOCK.lock().unwrap();
4515 skip_without_embedding_model!();
4516 let (_dir, project) = make_project_with_source();
4517 let engine = EmbeddingEngine::new(&project).unwrap();
4518 engine.index_from_project(&project).unwrap();
4519
4520 let deleted = engine.store.delete_by_file(&["main.py"]).unwrap();
4521 assert_eq!(deleted, 2);
4522 assert_eq!(engine.store.count().unwrap(), 0);
4523 }
4524
4525 #[test]
4526 fn store_streams_embeddings_grouped_by_file() {
4527 let _lock = MODEL_LOCK.lock().unwrap();
4528 skip_without_embedding_model!();
4529 let (_dir, project) = make_project_with_source();
4530 let engine = EmbeddingEngine::new(&project).unwrap();
4531 engine.index_from_project(&project).unwrap();
4532
4533 let mut groups = Vec::new();
4534 engine
4535 .store
4536 .for_each_file_embeddings(&mut |file_path, chunks| {
4537 groups.push((file_path, chunks.len()));
4538 Ok(())
4539 })
4540 .unwrap();
4541
4542 assert_eq!(groups, vec![("main.py".to_string(), 2)]);
4543 }
4544
4545 #[test]
4546 fn store_fetches_embeddings_for_specific_files() {
4547 let _lock = MODEL_LOCK.lock().unwrap();
4548 skip_without_embedding_model!();
4549 let (_dir, project) = make_project_with_source();
4550 let engine = EmbeddingEngine::new(&project).unwrap();
4551 engine.index_from_project(&project).unwrap();
4552
4553 let chunks = engine.store.embeddings_for_files(&["main.py"]).unwrap();
4554 assert_eq!(chunks.len(), 2);
4555 assert!(chunks.iter().all(|chunk| chunk.file_path == "main.py"));
4556 }
4557
4558 #[test]
4559 fn store_fetches_embeddings_for_scored_chunks() {
4560 let _lock = MODEL_LOCK.lock().unwrap();
4561 skip_without_embedding_model!();
4562 let (_dir, project) = make_project_with_source();
4563 let engine = EmbeddingEngine::new(&project).unwrap();
4564 engine.index_from_project(&project).unwrap();
4565
4566 let scored = engine.search_scored("hello world function", 2).unwrap();
4567 let chunks = engine.store.embeddings_for_scored_chunks(&scored).unwrap();
4568
4569 assert_eq!(chunks.len(), scored.len());
4570 assert!(scored.iter().all(|candidate| chunks.iter().any(|chunk| {
4571 chunk.file_path == candidate.file_path
4572 && chunk.symbol_name == candidate.symbol_name
4573 && chunk.line == candidate.line
4574 && chunk.signature == candidate.signature
4575 && chunk.name_path == candidate.name_path
4576 })));
4577 }
4578
4579 #[test]
4580 fn find_misplaced_code_returns_per_file_outliers() {
4581 let _lock = MODEL_LOCK.lock().unwrap();
4582 skip_without_embedding_model!();
4583 let (_dir, project) = make_project_with_source();
4584 let engine = EmbeddingEngine::new(&project).unwrap();
4585 engine.index_from_project(&project).unwrap();
4586
4587 let outliers = engine.find_misplaced_code(5).unwrap();
4588 assert_eq!(outliers.len(), 2);
4589 assert!(outliers.iter().all(|item| item.file_path == "main.py"));
4590 }
4591
4592 #[test]
4593 fn find_duplicates_uses_batched_candidate_embeddings() {
4594 let _lock = MODEL_LOCK.lock().unwrap();
4595 skip_without_embedding_model!();
4596 let (_dir, project) = make_project_with_source();
4597 let engine = EmbeddingEngine::new(&project).unwrap();
4598 engine.index_from_project(&project).unwrap();
4599
4600 replace_file_embeddings_with_sentinels(
4601 &engine,
4602 "main.py",
4603 &[("hello", 5.0), ("world", 5.0)],
4604 );
4605
4606 let duplicates = engine.find_duplicates(0.99, 4).unwrap();
4607 assert!(!duplicates.is_empty());
4608 assert!(duplicates.iter().any(|pair| {
4609 (pair.symbol_a == "main.py:hello" && pair.symbol_b == "main.py:world")
4610 || (pair.symbol_a == "main.py:world" && pair.symbol_b == "main.py:hello")
4611 }));
4612 }
4613
4614 #[test]
4615 fn search_scored_returns_raw_chunks() {
4616 let _lock = MODEL_LOCK.lock().unwrap();
4617 skip_without_embedding_model!();
4618 let (_dir, project) = make_project_with_source();
4619 let engine = EmbeddingEngine::new(&project).unwrap();
4620 engine.index_from_project(&project).unwrap();
4621
4622 let chunks = engine.search_scored("world function", 5).unwrap();
4623 assert!(!chunks.is_empty());
4624 for c in &chunks {
4625 assert!(!c.file_path.is_empty());
4626 assert!(!c.symbol_name.is_empty());
4627 }
4628 }
4629
4630 #[test]
4631 fn configured_embedding_model_name_defaults_to_codesearchnet() {
4632 assert_eq!(configured_embedding_model_name(), CODESEARCH_MODEL_NAME);
4633 }
4634
4635 #[test]
4636 fn requested_embedding_model_override_ignores_default_model_name() {
4637 let _lock = MODEL_LOCK.lock().unwrap();
4638 let previous = std::env::var("CODELENS_EMBED_MODEL").ok();
4639 unsafe {
4640 std::env::set_var("CODELENS_EMBED_MODEL", CODESEARCH_MODEL_NAME);
4641 }
4642
4643 let result = requested_embedding_model_override().unwrap();
4644
4645 unsafe {
4646 match previous {
4647 Some(value) => std::env::set_var("CODELENS_EMBED_MODEL", value),
4648 None => std::env::remove_var("CODELENS_EMBED_MODEL"),
4649 }
4650 }
4651
4652 assert_eq!(result, None);
4653 }
4654
4655 #[cfg(not(feature = "model-bakeoff"))]
4656 #[test]
4657 fn requested_embedding_model_override_requires_bakeoff_feature() {
4658 let _lock = MODEL_LOCK.lock().unwrap();
4659 let previous = std::env::var("CODELENS_EMBED_MODEL").ok();
4660 unsafe {
4661 std::env::set_var("CODELENS_EMBED_MODEL", "all-MiniLM-L12-v2");
4662 }
4663
4664 let err = requested_embedding_model_override().unwrap_err();
4665
4666 unsafe {
4667 match previous {
4668 Some(value) => std::env::set_var("CODELENS_EMBED_MODEL", value),
4669 None => std::env::remove_var("CODELENS_EMBED_MODEL"),
4670 }
4671 }
4672
4673 assert!(err.to_string().contains("model-bakeoff"));
4674 }
4675
4676 #[cfg(feature = "model-bakeoff")]
4677 #[test]
4678 fn requested_embedding_model_override_accepts_alternative_model() {
4679 let _lock = MODEL_LOCK.lock().unwrap();
4680 let previous = std::env::var("CODELENS_EMBED_MODEL").ok();
4681 unsafe {
4682 std::env::set_var("CODELENS_EMBED_MODEL", "all-MiniLM-L12-v2");
4683 }
4684
4685 let result = requested_embedding_model_override().unwrap();
4686
4687 unsafe {
4688 match previous {
4689 Some(value) => std::env::set_var("CODELENS_EMBED_MODEL", value),
4690 None => std::env::remove_var("CODELENS_EMBED_MODEL"),
4691 }
4692 }
4693
4694 assert_eq!(result.as_deref(), Some("all-MiniLM-L12-v2"));
4695 }
4696
4697 #[test]
4698 fn recommended_embed_threads_caps_macos_style_load() {
4699 let threads = recommended_embed_threads();
4700 assert!(threads >= 1);
4701 assert!(threads <= 8);
4702 }
4703
4704 #[test]
4705 fn embed_batch_size_has_safe_default_floor() {
4706 assert!(embed_batch_size() >= 1);
4707 if cfg!(target_os = "macos") {
4708 assert!(embed_batch_size() <= DEFAULT_MACOS_EMBED_BATCH_SIZE);
4709 }
4710 }
4711}