Skip to main content

lean_ctx/tools/
ctx_semantic_search.rs

1use std::collections::HashSet;
2use std::path::Path;
3
4use crate::core::bm25_index::{format_search_results, BM25Index};
5use crate::core::embedding_index::EmbeddingIndex;
6#[cfg(feature = "embeddings")]
7use crate::core::embeddings::EmbeddingEngine;
8use crate::core::hybrid_search::{format_hybrid_results, HybridConfig, HybridResult};
9use crate::tools::CrpMode;
10
11/// Performs semantic code search using BM25, dense embeddings, or hybrid ranking.
12#[allow(clippy::too_many_arguments)]
13pub fn handle(
14    query: &str,
15    path: &str,
16    top_k: usize,
17    crp_mode: CrpMode,
18    languages: Option<&[String]>,
19    path_glob: Option<&str>,
20    mode: Option<&str>,
21    workspace: Option<bool>,
22    artifacts: Option<bool>,
23) -> String {
24    let root = Path::new(path);
25    if !root.exists() {
26        return format!("ERR: path does not exist: {path}");
27    }
28
29    let root = if root.is_file() {
30        root.parent().unwrap_or(root)
31    } else {
32        root
33    };
34
35    let filter = match SearchFilter::new(languages, path_glob) {
36        Ok(f) => f,
37        Err(e) => return format!("ERR: invalid filter: {e}"),
38    };
39
40    let compact = crp_mode.is_tdd();
41    let mode = mode.unwrap_or("hybrid").to_lowercase();
42    let workspace = workspace.unwrap_or(false);
43    let artifacts = artifacts.unwrap_or(false);
44
45    if artifacts {
46        return artifacts_search(query, root, top_k, compact, &filter, workspace);
47    }
48    if workspace {
49        return workspace_search(query, root, top_k, compact, &filter, &mode);
50    }
51
52    let index = match load_or_refresh_bm25(root) {
53        Bm25LoadResult::Ready(idx) => idx,
54        Bm25LoadResult::Building => {
55            return "BM25 index is being built in the background. \
56                    Run ctx_semantic_search again in ~30s, or use action=reindex to wait for completion."
57                .to_string();
58        }
59    };
60    if index.doc_count == 0 {
61        return "No code files found to index.".to_string();
62    }
63
64    match mode.as_str() {
65        "bm25" => {
66            let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
67            if filter.is_active() {
68                results.retain(|x| filter.matches(&x.file_path));
69            }
70            results.truncate(top_k);
71
72            let header = if compact {
73                format!(
74                    "semantic_search(bm25,{top_k}) → {} results, {} chunks indexed\n",
75                    results.len(),
76                    index.doc_count
77                )
78            } else {
79                format!(
80                    "Semantic search (BM25): \"{}\" ({} results from {} indexed chunks)\n",
81                    truncate_query(query, 60),
82                    results.len(),
83                    index.doc_count,
84                )
85            };
86            format!("{header}{}", format_search_results(&results, compact))
87        }
88        "dense" => dense_search_mode(query, root, &index, top_k, compact, &filter),
89        _ => hybrid_search_mode(query, root, &index, top_k, compact, &filter),
90    }
91}
92
93/// Structured single-root search used by the `semantic-search` CLI (`--json`)
94/// and any programmatic caller (editor extensions). Mirrors `handle`'s
95/// single-root logic but returns the ranked [`HybridResult`]s instead of a
96/// formatted report, so callers control their own serialization. Reuses the
97/// exact same hybrid/dense/BM25 ranking as the `ctx_semantic_search` MCP tool —
98/// no second code path to drift.
99pub fn search_hits(
100    query: &str,
101    path: &str,
102    top_k: usize,
103    mode: &str,
104    languages: Option<&[String]>,
105    path_glob: Option<&str>,
106) -> Result<Vec<HybridResult>, String> {
107    let root = Path::new(path);
108    if !root.exists() {
109        return Err(format!("path does not exist: {path}"));
110    }
111    let root = if root.is_file() {
112        root.parent().unwrap_or(root)
113    } else {
114        root
115    };
116
117    let filter =
118        SearchFilter::new(languages, path_glob).map_err(|e| format!("invalid filter: {e}"))?;
119
120    let index = BM25Index::load_or_build(root);
121    if index.doc_count == 0 {
122        return Ok(Vec::new());
123    }
124
125    let results = match mode.to_lowercase().as_str() {
126        "bm25" => bm25_hits(&index, query, top_k, &filter),
127        "dense" => {
128            #[cfg(feature = "embeddings")]
129            {
130                dense_results_for_root(query, root, &index, top_k, &filter).map(|(v, _)| v)?
131            }
132            #[cfg(not(feature = "embeddings"))]
133            {
134                return Err("dense mode requires the embeddings feature".to_string());
135            }
136        }
137        _ => {
138            #[cfg(feature = "embeddings")]
139            {
140                hybrid_results_for_root(query, root, &index, top_k, &filter).map(|(v, _)| v)?
141            }
142            #[cfg(not(feature = "embeddings"))]
143            {
144                bm25_hits(&index, query, top_k, &filter)
145            }
146        }
147    };
148
149    Ok(results)
150}
151
152fn bm25_hits(
153    index: &BM25Index,
154    query: &str,
155    top_k: usize,
156    filter: &SearchFilter,
157) -> Vec<HybridResult> {
158    let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
159    if filter.is_active() {
160        results.retain(|x| filter.matches(&x.file_path));
161    }
162    results.truncate(top_k);
163    results
164        .into_iter()
165        .map(HybridResult::from_bm25_public)
166        .collect()
167}
168
169/// Rebuilds the BM25 search index for the given directory from scratch.
170pub fn handle_reindex(path: &str) -> String {
171    let root = Path::new(path);
172    if !root.exists() {
173        return format!("ERR: path does not exist: {path}");
174    }
175    let root = if root.is_file() {
176        root.parent().unwrap_or(root)
177    } else {
178        root
179    };
180
181    let idx = BM25Index::build_from_directory(root);
182    let files = idx.files.len();
183    let chunks = idx.doc_count;
184    let _ = idx.save(root);
185
186    format!("Reindexed {path}: {files} files, {chunks} chunks")
187}
188
189pub fn handle_reindex_artifacts(path: &str, workspace: bool) -> String {
190    let root = Path::new(path);
191    if !root.exists() {
192        return format!("ERR: path does not exist: {path}");
193    }
194    let root = if root.is_file() {
195        root.parent().unwrap_or(root)
196    } else {
197        root
198    };
199
200    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
201    let mut warnings: Vec<String> = Vec::new();
202
203    if workspace {
204        let linked = crate::core::workspace_config::load_linked_projects(root);
205        warnings.extend(linked.warnings);
206        roots.extend(linked.roots);
207    }
208
209    let mut total_files = 0usize;
210    let mut total_chunks = 0usize;
211    for r in roots {
212        let (idx, w) = crate::core::artifact_index::rebuild_from_scratch(&r);
213        warnings.extend(w);
214        total_files += idx.files.len();
215        total_chunks += idx.doc_count;
216    }
217
218    if warnings.is_empty() {
219        format!("Reindexed artifacts: {total_files} files, {total_chunks} chunks")
220    } else {
221        format!(
222            "Reindexed artifacts: {total_files} files, {total_chunks} chunks ({} warning(s))",
223            warnings.len()
224        )
225    }
226}
227
228/// Find chunks semantically related to a given file location.
229///
230/// Marchionini (2006): Exploratory search navigates from known points.
231/// This enables "show me similar code" workflows.
232pub fn handle_find_related(
233    file_path: &str,
234    line: usize,
235    project_root: &str,
236    top_k: usize,
237    crp_mode: CrpMode,
238) -> String {
239    let root = Path::new(project_root);
240    if !root.exists() {
241        return format!("ERR: path does not exist: {project_root}");
242    }
243
244    let index = BM25Index::load_or_build(root);
245    if index.doc_count == 0 {
246        return "ERR: empty index. Try action=reindex first.".to_string();
247    }
248
249    let source_chunk = index
250        .chunks
251        .iter()
252        .find(|c| c.file_path == file_path && c.start_line <= line && c.end_line >= line);
253
254    let Some(source_chunk) = source_chunk else {
255        return format!(
256            "ERR: no indexed chunk found at {file_path}:{line}. Try action=reindex first."
257        );
258    };
259
260    let query_text = source_chunk.content.clone();
261    let source_file = source_chunk.file_path.clone();
262    let source_start = source_chunk.start_line;
263
264    let compact = crp_mode != CrpMode::Off;
265
266    let results = find_related_internal(&query_text, root, &index, top_k + 5, compact);
267
268    let mut lines: Vec<String> = results
269        .into_iter()
270        .filter(|l| !l.contains(&format!("{source_file}:{source_start}-")))
271        .take(top_k)
272        .collect();
273
274    let header = if compact {
275        format!(
276            "find_related({file_path}:{line}) → {} results\n",
277            lines.len()
278        )
279    } else {
280        format!("Find related to {file_path}:{line} (semantic similarity)\n")
281    };
282
283    lines.insert(0, header);
284    lines.join("")
285}
286
287fn find_related_internal(
288    query: &str,
289    root: &Path,
290    index: &BM25Index,
291    top_k: usize,
292    compact: bool,
293) -> Vec<String> {
294    let Ok(filter) = SearchFilter::new(None, None) else {
295        return vec!["ERR: filter init failed\n".to_string()];
296    };
297    let output = hybrid_search_mode(query, root, index, top_k, compact, &filter);
298    output.lines().map(|l| format!("{l}\n")).collect()
299}
300
/// Returns at most the first `max` characters of `q`, always cutting on a
/// character boundary. Strings that already fit are returned unchanged.
fn truncate_query(q: &str, max: usize) -> &str {
    // The byte length is an upper bound on the character count, so the scan is
    // only needed when the string could actually be too long.
    if q.len() > max {
        if let Some((cut, _)) = q.char_indices().nth(max) {
            return &q[..cut];
        }
    }
    q
}
310
// Per-thread slot for the shared BM25 cache handle. It starts out empty
// (`None`); `set_thread_cache` fills it and `get_thread_cache` hands out clones.
std::thread_local! {
    static BM25_SHARED_CACHE: std::cell::RefCell<Option<crate::core::bm25_cache::SharedBm25Cache>> =
        const { std::cell::RefCell::new(None) };
}
315
316/// Set the shared BM25 cache for the current thread (called from the registered handler).
317pub fn set_thread_cache(cache: crate::core::bm25_cache::SharedBm25Cache) {
318    BM25_SHARED_CACHE.with(|c| {
319        *c.borrow_mut() = Some(cache);
320    });
321}
322
323/// Clone the current thread's shared BM25 cache, if any. Lets composer tools
324/// propagate the resident cache into a budgeted worker thread so a slow cold
325/// build warms the *same* cache instead of being wasted work.
326pub fn get_thread_cache() -> Option<crate::core::bm25_cache::SharedBm25Cache> {
327    BM25_SHARED_CACHE.with(|c| c.borrow().clone())
328}
329
/// Result of BM25 index loading — may indicate background build in progress.
pub(crate) enum Bm25LoadResult {
    /// An index is available (taken from the shared cache or loaded from disk).
    Ready(std::sync::Arc<BM25Index>),
    /// No index is available yet: a build is already running, or the cold-build
    /// wait budget ran out before one could be loaded. Callers should retry later.
    Building,
}
335
336fn load_or_refresh_bm25(root: &Path) -> Bm25LoadResult {
337    let cached = BM25_SHARED_CACHE.with(|c| {
338        let borrow = c.borrow();
339        borrow
340            .as_ref()
341            .and_then(|cache| crate::core::bm25_cache::get_or_background(cache, root))
342    });
343    if let Some(idx) = cached {
344        return Bm25LoadResult::Ready(idx);
345    }
346
347    let root_str = root.to_string_lossy().to_string();
348
349    if let Some(idx) = crate::core::index_orchestrator::try_load_bm25_index(&root_str) {
350        let idx = std::sync::Arc::new(idx);
351        store_in_thread_cache(root, &idx);
352        return Bm25LoadResult::Ready(idx);
353    }
354
355    if crate::core::index_orchestrator::is_building() {
356        return Bm25LoadResult::Building;
357    }
358
359    // Cold path: kick off the background build (which persists the index to
360    // disk) instead of doing an unbounded synchronous build in the MCP handler.
361    // Wait briefly so small/medium repos still return Ready on the first call;
362    // larger repos return Building and the agent retries against the warm cache
363    // once the worker has persisted the index (#150).
364    crate::core::index_orchestrator::ensure_all_background(&root_str);
365
366    let deadline = std::time::Instant::now() + bm25_cold_build_budget();
367    loop {
368        if let Some(idx) = crate::core::index_orchestrator::try_load_bm25_index(&root_str) {
369            let idx = std::sync::Arc::new(idx);
370            store_in_thread_cache(root, &idx);
371            return Bm25LoadResult::Ready(idx);
372        }
373        if std::time::Instant::now() >= deadline {
374            return Bm25LoadResult::Building;
375        }
376        std::thread::sleep(std::time::Duration::from_millis(50));
377    }
378}
379
/// How long the MCP handler waits for a cold BM25 build before answering
/// `Building`. Defaults to 3000 ms; `LEAN_CTX_BM25_COLD_BUDGET_MS` overrides it
/// when it holds a valid unsigned integer (anything else falls back to the default).
fn bm25_cold_build_budget() -> std::time::Duration {
    const DEFAULT_BUDGET_MS: u64 = 3000;

    let millis = match std::env::var("LEAN_CTX_BM25_COLD_BUDGET_MS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_BUDGET_MS),
        Err(_) => DEFAULT_BUDGET_MS,
    };
    std::time::Duration::from_millis(millis)
}
389
390fn store_in_thread_cache(root: &Path, idx: &std::sync::Arc<BM25Index>) {
391    BM25_SHARED_CACHE.with(|c| {
392        let borrow = c.borrow();
393        if let Some(cache) = borrow.as_ref() {
394            let mut guard = cache
395                .lock()
396                .unwrap_or_else(std::sync::PoisonError::into_inner);
397            *guard = Some(crate::core::bm25_cache::Bm25CacheEntry {
398                root: root.to_path_buf(),
399                index: std::sync::Arc::clone(idx),
400                loaded_at: std::time::Instant::now(),
401                fingerprint: crate::core::bm25_cache::index_fingerprint(root),
402            });
403        }
404    });
405}
406
/// Number of candidates to request from an index so that `top_k` results can
/// still be returned after post-filtering.
///
/// * Without an active filter the answer is simply `top_k`.
/// * With a filter, ten times `max(top_k, 10)` candidates are requested,
///   clamped to `50..=500`, but never fewer than `top_k` itself. Previously
///   a filtered search with `top_k > 500` was capped at 500 candidates and so
///   could never return the requested number of results.
fn filtered_candidate_k(top_k: usize, filtered: bool) -> usize {
    if !filtered {
        return top_k;
    }
    let candidates = (top_k.max(10)).saturating_mul(10);
    candidates.clamp(50, 500).max(top_k)
}
414
/// Constant `k` in the reciprocal-rank-fusion term `1 / (k + rank + 1)` used
/// when merging the per-project result lists of a workspace search.
const WORKSPACE_RRF_K: f64 = 60.0;
416
417fn artifacts_search(
418    query: &str,
419    root: &Path,
420    top_k: usize,
421    compact: bool,
422    filter: &SearchFilter,
423    workspace: bool,
424) -> String {
425    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
426    let mut warnings: Vec<String> = Vec::new();
427
428    if workspace {
429        let linked = crate::core::workspace_config::load_linked_projects(root);
430        warnings.extend(linked.warnings);
431        roots.extend(linked.roots);
432    }
433    roots.sort();
434    roots.dedup();
435
436    let mut per_project: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)> = Vec::new();
437    let mut total_chunks = 0usize;
438
439    for r in &roots {
440        let label = label_for_root(r);
441        let (idx, w) = crate::core::artifact_index::load_or_build(r);
442        warnings.extend(w);
443        total_chunks += idx.doc_count;
444        if idx.doc_count == 0 {
445            continue;
446        }
447
448        let mut results = idx.search(query, filtered_candidate_k(top_k, filter.is_active()));
449        if filter.is_active() {
450            results.retain(|x| filter.matches(&x.file_path));
451        }
452        results.truncate(top_k);
453
454        for res in &mut results {
455            res.file_path = if workspace {
456                format!("[project:{label}] [artifact] {}", res.file_path)
457            } else {
458                format!("[artifact] {}", res.file_path)
459            };
460        }
461
462        per_project.push((label, results));
463    }
464
465    let mut fused: Vec<crate::core::bm25_index::SearchResult> = if per_project.len() <= 1 {
466        per_project
467            .into_iter()
468            .next()
469            .map(|(_, v)| v)
470            .unwrap_or_default()
471    } else {
472        rrf_merge_bm25(per_project, top_k)
473    };
474
475    if fused.is_empty() {
476        return "No artifact files found to index.".to_string();
477    }
478
479    fused.truncate(top_k);
480
481    let header = if compact {
482        if workspace {
483            format!(
484                "semantic_search(artifacts,workspace,{top_k}) → {} results, projects={}, {} chunks indexed\n",
485                fused.len(),
486                roots.len(),
487                total_chunks
488            )
489        } else {
490            format!(
491                "semantic_search(artifacts,{top_k}) → {} results, {} chunks indexed\n",
492                fused.len(),
493                total_chunks
494            )
495        }
496    } else if workspace {
497        format!(
498            "Semantic search (Artifacts/Workspace): \"{}\" ({} results from {} projects)\n",
499            truncate_query(query, 60),
500            fused.len(),
501            roots.len()
502        )
503    } else {
504        format!(
505            "Semantic search (Artifacts): \"{}\" ({} results)\n",
506            truncate_query(query, 60),
507            fused.len()
508        )
509    };
510
511    let mut out = format!("{header}{}", format_search_results(&fused, compact));
512    if !warnings.is_empty() && !compact {
513        out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
514        for w in warnings.iter().take(20) {
515            out.push_str(&format!("- {w}\n"));
516        }
517    }
518    out
519}
520
521fn workspace_search(
522    query: &str,
523    root: &Path,
524    top_k: usize,
525    compact: bool,
526    filter: &SearchFilter,
527    mode: &str,
528) -> String {
529    let linked = crate::core::workspace_config::load_linked_projects(root);
530    let mut warnings = linked.warnings;
531
532    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
533    roots.extend(linked.roots);
534    roots.sort();
535    roots.dedup();
536
537    let mut per_project: Vec<(String, Vec<HybridResult>)> = Vec::new();
538    let mut avg_cov: Option<f64> = None;
539    let mut cov_count = 0usize;
540
541    for r in &roots {
542        let label = label_for_root(r);
543        let index = BM25Index::load_or_build(r);
544        if index.doc_count == 0 {
545            continue;
546        }
547
548        let mut results: Vec<HybridResult> = match mode {
549            "bm25" => {
550                let mut bm25 = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
551                if filter.is_active() {
552                    bm25.retain(|x| filter.matches(&x.file_path));
553                }
554                bm25.truncate(top_k);
555                bm25.into_iter()
556                    .map(HybridResult::from_bm25_public)
557                    .collect()
558            }
559            "dense" => {
560                #[cfg(feature = "embeddings")]
561                {
562                    match dense_results_for_root(query, r, &index, top_k, filter) {
563                        Ok((v, cov)) => {
564                            avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
565                            cov_count += 1;
566                            v
567                        }
568                        Err(e) => {
569                            warnings.push(format!("[{label}] dense search failed: {e}"));
570                            let mut bm25 = index
571                                .search(query, filtered_candidate_k(top_k, filter.is_active()));
572                            if filter.is_active() {
573                                bm25.retain(|x| filter.matches(&x.file_path));
574                            }
575                            bm25.truncate(top_k);
576                            bm25.into_iter()
577                                .map(HybridResult::from_bm25_public)
578                                .collect()
579                        }
580                    }
581                }
582                #[cfg(not(feature = "embeddings"))]
583                {
584                    let _ = (&label, &warnings);
585                    let mut bm25 =
586                        index.search(query, filtered_candidate_k(top_k, filter.is_active()));
587                    if filter.is_active() {
588                        bm25.retain(|x| filter.matches(&x.file_path));
589                    }
590                    bm25.truncate(top_k);
591                    bm25.into_iter()
592                        .map(HybridResult::from_bm25_public)
593                        .collect()
594                }
595            }
596            _ => {
597                #[cfg(feature = "embeddings")]
598                {
599                    match hybrid_results_for_root(query, r, &index, top_k, filter) {
600                        Ok((v, cov)) => {
601                            avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
602                            cov_count += 1;
603                            v
604                        }
605                        Err(e) => {
606                            warnings.push(format!("[{label}] hybrid search failed: {e}"));
607                            let mut bm25 = index
608                                .search(query, filtered_candidate_k(top_k, filter.is_active()));
609                            if filter.is_active() {
610                                bm25.retain(|x| filter.matches(&x.file_path));
611                            }
612                            bm25.truncate(top_k);
613                            bm25.into_iter()
614                                .map(HybridResult::from_bm25_public)
615                                .collect()
616                        }
617                    }
618                }
619                #[cfg(not(feature = "embeddings"))]
620                {
621                    let _ = (&label, &warnings);
622                    let mut bm25 =
623                        index.search(query, filtered_candidate_k(top_k, filter.is_active()));
624                    if filter.is_active() {
625                        bm25.retain(|x| filter.matches(&x.file_path));
626                    }
627                    bm25.truncate(top_k);
628                    bm25.into_iter()
629                        .map(HybridResult::from_bm25_public)
630                        .collect()
631                }
632            }
633        };
634
635        for res in &mut results {
636            res.file_path = format!("[project:{label}] {}", res.file_path);
637        }
638        per_project.push((label, results));
639    }
640
641    let mut fused: Vec<HybridResult> = if per_project.len() <= 1 {
642        per_project
643            .into_iter()
644            .next()
645            .map(|(_, v)| v)
646            .unwrap_or_default()
647    } else {
648        rrf_merge_hybrid(per_project, top_k)
649    };
650
651    if fused.is_empty() {
652        return "No code files found to index.".to_string();
653    }
654
655    fused.truncate(top_k);
656    let cov = avg_cov.and_then(|s| {
657        if cov_count == 0 {
658            None
659        } else {
660            Some(s / cov_count as f64)
661        }
662    });
663
664    let header = if compact {
665        match (mode, cov) {
666            (_, Some(c)) => format!(
667                "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}, embed_cov={:.0}%\n",
668                fused.len(),
669                roots.len(),
670                c * 100.0
671            ),
672            _ => format!(
673                "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}\n",
674                fused.len(),
675                roots.len()
676            ),
677        }
678    } else {
679        format!(
680            "Workspace semantic search ({mode}): \"{}\" ({} results from {} projects)\n",
681            truncate_query(query, 60),
682            fused.len(),
683            roots.len()
684        )
685    };
686
687    let mut out = format!("{header}{}", format_hybrid_results(&fused, compact));
688    if !warnings.is_empty() && !compact {
689        out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
690        for w in warnings.iter().take(20) {
691            out.push_str(&format!("- {w}\n"));
692        }
693    }
694    out
695}
696
697fn rrf_merge_hybrid(lists: Vec<(String, Vec<HybridResult>)>, top_k: usize) -> Vec<HybridResult> {
698    use std::collections::HashMap;
699
700    let mut acc: HashMap<String, (HybridResult, f64)> = HashMap::new();
701    for (label, results) in lists {
702        for (rank, r) in results.into_iter().enumerate() {
703            let key = format!(
704                "{label}|{}|{}|{}|{}",
705                r.file_path, r.symbol_name, r.start_line, r.end_line
706            );
707            let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
708            acc.entry(key)
709                .and_modify(|(_, s)| *s += rrf)
710                .or_insert((r, rrf));
711        }
712    }
713
714    let mut out: Vec<HybridResult> = acc
715        .into_values()
716        .map(|(mut r, s)| {
717            r.rrf_score = s;
718            r
719        })
720        .collect();
721    out.sort_by(|a, b| {
722        b.rrf_score
723            .partial_cmp(&a.rrf_score)
724            .unwrap_or(std::cmp::Ordering::Equal)
725            .then_with(|| a.file_path.cmp(&b.file_path))
726            .then_with(|| a.symbol_name.cmp(&b.symbol_name))
727            .then_with(|| a.start_line.cmp(&b.start_line))
728            .then_with(|| a.end_line.cmp(&b.end_line))
729    });
730    out.truncate(top_k);
731    out
732}
733
734fn rrf_merge_bm25(
735    lists: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)>,
736    top_k: usize,
737) -> Vec<crate::core::bm25_index::SearchResult> {
738    use std::collections::HashMap;
739
740    let mut acc: HashMap<String, (crate::core::bm25_index::SearchResult, f64)> = HashMap::new();
741    for (label, results) in lists {
742        for (rank, r) in results.into_iter().enumerate() {
743            let key = format!(
744                "{label}|{}|{}|{}|{}",
745                r.file_path, r.symbol_name, r.start_line, r.end_line
746            );
747            let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
748            acc.entry(key)
749                .and_modify(|(_, s)| *s += rrf)
750                .or_insert((r, rrf));
751        }
752    }
753
754    let mut out: Vec<crate::core::bm25_index::SearchResult> = acc
755        .into_values()
756        .map(|(mut r, s)| {
757            r.score = s;
758            r
759        })
760        .collect();
761    out.sort_by(|a, b| {
762        b.score
763            .partial_cmp(&a.score)
764            .unwrap_or(std::cmp::Ordering::Equal)
765            .then_with(|| a.file_path.cmp(&b.file_path))
766            .then_with(|| a.symbol_name.cmp(&b.symbol_name))
767            .then_with(|| a.start_line.cmp(&b.start_line))
768            .then_with(|| a.end_line.cmp(&b.end_line))
769    });
770    out.truncate(top_k);
771    out
772}
773
/// Dense (embedding-only) search for a single root.
///
/// Loads the embedding engine and index, makes sure embeddings are available
/// for the BM25 index, resolves the dense backend from the environment and
/// runs the query. Returns up to `top_k` hits together with the coverage value
/// reported by `ensure_embeddings`.
///
/// # Errors
/// Propagates the error of any of those steps.
#[cfg(feature = "embeddings")]
fn dense_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    let (engine, mut embedding_index) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embedding_index)?;
    let backend = crate::core::dense_backend::DenseBackendKind::try_from_env()?;

    // Only hand a predicate to the backend when there is something to filter on.
    let passes_filter = |candidate: &str| filter.matches(candidate);
    let predicate: Option<&dyn Fn(&str) -> bool> = if filter.is_active() {
        Some(&passes_filter as &dyn Fn(&str) -> bool)
    } else {
        None
    };
    let fetch = filtered_candidate_k(top_k, filter.is_active());

    let mut hits = crate::core::dense_backend::dense_results_as_hybrid(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        fetch,
        predicate,
    )?;
    hits.truncate(top_k);

    Ok((hits, coverage))
}
808
/// Hybrid search for a single root.
///
/// Same preparation as the dense variant (engine, embeddings, backend), plus
/// the hybrid configuration and optional graph ranks derived from the latest
/// session. When the configured SPLADE weight is positive and SPLADE retrieval
/// returns anything, the hits are re-scored with `boost_with_splade` before
/// being cut to `top_k`. Returns the hits together with the coverage value
/// reported by `ensure_embeddings`.
///
/// # Errors
/// Propagates the error of any preparation step or of the backend search.
#[cfg(feature = "embeddings")]
fn hybrid_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    let (engine, mut embedding_index) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embedding_index)?;
    let backend = crate::core::dense_backend::DenseBackendKind::try_from_env()?;
    let config = HybridConfig::from_config();

    // Only hand a predicate to the backend when there is something to filter on.
    let passes_filter = |candidate: &str| filter.matches(candidate);
    let predicate: Option<&dyn Fn(&str) -> bool> = if filter.is_active() {
        Some(&passes_filter as &dyn Fn(&str) -> bool)
    } else {
        None
    };
    let fetch = filtered_candidate_k(top_k, filter.is_active());
    let graph_ranks = graph_rrf_ranks_for_search_root(root);

    let mut hits = crate::core::dense_backend::hybrid_results(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        fetch,
        &config,
        predicate,
        graph_ranks.as_ref(),
    )?;

    if config.splade_weight > 0.0 {
        let expansions = crate::core::splade_retrieval::hybrid_retrieve(query, index, fetch);
        if !expansions.is_empty() {
            boost_with_splade(&mut hits, &expansions, config.splade_weight);
        }
    }

    hits.truncate(top_k);
    Ok((hits, coverage))
}
854
855/// Boost existing hybrid results with SPLADE expansion scores.
856fn boost_with_splade(
857    results: &mut [HybridResult],
858    splade: &[crate::core::splade_retrieval::SpladeResult],
859    weight: f64,
860) {
861    use std::collections::HashMap;
862    let rrf_k = 60.0_f64;
863
864    let boosts: HashMap<&str, f64> = splade
865        .iter()
866        .enumerate()
867        .map(|(rank, sr)| (sr.file_path.as_str(), weight / (rrf_k + rank as f64 + 1.0)))
868        .collect();
869
870    for r in results.iter_mut() {
871        if let Some(&boost) = boosts.get(r.file_path.as_str()) {
872            r.rrf_score += boost;
873        }
874    }
875
876    results.sort_by(|a, b| {
877        b.rrf_score
878            .partial_cmp(&a.rrf_score)
879            .unwrap_or(std::cmp::Ordering::Equal)
880    });
881}
882
/// Short display label for a project root: its final path component when that
/// is non-empty valid UTF-8, otherwise the whole path (lossily converted).
fn label_for_root(root: &Path) -> String {
    match root.file_name().and_then(|name| name.to_str()) {
        Some(name) if !name.is_empty() => name.to_owned(),
        _ => root.to_string_lossy().into_owned(),
    }
}
890
891fn graph_rrf_ranks_for_search_root(
892    root: &Path,
893) -> Option<std::collections::HashMap<String, usize>> {
894    let root_s = root.to_string_lossy().to_string();
895    let session = crate::core::session::SessionState::load_latest_for_project_root(&root_s)?;
896
897    if session.files_touched.is_empty() {
898        return None;
899    }
900
901    let recent: Vec<String> = session
902        .files_touched
903        .iter()
904        .rev()
905        .filter(|f| path_under_search_root(&f.path, root))
906        .take(12)
907        .map(|f| f.path.clone())
908        .collect();
909
910    if recent.is_empty() {
911        return None;
912    }
913
914    crate::core::graph_context::graph_neighbor_ranks_for_recent_files(&root_s, &recent, 40, 120)
915}
916
917fn path_under_search_root(path: &str, root: &Path) -> bool {
918    let p = std::path::Path::new(path);
919    if p.is_absolute() {
920        let root_norm = crate::core::pathutil::safe_canonicalize_or_self(root);
921        let path_norm = crate::core::pathutil::safe_canonicalize_or_self(p);
922        path_norm.starts_with(&root_norm)
923    } else {
924        true
925    }
926}
927
928fn hybrid_search_mode(
929    query: &str,
930    root: &Path,
931    index: &BM25Index,
932    top_k: usize,
933    compact: bool,
934    filter: &SearchFilter,
935) -> String {
936    #[cfg(feature = "embeddings")]
937    {
938        let (engine, mut embed_idx) = match load_engine_and_index(root) {
939            Ok(v) => v,
940            Err(e) => return format!("ERR: {e}"),
941        };
942
943        let (aligned, coverage, changed_files) =
944            match ensure_embeddings(root, index, engine, &mut embed_idx) {
945                Ok(v) => v,
946                Err(e) => return format!("ERR: {e}"),
947            };
948
949        let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
950            Ok(v) => v,
951            Err(e) => return format!("ERR: {e}"),
952        };
953
954        let cfg = HybridConfig::from_config();
955        let filter_fn = |p: &str| filter.matches(p);
956        let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
957            .is_active()
958            .then_some(&filter_fn as &dyn Fn(&str) -> bool);
959        let graph_ranks = graph_rrf_ranks_for_search_root(root);
960        let graph_ranks_ref = graph_ranks.as_ref();
961        let mut results = match crate::core::dense_backend::hybrid_results(
962            backend,
963            root,
964            index,
965            engine,
966            &aligned,
967            &changed_files,
968            query,
969            top_k,
970            &cfg,
971            filter_pred,
972            graph_ranks_ref,
973        ) {
974            Ok(v) => v,
975            Err(e) => return format!("ERR: {e}"),
976        };
977
978        if cfg.splade_weight > 0.0 {
979            let splade = crate::core::splade_retrieval::hybrid_retrieve(query, index, top_k);
980            if !splade.is_empty() {
981                boost_with_splade(&mut results, &splade, cfg.splade_weight);
982            }
983        }
984
985        results.truncate(top_k);
986
987        let header = if compact {
988            format!(
989                "semantic_search(hybrid,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
990                results.len(),
991                index.doc_count,
992                coverage * 100.0
993            )
994        } else {
995            format!(
996                "Semantic search (Hybrid): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
997                truncate_query(query, 60),
998                results.len(),
999                index.doc_count,
1000                coverage * 100.0
1001            )
1002        };
1003
1004        format!("{header}{}", format_hybrid_results(&results, compact))
1005    }
1006    #[cfg(not(feature = "embeddings"))]
1007    {
1008        let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
1009        if filter.is_active() {
1010            results.retain(|x| filter.matches(&x.file_path));
1011        }
1012
1013        if let Some(graph_ranks) = graph_rrf_ranks_for_search_root(root) {
1014            const GRAPH_RRF_K: f64 = 60.0;
1015            for r in &mut results {
1016                if let Some(&rank) = graph_ranks.get(&r.file_path) {
1017                    r.score += 1.0 / (GRAPH_RRF_K + rank as f64 + 1.0);
1018                }
1019            }
1020            results.sort_by(|a, b| {
1021                b.score
1022                    .partial_cmp(&a.score)
1023                    .unwrap_or(std::cmp::Ordering::Equal)
1024            });
1025        }
1026
1027        results.truncate(top_k);
1028        let graph_tag = if graph_rrf_ranks_for_search_root(root).is_some() {
1029            "+graph"
1030        } else {
1031            ""
1032        };
1033        let header = if compact {
1034            format!(
1035                "semantic_search(bm25{graph_tag},{top_k}) → {} results, {} chunks indexed\n",
1036                results.len(),
1037                index.doc_count
1038            )
1039        } else {
1040            format!(
1041                "Semantic search (BM25{graph_tag}): \"{}\" ({} results from {} indexed chunks)\n",
1042                truncate_query(query, 60),
1043                results.len(),
1044                index.doc_count,
1045            )
1046        };
1047        format!("{header}{}", format_search_results(&results, compact))
1048    }
1049}
1050
1051fn dense_search_mode(
1052    query: &str,
1053    root: &Path,
1054    index: &BM25Index,
1055    top_k: usize,
1056    compact: bool,
1057    filter: &SearchFilter,
1058) -> String {
1059    #[cfg(feature = "embeddings")]
1060    {
1061        let (engine, mut embed_idx) = match load_engine_and_index(root) {
1062            Ok(v) => v,
1063            Err(e) => return format!("ERR: {e}"),
1064        };
1065
1066        let (aligned, coverage, changed_files) =
1067            match ensure_embeddings(root, index, engine, &mut embed_idx) {
1068                Ok(v) => v,
1069                Err(e) => return format!("ERR: {e}"),
1070            };
1071
1072        let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
1073            Ok(v) => v,
1074            Err(e) => return format!("ERR: {e}"),
1075        };
1076
1077        let filter_fn = |p: &str| filter.matches(p);
1078        let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
1079            .is_active()
1080            .then_some(&filter_fn as &dyn Fn(&str) -> bool);
1081
1082        let candidate_k = filtered_candidate_k(top_k, filter.is_active());
1083        let mut results = match crate::core::dense_backend::dense_results_as_hybrid(
1084            backend,
1085            root,
1086            index,
1087            engine,
1088            &aligned,
1089            &changed_files,
1090            query,
1091            candidate_k,
1092            filter_pred,
1093        ) {
1094            Ok(v) => v,
1095            Err(e) => return format!("ERR: {e}"),
1096        };
1097        results.truncate(top_k);
1098
1099        let header = if compact {
1100            format!(
1101                "semantic_search(dense,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
1102                results.len(),
1103                index.doc_count,
1104                coverage * 100.0
1105            )
1106        } else {
1107            format!(
1108                "Semantic search (Dense): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
1109                truncate_query(query, 60),
1110                results.len(),
1111                index.doc_count,
1112                coverage * 100.0
1113            )
1114        };
1115
1116        format!("{header}{}", format_hybrid_results(&results, compact))
1117    }
1118    #[cfg(not(feature = "embeddings"))]
1119    {
1120        "ERR: embeddings feature not enabled".to_string()
1121    }
1122}
1123
/// Loads the shared embedding engine together with the embedding index stored
/// for `root`.
///
/// Fails when the effective memory profile disables embeddings or when the
/// shared engine is unavailable. A missing index, or one whose model or
/// dimensions do not match the engine, is replaced by a fresh empty index
/// (with a warning logged for the mismatch cases). The returned index always
/// has `model_id` set.
#[cfg(feature = "embeddings")]
fn load_engine_and_index(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    let config = crate::core::config::Config::load();
    if !crate::core::config::MemoryProfile::effective(&config).embeddings_enabled() {
        return Err("embeddings disabled by memory_profile=low".into());
    }

    let engine = match crate::core::embeddings::shared_engine() {
        Some(engine) => engine,
        None => return Err("embedding engine load failed".to_string()),
    };

    let model_name = engine.model_name();
    let dims = engine.dimensions();
    let fresh_index = || EmbeddingIndex::new_with_model(dims, model_name);

    let mut idx = match EmbeddingIndex::load(root) {
        Some(loaded) => loaded,
        None => fresh_index(),
    };

    // A stored index is only usable when both model and dimensions match.
    let stale = if let Some((stored, current)) = idx.model_mismatch(model_name) {
        tracing::warn!(
            "[embeddings] model changed: {stored} → {current}. Re-indexing all embeddings."
        );
        true
    } else if idx.dimension_mismatch(dims) {
        tracing::warn!(
            "[embeddings] dimension mismatch: index={}d, engine={}d. Re-indexing.",
            idx.dimensions,
            dims
        );
        true
    } else {
        false
    };
    if stale {
        idx = fresh_index();
    }

    if idx.model_id.is_none() {
        idx.model_id = Some(model_name.to_string());
    }

    Ok((engine, idx))
}
1161
/// Brings `embed_idx` up to date with the chunks of `index` and returns
/// `(aligned embeddings, coverage, files that were re-embedded)`.
///
/// First re-embeds only the chunks of files the index reports as needing an
/// update. If aligned embeddings still cannot be produced afterwards, every
/// chunk is re-embedded once and the full file list is returned instead.
/// The index is saved under `root` after each update.
///
/// # Errors
/// Returns a message when embedding a chunk fails, when saving fails, or when
/// alignment is still unavailable after the full rebuild.
#[cfg(feature = "embeddings")]
fn ensure_embeddings(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    let chunks = &index.chunks;

    let mut stale_files = embed_idx.files_needing_update(chunks);
    stale_files.sort();
    stale_files.dedup();

    if !stale_files.is_empty() {
        let stale_set: std::collections::HashSet<&str> =
            stale_files.iter().map(|f| f.as_str()).collect();

        // Incremental pass: embed only chunks that live in a stale file,
        // stopping at the first embedding failure.
        let refreshed = chunks
            .iter()
            .enumerate()
            .filter(|(_, chunk)| stale_set.contains(chunk.file_path.as_str()))
            .map(|(pos, chunk)| {
                engine
                    .embed(&chunk.content)
                    .map(|vector| (pos, vector))
                    .map_err(|e| format!("embed failed for {}: {e}", chunk.file_path))
            })
            .collect::<Result<Vec<(usize, Vec<f32>)>, String>>()?;

        embed_idx.update(chunks, &refreshed, &stale_files);
        embed_idx
            .save(root)
            .map_err(|e| format!("save embeddings failed: {e}"))?;
    }

    if let Some(aligned) = embed_idx.get_aligned_embeddings(chunks) {
        return Ok((aligned, embed_idx.coverage(chunks.len()), stale_files));
    }

    // Alignment missing: rebuild everything once.
    let mut every_file: Vec<String> = chunks
        .iter()
        .map(|chunk| chunk.file_path.clone())
        .collect();
    every_file.sort();
    every_file.dedup();

    let rebuilt = chunks
        .iter()
        .enumerate()
        .map(|(pos, chunk)| {
            engine
                .embed(&chunk.content)
                .map(|vector| (pos, vector))
                .map_err(|e| format!("embed failed for {}: {e}", chunk.file_path))
        })
        .collect::<Result<Vec<(usize, Vec<f32>)>, String>>()?;

    embed_idx.update(chunks, &rebuilt, &every_file);
    embed_idx
        .save(root)
        .map_err(|e| format!("save embeddings failed: {e}"))?;

    match embed_idx.get_aligned_embeddings(chunks) {
        Some(aligned) => Ok((aligned, embed_idx.coverage(chunks.len()), every_file)),
        None => Err("embedding alignment failed after full rebuild".to_string()),
    }
}
1223
1224struct SearchFilter {
1225    allowed_exts: Option<HashSet<String>>,
1226    path_glob: Option<glob::Pattern>,
1227}
1228
1229impl SearchFilter {
1230    fn new(languages: Option<&[String]>, path_glob: Option<&str>) -> Result<Self, String> {
1231        let allowed_exts = languages.map(normalize_languages);
1232        let path_glob = match path_glob {
1233            None => None,
1234            Some(s) if s.trim().is_empty() => None,
1235            Some(s) => Some(glob::Pattern::new(s).map_err(|e| e.msg.to_string())?),
1236        };
1237        Ok(Self {
1238            allowed_exts,
1239            path_glob,
1240        })
1241    }
1242
1243    fn is_active(&self) -> bool {
1244        self.allowed_exts.is_some() || self.path_glob.is_some()
1245    }
1246
1247    fn matches(&self, rel_path: &str) -> bool {
1248        let rel_path = rel_path.replace('\\', "/");
1249        if let Some(p) = &self.path_glob {
1250            if !p.matches(&rel_path) {
1251                return false;
1252            }
1253        }
1254        if let Some(exts) = &self.allowed_exts {
1255            let ext = Path::new(&rel_path)
1256                .extension()
1257                .and_then(|e| e.to_str())
1258                .unwrap_or("")
1259                .to_lowercase();
1260            if ext.is_empty() || !exts.contains(&ext) {
1261                return false;
1262            }
1263        }
1264        true
1265    }
1266}
1267
/// Expands user-supplied language names or extensions into the set of
/// lower-case file extensions they stand for.
///
/// Each entry is trimmed, stripped of leading dots and lower-cased. Known
/// names map to one or more extensions (e.g. `typescript` → `ts`, `tsx`);
/// unknown non-empty entries are kept verbatim as an extension; blank entries
/// are ignored.
fn normalize_languages(langs: &[String]) -> HashSet<String> {
    let mut exts = HashSet::new();
    for lang in langs {
        let key = lang.trim().trim_start_matches('.').to_lowercase();
        let mapped: &[&str] = match key.as_str() {
            "" => &[],
            "rust" | "rs" => &["rs"],
            "ts" | "typescript" => &["ts", "tsx"],
            "js" | "javascript" => &["js", "jsx", "mjs", "cjs"],
            "py" | "python" => &["py"],
            "go" => &["go"],
            "java" => &["java"],
            "ruby" | "rb" => &["rb"],
            "php" => &["php"],
            "c" => &["c", "h"],
            "cpp" | "c++" | "cc" => &["cpp", "hpp", "cc", "hh"],
            "cs" | "csharp" => &["cs"],
            "swift" => &["swift"],
            "kt" | "kotlin" => &["kt", "kts"],
            "json" => &["json"],
            "yaml" | "yml" => &["yaml", "yml"],
            // Anything unrecognized is taken to be a literal extension.
            unknown => {
                exts.insert(unknown.to_string());
                continue;
            }
        };
        exts.extend(mapped.iter().map(|ext| (*ext).to_string()));
    }
    exts
}
1336
/// Eval-harness entry point for loading the embedding engine and the
/// embedding index of `root`.
///
/// Delegates directly to the private `load_engine_and_index`; the result,
/// including any error message, is passed through unchanged.
#[cfg(feature = "embeddings")]
pub fn load_engine_and_index_pub(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    // Thin re-export so the private loader itself can stay non-public.
    load_engine_and_index(root)
}
1344
/// Eval-harness entry point for bringing a project's embeddings up to date.
///
/// Delegates directly to the private `ensure_embeddings` with the same
/// arguments and returns its result unchanged.
#[cfg(feature = "embeddings")]
pub fn ensure_embeddings_for_eval(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    // Thin re-export so the private implementation can stay non-public.
    ensure_embeddings(root, index, engine, embed_idx)
}
1355
1356/// Public wrapper for eval harness: apply SPLADE boosting.
1357pub fn boost_with_splade_pub(
1358    results: &mut [HybridResult],
1359    splade: &[crate::core::splade_retrieval::SpladeResult],
1360    weight: f64,
1361) {
1362    boost_with_splade(results, splade, weight);
1363}
1364
/// Unit tests for `SearchFilter` matching.
#[cfg(test)]
mod filter_tests {
    use super::*;

    /// A language name restricts matches to that language's extensions.
    #[test]
    fn filter_language_rust() {
        let langs = ["rust".to_string()];
        let filter =
            SearchFilter::new(Some(&langs[..]), None).expect("language filter should build");

        assert!(filter.matches("src/main.rs"));
        assert!(!filter.matches("src/main.ts"));
    }

    /// A path glob restricts matches to paths under the given prefix.
    #[test]
    fn filter_path_glob() {
        let filter =
            SearchFilter::new(None, Some("rust/src/**")).expect("glob filter should build");

        assert!(filter.matches("rust/src/core/mod.rs"));
        assert!(!filter.matches("website/src/pages/index.astro"));
    }
}
1383
/// Tests that result fusion gives a stable order when scores tie.
#[cfg(test)]
mod determinism_tests {
    use super::*;

    /// Two input lists with swapped ranks give both files the same fused
    /// score; the merged order must still be `a.rs` before `b.rs`.
    #[test]
    fn rrf_merge_hybrid_is_deterministic_on_ties() {
        // Fixture: results that differ only in file path and snippet.
        let make = |file_path: &str, snippet: &str| HybridResult {
            file_path: file_path.to_string(),
            symbol_name: "foo".to_string(),
            kind: crate::core::bm25_index::ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            snippet: snippet.to_string(),
            rrf_score: 0.0,
            bm25_score: None,
            dense_score: None,
            bm25_rank: None,
            dense_rank: None,
        };
        let a = make("a.rs", "a");
        let b = make("b.rs", "b");

        // Two lists with swapped ranks yield identical RRF sums for a and b.
        let lists = vec![
            ("root".to_string(), vec![a.clone(), b.clone()]),
            ("root".to_string(), vec![b, a]),
        ];
        let fused = rrf_merge_hybrid(lists, 10);

        let order: Vec<&str> = fused.iter().map(|r| r.file_path.as_str()).collect();
        assert_eq!(fused.len(), 2);
        assert_eq!(order[0], "a.rs");
        assert_eq!(order[1], "b.rs");
    }
}