Skip to main content

lean_ctx/tools/
ctx_semantic_search.rs

1use std::collections::HashSet;
2use std::path::Path;
3
4use crate::core::bm25_index::{format_search_results, BM25Index};
5use crate::core::embedding_index::EmbeddingIndex;
6#[cfg(feature = "embeddings")]
7use crate::core::embeddings::EmbeddingEngine;
8use crate::core::hybrid_search::{format_hybrid_results, HybridConfig, HybridResult};
9use crate::tools::CrpMode;
10
11/// Performs semantic code search using BM25, dense embeddings, or hybrid ranking.
12#[allow(clippy::too_many_arguments)]
13pub fn handle(
14    query: &str,
15    path: &str,
16    top_k: usize,
17    crp_mode: CrpMode,
18    languages: Option<&[String]>,
19    path_glob: Option<&str>,
20    mode: Option<&str>,
21    workspace: Option<bool>,
22    artifacts: Option<bool>,
23) -> String {
24    let root = Path::new(path);
25    if !root.exists() {
26        return format!("ERR: path does not exist: {path}");
27    }
28
29    let root = if root.is_file() {
30        root.parent().unwrap_or(root)
31    } else {
32        root
33    };
34
35    let filter = match SearchFilter::new(languages, path_glob) {
36        Ok(f) => f,
37        Err(e) => return format!("ERR: invalid filter: {e}"),
38    };
39
40    let compact = crp_mode.is_tdd();
41    let mode = mode.unwrap_or("hybrid").to_lowercase();
42    let workspace = workspace.unwrap_or(false);
43    let artifacts = artifacts.unwrap_or(false);
44
45    if artifacts {
46        return artifacts_search(query, root, top_k, compact, &filter, workspace);
47    }
48    if workspace {
49        return workspace_search(query, root, top_k, compact, &filter, &mode);
50    }
51
52    let index = match load_or_refresh_bm25(root) {
53        Bm25LoadResult::Ready(idx) => idx,
54        Bm25LoadResult::Building => {
55            return "BM25 index is being built in the background. \
56                    Run ctx_semantic_search again in ~30s, or use action=reindex to wait for completion."
57                .to_string();
58        }
59    };
60    if index.doc_count == 0 {
61        return "No code files found to index.".to_string();
62    }
63
64    match mode.as_str() {
65        "bm25" => {
66            let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
67            if filter.is_active() {
68                results.retain(|x| filter.matches(&x.file_path));
69            }
70            results.truncate(top_k);
71
72            let header = if compact {
73                format!(
74                    "semantic_search(bm25,{top_k}) → {} results, {} chunks indexed\n",
75                    results.len(),
76                    index.doc_count
77                )
78            } else {
79                format!(
80                    "Semantic search (BM25): \"{}\" ({} results from {} indexed chunks)\n",
81                    truncate_query(query, 60),
82                    results.len(),
83                    index.doc_count,
84                )
85            };
86            format!("{header}{}", format_search_results(&results, compact))
87        }
88        "dense" => dense_search_mode(query, root, &index, top_k, compact, &filter),
89        _ => hybrid_search_mode(query, root, &index, top_k, compact, &filter),
90    }
91}
92
93/// Rebuilds the BM25 search index for the given directory from scratch.
94pub fn handle_reindex(path: &str) -> String {
95    let root = Path::new(path);
96    if !root.exists() {
97        return format!("ERR: path does not exist: {path}");
98    }
99    let root = if root.is_file() {
100        root.parent().unwrap_or(root)
101    } else {
102        root
103    };
104
105    let idx = BM25Index::build_from_directory(root);
106    let files = idx.files.len();
107    let chunks = idx.doc_count;
108    let _ = idx.save(root);
109
110    format!("Reindexed {path}: {files} files, {chunks} chunks")
111}
112
113pub fn handle_reindex_artifacts(path: &str, workspace: bool) -> String {
114    let root = Path::new(path);
115    if !root.exists() {
116        return format!("ERR: path does not exist: {path}");
117    }
118    let root = if root.is_file() {
119        root.parent().unwrap_or(root)
120    } else {
121        root
122    };
123
124    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
125    let mut warnings: Vec<String> = Vec::new();
126
127    if workspace {
128        let linked = crate::core::workspace_config::load_linked_projects(root);
129        warnings.extend(linked.warnings);
130        roots.extend(linked.roots);
131    }
132
133    let mut total_files = 0usize;
134    let mut total_chunks = 0usize;
135    for r in roots {
136        let (idx, w) = crate::core::artifact_index::rebuild_from_scratch(&r);
137        warnings.extend(w);
138        total_files += idx.files.len();
139        total_chunks += idx.doc_count;
140    }
141
142    if warnings.is_empty() {
143        format!("Reindexed artifacts: {total_files} files, {total_chunks} chunks")
144    } else {
145        format!(
146            "Reindexed artifacts: {total_files} files, {total_chunks} chunks ({} warning(s))",
147            warnings.len()
148        )
149    }
150}
151
152/// Find chunks semantically related to a given file location.
153///
154/// Marchionini (2006): Exploratory search navigates from known points.
155/// This enables "show me similar code" workflows.
156pub fn handle_find_related(
157    file_path: &str,
158    line: usize,
159    project_root: &str,
160    top_k: usize,
161    crp_mode: CrpMode,
162) -> String {
163    let root = Path::new(project_root);
164    if !root.exists() {
165        return format!("ERR: path does not exist: {project_root}");
166    }
167
168    let index = BM25Index::load_or_build(root);
169    if index.doc_count == 0 {
170        return "ERR: empty index. Try action=reindex first.".to_string();
171    }
172
173    let source_chunk = index
174        .chunks
175        .iter()
176        .find(|c| c.file_path == file_path && c.start_line <= line && c.end_line >= line);
177
178    let Some(source_chunk) = source_chunk else {
179        return format!(
180            "ERR: no indexed chunk found at {file_path}:{line}. Try action=reindex first."
181        );
182    };
183
184    let query_text = source_chunk.content.clone();
185    let source_file = source_chunk.file_path.clone();
186    let source_start = source_chunk.start_line;
187
188    let compact = crp_mode != CrpMode::Off;
189
190    let results = find_related_internal(&query_text, root, &index, top_k + 5, compact);
191
192    let mut lines: Vec<String> = results
193        .into_iter()
194        .filter(|l| !l.contains(&format!("{source_file}:{source_start}-")))
195        .take(top_k)
196        .collect();
197
198    let header = if compact {
199        format!(
200            "find_related({file_path}:{line}) → {} results\n",
201            lines.len()
202        )
203    } else {
204        format!("Find related to {file_path}:{line} (semantic similarity)\n")
205    };
206
207    lines.insert(0, header);
208    lines.join("")
209}
210
211fn find_related_internal(
212    query: &str,
213    root: &Path,
214    index: &BM25Index,
215    top_k: usize,
216    compact: bool,
217) -> Vec<String> {
218    let Ok(filter) = SearchFilter::new(None, None) else {
219        return vec!["ERR: filter init failed\n".to_string()];
220    };
221    let output = hybrid_search_mode(query, root, index, top_k, compact, &filter);
222    output.lines().map(|l| format!("{l}\n")).collect()
223}
224
/// Returns `q` shortened to at most `max` characters, cutting on a character
/// boundary. Strings whose byte length already fits are returned unchanged.
fn truncate_query(q: &str, max: usize) -> &str {
    // Byte length is an upper bound on the character count, so this cheap
    // check lets short inputs skip the scan entirely.
    if q.len() > max {
        if let Some((cut, _)) = q.char_indices().nth(max) {
            return &q[..cut];
        }
    }
    q
}
234
// Per-thread slot holding an optional handle to the shared BM25 cache.
// It starts out empty (`None`); `set_thread_cache` fills it and
// `get_thread_cache` / the index loaders in this file read it.
std::thread_local! {
    static BM25_SHARED_CACHE: std::cell::RefCell<Option<crate::core::bm25_cache::SharedBm25Cache>> =
        const { std::cell::RefCell::new(None) };
}
239
240/// Set the shared BM25 cache for the current thread (called from the registered handler).
241pub fn set_thread_cache(cache: crate::core::bm25_cache::SharedBm25Cache) {
242    BM25_SHARED_CACHE.with(|c| {
243        *c.borrow_mut() = Some(cache);
244    });
245}
246
247/// Clone the current thread's shared BM25 cache, if any. Lets composer tools
248/// propagate the resident cache into a budgeted worker thread so a slow cold
249/// build warms the *same* cache instead of being wasted work.
250pub fn get_thread_cache() -> Option<crate::core::bm25_cache::SharedBm25Cache> {
251    BM25_SHARED_CACHE.with(|c| c.borrow().clone())
252}
253
/// Result of BM25 index loading — may indicate background build in progress.
pub(crate) enum Bm25LoadResult {
    /// An index is available and can be searched right away.
    Ready(std::sync::Arc<BM25Index>),
    /// No index could be loaded and a build is currently running; callers
    /// are expected to retry later.
    Building,
}
259
260fn load_or_refresh_bm25(root: &Path) -> Bm25LoadResult {
261    let cached = BM25_SHARED_CACHE.with(|c| {
262        let borrow = c.borrow();
263        borrow
264            .as_ref()
265            .and_then(|cache| crate::core::bm25_cache::get_or_background(cache, root))
266    });
267    if let Some(idx) = cached {
268        return Bm25LoadResult::Ready(idx);
269    }
270
271    let root_str = root.to_string_lossy().to_string();
272
273    if let Some(idx) = crate::core::index_orchestrator::try_load_bm25_index(&root_str) {
274        let idx = std::sync::Arc::new(idx);
275        store_in_thread_cache(root, &idx);
276        return Bm25LoadResult::Ready(idx);
277    }
278
279    if crate::core::index_orchestrator::is_building() {
280        return Bm25LoadResult::Building;
281    }
282
283    crate::core::index_orchestrator::ensure_all_background(&root_str);
284
285    let idx = std::sync::Arc::new(BM25Index::load_or_build(root));
286    store_in_thread_cache(root, &idx);
287    Bm25LoadResult::Ready(idx)
288}
289
290fn store_in_thread_cache(root: &Path, idx: &std::sync::Arc<BM25Index>) {
291    BM25_SHARED_CACHE.with(|c| {
292        let borrow = c.borrow();
293        if let Some(cache) = borrow.as_ref() {
294            let mut guard = cache
295                .lock()
296                .unwrap_or_else(std::sync::PoisonError::into_inner);
297            *guard = Some(crate::core::bm25_cache::Bm25CacheEntry {
298                root: root.to_path_buf(),
299                index: std::sync::Arc::clone(idx),
300                loaded_at: std::time::Instant::now(),
301                fingerprint: crate::core::bm25_cache::index_fingerprint(root),
302            });
303        }
304    });
305}
306
/// Number of candidates to fetch from an index so that `top_k` results are
/// likely to remain after filtering.
///
/// Without a filter this is exactly `top_k`. With a filter the pool is ten
/// times `max(top_k, 10)`, clamped to `50..=500`, but never smaller than
/// `top_k` itself — otherwise a filtered search with a large `top_k` could
/// return fewer hits than the same search without a filter.
fn filtered_candidate_k(top_k: usize, filtered: bool) -> usize {
    if !filtered {
        return top_k;
    }
    let candidates = (top_k.max(10)).saturating_mul(10);
    candidates.clamp(50, 500).max(top_k)
}
314
/// Constant `k` in the reciprocal-rank term `1 / (k + rank + 1)` used when
/// merging per-project result lists.
const WORKSPACE_RRF_K: f64 = 60.0;
316
317fn artifacts_search(
318    query: &str,
319    root: &Path,
320    top_k: usize,
321    compact: bool,
322    filter: &SearchFilter,
323    workspace: bool,
324) -> String {
325    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
326    let mut warnings: Vec<String> = Vec::new();
327
328    if workspace {
329        let linked = crate::core::workspace_config::load_linked_projects(root);
330        warnings.extend(linked.warnings);
331        roots.extend(linked.roots);
332    }
333    roots.sort();
334    roots.dedup();
335
336    let mut per_project: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)> = Vec::new();
337    let mut total_chunks = 0usize;
338
339    for r in &roots {
340        let label = label_for_root(r);
341        let (idx, w) = crate::core::artifact_index::load_or_build(r);
342        warnings.extend(w);
343        total_chunks += idx.doc_count;
344        if idx.doc_count == 0 {
345            continue;
346        }
347
348        let mut results = idx.search(query, filtered_candidate_k(top_k, filter.is_active()));
349        if filter.is_active() {
350            results.retain(|x| filter.matches(&x.file_path));
351        }
352        results.truncate(top_k);
353
354        for res in &mut results {
355            res.file_path = if workspace {
356                format!("[project:{label}] [artifact] {}", res.file_path)
357            } else {
358                format!("[artifact] {}", res.file_path)
359            };
360        }
361
362        per_project.push((label, results));
363    }
364
365    let mut fused: Vec<crate::core::bm25_index::SearchResult> = if per_project.len() <= 1 {
366        per_project
367            .into_iter()
368            .next()
369            .map(|(_, v)| v)
370            .unwrap_or_default()
371    } else {
372        rrf_merge_bm25(per_project, top_k)
373    };
374
375    if fused.is_empty() {
376        return "No artifact files found to index.".to_string();
377    }
378
379    fused.truncate(top_k);
380
381    let header = if compact {
382        if workspace {
383            format!(
384                "semantic_search(artifacts,workspace,{top_k}) → {} results, projects={}, {} chunks indexed\n",
385                fused.len(),
386                roots.len(),
387                total_chunks
388            )
389        } else {
390            format!(
391                "semantic_search(artifacts,{top_k}) → {} results, {} chunks indexed\n",
392                fused.len(),
393                total_chunks
394            )
395        }
396    } else if workspace {
397        format!(
398            "Semantic search (Artifacts/Workspace): \"{}\" ({} results from {} projects)\n",
399            truncate_query(query, 60),
400            fused.len(),
401            roots.len()
402        )
403    } else {
404        format!(
405            "Semantic search (Artifacts): \"{}\" ({} results)\n",
406            truncate_query(query, 60),
407            fused.len()
408        )
409    };
410
411    let mut out = format!("{header}{}", format_search_results(&fused, compact));
412    if !warnings.is_empty() && !compact {
413        out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
414        for w in warnings.iter().take(20) {
415            out.push_str(&format!("- {w}\n"));
416        }
417    }
418    out
419}
420
421fn workspace_search(
422    query: &str,
423    root: &Path,
424    top_k: usize,
425    compact: bool,
426    filter: &SearchFilter,
427    mode: &str,
428) -> String {
429    let linked = crate::core::workspace_config::load_linked_projects(root);
430    let mut warnings = linked.warnings;
431
432    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
433    roots.extend(linked.roots);
434    roots.sort();
435    roots.dedup();
436
437    let mut per_project: Vec<(String, Vec<HybridResult>)> = Vec::new();
438    let mut avg_cov: Option<f64> = None;
439    let mut cov_count = 0usize;
440
441    for r in &roots {
442        let label = label_for_root(r);
443        let index = BM25Index::load_or_build(r);
444        if index.doc_count == 0 {
445            continue;
446        }
447
448        let mut results: Vec<HybridResult> = match mode {
449            "bm25" => {
450                let mut bm25 = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
451                if filter.is_active() {
452                    bm25.retain(|x| filter.matches(&x.file_path));
453                }
454                bm25.truncate(top_k);
455                bm25.into_iter()
456                    .map(HybridResult::from_bm25_public)
457                    .collect()
458            }
459            "dense" => {
460                #[cfg(feature = "embeddings")]
461                {
462                    match dense_results_for_root(query, r, &index, top_k, filter) {
463                        Ok((v, cov)) => {
464                            avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
465                            cov_count += 1;
466                            v
467                        }
468                        Err(e) => {
469                            warnings.push(format!("[{label}] dense search failed: {e}"));
470                            let mut bm25 = index
471                                .search(query, filtered_candidate_k(top_k, filter.is_active()));
472                            if filter.is_active() {
473                                bm25.retain(|x| filter.matches(&x.file_path));
474                            }
475                            bm25.truncate(top_k);
476                            bm25.into_iter()
477                                .map(HybridResult::from_bm25_public)
478                                .collect()
479                        }
480                    }
481                }
482                #[cfg(not(feature = "embeddings"))]
483                {
484                    let _ = (&label, &warnings);
485                    let mut bm25 =
486                        index.search(query, filtered_candidate_k(top_k, filter.is_active()));
487                    if filter.is_active() {
488                        bm25.retain(|x| filter.matches(&x.file_path));
489                    }
490                    bm25.truncate(top_k);
491                    bm25.into_iter()
492                        .map(HybridResult::from_bm25_public)
493                        .collect()
494                }
495            }
496            _ => {
497                #[cfg(feature = "embeddings")]
498                {
499                    match hybrid_results_for_root(query, r, &index, top_k, filter) {
500                        Ok((v, cov)) => {
501                            avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
502                            cov_count += 1;
503                            v
504                        }
505                        Err(e) => {
506                            warnings.push(format!("[{label}] hybrid search failed: {e}"));
507                            let mut bm25 = index
508                                .search(query, filtered_candidate_k(top_k, filter.is_active()));
509                            if filter.is_active() {
510                                bm25.retain(|x| filter.matches(&x.file_path));
511                            }
512                            bm25.truncate(top_k);
513                            bm25.into_iter()
514                                .map(HybridResult::from_bm25_public)
515                                .collect()
516                        }
517                    }
518                }
519                #[cfg(not(feature = "embeddings"))]
520                {
521                    let _ = (&label, &warnings);
522                    let mut bm25 =
523                        index.search(query, filtered_candidate_k(top_k, filter.is_active()));
524                    if filter.is_active() {
525                        bm25.retain(|x| filter.matches(&x.file_path));
526                    }
527                    bm25.truncate(top_k);
528                    bm25.into_iter()
529                        .map(HybridResult::from_bm25_public)
530                        .collect()
531                }
532            }
533        };
534
535        for res in &mut results {
536            res.file_path = format!("[project:{label}] {}", res.file_path);
537        }
538        per_project.push((label, results));
539    }
540
541    let mut fused: Vec<HybridResult> = if per_project.len() <= 1 {
542        per_project
543            .into_iter()
544            .next()
545            .map(|(_, v)| v)
546            .unwrap_or_default()
547    } else {
548        rrf_merge_hybrid(per_project, top_k)
549    };
550
551    if fused.is_empty() {
552        return "No code files found to index.".to_string();
553    }
554
555    fused.truncate(top_k);
556    let cov = avg_cov.and_then(|s| {
557        if cov_count == 0 {
558            None
559        } else {
560            Some(s / cov_count as f64)
561        }
562    });
563
564    let header = if compact {
565        match (mode, cov) {
566            (_, Some(c)) => format!(
567                "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}, embed_cov={:.0}%\n",
568                fused.len(),
569                roots.len(),
570                c * 100.0
571            ),
572            _ => format!(
573                "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}\n",
574                fused.len(),
575                roots.len()
576            ),
577        }
578    } else {
579        format!(
580            "Workspace semantic search ({mode}): \"{}\" ({} results from {} projects)\n",
581            truncate_query(query, 60),
582            fused.len(),
583            roots.len()
584        )
585    };
586
587    let mut out = format!("{header}{}", format_hybrid_results(&fused, compact));
588    if !warnings.is_empty() && !compact {
589        out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
590        for w in warnings.iter().take(20) {
591            out.push_str(&format!("- {w}\n"));
592        }
593    }
594    out
595}
596
597fn rrf_merge_hybrid(lists: Vec<(String, Vec<HybridResult>)>, top_k: usize) -> Vec<HybridResult> {
598    use std::collections::HashMap;
599
600    let mut acc: HashMap<String, (HybridResult, f64)> = HashMap::new();
601    for (label, results) in lists {
602        for (rank, r) in results.into_iter().enumerate() {
603            let key = format!(
604                "{label}|{}|{}|{}|{}",
605                r.file_path, r.symbol_name, r.start_line, r.end_line
606            );
607            let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
608            acc.entry(key)
609                .and_modify(|(_, s)| *s += rrf)
610                .or_insert((r, rrf));
611        }
612    }
613
614    let mut out: Vec<HybridResult> = acc
615        .into_values()
616        .map(|(mut r, s)| {
617            r.rrf_score = s;
618            r
619        })
620        .collect();
621    out.sort_by(|a, b| {
622        b.rrf_score
623            .partial_cmp(&a.rrf_score)
624            .unwrap_or(std::cmp::Ordering::Equal)
625            .then_with(|| a.file_path.cmp(&b.file_path))
626            .then_with(|| a.symbol_name.cmp(&b.symbol_name))
627            .then_with(|| a.start_line.cmp(&b.start_line))
628            .then_with(|| a.end_line.cmp(&b.end_line))
629    });
630    out.truncate(top_k);
631    out
632}
633
634fn rrf_merge_bm25(
635    lists: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)>,
636    top_k: usize,
637) -> Vec<crate::core::bm25_index::SearchResult> {
638    use std::collections::HashMap;
639
640    let mut acc: HashMap<String, (crate::core::bm25_index::SearchResult, f64)> = HashMap::new();
641    for (label, results) in lists {
642        for (rank, r) in results.into_iter().enumerate() {
643            let key = format!(
644                "{label}|{}|{}|{}|{}",
645                r.file_path, r.symbol_name, r.start_line, r.end_line
646            );
647            let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
648            acc.entry(key)
649                .and_modify(|(_, s)| *s += rrf)
650                .or_insert((r, rrf));
651        }
652    }
653
654    let mut out: Vec<crate::core::bm25_index::SearchResult> = acc
655        .into_values()
656        .map(|(mut r, s)| {
657            r.score = s;
658            r
659        })
660        .collect();
661    out.sort_by(|a, b| {
662        b.score
663            .partial_cmp(&a.score)
664            .unwrap_or(std::cmp::Ordering::Equal)
665            .then_with(|| a.file_path.cmp(&b.file_path))
666            .then_with(|| a.symbol_name.cmp(&b.symbol_name))
667            .then_with(|| a.start_line.cmp(&b.start_line))
668            .then_with(|| a.end_line.cmp(&b.end_line))
669    });
670    out.truncate(top_k);
671    out
672}
673
/// Dense (embedding-based) search for a single project root.
///
/// Returns up to `top_k` results together with the coverage value reported by
/// `ensure_embeddings`. Any failure while loading the engine, preparing the
/// embeddings, selecting the backend or searching is returned as `Err`.
#[cfg(feature = "embeddings")]
fn dense_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    let (engine, mut embedding_store) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embedding_store)?;

    let backend = crate::core::dense_backend::DenseBackendKind::try_from_env()?;

    // The path predicate is only handed to the backend when a filter is set.
    let path_matches = |p: &str| filter.matches(p);
    let predicate: Option<&dyn Fn(&str) -> bool> = if filter.is_active() {
        Some(&path_matches as &dyn Fn(&str) -> bool)
    } else {
        None
    };

    let depth = filtered_candidate_k(top_k, filter.is_active());
    let mut hits = crate::core::dense_backend::dense_results_as_hybrid(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        depth,
        predicate,
    )?;
    hits.truncate(top_k);

    Ok((hits, coverage))
}
708
/// Hybrid search for a single project root, optionally re-ranked with SPLADE
/// boosts when the configured `splade_weight` is positive.
///
/// Returns up to `top_k` results together with the coverage value reported by
/// `ensure_embeddings`. Any failure while loading the engine, preparing the
/// embeddings, selecting the backend or searching is returned as `Err`.
#[cfg(feature = "embeddings")]
fn hybrid_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    let (engine, mut embedding_store) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embedding_store)?;

    let backend = crate::core::dense_backend::DenseBackendKind::try_from_env()?;
    let cfg = HybridConfig::from_config();

    // The path predicate is only handed to the backend when a filter is set.
    let path_matches = |p: &str| filter.matches(p);
    let predicate: Option<&dyn Fn(&str) -> bool> = if filter.is_active() {
        Some(&path_matches as &dyn Fn(&str) -> bool)
    } else {
        None
    };

    let depth = filtered_candidate_k(top_k, filter.is_active());
    let graph_ranks = graph_rrf_ranks_for_search_root(root);
    let mut hits = crate::core::dense_backend::hybrid_results(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        depth,
        &cfg,
        predicate,
        graph_ranks.as_ref(),
    )?;

    if cfg.splade_weight > 0.0 {
        let expansions = crate::core::splade_retrieval::hybrid_retrieve(query, index, depth);
        if !expansions.is_empty() {
            boost_with_splade(&mut hits, &expansions, cfg.splade_weight);
        }
    }

    hits.truncate(top_k);
    Ok((hits, coverage))
}
754
755/// Boost existing hybrid results with SPLADE expansion scores.
756fn boost_with_splade(
757    results: &mut [HybridResult],
758    splade: &[crate::core::splade_retrieval::SpladeResult],
759    weight: f64,
760) {
761    use std::collections::HashMap;
762    let rrf_k = 60.0_f64;
763
764    let boosts: HashMap<&str, f64> = splade
765        .iter()
766        .enumerate()
767        .map(|(rank, sr)| (sr.file_path.as_str(), weight / (rrf_k + rank as f64 + 1.0)))
768        .collect();
769
770    for r in results.iter_mut() {
771        if let Some(&boost) = boosts.get(r.file_path.as_str()) {
772            r.rrf_score += boost;
773        }
774    }
775
776    results.sort_by(|a, b| {
777        b.rrf_score
778            .partial_cmp(&a.rrf_score)
779            .unwrap_or(std::cmp::Ordering::Equal)
780    });
781}
782
/// Short display label for a project root: its final path component when that
/// is non-empty valid UTF-8, otherwise the whole path rendered lossily.
fn label_for_root(root: &Path) -> String {
    match root.file_name().and_then(|name| name.to_str()) {
        Some(name) if !name.is_empty() => name.to_owned(),
        _ => root.to_string_lossy().into_owned(),
    }
}
790
791fn graph_rrf_ranks_for_search_root(
792    root: &Path,
793) -> Option<std::collections::HashMap<String, usize>> {
794    let root_s = root.to_string_lossy().to_string();
795    let session = crate::core::session::SessionState::load_latest_for_project_root(&root_s)?;
796
797    if session.files_touched.is_empty() {
798        return None;
799    }
800
801    let recent: Vec<String> = session
802        .files_touched
803        .iter()
804        .rev()
805        .filter(|f| path_under_search_root(&f.path, root))
806        .take(12)
807        .map(|f| f.path.clone())
808        .collect();
809
810    if recent.is_empty() {
811        return None;
812    }
813
814    crate::core::graph_context::graph_neighbor_ranks_for_recent_files(&root_s, &recent, 40, 120)
815}
816
817fn path_under_search_root(path: &str, root: &Path) -> bool {
818    let p = std::path::Path::new(path);
819    if p.is_absolute() {
820        let root_norm = crate::core::pathutil::safe_canonicalize_or_self(root);
821        let path_norm = crate::core::pathutil::safe_canonicalize_or_self(p);
822        path_norm.starts_with(&root_norm)
823    } else {
824        true
825    }
826}
827
828fn hybrid_search_mode(
829    query: &str,
830    root: &Path,
831    index: &BM25Index,
832    top_k: usize,
833    compact: bool,
834    filter: &SearchFilter,
835) -> String {
836    #[cfg(feature = "embeddings")]
837    {
838        let (engine, mut embed_idx) = match load_engine_and_index(root) {
839            Ok(v) => v,
840            Err(e) => return format!("ERR: {e}"),
841        };
842
843        let (aligned, coverage, changed_files) =
844            match ensure_embeddings(root, index, engine, &mut embed_idx) {
845                Ok(v) => v,
846                Err(e) => return format!("ERR: {e}"),
847            };
848
849        let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
850            Ok(v) => v,
851            Err(e) => return format!("ERR: {e}"),
852        };
853
854        let cfg = HybridConfig::from_config();
855        let filter_fn = |p: &str| filter.matches(p);
856        let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
857            .is_active()
858            .then_some(&filter_fn as &dyn Fn(&str) -> bool);
859        let graph_ranks = graph_rrf_ranks_for_search_root(root);
860        let graph_ranks_ref = graph_ranks.as_ref();
861        let mut results = match crate::core::dense_backend::hybrid_results(
862            backend,
863            root,
864            index,
865            engine,
866            &aligned,
867            &changed_files,
868            query,
869            top_k,
870            &cfg,
871            filter_pred,
872            graph_ranks_ref,
873        ) {
874            Ok(v) => v,
875            Err(e) => return format!("ERR: {e}"),
876        };
877
878        if cfg.splade_weight > 0.0 {
879            let splade = crate::core::splade_retrieval::hybrid_retrieve(query, index, top_k);
880            if !splade.is_empty() {
881                boost_with_splade(&mut results, &splade, cfg.splade_weight);
882            }
883        }
884
885        results.truncate(top_k);
886
887        let header = if compact {
888            format!(
889                "semantic_search(hybrid,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
890                results.len(),
891                index.doc_count,
892                coverage * 100.0
893            )
894        } else {
895            format!(
896                "Semantic search (Hybrid): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
897                truncate_query(query, 60),
898                results.len(),
899                index.doc_count,
900                coverage * 100.0
901            )
902        };
903
904        format!("{header}{}", format_hybrid_results(&results, compact))
905    }
906    #[cfg(not(feature = "embeddings"))]
907    {
908        let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
909        if filter.is_active() {
910            results.retain(|x| filter.matches(&x.file_path));
911        }
912
913        if let Some(graph_ranks) = graph_rrf_ranks_for_search_root(root) {
914            const GRAPH_RRF_K: f64 = 60.0;
915            for r in &mut results {
916                if let Some(&rank) = graph_ranks.get(&r.file_path) {
917                    r.score += 1.0 / (GRAPH_RRF_K + rank as f64 + 1.0);
918                }
919            }
920            results.sort_by(|a, b| {
921                b.score
922                    .partial_cmp(&a.score)
923                    .unwrap_or(std::cmp::Ordering::Equal)
924            });
925        }
926
927        results.truncate(top_k);
928        let graph_tag = if graph_rrf_ranks_for_search_root(root).is_some() {
929            "+graph"
930        } else {
931            ""
932        };
933        let header = if compact {
934            format!(
935                "semantic_search(bm25{graph_tag},{top_k}) → {} results, {} chunks indexed\n",
936                results.len(),
937                index.doc_count
938            )
939        } else {
940            format!(
941                "Semantic search (BM25{graph_tag}): \"{}\" ({} results from {} indexed chunks)\n",
942                truncate_query(query, 60),
943                results.len(),
944                index.doc_count,
945            )
946        };
947        format!("{header}{}", format_search_results(&results, compact))
948    }
949}
950
951fn dense_search_mode(
952    query: &str,
953    root: &Path,
954    index: &BM25Index,
955    top_k: usize,
956    compact: bool,
957    filter: &SearchFilter,
958) -> String {
959    #[cfg(feature = "embeddings")]
960    {
961        let (engine, mut embed_idx) = match load_engine_and_index(root) {
962            Ok(v) => v,
963            Err(e) => return format!("ERR: {e}"),
964        };
965
966        let (aligned, coverage, changed_files) =
967            match ensure_embeddings(root, index, engine, &mut embed_idx) {
968                Ok(v) => v,
969                Err(e) => return format!("ERR: {e}"),
970            };
971
972        let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
973            Ok(v) => v,
974            Err(e) => return format!("ERR: {e}"),
975        };
976
977        let filter_fn = |p: &str| filter.matches(p);
978        let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
979            .is_active()
980            .then_some(&filter_fn as &dyn Fn(&str) -> bool);
981
982        let candidate_k = filtered_candidate_k(top_k, filter.is_active());
983        let mut results = match crate::core::dense_backend::dense_results_as_hybrid(
984            backend,
985            root,
986            index,
987            engine,
988            &aligned,
989            &changed_files,
990            query,
991            candidate_k,
992            filter_pred,
993        ) {
994            Ok(v) => v,
995            Err(e) => return format!("ERR: {e}"),
996        };
997        results.truncate(top_k);
998
999        let header = if compact {
1000            format!(
1001                "semantic_search(dense,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
1002                results.len(),
1003                index.doc_count,
1004                coverage * 100.0
1005            )
1006        } else {
1007            format!(
1008                "Semantic search (Dense): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
1009                truncate_query(query, 60),
1010                results.len(),
1011                index.doc_count,
1012                coverage * 100.0
1013            )
1014        };
1015
1016        format!("{header}{}", format_hybrid_results(&results, compact))
1017    }
1018    #[cfg(not(feature = "embeddings"))]
1019    {
1020        "ERR: embeddings feature not enabled".to_string()
1021    }
1022}
1023
/// Loads the shared embedding engine together with the embedding index stored
/// for `root`.
///
/// # Errors
/// * the effective memory profile has embeddings disabled;
/// * the shared embedding engine is unavailable.
///
/// A missing on-disk index, or one whose dimensionality differs from the
/// engine's, is replaced by a fresh empty index of the engine's dimensionality.
#[cfg(feature = "embeddings")]
fn load_engine_and_index(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    let config = crate::core::config::Config::load();
    if !crate::core::config::MemoryProfile::effective(&config).embeddings_enabled() {
        return Err("embeddings disabled by memory_profile=low".into());
    }

    let engine = match crate::core::embeddings::shared_engine() {
        Some(engine) => engine,
        None => return Err("embedding engine load failed".to_string()),
    };

    let dims = engine.dimensions();
    // Reuse the stored index only when it was built with the same vector size.
    let index = match EmbeddingIndex::load(root) {
        Some(existing) if existing.dimensions == dims => existing,
        _ => EmbeddingIndex::new(dims),
    };

    Ok((engine, index))
}
1044
/// Brings `embed_idx` up to date with the chunks of `index` and returns the
/// embeddings aligned to `index.chunks`.
///
/// Steps:
/// 1. Re-embed the chunks of every file `embed_idx` reports as needing an
///    update, then persist the index under `root`.
/// 2. If aligned embeddings can be produced, return them.
/// 3. Otherwise re-embed every chunk once, persist, and try again.
///
/// # Returns
/// `(aligned, coverage, files)` where `files` is the sorted, de-duplicated list
/// of files that were passed to the update: the changed files on the
/// incremental path, or all indexed files after a full rebuild.
///
/// # Errors
/// Returns a message when embedding a chunk fails, when saving the index
/// fails, or when alignment is still unavailable after the full rebuild.
#[cfg(feature = "embeddings")]
fn ensure_embeddings(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    let mut changed_files = embed_idx.files_needing_update(&index.chunks);
    changed_files.sort();
    changed_files.dedup();

    if !changed_files.is_empty() {
        // Incremental path: embed only chunks that belong to a changed file.
        let fresh: Vec<(usize, Vec<f32>)> = {
            let changed: std::collections::HashSet<&str> =
                changed_files.iter().map(String::as_str).collect();
            index
                .chunks
                .iter()
                .enumerate()
                .filter(|(_, chunk)| changed.contains(chunk.file_path.as_str()))
                .map(|(pos, chunk)| {
                    engine
                        .embed(&chunk.content)
                        .map(|vector| (pos, vector))
                        .map_err(|e| format!("embed failed for {}: {e}", chunk.file_path))
                })
                .collect::<Result<_, String>>()?
        };
        embed_idx.update(&index.chunks, &fresh, &changed_files);
        embed_idx
            .save(root)
            .map_err(|e| format!("save embeddings failed: {e}"))?;
    }

    if let Some(aligned) = embed_idx.get_aligned_embeddings(&index.chunks) {
        let coverage = embed_idx.coverage(index.chunks.len());
        return Ok((aligned, coverage, changed_files));
    }

    // Alignment is unavailable: rebuild the embeddings for every chunk once.
    let mut all_files: Vec<String> = index
        .chunks
        .iter()
        .map(|chunk| chunk.file_path.clone())
        .collect();
    all_files.sort();
    all_files.dedup();

    let rebuilt: Vec<(usize, Vec<f32>)> = index
        .chunks
        .iter()
        .enumerate()
        .map(|(pos, chunk)| {
            engine
                .embed(&chunk.content)
                .map(|vector| (pos, vector))
                .map_err(|e| format!("embed failed for {}: {e}", chunk.file_path))
        })
        .collect::<Result<_, String>>()?;

    embed_idx.update(&index.chunks, &rebuilt, &all_files);
    embed_idx
        .save(root)
        .map_err(|e| format!("save embeddings failed: {e}"))?;

    match embed_idx.get_aligned_embeddings(&index.chunks) {
        Some(aligned) => {
            let coverage = embed_idx.coverage(index.chunks.len());
            Ok((aligned, coverage, all_files))
        }
        None => Err("embedding alignment failed after full rebuild".to_string()),
    }
}
1106
1107struct SearchFilter {
1108    allowed_exts: Option<HashSet<String>>,
1109    path_glob: Option<glob::Pattern>,
1110}
1111
1112impl SearchFilter {
1113    fn new(languages: Option<&[String]>, path_glob: Option<&str>) -> Result<Self, String> {
1114        let allowed_exts = languages.map(normalize_languages);
1115        let path_glob = match path_glob {
1116            None => None,
1117            Some(s) if s.trim().is_empty() => None,
1118            Some(s) => Some(glob::Pattern::new(s).map_err(|e| e.msg.to_string())?),
1119        };
1120        Ok(Self {
1121            allowed_exts,
1122            path_glob,
1123        })
1124    }
1125
1126    fn is_active(&self) -> bool {
1127        self.allowed_exts.is_some() || self.path_glob.is_some()
1128    }
1129
1130    fn matches(&self, rel_path: &str) -> bool {
1131        let rel_path = rel_path.replace('\\', "/");
1132        if let Some(p) = &self.path_glob {
1133            if !p.matches(&rel_path) {
1134                return false;
1135            }
1136        }
1137        if let Some(exts) = &self.allowed_exts {
1138            let ext = Path::new(&rel_path)
1139                .extension()
1140                .and_then(|e| e.to_str())
1141                .unwrap_or("")
1142                .to_lowercase();
1143            if ext.is_empty() || !exts.contains(&ext) {
1144                return false;
1145            }
1146        }
1147        true
1148    }
1149}
1150
/// Translates user-supplied language names or extensions into the set of
/// lower-case file extensions (without the dot) they stand for.
///
/// Each entry is trimmed, stripped of leading dots and lower-cased. Known
/// names and aliases expand to every extension of that language (for example
/// `typescript` → `ts`, `tsx`; `golang` → `go`; `c#` → `cs`). Any other
/// non-empty entry is kept verbatim as an extension, and blank entries are
/// ignored.
fn normalize_languages(langs: &[String]) -> HashSet<String> {
    let mut out = HashSet::new();
    for lang in langs {
        let raw = lang.trim().trim_start_matches('.').to_lowercase();
        let exts: &[&str] = match raw.as_str() {
            "" => continue,
            "rust" | "rs" => &["rs"],
            "ts" | "typescript" => &["ts", "tsx"],
            "js" | "javascript" => &["js", "jsx", "mjs", "cjs"],
            "py" | "python" => &["py"],
            // `golang` is a common spelling that is not itself an extension.
            "go" | "golang" => &["go"],
            "java" => &["java"],
            "ruby" | "rb" => &["rb"],
            "php" => &["php"],
            "c" => &["c", "h"],
            "cpp" | "c++" | "cc" | "cxx" => &["cpp", "hpp", "cc", "hh", "cxx", "hxx"],
            "cs" | "csharp" | "c#" => &["cs"],
            "swift" => &["swift"],
            "kt" | "kotlin" => &["kt", "kts"],
            "json" => &["json"],
            "yaml" | "yml" => &["yaml", "yml"],
            // Unknown names are assumed to already be a file extension.
            other => {
                out.insert(other.to_string());
                continue;
            }
        };
        out.extend(exts.iter().map(|ext| (*ext).to_string()));
    }
    out
}
1219
/// Public wrapper for the eval harness: loads the embedding engine and the
/// embedding index for `root`.
///
/// Delegates directly to the private `load_engine_and_index`; arguments,
/// return value and errors are passed through unchanged.
#[cfg(feature = "embeddings")]
pub fn load_engine_and_index_pub(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    load_engine_and_index(root)
}
1227
/// Public wrapper for the eval harness: prepares the embeddings for a project.
///
/// Delegates directly to the private `ensure_embeddings`, so `embed_idx` may
/// be updated and saved under `root` as a side effect. The returned tuple and
/// any error message are passed through unchanged.
#[cfg(feature = "embeddings")]
pub fn ensure_embeddings_for_eval(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    ensure_embeddings(root, index, engine, embed_idx)
}
1238
/// Public wrapper for the eval harness: applies SPLADE boosting to `results`.
///
/// Delegates directly to the private `boost_with_splade`, forwarding the
/// results slice (modified in place), the SPLADE hits and `weight` unchanged.
pub fn boost_with_splade_pub(
    results: &mut [HybridResult],
    splade: &[crate::core::splade_retrieval::SpladeResult],
    weight: f64,
) {
    boost_with_splade(results, splade, weight);
}
1247
#[cfg(test)]
mod filter_tests {
    use super::*;

    /// A language name restricts hits to that language's extensions.
    #[test]
    fn filter_language_rust() {
        let langs = ["rust".to_string()];
        let rust_only = SearchFilter::new(Some(&langs[..]), None).unwrap();

        assert!(rust_only.matches("src/main.rs"));
        assert!(!rust_only.matches("src/main.ts"));
    }

    /// A path glob restricts hits to paths matching the pattern.
    #[test]
    fn filter_path_glob() {
        let under_rust_src = SearchFilter::new(None, Some("rust/src/**")).unwrap();

        assert!(under_rust_src.matches("rust/src/core/mod.rs"));
        assert!(!under_rust_src.matches("website/src/pages/index.astro"));
    }
}
1266
#[cfg(test)]
mod determinism_tests {
    use super::*;

    /// Equal fused scores must still come out in a stable order.
    #[test]
    fn rrf_merge_hybrid_is_deterministic_on_ties() {
        // Builds a hit that differs from its sibling only in path and snippet.
        let hit = |file: &str, snippet: &str| HybridResult {
            file_path: file.to_string(),
            symbol_name: "foo".to_string(),
            kind: crate::core::bm25_index::ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            snippet: snippet.to_string(),
            rrf_score: 0.0,
            bm25_score: None,
            dense_score: None,
            bm25_rank: None,
            dense_rank: None,
        };
        let a = hit("a.rs", "a");
        let b = hit("b.rs", "b");

        // Two lists with swapped ranks yield identical RRF sums for a and b.
        let forward = vec![a.clone(), b.clone()];
        let reversed = vec![b, a];
        let fused = rrf_merge_hybrid(
            vec![
                ("root".to_string(), forward),
                ("root".to_string(), reversed),
            ],
            10,
        );

        let order: Vec<&str> = fused.iter().map(|r| r.file_path.as_str()).collect();
        assert_eq!(fused.len(), 2);
        assert_eq!(order[0], "a.rs");
        assert_eq!(order[1], "b.rs");
    }
}
1313}