Skip to main content

lean_ctx/tools/
ctx_semantic_search.rs

1use std::collections::HashSet;
2use std::path::Path;
3
4use crate::core::bm25_index::{format_search_results, BM25Index};
5use crate::core::embedding_index::EmbeddingIndex;
6#[cfg(feature = "embeddings")]
7use crate::core::embeddings::EmbeddingEngine;
8use crate::core::hybrid_search::{format_hybrid_results, HybridConfig, HybridResult};
9use crate::tools::CrpMode;
10
11/// Performs semantic code search using BM25, dense embeddings, or hybrid ranking.
12#[allow(clippy::too_many_arguments)]
13pub fn handle(
14    query: &str,
15    path: &str,
16    top_k: usize,
17    crp_mode: CrpMode,
18    languages: Option<&[String]>,
19    path_glob: Option<&str>,
20    mode: Option<&str>,
21    workspace: Option<bool>,
22    artifacts: Option<bool>,
23) -> String {
24    let root = Path::new(path);
25    if !root.exists() {
26        return format!("ERR: path does not exist: {path}");
27    }
28
29    let root = if root.is_file() {
30        root.parent().unwrap_or(root)
31    } else {
32        root
33    };
34
35    let filter = match SearchFilter::new(languages, path_glob) {
36        Ok(f) => f,
37        Err(e) => return format!("ERR: invalid filter: {e}"),
38    };
39
40    let compact = crp_mode.is_tdd();
41    let mode = mode.unwrap_or("hybrid").to_lowercase();
42    let workspace = workspace.unwrap_or(false);
43    let artifacts = artifacts.unwrap_or(false);
44
45    if artifacts {
46        return artifacts_search(query, root, top_k, compact, &filter, workspace);
47    }
48    if workspace {
49        return workspace_search(query, root, top_k, compact, &filter, &mode);
50    }
51
52    let index = match load_or_refresh_bm25(root) {
53        Bm25LoadResult::Ready(idx) => idx,
54        Bm25LoadResult::Building => {
55            return "BM25 index is being built in the background. \
56                    Run ctx_semantic_search again in ~30s, or use action=reindex to wait for completion."
57                .to_string();
58        }
59    };
60    if index.doc_count == 0 {
61        return "No code files found to index.".to_string();
62    }
63
64    match mode.as_str() {
65        "bm25" => {
66            let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
67            if filter.is_active() {
68                results.retain(|x| filter.matches(&x.file_path));
69            }
70            results.truncate(top_k);
71
72            let header = if compact {
73                format!(
74                    "semantic_search(bm25,{top_k}) → {} results, {} chunks indexed\n",
75                    results.len(),
76                    index.doc_count
77                )
78            } else {
79                format!(
80                    "Semantic search (BM25): \"{}\" ({} results from {} indexed chunks)\n",
81                    truncate_query(query, 60),
82                    results.len(),
83                    index.doc_count,
84                )
85            };
86            format!("{header}{}", format_search_results(&results, compact))
87        }
88        "dense" => dense_search_mode(query, root, &index, top_k, compact, &filter),
89        _ => hybrid_search_mode(query, root, &index, top_k, compact, &filter),
90    }
91}
92
93/// Rebuilds the BM25 search index for the given directory from scratch.
94pub fn handle_reindex(path: &str) -> String {
95    let root = Path::new(path);
96    if !root.exists() {
97        return format!("ERR: path does not exist: {path}");
98    }
99    let root = if root.is_file() {
100        root.parent().unwrap_or(root)
101    } else {
102        root
103    };
104
105    let idx = BM25Index::build_from_directory(root);
106    let files = idx.files.len();
107    let chunks = idx.doc_count;
108    let _ = idx.save(root);
109
110    format!("Reindexed {path}: {files} files, {chunks} chunks")
111}
112
113pub fn handle_reindex_artifacts(path: &str, workspace: bool) -> String {
114    let root = Path::new(path);
115    if !root.exists() {
116        return format!("ERR: path does not exist: {path}");
117    }
118    let root = if root.is_file() {
119        root.parent().unwrap_or(root)
120    } else {
121        root
122    };
123
124    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
125    let mut warnings: Vec<String> = Vec::new();
126
127    if workspace {
128        let linked = crate::core::workspace_config::load_linked_projects(root);
129        warnings.extend(linked.warnings);
130        roots.extend(linked.roots);
131    }
132
133    let mut total_files = 0usize;
134    let mut total_chunks = 0usize;
135    for r in roots {
136        let (idx, w) = crate::core::artifact_index::rebuild_from_scratch(&r);
137        warnings.extend(w);
138        total_files += idx.files.len();
139        total_chunks += idx.doc_count;
140    }
141
142    if warnings.is_empty() {
143        format!("Reindexed artifacts: {total_files} files, {total_chunks} chunks")
144    } else {
145        format!(
146            "Reindexed artifacts: {total_files} files, {total_chunks} chunks ({} warning(s))",
147            warnings.len()
148        )
149    }
150}
151
152/// Find chunks semantically related to a given file location.
153///
154/// Marchionini (2006): Exploratory search navigates from known points.
155/// This enables "show me similar code" workflows.
156pub fn handle_find_related(
157    file_path: &str,
158    line: usize,
159    project_root: &str,
160    top_k: usize,
161    crp_mode: CrpMode,
162) -> String {
163    let root = Path::new(project_root);
164    if !root.exists() {
165        return format!("ERR: path does not exist: {project_root}");
166    }
167
168    let index = BM25Index::load_or_build(root);
169    if index.doc_count == 0 {
170        return "ERR: empty index. Try action=reindex first.".to_string();
171    }
172
173    let source_chunk = index
174        .chunks
175        .iter()
176        .find(|c| c.file_path == file_path && c.start_line <= line && c.end_line >= line);
177
178    let Some(source_chunk) = source_chunk else {
179        return format!(
180            "ERR: no indexed chunk found at {file_path}:{line}. Try action=reindex first."
181        );
182    };
183
184    let query_text = source_chunk.content.clone();
185    let source_file = source_chunk.file_path.clone();
186    let source_start = source_chunk.start_line;
187
188    let compact = crp_mode != CrpMode::Off;
189
190    let results = find_related_internal(&query_text, root, &index, top_k + 5, compact);
191
192    let mut lines: Vec<String> = results
193        .into_iter()
194        .filter(|l| !l.contains(&format!("{source_file}:{source_start}-")))
195        .take(top_k)
196        .collect();
197
198    let header = if compact {
199        format!(
200            "find_related({file_path}:{line}) → {} results\n",
201            lines.len()
202        )
203    } else {
204        format!("Find related to {file_path}:{line} (semantic similarity)\n")
205    };
206
207    lines.insert(0, header);
208    lines.join("")
209}
210
211fn find_related_internal(
212    query: &str,
213    root: &Path,
214    index: &BM25Index,
215    top_k: usize,
216    compact: bool,
217) -> Vec<String> {
218    let Ok(filter) = SearchFilter::new(None, None) else {
219        return vec!["ERR: filter init failed\n".to_string()];
220    };
221    let output = hybrid_search_mode(query, root, index, top_k, compact, &filter);
222    output.lines().map(|l| format!("{l}\n")).collect()
223}
224
/// Returns at most the first `max` characters of `q`, always cutting on a
/// character boundary.
fn truncate_query(q: &str, max: usize) -> &str {
    // A string of at most `max` bytes cannot hold more than `max` characters.
    if q.len() > max {
        // Byte offset of the character with index `max`, i.e. the first one
        // that has to go; absent when `q` has no more than `max` characters.
        if let Some((cut, _)) = q.char_indices().nth(max) {
            return &q[..cut];
        }
    }
    q
}
234
std::thread_local! {
    /// Per-thread slot for the shared BM25 cache handle.
    ///
    /// Starts out as `None`; `set_thread_cache` stores a handle in it for the
    /// calling thread.
    static BM25_SHARED_CACHE: std::cell::RefCell<Option<crate::core::bm25_cache::SharedBm25Cache>> =
        const { std::cell::RefCell::new(None) };
}
239
240/// Set the shared BM25 cache for the current thread (called from the registered handler).
241pub fn set_thread_cache(cache: crate::core::bm25_cache::SharedBm25Cache) {
242    BM25_SHARED_CACHE.with(|c| {
243        *c.borrow_mut() = Some(cache);
244    });
245}
246
/// Result of BM25 index loading — may indicate background build in progress.
pub(crate) enum Bm25LoadResult {
    /// An index is available and can be searched.
    Ready(std::sync::Arc<BM25Index>),
    /// No index could be loaded and the index orchestrator reports a build in
    /// progress; the caller should ask again later.
    Building,
}
252
253fn load_or_refresh_bm25(root: &Path) -> Bm25LoadResult {
254    let cached = BM25_SHARED_CACHE.with(|c| {
255        let borrow = c.borrow();
256        borrow
257            .as_ref()
258            .and_then(|cache| crate::core::bm25_cache::get_or_background(cache, root))
259    });
260    if let Some(idx) = cached {
261        return Bm25LoadResult::Ready(idx);
262    }
263
264    let root_str = root.to_string_lossy().to_string();
265
266    if let Some(idx) = crate::core::index_orchestrator::try_load_bm25_index(&root_str) {
267        let idx = std::sync::Arc::new(idx);
268        store_in_thread_cache(root, &idx);
269        return Bm25LoadResult::Ready(idx);
270    }
271
272    if crate::core::index_orchestrator::is_building() {
273        return Bm25LoadResult::Building;
274    }
275
276    crate::core::index_orchestrator::ensure_all_background(&root_str);
277
278    let idx = std::sync::Arc::new(BM25Index::load_or_build(root));
279    store_in_thread_cache(root, &idx);
280    Bm25LoadResult::Ready(idx)
281}
282
283fn store_in_thread_cache(root: &Path, idx: &std::sync::Arc<BM25Index>) {
284    BM25_SHARED_CACHE.with(|c| {
285        let borrow = c.borrow();
286        if let Some(cache) = borrow.as_ref() {
287            let mut guard = cache
288                .lock()
289                .unwrap_or_else(std::sync::PoisonError::into_inner);
290            *guard = Some(crate::core::bm25_cache::Bm25CacheEntry {
291                root: root.to_path_buf(),
292                index: std::sync::Arc::clone(idx),
293                loaded_at: std::time::Instant::now(),
294            });
295        }
296    });
297}
298
/// Number of candidates to request from a ranker before post-filtering.
///
/// Without a filter this is just `top_k`. With a filter, hits are discarded
/// after ranking, so the pool is widened to ten times `top_k` (treating
/// `top_k` as at least 10) and capped at 500. The pool is never smaller than
/// `top_k` itself, so a filtered search with a large `top_k` is not limited to
/// fewer candidates than an unfiltered one.
fn filtered_candidate_k(top_k: usize, filtered: bool) -> usize {
    /// Upper bound on the widened pool.
    const MAX_WIDENED: usize = 500;

    if !filtered {
        return top_k;
    }
    let widened = top_k.max(10).saturating_mul(10).min(MAX_WIDENED);
    widened.max(top_k)
}
306
/// Constant `k` in the reciprocal-rank-fusion term `1 / (k + rank + 1)` used
/// when per-project result lists are merged (`rank` is zero-based).
const WORKSPACE_RRF_K: f64 = 60.0;
308
309fn artifacts_search(
310    query: &str,
311    root: &Path,
312    top_k: usize,
313    compact: bool,
314    filter: &SearchFilter,
315    workspace: bool,
316) -> String {
317    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
318    let mut warnings: Vec<String> = Vec::new();
319
320    if workspace {
321        let linked = crate::core::workspace_config::load_linked_projects(root);
322        warnings.extend(linked.warnings);
323        roots.extend(linked.roots);
324    }
325    roots.sort();
326    roots.dedup();
327
328    let mut per_project: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)> = Vec::new();
329    let mut total_chunks = 0usize;
330
331    for r in &roots {
332        let label = label_for_root(r);
333        let (idx, w) = crate::core::artifact_index::load_or_build(r);
334        warnings.extend(w);
335        total_chunks += idx.doc_count;
336        if idx.doc_count == 0 {
337            continue;
338        }
339
340        let mut results = idx.search(query, filtered_candidate_k(top_k, filter.is_active()));
341        if filter.is_active() {
342            results.retain(|x| filter.matches(&x.file_path));
343        }
344        results.truncate(top_k);
345
346        for res in &mut results {
347            res.file_path = if workspace {
348                format!("[project:{label}] [artifact] {}", res.file_path)
349            } else {
350                format!("[artifact] {}", res.file_path)
351            };
352        }
353
354        per_project.push((label, results));
355    }
356
357    let mut fused: Vec<crate::core::bm25_index::SearchResult> = if per_project.len() <= 1 {
358        per_project
359            .into_iter()
360            .next()
361            .map(|(_, v)| v)
362            .unwrap_or_default()
363    } else {
364        rrf_merge_bm25(per_project, top_k)
365    };
366
367    if fused.is_empty() {
368        return "No artifact files found to index.".to_string();
369    }
370
371    fused.truncate(top_k);
372
373    let header = if compact {
374        if workspace {
375            format!(
376                "semantic_search(artifacts,workspace,{top_k}) → {} results, projects={}, {} chunks indexed\n",
377                fused.len(),
378                roots.len(),
379                total_chunks
380            )
381        } else {
382            format!(
383                "semantic_search(artifacts,{top_k}) → {} results, {} chunks indexed\n",
384                fused.len(),
385                total_chunks
386            )
387        }
388    } else if workspace {
389        format!(
390            "Semantic search (Artifacts/Workspace): \"{}\" ({} results from {} projects)\n",
391            truncate_query(query, 60),
392            fused.len(),
393            roots.len()
394        )
395    } else {
396        format!(
397            "Semantic search (Artifacts): \"{}\" ({} results)\n",
398            truncate_query(query, 60),
399            fused.len()
400        )
401    };
402
403    let mut out = format!("{header}{}", format_search_results(&fused, compact));
404    if !warnings.is_empty() && !compact {
405        out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
406        for w in warnings.iter().take(20) {
407            out.push_str(&format!("- {w}\n"));
408        }
409    }
410    out
411}
412
413fn workspace_search(
414    query: &str,
415    root: &Path,
416    top_k: usize,
417    compact: bool,
418    filter: &SearchFilter,
419    mode: &str,
420) -> String {
421    let linked = crate::core::workspace_config::load_linked_projects(root);
422    let mut warnings = linked.warnings;
423
424    let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
425    roots.extend(linked.roots);
426    roots.sort();
427    roots.dedup();
428
429    let mut per_project: Vec<(String, Vec<HybridResult>)> = Vec::new();
430    let mut avg_cov: Option<f64> = None;
431    let mut cov_count = 0usize;
432
433    for r in &roots {
434        let label = label_for_root(r);
435        let index = BM25Index::load_or_build(r);
436        if index.doc_count == 0 {
437            continue;
438        }
439
440        let mut results: Vec<HybridResult> = match mode {
441            "bm25" => {
442                let mut bm25 = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
443                if filter.is_active() {
444                    bm25.retain(|x| filter.matches(&x.file_path));
445                }
446                bm25.truncate(top_k);
447                bm25.into_iter()
448                    .map(HybridResult::from_bm25_public)
449                    .collect()
450            }
451            "dense" => {
452                #[cfg(feature = "embeddings")]
453                {
454                    match dense_results_for_root(query, r, &index, top_k, filter) {
455                        Ok((v, cov)) => {
456                            avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
457                            cov_count += 1;
458                            v
459                        }
460                        Err(e) => {
461                            warnings.push(format!("[{label}] dense search failed: {e}"));
462                            let mut bm25 = index
463                                .search(query, filtered_candidate_k(top_k, filter.is_active()));
464                            if filter.is_active() {
465                                bm25.retain(|x| filter.matches(&x.file_path));
466                            }
467                            bm25.truncate(top_k);
468                            bm25.into_iter()
469                                .map(HybridResult::from_bm25_public)
470                                .collect()
471                        }
472                    }
473                }
474                #[cfg(not(feature = "embeddings"))]
475                {
476                    let _ = (&label, &warnings);
477                    let mut bm25 =
478                        index.search(query, filtered_candidate_k(top_k, filter.is_active()));
479                    if filter.is_active() {
480                        bm25.retain(|x| filter.matches(&x.file_path));
481                    }
482                    bm25.truncate(top_k);
483                    bm25.into_iter()
484                        .map(HybridResult::from_bm25_public)
485                        .collect()
486                }
487            }
488            _ => {
489                #[cfg(feature = "embeddings")]
490                {
491                    match hybrid_results_for_root(query, r, &index, top_k, filter) {
492                        Ok((v, cov)) => {
493                            avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
494                            cov_count += 1;
495                            v
496                        }
497                        Err(e) => {
498                            warnings.push(format!("[{label}] hybrid search failed: {e}"));
499                            let mut bm25 = index
500                                .search(query, filtered_candidate_k(top_k, filter.is_active()));
501                            if filter.is_active() {
502                                bm25.retain(|x| filter.matches(&x.file_path));
503                            }
504                            bm25.truncate(top_k);
505                            bm25.into_iter()
506                                .map(HybridResult::from_bm25_public)
507                                .collect()
508                        }
509                    }
510                }
511                #[cfg(not(feature = "embeddings"))]
512                {
513                    let _ = (&label, &warnings);
514                    let mut bm25 =
515                        index.search(query, filtered_candidate_k(top_k, filter.is_active()));
516                    if filter.is_active() {
517                        bm25.retain(|x| filter.matches(&x.file_path));
518                    }
519                    bm25.truncate(top_k);
520                    bm25.into_iter()
521                        .map(HybridResult::from_bm25_public)
522                        .collect()
523                }
524            }
525        };
526
527        for res in &mut results {
528            res.file_path = format!("[project:{label}] {}", res.file_path);
529        }
530        per_project.push((label, results));
531    }
532
533    let mut fused: Vec<HybridResult> = if per_project.len() <= 1 {
534        per_project
535            .into_iter()
536            .next()
537            .map(|(_, v)| v)
538            .unwrap_or_default()
539    } else {
540        rrf_merge_hybrid(per_project, top_k)
541    };
542
543    if fused.is_empty() {
544        return "No code files found to index.".to_string();
545    }
546
547    fused.truncate(top_k);
548    let cov = avg_cov.and_then(|s| {
549        if cov_count == 0 {
550            None
551        } else {
552            Some(s / cov_count as f64)
553        }
554    });
555
556    let header = if compact {
557        match (mode, cov) {
558            (_, Some(c)) => format!(
559                "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}, embed_cov={:.0}%\n",
560                fused.len(),
561                roots.len(),
562                c * 100.0
563            ),
564            _ => format!(
565                "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}\n",
566                fused.len(),
567                roots.len()
568            ),
569        }
570    } else {
571        format!(
572            "Workspace semantic search ({mode}): \"{}\" ({} results from {} projects)\n",
573            truncate_query(query, 60),
574            fused.len(),
575            roots.len()
576        )
577    };
578
579    let mut out = format!("{header}{}", format_hybrid_results(&fused, compact));
580    if !warnings.is_empty() && !compact {
581        out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
582        for w in warnings.iter().take(20) {
583            out.push_str(&format!("- {w}\n"));
584        }
585    }
586    out
587}
588
589fn rrf_merge_hybrid(lists: Vec<(String, Vec<HybridResult>)>, top_k: usize) -> Vec<HybridResult> {
590    use std::collections::HashMap;
591
592    let mut acc: HashMap<String, (HybridResult, f64)> = HashMap::new();
593    for (label, results) in lists {
594        for (rank, r) in results.into_iter().enumerate() {
595            let key = format!(
596                "{label}|{}|{}|{}|{}",
597                r.file_path, r.symbol_name, r.start_line, r.end_line
598            );
599            let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
600            acc.entry(key)
601                .and_modify(|(_, s)| *s += rrf)
602                .or_insert((r, rrf));
603        }
604    }
605
606    let mut out: Vec<HybridResult> = acc
607        .into_values()
608        .map(|(mut r, s)| {
609            r.rrf_score = s;
610            r
611        })
612        .collect();
613    out.sort_by(|a, b| {
614        b.rrf_score
615            .partial_cmp(&a.rrf_score)
616            .unwrap_or(std::cmp::Ordering::Equal)
617            .then_with(|| a.file_path.cmp(&b.file_path))
618            .then_with(|| a.symbol_name.cmp(&b.symbol_name))
619            .then_with(|| a.start_line.cmp(&b.start_line))
620            .then_with(|| a.end_line.cmp(&b.end_line))
621    });
622    out.truncate(top_k);
623    out
624}
625
626fn rrf_merge_bm25(
627    lists: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)>,
628    top_k: usize,
629) -> Vec<crate::core::bm25_index::SearchResult> {
630    use std::collections::HashMap;
631
632    let mut acc: HashMap<String, (crate::core::bm25_index::SearchResult, f64)> = HashMap::new();
633    for (label, results) in lists {
634        for (rank, r) in results.into_iter().enumerate() {
635            let key = format!(
636                "{label}|{}|{}|{}|{}",
637                r.file_path, r.symbol_name, r.start_line, r.end_line
638            );
639            let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
640            acc.entry(key)
641                .and_modify(|(_, s)| *s += rrf)
642                .or_insert((r, rrf));
643        }
644    }
645
646    let mut out: Vec<crate::core::bm25_index::SearchResult> = acc
647        .into_values()
648        .map(|(mut r, s)| {
649            r.score = s;
650            r
651        })
652        .collect();
653    out.sort_by(|a, b| {
654        b.score
655            .partial_cmp(&a.score)
656            .unwrap_or(std::cmp::Ordering::Equal)
657            .then_with(|| a.file_path.cmp(&b.file_path))
658            .then_with(|| a.symbol_name.cmp(&b.symbol_name))
659            .then_with(|| a.start_line.cmp(&b.start_line))
660            .then_with(|| a.end_line.cmp(&b.end_line))
661    });
662    out.truncate(top_k);
663    out
664}
665
/// Dense (embedding) search over one project root.
///
/// Loads the embedding engine and index for `root`, brings the embeddings in
/// line with the BM25 `index`, and queries the dense backend selected from the
/// environment. Returns the best `top_k` hits together with the embedding
/// coverage reported by `ensure_embeddings`; any failing step is returned as
/// the error string.
#[cfg(feature = "embeddings")]
fn dense_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    use crate::core::dense_backend::{dense_results_as_hybrid, DenseBackendKind};

    let (engine, mut embed_idx) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embed_idx)?;
    let backend = DenseBackendKind::try_from_env()?;

    // Hand the backend a path predicate only when a filter is in effect.
    let filtering = filter.is_active();
    let path_matches = |p: &str| filter.matches(p);
    let filter_pred: Option<&dyn Fn(&str) -> bool> = if filtering {
        Some(&path_matches as &dyn Fn(&str) -> bool)
    } else {
        None
    };

    let mut hits = dense_results_as_hybrid(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        filtered_candidate_k(top_k, filtering),
        filter_pred,
    )?;
    hits.truncate(top_k);

    Ok((hits, coverage))
}
700
/// Hybrid search over one project root.
///
/// Loads the embedding engine and index for `root`, brings the embeddings in
/// line with the BM25 `index`, and runs the hybrid ranking of the dense
/// backend selected from the environment, using the default `HybridConfig`
/// and, when available, graph ranks derived from the latest session. Returns
/// the best `top_k` hits together with the embedding coverage reported by
/// `ensure_embeddings`; any failing step is returned as the error string.
#[cfg(feature = "embeddings")]
fn hybrid_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    use crate::core::dense_backend::{hybrid_results, DenseBackendKind};

    let (engine, mut embed_idx) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embed_idx)?;
    let backend = DenseBackendKind::try_from_env()?;
    let cfg = HybridConfig::default();

    // Hand the backend a path predicate only when a filter is in effect.
    let filtering = filter.is_active();
    let path_matches = |p: &str| filter.matches(p);
    let filter_pred: Option<&dyn Fn(&str) -> bool> = if filtering {
        Some(&path_matches as &dyn Fn(&str) -> bool)
    } else {
        None
    };
    let candidate_k = filtered_candidate_k(top_k, filtering);

    let graph_ranks = graph_rrf_ranks_for_search_root(root);

    let mut hits = hybrid_results(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        candidate_k,
        &cfg,
        filter_pred,
        graph_ranks.as_ref(),
    )?;
    hits.truncate(top_k);

    Ok((hits, coverage))
}
738
/// Short display label for a project root: its final path component when that
/// is non-empty valid UTF-8, otherwise the whole path (lossily converted).
fn label_for_root(root: &Path) -> String {
    match root.file_name().and_then(|name| name.to_str()) {
        Some(name) if !name.is_empty() => name.to_owned(),
        _ => root.to_string_lossy().into_owned(),
    }
}
746
747fn graph_rrf_ranks_for_search_root(
748    root: &Path,
749) -> Option<std::collections::HashMap<String, usize>> {
750    let root_s = root.to_string_lossy().to_string();
751    let session = crate::core::session::SessionState::load_latest_for_project_root(&root_s)?;
752
753    if session.files_touched.is_empty() {
754        return None;
755    }
756
757    let recent: Vec<String> = session
758        .files_touched
759        .iter()
760        .rev()
761        .filter(|f| path_under_search_root(&f.path, root))
762        .take(12)
763        .map(|f| f.path.clone())
764        .collect();
765
766    if recent.is_empty() {
767        return None;
768    }
769
770    crate::core::graph_context::graph_neighbor_ranks_for_recent_files(&root_s, &recent, 40, 120)
771}
772
773fn path_under_search_root(path: &str, root: &Path) -> bool {
774    let p = std::path::Path::new(path);
775    if p.is_absolute() {
776        let root_norm = crate::core::pathutil::safe_canonicalize_or_self(root);
777        let path_norm = crate::core::pathutil::safe_canonicalize_or_self(p);
778        path_norm.starts_with(&root_norm)
779    } else {
780        true
781    }
782}
783
784fn hybrid_search_mode(
785    query: &str,
786    root: &Path,
787    index: &BM25Index,
788    top_k: usize,
789    compact: bool,
790    filter: &SearchFilter,
791) -> String {
792    #[cfg(feature = "embeddings")]
793    {
794        let (engine, mut embed_idx) = match load_engine_and_index(root) {
795            Ok(v) => v,
796            Err(e) => return format!("ERR: {e}"),
797        };
798
799        let (aligned, coverage, changed_files) =
800            match ensure_embeddings(root, index, engine, &mut embed_idx) {
801                Ok(v) => v,
802                Err(e) => return format!("ERR: {e}"),
803            };
804
805        let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
806            Ok(v) => v,
807            Err(e) => return format!("ERR: {e}"),
808        };
809
810        let cfg = HybridConfig::default();
811        let filter_fn = |p: &str| filter.matches(p);
812        let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
813            .is_active()
814            .then_some(&filter_fn as &dyn Fn(&str) -> bool);
815        let graph_ranks = graph_rrf_ranks_for_search_root(root);
816        let graph_ranks_ref = graph_ranks.as_ref();
817        let mut results = match crate::core::dense_backend::hybrid_results(
818            backend,
819            root,
820            index,
821            engine,
822            &aligned,
823            &changed_files,
824            query,
825            top_k,
826            &cfg,
827            filter_pred,
828            graph_ranks_ref,
829        ) {
830            Ok(v) => v,
831            Err(e) => return format!("ERR: {e}"),
832        };
833        results.truncate(top_k);
834
835        let header = if compact {
836            format!(
837                "semantic_search(hybrid,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
838                results.len(),
839                index.doc_count,
840                coverage * 100.0
841            )
842        } else {
843            format!(
844                "Semantic search (Hybrid): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
845                truncate_query(query, 60),
846                results.len(),
847                index.doc_count,
848                coverage * 100.0
849            )
850        };
851
852        format!("{header}{}", format_hybrid_results(&results, compact))
853    }
854    #[cfg(not(feature = "embeddings"))]
855    {
856        let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
857        if filter.is_active() {
858            results.retain(|x| filter.matches(&x.file_path));
859        }
860
861        if let Some(graph_ranks) = graph_rrf_ranks_for_search_root(root) {
862            const GRAPH_RRF_K: f64 = 60.0;
863            for r in &mut results {
864                if let Some(&rank) = graph_ranks.get(&r.file_path) {
865                    r.score += 1.0 / (GRAPH_RRF_K + rank as f64 + 1.0);
866                }
867            }
868            results.sort_by(|a, b| {
869                b.score
870                    .partial_cmp(&a.score)
871                    .unwrap_or(std::cmp::Ordering::Equal)
872            });
873        }
874
875        results.truncate(top_k);
876        let graph_tag = if graph_rrf_ranks_for_search_root(root).is_some() {
877            "+graph"
878        } else {
879            ""
880        };
881        let header = if compact {
882            format!(
883                "semantic_search(bm25{graph_tag},{top_k}) → {} results, {} chunks indexed\n",
884                results.len(),
885                index.doc_count
886            )
887        } else {
888            format!(
889                "Semantic search (BM25{graph_tag}): \"{}\" ({} results from {} indexed chunks)\n",
890                truncate_query(query, 60),
891                results.len(),
892                index.doc_count,
893            )
894        };
895        format!("{header}{}", format_search_results(&results, compact))
896    }
897}
898
899fn dense_search_mode(
900    query: &str,
901    root: &Path,
902    index: &BM25Index,
903    top_k: usize,
904    compact: bool,
905    filter: &SearchFilter,
906) -> String {
907    #[cfg(feature = "embeddings")]
908    {
909        let (engine, mut embed_idx) = match load_engine_and_index(root) {
910            Ok(v) => v,
911            Err(e) => return format!("ERR: {e}"),
912        };
913
914        let (aligned, coverage, changed_files) =
915            match ensure_embeddings(root, index, engine, &mut embed_idx) {
916                Ok(v) => v,
917                Err(e) => return format!("ERR: {e}"),
918            };
919
920        let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
921            Ok(v) => v,
922            Err(e) => return format!("ERR: {e}"),
923        };
924
925        let filter_fn = |p: &str| filter.matches(p);
926        let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
927            .is_active()
928            .then_some(&filter_fn as &dyn Fn(&str) -> bool);
929
930        let candidate_k = filtered_candidate_k(top_k, filter.is_active());
931        let mut results = match crate::core::dense_backend::dense_results_as_hybrid(
932            backend,
933            root,
934            index,
935            engine,
936            &aligned,
937            &changed_files,
938            query,
939            candidate_k,
940            filter_pred,
941        ) {
942            Ok(v) => v,
943            Err(e) => return format!("ERR: {e}"),
944        };
945        results.truncate(top_k);
946
947        let header = if compact {
948            format!(
949                "semantic_search(dense,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
950                results.len(),
951                index.doc_count,
952                coverage * 100.0
953            )
954        } else {
955            format!(
956                "Semantic search (Dense): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
957                truncate_query(query, 60),
958                results.len(),
959                index.doc_count,
960                coverage * 100.0
961            )
962        };
963
964        format!("{header}{}", format_hybrid_results(&results, compact))
965    }
966    #[cfg(not(feature = "embeddings"))]
967    {
968        "ERR: embeddings feature not enabled".to_string()
969    }
970}
971
/// Obtains the shared embedding engine together with an embedding index for `root`.
///
/// An index loaded for `root` is reused only when its `dimensions` equal the engine's;
/// if nothing could be loaded, or the dimensions differ, a new index is created via
/// `EmbeddingIndex::new` with the engine's dimensions.
///
/// # Errors
/// Returns a message when the effective memory profile has embeddings disabled, or
/// when `shared_engine()` yields no engine.
#[cfg(feature = "embeddings")]
fn load_engine_and_index(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    let config = crate::core::config::Config::load();
    if !crate::core::config::MemoryProfile::effective(&config).embeddings_enabled() {
        return Err("embeddings disabled by memory_profile=low".into());
    }

    let engine = match crate::core::embeddings::shared_engine() {
        Some(engine) => engine,
        None => return Err("embedding engine load failed".to_string()),
    };

    // A stored index built with a different vector width cannot be reused.
    let dims = engine.dimensions();
    let index = match EmbeddingIndex::load(root) {
        Some(loaded) if loaded.dimensions == dims => loaded,
        _ => EmbeddingIndex::new(dims),
    };

    Ok((engine, index))
}
992
/// Brings `embed_idx` in line with the chunks of `index` and returns the aligned vectors.
///
/// First re-embeds only the chunks belonging to files that `files_needing_update`
/// reports, then saves the index. If `get_aligned_embeddings` still yields nothing,
/// every chunk is re-embedded once and the index is saved again.
///
/// Returns `(aligned embeddings, coverage as reported by the index, files re-embedded)`.
/// On the full-rebuild path the third element lists every file present in `index`.
///
/// # Errors
/// Returns a message when embedding a chunk fails, when saving the index fails, or
/// when alignment is still unavailable after the full rebuild.
#[cfg(feature = "embeddings")]
fn ensure_embeddings(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    let mut stale_files = embed_idx.files_needing_update(&index.chunks);
    stale_files.sort();
    stale_files.dedup();

    if !stale_files.is_empty() {
        let stale: HashSet<&str> = stale_files.iter().map(String::as_str).collect();
        // Stops at the first chunk that fails to embed, before anything is stored.
        let refreshed = index
            .chunks
            .iter()
            .enumerate()
            .filter(|(_, chunk)| stale.contains(chunk.file_path.as_str()))
            .map(|(pos, chunk)| {
                engine
                    .embed(&chunk.content)
                    .map(|vector| (pos, vector))
                    .map_err(|e| format!("embed failed for {}: {e}", chunk.file_path))
            })
            .collect::<Result<Vec<(usize, Vec<f32>)>, String>>()?;

        embed_idx.update(&index.chunks, &refreshed, &stale_files);
        embed_idx
            .save(root)
            .map_err(|e| format!("save embeddings failed: {e}"))?;
    }

    if let Some(aligned) = embed_idx.get_aligned_embeddings(&index.chunks) {
        let coverage = embed_idx.coverage(index.chunks.len());
        return Ok((aligned, coverage, stale_files));
    }

    // Alignment missing: rebuild everything once.
    // The ordered set yields the same sorted, de-duplicated file list as sort + dedup.
    let all_files: Vec<String> = index
        .chunks
        .iter()
        .map(|chunk| chunk.file_path.clone())
        .collect::<std::collections::BTreeSet<String>>()
        .into_iter()
        .collect();

    let rebuilt = index
        .chunks
        .iter()
        .enumerate()
        .map(|(pos, chunk)| {
            engine
                .embed(&chunk.content)
                .map(|vector| (pos, vector))
                .map_err(|e| format!("embed failed for {}: {e}", chunk.file_path))
        })
        .collect::<Result<Vec<(usize, Vec<f32>)>, String>>()?;

    embed_idx.update(&index.chunks, &rebuilt, &all_files);
    embed_idx
        .save(root)
        .map_err(|e| format!("save embeddings failed: {e}"))?;

    match embed_idx.get_aligned_embeddings(&index.chunks) {
        Some(aligned) => {
            let coverage = embed_idx.coverage(index.chunks.len());
            Ok((aligned, coverage, all_files))
        }
        None => Err("embedding alignment failed after full rebuild".to_string()),
    }
}
1054
1055struct SearchFilter {
1056    allowed_exts: Option<HashSet<String>>,
1057    path_glob: Option<glob::Pattern>,
1058}
1059
1060impl SearchFilter {
1061    fn new(languages: Option<&[String]>, path_glob: Option<&str>) -> Result<Self, String> {
1062        let allowed_exts = languages.map(normalize_languages);
1063        let path_glob = match path_glob {
1064            None => None,
1065            Some(s) if s.trim().is_empty() => None,
1066            Some(s) => Some(glob::Pattern::new(s).map_err(|e| e.msg.to_string())?),
1067        };
1068        Ok(Self {
1069            allowed_exts,
1070            path_glob,
1071        })
1072    }
1073
1074    fn is_active(&self) -> bool {
1075        self.allowed_exts.is_some() || self.path_glob.is_some()
1076    }
1077
1078    fn matches(&self, rel_path: &str) -> bool {
1079        let rel_path = rel_path.replace('\\', "/");
1080        if let Some(p) = &self.path_glob {
1081            if !p.matches(&rel_path) {
1082                return false;
1083            }
1084        }
1085        if let Some(exts) = &self.allowed_exts {
1086            let ext = Path::new(&rel_path)
1087                .extension()
1088                .and_then(|e| e.to_str())
1089                .unwrap_or("")
1090                .to_lowercase();
1091            if ext.is_empty() || !exts.contains(&ext) {
1092                return false;
1093            }
1094        }
1095        true
1096    }
1097}
1098
/// Expands language names or extensions into the set of file extensions they cover.
///
/// Each entry is trimmed, stripped of leading dots and lower-cased. Known aliases
/// (for example `typescript` or `c++`) expand to their full extension group; any other
/// non-empty entry is kept verbatim as an extension. Blank entries are ignored.
fn normalize_languages(langs: &[String]) -> HashSet<String> {
    let mut extensions = HashSet::new();
    for lang in langs {
        let key = lang.trim().trim_start_matches('.').to_lowercase();
        // Alias table; an empty slice means "not a known alias".
        let known: &[&str] = match key.as_str() {
            "rust" | "rs" => &["rs"],
            "ts" | "typescript" => &["ts", "tsx"],
            "js" | "javascript" => &["js", "jsx", "mjs", "cjs"],
            "py" | "python" => &["py"],
            "go" => &["go"],
            "java" => &["java"],
            "ruby" | "rb" => &["rb"],
            "php" => &["php"],
            "c" => &["c", "h"],
            "cpp" | "c++" | "cc" => &["cpp", "hpp", "cc", "hh"],
            "cs" | "csharp" => &["cs"],
            "swift" => &["swift"],
            "kt" | "kotlin" => &["kt", "kts"],
            "json" => &["json"],
            "yaml" | "yml" => &["yaml", "yml"],
            _ => &[],
        };

        if known.is_empty() {
            // Unknown entries pass through as-is, unless nothing is left after cleanup.
            if !key.is_empty() {
                extensions.insert(key);
            }
        } else {
            extensions.extend(known.iter().map(|ext| (*ext).to_string()));
        }
    }
    extensions
}
1167
#[cfg(test)]
mod filter_tests {
    use super::*;

    /// A language name limits matches to that language's file extensions.
    #[test]
    fn filter_language_rust() {
        let languages = [String::from("rust")];
        let filter = SearchFilter::new(Some(&languages[..]), None)
            .expect("language-only filter should build");

        assert!(filter.matches("src/main.rs"));
        assert!(!filter.matches("src/main.ts"));
    }

    /// A path glob limits matches to paths under the given prefix.
    #[test]
    fn filter_path_glob() {
        let filter =
            SearchFilter::new(None, Some("rust/src/**")).expect("valid glob should build");

        assert!(filter.matches("rust/src/core/mod.rs"));
        assert!(!filter.matches("website/src/pages/index.astro"));
    }
}
1186
#[cfg(test)]
mod determinism_tests {
    use super::*;

    /// Builds a one-line `foo` function hit with a zero RRF score and no per-ranker data.
    fn hit(file_path: &str, snippet: &str) -> HybridResult {
        HybridResult {
            file_path: file_path.to_string(),
            symbol_name: "foo".to_string(),
            kind: crate::core::bm25_index::ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            snippet: snippet.to_string(),
            rrf_score: 0.0,
            bm25_score: None,
            dense_score: None,
            bm25_rank: None,
            dense_rank: None,
        }
    }

    #[test]
    fn rrf_merge_hybrid_is_deterministic_on_ties() {
        let a = hit("a.rs", "a");
        let b = hit("b.rs", "b");

        // Two lists with swapped ranks yield identical RRF sums for a and b.
        let ranked_lists = vec![
            ("root".to_string(), vec![a.clone(), b.clone()]),
            ("root".to_string(), vec![b, a]),
        ];
        let fused = rrf_merge_hybrid(ranked_lists, 10);

        assert_eq!(fused.len(), 2);
        assert_eq!(fused[0].file_path, "a.rs");
        assert_eq!(fused[1].file_path, "b.rs");
    }
}