1use std::collections::HashSet;
2use std::path::Path;
3
4use crate::core::bm25_index::{format_search_results, BM25Index};
5use crate::core::embedding_index::EmbeddingIndex;
6#[cfg(feature = "embeddings")]
7use crate::core::embeddings::EmbeddingEngine;
8use crate::core::hybrid_search::{format_hybrid_results, HybridConfig, HybridResult};
9use crate::tools::CrpMode;
10
11#[allow(clippy::too_many_arguments)]
13pub fn handle(
14 query: &str,
15 path: &str,
16 top_k: usize,
17 crp_mode: CrpMode,
18 languages: Option<&[String]>,
19 path_glob: Option<&str>,
20 mode: Option<&str>,
21 workspace: Option<bool>,
22 artifacts: Option<bool>,
23) -> String {
24 let root = Path::new(path);
25 if !root.exists() {
26 return format!("ERR: path does not exist: {path}");
27 }
28
29 let root = if root.is_file() {
30 root.parent().unwrap_or(root)
31 } else {
32 root
33 };
34
35 let filter = match SearchFilter::new(languages, path_glob) {
36 Ok(f) => f,
37 Err(e) => return format!("ERR: invalid filter: {e}"),
38 };
39
40 let compact = crp_mode.is_tdd();
41 let mode = mode.unwrap_or("hybrid").to_lowercase();
42 let workspace = workspace.unwrap_or(false);
43 let artifacts = artifacts.unwrap_or(false);
44
45 if artifacts {
46 return artifacts_search(query, root, top_k, compact, &filter, workspace);
47 }
48 if workspace {
49 return workspace_search(query, root, top_k, compact, &filter, &mode);
50 }
51
52 let index = match load_or_refresh_bm25(root) {
53 Bm25LoadResult::Ready(idx) => idx,
54 Bm25LoadResult::Building => {
55 return "BM25 index is being built in the background. \
56 Run ctx_semantic_search again in ~30s, or use action=reindex to wait for completion."
57 .to_string();
58 }
59 };
60 if index.doc_count == 0 {
61 return "No code files found to index.".to_string();
62 }
63
64 match mode.as_str() {
65 "bm25" => {
66 let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
67 if filter.is_active() {
68 results.retain(|x| filter.matches(&x.file_path));
69 }
70 results.truncate(top_k);
71
72 let header = if compact {
73 format!(
74 "semantic_search(bm25,{top_k}) → {} results, {} chunks indexed\n",
75 results.len(),
76 index.doc_count
77 )
78 } else {
79 format!(
80 "Semantic search (BM25): \"{}\" ({} results from {} indexed chunks)\n",
81 truncate_query(query, 60),
82 results.len(),
83 index.doc_count,
84 )
85 };
86 format!("{header}{}", format_search_results(&results, compact))
87 }
88 "dense" => dense_search_mode(query, root, &index, top_k, compact, &filter),
89 _ => hybrid_search_mode(query, root, &index, top_k, compact, &filter),
90 }
91}
92
93pub fn search_hits(
100 query: &str,
101 path: &str,
102 top_k: usize,
103 mode: &str,
104 languages: Option<&[String]>,
105 path_glob: Option<&str>,
106) -> Result<Vec<HybridResult>, String> {
107 let root = Path::new(path);
108 if !root.exists() {
109 return Err(format!("path does not exist: {path}"));
110 }
111 let root = if root.is_file() {
112 root.parent().unwrap_or(root)
113 } else {
114 root
115 };
116
117 let filter =
118 SearchFilter::new(languages, path_glob).map_err(|e| format!("invalid filter: {e}"))?;
119
120 let index = BM25Index::load_or_build(root);
121 if index.doc_count == 0 {
122 return Ok(Vec::new());
123 }
124
125 let results = match mode.to_lowercase().as_str() {
126 "bm25" => bm25_hits(&index, query, top_k, &filter),
127 "dense" => {
128 #[cfg(feature = "embeddings")]
129 {
130 dense_results_for_root(query, root, &index, top_k, &filter).map(|(v, _)| v)?
131 }
132 #[cfg(not(feature = "embeddings"))]
133 {
134 return Err("dense mode requires the embeddings feature".to_string());
135 }
136 }
137 _ => {
138 #[cfg(feature = "embeddings")]
139 {
140 hybrid_results_for_root(query, root, &index, top_k, &filter).map(|(v, _)| v)?
141 }
142 #[cfg(not(feature = "embeddings"))]
143 {
144 bm25_hits(&index, query, top_k, &filter)
145 }
146 }
147 };
148
149 Ok(results)
150}
151
152fn bm25_hits(
153 index: &BM25Index,
154 query: &str,
155 top_k: usize,
156 filter: &SearchFilter,
157) -> Vec<HybridResult> {
158 let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
159 if filter.is_active() {
160 results.retain(|x| filter.matches(&x.file_path));
161 }
162 results.truncate(top_k);
163 results
164 .into_iter()
165 .map(HybridResult::from_bm25_public)
166 .collect()
167}
168
169pub fn handle_reindex(path: &str) -> String {
171 let root = Path::new(path);
172 if !root.exists() {
173 return format!("ERR: path does not exist: {path}");
174 }
175 let root = if root.is_file() {
176 root.parent().unwrap_or(root)
177 } else {
178 root
179 };
180
181 let idx = BM25Index::build_from_directory(root);
182 let files = idx.files.len();
183 let chunks = idx.doc_count;
184 let _ = idx.save(root);
185
186 format!("Reindexed {path}: {files} files, {chunks} chunks")
187}
188
189pub fn handle_reindex_artifacts(path: &str, workspace: bool) -> String {
190 let root = Path::new(path);
191 if !root.exists() {
192 return format!("ERR: path does not exist: {path}");
193 }
194 let root = if root.is_file() {
195 root.parent().unwrap_or(root)
196 } else {
197 root
198 };
199
200 let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
201 let mut warnings: Vec<String> = Vec::new();
202
203 if workspace {
204 let linked = crate::core::workspace_config::load_linked_projects(root);
205 warnings.extend(linked.warnings);
206 roots.extend(linked.roots);
207 }
208
209 let mut total_files = 0usize;
210 let mut total_chunks = 0usize;
211 for r in roots {
212 let (idx, w) = crate::core::artifact_index::rebuild_from_scratch(&r);
213 warnings.extend(w);
214 total_files += idx.files.len();
215 total_chunks += idx.doc_count;
216 }
217
218 if warnings.is_empty() {
219 format!("Reindexed artifacts: {total_files} files, {total_chunks} chunks")
220 } else {
221 format!(
222 "Reindexed artifacts: {total_files} files, {total_chunks} chunks ({} warning(s))",
223 warnings.len()
224 )
225 }
226}
227
228pub fn handle_find_related(
233 file_path: &str,
234 line: usize,
235 project_root: &str,
236 top_k: usize,
237 crp_mode: CrpMode,
238) -> String {
239 let root = Path::new(project_root);
240 if !root.exists() {
241 return format!("ERR: path does not exist: {project_root}");
242 }
243
244 let index = BM25Index::load_or_build(root);
245 if index.doc_count == 0 {
246 return "ERR: empty index. Try action=reindex first.".to_string();
247 }
248
249 let source_chunk = index
250 .chunks
251 .iter()
252 .find(|c| c.file_path == file_path && c.start_line <= line && c.end_line >= line);
253
254 let Some(source_chunk) = source_chunk else {
255 return format!(
256 "ERR: no indexed chunk found at {file_path}:{line}. Try action=reindex first."
257 );
258 };
259
260 let query_text = source_chunk.content.clone();
261 let source_file = source_chunk.file_path.clone();
262 let source_start = source_chunk.start_line;
263
264 let compact = crp_mode != CrpMode::Off;
265
266 let results = find_related_internal(&query_text, root, &index, top_k + 5, compact);
267
268 let mut lines: Vec<String> = results
269 .into_iter()
270 .filter(|l| !l.contains(&format!("{source_file}:{source_start}-")))
271 .take(top_k)
272 .collect();
273
274 let header = if compact {
275 format!(
276 "find_related({file_path}:{line}) → {} results\n",
277 lines.len()
278 )
279 } else {
280 format!("Find related to {file_path}:{line} (semantic similarity)\n")
281 };
282
283 lines.insert(0, header);
284 lines.join("")
285}
286
287fn find_related_internal(
288 query: &str,
289 root: &Path,
290 index: &BM25Index,
291 top_k: usize,
292 compact: bool,
293) -> Vec<String> {
294 let Ok(filter) = SearchFilter::new(None, None) else {
295 return vec!["ERR: filter init failed\n".to_string()];
296 };
297 let output = hybrid_search_mode(query, root, index, top_k, compact, &filter);
298 output.lines().map(|l| format!("{l}\n")).collect()
299}
300
/// Shortens `q` for display to at most `max` characters.
///
/// Strings whose byte length is already within `max` are returned unchanged.
/// Otherwise the cut is made at the start of the character with index `max`,
/// so the slice always ends on a UTF-8 boundary.
fn truncate_query(q: &str, max: usize) -> &str {
    if q.len() <= max {
        return q;
    }
    // Fewer than `max + 1` characters: nothing to cut, keep the whole string.
    let cut = q
        .char_indices()
        .nth(max)
        .map_or(q.len(), |(byte_idx, _)| byte_idx);
    &q[..cut]
}
310
// Per-thread slot holding an optional handle to the shared BM25 cache.
// It starts out as `None`; `set_thread_cache` installs a handle for the
// current thread and `get_thread_cache` reads it back.
std::thread_local! {
    static BM25_SHARED_CACHE: std::cell::RefCell<Option<crate::core::bm25_cache::SharedBm25Cache>> =
        const { std::cell::RefCell::new(None) };
}
315
316pub fn set_thread_cache(cache: crate::core::bm25_cache::SharedBm25Cache) {
318 BM25_SHARED_CACHE.with(|c| {
319 *c.borrow_mut() = Some(cache);
320 });
321}
322
323pub fn get_thread_cache() -> Option<crate::core::bm25_cache::SharedBm25Cache> {
327 BM25_SHARED_CACHE.with(|c| c.borrow().clone())
328}
329
/// Outcome of trying to obtain a BM25 index for a search root.
pub(crate) enum Bm25LoadResult {
    /// An index is available and can be queried.
    Ready(std::sync::Arc<BM25Index>),
    /// No index could be loaded yet (a build is in progress or did not finish
    /// within the cold-build budget); the caller should retry later.
    Building,
}
335
336fn load_or_refresh_bm25(root: &Path) -> Bm25LoadResult {
337 let cached = BM25_SHARED_CACHE.with(|c| {
338 let borrow = c.borrow();
339 borrow
340 .as_ref()
341 .and_then(|cache| crate::core::bm25_cache::get_or_background(cache, root))
342 });
343 if let Some(idx) = cached {
344 return Bm25LoadResult::Ready(idx);
345 }
346
347 let root_str = root.to_string_lossy().to_string();
348
349 if let Some(idx) = crate::core::index_orchestrator::try_load_bm25_index(&root_str) {
350 let idx = std::sync::Arc::new(idx);
351 store_in_thread_cache(root, &idx);
352 return Bm25LoadResult::Ready(idx);
353 }
354
355 if crate::core::index_orchestrator::is_building() {
356 return Bm25LoadResult::Building;
357 }
358
359 crate::core::index_orchestrator::ensure_all_background(&root_str);
365
366 let deadline = std::time::Instant::now() + bm25_cold_build_budget();
367 loop {
368 if let Some(idx) = crate::core::index_orchestrator::try_load_bm25_index(&root_str) {
369 let idx = std::sync::Arc::new(idx);
370 store_in_thread_cache(root, &idx);
371 return Bm25LoadResult::Ready(idx);
372 }
373 if std::time::Instant::now() >= deadline {
374 return Bm25LoadResult::Building;
375 }
376 std::thread::sleep(std::time::Duration::from_millis(50));
377 }
378}
379
/// How long a search may wait for a cold BM25 build before giving up.
///
/// Read from the `LEAN_CTX_BM25_COLD_BUDGET_MS` environment variable
/// (milliseconds); unset or unparsable values fall back to 3000 ms.
fn bm25_cold_build_budget() -> std::time::Duration {
    const DEFAULT_BUDGET_MS: u64 = 3000;

    let millis = match std::env::var("LEAN_CTX_BM25_COLD_BUDGET_MS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_BUDGET_MS),
        Err(_) => DEFAULT_BUDGET_MS,
    };
    std::time::Duration::from_millis(millis)
}
389
390fn store_in_thread_cache(root: &Path, idx: &std::sync::Arc<BM25Index>) {
391 BM25_SHARED_CACHE.with(|c| {
392 let borrow = c.borrow();
393 if let Some(cache) = borrow.as_ref() {
394 let mut guard = cache
395 .lock()
396 .unwrap_or_else(std::sync::PoisonError::into_inner);
397 *guard = Some(crate::core::bm25_cache::Bm25CacheEntry {
398 root: root.to_path_buf(),
399 index: std::sync::Arc::clone(idx),
400 loaded_at: std::time::Instant::now(),
401 fingerprint: crate::core::bm25_cache::index_fingerprint(root),
402 });
403 }
404 });
405}
406
/// Number of candidates to fetch from an index for a request of `top_k` hits.
///
/// Without a filter this is exactly `top_k`. With a filter the pool is widened
/// to ten times `top_k` (at least 100, capped at 500) so that enough hits
/// survive filtering. The pool is never smaller than `top_k` itself, so a
/// filtered search can still return as many hits as an unfiltered one.
fn filtered_candidate_k(top_k: usize, filtered: bool) -> usize {
    const OVERSAMPLE_FACTOR: usize = 10;
    const MIN_BASE: usize = 10;
    const OVERSAMPLE_CAP: usize = 500;

    if !filtered {
        return top_k;
    }
    let widened = top_k
        .max(MIN_BASE)
        .saturating_mul(OVERSAMPLE_FACTOR)
        .min(OVERSAMPLE_CAP);
    // The cap limits over-fetching only; it must not undercut the request.
    widened.max(top_k)
}
414
// Constant `K` in the reciprocal-rank term `1 / (K + rank + 1)` used when
// merging per-project result lists (`rrf_merge_hybrid`, `rrf_merge_bm25`).
const WORKSPACE_RRF_K: f64 = 60.0;
416
417fn artifacts_search(
418 query: &str,
419 root: &Path,
420 top_k: usize,
421 compact: bool,
422 filter: &SearchFilter,
423 workspace: bool,
424) -> String {
425 let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
426 let mut warnings: Vec<String> = Vec::new();
427
428 if workspace {
429 let linked = crate::core::workspace_config::load_linked_projects(root);
430 warnings.extend(linked.warnings);
431 roots.extend(linked.roots);
432 }
433 roots.sort();
434 roots.dedup();
435
436 let mut per_project: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)> = Vec::new();
437 let mut total_chunks = 0usize;
438
439 for r in &roots {
440 let label = label_for_root(r);
441 let (idx, w) = crate::core::artifact_index::load_or_build(r);
442 warnings.extend(w);
443 total_chunks += idx.doc_count;
444 if idx.doc_count == 0 {
445 continue;
446 }
447
448 let mut results = idx.search(query, filtered_candidate_k(top_k, filter.is_active()));
449 if filter.is_active() {
450 results.retain(|x| filter.matches(&x.file_path));
451 }
452 results.truncate(top_k);
453
454 for res in &mut results {
455 res.file_path = if workspace {
456 format!("[project:{label}] [artifact] {}", res.file_path)
457 } else {
458 format!("[artifact] {}", res.file_path)
459 };
460 }
461
462 per_project.push((label, results));
463 }
464
465 let mut fused: Vec<crate::core::bm25_index::SearchResult> = if per_project.len() <= 1 {
466 per_project
467 .into_iter()
468 .next()
469 .map(|(_, v)| v)
470 .unwrap_or_default()
471 } else {
472 rrf_merge_bm25(per_project, top_k)
473 };
474
475 if fused.is_empty() {
476 return "No artifact files found to index.".to_string();
477 }
478
479 fused.truncate(top_k);
480
481 let header = if compact {
482 if workspace {
483 format!(
484 "semantic_search(artifacts,workspace,{top_k}) → {} results, projects={}, {} chunks indexed\n",
485 fused.len(),
486 roots.len(),
487 total_chunks
488 )
489 } else {
490 format!(
491 "semantic_search(artifacts,{top_k}) → {} results, {} chunks indexed\n",
492 fused.len(),
493 total_chunks
494 )
495 }
496 } else if workspace {
497 format!(
498 "Semantic search (Artifacts/Workspace): \"{}\" ({} results from {} projects)\n",
499 truncate_query(query, 60),
500 fused.len(),
501 roots.len()
502 )
503 } else {
504 format!(
505 "Semantic search (Artifacts): \"{}\" ({} results)\n",
506 truncate_query(query, 60),
507 fused.len()
508 )
509 };
510
511 let mut out = format!("{header}{}", format_search_results(&fused, compact));
512 if !warnings.is_empty() && !compact {
513 out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
514 for w in warnings.iter().take(20) {
515 out.push_str(&format!("- {w}\n"));
516 }
517 }
518 out
519}
520
521fn workspace_search(
522 query: &str,
523 root: &Path,
524 top_k: usize,
525 compact: bool,
526 filter: &SearchFilter,
527 mode: &str,
528) -> String {
529 let linked = crate::core::workspace_config::load_linked_projects(root);
530 let mut warnings = linked.warnings;
531
532 let mut roots: Vec<std::path::PathBuf> = vec![root.to_path_buf()];
533 roots.extend(linked.roots);
534 roots.sort();
535 roots.dedup();
536
537 let mut per_project: Vec<(String, Vec<HybridResult>)> = Vec::new();
538 let mut avg_cov: Option<f64> = None;
539 let mut cov_count = 0usize;
540
541 for r in &roots {
542 let label = label_for_root(r);
543 let index = BM25Index::load_or_build(r);
544 if index.doc_count == 0 {
545 continue;
546 }
547
548 let mut results: Vec<HybridResult> = match mode {
549 "bm25" => {
550 let mut bm25 = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
551 if filter.is_active() {
552 bm25.retain(|x| filter.matches(&x.file_path));
553 }
554 bm25.truncate(top_k);
555 bm25.into_iter()
556 .map(HybridResult::from_bm25_public)
557 .collect()
558 }
559 "dense" => {
560 #[cfg(feature = "embeddings")]
561 {
562 match dense_results_for_root(query, r, &index, top_k, filter) {
563 Ok((v, cov)) => {
564 avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
565 cov_count += 1;
566 v
567 }
568 Err(e) => {
569 warnings.push(format!("[{label}] dense search failed: {e}"));
570 let mut bm25 = index
571 .search(query, filtered_candidate_k(top_k, filter.is_active()));
572 if filter.is_active() {
573 bm25.retain(|x| filter.matches(&x.file_path));
574 }
575 bm25.truncate(top_k);
576 bm25.into_iter()
577 .map(HybridResult::from_bm25_public)
578 .collect()
579 }
580 }
581 }
582 #[cfg(not(feature = "embeddings"))]
583 {
584 let _ = (&label, &warnings);
585 let mut bm25 =
586 index.search(query, filtered_candidate_k(top_k, filter.is_active()));
587 if filter.is_active() {
588 bm25.retain(|x| filter.matches(&x.file_path));
589 }
590 bm25.truncate(top_k);
591 bm25.into_iter()
592 .map(HybridResult::from_bm25_public)
593 .collect()
594 }
595 }
596 _ => {
597 #[cfg(feature = "embeddings")]
598 {
599 match hybrid_results_for_root(query, r, &index, top_k, filter) {
600 Ok((v, cov)) => {
601 avg_cov = Some(avg_cov.unwrap_or(0.0) + cov);
602 cov_count += 1;
603 v
604 }
605 Err(e) => {
606 warnings.push(format!("[{label}] hybrid search failed: {e}"));
607 let mut bm25 = index
608 .search(query, filtered_candidate_k(top_k, filter.is_active()));
609 if filter.is_active() {
610 bm25.retain(|x| filter.matches(&x.file_path));
611 }
612 bm25.truncate(top_k);
613 bm25.into_iter()
614 .map(HybridResult::from_bm25_public)
615 .collect()
616 }
617 }
618 }
619 #[cfg(not(feature = "embeddings"))]
620 {
621 let _ = (&label, &warnings);
622 let mut bm25 =
623 index.search(query, filtered_candidate_k(top_k, filter.is_active()));
624 if filter.is_active() {
625 bm25.retain(|x| filter.matches(&x.file_path));
626 }
627 bm25.truncate(top_k);
628 bm25.into_iter()
629 .map(HybridResult::from_bm25_public)
630 .collect()
631 }
632 }
633 };
634
635 for res in &mut results {
636 res.file_path = format!("[project:{label}] {}", res.file_path);
637 }
638 per_project.push((label, results));
639 }
640
641 let mut fused: Vec<HybridResult> = if per_project.len() <= 1 {
642 per_project
643 .into_iter()
644 .next()
645 .map(|(_, v)| v)
646 .unwrap_or_default()
647 } else {
648 rrf_merge_hybrid(per_project, top_k)
649 };
650
651 if fused.is_empty() {
652 return "No code files found to index.".to_string();
653 }
654
655 fused.truncate(top_k);
656 let cov = avg_cov.and_then(|s| {
657 if cov_count == 0 {
658 None
659 } else {
660 Some(s / cov_count as f64)
661 }
662 });
663
664 let header = if compact {
665 match (mode, cov) {
666 (_, Some(c)) => format!(
667 "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}, embed_cov={:.0}%\n",
668 fused.len(),
669 roots.len(),
670 c * 100.0
671 ),
672 _ => format!(
673 "semantic_search(workspace,{mode},{top_k}) → {} results, projects={}\n",
674 fused.len(),
675 roots.len()
676 ),
677 }
678 } else {
679 format!(
680 "Workspace semantic search ({mode}): \"{}\" ({} results from {} projects)\n",
681 truncate_query(query, 60),
682 fused.len(),
683 roots.len()
684 )
685 };
686
687 let mut out = format!("{header}{}", format_hybrid_results(&fused, compact));
688 if !warnings.is_empty() && !compact {
689 out.push_str(&format!("\nWarnings ({}):\n", warnings.len()));
690 for w in warnings.iter().take(20) {
691 out.push_str(&format!("- {w}\n"));
692 }
693 }
694 out
695}
696
697fn rrf_merge_hybrid(lists: Vec<(String, Vec<HybridResult>)>, top_k: usize) -> Vec<HybridResult> {
698 use std::collections::HashMap;
699
700 let mut acc: HashMap<String, (HybridResult, f64)> = HashMap::new();
701 for (label, results) in lists {
702 for (rank, r) in results.into_iter().enumerate() {
703 let key = format!(
704 "{label}|{}|{}|{}|{}",
705 r.file_path, r.symbol_name, r.start_line, r.end_line
706 );
707 let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
708 acc.entry(key)
709 .and_modify(|(_, s)| *s += rrf)
710 .or_insert((r, rrf));
711 }
712 }
713
714 let mut out: Vec<HybridResult> = acc
715 .into_values()
716 .map(|(mut r, s)| {
717 r.rrf_score = s;
718 r
719 })
720 .collect();
721 out.sort_by(|a, b| {
722 b.rrf_score
723 .partial_cmp(&a.rrf_score)
724 .unwrap_or(std::cmp::Ordering::Equal)
725 .then_with(|| a.file_path.cmp(&b.file_path))
726 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
727 .then_with(|| a.start_line.cmp(&b.start_line))
728 .then_with(|| a.end_line.cmp(&b.end_line))
729 });
730 out.truncate(top_k);
731 out
732}
733
734fn rrf_merge_bm25(
735 lists: Vec<(String, Vec<crate::core::bm25_index::SearchResult>)>,
736 top_k: usize,
737) -> Vec<crate::core::bm25_index::SearchResult> {
738 use std::collections::HashMap;
739
740 let mut acc: HashMap<String, (crate::core::bm25_index::SearchResult, f64)> = HashMap::new();
741 for (label, results) in lists {
742 for (rank, r) in results.into_iter().enumerate() {
743 let key = format!(
744 "{label}|{}|{}|{}|{}",
745 r.file_path, r.symbol_name, r.start_line, r.end_line
746 );
747 let rrf = 1.0 / (WORKSPACE_RRF_K + (rank as f64) + 1.0);
748 acc.entry(key)
749 .and_modify(|(_, s)| *s += rrf)
750 .or_insert((r, rrf));
751 }
752 }
753
754 let mut out: Vec<crate::core::bm25_index::SearchResult> = acc
755 .into_values()
756 .map(|(mut r, s)| {
757 r.score = s;
758 r
759 })
760 .collect();
761 out.sort_by(|a, b| {
762 b.score
763 .partial_cmp(&a.score)
764 .unwrap_or(std::cmp::Ordering::Equal)
765 .then_with(|| a.file_path.cmp(&b.file_path))
766 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
767 .then_with(|| a.start_line.cmp(&b.start_line))
768 .then_with(|| a.end_line.cmp(&b.end_line))
769 });
770 out.truncate(top_k);
771 out
772}
773
/// Dense (embedding-only) search for one project root.
///
/// Loads the embedding engine and index, brings the embeddings up to date with
/// `index`, queries the dense backend selected from the environment and returns
/// at most `top_k` hits together with the embedding coverage.
///
/// # Errors
/// Propagates the message of whichever step failed.
#[cfg(feature = "embeddings")]
fn dense_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    let (engine, mut embed_idx) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embed_idx)?;
    let backend = crate::core::dense_backend::DenseBackendKind::try_from_env()?;

    // The backend receives a path predicate only while a filter is active.
    let filtering = filter.is_active();
    let path_allowed = |p: &str| filter.matches(p);
    let predicate: Option<&dyn Fn(&str) -> bool> = if filtering {
        Some(&path_allowed as &dyn Fn(&str) -> bool)
    } else {
        None
    };

    let mut hits = crate::core::dense_backend::dense_results_as_hybrid(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        filtered_candidate_k(top_k, filtering),
        predicate,
    )?;
    hits.truncate(top_k);

    Ok((hits, coverage))
}
808
/// Hybrid search for one project root.
///
/// Loads the embedding engine and index, brings the embeddings up to date with
/// `index`, runs the hybrid backend (with optional graph ranks for the root),
/// applies the SPLADE boost when its configured weight is positive, and returns
/// at most `top_k` hits together with the embedding coverage.
///
/// # Errors
/// Propagates the message of whichever step failed.
#[cfg(feature = "embeddings")]
fn hybrid_results_for_root(
    query: &str,
    root: &Path,
    index: &BM25Index,
    top_k: usize,
    filter: &SearchFilter,
) -> Result<(Vec<HybridResult>, f64), String> {
    let (engine, mut embed_idx) = load_engine_and_index(root)?;
    let (aligned, coverage, changed_files) =
        ensure_embeddings(root, index, engine, &mut embed_idx)?;
    let backend = crate::core::dense_backend::DenseBackendKind::try_from_env()?;
    let cfg = HybridConfig::from_config();

    // The backend receives a path predicate only while a filter is active.
    let filtering = filter.is_active();
    let path_allowed = |p: &str| filter.matches(p);
    let predicate: Option<&dyn Fn(&str) -> bool> = if filtering {
        Some(&path_allowed as &dyn Fn(&str) -> bool)
    } else {
        None
    };
    let candidate_k = filtered_candidate_k(top_k, filtering);
    let graph_ranks = graph_rrf_ranks_for_search_root(root);

    let mut hits = crate::core::dense_backend::hybrid_results(
        backend,
        root,
        index,
        engine,
        &aligned,
        &changed_files,
        query,
        candidate_k,
        &cfg,
        predicate,
        graph_ranks.as_ref(),
    )?;

    if cfg.splade_weight > 0.0 {
        let splade_hits = crate::core::splade_retrieval::hybrid_retrieve(query, index, candidate_k);
        if !splade_hits.is_empty() {
            boost_with_splade(&mut hits, &splade_hits, cfg.splade_weight);
        }
    }

    hits.truncate(top_k);
    Ok((hits, coverage))
}
854
855fn boost_with_splade(
857 results: &mut [HybridResult],
858 splade: &[crate::core::splade_retrieval::SpladeResult],
859 weight: f64,
860) {
861 use std::collections::HashMap;
862 let rrf_k = 60.0_f64;
863
864 let boosts: HashMap<&str, f64> = splade
865 .iter()
866 .enumerate()
867 .map(|(rank, sr)| (sr.file_path.as_str(), weight / (rrf_k + rank as f64 + 1.0)))
868 .collect();
869
870 for r in results.iter_mut() {
871 if let Some(&boost) = boosts.get(r.file_path.as_str()) {
872 r.rrf_score += boost;
873 }
874 }
875
876 results.sort_by(|a, b| {
877 b.rrf_score
878 .partial_cmp(&a.rrf_score)
879 .unwrap_or(std::cmp::Ordering::Equal)
880 });
881}
882
/// Short display label for a project root: its final path component when that
/// is non-empty valid UTF-8, otherwise the whole path rendered lossily.
fn label_for_root(root: &Path) -> String {
    match root.file_name().and_then(|name| name.to_str()) {
        Some(name) if !name.is_empty() => name.to_owned(),
        _ => root.to_string_lossy().to_string(),
    }
}
890
891fn graph_rrf_ranks_for_search_root(
892 root: &Path,
893) -> Option<std::collections::HashMap<String, usize>> {
894 let root_s = root.to_string_lossy().to_string();
895 let session = crate::core::session::SessionState::load_latest_for_project_root(&root_s)?;
896
897 if session.files_touched.is_empty() {
898 return None;
899 }
900
901 let recent: Vec<String> = session
902 .files_touched
903 .iter()
904 .rev()
905 .filter(|f| path_under_search_root(&f.path, root))
906 .take(12)
907 .map(|f| f.path.clone())
908 .collect();
909
910 if recent.is_empty() {
911 return None;
912 }
913
914 crate::core::graph_context::graph_neighbor_ranks_for_recent_files(&root_s, &recent, 40, 120)
915}
916
917fn path_under_search_root(path: &str, root: &Path) -> bool {
918 let p = std::path::Path::new(path);
919 if p.is_absolute() {
920 let root_norm = crate::core::pathutil::safe_canonicalize_or_self(root);
921 let path_norm = crate::core::pathutil::safe_canonicalize_or_self(p);
922 path_norm.starts_with(&root_norm)
923 } else {
924 true
925 }
926}
927
928fn hybrid_search_mode(
929 query: &str,
930 root: &Path,
931 index: &BM25Index,
932 top_k: usize,
933 compact: bool,
934 filter: &SearchFilter,
935) -> String {
936 #[cfg(feature = "embeddings")]
937 {
938 let (engine, mut embed_idx) = match load_engine_and_index(root) {
939 Ok(v) => v,
940 Err(e) => return format!("ERR: {e}"),
941 };
942
943 let (aligned, coverage, changed_files) =
944 match ensure_embeddings(root, index, engine, &mut embed_idx) {
945 Ok(v) => v,
946 Err(e) => return format!("ERR: {e}"),
947 };
948
949 let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
950 Ok(v) => v,
951 Err(e) => return format!("ERR: {e}"),
952 };
953
954 let cfg = HybridConfig::from_config();
955 let filter_fn = |p: &str| filter.matches(p);
956 let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
957 .is_active()
958 .then_some(&filter_fn as &dyn Fn(&str) -> bool);
959 let graph_ranks = graph_rrf_ranks_for_search_root(root);
960 let graph_ranks_ref = graph_ranks.as_ref();
961 let mut results = match crate::core::dense_backend::hybrid_results(
962 backend,
963 root,
964 index,
965 engine,
966 &aligned,
967 &changed_files,
968 query,
969 top_k,
970 &cfg,
971 filter_pred,
972 graph_ranks_ref,
973 ) {
974 Ok(v) => v,
975 Err(e) => return format!("ERR: {e}"),
976 };
977
978 if cfg.splade_weight > 0.0 {
979 let splade = crate::core::splade_retrieval::hybrid_retrieve(query, index, top_k);
980 if !splade.is_empty() {
981 boost_with_splade(&mut results, &splade, cfg.splade_weight);
982 }
983 }
984
985 results.truncate(top_k);
986
987 let header = if compact {
988 format!(
989 "semantic_search(hybrid,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
990 results.len(),
991 index.doc_count,
992 coverage * 100.0
993 )
994 } else {
995 format!(
996 "Semantic search (Hybrid): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
997 truncate_query(query, 60),
998 results.len(),
999 index.doc_count,
1000 coverage * 100.0
1001 )
1002 };
1003
1004 format!("{header}{}", format_hybrid_results(&results, compact))
1005 }
1006 #[cfg(not(feature = "embeddings"))]
1007 {
1008 let mut results = index.search(query, filtered_candidate_k(top_k, filter.is_active()));
1009 if filter.is_active() {
1010 results.retain(|x| filter.matches(&x.file_path));
1011 }
1012
1013 if let Some(graph_ranks) = graph_rrf_ranks_for_search_root(root) {
1014 const GRAPH_RRF_K: f64 = 60.0;
1015 for r in &mut results {
1016 if let Some(&rank) = graph_ranks.get(&r.file_path) {
1017 r.score += 1.0 / (GRAPH_RRF_K + rank as f64 + 1.0);
1018 }
1019 }
1020 results.sort_by(|a, b| {
1021 b.score
1022 .partial_cmp(&a.score)
1023 .unwrap_or(std::cmp::Ordering::Equal)
1024 });
1025 }
1026
1027 results.truncate(top_k);
1028 let graph_tag = if graph_rrf_ranks_for_search_root(root).is_some() {
1029 "+graph"
1030 } else {
1031 ""
1032 };
1033 let header = if compact {
1034 format!(
1035 "semantic_search(bm25{graph_tag},{top_k}) → {} results, {} chunks indexed\n",
1036 results.len(),
1037 index.doc_count
1038 )
1039 } else {
1040 format!(
1041 "Semantic search (BM25{graph_tag}): \"{}\" ({} results from {} indexed chunks)\n",
1042 truncate_query(query, 60),
1043 results.len(),
1044 index.doc_count,
1045 )
1046 };
1047 format!("{header}{}", format_search_results(&results, compact))
1048 }
1049}
1050
1051fn dense_search_mode(
1052 query: &str,
1053 root: &Path,
1054 index: &BM25Index,
1055 top_k: usize,
1056 compact: bool,
1057 filter: &SearchFilter,
1058) -> String {
1059 #[cfg(feature = "embeddings")]
1060 {
1061 let (engine, mut embed_idx) = match load_engine_and_index(root) {
1062 Ok(v) => v,
1063 Err(e) => return format!("ERR: {e}"),
1064 };
1065
1066 let (aligned, coverage, changed_files) =
1067 match ensure_embeddings(root, index, engine, &mut embed_idx) {
1068 Ok(v) => v,
1069 Err(e) => return format!("ERR: {e}"),
1070 };
1071
1072 let backend = match crate::core::dense_backend::DenseBackendKind::try_from_env() {
1073 Ok(v) => v,
1074 Err(e) => return format!("ERR: {e}"),
1075 };
1076
1077 let filter_fn = |p: &str| filter.matches(p);
1078 let filter_pred: Option<&dyn Fn(&str) -> bool> = filter
1079 .is_active()
1080 .then_some(&filter_fn as &dyn Fn(&str) -> bool);
1081
1082 let candidate_k = filtered_candidate_k(top_k, filter.is_active());
1083 let mut results = match crate::core::dense_backend::dense_results_as_hybrid(
1084 backend,
1085 root,
1086 index,
1087 engine,
1088 &aligned,
1089 &changed_files,
1090 query,
1091 candidate_k,
1092 filter_pred,
1093 ) {
1094 Ok(v) => v,
1095 Err(e) => return format!("ERR: {e}"),
1096 };
1097 results.truncate(top_k);
1098
1099 let header = if compact {
1100 format!(
1101 "semantic_search(dense,{top_k}) → {} results, {} chunks, embed_cov={:.0}%\n",
1102 results.len(),
1103 index.doc_count,
1104 coverage * 100.0
1105 )
1106 } else {
1107 format!(
1108 "Semantic search (Dense): \"{}\" ({} results from {} indexed chunks, embeddings coverage {:.0}%)\n",
1109 truncate_query(query, 60),
1110 results.len(),
1111 index.doc_count,
1112 coverage * 100.0
1113 )
1114 };
1115
1116 format!("{header}{}", format_hybrid_results(&results, compact))
1117 }
1118 #[cfg(not(feature = "embeddings"))]
1119 {
1120 "ERR: embeddings feature not enabled".to_string()
1121 }
1122}
1123
/// Loads the shared embedding engine and the embedding index stored for `root`.
///
/// A missing index is replaced by an empty one. An index created with a
/// different model, or with a different vector dimension, is discarded (with a
/// warning) and replaced by an empty one so everything is re-embedded. The
/// returned index always carries a model id.
///
/// # Errors
/// Fails when the effective memory profile disables embeddings or the engine
/// cannot be loaded.
#[cfg(feature = "embeddings")]
fn load_engine_and_index(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    let cfg = crate::core::config::Config::load();
    if !crate::core::config::MemoryProfile::effective(&cfg).embeddings_enabled() {
        return Err("embeddings disabled by memory_profile=low".into());
    }

    let Some(engine) = crate::core::embeddings::shared_engine() else {
        return Err("embedding engine load failed".to_string());
    };

    let model_name = engine.model_name();
    let mut idx = match EmbeddingIndex::load(root) {
        Some(existing) => existing,
        None => EmbeddingIndex::new_with_model(engine.dimensions(), model_name),
    };

    // The model check takes precedence over the dimension check.
    let stale = if let Some((stored, current)) = idx.model_mismatch(model_name) {
        tracing::warn!(
            "[embeddings] model changed: {stored} → {current}. Re-indexing all embeddings."
        );
        true
    } else if idx.dimension_mismatch(engine.dimensions()) {
        tracing::warn!(
            "[embeddings] dimension mismatch: index={}d, engine={}d. Re-indexing.",
            idx.dimensions,
            engine.dimensions()
        );
        true
    } else {
        false
    };
    if stale {
        idx = EmbeddingIndex::new_with_model(engine.dimensions(), model_name);
    }

    if idx.model_id.is_none() {
        idx.model_id = Some(model_name.to_string());
    }

    Ok((engine, idx))
}
1161
/// Brings `embed_idx` in line with the chunks of `index` and returns the
/// embeddings aligned to those chunks.
///
/// First only chunks of files reported by `files_needing_update` are
/// re-embedded and the index is saved. If the embeddings still cannot be
/// aligned with the chunks, every chunk is re-embedded and the index is saved
/// again.
///
/// Returns `(aligned embeddings, coverage, files that were re-embedded)`; after
/// a full rebuild the file list contains every indexed file.
///
/// # Errors
/// Fails when embedding a chunk fails, saving fails, or alignment is still
/// impossible after a full rebuild.
#[cfg(feature = "embeddings")]
fn ensure_embeddings(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    let mut changed_files = embed_idx.files_needing_update(&index.chunks);
    changed_files.sort();
    changed_files.dedup();

    // Incremental pass: only chunks that belong to changed files.
    if !changed_files.is_empty() {
        let stale: std::collections::HashSet<&str> =
            changed_files.iter().map(|f| f.as_str()).collect();

        let fresh = index
            .chunks
            .iter()
            .enumerate()
            .filter(|(_, c)| stale.contains(c.file_path.as_str()))
            .map(|(i, c)| {
                engine
                    .embed(&c.content)
                    .map(|emb| (i, emb))
                    .map_err(|e| format!("embed failed for {}: {e}", c.file_path))
            })
            .collect::<Result<Vec<(usize, Vec<f32>)>, String>>()?;

        embed_idx.update(&index.chunks, &fresh, &changed_files);
        embed_idx
            .save(root)
            .map_err(|e| format!("save embeddings failed: {e}"))?;
    }

    if let Some(aligned) = embed_idx.get_aligned_embeddings(&index.chunks) {
        let coverage = embed_idx.coverage(index.chunks.len());
        return Ok((aligned, coverage, changed_files));
    }

    // Alignment failed: fall back to re-embedding every chunk.
    let mut all_files: Vec<String> = index.chunks.iter().map(|c| c.file_path.clone()).collect();
    all_files.sort();
    all_files.dedup();

    let rebuilt = index
        .chunks
        .iter()
        .enumerate()
        .map(|(i, c)| {
            engine
                .embed(&c.content)
                .map(|emb| (i, emb))
                .map_err(|e| format!("embed failed for {}: {e}", c.file_path))
        })
        .collect::<Result<Vec<(usize, Vec<f32>)>, String>>()?;

    embed_idx.update(&index.chunks, &rebuilt, &all_files);
    embed_idx
        .save(root)
        .map_err(|e| format!("save embeddings failed: {e}"))?;

    let Some(aligned) = embed_idx.get_aligned_embeddings(&index.chunks) else {
        return Err("embedding alignment failed after full rebuild".to_string());
    };
    let coverage = embed_idx.coverage(index.chunks.len());
    Ok((aligned, coverage, all_files))
}
1223
/// Optional restrictions applied to search hits: by file extension and/or by path glob.
struct SearchFilter {
    /// Lowercase file extensions (without the leading dot) a path may have;
    /// `None` disables the extension check.
    allowed_exts: Option<HashSet<String>>,
    /// Glob pattern matched against the path after `\` has been replaced by `/`;
    /// `None` disables the glob check.
    path_glob: Option<glob::Pattern>,
}
1228
1229impl SearchFilter {
1230 fn new(languages: Option<&[String]>, path_glob: Option<&str>) -> Result<Self, String> {
1231 let allowed_exts = languages.map(normalize_languages);
1232 let path_glob = match path_glob {
1233 None => None,
1234 Some(s) if s.trim().is_empty() => None,
1235 Some(s) => Some(glob::Pattern::new(s).map_err(|e| e.msg.to_string())?),
1236 };
1237 Ok(Self {
1238 allowed_exts,
1239 path_glob,
1240 })
1241 }
1242
1243 fn is_active(&self) -> bool {
1244 self.allowed_exts.is_some() || self.path_glob.is_some()
1245 }
1246
1247 fn matches(&self, rel_path: &str) -> bool {
1248 let rel_path = rel_path.replace('\\', "/");
1249 if let Some(p) = &self.path_glob {
1250 if !p.matches(&rel_path) {
1251 return false;
1252 }
1253 }
1254 if let Some(exts) = &self.allowed_exts {
1255 let ext = Path::new(&rel_path)
1256 .extension()
1257 .and_then(|e| e.to_str())
1258 .unwrap_or("")
1259 .to_lowercase();
1260 if ext.is_empty() || !exts.contains(&ext) {
1261 return false;
1262 }
1263 }
1264 true
1265 }
1266}
1267
/// Expands language names or extensions into the set of file extensions they stand for.
///
/// Each entry is trimmed, stripped of leading dots and lowercased before lookup. Known
/// aliases map to one or more extensions; any other non-empty entry is kept verbatim as
/// an extension, and blank entries are ignored.
fn normalize_languages(langs: &[String]) -> HashSet<String> {
    let mut extensions = HashSet::new();
    for lang in langs {
        let key = lang.trim().trim_start_matches('.').to_lowercase();
        let mapped: &[&str] = match key.as_str() {
            "rust" | "rs" => &["rs"],
            "ts" | "typescript" => &["ts", "tsx"],
            "js" | "javascript" => &["js", "jsx", "mjs", "cjs"],
            "py" | "python" => &["py"],
            "go" => &["go"],
            "java" => &["java"],
            "ruby" | "rb" => &["rb"],
            "php" => &["php"],
            "c" => &["c", "h"],
            "cpp" | "c++" | "cc" => &["cpp", "hpp", "cc", "hh"],
            "cs" | "csharp" => &["cs"],
            "swift" => &["swift"],
            "kt" | "kotlin" => &["kt", "kts"],
            "json" => &["json"],
            "yaml" | "yml" => &["yaml", "yml"],
            // Unknown names pass through as literal extensions; blanks are dropped.
            other => {
                if !other.is_empty() {
                    extensions.insert(other.to_string());
                }
                continue;
            }
        };
        extensions.extend(mapped.iter().copied().map(str::to_owned));
    }
    extensions
}
1336
/// Public wrapper that forwards `root` unchanged to `load_engine_and_index`.
///
/// # Errors
///
/// Propagates whatever error message `load_engine_and_index` returns.
#[cfg(feature = "embeddings")]
pub fn load_engine_and_index_pub(
    root: &Path,
) -> Result<(&'static EmbeddingEngine, EmbeddingIndex), String> {
    load_engine_and_index(root)
}
1344
/// Public wrapper that forwards all arguments unchanged to `ensure_embeddings`.
///
/// NOTE(review): the name suggests this exists for evaluation tooling — confirm against callers.
///
/// # Errors
///
/// Propagates whatever error message `ensure_embeddings` returns.
#[cfg(feature = "embeddings")]
pub fn ensure_embeddings_for_eval(
    root: &Path,
    index: &BM25Index,
    engine: &EmbeddingEngine,
    embed_idx: &mut EmbeddingIndex,
) -> Result<(Vec<Vec<f32>>, f64, Vec<String>), String> {
    ensure_embeddings(root, index, engine, embed_idx)
}
1355
/// Public wrapper that forwards `results`, `splade` and `weight` unchanged to
/// `boost_with_splade`, which may modify `results` in place.
pub fn boost_with_splade_pub(
    results: &mut [HybridResult],
    splade: &[crate::core::splade_retrieval::SpladeResult],
    weight: f64,
) {
    boost_with_splade(results, splade, weight);
}
1364
#[cfg(test)]
mod filter_tests {
    use super::*;

    /// A language filter for "rust" accepts `.rs` paths and rejects other extensions.
    #[test]
    fn filter_language_rust() {
        let languages = vec!["rust".to_string()];
        let filter = SearchFilter::new(Some(languages.as_slice()), None).unwrap();

        assert!(filter.matches("src/main.rs"));
        assert!(!filter.matches("src/main.ts"));
    }

    /// A path glob accepts paths under the pattern and rejects paths elsewhere.
    #[test]
    fn filter_path_glob() {
        let filter = SearchFilter::new(None, Some("rust/src/**")).unwrap();

        assert!(filter.matches("rust/src/core/mod.rs"));
        assert!(!filter.matches("website/src/pages/index.astro"));
    }
}
1383
#[cfg(test)]
mod determinism_tests {
    use super::*;

    /// Builds a zero-score function hit; only the file path and snippet differ between hits.
    fn tied_hit(file_path: &str, snippet: &str) -> HybridResult {
        HybridResult {
            file_path: file_path.to_string(),
            symbol_name: "foo".to_string(),
            kind: crate::core::bm25_index::ChunkKind::Function,
            start_line: 1,
            end_line: 1,
            snippet: snippet.to_string(),
            rrf_score: 0.0,
            bm25_score: None,
            dense_score: None,
            bm25_rank: None,
            dense_rank: None,
        }
    }

    /// Two result lists holding the same pair of hits in opposite order must still fuse
    /// into `a.rs` followed by `b.rs`.
    #[test]
    fn rrf_merge_hybrid_is_deterministic_on_ties() {
        let first = tied_hit("a.rs", "a");
        let second = tied_hit("b.rs", "b");

        let forward = vec![first.clone(), second.clone()];
        let reversed = vec![second, first];

        let fused = rrf_merge_hybrid(
            vec![
                ("root".to_string(), forward),
                ("root".to_string(), reversed),
            ],
            10,
        );

        assert_eq!(fused.len(), 2);
        assert_eq!(fused[0].file_path, "a.rs");
        assert_eq!(fused[1].file_path, "b.rs");
    }
}
1430}