1use anyhow::Result;
2use ck_ann::AnnIndex;
3use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
4use globset::{Glob, GlobSet, GlobSetBuilder};
5use rayon::prelude::*;
6use regex::{Regex, RegexBuilder};
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf as StdPathBuf;
10use std::path::{Path, PathBuf};
11use tantivy::collector::TopDocs;
12use tantivy::query::QueryParser;
13use tantivy::schema::{STORED, Schema, TEXT, Value};
14use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
15use walkdir::WalkDir;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
20pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24 let content = tokio::fs::read_to_string(file_path).await?;
25 let lines: Vec<&str> = content.lines().collect();
26
27 if span.line_start == 0 || span.line_start > lines.len() {
28 return Ok(String::new());
29 }
30
31 let start_idx = span.line_start - 1; let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33
34 if start_idx <= end_idx {
35 Ok(lines[start_idx..=end_idx].join("\n"))
36 } else {
37 Ok(lines[start_idx].to_string())
38 }
39}
40
/// Finds the closest directory, walking upward from `path`, that contains a `.ck` entry.
///
/// The walk starts at the parent directory when `path` is a file, and at
/// `path` itself otherwise. Returns `None` when no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start` first, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
57
58pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
59 search_with_progress(options, None).await
60}
61
62pub async fn search_with_progress(
63 options: &SearchOptions,
64 progress_callback: Option<SearchProgressCallback>,
65) -> Result<Vec<SearchResult>> {
66 if !options.path.exists() {
68 return Err(ck_core::CkError::Search(format!(
69 "Path does not exist: {}",
70 options.path.display()
71 ))
72 .into());
73 }
74
75 if !matches!(options.mode, SearchMode::Regex) {
77 let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
78 ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
79 }
80
81 match options.mode {
82 SearchMode::Regex => regex_search(options),
83 SearchMode::Lexical => lexical_search(options).await,
84 SearchMode::Semantic => {
85 semantic_search_v3_with_progress(options, progress_callback).await
87 }
88 SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
89 }
90}
91
92fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
93 let pattern = if options.fixed_string {
94 regex::escape(&options.query)
95 } else if options.whole_word {
96 format!(r"\b{}\b", regex::escape(&options.query))
97 } else {
98 options.query.clone()
99 };
100
101 let regex = RegexBuilder::new(&pattern)
102 .case_insensitive(options.case_insensitive)
103 .build()
104 .map_err(CkError::Regex)?;
105
106 let should_recurse = options.path.is_dir() || options.recursive;
108 let files = if should_recurse {
109 ck_index::collect_files(
111 &options.path,
112 options.respect_gitignore,
113 &options.exclude_patterns,
114 )?
115 } else {
116 collect_files(&options.path, should_recurse, &options.exclude_patterns)?
118 };
119
120 let results: Vec<Vec<SearchResult>> = files
121 .par_iter()
122 .filter_map(|file_path| match search_file(®ex, file_path, options) {
123 Ok(matches) => {
124 if matches.is_empty() {
125 None
126 } else {
127 Some(matches)
128 }
129 }
130 Err(e) => {
131 tracing::debug!("Error searching {:?}: {}", file_path, e);
132 None
133 }
134 })
135 .collect();
136
137 let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
138 all_results.sort_by(|a, b| {
140 let path_cmp = a.file.cmp(&b.file);
141 if path_cmp != std::cmp::Ordering::Equal {
142 return path_cmp;
143 }
144 a.span.line_start.cmp(&b.span.line_start)
145 });
146
147 if let Some(top_k) = options.top_k {
148 all_results.truncate(top_k);
149 }
150
151 Ok(all_results)
152}
153
154fn search_file(
155 regex: &Regex,
156 file_path: &Path,
157 options: &SearchOptions,
158) -> Result<Vec<SearchResult>> {
159 let content = fs::read_to_string(file_path)?;
160 let lines: Vec<&str> = content.lines().collect();
161 let mut results = Vec::new();
162
163 let code_sections = if options.full_section {
165 extract_code_sections(file_path, &content)
166 } else {
167 None
168 };
169
170 let mut byte_offset = 0;
172
173 for (line_idx, line) in lines.iter().enumerate() {
174 let line_number = line_idx + 1;
175
176 if regex.as_str().is_empty() {
179 let preview = if options.full_section {
181 if let Some(ref sections) = code_sections {
183 if let Some(section) = find_containing_section(sections, line_idx) {
184 section.clone()
185 } else {
186 get_context_preview(&lines, line_idx, options)
188 }
189 } else {
190 get_context_preview(&lines, line_idx, options)
191 }
192 } else {
193 get_context_preview(&lines, line_idx, options)
194 };
195
196 results.push(SearchResult {
197 file: file_path.to_path_buf(),
198 span: Span {
199 byte_start: byte_offset,
200 byte_end: byte_offset + line.len(),
201 line_start: line_number,
202 line_end: line_number,
203 },
204 score: 1.0,
205 preview,
206 lang: ck_core::Language::from_path(file_path),
207 symbol: None,
208 chunk_hash: None,
209 index_epoch: None,
210 });
211 } else {
212 for mat in regex.find_iter(line) {
214 let preview = if options.full_section {
215 if let Some(ref sections) = code_sections {
217 if let Some(section) = find_containing_section(sections, line_idx) {
218 section.clone()
219 } else {
220 get_context_preview(&lines, line_idx, options)
222 }
223 } else {
224 get_context_preview(&lines, line_idx, options)
225 }
226 } else {
227 get_context_preview(&lines, line_idx, options)
228 };
229
230 results.push(SearchResult {
231 file: file_path.to_path_buf(),
232 span: Span {
233 byte_start: byte_offset + mat.start(),
234 byte_end: byte_offset + mat.end(),
235 line_start: line_number,
236 line_end: line_number,
237 },
238 score: 1.0,
239 preview,
240 lang: ck_core::Language::from_path(file_path),
241 symbol: None,
242 chunk_hash: None,
243 index_epoch: None,
244 });
245 }
246 }
247
248 byte_offset += line.len();
250 if line_idx < lines.len() - 1 {
251 byte_offset += 1; }
253 }
254
255 Ok(results)
256}
257
258async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
259 let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
261 if options.path.is_file() {
262 options.path.parent().unwrap_or(&options.path).to_path_buf()
263 } else {
264 options.path.clone()
265 }
266 });
267
268 let index_dir = index_root.join(".ck");
269 if !index_dir.exists() {
270 return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
271 }
272
273 let tantivy_index_path = index_dir.join("tantivy_index");
274
275 if !tantivy_index_path.exists() {
276 return build_tantivy_index(options).await;
277 }
278
279 let mut schema_builder = Schema::builder();
280 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
281 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
282 let _schema = schema_builder.build();
283
284 let index = Index::open_in_dir(&tantivy_index_path)
285 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
286
287 let reader = index
288 .reader_builder()
289 .reload_policy(ReloadPolicy::OnCommitWithDelay)
290 .try_into()
291 .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
292
293 let searcher = reader.searcher();
294 let query_parser = QueryParser::for_index(&index, vec![content_field]);
295
296 let query = query_parser
297 .parse_query(&options.query)
298 .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
299
300 let top_docs = if let Some(top_k) = options.top_k {
301 searcher.search(&query, &TopDocs::with_limit(top_k))?
302 } else {
303 searcher.search(&query, &TopDocs::with_limit(100))?
304 };
305
306 let mut raw_results = Vec::new();
308 for (_score, doc_address) in top_docs {
309 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
310 let path_text = retrieved_doc
311 .get_first(path_field)
312 .map(|field_value| field_value.as_str().unwrap_or(""))
313 .unwrap_or("");
314 let content_text = retrieved_doc
315 .get_first(content_field)
316 .map(|field_value| field_value.as_str().unwrap_or(""))
317 .unwrap_or("");
318
319 let file_path = PathBuf::from(path_text);
320 let preview = if options.full_section {
321 content_text.to_string()
322 } else {
323 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
324 };
325
326 raw_results.push((
327 _score,
328 SearchResult {
329 file: file_path,
330 span: Span {
331 byte_start: 0,
332 byte_end: content_text.len(),
333 line_start: 1,
334 line_end: content_text.lines().count(),
335 },
336 score: _score,
337 preview,
338 lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
339 symbol: None,
340 chunk_hash: None,
341 index_epoch: None,
342 },
343 ));
344 }
345
346 let mut results = Vec::new();
348 if !raw_results.is_empty() {
349 let max_score = raw_results
350 .iter()
351 .map(|(score, _)| *score)
352 .fold(0.0f32, f32::max);
353 if max_score > 0.0 {
354 for (raw_score, mut result) in raw_results {
355 let normalized_score = raw_score / max_score;
356
357 if let Some(threshold) = options.threshold
359 && normalized_score < threshold
360 {
361 continue;
362 }
363
364 result.score = normalized_score;
365 results.push(result);
366 }
367 }
368 }
369
370 Ok(results)
371}
372
373async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
374 let index_root = if options.path.is_file() {
376 options.path.parent().unwrap_or(&options.path)
377 } else {
378 &options.path
379 };
380
381 let index_dir = index_root.join(".ck");
382 let tantivy_index_path = index_dir.join("tantivy_index");
383
384 fs::create_dir_all(&tantivy_index_path)?;
385
386 let mut schema_builder = Schema::builder();
387 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
388 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
389 let schema = schema_builder.build();
390
391 let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
392 .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
393
394 let mut index_writer = index
395 .writer(50_000_000)
396 .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
397
398 let files = collect_files(index_root, true, &options.exclude_patterns)?;
399
400 for file_path in &files {
401 if let Ok(content) = fs::read_to_string(file_path) {
402 let doc = doc!(
403 content_field => content,
404 path_field => file_path.display().to_string()
405 );
406 index_writer.add_document(doc)?;
407 }
408 }
409
410 index_writer
411 .commit()
412 .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
413
414 let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
416 let mut schema_builder = Schema::builder();
417 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
418 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
419 let _schema = schema_builder.build();
420
421 let index = Index::open_in_dir(&tantivy_index_path)
422 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
423
424 let reader = index
425 .reader_builder()
426 .reload_policy(ReloadPolicy::OnCommitWithDelay)
427 .try_into()
428 .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
429
430 let searcher = reader.searcher();
431 let query_parser = QueryParser::for_index(&index, vec![content_field]);
432
433 let query = query_parser
434 .parse_query(&options.query)
435 .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
436
437 let top_docs = if let Some(top_k) = options.top_k {
438 searcher.search(&query, &TopDocs::with_limit(top_k))?
439 } else {
440 searcher.search(&query, &TopDocs::with_limit(100))?
441 };
442
443 let mut raw_results = Vec::new();
445 for (_score, doc_address) in top_docs {
446 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
447 let path_text = retrieved_doc
448 .get_first(path_field)
449 .map(|field_value| field_value.as_str().unwrap_or(""))
450 .unwrap_or("");
451 let content_text = retrieved_doc
452 .get_first(content_field)
453 .map(|field_value| field_value.as_str().unwrap_or(""))
454 .unwrap_or("");
455
456 let file_path = PathBuf::from(path_text);
457 let preview = if options.full_section {
458 content_text.to_string()
459 } else {
460 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
461 };
462
463 raw_results.push((
464 _score,
465 SearchResult {
466 file: file_path,
467 span: Span {
468 byte_start: 0,
469 byte_end: content_text.len(),
470 line_start: 1,
471 line_end: content_text.lines().count(),
472 },
473 score: _score,
474 preview,
475 lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
476 symbol: None,
477 chunk_hash: None,
478 index_epoch: None,
479 },
480 ));
481 }
482
483 let mut results = Vec::new();
485 if !raw_results.is_empty() {
486 let max_score = raw_results
487 .iter()
488 .map(|(score, _)| *score)
489 .fold(0.0f32, f32::max);
490 if max_score > 0.0 {
491 for (raw_score, mut result) in raw_results {
492 let normalized_score = raw_score / max_score;
493
494 if let Some(threshold) = options.threshold
496 && normalized_score < threshold
497 {
498 continue;
499 }
500
501 result.score = normalized_score;
502 results.push(result);
503 }
504 }
505 }
506
507 Ok(results)
508}
509
510#[allow(dead_code)]
511async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
512 semantic_search_with_progress(options, None).await
513}
514
515async fn semantic_search_with_progress(
516 options: &SearchOptions,
517 progress_callback: Option<SearchProgressCallback>,
518) -> Result<Vec<SearchResult>> {
519 let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
521 if options.path.is_file() {
522 options.path.parent().unwrap_or(&options.path).to_path_buf()
523 } else {
524 options.path.clone()
525 }
526 });
527
528 let index_dir = index_root.join(".ck");
529 if !index_dir.exists() {
530 return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
531 }
532
533 let ann_index_path = index_dir.join("ann_index.bin");
534 let embeddings_path = index_dir.join("embeddings.json");
535
536 if !ann_index_path.exists() || !embeddings_path.exists() {
537 return build_semantic_index_with_progress(options, progress_callback).await;
538 }
539
540 let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
542
543 let embeddings_data = fs::read_to_string(&embeddings_path)?;
545 let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
546
547 if let Some(ref callback) = progress_callback {
549 callback("Loading embedding model...");
550 }
551
552 let mut embedder = if let Some(ref callback) = progress_callback {
553 let _cb = callback.as_ref();
554 let model_cb = Box::new(|msg: &str| {
555 eprintln!("Model: {}", msg);
558 }) as ck_embed::ModelDownloadCallback;
559 ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
560 } else {
561 ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
562 };
563 let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
564
565 if query_embeddings.is_empty() {
566 return Ok(Vec::new());
567 }
568
569 let query_embedding = &query_embeddings[0];
570
571 let top_k = options.top_k.unwrap_or(10);
573 let similar_docs = ann_index.search(query_embedding, top_k);
574
575 let mut results = Vec::new();
576
577 let filter_by_file = options.path.is_file();
579 let target_file = if filter_by_file {
580 Some(
581 options
582 .path
583 .canonicalize()
584 .unwrap_or_else(|_| options.path.clone()),
585 )
586 } else {
587 None
588 };
589
590 for (doc_id, similarity) in similar_docs {
591 if let Some(threshold) = options.threshold
593 && similarity < threshold
594 {
595 continue;
596 }
597
598 if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
599 if let Some(target) = &target_file {
601 let canonical_result = file_path
602 .canonicalize()
603 .unwrap_or_else(|_| file_path.clone());
604 if canonical_result != *target {
605 continue; }
607 }
608
609 let preview = if options.full_section {
611 content.clone()
612 } else {
613 content.lines().take(3).collect::<Vec<_>>().join("\n")
614 };
615
616 results.push(SearchResult {
617 file: file_path.clone(),
618 span: Span {
619 byte_start: 0,
620 byte_end: content.len(),
621 line_start: 1,
622 line_end: content.lines().count(),
623 },
624 score: similarity,
625 preview,
626 lang: ck_core::Language::from_path(file_path),
627 symbol: None,
628 chunk_hash: None,
629 index_epoch: None,
630 });
631 }
632 }
633
634 Ok(results)
635}
636
637#[allow(dead_code)]
638async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
639 build_semantic_index_with_progress(options, None).await
640}
641
642async fn build_semantic_index_with_progress(
643 options: &SearchOptions,
644 progress_callback: Option<SearchProgressCallback>,
645) -> Result<Vec<SearchResult>> {
646 let index_root = if options.path.is_file() {
648 options.path.parent().unwrap_or(&options.path)
649 } else {
650 &options.path
651 };
652
653 let index_dir = index_root.join(".ck");
654 let ann_index_path = index_dir.join("ann_index.bin");
655 let embeddings_path = index_dir.join("embeddings.json");
656
657 fs::create_dir_all(&index_dir)?;
658
659 if let Some(ref callback) = progress_callback {
660 callback("Building semantic index (no index found)...");
661 }
662
663 eprintln!("Building semantic index (no existing index found)...");
665
666 let files = collect_files(index_root, true, &options.exclude_patterns)?;
668
669 if let Some(ref callback) = progress_callback {
670 callback(&format!("Found {} files to index", files.len()));
671 }
672 eprintln!("Found {} files to embed and index", files.len());
673
674 let mut file_embeddings = Vec::new();
675 let mut embeddings = Vec::new();
676
677 if let Some(ref callback) = progress_callback {
679 callback("Loading embedding model...");
680 }
681
682 let model_callback = if progress_callback.is_some() {
683 Some(Box::new(|msg: &str| {
684 eprintln!("Model: {}", msg);
685 }) as ck_embed::ModelDownloadCallback)
686 } else {
687 None
688 };
689
690 let mut embedder =
691 ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
692
693 if let Some(ref callback) = progress_callback {
694 callback("Generating embeddings for code chunks...");
695 }
696
697 for (file_idx, file_path) in files.iter().enumerate() {
698 if let Ok(content) = fs::read_to_string(file_path) {
699 if let Some(ref callback) = progress_callback {
700 let file_name = file_path
701 .file_name()
702 .map(|n| n.to_string_lossy().to_string())
703 .unwrap_or_else(|| file_path.to_string_lossy().to_string());
704 callback(&format!(
705 "Processing {}/{}: {}",
706 file_idx + 1,
707 files.len(),
708 file_name
709 ));
710 }
711
712 let chunks = ck_chunk::chunk_text(&content, ck_core::Language::from_path(file_path))?;
714
715 for chunk in chunks {
716 let chunk_embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
717 if !chunk_embeddings.is_empty() {
718 embeddings.push(chunk_embeddings[0].clone());
719 file_embeddings.push((file_path.clone(), chunk.text));
720 }
721 }
722 }
723 }
724
725 if let Some(ref callback) = progress_callback {
726 callback(&format!(
727 "Built {} embeddings, creating search index...",
728 embeddings.len()
729 ));
730 }
731 eprintln!(
732 "Generated {} embeddings, building search index...",
733 embeddings.len()
734 );
735
736 let index = ck_ann::SimpleIndex::build(&embeddings)?;
738 index.save(&ann_index_path)?;
739
740 let embeddings_json = serde_json::to_string(&file_embeddings)?;
742 fs::write(&embeddings_path, embeddings_json)?;
743
744 if let Some(ref callback) = progress_callback {
745 callback("Semantic index built successfully, running search...");
746 }
747 eprintln!("Semantic index built successfully!");
748
749 let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
751
752 let embeddings_data = fs::read_to_string(&embeddings_path)?;
754 let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
755
756 let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
758 let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
759
760 if query_embeddings.is_empty() {
761 return Ok(Vec::new());
762 }
763
764 let query_embedding = &query_embeddings[0];
765
766 let top_k = options.top_k.unwrap_or(10);
768 let similar_docs = ann_index.search(query_embedding, top_k);
769
770 let mut results = Vec::new();
771
772 let filter_by_file = options.path.is_file();
774 let target_file = if filter_by_file {
775 Some(
776 options
777 .path
778 .canonicalize()
779 .unwrap_or_else(|_| options.path.clone()),
780 )
781 } else {
782 None
783 };
784
785 for (doc_id, similarity) in similar_docs {
786 if let Some(threshold) = options.threshold
788 && similarity < threshold
789 {
790 continue;
791 }
792
793 if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
794 if let Some(target) = &target_file {
796 let canonical_result = file_path
797 .canonicalize()
798 .unwrap_or_else(|_| file_path.clone());
799 if canonical_result != *target {
800 continue; }
802 }
803
804 let preview = if options.full_section {
806 content.clone()
807 } else {
808 content.lines().take(3).collect::<Vec<_>>().join("\n")
809 };
810
811 results.push(SearchResult {
812 file: file_path.clone(),
813 span: Span {
814 byte_start: 0,
815 byte_end: content.len(),
816 line_start: 1,
817 line_end: content.lines().count(),
818 },
819 score: similarity,
820 preview,
821 lang: ck_core::Language::from_path(file_path),
822 symbol: None,
823 chunk_hash: None,
824 index_epoch: None,
825 });
826 }
827 }
828
829 Ok(results)
830}
831
832#[allow(dead_code)]
833async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
834 hybrid_search_with_progress(options, None).await
835}
836
837async fn hybrid_search_with_progress(
838 options: &SearchOptions,
839 progress_callback: Option<SearchProgressCallback>,
840) -> Result<Vec<SearchResult>> {
841 if let Some(ref callback) = progress_callback {
842 callback("Running regex search...");
843 }
844 let regex_results = regex_search(options)?;
845
846 if let Some(ref callback) = progress_callback {
847 callback("Running semantic search...");
848 }
849 let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
850
851 let mut combined = HashMap::new();
852
853 for (rank, result) in regex_results.iter().enumerate() {
854 let key = format!("{}:{}", result.file.display(), result.span.line_start);
855 combined
856 .entry(key)
857 .or_insert(Vec::new())
858 .push((rank + 1, result.clone()));
859 }
860
861 for (rank, result) in semantic_results.iter().enumerate() {
862 let key = format!("{}:{}", result.file.display(), result.span.line_start);
863 combined
864 .entry(key)
865 .or_insert(Vec::new())
866 .push((rank + 1, result.clone()));
867 }
868
869 let mut rrf_results: Vec<SearchResult> = combined
871 .into_values()
872 .map(|ranks| {
873 let mut result = ranks[0].1.clone();
874 let rrf_score = ranks
875 .iter()
876 .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
877 .sum();
878 result.score = rrf_score;
879 result
880 })
881 .filter(|result| {
882 if let Some(threshold) = options.threshold {
884 result.score >= threshold
885 } else {
886 true
887 }
888 })
889 .collect();
890
891 rrf_results.sort_by(|a, b| {
893 b.score
894 .partial_cmp(&a.score)
895 .unwrap_or(std::cmp::Ordering::Equal)
896 });
897
898 if let Some(top_k) = options.top_k {
899 rrf_results.truncate(top_k);
900 }
901
902 Ok(rrf_results)
903}
904
905fn build_globset(patterns: &[String]) -> GlobSet {
906 let mut builder = GlobSetBuilder::new();
907 for pat in patterns {
908 if let Ok(glob) = Glob::new(pat) {
910 builder.add(glob);
911 }
912 }
913 builder.build().unwrap_or_else(|_| GlobSet::empty())
914}
915
916fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
917 let globset = build_globset(exclude_patterns);
918 if globset.is_match(path) {
920 return true;
921 }
922 for component in path.components() {
923 if let std::path::Component::Normal(name) = component
924 && globset.is_match(name)
925 {
926 return true;
927 }
928 }
929 false
930}
931
932fn collect_files(
933 path: &Path,
934 recursive: bool,
935 exclude_patterns: &[String],
936) -> Result<Vec<PathBuf>> {
937 let mut files = Vec::new();
938 let globset = build_globset(exclude_patterns);
939
940 if path.is_file() {
941 files.push(path.to_path_buf());
943 } else if recursive {
944 for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
945 let name = e.file_name();
947 !globset.is_match(e.path()) && !globset.is_match(name)
948 }) {
949 match entry {
950 Ok(entry) => {
951 if entry.file_type().is_file()
952 && !should_exclude_path(entry.path(), exclude_patterns)
953 {
954 files.push(entry.path().to_path_buf());
955 }
956 }
957 Err(e) => {
958 tracing::debug!("Skipping path due to error: {}", e);
960 continue;
961 }
962 }
963 }
964 } else {
965 match fs::read_dir(path) {
966 Ok(read_dir) => {
967 for entry in read_dir {
968 match entry {
969 Ok(entry) => {
970 let path = entry.path();
971 if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
972 files.push(path);
973 }
974 }
975 Err(e) => {
976 tracing::debug!("Skipping directory entry due to error: {}", e);
977 continue;
978 }
979 }
980 }
981 }
982 Err(e) => {
983 tracing::debug!("Cannot read directory {:?}: {}", path, e);
984 return Err(e.into());
985 }
986 }
987 }
988
989 Ok(files)
990}
991
992async fn ensure_index_updated(
993 path: &Path,
994 force_reindex: bool,
995 need_embeddings: bool,
996) -> Result<()> {
997 let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
999 if path.is_file() {
1000 path.parent().unwrap_or(path).to_path_buf()
1001 } else {
1002 path.to_path_buf()
1003 }
1004 });
1005 let index_root = &index_root_buf;
1006
1007 if force_reindex {
1009 let stats = ck_index::smart_update_index_with_progress(
1010 index_root,
1011 false,
1012 None,
1013 need_embeddings,
1014 true,
1015 &[], None, )
1018 .await?;
1019 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1020 tracing::info!(
1021 "Index updated: {} files indexed, {} orphaned files removed",
1022 stats.files_indexed,
1023 stats.orphaned_files_removed
1024 );
1025 }
1026 return Ok(());
1027 }
1028
1029 let stats = ck_index::smart_update_index_with_progress(
1031 index_root,
1032 false,
1033 None,
1034 need_embeddings,
1035 true,
1036 &[],
1037 None, )
1039 .await?;
1040 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1041 tracing::info!(
1042 "Index updated: {} files indexed, {} orphaned files removed",
1043 stats.files_indexed,
1044 stats.orphaned_files_removed
1045 );
1046 }
1047
1048 Ok(())
1049}
1050
1051fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
1052 let before = options.before_context_lines.max(options.context_lines);
1053 let after = options.after_context_lines.max(options.context_lines);
1054
1055 if before > 0 || after > 0 {
1056 let start_idx = line_idx.saturating_sub(before);
1057 let end_idx = (line_idx + after + 1).min(lines.len());
1058 lines[start_idx..end_idx].join("\n")
1059 } else {
1060 lines[line_idx].to_string()
1061 }
1062}
1063
1064fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1065 let lang = ck_core::Language::from_path(file_path)?;
1066
1067 if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1069 let sections: Vec<(usize, usize, String)> = chunks
1070 .into_iter()
1071 .filter(|chunk| {
1072 matches!(
1073 chunk.chunk_type,
1074 ck_chunk::ChunkType::Function
1075 | ck_chunk::ChunkType::Class
1076 | ck_chunk::ChunkType::Method
1077 )
1078 })
1079 .map(|chunk| {
1080 (
1081 chunk.span.line_start - 1, chunk.span.line_end - 1,
1083 chunk.text,
1084 )
1085 })
1086 .collect();
1087
1088 if sections.is_empty() {
1089 None
1090 } else {
1091 Some(sections)
1092 }
1093 } else {
1094 None
1095 }
1096}
1097
/// Returns the text of the first section whose inclusive `[start, end]` range contains `line_idx`.
///
/// Sections are checked in slice order; `None` is returned when no section
/// contains the line.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(start, end, _)| (*start..=*end).contains(&line_idx))
        .map(|(_, _, text)| text)
}
1109
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes four small fixture files into `dir` and returns their paths in creation order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .iter()
            .map(|(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Recursive regex-mode options for `query` rooted at `path`; other fields are defaults.
    fn regex_options(query: &str, path: PathBuf) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path,
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let test_files = create_test_files(temp_dir.path());

        // Non-recursive listing of a directory.
        let flat = collect_files(temp_dir.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Recursive walk of the same directory.
        let walked = collect_files(temp_dir.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file is returned as-is.
        let single = collect_files(&test_files[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], test_files[0]);
    }

    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = regex_options("rust", temp_dir.path().to_path_buf());

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        let has_rust_preview = results
            .iter()
            .any(|r| r.preview.to_lowercase().contains("rust"));
        assert!(has_rust_preview);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("HELLO", temp_dir.path().to_path_buf())
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options("fn main()", temp_dir.path().to_path_buf())
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        fs::write(
            temp_dir.path().join("word_test.txt"),
            "rust rusty rustacean",
        )
        .unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options("rust", temp_dir.path().to_path_buf())
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        // Ten matching files, so truncation to five is observable.
        for i in 0..10 {
            fs::write(
                temp_dir.path().join(format!("file{}.txt", i)),
                "test content",
            )
            .unwrap();
        }

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options("test", temp_dir.path().to_path_buf())
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            recursive: false,
            ..regex_options("test", test_file.clone())
        };

        let results = regex_search(&options).unwrap();

        // Three matches on line 1, one on line 2, one on line 3.
        assert_eq!(results.len(), 5);

        let on_line = |line: usize| -> Vec<&SearchResult> {
            results
                .iter()
                .filter(|r| r.span.line_start == line)
                .collect()
        };

        let line1_matches = on_line(1);
        assert_eq!(line1_matches.len(), 3);
        assert_eq!(line1_matches[0].span.byte_start, 0);
        assert_eq!(line1_matches[1].span.byte_start, 5);
        assert_eq!(line1_matches[2].span.byte_start, 10);

        let line2_matches = on_line(2);
        assert_eq!(line2_matches.len(), 1);
        // Line 1 is 14 bytes plus the newline; "test" starts 9 bytes into line 2.
        assert_eq!(line2_matches[0].span.byte_start, 24);

        // Every match must have a distinct starting offset.
        let mut byte_starts: Vec<_> = results.iter().map(|r| r.span.byte_start).collect();
        byte_starts.sort();
        byte_starts.dedup();
        assert_eq!(byte_starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(
            &file_path,
            "line 1: hello\nline 2: world\nline 3: rust programming",
        )
        .unwrap();

        let regex = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        let hit = &results[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let regex = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        let preview = &results[0].preview;
        println!("Preview: '{}'", preview);

        // One line of context on each side of the match.
        assert!(preview.contains("line 2"));
        assert!(preview.contains("target line"));
        assert!(preview.contains("line 4"));
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("hello", temp_dir.path().to_path_buf())
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}