1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::fs;
7use std::path::{Path, PathBuf};
8use walkdir::WalkDir;
9use rayon::prelude::*;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{Schema, STORED, TEXT, Value};
13use tantivy::{doc, Index, ReloadPolicy, TantivyDocument};
14use ck_ann::AnnIndex;
15use std::path::PathBuf as StdPathBuf;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
/// Callback that receives human-readable progress messages while a search runs.
///
/// The search functions in this module call it with short status strings such
/// as `"Running regex search..."`.
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24 let content = fs::read_to_string(file_path)?;
25 let lines: Vec<&str> = content.lines().collect();
26
27 if span.line_start == 0 || span.line_start > lines.len() {
28 return Ok(String::new());
29 }
30
31 let start_idx = span.line_start - 1; let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33
34 if start_idx <= end_idx {
35 Ok(lines[start_idx..=end_idx].join("\n"))
36 } else {
37 Ok(lines[start_idx].to_string())
38 }
39}
40
/// Walks from `path` towards the filesystem root and returns the first
/// directory that contains a `.ck` entry.
///
/// When `path` is a file the walk starts at its parent directory (or at
/// `path` itself if it has no parent). Returns `None` when no directory on
/// the way up contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start`, then each successive `parent()`.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
53
54pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
55 search_with_progress(options, None).await
56}
57
58pub async fn search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
59 if !options.path.exists() {
61 return Err(ck_core::CkError::Search(format!("Path does not exist: {}", options.path.display())).into());
62 }
63
64 if !matches!(options.mode, SearchMode::Regex) {
66 let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
67 ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
68 }
69
70 match options.mode {
71 SearchMode::Regex => regex_search(options),
72 SearchMode::Lexical => lexical_search(options).await,
73 SearchMode::Semantic => {
74 semantic_search_v3_with_progress(options, progress_callback).await
76 },
77 SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
78 }
79}
80
81fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
82 let pattern = if options.fixed_string {
83 regex::escape(&options.query)
84 } else if options.whole_word {
85 format!(r"\b{}\b", regex::escape(&options.query))
86 } else {
87 options.query.clone()
88 };
89
90 let regex = RegexBuilder::new(&pattern)
91 .case_insensitive(options.case_insensitive)
92 .build()
93 .map_err(|e| CkError::Regex(e))?;
94
95 let should_recurse = options.path.is_dir() || options.recursive;
97 let files = if should_recurse {
98 ck_index::collect_files(&options.path, options.respect_gitignore)
100 } else {
101 collect_files(&options.path, should_recurse, &options.exclude_patterns)?
103 };
104
105 let results: Vec<Vec<SearchResult>> = files
106 .par_iter()
107 .filter_map(|file_path| {
108 match search_file(®ex, file_path, options) {
109 Ok(matches) => {
110 if matches.is_empty() {
111 None
112 } else {
113 Some(matches)
114 }
115 }
116 Err(e) => {
117 tracing::debug!("Error searching {:?}: {}", file_path, e);
118 None
119 }
120 }
121 })
122 .collect();
123
124 let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
125 all_results.sort_by(|a, b| {
127 let path_cmp = a.file.cmp(&b.file);
128 if path_cmp != std::cmp::Ordering::Equal {
129 return path_cmp;
130 }
131 a.span.line_start.cmp(&b.span.line_start)
132 });
133
134 if let Some(top_k) = options.top_k {
135 all_results.truncate(top_k);
136 }
137
138 Ok(all_results)
139}
140
141fn search_file(regex: &Regex, file_path: &Path, options: &SearchOptions) -> Result<Vec<SearchResult>> {
142 let content = fs::read_to_string(file_path)?;
143 let lines: Vec<&str> = content.lines().collect();
144 let mut results = Vec::new();
145
146 let code_sections = if options.full_section {
148 extract_code_sections(file_path, &content)
149 } else {
150 None
151 };
152
153 let mut byte_offset = 0;
155
156 for (line_idx, line) in lines.iter().enumerate() {
157 let line_number = line_idx + 1;
158
159 for mat in regex.find_iter(line) {
161 let preview = if options.full_section {
162 if let Some(ref sections) = code_sections {
164 if let Some(section) = find_containing_section(sections, line_idx) {
165 section.clone()
166 } else {
167 get_context_preview(&lines, line_idx, options)
169 }
170 } else {
171 get_context_preview(&lines, line_idx, options)
172 }
173 } else {
174 get_context_preview(&lines, line_idx, options)
175 };
176
177 results.push(SearchResult {
178 file: file_path.to_path_buf(),
179 span: Span {
180 byte_start: byte_offset + mat.start(),
181 byte_end: byte_offset + mat.end(),
182 line_start: line_number,
183 line_end: line_number,
184 },
185 score: 1.0,
186 preview,
187 lang: detect_language(file_path),
188 symbol: None,
189 });
190 }
191
192 byte_offset += line.len();
194 if line_idx < lines.len() - 1 {
195 byte_offset += 1; }
197 }
198
199 Ok(results)
200}
201
202async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
203 let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
205 if options.path.is_file() {
206 options.path.parent().unwrap_or(&options.path).to_path_buf()
207 } else {
208 options.path.clone()
209 }
210 });
211
212 let index_dir = index_root.join(".ck");
213 if !index_dir.exists() {
214 return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
215 }
216
217 let tantivy_index_path = index_dir.join("tantivy_index");
218
219 if !tantivy_index_path.exists() {
220 return build_tantivy_index(options).await;
221 }
222
223 let mut schema_builder = Schema::builder();
224 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
225 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
226 let _schema = schema_builder.build();
227
228 let index = Index::open_in_dir(&tantivy_index_path)
229 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
230
231 let reader = index
232 .reader_builder()
233 .reload_policy(ReloadPolicy::OnCommitWithDelay)
234 .try_into()
235 .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
236
237 let searcher = reader.searcher();
238 let query_parser = QueryParser::for_index(&index, vec![content_field]);
239
240 let query = query_parser
241 .parse_query(&options.query)
242 .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
243
244 let top_docs = if let Some(top_k) = options.top_k {
245 searcher.search(&query, &TopDocs::with_limit(top_k))?
246 } else {
247 searcher.search(&query, &TopDocs::with_limit(100))?
248 };
249
250 let mut raw_results = Vec::new();
252 for (_score, doc_address) in top_docs {
253 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
254 let path_text = retrieved_doc
255 .get_first(path_field)
256 .map(|field_value| field_value.as_str().unwrap_or(""))
257 .unwrap_or("");
258 let content_text = retrieved_doc
259 .get_first(content_field)
260 .map(|field_value| field_value.as_str().unwrap_or(""))
261 .unwrap_or("");
262
263 let file_path = PathBuf::from(path_text);
264 let preview = if options.full_section {
265 content_text.to_string()
266 } else {
267 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
268 };
269
270 raw_results.push((_score, SearchResult {
271 file: file_path,
272 span: Span {
273 byte_start: 0,
274 byte_end: content_text.len(),
275 line_start: 1,
276 line_end: content_text.lines().count(),
277 },
278 score: _score,
279 preview,
280 lang: detect_language(&PathBuf::from(path_text)),
281 symbol: None,
282 }));
283 }
284
285 let mut results = Vec::new();
287 if !raw_results.is_empty() {
288 let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
289 if max_score > 0.0 {
290 for (raw_score, mut result) in raw_results {
291 let normalized_score = raw_score / max_score;
292
293 if let Some(threshold) = options.threshold {
295 if normalized_score < threshold {
296 continue;
297 }
298 }
299
300 result.score = normalized_score;
301 results.push(result);
302 }
303 }
304 }
305
306 Ok(results)
307}
308
309async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
310 let index_root = if options.path.is_file() {
312 options.path.parent().unwrap_or(&options.path)
313 } else {
314 &options.path
315 };
316
317 let index_dir = index_root.join(".ck");
318 let tantivy_index_path = index_dir.join("tantivy_index");
319
320 fs::create_dir_all(&tantivy_index_path)?;
321
322 let mut schema_builder = Schema::builder();
323 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
324 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
325 let schema = schema_builder.build();
326
327 let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
328 .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
329
330 let mut index_writer = index.writer(50_000_000)
331 .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
332
333 let files = collect_files(&index_root, true, &options.exclude_patterns)?;
334
335 for file_path in &files {
336 if let Ok(content) = fs::read_to_string(file_path) {
337 let doc = doc!(
338 content_field => content,
339 path_field => file_path.display().to_string()
340 );
341 index_writer.add_document(doc)?;
342 }
343 }
344
345 index_writer.commit()
346 .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
347
348 let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
350 let mut schema_builder = Schema::builder();
351 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
352 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
353 let _schema = schema_builder.build();
354
355 let index = Index::open_in_dir(&tantivy_index_path)
356 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
357
358 let reader = index
359 .reader_builder()
360 .reload_policy(ReloadPolicy::OnCommitWithDelay)
361 .try_into()
362 .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
363
364 let searcher = reader.searcher();
365 let query_parser = QueryParser::for_index(&index, vec![content_field]);
366
367 let query = query_parser
368 .parse_query(&options.query)
369 .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
370
371 let top_docs = if let Some(top_k) = options.top_k {
372 searcher.search(&query, &TopDocs::with_limit(top_k))?
373 } else {
374 searcher.search(&query, &TopDocs::with_limit(100))?
375 };
376
377 let mut raw_results = Vec::new();
379 for (_score, doc_address) in top_docs {
380 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
381 let path_text = retrieved_doc
382 .get_first(path_field)
383 .map(|field_value| field_value.as_str().unwrap_or(""))
384 .unwrap_or("");
385 let content_text = retrieved_doc
386 .get_first(content_field)
387 .map(|field_value| field_value.as_str().unwrap_or(""))
388 .unwrap_or("");
389
390 let file_path = PathBuf::from(path_text);
391 let preview = if options.full_section {
392 content_text.to_string()
393 } else {
394 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
395 };
396
397 raw_results.push((_score, SearchResult {
398 file: file_path,
399 span: Span {
400 byte_start: 0,
401 byte_end: content_text.len(),
402 line_start: 1,
403 line_end: content_text.lines().count(),
404 },
405 score: _score,
406 preview,
407 lang: detect_language(&PathBuf::from(path_text)),
408 symbol: None,
409 }));
410 }
411
412 let mut results = Vec::new();
414 if !raw_results.is_empty() {
415 let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
416 if max_score > 0.0 {
417 for (raw_score, mut result) in raw_results {
418 let normalized_score = raw_score / max_score;
419
420 if let Some(threshold) = options.threshold {
422 if normalized_score < threshold {
423 continue;
424 }
425 }
426
427 result.score = normalized_score;
428 results.push(result);
429 }
430 }
431 }
432
433 Ok(results)
434}
435
436#[allow(dead_code)]
437async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
438 semantic_search_with_progress(options, None).await
439}
440
441async fn semantic_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
442 let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
444 if options.path.is_file() {
445 options.path.parent().unwrap_or(&options.path).to_path_buf()
446 } else {
447 options.path.clone()
448 }
449 });
450
451 let index_dir = index_root.join(".ck");
452 if !index_dir.exists() {
453 return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
454 }
455
456 let ann_index_path = index_dir.join("ann_index.bin");
457 let embeddings_path = index_dir.join("embeddings.json");
458
459 if !ann_index_path.exists() || !embeddings_path.exists() {
460 return build_semantic_index_with_progress(options, progress_callback).await;
461 }
462
463 let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
465
466 let embeddings_data = fs::read_to_string(&embeddings_path)?;
468 let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
469
470 if let Some(ref callback) = progress_callback {
472 callback("Loading embedding model...");
473 }
474
475 let mut embedder = if let Some(ref callback) = progress_callback {
476 let _cb = callback.as_ref();
477 let model_cb = Box::new(|msg: &str| {
478 eprintln!("Model: {}", msg);
481 }) as ck_embed::ModelDownloadCallback;
482 ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
483 } else {
484 ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
485 };
486 let query_embeddings = embedder.embed(&[options.query.clone()])?;
487
488 if query_embeddings.is_empty() {
489 return Ok(Vec::new());
490 }
491
492 let query_embedding = &query_embeddings[0];
493
494 let top_k = options.top_k.unwrap_or(10);
496 let similar_docs = ann_index.search(query_embedding, top_k);
497
498 let mut results = Vec::new();
499
500 let filter_by_file = options.path.is_file();
502 let target_file = if filter_by_file {
503 Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
504 } else {
505 None
506 };
507
508 for (doc_id, similarity) in similar_docs {
509 if let Some(threshold) = options.threshold {
511 if similarity < threshold {
512 continue;
513 }
514 }
515
516 if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
517 if let Some(target) = &target_file {
519 let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
520 if canonical_result != *target {
521 continue; }
523 }
524
525 let preview = if options.full_section {
527 content.clone()
528 } else {
529 content.lines().take(3).collect::<Vec<_>>().join("\n")
530 };
531
532 results.push(SearchResult {
533 file: file_path.clone(),
534 span: Span {
535 byte_start: 0,
536 byte_end: content.len(),
537 line_start: 1,
538 line_end: content.lines().count(),
539 },
540 score: similarity,
541 preview,
542 lang: detect_language(file_path),
543 symbol: None,
544 });
545 }
546 }
547
548 Ok(results)
549}
550
551#[allow(dead_code)]
552async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
553 build_semantic_index_with_progress(options, None).await
554}
555
556async fn build_semantic_index_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
557 let index_root = if options.path.is_file() {
559 options.path.parent().unwrap_or(&options.path)
560 } else {
561 &options.path
562 };
563
564 let index_dir = index_root.join(".ck");
565 let ann_index_path = index_dir.join("ann_index.bin");
566 let embeddings_path = index_dir.join("embeddings.json");
567
568 fs::create_dir_all(&index_dir)?;
569
570 if let Some(ref callback) = progress_callback {
571 callback("Building semantic index (no index found)...");
572 }
573
574 eprintln!("Building semantic index (no existing index found)...");
576
577 let files = collect_files(&index_root, true, &options.exclude_patterns)?;
579
580 if let Some(ref callback) = progress_callback {
581 callback(&format!("Found {} files to index", files.len()));
582 }
583 eprintln!("Found {} files to embed and index", files.len());
584
585 let mut file_embeddings = Vec::new();
586 let mut embeddings = Vec::new();
587
588 if let Some(ref callback) = progress_callback {
590 callback("Loading embedding model...");
591 }
592
593 let model_callback = if progress_callback.is_some() {
594 Some(Box::new(|msg: &str| {
595 eprintln!("Model: {}", msg);
596 }) as ck_embed::ModelDownloadCallback)
597 } else {
598 None
599 };
600
601 let mut embedder = ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
602
603 if let Some(ref callback) = progress_callback {
604 callback("Generating embeddings for code chunks...");
605 }
606
607 for (file_idx, file_path) in files.iter().enumerate() {
608 if let Ok(content) = fs::read_to_string(file_path) {
609 if let Some(ref callback) = progress_callback {
610 let file_name = file_path.file_name()
611 .map(|n| n.to_string_lossy().to_string())
612 .unwrap_or_else(|| file_path.to_string_lossy().to_string());
613 callback(&format!("Processing {}/{}: {}", file_idx + 1, files.len(), file_name));
614 }
615
616 let chunks = ck_chunk::chunk_text(&content, detect_language(file_path).as_deref())?;
618
619 for chunk in chunks {
620 let chunk_embeddings = embedder.embed(&[chunk.text.clone()])?;
621 if !chunk_embeddings.is_empty() {
622 embeddings.push(chunk_embeddings[0].clone());
623 file_embeddings.push((file_path.clone(), chunk.text));
624 }
625 }
626 }
627 }
628
629 if let Some(ref callback) = progress_callback {
630 callback(&format!("Built {} embeddings, creating search index...", embeddings.len()));
631 }
632 eprintln!("Generated {} embeddings, building search index...", embeddings.len());
633
634 let index = ck_ann::SimpleIndex::build(&embeddings)?;
636 index.save(&ann_index_path)?;
637
638 let embeddings_json = serde_json::to_string(&file_embeddings)?;
640 fs::write(&embeddings_path, embeddings_json)?;
641
642 if let Some(ref callback) = progress_callback {
643 callback("Semantic index built successfully, running search...");
644 }
645 eprintln!("Semantic index built successfully!");
646
647 let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
649
650 let embeddings_data = fs::read_to_string(&embeddings_path)?;
652 let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
653
654 let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
656 let query_embeddings = embedder.embed(&[options.query.clone()])?;
657
658 if query_embeddings.is_empty() {
659 return Ok(Vec::new());
660 }
661
662 let query_embedding = &query_embeddings[0];
663
664 let top_k = options.top_k.unwrap_or(10);
666 let similar_docs = ann_index.search(query_embedding, top_k);
667
668 let mut results = Vec::new();
669
670 let filter_by_file = options.path.is_file();
672 let target_file = if filter_by_file {
673 Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
674 } else {
675 None
676 };
677
678 for (doc_id, similarity) in similar_docs {
679 if let Some(threshold) = options.threshold {
681 if similarity < threshold {
682 continue;
683 }
684 }
685
686 if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
687 if let Some(target) = &target_file {
689 let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
690 if canonical_result != *target {
691 continue; }
693 }
694
695 let preview = if options.full_section {
697 content.clone()
698 } else {
699 content.lines().take(3).collect::<Vec<_>>().join("\n")
700 };
701
702 results.push(SearchResult {
703 file: file_path.clone(),
704 span: Span {
705 byte_start: 0,
706 byte_end: content.len(),
707 line_start: 1,
708 line_end: content.lines().count(),
709 },
710 score: similarity,
711 preview,
712 lang: detect_language(file_path),
713 symbol: None,
714 });
715 }
716 }
717
718 Ok(results)
719}
720
721#[allow(dead_code)]
722async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
723 hybrid_search_with_progress(options, None).await
724}
725
726async fn hybrid_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
727 if let Some(ref callback) = progress_callback {
728 callback("Running regex search...");
729 }
730 let regex_results = regex_search(options)?;
731
732 if let Some(ref callback) = progress_callback {
733 callback("Running semantic search...");
734 }
735 let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
736
737 let mut combined = HashMap::new();
738
739 for (rank, result) in regex_results.iter().enumerate() {
740 let key = format!("{}:{}", result.file.display(), result.span.line_start);
741 combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
742 }
743
744 for (rank, result) in semantic_results.iter().enumerate() {
745 let key = format!("{}:{}", result.file.display(), result.span.line_start);
746 combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
747 }
748
749 let mut rrf_results: Vec<SearchResult> = combined
751 .into_iter()
752 .map(|(_, ranks)| {
753 let mut result = ranks[0].1.clone();
754 let rrf_score = ranks.iter().map(|(rank, _)| 1.0 / (60.0 + *rank as f32)).sum();
755 result.score = rrf_score;
756 result
757 })
758 .filter(|result| {
759 if let Some(threshold) = options.threshold {
761 result.score >= threshold
762 } else {
763 true
764 }
765 })
766 .collect();
767
768 rrf_results.sort_by(|a, b| {
770 b.score
771 .partial_cmp(&a.score)
772 .unwrap_or(std::cmp::Ordering::Equal)
773 });
774
775 if let Some(top_k) = options.top_k {
776 rrf_results.truncate(top_k);
777 }
778
779 Ok(rrf_results)
780}
781
782fn build_globset(patterns: &[String]) -> GlobSet {
783 let mut builder = GlobSetBuilder::new();
784 for pat in patterns {
785 if let Ok(glob) = Glob::new(pat) {
787 builder.add(glob);
788 }
789 }
790 builder.build().unwrap_or_else(|_| GlobSet::empty())
791}
792
793fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
794 let globset = build_globset(exclude_patterns);
795 if globset.is_match(path) {
797 return true;
798 }
799 for component in path.components() {
800 if let std::path::Component::Normal(name) = component {
801 if globset.is_match(name) {
802 return true;
803 }
804 }
805 }
806 false
807}
808
809fn collect_files(path: &Path, recursive: bool, exclude_patterns: &[String]) -> Result<Vec<PathBuf>> {
810 let mut files = Vec::new();
811 let globset = build_globset(exclude_patterns);
812
813 if path.is_file() {
814 files.push(path.to_path_buf());
816 } else if recursive {
817 for entry in WalkDir::new(path)
818 .into_iter()
819 .filter_entry(|e| {
820 let name = e.file_name();
822 !globset.is_match(e.path()) && !globset.is_match(name)
823 }) {
824 match entry {
825 Ok(entry) => {
826 if entry.file_type().is_file() && !should_exclude_path(entry.path(), exclude_patterns) {
827 files.push(entry.path().to_path_buf());
828 }
829 }
830 Err(e) => {
831 tracing::debug!("Skipping path due to error: {}", e);
833 continue;
834 }
835 }
836 }
837 } else {
838 match fs::read_dir(path) {
839 Ok(read_dir) => {
840 for entry in read_dir {
841 match entry {
842 Ok(entry) => {
843 let path = entry.path();
844 if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
845 files.push(path);
846 }
847 }
848 Err(e) => {
849 tracing::debug!("Skipping directory entry due to error: {}", e);
850 continue;
851 }
852 }
853 }
854 }
855 Err(e) => {
856 tracing::debug!("Cannot read directory {:?}: {}", path, e);
857 return Err(e.into());
858 }
859 }
860 }
861
862 Ok(files)
863}
864
/// Maps a path's extension to a language name.
///
/// Known extensions are translated to a canonical name (for example `rs` →
/// `rust`, `h`/`hpp` → `cpp`). Any other extension is returned unchanged.
/// Returns `None` when the path has no extension or it is not valid UTF-8.
fn detect_language(path: &Path) -> Option<String> {
    // (extensions, language name) pairs; matching is case-sensitive.
    const KNOWN_LANGUAGES: &[(&[&str], &str)] = &[
        (&["rs"], "rust"),
        (&["py"], "python"),
        (&["js"], "javascript"),
        (&["ts"], "typescript"),
        (&["hs", "lhs"], "haskell"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["c"], "c"),
        (&["cpp", "cc", "cxx"], "cpp"),
        (&["h", "hpp"], "cpp"),
        (&["cs"], "csharp"),
        (&["rb"], "ruby"),
        (&["php"], "php"),
        (&["swift"], "swift"),
        (&["kt"], "kotlin"),
    ];

    let ext = path.extension()?.to_str()?;
    let language = KNOWN_LANGUAGES
        .iter()
        .find(|(extensions, _)| extensions.iter().any(|candidate| *candidate == ext))
        .map_or(ext, |(_, name)| *name);

    Some(language.to_owned())
}
888
889async fn ensure_index_updated(path: &Path, force_reindex: bool, need_embeddings: bool) -> Result<()> {
890
891 let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
893 if path.is_file() {
894 path.parent().unwrap_or(path).to_path_buf()
895 } else {
896 path.to_path_buf()
897 }
898 });
899 let index_root = &index_root_buf;
900
901 if force_reindex {
903 let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings, true).await?;
904 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
905 tracing::info!("Index updated: {} files indexed, {} orphaned files removed",
906 stats.files_indexed, stats.orphaned_files_removed);
907 }
908 return Ok(());
909 }
910
911 let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings, true).await?;
913 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
914 tracing::info!("Index updated: {} files indexed, {} orphaned files removed",
915 stats.files_indexed, stats.orphaned_files_removed);
916 }
917
918 Ok(())
919}
920
921fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
922 let before = options.before_context_lines.max(options.context_lines);
923 let after = options.after_context_lines.max(options.context_lines);
924
925 if before > 0 || after > 0 {
926 let start_idx = line_idx.saturating_sub(before);
927 let end_idx = (line_idx + after + 1).min(lines.len());
928 lines[start_idx..end_idx].join("\n")
929 } else {
930 lines[line_idx].to_string()
931 }
932}
933
934fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
935 let lang = match file_path.extension().and_then(|s| s.to_str()) {
937 Some("py") => Some("python"),
938 Some("js") => Some("javascript"),
939 Some("ts") | Some("tsx") => Some("typescript"),
940 Some("hs") | Some("lhs") => Some("haskell"),
941 _ => return None,
942 };
943
944 if let Ok(chunks) = ck_chunk::chunk_text(content, lang) {
946 let sections: Vec<(usize, usize, String)> = chunks
947 .into_iter()
948 .filter(|chunk| matches!(
949 chunk.chunk_type,
950 ck_chunk::ChunkType::Function |
951 ck_chunk::ChunkType::Class |
952 ck_chunk::ChunkType::Method
953 ))
954 .map(|chunk| {
955 (
956 chunk.span.line_start - 1, chunk.span.line_end - 1,
958 chunk.text,
959 )
960 })
961 .collect();
962
963 if sections.is_empty() {
964 None
965 } else {
966 Some(sections)
967 }
968 } else {
969 None
970 }
971}
972
/// Returns the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
fn find_containing_section(sections: &[(usize, usize, String)], line_idx: usize) -> Option<&String> {
    sections
        .iter()
        .find(|(start, end, _)| (*start..=*end).contains(&line_idx))
        .map(|(_, _, text)| text)
}
981
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes a small fixed set of fixture files into `dir` and returns their paths.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let files = vec![
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        let mut paths = Vec::new();
        for (name, content) in files {
            let path = dir.join(name);
            fs::write(&path, content).unwrap();
            paths.push(path);
        }
        paths
    }

    #[test]
    fn test_detect_language() {
        assert_eq!(detect_language(&PathBuf::from("test.rs")), Some("rust".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.py")), Some("python".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.js")), Some("javascript".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.hs")), Some("haskell".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.lhs")), Some("haskell".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.unknown")), Some("unknown".to_string()));
        assert_eq!(detect_language(&PathBuf::from("noext")), None);
    }

    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let test_files = create_test_files(temp_dir.path());

        // Non-recursive listing of the directory.
        let files = collect_files(temp_dir.path(), false, &[]).unwrap();
        assert_eq!(files.len(), 4);

        // Recursive walk of the same (flat) directory.
        let files = collect_files(temp_dir.path(), true, &[]).unwrap();
        assert_eq!(files.len(), 4);

        // A single file is returned as-is.
        let files = collect_files(&test_files[0], false, &[]).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0], test_files[0]);
    }

    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "rust".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        let rust_matches: Vec<_> = results
            .iter()
            .filter(|r| r.preview.to_lowercase().contains("rust"))
            .collect();
        assert!(!rust_matches.is_empty());
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "HELLO".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            case_insensitive: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "fn main()".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            fixed_string: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        fs::write(temp_dir.path().join("word_test.txt"), "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "rust".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            whole_word: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
        // Only the standalone word matches, not "rusty" or "rustacean".
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        for i in 0..10 {
            fs::write(temp_dir.path().join(format!("file{}.txt", i)), "test content").unwrap();
        }

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            top_k: Some(5),
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: test_file.clone(),
            recursive: false,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();

        assert_eq!(results.len(), 5);

        let line1_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 1).collect();
        assert_eq!(line1_matches.len(), 3);
        assert_eq!(line1_matches[0].span.byte_start, 0);
        assert_eq!(line1_matches[1].span.byte_start, 5);
        assert_eq!(line1_matches[2].span.byte_start, 10);

        let line2_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 2).collect();
        assert_eq!(line2_matches.len(), 1);
        // 15 bytes for the first line including its newline, plus "line two ".
        assert_eq!(line2_matches[0].span.byte_start, 24);

        // Every match has a distinct file-relative byte offset.
        let mut byte_starts: Vec<_> = results.iter().map(|r| r.span.byte_start).collect();
        byte_starts.sort();
        byte_starts.dedup();
        assert_eq!(byte_starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1: hello\nline 2: world\nline 3: rust programming").unwrap();

        let regex = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].span.line_start, 3);
        assert!(results[0].preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let regex = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        println!("Preview: '{}'", results[0].preview);

        assert!(results[0].preview.contains("line 2"));
        assert!(results[0].preview.contains("target line"));
        assert!(results[0].preview.contains("line 4"));
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "hello".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            case_insensitive: true,
            ..Default::default()
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}