1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use rayon::prelude::*;
5use regex::{Regex, RegexBuilder};
6use std::collections::HashMap;
7use std::fs;
8use std::path::PathBuf as StdPathBuf;
9use std::path::{Path, PathBuf};
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{STORED, Schema, TEXT, Value};
13use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
14use walkdir::WalkDir;
15
16mod semantic_v3;
17pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
18
/// Boxed, thread-safe (`Send + Sync`) callback that receives a `&str`.
/// Accepted as the optional `progress_callback` argument of the `*_with_progress`
/// search functions in this file.
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed, thread-safe callback that receives a `&str`.
/// `search_enhanced_with_indexing_progress` forwards it to the index-update step.
pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed, thread-safe callback that receives a `ck_index::EmbeddingProgress` value.
/// `search_enhanced_with_indexing_progress` forwards it to the index-update step.
pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
22
23fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
27 if ck_core::pdf::is_pdf_file(file_path) {
28 let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
30 if !cache_path.exists() {
31 return Err(anyhow::anyhow!(
32 "PDF not preprocessed. Run 'ck --index' first."
33 ));
34 }
35 Ok(cache_path)
36 } else {
37 Ok(file_path.to_path_buf())
39 }
40}
41
42fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
46 let content_path = resolve_content_path(file_path, repo_root)?;
47 Ok(fs::read_to_string(content_path)?)
48}
49
50async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
52 let repo_root = find_nearest_index_root(file_path)
54 .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
55
56 let content_path = resolve_content_path(file_path, &repo_root)?;
58
59 extract_lines_from_file(&content_path, span.line_start, span.line_end)
61}
62
63fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
65 use std::io::{BufRead, BufReader};
66
67 if line_start == 0 {
68 return Ok(String::new());
69 }
70
71 let file = fs::File::open(file_path)?;
72 let reader = BufReader::new(file);
73 let mut result = Vec::new();
74
75 let start_idx = line_start.saturating_sub(1);
77 let end_idx = line_end.saturating_sub(1);
78
79 for (current_line, line_result) in reader.lines().enumerate() {
80 if current_line > end_idx {
81 break; }
83
84 let line = line_result?;
85
86 if current_line >= start_idx {
87 result.push(line);
88 }
89 }
90
91 if result.is_empty() && line_start > 0 {
93 return Ok(String::new());
94 }
95
96 Ok(result.join("\n"))
97}
98
/// Splits `content` into lines, recording how many bytes terminated each one.
///
/// `\n`, `\r\n` and a lone `\r` all end a line. The second vector runs parallel to
/// the first and holds the terminator length per line: 1 for `\n` or `\r`, 2 for
/// `\r\n`, and 0 for a final line that has no terminator. Content that ends with a
/// terminator does not produce an extra empty trailing line.
fn split_lines_with_endings(content: &str) -> (Vec<String>, Vec<usize>) {
    let mut lines = Vec::new();
    let mut endings = Vec::new();
    let mut rest = content;

    while let Some(pos) = rest.find(['\n', '\r']) {
        let (line, tail) = rest.split_at(pos);
        // `tail` starts with the terminator; only CR immediately followed by LF is 2 bytes.
        let terminator_len = if tail.starts_with("\r\n") { 2 } else { 1 };

        lines.push(line.to_string());
        endings.push(terminator_len);
        rest = &tail[terminator_len..];
    }

    // Whatever is left is an unterminated final line.
    if !rest.is_empty() {
        lines.push(rest.to_string());
        endings.push(0);
    }

    (lines, endings)
}
141
/// Walks upward from `path` and returns the first directory that contains a `.ck`
/// entry, or `None` when no ancestor has one.
///
/// When `path` is an existing file the walk starts at its parent directory;
/// otherwise it starts at `path` itself.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors()` yields `start`, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
158
/// Embedding model selected for an index, as returned by `resolve_model_from_root`
/// and `resolve_model_for_path`.
#[derive(Clone, Debug)]
pub struct ResolvedModel {
    /// Model name: the manifest's `embedding_model` when the index records one,
    /// otherwise the `name` of the chosen registry config.
    pub canonical_name: String,
    /// Registry key of the model; falls back to the manifest's model name when the
    /// registry has no matching entry.
    pub alias: String,
    /// Embedding dimension count, taken from the manifest or the registry config.
    pub dimensions: usize,
}
165
166fn find_model_entry<'a>(
167 registry: &'a ck_models::ModelRegistry,
168 key: &str,
169) -> Option<(String, &'a ck_models::ModelConfig)> {
170 if let Some(config) = registry.get_model(key) {
171 return Some((key.to_string(), config));
172 }
173
174 registry
175 .models
176 .iter()
177 .find(|(_, config)| config.name == key)
178 .map(|(alias, config)| (alias.clone(), config))
179}
180
181pub(crate) fn resolve_model_from_root(
182 index_root: &Path,
183 cli_model: Option<&str>,
184) -> Result<ResolvedModel> {
185 use ck_models::ModelRegistry;
186
187 let registry = ModelRegistry::default();
188 let index_dir = index_root.join(".ck");
189 let manifest_path = index_dir.join("manifest.json");
190
191 if manifest_path.exists() {
192 let data = std::fs::read(&manifest_path)?;
193 let manifest: ck_index::IndexManifest = serde_json::from_slice(&data)?;
194
195 if let Some(existing_model) = manifest.embedding_model {
196 let (alias, config_opt) = find_model_entry(®istry, &existing_model)
197 .map(|(alias, config)| (alias, Some(config)))
198 .unwrap_or_else(|| (existing_model.clone(), None));
199
200 let dims = manifest
201 .embedding_dimensions
202 .or_else(|| config_opt.map(|c| c.dimensions))
203 .unwrap_or(384);
204
205 if let Some(requested) = cli_model {
206 let (_, requested_config) =
207 find_model_entry(®istry, requested).ok_or_else(|| {
208 CkError::Embedding(format!(
209 "Unknown model '{}'. Available models: {}",
210 requested,
211 registry
212 .models
213 .keys()
214 .cloned()
215 .collect::<Vec<_>>()
216 .join(", ")
217 ))
218 })?;
219
220 if requested_config.name != existing_model {
221 let suggested_alias = alias.clone();
222 return Err(CkError::Embedding(format!(
223 "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
224 existing_model,
225 suggested_alias,
226 requested,
227 requested,
228 suggested_alias
229 ))
230 .into());
231 }
232 }
233
234 return Ok(ResolvedModel {
235 canonical_name: existing_model,
236 alias,
237 dimensions: dims,
238 });
239 }
240 }
241
242 let (alias, config) = if let Some(requested) = cli_model {
243 find_model_entry(®istry, requested).ok_or_else(|| {
244 CkError::Embedding(format!(
245 "Unknown model '{}'. Available models: {}",
246 requested,
247 registry
248 .models
249 .keys()
250 .cloned()
251 .collect::<Vec<_>>()
252 .join(", ")
253 ))
254 })?
255 } else {
256 let alias = registry.default_model.clone();
257 let config = registry.get_default_model().ok_or_else(|| {
258 CkError::Embedding("No default embedding model configured".to_string())
259 })?;
260 (alias, config)
261 };
262
263 Ok(ResolvedModel {
264 canonical_name: config.name.clone(),
265 alias,
266 dimensions: config.dimensions,
267 })
268}
269
270pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
271 let index_root = find_nearest_index_root(path).unwrap_or_else(|| {
272 if path.is_file() {
273 path.parent().unwrap_or(path).to_path_buf()
274 } else {
275 path.to_path_buf()
276 }
277 });
278 resolve_model_from_root(&index_root, cli_model)
279}
280
281pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
282 let results = search_enhanced(options).await?;
283 Ok(results.matches)
284}
285
286pub async fn search_with_progress(
287 options: &SearchOptions,
288 progress_callback: Option<SearchProgressCallback>,
289) -> Result<Vec<SearchResult>> {
290 let results = search_enhanced_with_progress(options, progress_callback).await?;
291 Ok(results.matches)
292}
293
/// Runs the search described by `options` and returns the full
/// `ck_core::SearchResults` value.
///
/// Delegates to `search_enhanced_with_progress` with no progress callback.
pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
    search_enhanced_with_progress(options, None).await
}
298
/// Runs the search described by `options`, passing `progress_callback` through.
///
/// Delegates to `search_enhanced_with_indexing_progress` with both indexing
/// callbacks set to `None`.
pub async fn search_enhanced_with_progress(
    options: &SearchOptions,
    progress_callback: Option<SearchProgressCallback>,
) -> Result<ck_core::SearchResults> {
    search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
}
306
307pub async fn search_enhanced_with_indexing_progress(
309 options: &SearchOptions,
310 progress_callback: Option<SearchProgressCallback>,
311 indexing_progress_callback: Option<IndexingProgressCallback>,
312 detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
313) -> Result<ck_core::SearchResults> {
314 if !options.path.exists() {
316 return Err(ck_core::CkError::Search(format!(
317 "Path does not exist: {}",
318 options.path.display()
319 ))
320 .into());
321 }
322
323 if !matches!(options.mode, SearchMode::Regex) {
325 let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
326 ensure_index_updated_with_progress(
327 &options.path,
328 options.reindex,
329 need_embeddings,
330 indexing_progress_callback,
331 detailed_indexing_progress_callback,
332 options.respect_gitignore,
333 &options.exclude_patterns,
334 options.embedding_model.as_deref(),
335 )
336 .await?;
337 }
338
339 let search_results = match options.mode {
340 SearchMode::Regex => {
341 let matches = regex_search(options)?;
342 ck_core::SearchResults {
343 matches,
344 closest_below_threshold: None,
345 }
346 }
347 SearchMode::Lexical => {
348 let matches = lexical_search(options).await?;
349 ck_core::SearchResults {
350 matches,
351 closest_below_threshold: None,
352 }
353 }
354 SearchMode::Semantic => {
355 semantic_search_v3_with_progress(options, progress_callback).await?
357 }
358 SearchMode::Hybrid => {
359 let matches = hybrid_search_with_progress(options, progress_callback).await?;
360 ck_core::SearchResults {
361 matches,
362 closest_below_threshold: None,
363 }
364 }
365 };
366
367 Ok(search_results)
368}
369
370fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
371 let pattern = if options.fixed_string {
372 regex::escape(&options.query)
373 } else if options.whole_word {
374 format!(r"\b{}\b", regex::escape(&options.query))
375 } else {
376 options.query.clone()
377 };
378
379 let regex = RegexBuilder::new(&pattern)
380 .case_insensitive(options.case_insensitive)
381 .build()
382 .map_err(CkError::Regex)?;
383
384 let should_recurse = options.path.is_dir() || options.recursive;
386 let files = if should_recurse {
387 ck_index::collect_files(
389 &options.path,
390 options.respect_gitignore,
391 &options.exclude_patterns,
392 )?
393 } else {
394 collect_files(&options.path, should_recurse, &options.exclude_patterns)?
396 };
397
398 let results: Vec<Vec<SearchResult>> = files
399 .par_iter()
400 .filter_map(|file_path| match search_file(®ex, file_path, options) {
401 Ok(matches) => {
402 if matches.is_empty() {
403 None
404 } else {
405 Some(matches)
406 }
407 }
408 Err(e) => {
409 tracing::debug!("Error searching {:?}: {}", file_path, e);
410 None
411 }
412 })
413 .collect();
414
415 let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
416 all_results.sort_by(|a, b| {
418 let path_cmp = a.file.cmp(&b.file);
419 if path_cmp != std::cmp::Ordering::Equal {
420 return path_cmp;
421 }
422 a.span.line_start.cmp(&b.span.line_start)
423 });
424
425 if let Some(top_k) = options.top_k {
426 all_results.truncate(top_k);
427 }
428
429 Ok(all_results)
430}
431
432fn search_file(
433 regex: &Regex,
434 file_path: &Path,
435 options: &SearchOptions,
436) -> Result<Vec<SearchResult>> {
437 let repo_root = find_nearest_index_root(file_path)
439 .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
440
441 if options.full_section || options.context_lines > 0 {
445 let content = read_file_content(file_path, &repo_root)?;
447 let (lines, line_ending_lengths) = split_lines_with_endings(&content);
448
449 let code_sections = if options.full_section {
451 extract_code_sections(file_path, &content)
452 } else {
453 None
454 };
455
456 search_file_in_memory(
457 regex,
458 file_path,
459 options,
460 &lines,
461 &code_sections,
462 &line_ending_lengths,
463 )
464 } else {
465 search_file_streaming(regex, file_path, &repo_root, options)
467 }
468}
469
470fn search_file_in_memory(
472 regex: &Regex,
473 file_path: &Path,
474 options: &SearchOptions,
475 lines: &[String],
476 code_sections: &Option<Vec<(usize, usize, String)>>,
477 line_ending_lengths: &[usize],
478) -> Result<Vec<SearchResult>> {
479 let mut results = Vec::new();
480 let mut byte_offset = 0;
481
482 for (line_idx, line) in lines.iter().enumerate() {
483 let line_number = line_idx + 1;
484
485 if regex.as_str().is_empty() {
488 let preview = if options.full_section {
490 if let Some(sections) = code_sections {
492 if let Some(section) = find_containing_section(sections, line_idx) {
493 section.clone()
494 } else {
495 get_context_preview(lines, line_idx, options)
497 }
498 } else {
499 get_context_preview(lines, line_idx, options)
500 }
501 } else {
502 get_context_preview(lines, line_idx, options)
503 };
504
505 results.push(SearchResult {
506 file: file_path.to_path_buf(),
507 span: Span {
508 byte_start: byte_offset,
509 byte_end: byte_offset + line.len(),
510 line_start: line_number,
511 line_end: line_number,
512 },
513 score: 1.0,
514 preview,
515 lang: ck_core::Language::from_path(file_path),
516 symbol: None,
517 chunk_hash: None,
518 index_epoch: None,
519 });
520 } else {
521 for mat in regex.find_iter(line) {
523 let preview = if options.full_section {
524 if let Some(sections) = code_sections {
526 if let Some(section) = find_containing_section(sections, line_idx) {
527 section.clone()
528 } else {
529 get_context_preview(lines, line_idx, options)
531 }
532 } else {
533 get_context_preview(lines, line_idx, options)
534 }
535 } else {
536 get_context_preview(lines, line_idx, options)
537 };
538
539 results.push(SearchResult {
540 file: file_path.to_path_buf(),
541 span: Span {
542 byte_start: byte_offset + mat.start(),
543 byte_end: byte_offset + mat.end(),
544 line_start: line_number,
545 line_end: line_number,
546 },
547 score: 1.0,
548 preview,
549 lang: ck_core::Language::from_path(file_path),
550 symbol: None,
551 chunk_hash: None,
552 index_epoch: None,
553 });
554 }
555 }
556
557 byte_offset += line.len();
559 byte_offset += line_ending_lengths.get(line_idx).copied().unwrap_or(0);
560 }
561
562 Ok(results)
563}
564
565fn search_file_streaming(
567 regex: &Regex,
568 file_path: &Path,
569 repo_root: &Path,
570 _options: &SearchOptions,
571) -> Result<Vec<SearchResult>> {
572 use std::io::{BufRead, BufReader};
573
574 let content_path = resolve_content_path(file_path, repo_root)?;
575 let file = std::fs::File::open(&content_path)?;
576 let mut reader = BufReader::new(file);
577
578 let mut results = Vec::new();
579 let mut line = String::new();
580 let mut byte_offset = 0usize;
581 let mut line_number = 1usize;
582
583 loop {
584 line.clear();
585 let bytes_read = reader.read_line(&mut line)?;
586 if bytes_read == 0 {
587 break;
588 }
589
590 let mut newline_len = 0usize;
593 if line.ends_with("\r\n") {
594 line.pop(); line.pop(); newline_len = 2;
597 } else if line.ends_with(['\n', '\r']) {
598 line.pop();
599 newline_len = 1;
600 }
601
602 let treat_cr_as_newline = line.contains('\r');
606
607 if treat_cr_as_newline {
608 let bytes = line.as_bytes();
609 let mut segment_start = 0usize;
610 while segment_start <= bytes.len() {
611 match bytes[segment_start..].iter().position(|&b| b == b'\r') {
612 Some(rel_idx) => {
613 let idx = segment_start + rel_idx;
614 let segment_bytes = &bytes[segment_start..idx];
615 let segment_str = std::str::from_utf8(segment_bytes)?;
616 process_streaming_line(
617 regex,
618 file_path,
619 segment_str,
620 line_number,
621 byte_offset,
622 &mut results,
623 );
624 byte_offset += segment_bytes.len() + 1; line_number += 1;
626 segment_start = idx + 1;
627 }
628 None => {
629 let segment_bytes = &bytes[segment_start..];
630 let segment_str = std::str::from_utf8(segment_bytes)?;
631 process_streaming_line(
632 regex,
633 file_path,
634 segment_str,
635 line_number,
636 byte_offset,
637 &mut results,
638 );
639 byte_offset += segment_bytes.len();
640 line_number += 1;
641 break;
642 }
643 }
644 }
645 byte_offset += newline_len;
646 } else {
647 let line_str = line.as_str();
648 process_streaming_line(
649 regex,
650 file_path,
651 line_str,
652 line_number,
653 byte_offset,
654 &mut results,
655 );
656 byte_offset += line_str.len() + newline_len;
657 line_number += 1;
658 }
659 }
660
661 Ok(results)
662}
663
664fn process_streaming_line(
665 regex: &Regex,
666 file_path: &Path,
667 line: &str,
668 line_number: usize,
669 byte_offset: usize,
670 results: &mut Vec<SearchResult>,
671) {
672 if regex.as_str().is_empty() {
673 results.push(SearchResult {
674 file: file_path.to_path_buf(),
675 span: Span {
676 byte_start: byte_offset,
677 byte_end: byte_offset + line.len(),
678 line_start: line_number,
679 line_end: line_number,
680 },
681 score: 1.0,
682 preview: line.to_string(),
683 lang: ck_core::Language::from_path(file_path),
684 symbol: None,
685 chunk_hash: None,
686 index_epoch: None,
687 });
688 } else {
689 for mat in regex.find_iter(line) {
690 results.push(SearchResult {
691 file: file_path.to_path_buf(),
692 span: Span {
693 byte_start: byte_offset + mat.start(),
694 byte_end: byte_offset + mat.end(),
695 line_start: line_number,
696 line_end: line_number,
697 },
698 score: 1.0,
699 preview: line.to_string(),
700 lang: ck_core::Language::from_path(file_path),
701 symbol: None,
702 chunk_hash: None,
703 index_epoch: None,
704 });
705 }
706 }
707}
708
709async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
710 let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
712 if options.path.is_file() {
713 options.path.parent().unwrap_or(&options.path).to_path_buf()
714 } else {
715 options.path.clone()
716 }
717 });
718
719 let index_dir = index_root.join(".ck");
720 if !index_dir.exists() {
721 return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
722 }
723
724 let tantivy_index_path = index_dir.join("tantivy_index");
725
726 if !tantivy_index_path.exists() {
727 return build_tantivy_index(options).await;
728 }
729
730 let mut schema_builder = Schema::builder();
731 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
732 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
733 let _schema = schema_builder.build();
734
735 let index = Index::open_in_dir(&tantivy_index_path)
736 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
737
738 let reader = index
739 .reader_builder()
740 .reload_policy(ReloadPolicy::OnCommitWithDelay)
741 .try_into()
742 .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
743
744 let searcher = reader.searcher();
745 let query_parser = QueryParser::for_index(&index, vec![content_field]);
746
747 let query = query_parser
748 .parse_query(&options.query)
749 .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
750
751 let top_docs = if let Some(top_k) = options.top_k {
752 searcher.search(&query, &TopDocs::with_limit(top_k))?
753 } else {
754 searcher.search(&query, &TopDocs::with_limit(100))?
755 };
756
757 let mut raw_results = Vec::new();
759 for (_score, doc_address) in top_docs {
760 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
761 let path_text = retrieved_doc
762 .get_first(path_field)
763 .map(|field_value| field_value.as_str().unwrap_or(""))
764 .unwrap_or("");
765 let content_text = retrieved_doc
766 .get_first(content_field)
767 .map(|field_value| field_value.as_str().unwrap_or(""))
768 .unwrap_or("");
769
770 let file_path = PathBuf::from(path_text);
771 let preview = if options.full_section {
772 content_text.to_string()
773 } else {
774 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
775 };
776
777 raw_results.push((
778 _score,
779 SearchResult {
780 file: file_path,
781 span: Span {
782 byte_start: 0,
783 byte_end: content_text.len(),
784 line_start: 1,
785 line_end: content_text.lines().count(),
786 },
787 score: _score,
788 preview,
789 lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
790 symbol: None,
791 chunk_hash: None,
792 index_epoch: None,
793 },
794 ));
795 }
796
797 let mut results = Vec::new();
799 if !raw_results.is_empty() {
800 let max_score = raw_results
801 .iter()
802 .map(|(score, _)| *score)
803 .fold(0.0f32, f32::max);
804 if max_score > 0.0 {
805 for (raw_score, mut result) in raw_results {
806 let normalized_score = raw_score / max_score;
807
808 if let Some(threshold) = options.threshold
810 && normalized_score < threshold
811 {
812 continue;
813 }
814
815 result.score = normalized_score;
816 results.push(result);
817 }
818 }
819 }
820
821 Ok(results)
822}
823
824async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
825 let index_root = if options.path.is_file() {
827 options.path.parent().unwrap_or(&options.path)
828 } else {
829 &options.path
830 };
831
832 let index_dir = index_root.join(".ck");
833 let tantivy_index_path = index_dir.join("tantivy_index");
834
835 fs::create_dir_all(&tantivy_index_path)?;
836
837 let mut schema_builder = Schema::builder();
838 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
839 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
840 let schema = schema_builder.build();
841
842 let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
843 .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
844
845 let mut index_writer = index
846 .writer(50_000_000)
847 .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
848
849 let files = collect_files(index_root, true, &options.exclude_patterns)?;
850
851 for file_path in &files {
852 if let Ok(content) = fs::read_to_string(file_path) {
853 let doc = doc!(
854 content_field => content,
855 path_field => file_path.display().to_string()
856 );
857 index_writer.add_document(doc)?;
858 }
859 }
860
861 index_writer
862 .commit()
863 .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
864
865 let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
867 let mut schema_builder = Schema::builder();
868 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
869 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
870 let _schema = schema_builder.build();
871
872 let index = Index::open_in_dir(&tantivy_index_path)
873 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
874
875 let reader = index
876 .reader_builder()
877 .reload_policy(ReloadPolicy::OnCommitWithDelay)
878 .try_into()
879 .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
880
881 let searcher = reader.searcher();
882 let query_parser = QueryParser::for_index(&index, vec![content_field]);
883
884 let query = query_parser
885 .parse_query(&options.query)
886 .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
887
888 let top_docs = if let Some(top_k) = options.top_k {
889 searcher.search(&query, &TopDocs::with_limit(top_k))?
890 } else {
891 searcher.search(&query, &TopDocs::with_limit(100))?
892 };
893
894 let mut raw_results = Vec::new();
896 for (_score, doc_address) in top_docs {
897 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
898 let path_text = retrieved_doc
899 .get_first(path_field)
900 .map(|field_value| field_value.as_str().unwrap_or(""))
901 .unwrap_or("");
902 let content_text = retrieved_doc
903 .get_first(content_field)
904 .map(|field_value| field_value.as_str().unwrap_or(""))
905 .unwrap_or("");
906
907 let file_path = PathBuf::from(path_text);
908 let preview = if options.full_section {
909 content_text.to_string()
910 } else {
911 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
912 };
913
914 raw_results.push((
915 _score,
916 SearchResult {
917 file: file_path,
918 span: Span {
919 byte_start: 0,
920 byte_end: content_text.len(),
921 line_start: 1,
922 line_end: content_text.lines().count(),
923 },
924 score: _score,
925 preview,
926 lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
927 symbol: None,
928 chunk_hash: None,
929 index_epoch: None,
930 },
931 ));
932 }
933
934 let mut results = Vec::new();
936 if !raw_results.is_empty() {
937 let max_score = raw_results
938 .iter()
939 .map(|(score, _)| *score)
940 .fold(0.0f32, f32::max);
941 if max_score > 0.0 {
942 for (raw_score, mut result) in raw_results {
943 let normalized_score = raw_score / max_score;
944
945 if let Some(threshold) = options.threshold
947 && normalized_score < threshold
948 {
949 continue;
950 }
951
952 result.score = normalized_score;
953 results.push(result);
954 }
955 }
956 }
957
958 Ok(results)
959}
960
/// Hybrid search without progress reporting.
///
/// Forwards to `hybrid_search_with_progress` with no callback.
// Not called anywhere in the visible part of this file, hence the lint allowance.
#[allow(dead_code)]
async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    hybrid_search_with_progress(options, None).await
}
965
966async fn hybrid_search_with_progress(
967 options: &SearchOptions,
968 progress_callback: Option<SearchProgressCallback>,
969) -> Result<Vec<SearchResult>> {
970 if let Some(ref callback) = progress_callback {
971 callback("Running regex search...");
972 }
973 let regex_results = regex_search(options)?;
974
975 if let Some(ref callback) = progress_callback {
976 callback("Running semantic search...");
977 }
978 let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
979
980 let mut combined = HashMap::new();
981
982 for (rank, result) in regex_results.iter().enumerate() {
983 let key = format!("{}:{}", result.file.display(), result.span.line_start);
984 combined
985 .entry(key)
986 .or_insert(Vec::new())
987 .push((rank + 1, result.clone()));
988 }
989
990 for (rank, result) in semantic_results.matches.iter().enumerate() {
991 let key = format!("{}:{}", result.file.display(), result.span.line_start);
992 combined
993 .entry(key)
994 .or_insert(Vec::new())
995 .push((rank + 1, result.clone()));
996 }
997
998 let mut rrf_results: Vec<SearchResult> = combined
1000 .into_values()
1001 .map(|ranks| {
1002 let mut result = ranks[0].1.clone();
1003 let rrf_score = ranks
1004 .iter()
1005 .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
1006 .sum();
1007 result.score = rrf_score;
1008 result
1009 })
1010 .filter(|result| {
1011 if let Some(threshold) = options.threshold {
1013 result.score >= threshold
1014 } else {
1015 true
1016 }
1017 })
1018 .collect();
1019
1020 rrf_results.sort_by(|a, b| {
1022 b.score
1023 .partial_cmp(&a.score)
1024 .unwrap_or(std::cmp::Ordering::Equal)
1025 });
1026
1027 if let Some(top_k) = options.top_k {
1028 rrf_results.truncate(top_k);
1029 }
1030
1031 Ok(rrf_results)
1032}
1033
1034fn build_globset(patterns: &[String]) -> GlobSet {
1035 let mut builder = GlobSetBuilder::new();
1036 for pat in patterns {
1037 if let Ok(glob) = Glob::new(pat) {
1039 builder.add(glob);
1040 }
1041 }
1042 builder.build().unwrap_or_else(|_| GlobSet::empty())
1043}
1044
1045fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
1046 let globset = build_globset(exclude_patterns);
1047 if globset.is_match(path) {
1049 return true;
1050 }
1051 for component in path.components() {
1052 if let std::path::Component::Normal(name) = component
1053 && globset.is_match(name)
1054 {
1055 return true;
1056 }
1057 }
1058 false
1059}
1060
1061fn collect_files(
1062 path: &Path,
1063 recursive: bool,
1064 exclude_patterns: &[String],
1065) -> Result<Vec<PathBuf>> {
1066 let mut files = Vec::new();
1067 let globset = build_globset(exclude_patterns);
1068
1069 if path.is_file() {
1070 files.push(path.to_path_buf());
1072 } else if recursive {
1073 for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
1074 let name = e.file_name();
1076 !globset.is_match(e.path()) && !globset.is_match(name)
1077 }) {
1078 match entry {
1079 Ok(entry) => {
1080 if entry.file_type().is_file()
1081 && !should_exclude_path(entry.path(), exclude_patterns)
1082 {
1083 files.push(entry.path().to_path_buf());
1084 }
1085 }
1086 Err(e) => {
1087 tracing::debug!("Skipping path due to error: {}", e);
1089 continue;
1090 }
1091 }
1092 }
1093 } else {
1094 match fs::read_dir(path) {
1095 Ok(read_dir) => {
1096 for entry in read_dir {
1097 match entry {
1098 Ok(entry) => {
1099 let path = entry.path();
1100 if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
1101 files.push(path);
1102 }
1103 }
1104 Err(e) => {
1105 tracing::debug!("Skipping directory entry due to error: {}", e);
1106 continue;
1107 }
1108 }
1109 }
1110 }
1111 Err(e) => {
1112 tracing::debug!("Cannot read directory {:?}: {}", path, e);
1113 return Err(e.into());
1114 }
1115 }
1116 }
1117
1118 Ok(files)
1119}
1120
1121#[allow(clippy::too_many_arguments)]
1122async fn ensure_index_updated_with_progress(
1123 path: &Path,
1124 force_reindex: bool,
1125 need_embeddings: bool,
1126 progress_callback: Option<ck_index::ProgressCallback>,
1127 detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
1128 respect_gitignore: bool,
1129 exclude_patterns: &[String],
1130 model_override: Option<&str>,
1131) -> Result<()> {
1132 let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
1134 if path.is_file() {
1135 path.parent().unwrap_or(path).to_path_buf()
1136 } else {
1137 path.to_path_buf()
1138 }
1139 });
1140 let index_root = &index_root_buf;
1141
1142 if force_reindex {
1144 let stats = ck_index::smart_update_index_with_detailed_progress(
1145 index_root,
1146 true,
1147 progress_callback,
1148 detailed_progress_callback,
1149 need_embeddings,
1150 respect_gitignore,
1151 exclude_patterns, model_override,
1153 )
1154 .await?;
1155 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1156 tracing::info!(
1157 "Index updated: {} files indexed, {} orphaned files removed",
1158 stats.files_indexed,
1159 stats.orphaned_files_removed
1160 );
1161 }
1162 return Ok(());
1163 }
1164
1165 let stats = ck_index::smart_update_index_with_detailed_progress(
1167 index_root,
1168 false,
1169 progress_callback,
1170 detailed_progress_callback,
1171 need_embeddings,
1172 respect_gitignore,
1173 exclude_patterns,
1174 model_override,
1175 )
1176 .await?;
1177 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1178 tracing::info!(
1179 "Index updated: {} files indexed, {} orphaned files removed",
1180 stats.files_indexed,
1181 stats.orphaned_files_removed
1182 );
1183 }
1184
1185 Ok(())
1186}
1187
1188fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
1189 let before = options.before_context_lines.max(options.context_lines);
1190 let after = options.after_context_lines.max(options.context_lines);
1191
1192 if before > 0 || after > 0 {
1193 let start_idx = line_idx.saturating_sub(before);
1194 let end_idx = (line_idx + after + 1).min(lines.len());
1195 lines[start_idx..end_idx].join("\n")
1196 } else {
1197 lines[line_idx].to_string()
1198 }
1199}
1200
1201fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1202 let lang = ck_core::Language::from_path(file_path)?;
1203
1204 if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1206 let sections: Vec<(usize, usize, String)> = chunks
1207 .into_iter()
1208 .filter(|chunk| {
1209 matches!(
1210 chunk.chunk_type,
1211 ck_chunk::ChunkType::Function
1212 | ck_chunk::ChunkType::Class
1213 | ck_chunk::ChunkType::Method
1214 )
1215 })
1216 .map(|chunk| {
1217 (
1218 chunk.span.line_start - 1, chunk.span.line_end - 1,
1220 chunk.text,
1221 )
1222 })
1223 .collect();
1224
1225 if sections.is_empty() {
1226 None
1227 } else {
1228 Some(sections)
1229 }
1230 } else {
1231 None
1232 }
1233}
1234
/// Returns the text of the first section whose inclusive `(start, end)` line range
/// contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are checked in slice order, so with overlapping ranges the earliest
/// entry wins.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(start, end, _)| (*start..=*end).contains(&line_idx))
        .map(|(_, _, text)| text)
}
1246
#[cfg(test)]
mod tests {
    //! Unit tests for line extraction, file collection, regex search and the
    //! line-ending helper. Every test works inside its own temporary
    //! directory.

    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Builds regex-mode search options for `query` rooted at `path`, leaving
    /// every other field at its default. Tests override extra fields with
    /// struct-update syntax.
    fn regex_options(query: &str, path: PathBuf, recursive: bool) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path,
            recursive,
            ..Default::default()
        }
    }

    /// Writes four small fixture files into `dir` and returns their paths in
    /// creation order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .iter()
            .map(|&(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Line ranges are inclusive; the range end is clamped to the file
    /// length, and a start of 0 or a start past the end yields an empty
    /// string.
    #[test]
    fn test_extract_lines_from_file() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test_lines.txt");

        let content =
            "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10";
        fs::write(&test_file, content).unwrap();

        // Multi-line range in the middle of the file.
        let result = extract_lines_from_file(&test_file, 3, 5).unwrap();
        assert_eq!(result, "Line 3\nLine 4\nLine 5");

        // Single-line range.
        let result = extract_lines_from_file(&test_file, 7, 7).unwrap();
        assert_eq!(result, "Line 7");

        // Range end beyond the last line.
        let result = extract_lines_from_file(&test_file, 8, 100).unwrap();
        assert_eq!(result, "Line 8\nLine 9\nLine 10");

        // Start line of 0.
        let result = extract_lines_from_file(&test_file, 0, 5).unwrap();
        assert_eq!(result, "");

        // Range entirely beyond the end of the file.
        let result = extract_lines_from_file(&test_file, 20, 25).unwrap();
        assert_eq!(result, "");
    }

    /// Span extraction is driven by the span's line range; the byte offsets
    /// are left at zero here.
    #[tokio::test]
    async fn test_extract_content_from_span() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("code.rs");

        let content = "fn first() {\n println!(\"First\");\n}\n\nfn second() {\n println!(\"Second\");\n}\n\nfn third() {\n println!(\"Third\");\n}";
        fs::write(&test_file, content).unwrap();

        // Whole second function (lines 5 through 7).
        let span = ck_core::Span {
            byte_start: 0,
            byte_end: 0,
            line_start: 5,
            line_end: 7,
        };
        let result = extract_content_from_span(&test_file, &span).await.unwrap();
        assert_eq!(result, "fn second() {\n println!(\"Second\");\n}");

        // A single line.
        let span = ck_core::Span {
            byte_start: 0,
            byte_end: 0,
            line_start: 2,
            line_end: 2,
        };
        let result = extract_content_from_span(&test_file, &span).await.unwrap();
        assert_eq!(result, " println!(\"First\");");
    }

    /// A flat directory yields all four fixtures with either value of the
    /// boolean argument; a file path yields exactly that file.
    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let test_files = create_test_files(temp_dir.path());

        let files = collect_files(temp_dir.path(), false, &[]).unwrap();
        assert_eq!(files.len(), 4);

        let files = collect_files(temp_dir.path(), true, &[]).unwrap();
        assert_eq!(files.len(), 4);

        let files = collect_files(&test_files[0], false, &[]).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0], test_files[0]);
    }

    /// A plain regex query finds at least one result whose preview contains
    /// the query text.
    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = regex_options("rust", temp_dir.path().to_path_buf(), true);

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        let rust_matches: Vec<_> = results
            .iter()
            .filter(|r| r.preview.to_lowercase().contains("rust"))
            .collect();
        assert!(!rust_matches.is_empty());
    }

    /// An upper-case query matches mixed-case fixture text when
    /// `case_insensitive` is set.
    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("HELLO", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    /// With `fixed_string` set, a query containing regex metacharacters
    /// (`(` and `)`) still finds the fixture text.
    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options("fn main()", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    /// With `whole_word` set, the query still produces a result on a line that
    /// also contains longer words sharing its prefix.
    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        fs::write(
            temp_dir.path().join("word_test.txt"),
            "rust rusty rustacean",
        )
        .unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options("rust", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    /// `top_k` caps the number of results even when more files match.
    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        // Ten files that all match the query.
        for i in 0..10 {
            fs::write(
                temp_dir.path().join(format!("file{}.txt", i)),
                "test content",
            )
            .unwrap();
        }

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options("test", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    /// Every match gets its own result, and `byte_start` is an offset from the
    /// start of the file rather than from the start of the line.
    #[test]
    fn test_regex_search_span_offsets() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = regex_options("test", test_file, false);

        let results = regex_search(&options).unwrap();

        // Three matches on line 1, one on line 2, one on line 3.
        assert_eq!(results.len(), 5);

        let line1_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 1).collect();
        assert_eq!(line1_matches.len(), 3);
        assert_eq!(line1_matches[0].span.byte_start, 0);
        assert_eq!(line1_matches[1].span.byte_start, 5);
        assert_eq!(line1_matches[2].span.byte_start, 10);

        let line2_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 2).collect();
        assert_eq!(line2_matches.len(), 1);
        // 15 bytes for line 1 including its newline, plus 9 for "line two ".
        assert_eq!(line2_matches[0].span.byte_start, 24);

        // No two results share a byte offset.
        let mut byte_starts: Vec<_> = results.iter().map(|r| r.span.byte_start).collect();
        byte_starts.sort();
        byte_starts.dedup();
        assert_eq!(byte_starts.len(), 5);
    }

    /// `search_file` reports the 1-based line number of the match and a
    /// preview containing the matched text.
    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(
            &file_path,
            "line 1: hello\nline 2: world\nline 3: rust programming",
        )
        .unwrap();

        let regex = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].span.line_start, 3);
        assert!(results[0].preview.contains("rust"));
    }

    /// With one line of context, the preview holds the line before the match,
    /// the match itself, and the line after it.
    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let regex = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        // The preview is included in the failure message for debugging.
        let preview = &results[0].preview;
        assert!(preview.contains("line 2"), "Preview: '{}'", preview);
        assert!(preview.contains("target line"), "Preview: '{}'", preview);
        assert!(preview.contains("line 4"), "Preview: '{}'", preview);
    }

    /// The top-level async `search` entry point returns results for a
    /// regex-mode query.
    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("hello", temp_dir.path().to_path_buf(), true)
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }

    /// Byte offsets stay aligned with the raw file content when `\r\n` and
    /// `\n` endings are mixed.
    #[tokio::test]
    async fn test_regex_search_mixed_line_endings() {
        let temp_dir = TempDir::new().unwrap();

        let test_file = temp_dir.path().join("mixed_endings.txt");
        let content = "line1\r\nline2\nline3\r\npattern here\nline5\r\n";
        fs::write(&test_file, content).unwrap();

        // The path is read back below, so the options get their own copy.
        let options = regex_options("pattern", test_file.clone(), false);

        let results = search(&options).await.unwrap();
        assert_eq!(results.len(), 1);

        let result = &results[0];
        let original_content = fs::read_to_string(&test_file).unwrap();
        let pattern_start = original_content.find("pattern").unwrap();

        assert_eq!(result.span.byte_start, pattern_start);
        assert_eq!(result.span.line_start, 4);
    }

    /// Byte offsets count both bytes of each `\r\n` line ending.
    #[tokio::test]
    async fn test_regex_search_windows_line_endings() {
        let temp_dir = TempDir::new().unwrap();

        let test_file = temp_dir.path().join("windows_endings.txt");
        let content = "first line\r\nsecond line\r\nmatch this\r\nfourth line\r\n";
        fs::write(&test_file, content).unwrap();

        let options = regex_options("match", test_file, false);

        let results = search(&options).await.unwrap();
        assert_eq!(results.len(), 1);

        let result = &results[0];

        assert_eq!(result.span.line_start, 3);

        // "first line\r\n" is 12 bytes and "second line\r\n" is 13 bytes.
        let expected_byte_start = 25;
        assert_eq!(result.span.byte_start, expected_byte_start);
    }

    /// `split_lines_with_endings` returns the lines without terminators plus
    /// the byte length of each line's terminator (0 when there is none).
    #[test]
    fn test_split_lines_with_endings_helper() {
        // Unix endings (\n).
        let unix_content = "line1\nline2\nline3\n";
        let (unix_lines, unix_endings) = split_lines_with_endings(unix_content);
        assert_eq!(unix_lines, vec!["line1", "line2", "line3"]);
        assert_eq!(unix_endings, vec![1, 1, 1]);

        // Windows endings (\r\n).
        let windows_content = "line1\r\nline2\r\nline3\r\n";
        let (windows_lines, windows_endings) = split_lines_with_endings(windows_content);
        assert_eq!(windows_lines, vec!["line1", "line2", "line3"]);
        assert_eq!(windows_endings, vec![2, 2, 2]);

        // Bare carriage returns (\r).
        let mac_content = "line1\rline2\rline3\r";
        let (mac_lines, mac_endings) = split_lines_with_endings(mac_content);
        assert_eq!(mac_lines, vec!["line1", "line2", "line3"]);
        assert_eq!(mac_endings, vec![1, 1, 1]);

        // Mixed endings.
        let mixed_content = "line1\nline2\r\nline3\r";
        let (mixed_lines, mixed_endings) = split_lines_with_endings(mixed_content);
        assert_eq!(mixed_lines, vec!["line1", "line2", "line3"]);
        assert_eq!(mixed_endings, vec![1, 2, 1]);

        // No terminator at all.
        let no_endings = "single line";
        let (no_lines, no_endings_vec) = split_lines_with_endings(no_endings);
        assert_eq!(no_lines, vec!["single line"]);
        assert_eq!(no_endings_vec, vec![0]);
    }
}