1use anyhow::Result;
2use ck_core::{CkError, IncludePattern, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use rayon::prelude::*;
5use regex::{Regex, RegexBuilder};
6use std::collections::HashMap;
7use std::fs;
8use std::path::PathBuf as StdPathBuf;
9use std::path::{Path, PathBuf};
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{STORED, Schema, TEXT, Value};
13use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
14use walkdir::WalkDir;
15
16mod semantic_v3;
17pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
18
19pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
20pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
22
23fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
27 if ck_core::pdf::is_pdf_file(file_path) {
28 let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
30 if !cache_path.exists() {
31 return Err(anyhow::anyhow!(
32 "PDF not preprocessed. Run 'ck --index' first."
33 ));
34 }
35 Ok(cache_path)
36 } else {
37 Ok(file_path.to_path_buf())
39 }
40}
41
42fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
46 let content_path = resolve_content_path(file_path, repo_root)?;
47 Ok(fs::read_to_string(content_path)?)
48}
49
50async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
52 let repo_root = find_nearest_index_root(file_path)
54 .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
55
56 let content_path = resolve_content_path(file_path, &repo_root)?;
58
59 extract_lines_from_file(&content_path, span.line_start, span.line_end)
61}
62
63fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
65 use std::io::{BufRead, BufReader};
66
67 if line_start == 0 {
68 return Ok(String::new());
69 }
70
71 let file = fs::File::open(file_path)?;
72 let reader = BufReader::new(file);
73 let mut result = Vec::new();
74
75 let start_idx = line_start.saturating_sub(1);
77 let end_idx = line_end.saturating_sub(1);
78
79 for (current_line, line_result) in reader.lines().enumerate() {
80 if current_line > end_idx {
81 break; }
83
84 let line = line_result?;
85
86 if current_line >= start_idx {
87 result.push(line);
88 }
89 }
90
91 if result.is_empty() && line_start > 0 {
93 return Ok(String::new());
94 }
95
96 Ok(result.join("\n"))
97}
98
/// Splits `content` into lines and records the byte length of each line's terminator.
///
/// `\n`, `\r\n` and a lone `\r` all end a line. The two returned vectors are parallel:
/// entry `i` of the second is `1`, `2`, or `0` (unterminated final line) for line `i`.
/// Content that ends with a terminator does not produce a trailing empty line.
fn split_lines_with_endings(content: &str) -> (Vec<String>, Vec<usize>) {
    let mut lines = Vec::new();
    let mut terminator_lens = Vec::new();

    let mut line_start = 0usize;
    let mut scanner = content.bytes().enumerate().peekable();

    while let Some((pos, byte)) = scanner.next() {
        let terminator_len = match byte {
            b'\n' => 1,
            // CRLF counts as a single two-byte terminator; swallow the LF as well.
            b'\r' if matches!(scanner.peek(), Some(&(_, b'\n'))) => {
                scanner.next();
                2
            }
            b'\r' => 1,
            _ => continue,
        };

        lines.push(content[line_start..pos].to_string());
        terminator_lens.push(terminator_len);
        line_start = pos + terminator_len;
    }

    // Whatever follows the last terminator is a final, unterminated line.
    if line_start < content.len() {
        lines.push(content[line_start..].to_string());
        terminator_lens.push(0);
    }

    (lines, terminator_lens)
}
141
/// Produces an absolute form of `path` for comparison against include patterns.
///
/// Uses filesystem canonicalisation when it succeeds. Otherwise an absolute path is
/// returned unchanged and a relative one is joined onto the current directory; if the
/// current directory is unavailable the path is returned as given.
fn canonicalize_for_matching(path: &Path) -> PathBuf {
    match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) if path.is_absolute() => path.to_path_buf(),
        Err(_) => match std::env::current_dir() {
            Ok(cwd) => cwd.join(path),
            Err(_) => path.to_path_buf(),
        },
    }
}
155
156fn path_matches_include(path: &Path, include_patterns: &[IncludePattern]) -> bool {
157 if include_patterns.is_empty() {
158 return true;
159 }
160
161 let candidate = canonicalize_for_matching(path);
162 include_patterns.iter().any(|pattern| {
163 if pattern.is_dir {
164 candidate.starts_with(&pattern.path)
165 } else {
166 candidate == pattern.path
167 }
168 })
169}
170
171fn filter_files_by_include(
172 files: Vec<PathBuf>,
173 include_patterns: &[IncludePattern],
174) -> Vec<PathBuf> {
175 if include_patterns.is_empty() {
176 return files;
177 }
178
179 files
180 .into_iter()
181 .filter(|path| path_matches_include(path, include_patterns))
182 .collect()
183}
184
/// Finds the closest directory at or above `path` that contains a `.ck` entry.
///
/// For an existing file the search starts at its parent directory; otherwise it starts
/// at `path` itself. Returns `None` when no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start` first, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
201
202#[derive(Clone, Debug)]
203pub struct ResolvedModel {
204 pub alias: String,
205 pub config: ck_models::ModelConfig,
206}
207
208impl ResolvedModel {
209 pub fn canonical_name(&self) -> &str {
210 self.config.name.as_str()
211 }
212
213 pub fn dimensions(&self) -> usize {
214 self.config.dimensions
215 }
216}
217
218fn legacy_model_config(name: &str, dimensions: usize) -> ck_models::ModelConfig {
219 ck_models::ModelConfig {
220 name: name.to_string(),
221 provider: "fastembed".to_string(),
222 dimensions,
223 max_tokens: 8192,
224 description: "Legacy ck embedding model preserved for backwards compatibility".to_string(),
225 }
226}
227
228pub(crate) fn resolve_model_from_root(
229 index_root: &Path,
230 cli_model: Option<&str>,
231) -> Result<ResolvedModel> {
232 use ck_models::ModelRegistry;
233
234 let registry = ModelRegistry::default();
235 let index_dir = index_root.join(".ck");
236 let manifest_path = index_dir.join("manifest.json");
237
238 if manifest_path.exists() {
239 let data = std::fs::read(&manifest_path)?;
240 let manifest: ck_index::IndexManifest = serde_json::from_slice(&data)?;
241
242 if let Some(existing_model) = manifest.embedding_model {
243 let dims_hint = manifest.embedding_dimensions.unwrap_or(384);
244 let resolved_existing = match registry.resolve(Some(existing_model.as_str())) {
245 Ok((alias, config)) => ResolvedModel { alias, config },
246 Err(_) => ResolvedModel {
247 alias: existing_model.clone(),
248 config: legacy_model_config(&existing_model, dims_hint),
249 },
250 };
251
252 if let Some(requested) = cli_model {
253 let (requested_alias, requested_config) = registry
254 .resolve(Some(requested))
255 .map_err(|e| CkError::Embedding(e.to_string()))?;
256
257 if requested_config.name != resolved_existing.config.name {
258 let suggested_alias = resolved_existing.alias.clone();
259 return Err(CkError::Embedding(format!(
260 "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
261 resolved_existing.config.name,
262 suggested_alias,
263 requested,
264 requested,
265 suggested_alias
266 ))
267 .into());
268 }
269
270 return Ok(ResolvedModel {
271 alias: requested_alias,
272 config: requested_config,
273 });
274 }
275
276 return Ok(resolved_existing);
277 }
278 }
279
280 let (alias, config) = registry
281 .resolve(cli_model)
282 .map_err(|e| CkError::Embedding(e.to_string()))?;
283
284 Ok(ResolvedModel { alias, config })
285}
286
287pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
288 let index_root = find_nearest_index_root(path).unwrap_or_else(|| {
289 if path.is_file() {
290 path.parent().unwrap_or(path).to_path_buf()
291 } else {
292 path.to_path_buf()
293 }
294 });
295 resolve_model_from_root(&index_root, cli_model)
296}
297
298pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
299 let results = search_enhanced(options).await?;
300 Ok(results.matches)
301}
302
303pub async fn search_with_progress(
304 options: &SearchOptions,
305 progress_callback: Option<SearchProgressCallback>,
306) -> Result<Vec<SearchResult>> {
307 let results = search_enhanced_with_progress(options, progress_callback).await?;
308 Ok(results.matches)
309}
310
311pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
313 search_enhanced_with_progress(options, None).await
314}
315
316pub async fn search_enhanced_with_progress(
318 options: &SearchOptions,
319 progress_callback: Option<SearchProgressCallback>,
320) -> Result<ck_core::SearchResults> {
321 search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
322}
323
324pub async fn search_enhanced_with_indexing_progress(
326 options: &SearchOptions,
327 progress_callback: Option<SearchProgressCallback>,
328 indexing_progress_callback: Option<IndexingProgressCallback>,
329 detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
330) -> Result<ck_core::SearchResults> {
331 if !options.path.exists() {
333 return Err(ck_core::CkError::Search(format!(
334 "Path does not exist: {}",
335 options.path.display()
336 ))
337 .into());
338 }
339
340 if !matches!(options.mode, SearchMode::Regex) {
342 let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
343 let file_options = ck_core::FileCollectionOptions::from(options);
344 ensure_index_updated_with_progress(
345 &options.path,
346 options.reindex,
347 need_embeddings,
348 indexing_progress_callback,
349 detailed_indexing_progress_callback,
350 &file_options,
351 options.embedding_model.as_deref(),
352 )
353 .await?;
354 }
355
356 let search_results = match options.mode {
357 SearchMode::Regex => {
358 let matches = regex_search(options)?;
359 ck_core::SearchResults {
360 matches,
361 closest_below_threshold: None,
362 }
363 }
364 SearchMode::Lexical => {
365 let matches = lexical_search(options).await?;
366 ck_core::SearchResults {
367 matches,
368 closest_below_threshold: None,
369 }
370 }
371 SearchMode::Semantic => {
372 semantic_search_v3_with_progress(options, progress_callback).await?
374 }
375 SearchMode::Hybrid => {
376 let matches = hybrid_search_with_progress(options, progress_callback).await?;
377 ck_core::SearchResults {
378 matches,
379 closest_below_threshold: None,
380 }
381 }
382 };
383
384 Ok(search_results)
385}
386
387fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
388 let pattern = if options.fixed_string {
389 regex::escape(&options.query)
390 } else if options.whole_word {
391 format!(r"\b{}\b", regex::escape(&options.query))
392 } else {
393 options.query.clone()
394 };
395
396 let regex = RegexBuilder::new(&pattern)
397 .case_insensitive(options.case_insensitive)
398 .build()
399 .map_err(CkError::Regex)?;
400
401 let should_recurse = options.path.is_dir() || options.recursive;
403 let files = if should_recurse {
404 let file_options = ck_core::FileCollectionOptions {
406 respect_gitignore: options.respect_gitignore,
407 use_ckignore: options.use_ckignore,
408 exclude_patterns: options.exclude_patterns.clone(),
409 };
410 let collected = ck_index::collect_files(&options.path, &file_options)?;
411 filter_files_by_include(collected, &options.include_patterns)
412 } else {
413 let collected = collect_files(&options.path, should_recurse, &options.exclude_patterns)?;
415 filter_files_by_include(collected, &options.include_patterns)
416 };
417
418 let results: Vec<Vec<SearchResult>> = files
419 .par_iter()
420 .filter_map(|file_path| match search_file(®ex, file_path, options) {
421 Ok(matches) => {
422 if matches.is_empty() {
423 None
424 } else {
425 Some(matches)
426 }
427 }
428 Err(e) => {
429 tracing::debug!("Error searching {:?}: {}", file_path, e);
430 None
431 }
432 })
433 .collect();
434
435 let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
436 all_results.sort_by(|a, b| {
438 let path_cmp = a.file.cmp(&b.file);
439 if path_cmp != std::cmp::Ordering::Equal {
440 return path_cmp;
441 }
442 a.span.line_start.cmp(&b.span.line_start)
443 });
444
445 if let Some(top_k) = options.top_k {
446 all_results.truncate(top_k);
447 }
448
449 Ok(all_results)
450}
451
452fn search_file(
453 regex: &Regex,
454 file_path: &Path,
455 options: &SearchOptions,
456) -> Result<Vec<SearchResult>> {
457 let repo_root = find_nearest_index_root(file_path)
459 .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
460
461 if options.full_section || options.context_lines > 0 {
465 let content = read_file_content(file_path, &repo_root)?;
467 let (lines, line_ending_lengths) = split_lines_with_endings(&content);
468
469 let code_sections = if options.full_section {
471 extract_code_sections(file_path, &content)
472 } else {
473 None
474 };
475
476 search_file_in_memory(
477 regex,
478 file_path,
479 options,
480 &lines,
481 &code_sections,
482 &line_ending_lengths,
483 )
484 } else {
485 search_file_streaming(regex, file_path, &repo_root, options)
487 }
488}
489
490fn search_file_in_memory(
492 regex: &Regex,
493 file_path: &Path,
494 options: &SearchOptions,
495 lines: &[String],
496 code_sections: &Option<Vec<(usize, usize, String)>>,
497 line_ending_lengths: &[usize],
498) -> Result<Vec<SearchResult>> {
499 let mut results = Vec::new();
500 let mut byte_offset = 0;
501
502 for (line_idx, line) in lines.iter().enumerate() {
503 let line_number = line_idx + 1;
504
505 if regex.as_str().is_empty() {
508 let preview = if options.full_section {
510 if let Some(sections) = code_sections {
512 if let Some(section) = find_containing_section(sections, line_idx) {
513 section.clone()
514 } else {
515 get_context_preview(lines, line_idx, options)
517 }
518 } else {
519 get_context_preview(lines, line_idx, options)
520 }
521 } else {
522 get_context_preview(lines, line_idx, options)
523 };
524
525 results.push(SearchResult {
526 file: file_path.to_path_buf(),
527 span: Span {
528 byte_start: byte_offset,
529 byte_end: byte_offset + line.len(),
530 line_start: line_number,
531 line_end: line_number,
532 },
533 score: 1.0,
534 preview,
535 lang: ck_core::Language::from_path(file_path),
536 symbol: None,
537 chunk_hash: None,
538 index_epoch: None,
539 });
540 } else {
541 for mat in regex.find_iter(line) {
543 let preview = if options.full_section {
544 if let Some(sections) = code_sections {
546 if let Some(section) = find_containing_section(sections, line_idx) {
547 section.clone()
548 } else {
549 get_context_preview(lines, line_idx, options)
551 }
552 } else {
553 get_context_preview(lines, line_idx, options)
554 }
555 } else {
556 get_context_preview(lines, line_idx, options)
557 };
558
559 results.push(SearchResult {
560 file: file_path.to_path_buf(),
561 span: Span {
562 byte_start: byte_offset + mat.start(),
563 byte_end: byte_offset + mat.end(),
564 line_start: line_number,
565 line_end: line_number,
566 },
567 score: 1.0,
568 preview,
569 lang: ck_core::Language::from_path(file_path),
570 symbol: None,
571 chunk_hash: None,
572 index_epoch: None,
573 });
574 }
575 }
576
577 byte_offset += line.len();
579 byte_offset += line_ending_lengths.get(line_idx).copied().unwrap_or(0);
580 }
581
582 Ok(results)
583}
584
585fn search_file_streaming(
587 regex: &Regex,
588 file_path: &Path,
589 repo_root: &Path,
590 _options: &SearchOptions,
591) -> Result<Vec<SearchResult>> {
592 use std::io::{BufRead, BufReader};
593
594 let content_path = resolve_content_path(file_path, repo_root)?;
595 let file = std::fs::File::open(&content_path)?;
596 let mut reader = BufReader::new(file);
597
598 let mut results = Vec::new();
599 let mut line = String::new();
600 let mut byte_offset = 0usize;
601 let mut line_number = 1usize;
602
603 loop {
604 line.clear();
605 let bytes_read = reader.read_line(&mut line)?;
606 if bytes_read == 0 {
607 break;
608 }
609
610 let mut newline_len = 0usize;
613 if line.ends_with("\r\n") {
614 line.pop(); line.pop(); newline_len = 2;
617 } else if line.ends_with(['\n', '\r']) {
618 line.pop();
619 newline_len = 1;
620 }
621
622 let treat_cr_as_newline = line.contains('\r');
626
627 if treat_cr_as_newline {
628 let bytes = line.as_bytes();
629 let mut segment_start = 0usize;
630 while segment_start <= bytes.len() {
631 match bytes[segment_start..].iter().position(|&b| b == b'\r') {
632 Some(rel_idx) => {
633 let idx = segment_start + rel_idx;
634 let segment_bytes = &bytes[segment_start..idx];
635 let segment_str = std::str::from_utf8(segment_bytes)?;
636 process_streaming_line(
637 regex,
638 file_path,
639 segment_str,
640 line_number,
641 byte_offset,
642 &mut results,
643 );
644 byte_offset += segment_bytes.len() + 1; line_number += 1;
646 segment_start = idx + 1;
647 }
648 None => {
649 let segment_bytes = &bytes[segment_start..];
650 let segment_str = std::str::from_utf8(segment_bytes)?;
651 process_streaming_line(
652 regex,
653 file_path,
654 segment_str,
655 line_number,
656 byte_offset,
657 &mut results,
658 );
659 byte_offset += segment_bytes.len();
660 line_number += 1;
661 break;
662 }
663 }
664 }
665 byte_offset += newline_len;
666 } else {
667 let line_str = line.as_str();
668 process_streaming_line(
669 regex,
670 file_path,
671 line_str,
672 line_number,
673 byte_offset,
674 &mut results,
675 );
676 byte_offset += line_str.len() + newline_len;
677 line_number += 1;
678 }
679 }
680
681 Ok(results)
682}
683
684fn process_streaming_line(
685 regex: &Regex,
686 file_path: &Path,
687 line: &str,
688 line_number: usize,
689 byte_offset: usize,
690 results: &mut Vec<SearchResult>,
691) {
692 if regex.as_str().is_empty() {
693 results.push(SearchResult {
694 file: file_path.to_path_buf(),
695 span: Span {
696 byte_start: byte_offset,
697 byte_end: byte_offset + line.len(),
698 line_start: line_number,
699 line_end: line_number,
700 },
701 score: 1.0,
702 preview: line.to_string(),
703 lang: ck_core::Language::from_path(file_path),
704 symbol: None,
705 chunk_hash: None,
706 index_epoch: None,
707 });
708 } else {
709 for mat in regex.find_iter(line) {
710 results.push(SearchResult {
711 file: file_path.to_path_buf(),
712 span: Span {
713 byte_start: byte_offset + mat.start(),
714 byte_end: byte_offset + mat.end(),
715 line_start: line_number,
716 line_end: line_number,
717 },
718 score: 1.0,
719 preview: line.to_string(),
720 lang: ck_core::Language::from_path(file_path),
721 symbol: None,
722 chunk_hash: None,
723 index_epoch: None,
724 });
725 }
726 }
727}
728
729async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
730 let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
732 if options.path.is_file() {
733 options.path.parent().unwrap_or(&options.path).to_path_buf()
734 } else {
735 options.path.clone()
736 }
737 });
738
739 let index_dir = index_root.join(".ck");
740 if !index_dir.exists() {
741 return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
742 }
743
744 let tantivy_index_path = index_dir.join("tantivy_index");
745
746 if !tantivy_index_path.exists() {
747 return build_tantivy_index(options).await;
748 }
749
750 let mut schema_builder = Schema::builder();
751 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
752 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
753 let _schema = schema_builder.build();
754
755 let index = Index::open_in_dir(&tantivy_index_path)
756 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {e}")))?;
757
758 let reader = index
759 .reader_builder()
760 .reload_policy(ReloadPolicy::OnCommitWithDelay)
761 .try_into()
762 .map_err(|e| CkError::Index(format!("Failed to create index reader: {e}")))?;
763
764 let searcher = reader.searcher();
765 let query_parser = QueryParser::for_index(&index, vec![content_field]);
766
767 let query = query_parser
768 .parse_query(&options.query)
769 .map_err(|e| CkError::Search(format!("Failed to parse query: {e}")))?;
770
771 let top_docs = if let Some(top_k) = options.top_k {
772 searcher.search(&query, &TopDocs::with_limit(top_k))?
773 } else {
774 searcher.search(&query, &TopDocs::with_limit(100))?
775 };
776
777 let mut raw_results = Vec::new();
779 for (_score, doc_address) in top_docs {
780 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
781 let path_text = retrieved_doc
782 .get_first(path_field)
783 .map(|field_value| field_value.as_str().unwrap_or(""))
784 .unwrap_or("");
785 let content_text = retrieved_doc
786 .get_first(content_field)
787 .map(|field_value| field_value.as_str().unwrap_or(""))
788 .unwrap_or("");
789
790 let file_path = PathBuf::from(path_text);
791 if !path_matches_include(&file_path, &options.include_patterns) {
792 continue;
793 }
794 let preview = if options.full_section {
795 content_text.to_string()
796 } else {
797 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
798 };
799
800 raw_results.push((
801 _score,
802 SearchResult {
803 file: file_path,
804 span: Span {
805 byte_start: 0,
806 byte_end: content_text.len(),
807 line_start: 1,
808 line_end: content_text.lines().count(),
809 },
810 score: _score,
811 preview,
812 lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
813 symbol: None,
814 chunk_hash: None,
815 index_epoch: None,
816 },
817 ));
818 }
819
820 let mut results = Vec::new();
822 if !raw_results.is_empty() {
823 let max_score = raw_results
824 .iter()
825 .map(|(score, _)| *score)
826 .fold(0.0f32, f32::max);
827 if max_score > 0.0 {
828 for (raw_score, mut result) in raw_results {
829 let normalized_score = raw_score / max_score;
830
831 if let Some(threshold) = options.threshold
833 && normalized_score < threshold
834 {
835 continue;
836 }
837
838 result.score = normalized_score;
839 results.push(result);
840 }
841 }
842 }
843
844 Ok(results)
845}
846
847async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
848 let index_root = if options.path.is_file() {
850 options.path.parent().unwrap_or(&options.path)
851 } else {
852 &options.path
853 };
854
855 let index_dir = index_root.join(".ck");
856 let tantivy_index_path = index_dir.join("tantivy_index");
857
858 fs::create_dir_all(&tantivy_index_path)?;
859
860 let mut schema_builder = Schema::builder();
861 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
862 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
863 let schema = schema_builder.build();
864
865 let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
866 .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {e}")))?;
867
868 let mut index_writer = index
869 .writer(50_000_000)
870 .map_err(|e| CkError::Index(format!("Failed to create index writer: {e}")))?;
871
872 let files = filter_files_by_include(
873 collect_files(index_root, true, &options.exclude_patterns)?,
874 &options.include_patterns,
875 );
876
877 for file_path in &files {
878 if let Ok(content) = fs::read_to_string(file_path) {
879 let doc = doc!(
880 content_field => content,
881 path_field => file_path.display().to_string()
882 );
883 index_writer.add_document(doc)?;
884 }
885 }
886
887 index_writer
888 .commit()
889 .map_err(|e| CkError::Index(format!("Failed to commit index: {e}")))?;
890
891 let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
893 let mut schema_builder = Schema::builder();
894 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
895 let path_field = schema_builder.add_text_field("path", TEXT | STORED);
896 let _schema = schema_builder.build();
897
898 let index = Index::open_in_dir(&tantivy_index_path)
899 .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {e}")))?;
900
901 let reader = index
902 .reader_builder()
903 .reload_policy(ReloadPolicy::OnCommitWithDelay)
904 .try_into()
905 .map_err(|e| CkError::Index(format!("Failed to create index reader: {e}")))?;
906
907 let searcher = reader.searcher();
908 let query_parser = QueryParser::for_index(&index, vec![content_field]);
909
910 let query = query_parser
911 .parse_query(&options.query)
912 .map_err(|e| CkError::Search(format!("Failed to parse query: {e}")))?;
913
914 let top_docs = if let Some(top_k) = options.top_k {
915 searcher.search(&query, &TopDocs::with_limit(top_k))?
916 } else {
917 searcher.search(&query, &TopDocs::with_limit(100))?
918 };
919
920 let mut raw_results = Vec::new();
922 for (_score, doc_address) in top_docs {
923 let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
924 let path_text = retrieved_doc
925 .get_first(path_field)
926 .map(|field_value| field_value.as_str().unwrap_or(""))
927 .unwrap_or("");
928 let content_text = retrieved_doc
929 .get_first(content_field)
930 .map(|field_value| field_value.as_str().unwrap_or(""))
931 .unwrap_or("");
932
933 let file_path = PathBuf::from(path_text);
934 let preview = if options.full_section {
935 content_text.to_string()
936 } else {
937 content_text.lines().take(3).collect::<Vec<_>>().join("\n")
938 };
939
940 raw_results.push((
941 _score,
942 SearchResult {
943 file: file_path,
944 span: Span {
945 byte_start: 0,
946 byte_end: content_text.len(),
947 line_start: 1,
948 line_end: content_text.lines().count(),
949 },
950 score: _score,
951 preview,
952 lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
953 symbol: None,
954 chunk_hash: None,
955 index_epoch: None,
956 },
957 ));
958 }
959
960 let mut results = Vec::new();
962 if !raw_results.is_empty() {
963 let max_score = raw_results
964 .iter()
965 .map(|(score, _)| *score)
966 .fold(0.0f32, f32::max);
967 if max_score > 0.0 {
968 for (raw_score, mut result) in raw_results {
969 let normalized_score = raw_score / max_score;
970
971 if let Some(threshold) = options.threshold
973 && normalized_score < threshold
974 {
975 continue;
976 }
977
978 result.score = normalized_score;
979 results.push(result);
980 }
981 }
982 }
983
984 Ok(results)
985}
986
987#[allow(dead_code)]
988async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
989 hybrid_search_with_progress(options, None).await
990}
991
992async fn hybrid_search_with_progress(
993 options: &SearchOptions,
994 progress_callback: Option<SearchProgressCallback>,
995) -> Result<Vec<SearchResult>> {
996 if let Some(ref callback) = progress_callback {
997 callback("Running regex search...");
998 }
999 let regex_results = regex_search(options)?;
1000
1001 if let Some(ref callback) = progress_callback {
1002 callback("Running semantic search...");
1003 }
1004 let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
1005
1006 let mut combined = HashMap::new();
1007
1008 for (rank, result) in regex_results.iter().enumerate() {
1009 let key = format!("{}:{}", result.file.display(), result.span.line_start);
1010 combined
1011 .entry(key)
1012 .or_insert(Vec::new())
1013 .push((rank + 1, result.clone()));
1014 }
1015
1016 for (rank, result) in semantic_results.matches.iter().enumerate() {
1017 let key = format!("{}:{}", result.file.display(), result.span.line_start);
1018 combined
1019 .entry(key)
1020 .or_insert(Vec::new())
1021 .push((rank + 1, result.clone()));
1022 }
1023
1024 let mut rrf_results: Vec<SearchResult> = combined
1026 .into_values()
1027 .map(|ranks| {
1028 let mut result = ranks[0].1.clone();
1029 let rrf_score = ranks
1030 .iter()
1031 .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
1032 .sum();
1033 result.score = rrf_score;
1034 result
1035 })
1036 .filter(|result| {
1037 if let Some(threshold) = options.threshold {
1039 result.score >= threshold
1040 } else {
1041 true
1042 }
1043 })
1044 .collect();
1045
1046 rrf_results.retain(|result| path_matches_include(&result.file, &options.include_patterns));
1047
1048 rrf_results.sort_by(|a, b| {
1050 b.score
1051 .partial_cmp(&a.score)
1052 .unwrap_or(std::cmp::Ordering::Equal)
1053 });
1054
1055 if let Some(top_k) = options.top_k {
1056 rrf_results.truncate(top_k);
1057 }
1058
1059 Ok(rrf_results)
1060}
1061
1062fn build_globset(patterns: &[String]) -> GlobSet {
1063 let mut builder = GlobSetBuilder::new();
1064 for pat in patterns {
1065 if let Ok(glob) = Glob::new(pat) {
1067 builder.add(glob);
1068 }
1069 }
1070 builder.build().unwrap_or_else(|_| GlobSet::empty())
1071}
1072
1073fn should_exclude_path(path: &Path, globset: &GlobSet) -> bool {
1074 if globset.is_match(path) {
1076 return true;
1077 }
1078 for component in path.components() {
1079 if let std::path::Component::Normal(name) = component
1080 && globset.is_match(name)
1081 {
1082 return true;
1083 }
1084 }
1085 false
1086}
1087
1088fn collect_files(
1089 path: &Path,
1090 recursive: bool,
1091 exclude_patterns: &[String],
1092) -> Result<Vec<PathBuf>> {
1093 let mut files = Vec::new();
1094 let globset = build_globset(exclude_patterns);
1095
1096 if path.is_file() {
1097 files.push(path.to_path_buf());
1099 } else if recursive {
1100 for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
1101 let name = e.file_name();
1103 !globset.is_match(e.path()) && !globset.is_match(name)
1104 }) {
1105 match entry {
1106 Ok(entry) => {
1107 if entry.file_type().is_file() && !should_exclude_path(entry.path(), &globset) {
1108 files.push(entry.path().to_path_buf());
1109 }
1110 }
1111 Err(e) => {
1112 tracing::debug!("Skipping path due to error: {}", e);
1114 continue;
1115 }
1116 }
1117 }
1118 } else {
1119 match fs::read_dir(path) {
1120 Ok(read_dir) => {
1121 for entry in read_dir {
1122 match entry {
1123 Ok(entry) => {
1124 let path = entry.path();
1125 if path.is_file() && !should_exclude_path(&path, &globset) {
1126 files.push(path);
1127 }
1128 }
1129 Err(e) => {
1130 tracing::debug!("Skipping directory entry due to error: {}", e);
1131 continue;
1132 }
1133 }
1134 }
1135 }
1136 Err(e) => {
1137 tracing::debug!("Cannot read directory {:?}: {}", path, e);
1138 return Err(e.into());
1139 }
1140 }
1141 }
1142
1143 Ok(files)
1144}
1145
1146async fn ensure_index_updated_with_progress(
1147 path: &Path,
1148 force_reindex: bool,
1149 need_embeddings: bool,
1150 progress_callback: Option<ck_index::ProgressCallback>,
1151 detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
1152 file_options: &ck_core::FileCollectionOptions,
1153 model_override: Option<&str>,
1154) -> Result<()> {
1155 let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
1157 if path.is_file() {
1158 path.parent().unwrap_or(path).to_path_buf()
1159 } else {
1160 path.to_path_buf()
1161 }
1162 });
1163 let index_root = &index_root_buf;
1164
1165 if force_reindex {
1168 let stats = ck_index::smart_update_index_with_detailed_progress(
1169 index_root,
1170 true,
1171 progress_callback,
1172 detailed_progress_callback,
1173 need_embeddings,
1174 file_options,
1175 model_override,
1176 )
1177 .await?;
1178 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1179 tracing::info!(
1180 "Index updated: {} files indexed, {} orphaned files removed",
1181 stats.files_indexed,
1182 stats.orphaned_files_removed
1183 );
1184 }
1185 return Ok(());
1186 }
1187
1188 if path.is_file() {
1191 use ck_index::index_file;
1193 index_file(path, need_embeddings).await?;
1194 } else {
1195 let stats = ck_index::smart_update_index_with_detailed_progress(
1197 index_root,
1198 false,
1199 progress_callback,
1200 detailed_progress_callback,
1201 need_embeddings,
1202 file_options,
1203 model_override,
1204 )
1205 .await?;
1206 if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1207 tracing::info!(
1208 "Index updated: {} files indexed, {} orphaned files removed",
1209 stats.files_indexed,
1210 stats.orphaned_files_removed
1211 );
1212 }
1213 }
1214
1215 Ok(())
1216}
1217
1218fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
1219 let before = options.before_context_lines.max(options.context_lines);
1220 let after = options.after_context_lines.max(options.context_lines);
1221
1222 if before > 0 || after > 0 {
1223 let start_idx = line_idx.saturating_sub(before);
1224 let end_idx = (line_idx + after + 1).min(lines.len());
1225 lines[start_idx..end_idx].join("\n")
1226 } else {
1227 lines[line_idx].to_string()
1228 }
1229}
1230
1231fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1232 let lang = ck_core::Language::from_path(file_path)?;
1233
1234 if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1236 let include_markdown = lang == ck_core::Language::Markdown;
1237 let sections: Vec<(usize, usize, String)> = chunks
1238 .into_iter()
1239 .filter(|chunk| {
1240 if include_markdown {
1241 matches!(
1242 chunk.chunk_type,
1243 ck_chunk::ChunkType::Module | ck_chunk::ChunkType::Text
1244 )
1245 } else {
1246 matches!(
1247 chunk.chunk_type,
1248 ck_chunk::ChunkType::Function
1249 | ck_chunk::ChunkType::Class
1250 | ck_chunk::ChunkType::Method
1251 )
1252 }
1253 })
1254 .map(|chunk| {
1255 (
1256 chunk.span.line_start - 1, chunk.span.line_end - 1,
1258 chunk.text,
1259 )
1260 })
1261 .collect();
1262
1263 if sections.is_empty() {
1264 None
1265 } else {
1266 Some(sections)
1267 }
1268 } else {
1269 None
1270 }
1271}
1272
/// Returns the text of the first section whose inclusive `[start, end]` line
/// range contains `line_idx`, or `None` when no section covers that line.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1284
1285#[cfg(test)]
1286mod tests {
1287 use super::*;
1288 use std::fs;
1289 use tempfile::TempDir;
1290
/// Writes a fixed set of four small fixture files into `dir` and returns their
/// paths in the order they were written.
fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
    const FIXTURES: [(&str, &str); 4] = [
        ("test1.txt", "hello world rust programming"),
        ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
        ("test3.py", "print('Hello Python')"),
        ("test4.txt", "machine learning artificial intelligence"),
    ];

    FIXTURES
        .iter()
        .map(|&(file_name, body)| {
            let target = dir.join(file_name);
            fs::write(&target, body).unwrap();
            target
        })
        .collect()
}
1307
/// `extract_lines_from_file` returns the requested inclusive, 1-based line
/// range, clamps a range that runs past the end of the file, and yields an
/// empty string for a start line of 0 or a range entirely beyond the file.
#[test]
fn test_extract_lines_from_file() {
    let scratch = TempDir::new().unwrap();
    let sample_path = scratch.path().join("test_lines.txt");
    fs::write(
        &sample_path,
        "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10",
    )
    .unwrap();

    // (first line, last line, expected text)
    let cases = [
        (3, 5, "Line 3\nLine 4\nLine 5"),
        (7, 7, "Line 7"),
        (8, 100, "Line 8\nLine 9\nLine 10"),
        (0, 5, ""),
        (20, 25, ""),
    ];

    for (first, last, expected) in cases {
        let extracted = extract_lines_from_file(&sample_path, first, last).unwrap();
        assert_eq!(extracted, expected);
    }
}
1338
1339 #[tokio::test]
1340 async fn test_extract_content_from_span() {
1341 let temp_dir = TempDir::new().unwrap();
1342 let test_file = temp_dir.path().join("code.rs");
1343
1344 let content = "fn first() {\n println!(\"First\");\n}\n\nfn second() {\n println!(\"Second\");\n}\n\nfn third() {\n println!(\"Third\");\n}";
1346 fs::write(&test_file, content).unwrap();
1347
1348 let span = ck_core::Span {
1350 byte_start: 0, byte_end: 0, line_start: 5,
1353 line_end: 7,
1354 };
1355
1356 let result = extract_content_from_span(&test_file, &span).await.unwrap();
1357 assert_eq!(result, "fn second() {\n println!(\"Second\");\n}");
1358
1359 let span = ck_core::Span {
1361 byte_start: 0,
1362 byte_end: 0,
1363 line_start: 2,
1364 line_end: 2,
1365 };
1366
1367 let result = extract_content_from_span(&test_file, &span).await.unwrap();
1368 assert_eq!(result, " println!(\"First\");");
1369 }
1370
/// `collect_files` finds all four fixtures in a flat directory regardless of
/// the recursive flag, and returns just the file itself when given a file path.
#[test]
fn test_collect_files() {
    let scratch = TempDir::new().unwrap();
    let fixtures = create_test_files(scratch.path());

    for recursive in [false, true] {
        let found = collect_files(scratch.path(), recursive, &[]).unwrap();
        assert_eq!(found.len(), 4);
    }

    let single = collect_files(&fixtures[0], false, &[]).unwrap();
    assert_eq!(single.len(), 1);
    assert_eq!(single[0], fixtures[0]);
}
1389
/// A plain regex search over the fixture directory yields results, at least one
/// of which mentions "rust" (in any letter case) in its preview.
#[test]
fn test_regex_search() {
    let scratch = TempDir::new().unwrap();
    create_test_files(scratch.path());

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("rust"),
        path: scratch.path().to_owned(),
        recursive: true,
        ..SearchOptions::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(!hits.is_empty());

    let mentions_rust = hits
        .iter()
        .any(|hit| hit.preview.to_lowercase().contains("rust"));
    assert!(mentions_rust);
}
1413
/// With `case_insensitive` set, an upper-case query still produces results
/// against the mixed-case fixture files.
#[test]
fn test_regex_search_case_insensitive() {
    let scratch = TempDir::new().unwrap();
    create_test_files(scratch.path());

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("HELLO"),
        path: scratch.path().to_owned(),
        recursive: true,
        case_insensitive: true,
        ..SearchOptions::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(!hits.is_empty());
}
1431
/// With `fixed_string` set, a query containing regex metacharacters
/// (`fn main()`) still produces results against the fixture files.
#[test]
fn test_regex_search_fixed_string() {
    let scratch = TempDir::new().unwrap();
    create_test_files(scratch.path());

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("fn main()"),
        path: scratch.path().to_owned(),
        recursive: true,
        fixed_string: true,
        ..SearchOptions::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(!hits.is_empty());
}
1449
/// With `whole_word` set, searching for "rust" in a file that also contains
/// "rusty" and "rustacean" still produces at least one result.
#[test]
fn test_regex_search_whole_word() {
    let scratch = TempDir::new().unwrap();
    let word_file = scratch.path().join("word_test.txt");
    fs::write(word_file, "rust rusty rustacean").unwrap();

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("rust"),
        path: scratch.path().to_owned(),
        recursive: true,
        whole_word: true,
        ..SearchOptions::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(!hits.is_empty());
}
1472
/// With ten matching files and `top_k` set to 5, no more than five results
/// come back.
#[test]
fn test_regex_search_top_k() {
    let scratch = TempDir::new().unwrap();

    for file_name in (0..10).map(|i| format!("file{i}.txt")) {
        fs::write(scratch.path().join(file_name), "test content").unwrap();
    }

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("test"),
        path: scratch.path().to_owned(),
        recursive: true,
        top_k: Some(5),
        ..SearchOptions::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(hits.len() <= 5);
}
1494
/// Every match gets its own result with a file-relative byte offset: three on
/// line 1, one on line 2, and all five offsets across the file are distinct.
#[test]
fn test_regex_search_span_offsets() {
    let scratch = TempDir::new().unwrap();
    let sample = scratch.path().join("spans.txt");
    fs::write(&sample, "test test test\nline two test\ntest end").unwrap();

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("test"),
        path: sample.clone(),
        recursive: false,
        ..SearchOptions::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert_eq!(hits.len(), 5);

    // Byte offsets of the matches reported on a given line, in result order.
    let starts_on_line = |line: usize| -> Vec<usize> {
        hits.iter()
            .filter(|hit| hit.span.line_start == line)
            .map(|hit| hit.span.byte_start)
            .collect()
    };

    assert_eq!(starts_on_line(1), vec![0, 5, 10]);
    // Line 2 begins at byte 15; "test" sits 9 bytes into it.
    assert_eq!(starts_on_line(2), vec![24]);

    let distinct_starts: std::collections::BTreeSet<usize> =
        hits.iter().map(|hit| hit.span.byte_start).collect();
    assert_eq!(distinct_starts.len(), 5);
}
1533
/// `search_file` reports the single matching line of a three-line file with
/// line number 3 and a preview containing the matched text.
#[test]
fn test_search_file() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.txt");
    fs::write(
        &file_path,
        "line 1: hello\nline 2: world\nline 3: rust programming",
    )
    .unwrap();

    let regex = regex::Regex::new("rust").unwrap();
    let options = SearchOptions::default();

    // The compiled pattern is passed by reference.
    let results = search_file(&regex, &file_path, &options).unwrap();
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].span.line_start, 3);
    assert!(results[0].preview.contains("rust"));
}
1552
/// With `context_lines` set to 1, the preview of a match contains the matching
/// line plus the line directly before and the line directly after it.
#[test]
fn test_search_file_with_context() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.txt");
    fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

    let regex = regex::Regex::new("target").unwrap();
    let options = SearchOptions {
        context_lines: 1,
        ..Default::default()
    };

    // The compiled pattern is passed by reference.
    let results = search_file(&regex, &file_path, &options).unwrap();
    assert_eq!(results.len(), 1);

    // Printed to aid debugging when one of the assertions below fails.
    println!("Preview: '{}'", results[0].preview);

    assert!(results[0].preview.contains("line 2"));
    assert!(results[0].preview.contains("target line"));
    assert!(results[0].preview.contains("line 4"));
}
1576
1577 #[tokio::test]
1578 async fn test_search_main_function() {
1579 let temp_dir = TempDir::new().unwrap();
1580 create_test_files(temp_dir.path());
1581
1582 let options = SearchOptions {
1583 mode: SearchMode::Regex,
1584 query: "hello".to_string(),
1585 path: temp_dir.path().to_path_buf(),
1586 recursive: true,
1587 case_insensitive: true,
1588 ..Default::default()
1589 };
1590
1591 let results = search(&options).await.unwrap();
1592 assert!(!results.is_empty());
1593 }
1594
1595 #[tokio::test]
1596 async fn test_regex_search_mixed_line_endings() {
1597 let temp_dir = TempDir::new().unwrap();
1599
1600 let test_file = temp_dir.path().join("mixed_endings.txt");
1602 let content = "line1\r\nline2\nline3\r\npattern here\nline5\r\n";
1603 std::fs::write(&test_file, content).unwrap();
1604
1605 let options = SearchOptions {
1606 mode: SearchMode::Regex,
1607 query: "pattern".to_string(),
1608 path: test_file.clone(),
1609 recursive: false,
1610 ..Default::default()
1611 };
1612
1613 let results = search(&options).await.unwrap();
1614 assert_eq!(results.len(), 1);
1615
1616 let result = &results[0];
1617 let original_content = std::fs::read_to_string(&test_file).unwrap();
1619 let pattern_start = original_content.find("pattern").unwrap();
1620
1621 assert_eq!(result.span.byte_start, pattern_start);
1622 assert_eq!(result.span.line_start, 4); }
1624
1625 #[tokio::test]
1626 async fn test_regex_search_windows_line_endings() {
1627 let temp_dir = TempDir::new().unwrap();
1629
1630 let test_file = temp_dir.path().join("windows_endings.txt");
1631 let content = "first line\r\nsecond line\r\nmatch this\r\nfourth line\r\n";
1632 std::fs::write(&test_file, content).unwrap();
1633
1634 let options = SearchOptions {
1635 mode: SearchMode::Regex,
1636 query: "match".to_string(),
1637 path: test_file.clone(),
1638 recursive: false,
1639 ..Default::default()
1640 };
1641
1642 let results = search(&options).await.unwrap();
1643 assert_eq!(results.len(), 1);
1644
1645 let result = &results[0];
1646
1647 assert_eq!(result.span.line_start, 3);
1649
1650 let expected_byte_start = 25; assert_eq!(result.span.byte_start, expected_byte_start);
1654 }
1655
/// `split_lines_with_endings` returns the line texts without terminators plus
/// the byte length of each line's terminator: 1 for `\n` or a lone `\r`, 2 for
/// `\r\n`, and 0 for a final line with no terminator.
#[test]
fn test_split_lines_with_endings_helper() {
    {
        // Unix: "\n" only.
        let (lines, endings) = split_lines_with_endings("line1\nline2\nline3\n");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![1, 1, 1]);
    }

    {
        // Windows: "\r\n" only.
        let (lines, endings) = split_lines_with_endings("line1\r\nline2\r\nline3\r\n");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![2, 2, 2]);
    }

    {
        // Lone "\r" terminators.
        let (lines, endings) = split_lines_with_endings("line1\rline2\rline3\r");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![1, 1, 1]);
    }

    {
        // All three terminator kinds in one input.
        let (lines, endings) = split_lines_with_endings("line1\nline2\r\nline3\r");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![1, 2, 1]);
    }

    {
        // No terminator at all.
        let (lines, endings) = split_lines_with_endings("single line");
        assert_eq!(lines, vec!["single line"]);
        assert_eq!(endings, vec![0]);
    }
}
1688
/// After a semantic search over the parent directory, a semantic search scoped
/// to a subdirectory must return no `*.tmp` files (excluded by the parent's
/// `.ckignore`), must still find `nested.txt`, and must not create a `.ck`
/// directory inside the subdirectory.
#[cfg(feature = "fastembed")]
#[tokio::test]
async fn test_subdirectory_search_uses_parent_ckignore() {
    let scratch = TempDir::new().unwrap();
    let parent = scratch.path();
    let subdir = parent.join("subproject");
    fs::create_dir(&subdir).unwrap();

    // Fixture layout, written in this order.
    let fixtures = [
        (parent.join(".ckignore"), "*.tmp\n"),
        (parent.join("parent.txt"), "searchable content in parent"),
        (parent.join("ignored.tmp"), "this should not be indexed"),
        (subdir.join("nested.txt"), "searchable content in subdir"),
        (
            subdir.join("also_ignored.tmp"),
            "this should not be indexed either",
        ),
    ];
    for (location, body) in fixtures {
        fs::write(location, body).unwrap();
    }

    // Both searches share every setting except the query and the root path.
    let semantic_options = |query: &str, root: &std::path::Path| SearchOptions {
        mode: SearchMode::Semantic,
        query: query.to_string(),
        path: root.to_path_buf(),
        top_k: Some(10),
        threshold: Some(0.1),
        ..Default::default()
    };

    // First search runs against the parent; its outcome is deliberately ignored.
    let parent_options = semantic_options("searchable", parent);
    let _ = search(&parent_options).await;

    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    let subdir_options = semantic_options("content", &subdir);
    let hits = search(&subdir_options).await.unwrap();

    let leaked_tmp: Vec<_> = hits
        .iter()
        .map(|hit| &hit.file)
        .filter(|file| file.to_string_lossy().ends_with(".tmp"))
        .collect();
    assert!(
        leaked_tmp.is_empty(),
        "Bug: .tmp files were indexed despite parent .ckignore. Found {} .tmp files: {:?}",
        leaked_tmp.len(),
        leaked_tmp
    );

    let found_nested = hits.iter().any(|hit| hit.file.ends_with("nested.txt"));
    assert!(found_nested, "Should find nested.txt in subdirectory");

    let subdir_index = subdir.join(".ck");
    assert!(
        !subdir_index.exists(),
        "Should not create .ck directory in subdirectory"
    );
}
1770
/// With `.ckignore` files at three nesting levels (`*.log`, `*.tmp`,
/// `*.cache`), a semantic search scoped to the deepest directory must return
/// none of the ignored extensions while still returning `.txt` files.
#[cfg(feature = "fastembed")]
#[tokio::test]
async fn test_multiple_ckignore_files_merge_correctly() {
    let scratch = TempDir::new().unwrap();
    let parent = scratch.path();
    let subdir = parent.join("subdir");
    let deeper = subdir.join("deeper");
    fs::create_dir(&subdir).unwrap();
    fs::create_dir(&deeper).unwrap();

    // Fixture layout, written in this order: ignore files first, then content.
    let fixtures = [
        (parent.join(".ckignore"), "*.log\n"),
        (subdir.join(".ckignore"), "*.tmp\n"),
        (deeper.join(".ckignore"), "*.cache\n"),
        (parent.join("root.txt"), "searchable"),
        (parent.join("root.log"), "should be ignored"),
        (subdir.join("mid.txt"), "searchable"),
        (subdir.join("mid.log"), "should be ignored by parent"),
        (subdir.join("mid.tmp"), "should be ignored by local"),
        (deeper.join("deep.txt"), "searchable"),
        (deeper.join("deep.log"), "should be ignored by grandparent"),
        (deeper.join("deep.tmp"), "should be ignored by parent"),
        (deeper.join("deep.cache"), "should be ignored by local"),
    ];
    for (location, body) in fixtures {
        fs::write(location, body).unwrap();
    }

    // Both searches share every setting except the query and the root path.
    let semantic_options = |query: &str, root: &std::path::Path| SearchOptions {
        mode: SearchMode::Semantic,
        query: query.to_string(),
        path: root.to_path_buf(),
        top_k: Some(20),
        threshold: Some(0.1),
        ..Default::default()
    };

    // First search runs against the parent; its outcome is deliberately ignored.
    let parent_options = semantic_options("searchable", parent);
    let _ = search(&parent_options).await;
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    let deeper_options = semantic_options("ignored", &deeper);
    let hits = search(&deeper_options).await.unwrap();

    // True when any returned file path ends with the given suffix.
    let any_with_suffix = |suffix: &str| {
        hits.iter()
            .any(|hit| hit.file.to_string_lossy().ends_with(suffix))
    };

    assert!(
        !any_with_suffix(".log"),
        "*.log files should be excluded by parent .ckignore"
    );
    assert!(
        !any_with_suffix(".tmp"),
        "*.tmp files should be excluded by subdir .ckignore"
    );
    assert!(
        !any_with_suffix(".cache"),
        "*.cache files should be excluded by deeper .ckignore"
    );

    assert!(
        any_with_suffix(".txt"),
        "Should find .txt files (not ignored)"
    );
}
1861
/// A semantic search scoped to one directory with a small `top_k` must still
/// return results from that directory, and only from it, even when a sibling
/// directory holds many files with very similar content.
#[cfg(feature = "fastembed")]
#[tokio::test]
async fn test_scoped_search_does_not_lose_results_to_global_top_k() {
    let scratch = TempDir::new().unwrap();
    let parent = scratch.path();
    let noisy = parent.join("noisy");
    let scoped = parent.join("scoped");
    fs::create_dir(&noisy).unwrap();
    fs::create_dir(&scoped).unwrap();

    // Eight near-identical files outside the scope that will be searched.
    for i in 0..8 {
        let body = format!(
            "function open_database_connection_{i}() {{\n \
             // establish a database connection to postgres\n \
             // handle database connection errors gracefully\n}}\n"
        );
        fs::write(noisy.join(format!("noise_{i}.txt")), body).unwrap();
    }

    // The single file inside the scope.
    fs::write(
        scoped.join("target.txt"),
        "function connect() {\n \
         // open a database connection to the primary store\n \
         // database connection pool config goes here\n}\n",
    )
    .unwrap();

    // Both searches use the same query and threshold; only root and top_k differ.
    let semantic_options = |root: &std::path::Path, top_k: usize| SearchOptions {
        mode: SearchMode::Semantic,
        query: "database connection".to_string(),
        path: root.to_path_buf(),
        top_k: Some(top_k),
        threshold: Some(0.0),
        ..Default::default()
    };

    // First search runs against the parent; its outcome is deliberately ignored.
    let index_options = semantic_options(parent, 20);
    let _ = search(&index_options).await;
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    let scoped_options = semantic_options(&scoped, 3);
    let hits = search(&scoped_options).await.unwrap();

    assert!(
        !hits.is_empty(),
        "Scoped search returned zero results — top_k was applied \
         before the path filter (the bug this test guards against)."
    );

    // A hit is out of scope when its path neither starts with the scoped
    // directory nor canonicalizes to the target file.
    let escaped_scope = hits.iter().any(|hit| {
        !hit.file.starts_with(&scoped)
            && hit.file.canonicalize().ok() != scoped.join("target.txt").canonicalize().ok()
    });
    assert!(
        !escaped_scope,
        "Some results leaked out of the requested scope: {:?}",
        hits.iter().map(|hit| &hit.file).collect::<Vec<_>>()
    );
}
1954}