ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, IncludePattern, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use rayon::prelude::*;
5use regex::{Regex, RegexBuilder};
6use std::collections::HashMap;
7use std::fs;
8use std::path::PathBuf as StdPathBuf;
9use std::path::{Path, PathBuf};
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{STORED, Schema, TEXT, Value};
13use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
14use walkdir::WalkDir;
15
16mod semantic_v3;
17pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
18
/// Boxed callback that receives a `&str`; the `Send + Sync` bounds allow it to be
/// shared across threads. Accepted as the optional search progress hook by the
/// `*_with_progress` search entry points in this file.
19pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed `Send + Sync` callback that receives a `&str`. Forwarded to
/// `ensure_index_updated_with_progress` when a search triggers an index update.
20pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed `Send + Sync` callback that receives a `ck_index::EmbeddingProgress` value.
/// Forwarded to `ensure_index_updated_with_progress` alongside the plain indexing callback.
21pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
22
23/// Resolve the actual file path to read content from
24/// For PDFs: returns cache path and validates it exists
25/// For regular files: returns original path
26fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
27    if ck_core::pdf::is_pdf_file(file_path) {
28        // PDFs: Read from cached extracted text
29        let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
30        if !cache_path.exists() {
31            return Err(anyhow::anyhow!(
32                "PDF not preprocessed. Run 'ck --index' first."
33            ));
34        }
35        Ok(cache_path)
36    } else {
37        // Regular files: Read from original source
38        Ok(file_path.to_path_buf())
39    }
40}
41
42/// Read content from file for search result extraction
43/// Regular files: read directly from source
44/// PDFs: read from preprocessed cache
45fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
46    let content_path = resolve_content_path(file_path, repo_root)?;
47    Ok(fs::read_to_string(content_path)?)
48}
49
50/// Extract content from a file using a span (streaming version)
51async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
52    // Find repo root to locate cache
53    let repo_root = find_nearest_index_root(file_path)
54        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
55
56    // Use centralized path resolution
57    let content_path = resolve_content_path(file_path, &repo_root)?;
58
59    // Stream only the needed lines
60    extract_lines_from_file(&content_path, span.line_start, span.line_end)
61}
62
/// Read lines `line_start..=line_end` (1-based, inclusive) from `file_path` and
/// return them joined with `\n`.
///
/// The file is consumed line by line and reading stops once `line_end` has been
/// reached, so the whole file is never held in memory. Original line
/// terminators are not preserved.
///
/// An empty string is returned, without opening the file, when the requested
/// range is empty: `line_start == 0` or `line_end < line_start`. An empty
/// string is also returned when the file has fewer than `line_start` lines.
///
/// # Errors
/// Fails if the file cannot be opened, or if any line up to and including
/// `line_end` cannot be read (I/O error or invalid UTF-8) — this includes
/// lines that precede `line_start`.
fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
    use std::io::{BufRead, BufReader};

    // Reject empty ranges up front. In particular (line_start = 1, line_end = 0)
    // must not yield the first line of the file.
    if line_start == 0 || line_end < line_start {
        return Ok(String::new());
    }

    let reader = BufReader::new(fs::File::open(file_path)?);
    let mut selected = Vec::new();

    // `take(line_end)` stops pulling from the reader after the last wanted line.
    for (index, line) in reader.lines().enumerate().take(line_end) {
        let line = line?;
        if index + 1 >= line_start {
            selected.push(line);
        }
    }

    Ok(selected.join("\n"))
}
98
/// Break `content` into lines and record how many terminator bytes followed each one.
///
/// Recognised terminators are `\n` (1 byte), `\r\n` (2 bytes) and a lone `\r`
/// (1 byte). The two returned vectors are parallel: `endings[i]` is the
/// terminator length of `lines[i]`. A final unterminated, non-empty line gets
/// an ending length of `0`; a terminator at the very end of the input does not
/// produce an extra empty line.
fn split_lines_with_endings(content: &str) -> (Vec<String>, Vec<usize>) {
    let mut lines = Vec::new();
    let mut endings = Vec::new();
    let mut rest = content;

    // Repeatedly peel off everything up to the next CR or LF.
    while let Some(pos) = rest.find(['\n', '\r']) {
        lines.push(rest[..pos].to_string());

        let terminator_len: usize = if rest[pos..].starts_with("\r\n") { 2 } else { 1 };
        endings.push(terminator_len);

        rest = &rest[pos + terminator_len..];
    }

    // Whatever is left is a last line with no terminator.
    if !rest.is_empty() {
        lines.push(rest.to_string());
        endings.push(0);
    }

    (lines, endings)
}
141
/// Produce the best available absolute form of `path` for include-pattern comparison.
///
/// Prefers the filesystem-canonical path. When canonicalization fails (for
/// example because the path does not exist), an absolute path is returned
/// unchanged and a relative path is joined onto the current working directory.
/// If the working directory cannot be determined either, the path is returned as is.
fn canonicalize_for_matching(path: &Path) -> PathBuf {
    match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) if path.is_absolute() => path.to_path_buf(),
        Err(_) => match std::env::current_dir() {
            Ok(cwd) => cwd.join(path),
            Err(_) => path.to_path_buf(),
        },
    }
}
155
156fn path_matches_include(path: &Path, include_patterns: &[IncludePattern]) -> bool {
157    if include_patterns.is_empty() {
158        return true;
159    }
160
161    let candidate = canonicalize_for_matching(path);
162    include_patterns.iter().any(|pattern| {
163        if pattern.is_dir {
164            candidate.starts_with(&pattern.path)
165        } else {
166            candidate == pattern.path
167        }
168    })
169}
170
171fn filter_files_by_include(
172    files: Vec<PathBuf>,
173    include_patterns: &[IncludePattern],
174) -> Vec<PathBuf> {
175    if include_patterns.is_empty() {
176        return files;
177    }
178
179    files
180        .into_iter()
181        .filter(|path| path_matches_include(path, include_patterns))
182        .collect()
183}
184
/// Walk upwards from `path` and return the first directory that contains a `.ck` entry.
///
/// The walk starts at `path` itself, or at its parent when `path` is an
/// existing file. Returns `None` when no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start_dir = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors()` yields `start_dir` first, then each successive parent.
    start_dir
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
201
202#[derive(Clone, Debug)]
203pub struct ResolvedModel {
204    pub alias: String,
205    pub config: ck_models::ModelConfig,
206}
207
208impl ResolvedModel {
209    pub fn canonical_name(&self) -> &str {
210        self.config.name.as_str()
211    }
212
213    pub fn dimensions(&self) -> usize {
214        self.config.dimensions
215    }
216}
217
218fn legacy_model_config(name: &str, dimensions: usize) -> ck_models::ModelConfig {
219    ck_models::ModelConfig {
220        name: name.to_string(),
221        provider: "fastembed".to_string(),
222        dimensions,
223        max_tokens: 8192,
224        description: "Legacy ck embedding model preserved for backwards compatibility".to_string(),
225    }
226}
227
228pub(crate) fn resolve_model_from_root(
229    index_root: &Path,
230    cli_model: Option<&str>,
231) -> Result<ResolvedModel> {
232    use ck_models::ModelRegistry;
233
234    let registry = ModelRegistry::default();
235    let index_dir = index_root.join(".ck");
236    let manifest_path = index_dir.join("manifest.json");
237
238    if manifest_path.exists() {
239        let data = std::fs::read(&manifest_path)?;
240        let manifest: ck_index::IndexManifest = serde_json::from_slice(&data)?;
241
242        if let Some(existing_model) = manifest.embedding_model {
243            let dims_hint = manifest.embedding_dimensions.unwrap_or(384);
244            let resolved_existing = match registry.resolve(Some(existing_model.as_str())) {
245                Ok((alias, config)) => ResolvedModel { alias, config },
246                Err(_) => ResolvedModel {
247                    alias: existing_model.clone(),
248                    config: legacy_model_config(&existing_model, dims_hint),
249                },
250            };
251
252            if let Some(requested) = cli_model {
253                let (requested_alias, requested_config) = registry
254                    .resolve(Some(requested))
255                    .map_err(|e| CkError::Embedding(e.to_string()))?;
256
257                if requested_config.name != resolved_existing.config.name {
258                    let suggested_alias = resolved_existing.alias.clone();
259                    return Err(CkError::Embedding(format!(
260                        "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
261                        resolved_existing.config.name,
262                        suggested_alias,
263                        requested,
264                        requested,
265                        suggested_alias
266                    ))
267                    .into());
268                }
269
270                return Ok(ResolvedModel {
271                    alias: requested_alias,
272                    config: requested_config,
273                });
274            }
275
276            return Ok(resolved_existing);
277        }
278    }
279
280    let (alias, config) = registry
281        .resolve(cli_model)
282        .map_err(|e| CkError::Embedding(e.to_string()))?;
283
284    Ok(ResolvedModel { alias, config })
285}
286
287pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
288    let index_root = find_nearest_index_root(path).unwrap_or_else(|| {
289        if path.is_file() {
290            path.parent().unwrap_or(path).to_path_buf()
291        } else {
292            path.to_path_buf()
293        }
294    });
295    resolve_model_from_root(&index_root, cli_model)
296}
297
298pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
299    let results = search_enhanced(options).await?;
300    Ok(results.matches)
301}
302
303pub async fn search_with_progress(
304    options: &SearchOptions,
305    progress_callback: Option<SearchProgressCallback>,
306) -> Result<Vec<SearchResult>> {
307    let results = search_enhanced_with_progress(options, progress_callback).await?;
308    Ok(results.matches)
309}
310
311/// Enhanced search that includes near-miss information for threshold queries
312pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
313    search_enhanced_with_progress(options, None).await
314}
315
316/// Enhanced search with progress callback that includes near-miss information
317pub async fn search_enhanced_with_progress(
318    options: &SearchOptions,
319    progress_callback: Option<SearchProgressCallback>,
320) -> Result<ck_core::SearchResults> {
321    search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
322}
323
324/// Enhanced search with both search and indexing progress callbacks
325pub async fn search_enhanced_with_indexing_progress(
326    options: &SearchOptions,
327    progress_callback: Option<SearchProgressCallback>,
328    indexing_progress_callback: Option<IndexingProgressCallback>,
329    detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
330) -> Result<ck_core::SearchResults> {
331    // Validate that the search path exists
332    if !options.path.exists() {
333        return Err(ck_core::CkError::Search(format!(
334            "Path does not exist: {}",
335            options.path.display()
336        ))
337        .into());
338    }
339
340    // Auto-update index if needed (unless it's regex-only mode)
341    if !matches!(options.mode, SearchMode::Regex) {
342        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
343        let file_options = ck_core::FileCollectionOptions::from(options);
344        ensure_index_updated_with_progress(
345            &options.path,
346            options.reindex,
347            need_embeddings,
348            indexing_progress_callback,
349            detailed_indexing_progress_callback,
350            &file_options,
351            options.embedding_model.as_deref(),
352        )
353        .await?;
354    }
355
356    let search_results = match options.mode {
357        SearchMode::Regex => {
358            let matches = regex_search(options)?;
359            ck_core::SearchResults {
360                matches,
361                closest_below_threshold: None,
362            }
363        }
364        SearchMode::Lexical => {
365            let matches = lexical_search(options).await?;
366            ck_core::SearchResults {
367                matches,
368                closest_below_threshold: None,
369            }
370        }
371        SearchMode::Semantic => {
372            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
373            semantic_search_v3_with_progress(options, progress_callback).await?
374        }
375        SearchMode::Hybrid => {
376            let matches = hybrid_search_with_progress(options, progress_callback).await?;
377            ck_core::SearchResults {
378                matches,
379                closest_below_threshold: None,
380            }
381        }
382    };
383
384    Ok(search_results)
385}
386
387fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
388    let pattern = if options.fixed_string {
389        regex::escape(&options.query)
390    } else if options.whole_word {
391        format!(r"\b{}\b", regex::escape(&options.query))
392    } else {
393        options.query.clone()
394    };
395
396    let regex = RegexBuilder::new(&pattern)
397        .case_insensitive(options.case_insensitive)
398        .build()
399        .map_err(CkError::Regex)?;
400
401    // Default to recursive for directories (like grep) to maintain compatibility
402    let should_recurse = options.path.is_dir() || options.recursive;
403    let files = if should_recurse {
404        // Use ck_index's collect_files which respects gitignore
405        let file_options = ck_core::FileCollectionOptions {
406            respect_gitignore: options.respect_gitignore,
407            use_ckignore: options.use_ckignore,
408            exclude_patterns: options.exclude_patterns.clone(),
409        };
410        let collected = ck_index::collect_files(&options.path, &file_options)?;
411        filter_files_by_include(collected, &options.include_patterns)
412    } else {
413        // For non-recursive, use the local collect_files
414        let collected = collect_files(&options.path, should_recurse, &options.exclude_patterns)?;
415        filter_files_by_include(collected, &options.include_patterns)
416    };
417
418    let results: Vec<Vec<SearchResult>> = files
419        .par_iter()
420        .filter_map(|file_path| match search_file(&regex, file_path, options) {
421            Ok(matches) => {
422                if matches.is_empty() {
423                    None
424                } else {
425                    Some(matches)
426                }
427            }
428            Err(e) => {
429                tracing::debug!("Error searching {:?}: {}", file_path, e);
430                None
431            }
432        })
433        .collect();
434
435    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
436    // Deterministic ordering: file path, then line number
437    all_results.sort_by(|a, b| {
438        let path_cmp = a.file.cmp(&b.file);
439        if path_cmp != std::cmp::Ordering::Equal {
440            return path_cmp;
441        }
442        a.span.line_start.cmp(&b.span.line_start)
443    });
444
445    if let Some(top_k) = options.top_k {
446        all_results.truncate(top_k);
447    }
448
449    Ok(all_results)
450}
451
452fn search_file(
453    regex: &Regex,
454    file_path: &Path,
455    options: &SearchOptions,
456) -> Result<Vec<SearchResult>> {
457    // Find repo root to locate cache
458    let repo_root = find_nearest_index_root(file_path)
459        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
460
461    // For full_section mode, we need the entire content for parsing
462    // For context previews, we need all lines for surrounding context
463    // So we'll load content when needed, but optimize for the common case
464    if options.full_section || options.context_lines > 0 {
465        // Load full content when we need section parsing or context
466        let content = read_file_content(file_path, &repo_root)?;
467        let (lines, line_ending_lengths) = split_lines_with_endings(&content);
468
469        // If full_section is enabled, try to parse the file and find code sections
470        let code_sections = if options.full_section {
471            extract_code_sections(file_path, &content)
472        } else {
473            None
474        };
475
476        search_file_in_memory(
477            regex,
478            file_path,
479            options,
480            &lines,
481            &code_sections,
482            &line_ending_lengths,
483        )
484    } else {
485        // Streaming search (simple case)
486        search_file_streaming(regex, file_path, &repo_root, options)
487    }
488}
489
490/// In-memory search for cases requiring context or code sections
491fn search_file_in_memory(
492    regex: &Regex,
493    file_path: &Path,
494    options: &SearchOptions,
495    lines: &[String],
496    code_sections: &Option<Vec<(usize, usize, String)>>,
497    line_ending_lengths: &[usize],
498) -> Result<Vec<SearchResult>> {
499    let mut results = Vec::new();
500    let mut byte_offset = 0;
501
502    for (line_idx, line) in lines.iter().enumerate() {
503        let line_number = line_idx + 1;
504
505        // Special handling for empty pattern - match the entire line once
506        // An empty regex pattern will match at every position, so we need to handle it specially
507        if regex.as_str().is_empty() {
508            // Empty pattern matches the whole line once (grep compatibility)
509            let preview = if options.full_section {
510                // Try to find the containing code section
511                if let Some(sections) = code_sections {
512                    if let Some(section) = find_containing_section(sections, line_idx) {
513                        section.clone()
514                    } else {
515                        // Fall back to context lines if no section found
516                        get_context_preview(lines, line_idx, options)
517                    }
518                } else {
519                    get_context_preview(lines, line_idx, options)
520                }
521            } else {
522                get_context_preview(lines, line_idx, options)
523            };
524
525            results.push(SearchResult {
526                file: file_path.to_path_buf(),
527                span: Span {
528                    byte_start: byte_offset,
529                    byte_end: byte_offset + line.len(),
530                    line_start: line_number,
531                    line_end: line_number,
532                },
533                score: 1.0,
534                preview,
535                lang: ck_core::Language::from_path(file_path),
536                symbol: None,
537                chunk_hash: None,
538                index_epoch: None,
539            });
540        } else {
541            // Find all matches in the line with their positions
542            for mat in regex.find_iter(line) {
543                let preview = if options.full_section {
544                    // Try to find the containing code section
545                    if let Some(sections) = code_sections {
546                        if let Some(section) = find_containing_section(sections, line_idx) {
547                            section.clone()
548                        } else {
549                            // Fall back to context lines if no section found
550                            get_context_preview(lines, line_idx, options)
551                        }
552                    } else {
553                        get_context_preview(lines, line_idx, options)
554                    }
555                } else {
556                    get_context_preview(lines, line_idx, options)
557                };
558
559                results.push(SearchResult {
560                    file: file_path.to_path_buf(),
561                    span: Span {
562                        byte_start: byte_offset + mat.start(),
563                        byte_end: byte_offset + mat.end(),
564                        line_start: line_number,
565                        line_end: line_number,
566                    },
567                    score: 1.0,
568                    preview,
569                    lang: ck_core::Language::from_path(file_path),
570                    symbol: None,
571                    chunk_hash: None,
572                    index_epoch: None,
573                });
574            }
575        }
576
577        // Update byte offset for next line (add line length + actual line ending length)
578        byte_offset += line.len();
579        byte_offset += line_ending_lengths.get(line_idx).copied().unwrap_or(0);
580    }
581
582    Ok(results)
583}
584
585/// Streaming search for simple cases without context or code sections
586fn search_file_streaming(
587    regex: &Regex,
588    file_path: &Path,
589    repo_root: &Path,
590    _options: &SearchOptions,
591) -> Result<Vec<SearchResult>> {
592    use std::io::{BufRead, BufReader};
593
594    let content_path = resolve_content_path(file_path, repo_root)?;
595    let file = std::fs::File::open(&content_path)?;
596    let mut reader = BufReader::new(file);
597
598    let mut results = Vec::new();
599    let mut line = String::new();
600    let mut byte_offset = 0usize;
601    let mut line_number = 1usize;
602
603    loop {
604        line.clear();
605        let bytes_read = reader.read_line(&mut line)?;
606        if bytes_read == 0 {
607            break;
608        }
609
610        // Determine the length of the trailing line ending (if any) and
611        // normalise the line buffer so it no longer contains newline bytes.
612        let mut newline_len = 0usize;
613        if line.ends_with("\r\n") {
614            line.pop(); // remove \n
615            line.pop(); // remove \r
616            newline_len = 2;
617        } else if line.ends_with(['\n', '\r']) {
618            line.pop();
619            newline_len = 1;
620        }
621
622        // Old Mac-style files may use bare carriage returns as separators.
623        // When the trimmed line still contains '\r' characters, treat them as
624        // record separators so the byte offsets remain accurate.
625        let treat_cr_as_newline = line.contains('\r');
626
627        if treat_cr_as_newline {
628            let bytes = line.as_bytes();
629            let mut segment_start = 0usize;
630            while segment_start <= bytes.len() {
631                match bytes[segment_start..].iter().position(|&b| b == b'\r') {
632                    Some(rel_idx) => {
633                        let idx = segment_start + rel_idx;
634                        let segment_bytes = &bytes[segment_start..idx];
635                        let segment_str = std::str::from_utf8(segment_bytes)?;
636                        process_streaming_line(
637                            regex,
638                            file_path,
639                            segment_str,
640                            line_number,
641                            byte_offset,
642                            &mut results,
643                        );
644                        byte_offset += segment_bytes.len() + 1; // account for \r
645                        line_number += 1;
646                        segment_start = idx + 1;
647                    }
648                    None => {
649                        let segment_bytes = &bytes[segment_start..];
650                        let segment_str = std::str::from_utf8(segment_bytes)?;
651                        process_streaming_line(
652                            regex,
653                            file_path,
654                            segment_str,
655                            line_number,
656                            byte_offset,
657                            &mut results,
658                        );
659                        byte_offset += segment_bytes.len();
660                        line_number += 1;
661                        break;
662                    }
663                }
664            }
665            byte_offset += newline_len;
666        } else {
667            let line_str = line.as_str();
668            process_streaming_line(
669                regex,
670                file_path,
671                line_str,
672                line_number,
673                byte_offset,
674                &mut results,
675            );
676            byte_offset += line_str.len() + newline_len;
677            line_number += 1;
678        }
679    }
680
681    Ok(results)
682}
683
684fn process_streaming_line(
685    regex: &Regex,
686    file_path: &Path,
687    line: &str,
688    line_number: usize,
689    byte_offset: usize,
690    results: &mut Vec<SearchResult>,
691) {
692    if regex.as_str().is_empty() {
693        results.push(SearchResult {
694            file: file_path.to_path_buf(),
695            span: Span {
696                byte_start: byte_offset,
697                byte_end: byte_offset + line.len(),
698                line_start: line_number,
699                line_end: line_number,
700            },
701            score: 1.0,
702            preview: line.to_string(),
703            lang: ck_core::Language::from_path(file_path),
704            symbol: None,
705            chunk_hash: None,
706            index_epoch: None,
707        });
708    } else {
709        for mat in regex.find_iter(line) {
710            results.push(SearchResult {
711                file: file_path.to_path_buf(),
712                span: Span {
713                    byte_start: byte_offset + mat.start(),
714                    byte_end: byte_offset + mat.end(),
715                    line_start: line_number,
716                    line_end: line_number,
717                },
718                score: 1.0,
719                preview: line.to_string(),
720                lang: ck_core::Language::from_path(file_path),
721                symbol: None,
722                chunk_hash: None,
723                index_epoch: None,
724            });
725        }
726    }
727}
728
729async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
730    // Handle both files and directories and reuse nearest existing .ck index up the tree
731    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
732        if options.path.is_file() {
733            options.path.parent().unwrap_or(&options.path).to_path_buf()
734        } else {
735            options.path.clone()
736        }
737    });
738
739    let index_dir = index_root.join(".ck");
740    if !index_dir.exists() {
741        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
742    }
743
744    let tantivy_index_path = index_dir.join("tantivy_index");
745
746    if !tantivy_index_path.exists() {
747        return build_tantivy_index(options).await;
748    }
749
750    let mut schema_builder = Schema::builder();
751    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
752    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
753    let _schema = schema_builder.build();
754
755    let index = Index::open_in_dir(&tantivy_index_path)
756        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {e}")))?;
757
758    let reader = index
759        .reader_builder()
760        .reload_policy(ReloadPolicy::OnCommitWithDelay)
761        .try_into()
762        .map_err(|e| CkError::Index(format!("Failed to create index reader: {e}")))?;
763
764    let searcher = reader.searcher();
765    let query_parser = QueryParser::for_index(&index, vec![content_field]);
766
767    let query = query_parser
768        .parse_query(&options.query)
769        .map_err(|e| CkError::Search(format!("Failed to parse query: {e}")))?;
770
771    let top_docs = if let Some(top_k) = options.top_k {
772        searcher.search(&query, &TopDocs::with_limit(top_k))?
773    } else {
774        searcher.search(&query, &TopDocs::with_limit(100))?
775    };
776
777    // First, collect all results with raw scores
778    let mut raw_results = Vec::new();
779    for (_score, doc_address) in top_docs {
780        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
781        let path_text = retrieved_doc
782            .get_first(path_field)
783            .map(|field_value| field_value.as_str().unwrap_or(""))
784            .unwrap_or("");
785        let content_text = retrieved_doc
786            .get_first(content_field)
787            .map(|field_value| field_value.as_str().unwrap_or(""))
788            .unwrap_or("");
789
790        let file_path = PathBuf::from(path_text);
791        if !path_matches_include(&file_path, &options.include_patterns) {
792            continue;
793        }
794        let preview = if options.full_section {
795            content_text.to_string()
796        } else {
797            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
798        };
799
800        raw_results.push((
801            _score,
802            SearchResult {
803                file: file_path,
804                span: Span {
805                    byte_start: 0,
806                    byte_end: content_text.len(),
807                    line_start: 1,
808                    line_end: content_text.lines().count(),
809                },
810                score: _score,
811                preview,
812                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
813                symbol: None,
814                chunk_hash: None,
815                index_epoch: None,
816            },
817        ));
818    }
819
820    // Normalize scores to 0-1 range and apply threshold
821    let mut results = Vec::new();
822    if !raw_results.is_empty() {
823        let max_score = raw_results
824            .iter()
825            .map(|(score, _)| *score)
826            .fold(0.0f32, f32::max);
827        if max_score > 0.0 {
828            for (raw_score, mut result) in raw_results {
829                let normalized_score = raw_score / max_score;
830
831                // Apply threshold filtering with normalized score
832                if let Some(threshold) = options.threshold
833                    && normalized_score < threshold
834                {
835                    continue;
836                }
837
838                result.score = normalized_score;
839                results.push(result);
840            }
841        }
842    }
843
844    Ok(results)
845}
846
847async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
848    // Handle both files and directories by finding the appropriate directory for indexing
849    let index_root = if options.path.is_file() {
850        options.path.parent().unwrap_or(&options.path)
851    } else {
852        &options.path
853    };
854
855    let index_dir = index_root.join(".ck");
856    let tantivy_index_path = index_dir.join("tantivy_index");
857
858    fs::create_dir_all(&tantivy_index_path)?;
859
860    let mut schema_builder = Schema::builder();
861    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
862    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
863    let schema = schema_builder.build();
864
865    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
866        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {e}")))?;
867
868    let mut index_writer = index
869        .writer(50_000_000)
870        .map_err(|e| CkError::Index(format!("Failed to create index writer: {e}")))?;
871
872    let files = filter_files_by_include(
873        collect_files(index_root, true, &options.exclude_patterns)?,
874        &options.include_patterns,
875    );
876
877    for file_path in &files {
878        if let Ok(content) = fs::read_to_string(file_path) {
879            let doc = doc!(
880                content_field => content,
881                path_field => file_path.display().to_string()
882            );
883            index_writer.add_document(doc)?;
884        }
885    }
886
887    index_writer
888        .commit()
889        .map_err(|e| CkError::Index(format!("Failed to commit index: {e}")))?;
890
891    // After building, search again with the same options
892    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
893    let mut schema_builder = Schema::builder();
894    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
895    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
896    let _schema = schema_builder.build();
897
898    let index = Index::open_in_dir(&tantivy_index_path)
899        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {e}")))?;
900
901    let reader = index
902        .reader_builder()
903        .reload_policy(ReloadPolicy::OnCommitWithDelay)
904        .try_into()
905        .map_err(|e| CkError::Index(format!("Failed to create index reader: {e}")))?;
906
907    let searcher = reader.searcher();
908    let query_parser = QueryParser::for_index(&index, vec![content_field]);
909
910    let query = query_parser
911        .parse_query(&options.query)
912        .map_err(|e| CkError::Search(format!("Failed to parse query: {e}")))?;
913
914    let top_docs = if let Some(top_k) = options.top_k {
915        searcher.search(&query, &TopDocs::with_limit(top_k))?
916    } else {
917        searcher.search(&query, &TopDocs::with_limit(100))?
918    };
919
920    // First, collect all results with raw scores
921    let mut raw_results = Vec::new();
922    for (_score, doc_address) in top_docs {
923        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
924        let path_text = retrieved_doc
925            .get_first(path_field)
926            .map(|field_value| field_value.as_str().unwrap_or(""))
927            .unwrap_or("");
928        let content_text = retrieved_doc
929            .get_first(content_field)
930            .map(|field_value| field_value.as_str().unwrap_or(""))
931            .unwrap_or("");
932
933        let file_path = PathBuf::from(path_text);
934        let preview = if options.full_section {
935            content_text.to_string()
936        } else {
937            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
938        };
939
940        raw_results.push((
941            _score,
942            SearchResult {
943                file: file_path,
944                span: Span {
945                    byte_start: 0,
946                    byte_end: content_text.len(),
947                    line_start: 1,
948                    line_end: content_text.lines().count(),
949                },
950                score: _score,
951                preview,
952                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
953                symbol: None,
954                chunk_hash: None,
955                index_epoch: None,
956            },
957        ));
958    }
959
960    // Normalize scores to 0-1 range and apply threshold
961    let mut results = Vec::new();
962    if !raw_results.is_empty() {
963        let max_score = raw_results
964            .iter()
965            .map(|(score, _)| *score)
966            .fold(0.0f32, f32::max);
967        if max_score > 0.0 {
968            for (raw_score, mut result) in raw_results {
969                let normalized_score = raw_score / max_score;
970
971                // Apply threshold filtering with normalized score
972                if let Some(threshold) = options.threshold
973                    && normalized_score < threshold
974                {
975                    continue;
976                }
977
978                result.score = normalized_score;
979                results.push(result);
980            }
981        }
982    }
983
984    Ok(results)
985}
986
/// Hybrid (regex + semantic) search without progress reporting.
///
/// Delegates to `hybrid_search_with_progress`, passing `None` as the
/// progress callback, and returns its result unchanged.
// The lint allowance suggests nothing in this crate currently calls this wrapper.
#[allow(dead_code)]
async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    hybrid_search_with_progress(options, None).await
}
991
992async fn hybrid_search_with_progress(
993    options: &SearchOptions,
994    progress_callback: Option<SearchProgressCallback>,
995) -> Result<Vec<SearchResult>> {
996    if let Some(ref callback) = progress_callback {
997        callback("Running regex search...");
998    }
999    let regex_results = regex_search(options)?;
1000
1001    if let Some(ref callback) = progress_callback {
1002        callback("Running semantic search...");
1003    }
1004    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
1005
1006    let mut combined = HashMap::new();
1007
1008    for (rank, result) in regex_results.iter().enumerate() {
1009        let key = format!("{}:{}", result.file.display(), result.span.line_start);
1010        combined
1011            .entry(key)
1012            .or_insert(Vec::new())
1013            .push((rank + 1, result.clone()));
1014    }
1015
1016    for (rank, result) in semantic_results.matches.iter().enumerate() {
1017        let key = format!("{}:{}", result.file.display(), result.span.line_start);
1018        combined
1019            .entry(key)
1020            .or_insert(Vec::new())
1021            .push((rank + 1, result.clone()));
1022    }
1023
1024    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
1025    let mut rrf_results: Vec<SearchResult> = combined
1026        .into_values()
1027        .map(|ranks| {
1028            let mut result = ranks[0].1.clone();
1029            let rrf_score = ranks
1030                .iter()
1031                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
1032                .sum();
1033            result.score = rrf_score;
1034            result
1035        })
1036        .filter(|result| {
1037            // Apply threshold filtering to raw RRF scores
1038            if let Some(threshold) = options.threshold {
1039                result.score >= threshold
1040            } else {
1041                true
1042            }
1043        })
1044        .collect();
1045
1046    rrf_results.retain(|result| path_matches_include(&result.file, &options.include_patterns));
1047
1048    // Sort by RRF score (highest first)
1049    rrf_results.sort_by(|a, b| {
1050        b.score
1051            .partial_cmp(&a.score)
1052            .unwrap_or(std::cmp::Ordering::Equal)
1053    });
1054
1055    if let Some(top_k) = options.top_k {
1056        rrf_results.truncate(top_k);
1057    }
1058
1059    Ok(rrf_results)
1060}
1061
1062fn build_globset(patterns: &[String]) -> GlobSet {
1063    let mut builder = GlobSetBuilder::new();
1064    for pat in patterns {
1065        // Treat patterns as filename or directory globs
1066        if let Ok(glob) = Glob::new(pat) {
1067            builder.add(glob);
1068        }
1069    }
1070    builder.build().unwrap_or_else(|_| GlobSet::empty())
1071}
1072
1073fn should_exclude_path(path: &Path, globset: &GlobSet) -> bool {
1074    // Match against each path component and the full path
1075    if globset.is_match(path) {
1076        return true;
1077    }
1078    for component in path.components() {
1079        if let std::path::Component::Normal(name) = component
1080            && globset.is_match(name)
1081        {
1082            return true;
1083        }
1084    }
1085    false
1086}
1087
1088fn collect_files(
1089    path: &Path,
1090    recursive: bool,
1091    exclude_patterns: &[String],
1092) -> Result<Vec<PathBuf>> {
1093    let mut files = Vec::new();
1094    let globset = build_globset(exclude_patterns);
1095
1096    if path.is_file() {
1097        // Always add single files, even if they're excluded (user explicitly requested)
1098        files.push(path.to_path_buf());
1099    } else if recursive {
1100        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
1101            // Skip excluded directories entirely for efficiency
1102            let name = e.file_name();
1103            !globset.is_match(e.path()) && !globset.is_match(name)
1104        }) {
1105            match entry {
1106                Ok(entry) => {
1107                    if entry.file_type().is_file() && !should_exclude_path(entry.path(), &globset) {
1108                        files.push(entry.path().to_path_buf());
1109                    }
1110                }
1111                Err(e) => {
1112                    // Log directory traversal errors but continue processing
1113                    tracing::debug!("Skipping path due to error: {}", e);
1114                    continue;
1115                }
1116            }
1117        }
1118    } else {
1119        match fs::read_dir(path) {
1120            Ok(read_dir) => {
1121                for entry in read_dir {
1122                    match entry {
1123                        Ok(entry) => {
1124                            let path = entry.path();
1125                            if path.is_file() && !should_exclude_path(&path, &globset) {
1126                                files.push(path);
1127                            }
1128                        }
1129                        Err(e) => {
1130                            tracing::debug!("Skipping directory entry due to error: {}", e);
1131                            continue;
1132                        }
1133                    }
1134                }
1135            }
1136            Err(e) => {
1137                tracing::debug!("Cannot read directory {:?}: {}", path, e);
1138                return Err(e.into());
1139            }
1140        }
1141    }
1142
1143    Ok(files)
1144}
1145
1146async fn ensure_index_updated_with_progress(
1147    path: &Path,
1148    force_reindex: bool,
1149    need_embeddings: bool,
1150    progress_callback: Option<ck_index::ProgressCallback>,
1151    detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
1152    file_options: &ck_core::FileCollectionOptions,
1153    model_override: Option<&str>,
1154) -> Result<()> {
1155    // Find index root for .ck directory location
1156    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
1157        if path.is_file() {
1158            path.parent().unwrap_or(path).to_path_buf()
1159        } else {
1160            path.to_path_buf()
1161        }
1162    });
1163    let index_root = &index_root_buf;
1164
1165    // Pass the original path to indexing function so it can index just that file/directory
1166    // The indexing function will use collect_files() which now handles individual files correctly
1167    if force_reindex {
1168        let stats = ck_index::smart_update_index_with_detailed_progress(
1169            index_root,
1170            true,
1171            progress_callback,
1172            detailed_progress_callback,
1173            need_embeddings,
1174            file_options,
1175            model_override,
1176        )
1177        .await?;
1178        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1179            tracing::info!(
1180                "Index updated: {} files indexed, {} orphaned files removed",
1181                stats.files_indexed,
1182                stats.orphaned_files_removed
1183            );
1184        }
1185        return Ok(());
1186    }
1187
1188    // For incremental updates with individual files, we need special handling
1189    // to ensure only the specific file is indexed, not the entire directory
1190    if path.is_file() {
1191        // Index just this one file
1192        use ck_index::index_file;
1193        index_file(path, need_embeddings).await?;
1194    } else {
1195        // For directories, use the standard smart update
1196        let stats = ck_index::smart_update_index_with_detailed_progress(
1197            index_root,
1198            false,
1199            progress_callback,
1200            detailed_progress_callback,
1201            need_embeddings,
1202            file_options,
1203            model_override,
1204        )
1205        .await?;
1206        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1207            tracing::info!(
1208                "Index updated: {} files indexed, {} orphaned files removed",
1209                stats.files_indexed,
1210                stats.orphaned_files_removed
1211            );
1212        }
1213    }
1214
1215    Ok(())
1216}
1217
1218fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
1219    let before = options.before_context_lines.max(options.context_lines);
1220    let after = options.after_context_lines.max(options.context_lines);
1221
1222    if before > 0 || after > 0 {
1223        let start_idx = line_idx.saturating_sub(before);
1224        let end_idx = (line_idx + after + 1).min(lines.len());
1225        lines[start_idx..end_idx].join("\n")
1226    } else {
1227        lines[line_idx].to_string()
1228    }
1229}
1230
1231fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1232    let lang = ck_core::Language::from_path(file_path)?;
1233
1234    // Parse the file with tree-sitter and extract function/class sections
1235    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1236        let include_markdown = lang == ck_core::Language::Markdown;
1237        let sections: Vec<(usize, usize, String)> = chunks
1238            .into_iter()
1239            .filter(|chunk| {
1240                if include_markdown {
1241                    matches!(
1242                        chunk.chunk_type,
1243                        ck_chunk::ChunkType::Module | ck_chunk::ChunkType::Text
1244                    )
1245                } else {
1246                    matches!(
1247                        chunk.chunk_type,
1248                        ck_chunk::ChunkType::Function
1249                            | ck_chunk::ChunkType::Class
1250                            | ck_chunk::ChunkType::Method
1251                    )
1252                }
1253            })
1254            .map(|chunk| {
1255                (
1256                    chunk.span.line_start - 1, // Convert to 0-based index
1257                    chunk.span.line_end - 1,
1258                    chunk.text,
1259                )
1260            })
1261            .collect();
1262
1263        if sections.is_empty() {
1264            None
1265        } else {
1266            Some(sections)
1267        }
1268    } else {
1269        None
1270    }
1271}
1272
/// Returns the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first_line, last_line, _)| (*first_line..=*last_line).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1284
1285#[cfg(test)]
1286mod tests {
1287    use super::*;
1288    use std::fs;
1289    use tempfile::TempDir;
1290
1291    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
1292        let files = vec![
1293            ("test1.txt", "hello world rust programming"),
1294            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
1295            ("test3.py", "print('Hello Python')"),
1296            ("test4.txt", "machine learning artificial intelligence"),
1297        ];
1298
1299        let mut paths = Vec::new();
1300        for (name, content) in files {
1301            let path = dir.join(name);
1302            fs::write(&path, content).unwrap();
1303            paths.push(path);
1304        }
1305        paths
1306    }
1307
1308    #[test]
1309    fn test_extract_lines_from_file() {
1310        let temp_dir = TempDir::new().unwrap();
1311        let test_file = temp_dir.path().join("test_lines.txt");
1312
1313        // Create a multi-line test file
1314        let content =
1315            "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10";
1316        fs::write(&test_file, content).unwrap();
1317
1318        // Test extracting lines 3-5 (1-based indexing)
1319        let result = extract_lines_from_file(&test_file, 3, 5).unwrap();
1320        assert_eq!(result, "Line 3\nLine 4\nLine 5");
1321
1322        // Test extracting a single line
1323        let result = extract_lines_from_file(&test_file, 7, 7).unwrap();
1324        assert_eq!(result, "Line 7");
1325
1326        // Test extracting from line 8 to end
1327        let result = extract_lines_from_file(&test_file, 8, 100).unwrap();
1328        assert_eq!(result, "Line 8\nLine 9\nLine 10");
1329
1330        // Test line_start == 0 (should return empty)
1331        let result = extract_lines_from_file(&test_file, 0, 5).unwrap();
1332        assert_eq!(result, "");
1333
1334        // Test line_start > file length (should return empty)
1335        let result = extract_lines_from_file(&test_file, 20, 25).unwrap();
1336        assert_eq!(result, "");
1337    }
1338
1339    #[tokio::test]
1340    async fn test_extract_content_from_span() {
1341        let temp_dir = TempDir::new().unwrap();
1342        let test_file = temp_dir.path().join("code.rs");
1343
1344        // Create a multi-line code file
1345        let content = "fn first() {\n    println!(\"First\");\n}\n\nfn second() {\n    println!(\"Second\");\n}\n\nfn third() {\n    println!(\"Third\");\n}";
1346        fs::write(&test_file, content).unwrap();
1347
1348        // Test extracting the second function (lines 5-7)
1349        let span = ck_core::Span {
1350            byte_start: 0, // Not used in line extraction
1351            byte_end: 0,   // Not used in line extraction
1352            line_start: 5,
1353            line_end: 7,
1354        };
1355
1356        let result = extract_content_from_span(&test_file, &span).await.unwrap();
1357        assert_eq!(result, "fn second() {\n    println!(\"Second\");\n}");
1358
1359        // Test extracting a single line
1360        let span = ck_core::Span {
1361            byte_start: 0,
1362            byte_end: 0,
1363            line_start: 2,
1364            line_end: 2,
1365        };
1366
1367        let result = extract_content_from_span(&test_file, &span).await.unwrap();
1368        assert_eq!(result, "    println!(\"First\");");
1369    }
1370
1371    #[test]
1372    fn test_collect_files() {
1373        let temp_dir = TempDir::new().unwrap();
1374        let test_files = create_test_files(temp_dir.path());
1375
1376        // Test non-recursive
1377        let files = collect_files(temp_dir.path(), false, &[]).unwrap();
1378        assert_eq!(files.len(), 4);
1379
1380        // Test recursive
1381        let files = collect_files(temp_dir.path(), true, &[]).unwrap();
1382        assert_eq!(files.len(), 4);
1383
1384        // Test single file
1385        let files = collect_files(&test_files[0], false, &[]).unwrap();
1386        assert_eq!(files.len(), 1);
1387        assert_eq!(files[0], test_files[0]);
1388    }
1389
1390    #[test]
1391    fn test_regex_search() {
1392        let temp_dir = TempDir::new().unwrap();
1393        create_test_files(temp_dir.path());
1394
1395        let options = SearchOptions {
1396            mode: SearchMode::Regex,
1397            query: "rust".to_string(),
1398            path: temp_dir.path().to_path_buf(),
1399            recursive: true,
1400            ..Default::default()
1401        };
1402
1403        let results = regex_search(&options).unwrap();
1404        assert!(!results.is_empty());
1405
1406        // Should find matches in files containing "rust"
1407        let rust_matches: Vec<_> = results
1408            .iter()
1409            .filter(|r| r.preview.to_lowercase().contains("rust"))
1410            .collect();
1411        assert!(!rust_matches.is_empty());
1412    }
1413
1414    #[test]
1415    fn test_regex_search_case_insensitive() {
1416        let temp_dir = TempDir::new().unwrap();
1417        create_test_files(temp_dir.path());
1418
1419        let options = SearchOptions {
1420            mode: SearchMode::Regex,
1421            query: "HELLO".to_string(),
1422            path: temp_dir.path().to_path_buf(),
1423            recursive: true,
1424            case_insensitive: true,
1425            ..Default::default()
1426        };
1427
1428        let results = regex_search(&options).unwrap();
1429        assert!(!results.is_empty());
1430    }
1431
1432    #[test]
1433    fn test_regex_search_fixed_string() {
1434        let temp_dir = TempDir::new().unwrap();
1435        create_test_files(temp_dir.path());
1436
1437        let options = SearchOptions {
1438            mode: SearchMode::Regex,
1439            query: "fn main()".to_string(),
1440            path: temp_dir.path().to_path_buf(),
1441            recursive: true,
1442            fixed_string: true,
1443            ..Default::default()
1444        };
1445
1446        let results = regex_search(&options).unwrap();
1447        assert!(!results.is_empty());
1448    }
1449
1450    #[test]
1451    fn test_regex_search_whole_word() {
1452        let temp_dir = TempDir::new().unwrap();
1453        fs::write(
1454            temp_dir.path().join("word_test.txt"),
1455            "rust rusty rustacean",
1456        )
1457        .unwrap();
1458
1459        let options = SearchOptions {
1460            mode: SearchMode::Regex,
1461            query: "rust".to_string(),
1462            path: temp_dir.path().to_path_buf(),
1463            recursive: true,
1464            whole_word: true,
1465            ..Default::default()
1466        };
1467
1468        let results = regex_search(&options).unwrap();
1469        assert!(!results.is_empty());
1470        // Should only match "rust" as a whole word, not "rusty" or "rustacean"
1471    }
1472
1473    #[test]
1474    fn test_regex_search_top_k() {
1475        let temp_dir = TempDir::new().unwrap();
1476
1477        // Create multiple files with matches
1478        for i in 0..10 {
1479            fs::write(temp_dir.path().join(format!("file{i}.txt")), "test content").unwrap();
1480        }
1481
1482        let options = SearchOptions {
1483            mode: SearchMode::Regex,
1484            query: "test".to_string(),
1485            path: temp_dir.path().to_path_buf(),
1486            recursive: true,
1487            top_k: Some(5),
1488            ..Default::default()
1489        };
1490
1491        let results = regex_search(&options).unwrap();
1492        assert!(results.len() <= 5);
1493    }
1494
1495    #[test]
1496    fn test_regex_search_span_offsets() {
1497        // Test that span offsets are correctly calculated for multiple matches on a line
1498        let temp_dir = TempDir::new().unwrap();
1499        let test_file = temp_dir.path().join("spans.txt");
1500        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();
1501
1502        let options = SearchOptions {
1503            mode: SearchMode::Regex,
1504            query: "test".to_string(),
1505            path: test_file.clone(),
1506            recursive: false,
1507            ..Default::default()
1508        };
1509
1510        let results = regex_search(&options).unwrap();
1511
1512        // Should find 5 matches total
1513        assert_eq!(results.len(), 5);
1514
1515        // Check first line has 3 matches with correct byte offsets
1516        let line1_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 1).collect();
1517        assert_eq!(line1_matches.len(), 3);
1518        assert_eq!(line1_matches[0].span.byte_start, 0);
1519        assert_eq!(line1_matches[1].span.byte_start, 5);
1520        assert_eq!(line1_matches[2].span.byte_start, 10);
1521
1522        // Check second line match
1523        let line2_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 2).collect();
1524        assert_eq!(line2_matches.len(), 1);
1525        assert_eq!(line2_matches[0].span.byte_start, 24); // "test test test\n" = 15 bytes, "line two " = 9 bytes
1526
1527        // Each match should have different byte offsets
1528        let mut byte_starts: Vec<_> = results.iter().map(|r| r.span.byte_start).collect();
1529        byte_starts.sort();
1530        byte_starts.dedup();
1531        assert_eq!(byte_starts.len(), 5); // All byte_starts should be unique
1532    }
1533
1534    #[test]
1535    fn test_search_file() {
1536        let temp_dir = TempDir::new().unwrap();
1537        let file_path = temp_dir.path().join("test.txt");
1538        fs::write(
1539            &file_path,
1540            "line 1: hello\nline 2: world\nline 3: rust programming",
1541        )
1542        .unwrap();
1543
1544        let regex = regex::Regex::new("rust").unwrap();
1545        let options = SearchOptions::default();
1546
1547        let results = search_file(&regex, &file_path, &options).unwrap();
1548        assert_eq!(results.len(), 1);
1549        assert_eq!(results[0].span.line_start, 3);
1550        assert!(results[0].preview.contains("rust"));
1551    }
1552
1553    #[test]
1554    fn test_search_file_with_context() {
1555        let temp_dir = TempDir::new().unwrap();
1556        let file_path = temp_dir.path().join("test.txt");
1557        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();
1558
1559        let regex = regex::Regex::new("target").unwrap();
1560        let options = SearchOptions {
1561            context_lines: 1,
1562            ..Default::default()
1563        };
1564
1565        let results = search_file(&regex, &file_path, &options).unwrap();
1566        assert_eq!(results.len(), 1);
1567
1568        println!("Preview: '{}'", results[0].preview);
1569
1570        // The target line is line 3, with 1 context line before and after
1571        // So we should get lines 2, 3, 4
1572        assert!(results[0].preview.contains("line 2"));
1573        assert!(results[0].preview.contains("target line"));
1574        assert!(results[0].preview.contains("line 4"));
1575    }
1576
1577    #[tokio::test]
1578    async fn test_search_main_function() {
1579        let temp_dir = TempDir::new().unwrap();
1580        create_test_files(temp_dir.path());
1581
1582        let options = SearchOptions {
1583            mode: SearchMode::Regex,
1584            query: "hello".to_string(),
1585            path: temp_dir.path().to_path_buf(),
1586            recursive: true,
1587            case_insensitive: true,
1588            ..Default::default()
1589        };
1590
1591        let results = search(&options).await.unwrap();
1592        assert!(!results.is_empty());
1593    }
1594
1595    #[tokio::test]
1596    async fn test_regex_search_mixed_line_endings() {
1597        // Regression test for byte offset issues with different line endings
1598        let temp_dir = TempDir::new().unwrap();
1599
1600        // Create test file with mixed line endings (Windows \r\n and Unix \n)
1601        let test_file = temp_dir.path().join("mixed_endings.txt");
1602        let content = "line1\r\nline2\nline3\r\npattern here\nline5\r\n";
1603        std::fs::write(&test_file, content).unwrap();
1604
1605        let options = SearchOptions {
1606            mode: SearchMode::Regex,
1607            query: "pattern".to_string(),
1608            path: test_file.clone(),
1609            recursive: false,
1610            ..Default::default()
1611        };
1612
1613        let results = search(&options).await.unwrap();
1614        assert_eq!(results.len(), 1);
1615
1616        let result = &results[0];
1617        // Verify byte offsets are correct - should point to start of "pattern"
1618        let original_content = std::fs::read_to_string(&test_file).unwrap();
1619        let pattern_start = original_content.find("pattern").unwrap();
1620
1621        assert_eq!(result.span.byte_start, pattern_start);
1622        assert_eq!(result.span.line_start, 4); // Fourth line
1623    }
1624
1625    #[tokio::test]
1626    async fn test_regex_search_windows_line_endings() {
1627        // Regression test specifically for Windows \r\n line endings
1628        let temp_dir = TempDir::new().unwrap();
1629
1630        let test_file = temp_dir.path().join("windows_endings.txt");
1631        let content = "first line\r\nsecond line\r\nmatch this\r\nfourth line\r\n";
1632        std::fs::write(&test_file, content).unwrap();
1633
1634        let options = SearchOptions {
1635            mode: SearchMode::Regex,
1636            query: "match".to_string(),
1637            path: test_file.clone(),
1638            recursive: false,
1639            ..Default::default()
1640        };
1641
1642        let results = search(&options).await.unwrap();
1643        assert_eq!(results.len(), 1);
1644
1645        let result = &results[0];
1646
1647        // Verify the match is on line 3
1648        assert_eq!(result.span.line_start, 3);
1649
1650        // Verify byte offset accounts for \r\n endings
1651        // first line\r\n = 12 bytes, second line\r\n = 13 bytes, total = 25 bytes before "match"
1652        let expected_byte_start = 25; // Position of "match" in the content
1653        assert_eq!(result.span.byte_start, expected_byte_start);
1654    }
1655
1656    #[test]
1657    fn test_split_lines_with_endings_helper() {
1658        // Unix line endings
1659        let unix_content = "line1\nline2\nline3\n";
1660        let (unix_lines, unix_endings) = split_lines_with_endings(unix_content);
1661        assert_eq!(unix_lines, vec!["line1", "line2", "line3"]);
1662        assert_eq!(unix_endings, vec![1, 1, 1]);
1663
1664        // Windows line endings
1665        let windows_content = "line1\r\nline2\r\nline3\r\n";
1666        let (windows_lines, windows_endings) = split_lines_with_endings(windows_content);
1667        assert_eq!(windows_lines, vec!["line1", "line2", "line3"]);
1668        assert_eq!(windows_endings, vec![2, 2, 2]);
1669
1670        // Old Mac line endings
1671        let mac_content = "line1\rline2\rline3\r";
1672        let (mac_lines, mac_endings) = split_lines_with_endings(mac_content);
1673        assert_eq!(mac_lines, vec!["line1", "line2", "line3"]);
1674        assert_eq!(mac_endings, vec![1, 1, 1]);
1675
1676        // Mixed endings
1677        let mixed_content = "line1\nline2\r\nline3\r";
1678        let (mixed_lines, mixed_endings) = split_lines_with_endings(mixed_content);
1679        assert_eq!(mixed_lines, vec!["line1", "line2", "line3"]);
1680        assert_eq!(mixed_endings, vec![1, 2, 1]);
1681
1682        // No line endings
1683        let no_endings = "single line";
1684        let (no_lines, no_endings_vec) = split_lines_with_endings(no_endings);
1685        assert_eq!(no_lines, vec!["single line"]);
1686        assert_eq!(no_endings_vec, vec![0]);
1687    }
1688
1689    // Default model config is fastembed; without that feature ck-embed
1690    // falls back to DummyEmbedder (zero vectors), so semantic search
1691    // returns nothing and these tests have nothing to assert against.
1692    #[cfg(feature = "fastembed")]
1693    #[tokio::test]
1694    async fn test_subdirectory_search_uses_parent_ckignore() {
1695        // Regression test for issue where searching in subdirectory doesn't use parent .ckignore
1696        // Bug: When searching ~/parent/subdir/, .ckignore is loaded from subdir (doesn't exist)
1697        // instead of from parent (where index and .ckignore live)
1698
1699        let temp_dir = TempDir::new().unwrap();
1700        let parent = temp_dir.path();
1701        let subdir = parent.join("subproject");
1702        fs::create_dir(&subdir).unwrap();
1703
1704        // Create .ckignore at parent level excluding *.tmp files
1705        fs::write(parent.join(".ckignore"), "*.tmp\n").unwrap();
1706
1707        // Create test files in parent directory
1708        fs::write(parent.join("parent.txt"), "searchable content in parent").unwrap();
1709        fs::write(parent.join("ignored.tmp"), "this should not be indexed").unwrap();
1710
1711        // Create test files in subdirectory
1712        fs::write(subdir.join("nested.txt"), "searchable content in subdir").unwrap();
1713        fs::write(
1714            subdir.join("also_ignored.tmp"),
1715            "this should not be indexed either",
1716        )
1717        .unwrap();
1718
1719        // First, search from parent to create the index
1720        let parent_options = SearchOptions {
1721            mode: SearchMode::Semantic,
1722            query: "searchable".to_string(),
1723            path: parent.to_path_buf(),
1724            top_k: Some(10),
1725            threshold: Some(0.1),
1726            ..Default::default()
1727        };
1728
1729        let _ = search(&parent_options).await;
1730
1731        // Give indexing a moment to complete
1732        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
1733
1734        // Now search from SUBDIRECTORY - this is where the bug occurs
1735        // The engine should find parent .ck index and use parent .ckignore
1736        // But currently it loads .ckignore from subdir (doesn't exist)
1737        let subdir_options = SearchOptions {
1738            mode: SearchMode::Semantic,
1739            query: "content".to_string(),
1740            path: subdir.clone(),
1741            top_k: Some(10),
1742            threshold: Some(0.1),
1743            ..Default::default()
1744        };
1745
1746        let results = search(&subdir_options).await.unwrap();
1747
1748        // ASSERTION 1: .tmp files should be excluded (currently FAILS due to bug)
1749        let tmp_files: Vec<_> = results
1750            .iter()
1751            .filter(|r| r.file.to_string_lossy().ends_with(".tmp"))
1752            .collect();
1753        assert!(
1754            tmp_files.is_empty(),
1755            "Bug: .tmp files were indexed despite parent .ckignore. Found {} .tmp files: {:?}",
1756            tmp_files.len(),
1757            tmp_files.iter().map(|r| &r.file).collect::<Vec<_>>()
1758        );
1759
1760        // ASSERTION 2: Should find .txt files in subdirectory
1761        let txt_in_subdir = results.iter().any(|r| r.file.ends_with("nested.txt"));
1762        assert!(txt_in_subdir, "Should find nested.txt in subdirectory");
1763
1764        // ASSERTION 3: No .ck directory should be created in subdirectory
1765        assert!(
1766            !subdir.join(".ck").exists(),
1767            "Should not create .ck directory in subdirectory"
1768        );
1769    }
1770
1771    // Default model config is fastembed; without that feature ck-embed
1772    // falls back to DummyEmbedder (zero vectors), so semantic search
1773    // returns nothing and these tests have nothing to assert against.
1774    #[cfg(feature = "fastembed")]
1775    #[tokio::test]
1776    async fn test_multiple_ckignore_files_merge_correctly() {
1777        // Test that multiple .ckignore files in the hierarchy are all applied
1778        use std::fs;
1779        use tempfile::TempDir;
1780
1781        let temp_dir = TempDir::new().unwrap();
1782        let parent = temp_dir.path();
1783        let subdir = parent.join("subdir");
1784        let deeper = subdir.join("deeper");
1785        fs::create_dir(&subdir).unwrap();
1786        fs::create_dir(&deeper).unwrap();
1787
1788        // Create hierarchical .ckignore files
1789        fs::write(parent.join(".ckignore"), "*.log\n").unwrap();
1790        fs::write(subdir.join(".ckignore"), "*.tmp\n").unwrap();
1791        fs::write(deeper.join(".ckignore"), "*.cache\n").unwrap();
1792
1793        // Create test files at each level
1794        fs::write(parent.join("root.txt"), "searchable").unwrap();
1795        fs::write(parent.join("root.log"), "should be ignored").unwrap();
1796
1797        fs::write(subdir.join("mid.txt"), "searchable").unwrap();
1798        fs::write(subdir.join("mid.log"), "should be ignored by parent").unwrap();
1799        fs::write(subdir.join("mid.tmp"), "should be ignored by local").unwrap();
1800
1801        fs::write(deeper.join("deep.txt"), "searchable").unwrap();
1802        fs::write(deeper.join("deep.log"), "should be ignored by grandparent").unwrap();
1803        fs::write(deeper.join("deep.tmp"), "should be ignored by parent").unwrap();
1804        fs::write(deeper.join("deep.cache"), "should be ignored by local").unwrap();
1805
1806        // Index from parent
1807        let parent_options = SearchOptions {
1808            mode: SearchMode::Semantic,
1809            query: "searchable".to_string(),
1810            path: parent.to_path_buf(),
1811            top_k: Some(20),
1812            threshold: Some(0.1),
1813            ..Default::default()
1814        };
1815
1816        let _ = search(&parent_options).await;
1817        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
1818
1819        // Search from deeper directory - should respect ALL three .ckignore files
1820        let deeper_options = SearchOptions {
1821            mode: SearchMode::Semantic,
1822            query: "ignored".to_string(),
1823            path: deeper.clone(),
1824            top_k: Some(20),
1825            threshold: Some(0.1),
1826            ..Default::default()
1827        };
1828
1829        let results = search(&deeper_options).await.unwrap();
1830
1831        // All ignored files should be excluded
1832        let has_log = results
1833            .iter()
1834            .any(|r| r.file.to_string_lossy().ends_with(".log"));
1835        let has_tmp = results
1836            .iter()
1837            .any(|r| r.file.to_string_lossy().ends_with(".tmp"));
1838        let has_cache = results
1839            .iter()
1840            .any(|r| r.file.to_string_lossy().ends_with(".cache"));
1841
1842        assert!(
1843            !has_log,
1844            "*.log files should be excluded by parent .ckignore"
1845        );
1846        assert!(
1847            !has_tmp,
1848            "*.tmp files should be excluded by subdir .ckignore"
1849        );
1850        assert!(
1851            !has_cache,
1852            "*.cache files should be excluded by deeper .ckignore"
1853        );
1854
1855        // Should still find .txt files
1856        let has_txt = results
1857            .iter()
1858            .any(|r| r.file.to_string_lossy().ends_with(".txt"));
1859        assert!(has_txt, "Should find .txt files (not ignored)");
1860    }
1861
1862    // Default model config is fastembed; without that feature ck-embed
1863    // falls back to DummyEmbedder (zero vectors) and the assertions can't
1864    // distinguish a scoped match from no match at all.
1865    #[cfg(feature = "fastembed")]
1866    #[tokio::test]
1867    async fn test_scoped_search_does_not_lose_results_to_global_top_k() {
1868        // Regression test for the bug where scoped semantic search applied
1869        // top_k BEFORE the path filter, so a small top_k against a whole-
1870        // codebase index could return zero matches when the global top
1871        // results all lived outside the requested scope.
1872        //
1873        // Reproduction:
1874        //   - Index a parent dir that contains many files about TOPIC_A
1875        //     (so they dominate the global top_k for that query)
1876        //   - Search inside a sibling subdir that contains a file about
1877        //     TOPIC_A, with top_k smaller than the TOPIC_A file count
1878        //   - Before the fix: zero results inside subdir
1879        //   - After the fix:  the in-scope file is returned
1880        use std::fs;
1881        use tempfile::TempDir;
1882
1883        let temp_dir = TempDir::new().unwrap();
1884        let parent = temp_dir.path();
1885        let noisy = parent.join("noisy");
1886        let scoped = parent.join("scoped");
1887        fs::create_dir(&noisy).unwrap();
1888        fs::create_dir(&scoped).unwrap();
1889
1890        // 8 files in noisy/ that all match the query "database connection".
1891        // top_k=3 will be entirely consumed by these globally.
1892        for i in 0..8 {
1893            fs::write(
1894                noisy.join(format!("noise_{i}.txt")),
1895                format!(
1896                    "function open_database_connection_{i}() {{\n    \
1897                     // establish a database connection to postgres\n    \
1898                     // handle database connection errors gracefully\n}}\n"
1899                ),
1900            )
1901            .unwrap();
1902        }
1903
1904        // One in-scope file that also matches the query
1905        fs::write(
1906            scoped.join("target.txt"),
1907            "function connect() {\n    \
1908             // open a database connection to the primary store\n    \
1909             // database connection pool config goes here\n}\n",
1910        )
1911        .unwrap();
1912
1913        // Index from parent so .ck lives at parent root and covers both subdirs.
1914        let index_options = SearchOptions {
1915            mode: SearchMode::Semantic,
1916            query: "database connection".to_string(),
1917            path: parent.to_path_buf(),
1918            top_k: Some(20),
1919            threshold: Some(0.0),
1920            ..Default::default()
1921        };
1922        let _ = search(&index_options).await;
1923        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
1924
1925        // Now search scoped to `scoped/` with a small top_k. The bug:
1926        // the 3 global top results all live in `noisy/`, so the path
1927        // filter rejects all of them and we get [].
1928        let scoped_options = SearchOptions {
1929            mode: SearchMode::Semantic,
1930            query: "database connection".to_string(),
1931            path: scoped.clone(),
1932            top_k: Some(3),
1933            threshold: Some(0.0),
1934            ..Default::default()
1935        };
1936
1937        let results = search(&scoped_options).await.unwrap();
1938
1939        assert!(
1940            !results.is_empty(),
1941            "Scoped search returned zero results — top_k was applied \
1942             before the path filter (the bug this test guards against)."
1943        );
1944        let all_in_scope = results.iter().all(|r| {
1945            r.file.starts_with(&scoped)
1946                || r.file.canonicalize().ok() == scoped.join("target.txt").canonicalize().ok()
1947        });
1948        assert!(
1949            all_in_scope,
1950            "Some results leaked out of the requested scope: {:?}",
1951            results.iter().map(|r| &r.file).collect::<Vec<_>>()
1952        );
1953    }
1954}
ck_engine/lib.rs

ck_engine/
lib.rs