ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, IncludePattern, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use rayon::prelude::*;
5use regex::{Regex, RegexBuilder};
6use std::collections::HashMap;
7use std::fs;
8use std::path::PathBuf as StdPathBuf;
9use std::path::{Path, PathBuf};
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{STORED, Schema, TEXT, Value};
13use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
14use walkdir::WalkDir;
15
16mod semantic_v3;
17pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
18
/// Boxed callback handed to the search entry points; it is invoked with a
/// `&str` (presumably a human-readable status message — confirm against the
/// search implementations). `Send + Sync` so it may be shared across threads.
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed callback forwarded to the index-update step; it is invoked with a
/// `&str` (presumably a human-readable status message — confirm against
/// `ck_index`).
pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed callback forwarded to the index-update step; it is invoked with a
/// structured `ck_index::EmbeddingProgress` value instead of text.
pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
22
23/// Resolve the actual file path to read content from
24/// For PDFs: returns cache path and validates it exists
25/// For regular files: returns original path
26fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
27    if ck_core::pdf::is_pdf_file(file_path) {
28        // PDFs: Read from cached extracted text
29        let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
30        if !cache_path.exists() {
31            return Err(anyhow::anyhow!(
32                "PDF not preprocessed. Run 'ck --index' first."
33            ));
34        }
35        Ok(cache_path)
36    } else {
37        // Regular files: Read from original source
38        Ok(file_path.to_path_buf())
39    }
40}
41
42/// Read content from file for search result extraction
43/// Regular files: read directly from source
44/// PDFs: read from preprocessed cache
45fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
46    let content_path = resolve_content_path(file_path, repo_root)?;
47    Ok(fs::read_to_string(content_path)?)
48}
49
50/// Extract content from a file using a span (streaming version)
51async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
52    // Find repo root to locate cache
53    let repo_root = find_nearest_index_root(file_path)
54        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
55
56    // Use centralized path resolution
57    let content_path = resolve_content_path(file_path, &repo_root)?;
58
59    // Stream only the needed lines
60    extract_lines_from_file(&content_path, span.line_start, span.line_end)
61}
62
63/// Stream-read specific lines from a file without loading the entire content
64fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
65    use std::io::{BufRead, BufReader};
66
67    if line_start == 0 {
68        return Ok(String::new());
69    }
70
71    let file = fs::File::open(file_path)?;
72    let reader = BufReader::new(file);
73    let mut result = Vec::new();
74
75    // Convert to 0-based indexing
76    let start_idx = line_start.saturating_sub(1);
77    let end_idx = line_end.saturating_sub(1);
78
79    for (current_line, line_result) in reader.lines().enumerate() {
80        if current_line > end_idx {
81            break; // Stop reading once we've passed the needed lines
82        }
83
84        let line = line_result?;
85
86        if current_line >= start_idx {
87            result.push(line);
88        }
89    }
90
91    // Handle case where requested lines exceed file length
92    if result.is_empty() && line_start > 0 {
93        return Ok(String::new());
94    }
95
96    Ok(result.join("\n"))
97}
98
/// Break `content` into lines and report, for each line, how many bytes its
/// terminator occupied.
///
/// Recognised terminators are `\n` (1 byte), `\r\n` (2 bytes) and a bare `\r`
/// (1 byte). The two returned vectors have the same length; a final line
/// without a terminator is reported with an ending length of 0, and content
/// that ends in a terminator does not produce a trailing empty line.
fn split_lines_with_endings(content: &str) -> (Vec<String>, Vec<usize>) {
    let mut lines = Vec::new();
    let mut endings = Vec::new();

    // Unconsumed tail of the input; shrinks by one line per iteration.
    let mut rest = content;

    while let Some(pos) = rest.find(|c: char| c == '\n' || c == '\r') {
        // A carriage return directly followed by a line feed is one terminator.
        let terminator_len = if rest[pos..].starts_with("\r\n") { 2 } else { 1 };

        lines.push(rest[..pos].to_string());
        endings.push(terminator_len);
        rest = &rest[pos + terminator_len..];
    }

    // Whatever is left is a final line that had no terminator.
    if !rest.is_empty() {
        lines.push(rest.to_string());
        endings.push(0);
    }

    (lines, endings)
}
141
/// Produce the path form that is compared against include patterns.
///
/// The filesystem-canonical path is preferred. If canonicalization fails, an
/// absolute path is returned unchanged, and a relative path is joined onto
/// the current working directory (or returned as-is when the working
/// directory cannot be determined).
fn canonicalize_for_matching(path: &Path) -> PathBuf {
    match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) if path.is_absolute() => path.to_path_buf(),
        Err(_) => match std::env::current_dir() {
            Ok(cwd) => cwd.join(path),
            Err(_) => path.to_path_buf(),
        },
    }
}
155
156fn path_matches_include(path: &Path, include_patterns: &[IncludePattern]) -> bool {
157    if include_patterns.is_empty() {
158        return true;
159    }
160
161    let candidate = canonicalize_for_matching(path);
162    include_patterns.iter().any(|pattern| {
163        if pattern.is_dir {
164            candidate.starts_with(&pattern.path)
165        } else {
166            candidate == pattern.path
167        }
168    })
169}
170
171fn filter_files_by_include(
172    files: Vec<PathBuf>,
173    include_patterns: &[IncludePattern],
174) -> Vec<PathBuf> {
175    if include_patterns.is_empty() {
176        return files;
177    }
178
179    files
180        .into_iter()
181        .filter(|path| path_matches_include(path, include_patterns))
182        .collect()
183}
184
/// Walk upwards from `path` and return the first directory that contains a
/// `.ck` entry, or `None` if no ancestor has one.
///
/// When `path` is an existing file the walk starts at its parent directory;
/// otherwise it starts at `path` itself.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = match path.parent() {
        Some(parent) if path.is_file() => parent,
        _ => path,
    };

    // `ancestors` yields `start`, then its parent, and so on up to the root.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
201
202#[derive(Clone, Debug)]
203pub struct ResolvedModel {
204    pub alias: String,
205    pub config: ck_models::ModelConfig,
206}
207
208impl ResolvedModel {
209    pub fn canonical_name(&self) -> &str {
210        self.config.name.as_str()
211    }
212
213    pub fn dimensions(&self) -> usize {
214        self.config.dimensions
215    }
216}
217
218fn legacy_model_config(name: &str, dimensions: usize) -> ck_models::ModelConfig {
219    ck_models::ModelConfig {
220        name: name.to_string(),
221        provider: "fastembed".to_string(),
222        dimensions,
223        max_tokens: 8192,
224        description: "Legacy ck embedding model preserved for backwards compatibility".to_string(),
225    }
226}
227
228pub(crate) fn resolve_model_from_root(
229    index_root: &Path,
230    cli_model: Option<&str>,
231) -> Result<ResolvedModel> {
232    use ck_models::ModelRegistry;
233
234    let registry = ModelRegistry::default();
235    let index_dir = index_root.join(".ck");
236    let manifest_path = index_dir.join("manifest.json");
237
238    if manifest_path.exists() {
239        let data = std::fs::read(&manifest_path)?;
240        let manifest: ck_index::IndexManifest = serde_json::from_slice(&data)?;
241
242        if let Some(existing_model) = manifest.embedding_model {
243            let dims_hint = manifest.embedding_dimensions.unwrap_or(384);
244            let resolved_existing = match registry.resolve(Some(existing_model.as_str())) {
245                Ok((alias, config)) => ResolvedModel { alias, config },
246                Err(_) => ResolvedModel {
247                    alias: existing_model.clone(),
248                    config: legacy_model_config(&existing_model, dims_hint),
249                },
250            };
251
252            if let Some(requested) = cli_model {
253                let (requested_alias, requested_config) = registry
254                    .resolve(Some(requested))
255                    .map_err(|e| CkError::Embedding(e.to_string()))?;
256
257                if requested_config.name != resolved_existing.config.name {
258                    let suggested_alias = resolved_existing.alias.clone();
259                    return Err(CkError::Embedding(format!(
260                        "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
261                        resolved_existing.config.name,
262                        suggested_alias,
263                        requested,
264                        requested,
265                        suggested_alias
266                    ))
267                    .into());
268                }
269
270                return Ok(ResolvedModel {
271                    alias: requested_alias,
272                    config: requested_config,
273                });
274            }
275
276            return Ok(resolved_existing);
277        }
278    }
279
280    let (alias, config) = registry
281        .resolve(cli_model)
282        .map_err(|e| CkError::Embedding(e.to_string()))?;
283
284    Ok(ResolvedModel { alias, config })
285}
286
287pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
288    let index_root = find_nearest_index_root(path).unwrap_or_else(|| {
289        if path.is_file() {
290            path.parent().unwrap_or(path).to_path_buf()
291        } else {
292            path.to_path_buf()
293        }
294    });
295    resolve_model_from_root(&index_root, cli_model)
296}
297
298pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
299    let results = search_enhanced(options).await?;
300    Ok(results.matches)
301}
302
303pub async fn search_with_progress(
304    options: &SearchOptions,
305    progress_callback: Option<SearchProgressCallback>,
306) -> Result<Vec<SearchResult>> {
307    let results = search_enhanced_with_progress(options, progress_callback).await?;
308    Ok(results.matches)
309}
310
311/// Enhanced search that includes near-miss information for threshold queries
312pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
313    search_enhanced_with_progress(options, None).await
314}
315
316/// Enhanced search with progress callback that includes near-miss information
317pub async fn search_enhanced_with_progress(
318    options: &SearchOptions,
319    progress_callback: Option<SearchProgressCallback>,
320) -> Result<ck_core::SearchResults> {
321    search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
322}
323
324/// Enhanced search with both search and indexing progress callbacks
325pub async fn search_enhanced_with_indexing_progress(
326    options: &SearchOptions,
327    progress_callback: Option<SearchProgressCallback>,
328    indexing_progress_callback: Option<IndexingProgressCallback>,
329    detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
330) -> Result<ck_core::SearchResults> {
331    // Validate that the search path exists
332    if !options.path.exists() {
333        return Err(ck_core::CkError::Search(format!(
334            "Path does not exist: {}",
335            options.path.display()
336        ))
337        .into());
338    }
339
340    // Auto-update index if needed (unless it's regex-only mode)
341    if !matches!(options.mode, SearchMode::Regex) {
342        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
343        let file_options = ck_core::FileCollectionOptions::from(options);
344        ensure_index_updated_with_progress(
345            &options.path,
346            options.reindex,
347            need_embeddings,
348            indexing_progress_callback,
349            detailed_indexing_progress_callback,
350            &file_options,
351            options.embedding_model.as_deref(),
352        )
353        .await?;
354    }
355
356    let search_results = match options.mode {
357        SearchMode::Regex => {
358            let matches = regex_search(options)?;
359            ck_core::SearchResults {
360                matches,
361                closest_below_threshold: None,
362            }
363        }
364        SearchMode::Lexical => {
365            let matches = lexical_search(options).await?;
366            ck_core::SearchResults {
367                matches,
368                closest_below_threshold: None,
369            }
370        }
371        SearchMode::Semantic => {
372            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
373            semantic_search_v3_with_progress(options, progress_callback).await?
374        }
375        SearchMode::Hybrid => {
376            let matches = hybrid_search_with_progress(options, progress_callback).await?;
377            ck_core::SearchResults {
378                matches,
379                closest_below_threshold: None,
380            }
381        }
382    };
383
384    Ok(search_results)
385}
386
387fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
388    let pattern = if options.fixed_string {
389        regex::escape(&options.query)
390    } else if options.whole_word {
391        format!(r"\b{}\b", regex::escape(&options.query))
392    } else {
393        options.query.clone()
394    };
395
396    let regex = RegexBuilder::new(&pattern)
397        .case_insensitive(options.case_insensitive)
398        .build()
399        .map_err(CkError::Regex)?;
400
401    // Default to recursive for directories (like grep) to maintain compatibility
402    let should_recurse = options.path.is_dir() || options.recursive;
403    let files = if should_recurse {
404        // Use ck_index's collect_files which respects gitignore
405        let file_options = ck_core::FileCollectionOptions {
406            respect_gitignore: options.respect_gitignore,
407            use_ckignore: options.use_ckignore,
408            exclude_patterns: options.exclude_patterns.clone(),
409        };
410        let collected = ck_index::collect_files(&options.path, &file_options)?;
411        filter_files_by_include(collected, &options.include_patterns)
412    } else {
413        // For non-recursive, use the local collect_files
414        let collected = collect_files(&options.path, should_recurse, &options.exclude_patterns)?;
415        filter_files_by_include(collected, &options.include_patterns)
416    };
417
418    let results: Vec<Vec<SearchResult>> = files
419        .par_iter()
420        .filter_map(|file_path| match search_file(&regex, file_path, options) {
421            Ok(matches) => {
422                if matches.is_empty() {
423                    None
424                } else {
425                    Some(matches)
426                }
427            }
428            Err(e) => {
429                tracing::debug!("Error searching {:?}: {}", file_path, e);
430                None
431            }
432        })
433        .collect();
434
435    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
436    // Deterministic ordering: file path, then line number
437    all_results.sort_by(|a, b| {
438        let path_cmp = a.file.cmp(&b.file);
439        if path_cmp != std::cmp::Ordering::Equal {
440            return path_cmp;
441        }
442        a.span.line_start.cmp(&b.span.line_start)
443    });
444
445    if let Some(top_k) = options.top_k {
446        all_results.truncate(top_k);
447    }
448
449    Ok(all_results)
450}
451
452fn search_file(
453    regex: &Regex,
454    file_path: &Path,
455    options: &SearchOptions,
456) -> Result<Vec<SearchResult>> {
457    // Find repo root to locate cache
458    let repo_root = find_nearest_index_root(file_path)
459        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
460
461    // For full_section mode, we need the entire content for parsing
462    // For context previews, we need all lines for surrounding context
463    // So we'll load content when needed, but optimize for the common case
464    if options.full_section || options.context_lines > 0 {
465        // Load full content when we need section parsing or context
466        let content = read_file_content(file_path, &repo_root)?;
467        let (lines, line_ending_lengths) = split_lines_with_endings(&content);
468
469        // If full_section is enabled, try to parse the file and find code sections
470        let code_sections = if options.full_section {
471            extract_code_sections(file_path, &content)
472        } else {
473            None
474        };
475
476        search_file_in_memory(
477            regex,
478            file_path,
479            options,
480            &lines,
481            &code_sections,
482            &line_ending_lengths,
483        )
484    } else {
485        // Streaming search (simple case)
486        search_file_streaming(regex, file_path, &repo_root, options)
487    }
488}
489
490/// In-memory search for cases requiring context or code sections
491fn search_file_in_memory(
492    regex: &Regex,
493    file_path: &Path,
494    options: &SearchOptions,
495    lines: &[String],
496    code_sections: &Option<Vec<(usize, usize, String)>>,
497    line_ending_lengths: &[usize],
498) -> Result<Vec<SearchResult>> {
499    let mut results = Vec::new();
500    let mut byte_offset = 0;
501
502    for (line_idx, line) in lines.iter().enumerate() {
503        let line_number = line_idx + 1;
504
505        // Special handling for empty pattern - match the entire line once
506        // An empty regex pattern will match at every position, so we need to handle it specially
507        if regex.as_str().is_empty() {
508            // Empty pattern matches the whole line once (grep compatibility)
509            let preview = if options.full_section {
510                // Try to find the containing code section
511                if let Some(sections) = code_sections {
512                    if let Some(section) = find_containing_section(sections, line_idx) {
513                        section.clone()
514                    } else {
515                        // Fall back to context lines if no section found
516                        get_context_preview(lines, line_idx, options)
517                    }
518                } else {
519                    get_context_preview(lines, line_idx, options)
520                }
521            } else {
522                get_context_preview(lines, line_idx, options)
523            };
524
525            results.push(SearchResult {
526                file: file_path.to_path_buf(),
527                span: Span {
528                    byte_start: byte_offset,
529                    byte_end: byte_offset + line.len(),
530                    line_start: line_number,
531                    line_end: line_number,
532                },
533                score: 1.0,
534                preview,
535                lang: ck_core::Language::from_path(file_path),
536                symbol: None,
537                chunk_hash: None,
538                index_epoch: None,
539            });
540        } else {
541            // Find all matches in the line with their positions
542            for mat in regex.find_iter(line) {
543                let preview = if options.full_section {
544                    // Try to find the containing code section
545                    if let Some(sections) = code_sections {
546                        if let Some(section) = find_containing_section(sections, line_idx) {
547                            section.clone()
548                        } else {
549                            // Fall back to context lines if no section found
550                            get_context_preview(lines, line_idx, options)
551                        }
552                    } else {
553                        get_context_preview(lines, line_idx, options)
554                    }
555                } else {
556                    get_context_preview(lines, line_idx, options)
557                };
558
559                results.push(SearchResult {
560                    file: file_path.to_path_buf(),
561                    span: Span {
562                        byte_start: byte_offset + mat.start(),
563                        byte_end: byte_offset + mat.end(),
564                        line_start: line_number,
565                        line_end: line_number,
566                    },
567                    score: 1.0,
568                    preview,
569                    lang: ck_core::Language::from_path(file_path),
570                    symbol: None,
571                    chunk_hash: None,
572                    index_epoch: None,
573                });
574            }
575        }
576
577        // Update byte offset for next line (add line length + actual line ending length)
578        byte_offset += line.len();
579        byte_offset += line_ending_lengths.get(line_idx).copied().unwrap_or(0);
580    }
581
582    Ok(results)
583}
584
585/// Streaming search for simple cases without context or code sections
586fn search_file_streaming(
587    regex: &Regex,
588    file_path: &Path,
589    repo_root: &Path,
590    _options: &SearchOptions,
591) -> Result<Vec<SearchResult>> {
592    use std::io::{BufRead, BufReader};
593
594    let content_path = resolve_content_path(file_path, repo_root)?;
595    let file = std::fs::File::open(&content_path)?;
596    let mut reader = BufReader::new(file);
597
598    let mut results = Vec::new();
599    let mut line = String::new();
600    let mut byte_offset = 0usize;
601    let mut line_number = 1usize;
602
603    loop {
604        line.clear();
605        let bytes_read = reader.read_line(&mut line)?;
606        if bytes_read == 0 {
607            break;
608        }
609
610        // Determine the length of the trailing line ending (if any) and
611        // normalise the line buffer so it no longer contains newline bytes.
612        let mut newline_len = 0usize;
613        if line.ends_with("\r\n") {
614            line.pop(); // remove \n
615            line.pop(); // remove \r
616            newline_len = 2;
617        } else if line.ends_with(['\n', '\r']) {
618            line.pop();
619            newline_len = 1;
620        }
621
622        // Old Mac-style files may use bare carriage returns as separators.
623        // When the trimmed line still contains '\r' characters, treat them as
624        // record separators so the byte offsets remain accurate.
625        let treat_cr_as_newline = line.contains('\r');
626
627        if treat_cr_as_newline {
628            let bytes = line.as_bytes();
629            let mut segment_start = 0usize;
630            while segment_start <= bytes.len() {
631                match bytes[segment_start..].iter().position(|&b| b == b'\r') {
632                    Some(rel_idx) => {
633                        let idx = segment_start + rel_idx;
634                        let segment_bytes = &bytes[segment_start..idx];
635                        let segment_str = std::str::from_utf8(segment_bytes)?;
636                        process_streaming_line(
637                            regex,
638                            file_path,
639                            segment_str,
640                            line_number,
641                            byte_offset,
642                            &mut results,
643                        );
644                        byte_offset += segment_bytes.len() + 1; // account for \r
645                        line_number += 1;
646                        segment_start = idx + 1;
647                    }
648                    None => {
649                        let segment_bytes = &bytes[segment_start..];
650                        let segment_str = std::str::from_utf8(segment_bytes)?;
651                        process_streaming_line(
652                            regex,
653                            file_path,
654                            segment_str,
655                            line_number,
656                            byte_offset,
657                            &mut results,
658                        );
659                        byte_offset += segment_bytes.len();
660                        line_number += 1;
661                        break;
662                    }
663                }
664            }
665            byte_offset += newline_len;
666        } else {
667            let line_str = line.as_str();
668            process_streaming_line(
669                regex,
670                file_path,
671                line_str,
672                line_number,
673                byte_offset,
674                &mut results,
675            );
676            byte_offset += line_str.len() + newline_len;
677            line_number += 1;
678        }
679    }
680
681    Ok(results)
682}
683
684fn process_streaming_line(
685    regex: &Regex,
686    file_path: &Path,
687    line: &str,
688    line_number: usize,
689    byte_offset: usize,
690    results: &mut Vec<SearchResult>,
691) {
692    if regex.as_str().is_empty() {
693        results.push(SearchResult {
694            file: file_path.to_path_buf(),
695            span: Span {
696                byte_start: byte_offset,
697                byte_end: byte_offset + line.len(),
698                line_start: line_number,
699                line_end: line_number,
700            },
701            score: 1.0,
702            preview: line.to_string(),
703            lang: ck_core::Language::from_path(file_path),
704            symbol: None,
705            chunk_hash: None,
706            index_epoch: None,
707        });
708    } else {
709        for mat in regex.find_iter(line) {
710            results.push(SearchResult {
711                file: file_path.to_path_buf(),
712                span: Span {
713                    byte_start: byte_offset + mat.start(),
714                    byte_end: byte_offset + mat.end(),
715                    line_start: line_number,
716                    line_end: line_number,
717                },
718                score: 1.0,
719                preview: line.to_string(),
720                lang: ck_core::Language::from_path(file_path),
721                symbol: None,
722                chunk_hash: None,
723                index_epoch: None,
724            });
725        }
726    }
727}
728
729async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
730    // Handle both files and directories and reuse nearest existing .ck index up the tree
731    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
732        if options.path.is_file() {
733            options.path.parent().unwrap_or(&options.path).to_path_buf()
734        } else {
735            options.path.clone()
736        }
737    });
738
739    let index_dir = index_root.join(".ck");
740    if !index_dir.exists() {
741        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
742    }
743
744    let tantivy_index_path = index_dir.join("tantivy_index");
745
746    if !tantivy_index_path.exists() {
747        return build_tantivy_index(options).await;
748    }
749
750    let mut schema_builder = Schema::builder();
751    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
752    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
753    let _schema = schema_builder.build();
754
755    let index = Index::open_in_dir(&tantivy_index_path)
756        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
757
758    let reader = index
759        .reader_builder()
760        .reload_policy(ReloadPolicy::OnCommitWithDelay)
761        .try_into()
762        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
763
764    let searcher = reader.searcher();
765    let query_parser = QueryParser::for_index(&index, vec![content_field]);
766
767    let query = query_parser
768        .parse_query(&options.query)
769        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
770
771    let top_docs = if let Some(top_k) = options.top_k {
772        searcher.search(&query, &TopDocs::with_limit(top_k))?
773    } else {
774        searcher.search(&query, &TopDocs::with_limit(100))?
775    };
776
777    // First, collect all results with raw scores
778    let mut raw_results = Vec::new();
779    for (_score, doc_address) in top_docs {
780        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
781        let path_text = retrieved_doc
782            .get_first(path_field)
783            .map(|field_value| field_value.as_str().unwrap_or(""))
784            .unwrap_or("");
785        let content_text = retrieved_doc
786            .get_first(content_field)
787            .map(|field_value| field_value.as_str().unwrap_or(""))
788            .unwrap_or("");
789
790        let file_path = PathBuf::from(path_text);
791        if !path_matches_include(&file_path, &options.include_patterns) {
792            continue;
793        }
794        let preview = if options.full_section {
795            content_text.to_string()
796        } else {
797            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
798        };
799
800        raw_results.push((
801            _score,
802            SearchResult {
803                file: file_path,
804                span: Span {
805                    byte_start: 0,
806                    byte_end: content_text.len(),
807                    line_start: 1,
808                    line_end: content_text.lines().count(),
809                },
810                score: _score,
811                preview,
812                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
813                symbol: None,
814                chunk_hash: None,
815                index_epoch: None,
816            },
817        ));
818    }
819
820    // Normalize scores to 0-1 range and apply threshold
821    let mut results = Vec::new();
822    if !raw_results.is_empty() {
823        let max_score = raw_results
824            .iter()
825            .map(|(score, _)| *score)
826            .fold(0.0f32, f32::max);
827        if max_score > 0.0 {
828            for (raw_score, mut result) in raw_results {
829                let normalized_score = raw_score / max_score;
830
831                // Apply threshold filtering with normalized score
832                if let Some(threshold) = options.threshold
833                    && normalized_score < threshold
834                {
835                    continue;
836                }
837
838                result.score = normalized_score;
839                results.push(result);
840            }
841        }
842    }
843
844    Ok(results)
845}
846
847async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
848    // Handle both files and directories by finding the appropriate directory for indexing
849    let index_root = if options.path.is_file() {
850        options.path.parent().unwrap_or(&options.path)
851    } else {
852        &options.path
853    };
854
855    let index_dir = index_root.join(".ck");
856    let tantivy_index_path = index_dir.join("tantivy_index");
857
858    fs::create_dir_all(&tantivy_index_path)?;
859
860    let mut schema_builder = Schema::builder();
861    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
862    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
863    let schema = schema_builder.build();
864
865    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
866        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
867
868    let mut index_writer = index
869        .writer(50_000_000)
870        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
871
872    let files = filter_files_by_include(
873        collect_files(index_root, true, &options.exclude_patterns)?,
874        &options.include_patterns,
875    );
876
877    for file_path in &files {
878        if let Ok(content) = fs::read_to_string(file_path) {
879            let doc = doc!(
880                content_field => content,
881                path_field => file_path.display().to_string()
882            );
883            index_writer.add_document(doc)?;
884        }
885    }
886
887    index_writer
888        .commit()
889        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
890
891    // After building, search again with the same options
892    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
893    let mut schema_builder = Schema::builder();
894    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
895    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
896    let _schema = schema_builder.build();
897
898    let index = Index::open_in_dir(&tantivy_index_path)
899        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
900
901    let reader = index
902        .reader_builder()
903        .reload_policy(ReloadPolicy::OnCommitWithDelay)
904        .try_into()
905        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
906
907    let searcher = reader.searcher();
908    let query_parser = QueryParser::for_index(&index, vec![content_field]);
909
910    let query = query_parser
911        .parse_query(&options.query)
912        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
913
914    let top_docs = if let Some(top_k) = options.top_k {
915        searcher.search(&query, &TopDocs::with_limit(top_k))?
916    } else {
917        searcher.search(&query, &TopDocs::with_limit(100))?
918    };
919
920    // First, collect all results with raw scores
921    let mut raw_results = Vec::new();
922    for (_score, doc_address) in top_docs {
923        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
924        let path_text = retrieved_doc
925            .get_first(path_field)
926            .map(|field_value| field_value.as_str().unwrap_or(""))
927            .unwrap_or("");
928        let content_text = retrieved_doc
929            .get_first(content_field)
930            .map(|field_value| field_value.as_str().unwrap_or(""))
931            .unwrap_or("");
932
933        let file_path = PathBuf::from(path_text);
934        let preview = if options.full_section {
935            content_text.to_string()
936        } else {
937            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
938        };
939
940        raw_results.push((
941            _score,
942            SearchResult {
943                file: file_path,
944                span: Span {
945                    byte_start: 0,
946                    byte_end: content_text.len(),
947                    line_start: 1,
948                    line_end: content_text.lines().count(),
949                },
950                score: _score,
951                preview,
952                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
953                symbol: None,
954                chunk_hash: None,
955                index_epoch: None,
956            },
957        ));
958    }
959
960    // Normalize scores to 0-1 range and apply threshold
961    let mut results = Vec::new();
962    if !raw_results.is_empty() {
963        let max_score = raw_results
964            .iter()
965            .map(|(score, _)| *score)
966            .fold(0.0f32, f32::max);
967        if max_score > 0.0 {
968            for (raw_score, mut result) in raw_results {
969                let normalized_score = raw_score / max_score;
970
971                // Apply threshold filtering with normalized score
972                if let Some(threshold) = options.threshold
973                    && normalized_score < threshold
974                {
975                    continue;
976                }
977
978                result.score = normalized_score;
979                results.push(result);
980            }
981        }
982    }
983
984    Ok(results)
985}
986
/// Runs a hybrid search without progress reporting.
///
/// Thin convenience wrapper: forwards `options` to `hybrid_search_with_progress` with no
/// progress callback and returns its result unchanged.
// Kept although nothing in view calls it, hence the dead-code allowance.
#[allow(dead_code)]
async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    hybrid_search_with_progress(options, None).await
}
991
992async fn hybrid_search_with_progress(
993    options: &SearchOptions,
994    progress_callback: Option<SearchProgressCallback>,
995) -> Result<Vec<SearchResult>> {
996    if let Some(ref callback) = progress_callback {
997        callback("Running regex search...");
998    }
999    let regex_results = regex_search(options)?;
1000
1001    if let Some(ref callback) = progress_callback {
1002        callback("Running semantic search...");
1003    }
1004    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
1005
1006    let mut combined = HashMap::new();
1007
1008    for (rank, result) in regex_results.iter().enumerate() {
1009        let key = format!("{}:{}", result.file.display(), result.span.line_start);
1010        combined
1011            .entry(key)
1012            .or_insert(Vec::new())
1013            .push((rank + 1, result.clone()));
1014    }
1015
1016    for (rank, result) in semantic_results.matches.iter().enumerate() {
1017        let key = format!("{}:{}", result.file.display(), result.span.line_start);
1018        combined
1019            .entry(key)
1020            .or_insert(Vec::new())
1021            .push((rank + 1, result.clone()));
1022    }
1023
1024    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
1025    let mut rrf_results: Vec<SearchResult> = combined
1026        .into_values()
1027        .map(|ranks| {
1028            let mut result = ranks[0].1.clone();
1029            let rrf_score = ranks
1030                .iter()
1031                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
1032                .sum();
1033            result.score = rrf_score;
1034            result
1035        })
1036        .filter(|result| {
1037            // Apply threshold filtering to raw RRF scores
1038            if let Some(threshold) = options.threshold {
1039                result.score >= threshold
1040            } else {
1041                true
1042            }
1043        })
1044        .collect();
1045
1046    rrf_results.retain(|result| path_matches_include(&result.file, &options.include_patterns));
1047
1048    // Sort by RRF score (highest first)
1049    rrf_results.sort_by(|a, b| {
1050        b.score
1051            .partial_cmp(&a.score)
1052            .unwrap_or(std::cmp::Ordering::Equal)
1053    });
1054
1055    if let Some(top_k) = options.top_k {
1056        rrf_results.truncate(top_k);
1057    }
1058
1059    Ok(rrf_results)
1060}
1061
1062fn build_globset(patterns: &[String]) -> GlobSet {
1063    let mut builder = GlobSetBuilder::new();
1064    for pat in patterns {
1065        // Treat patterns as filename or directory globs
1066        if let Ok(glob) = Glob::new(pat) {
1067            builder.add(glob);
1068        }
1069    }
1070    builder.build().unwrap_or_else(|_| GlobSet::empty())
1071}
1072
1073fn should_exclude_path(path: &Path, globset: &GlobSet) -> bool {
1074    // Match against each path component and the full path
1075    if globset.is_match(path) {
1076        return true;
1077    }
1078    for component in path.components() {
1079        if let std::path::Component::Normal(name) = component
1080            && globset.is_match(name)
1081        {
1082            return true;
1083        }
1084    }
1085    false
1086}
1087
1088fn collect_files(
1089    path: &Path,
1090    recursive: bool,
1091    exclude_patterns: &[String],
1092) -> Result<Vec<PathBuf>> {
1093    let mut files = Vec::new();
1094    let globset = build_globset(exclude_patterns);
1095
1096    if path.is_file() {
1097        // Always add single files, even if they're excluded (user explicitly requested)
1098        files.push(path.to_path_buf());
1099    } else if recursive {
1100        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
1101            // Skip excluded directories entirely for efficiency
1102            let name = e.file_name();
1103            !globset.is_match(e.path()) && !globset.is_match(name)
1104        }) {
1105            match entry {
1106                Ok(entry) => {
1107                    if entry.file_type().is_file() && !should_exclude_path(entry.path(), &globset) {
1108                        files.push(entry.path().to_path_buf());
1109                    }
1110                }
1111                Err(e) => {
1112                    // Log directory traversal errors but continue processing
1113                    tracing::debug!("Skipping path due to error: {}", e);
1114                    continue;
1115                }
1116            }
1117        }
1118    } else {
1119        match fs::read_dir(path) {
1120            Ok(read_dir) => {
1121                for entry in read_dir {
1122                    match entry {
1123                        Ok(entry) => {
1124                            let path = entry.path();
1125                            if path.is_file() && !should_exclude_path(&path, &globset) {
1126                                files.push(path);
1127                            }
1128                        }
1129                        Err(e) => {
1130                            tracing::debug!("Skipping directory entry due to error: {}", e);
1131                            continue;
1132                        }
1133                    }
1134                }
1135            }
1136            Err(e) => {
1137                tracing::debug!("Cannot read directory {:?}: {}", path, e);
1138                return Err(e.into());
1139            }
1140        }
1141    }
1142
1143    Ok(files)
1144}
1145
1146async fn ensure_index_updated_with_progress(
1147    path: &Path,
1148    force_reindex: bool,
1149    need_embeddings: bool,
1150    progress_callback: Option<ck_index::ProgressCallback>,
1151    detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
1152    file_options: &ck_core::FileCollectionOptions,
1153    model_override: Option<&str>,
1154) -> Result<()> {
1155    // Find index root for .ck directory location
1156    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
1157        if path.is_file() {
1158            path.parent().unwrap_or(path).to_path_buf()
1159        } else {
1160            path.to_path_buf()
1161        }
1162    });
1163    let index_root = &index_root_buf;
1164
1165    // Pass the original path to indexing function so it can index just that file/directory
1166    // The indexing function will use collect_files() which now handles individual files correctly
1167    if force_reindex {
1168        let stats = ck_index::smart_update_index_with_detailed_progress(
1169            index_root,
1170            true,
1171            progress_callback,
1172            detailed_progress_callback,
1173            need_embeddings,
1174            file_options,
1175            model_override,
1176        )
1177        .await?;
1178        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1179            tracing::info!(
1180                "Index updated: {} files indexed, {} orphaned files removed",
1181                stats.files_indexed,
1182                stats.orphaned_files_removed
1183            );
1184        }
1185        return Ok(());
1186    }
1187
1188    // For incremental updates with individual files, we need special handling
1189    // to ensure only the specific file is indexed, not the entire directory
1190    if path.is_file() {
1191        // Index just this one file
1192        use ck_index::index_file;
1193        index_file(path, need_embeddings).await?;
1194    } else {
1195        // For directories, use the standard smart update
1196        let stats = ck_index::smart_update_index_with_detailed_progress(
1197            index_root,
1198            false,
1199            progress_callback,
1200            detailed_progress_callback,
1201            need_embeddings,
1202            file_options,
1203            model_override,
1204        )
1205        .await?;
1206        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1207            tracing::info!(
1208                "Index updated: {} files indexed, {} orphaned files removed",
1209                stats.files_indexed,
1210                stats.orphaned_files_removed
1211            );
1212        }
1213    }
1214
1215    Ok(())
1216}
1217
1218fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
1219    let before = options.before_context_lines.max(options.context_lines);
1220    let after = options.after_context_lines.max(options.context_lines);
1221
1222    if before > 0 || after > 0 {
1223        let start_idx = line_idx.saturating_sub(before);
1224        let end_idx = (line_idx + after + 1).min(lines.len());
1225        lines[start_idx..end_idx].join("\n")
1226    } else {
1227        lines[line_idx].to_string()
1228    }
1229}
1230
1231fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1232    let lang = ck_core::Language::from_path(file_path)?;
1233
1234    // Parse the file with tree-sitter and extract function/class sections
1235    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1236        let sections: Vec<(usize, usize, String)> = chunks
1237            .into_iter()
1238            .filter(|chunk| {
1239                matches!(
1240                    chunk.chunk_type,
1241                    ck_chunk::ChunkType::Function
1242                        | ck_chunk::ChunkType::Class
1243                        | ck_chunk::ChunkType::Method
1244                )
1245            })
1246            .map(|chunk| {
1247                (
1248                    chunk.span.line_start - 1, // Convert to 0-based index
1249                    chunk.span.line_end - 1,
1250                    chunk.text,
1251                )
1252            })
1253            .collect();
1254
1255        if sections.is_empty() {
1256            None
1257        } else {
1258            Some(sections)
1259        }
1260    } else {
1261        None
1262    }
1263}
1264
/// Looks up the section whose inclusive line range contains `line_idx`.
///
/// `sections` holds `(first_line, last_line, text)` triples. They are scanned in order and
/// the text of the first one with `first_line <= line_idx <= last_line` is returned, or
/// `None` when no range covers the line.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first_line, last_line, _)| (*first_line..=*last_line).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1276
1277#[cfg(test)]
1278mod tests {
1279    use super::*;
1280    use std::fs;
1281    use tempfile::TempDir;
1282
1283    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
1284        let files = vec![
1285            ("test1.txt", "hello world rust programming"),
1286            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
1287            ("test3.py", "print('Hello Python')"),
1288            ("test4.txt", "machine learning artificial intelligence"),
1289        ];
1290
1291        let mut paths = Vec::new();
1292        for (name, content) in files {
1293            let path = dir.join(name);
1294            fs::write(&path, content).unwrap();
1295            paths.push(path);
1296        }
1297        paths
1298    }
1299
1300    #[test]
1301    fn test_extract_lines_from_file() {
1302        let temp_dir = TempDir::new().unwrap();
1303        let test_file = temp_dir.path().join("test_lines.txt");
1304
1305        // Create a multi-line test file
1306        let content =
1307            "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10";
1308        fs::write(&test_file, content).unwrap();
1309
1310        // Test extracting lines 3-5 (1-based indexing)
1311        let result = extract_lines_from_file(&test_file, 3, 5).unwrap();
1312        assert_eq!(result, "Line 3\nLine 4\nLine 5");
1313
1314        // Test extracting a single line
1315        let result = extract_lines_from_file(&test_file, 7, 7).unwrap();
1316        assert_eq!(result, "Line 7");
1317
1318        // Test extracting from line 8 to end
1319        let result = extract_lines_from_file(&test_file, 8, 100).unwrap();
1320        assert_eq!(result, "Line 8\nLine 9\nLine 10");
1321
1322        // Test line_start == 0 (should return empty)
1323        let result = extract_lines_from_file(&test_file, 0, 5).unwrap();
1324        assert_eq!(result, "");
1325
1326        // Test line_start > file length (should return empty)
1327        let result = extract_lines_from_file(&test_file, 20, 25).unwrap();
1328        assert_eq!(result, "");
1329    }
1330
1331    #[tokio::test]
1332    async fn test_extract_content_from_span() {
1333        let temp_dir = TempDir::new().unwrap();
1334        let test_file = temp_dir.path().join("code.rs");
1335
1336        // Create a multi-line code file
1337        let content = "fn first() {\n    println!(\"First\");\n}\n\nfn second() {\n    println!(\"Second\");\n}\n\nfn third() {\n    println!(\"Third\");\n}";
1338        fs::write(&test_file, content).unwrap();
1339
1340        // Test extracting the second function (lines 5-7)
1341        let span = ck_core::Span {
1342            byte_start: 0, // Not used in line extraction
1343            byte_end: 0,   // Not used in line extraction
1344            line_start: 5,
1345            line_end: 7,
1346        };
1347
1348        let result = extract_content_from_span(&test_file, &span).await.unwrap();
1349        assert_eq!(result, "fn second() {\n    println!(\"Second\");\n}");
1350
1351        // Test extracting a single line
1352        let span = ck_core::Span {
1353            byte_start: 0,
1354            byte_end: 0,
1355            line_start: 2,
1356            line_end: 2,
1357        };
1358
1359        let result = extract_content_from_span(&test_file, &span).await.unwrap();
1360        assert_eq!(result, "    println!(\"First\");");
1361    }
1362
1363    #[test]
1364    fn test_collect_files() {
1365        let temp_dir = TempDir::new().unwrap();
1366        let test_files = create_test_files(temp_dir.path());
1367
1368        // Test non-recursive
1369        let files = collect_files(temp_dir.path(), false, &[]).unwrap();
1370        assert_eq!(files.len(), 4);
1371
1372        // Test recursive
1373        let files = collect_files(temp_dir.path(), true, &[]).unwrap();
1374        assert_eq!(files.len(), 4);
1375
1376        // Test single file
1377        let files = collect_files(&test_files[0], false, &[]).unwrap();
1378        assert_eq!(files.len(), 1);
1379        assert_eq!(files[0], test_files[0]);
1380    }
1381
1382    #[test]
1383    fn test_regex_search() {
1384        let temp_dir = TempDir::new().unwrap();
1385        create_test_files(temp_dir.path());
1386
1387        let options = SearchOptions {
1388            mode: SearchMode::Regex,
1389            query: "rust".to_string(),
1390            path: temp_dir.path().to_path_buf(),
1391            recursive: true,
1392            ..Default::default()
1393        };
1394
1395        let results = regex_search(&options).unwrap();
1396        assert!(!results.is_empty());
1397
1398        // Should find matches in files containing "rust"
1399        let rust_matches: Vec<_> = results
1400            .iter()
1401            .filter(|r| r.preview.to_lowercase().contains("rust"))
1402            .collect();
1403        assert!(!rust_matches.is_empty());
1404    }
1405
1406    #[test]
1407    fn test_regex_search_case_insensitive() {
1408        let temp_dir = TempDir::new().unwrap();
1409        create_test_files(temp_dir.path());
1410
1411        let options = SearchOptions {
1412            mode: SearchMode::Regex,
1413            query: "HELLO".to_string(),
1414            path: temp_dir.path().to_path_buf(),
1415            recursive: true,
1416            case_insensitive: true,
1417            ..Default::default()
1418        };
1419
1420        let results = regex_search(&options).unwrap();
1421        assert!(!results.is_empty());
1422    }
1423
1424    #[test]
1425    fn test_regex_search_fixed_string() {
1426        let temp_dir = TempDir::new().unwrap();
1427        create_test_files(temp_dir.path());
1428
1429        let options = SearchOptions {
1430            mode: SearchMode::Regex,
1431            query: "fn main()".to_string(),
1432            path: temp_dir.path().to_path_buf(),
1433            recursive: true,
1434            fixed_string: true,
1435            ..Default::default()
1436        };
1437
1438        let results = regex_search(&options).unwrap();
1439        assert!(!results.is_empty());
1440    }
1441
1442    #[test]
1443    fn test_regex_search_whole_word() {
1444        let temp_dir = TempDir::new().unwrap();
1445        fs::write(
1446            temp_dir.path().join("word_test.txt"),
1447            "rust rusty rustacean",
1448        )
1449        .unwrap();
1450
1451        let options = SearchOptions {
1452            mode: SearchMode::Regex,
1453            query: "rust".to_string(),
1454            path: temp_dir.path().to_path_buf(),
1455            recursive: true,
1456            whole_word: true,
1457            ..Default::default()
1458        };
1459
1460        let results = regex_search(&options).unwrap();
1461        assert!(!results.is_empty());
1462        // Should only match "rust" as a whole word, not "rusty" or "rustacean"
1463    }
1464
1465    #[test]
1466    fn test_regex_search_top_k() {
1467        let temp_dir = TempDir::new().unwrap();
1468
1469        // Create multiple files with matches
1470        for i in 0..10 {
1471            fs::write(
1472                temp_dir.path().join(format!("file{}.txt", i)),
1473                "test content",
1474            )
1475            .unwrap();
1476        }
1477
1478        let options = SearchOptions {
1479            mode: SearchMode::Regex,
1480            query: "test".to_string(),
1481            path: temp_dir.path().to_path_buf(),
1482            recursive: true,
1483            top_k: Some(5),
1484            ..Default::default()
1485        };
1486
1487        let results = regex_search(&options).unwrap();
1488        assert!(results.len() <= 5);
1489    }
1490
1491    #[test]
1492    fn test_regex_search_span_offsets() {
1493        // Test that span offsets are correctly calculated for multiple matches on a line
1494        let temp_dir = TempDir::new().unwrap();
1495        let test_file = temp_dir.path().join("spans.txt");
1496        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();
1497
1498        let options = SearchOptions {
1499            mode: SearchMode::Regex,
1500            query: "test".to_string(),
1501            path: test_file.clone(),
1502            recursive: false,
1503            ..Default::default()
1504        };
1505
1506        let results = regex_search(&options).unwrap();
1507
1508        // Should find 5 matches total
1509        assert_eq!(results.len(), 5);
1510
1511        // Check first line has 3 matches with correct byte offsets
1512        let line1_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 1).collect();
1513        assert_eq!(line1_matches.len(), 3);
1514        assert_eq!(line1_matches[0].span.byte_start, 0);
1515        assert_eq!(line1_matches[1].span.byte_start, 5);
1516        assert_eq!(line1_matches[2].span.byte_start, 10);
1517
1518        // Check second line match
1519        let line2_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 2).collect();
1520        assert_eq!(line2_matches.len(), 1);
1521        assert_eq!(line2_matches[0].span.byte_start, 24); // "test test test\n" = 15 bytes, "line two " = 9 bytes
1522
1523        // Each match should have different byte offsets
1524        let mut byte_starts: Vec<_> = results.iter().map(|r| r.span.byte_start).collect();
1525        byte_starts.sort();
1526        byte_starts.dedup();
1527        assert_eq!(byte_starts.len(), 5); // All byte_starts should be unique
1528    }
1529
1530    #[test]
1531    fn test_search_file() {
1532        let temp_dir = TempDir::new().unwrap();
1533        let file_path = temp_dir.path().join("test.txt");
1534        fs::write(
1535            &file_path,
1536            "line 1: hello\nline 2: world\nline 3: rust programming",
1537        )
1538        .unwrap();
1539
1540        let regex = regex::Regex::new("rust").unwrap();
1541        let options = SearchOptions::default();
1542
1543        let results = search_file(&regex, &file_path, &options).unwrap();
1544        assert_eq!(results.len(), 1);
1545        assert_eq!(results[0].span.line_start, 3);
1546        assert!(results[0].preview.contains("rust"));
1547    }
1548
1549    #[test]
1550    fn test_search_file_with_context() {
1551        let temp_dir = TempDir::new().unwrap();
1552        let file_path = temp_dir.path().join("test.txt");
1553        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();
1554
1555        let regex = regex::Regex::new("target").unwrap();
1556        let options = SearchOptions {
1557            context_lines: 1,
1558            ..Default::default()
1559        };
1560
1561        let results = search_file(&regex, &file_path, &options).unwrap();
1562        assert_eq!(results.len(), 1);
1563
1564        println!("Preview: '{}'", results[0].preview);
1565
1566        // The target line is line 3, with 1 context line before and after
1567        // So we should get lines 2, 3, 4
1568        assert!(results[0].preview.contains("line 2"));
1569        assert!(results[0].preview.contains("target line"));
1570        assert!(results[0].preview.contains("line 4"));
1571    }
1572
1573    #[tokio::test]
1574    async fn test_search_main_function() {
1575        let temp_dir = TempDir::new().unwrap();
1576        create_test_files(temp_dir.path());
1577
1578        let options = SearchOptions {
1579            mode: SearchMode::Regex,
1580            query: "hello".to_string(),
1581            path: temp_dir.path().to_path_buf(),
1582            recursive: true,
1583            case_insensitive: true,
1584            ..Default::default()
1585        };
1586
1587        let results = search(&options).await.unwrap();
1588        assert!(!results.is_empty());
1589    }
1590
1591    #[tokio::test]
1592    async fn test_regex_search_mixed_line_endings() {
1593        // Regression test for byte offset issues with different line endings
1594        let temp_dir = TempDir::new().unwrap();
1595
1596        // Create test file with mixed line endings (Windows \r\n and Unix \n)
1597        let test_file = temp_dir.path().join("mixed_endings.txt");
1598        let content = "line1\r\nline2\nline3\r\npattern here\nline5\r\n";
1599        std::fs::write(&test_file, content).unwrap();
1600
1601        let options = SearchOptions {
1602            mode: SearchMode::Regex,
1603            query: "pattern".to_string(),
1604            path: test_file.clone(),
1605            recursive: false,
1606            ..Default::default()
1607        };
1608
1609        let results = search(&options).await.unwrap();
1610        assert_eq!(results.len(), 1);
1611
1612        let result = &results[0];
1613        // Verify byte offsets are correct - should point to start of "pattern"
1614        let original_content = std::fs::read_to_string(&test_file).unwrap();
1615        let pattern_start = original_content.find("pattern").unwrap();
1616
1617        assert_eq!(result.span.byte_start, pattern_start);
1618        assert_eq!(result.span.line_start, 4); // Fourth line
1619    }
1620
1621    #[tokio::test]
1622    async fn test_regex_search_windows_line_endings() {
1623        // Regression test specifically for Windows \r\n line endings
1624        let temp_dir = TempDir::new().unwrap();
1625
1626        let test_file = temp_dir.path().join("windows_endings.txt");
1627        let content = "first line\r\nsecond line\r\nmatch this\r\nfourth line\r\n";
1628        std::fs::write(&test_file, content).unwrap();
1629
1630        let options = SearchOptions {
1631            mode: SearchMode::Regex,
1632            query: "match".to_string(),
1633            path: test_file.clone(),
1634            recursive: false,
1635            ..Default::default()
1636        };
1637
1638        let results = search(&options).await.unwrap();
1639        assert_eq!(results.len(), 1);
1640
1641        let result = &results[0];
1642
1643        // Verify the match is on line 3
1644        assert_eq!(result.span.line_start, 3);
1645
1646        // Verify byte offset accounts for \r\n endings
1647        // first line\r\n = 12 bytes, second line\r\n = 13 bytes, total = 25 bytes before "match"
1648        let expected_byte_start = 25; // Position of "match" in the content
1649        assert_eq!(result.span.byte_start, expected_byte_start);
1650    }
1651
1652    #[test]
1653    fn test_split_lines_with_endings_helper() {
1654        // Unix line endings
1655        let unix_content = "line1\nline2\nline3\n";
1656        let (unix_lines, unix_endings) = split_lines_with_endings(unix_content);
1657        assert_eq!(unix_lines, vec!["line1", "line2", "line3"]);
1658        assert_eq!(unix_endings, vec![1, 1, 1]);
1659
1660        // Windows line endings
1661        let windows_content = "line1\r\nline2\r\nline3\r\n";
1662        let (windows_lines, windows_endings) = split_lines_with_endings(windows_content);
1663        assert_eq!(windows_lines, vec!["line1", "line2", "line3"]);
1664        assert_eq!(windows_endings, vec![2, 2, 2]);
1665
1666        // Old Mac line endings
1667        let mac_content = "line1\rline2\rline3\r";
1668        let (mac_lines, mac_endings) = split_lines_with_endings(mac_content);
1669        assert_eq!(mac_lines, vec!["line1", "line2", "line3"]);
1670        assert_eq!(mac_endings, vec![1, 1, 1]);
1671
1672        // Mixed endings
1673        let mixed_content = "line1\nline2\r\nline3\r";
1674        let (mixed_lines, mixed_endings) = split_lines_with_endings(mixed_content);
1675        assert_eq!(mixed_lines, vec!["line1", "line2", "line3"]);
1676        assert_eq!(mixed_endings, vec![1, 2, 1]);
1677
1678        // No line endings
1679        let no_endings = "single line";
1680        let (no_lines, no_endings_vec) = split_lines_with_endings(no_endings);
1681        assert_eq!(no_lines, vec!["single line"]);
1682        assert_eq!(no_endings_vec, vec![0]);
1683    }
1684
1685    #[tokio::test]
1686    async fn test_subdirectory_search_uses_parent_ckignore() {
1687        // Regression test for issue where searching in subdirectory doesn't use parent .ckignore
1688        // Bug: When searching ~/parent/subdir/, .ckignore is loaded from subdir (doesn't exist)
1689        // instead of from parent (where index and .ckignore live)
1690
1691        let temp_dir = TempDir::new().unwrap();
1692        let parent = temp_dir.path();
1693        let subdir = parent.join("subproject");
1694        fs::create_dir(&subdir).unwrap();
1695
1696        // Create .ckignore at parent level excluding *.tmp files
1697        fs::write(parent.join(".ckignore"), "*.tmp\n").unwrap();
1698
1699        // Create test files in parent directory
1700        fs::write(parent.join("parent.txt"), "searchable content in parent").unwrap();
1701        fs::write(parent.join("ignored.tmp"), "this should not be indexed").unwrap();
1702
1703        // Create test files in subdirectory
1704        fs::write(subdir.join("nested.txt"), "searchable content in subdir").unwrap();
1705        fs::write(
1706            subdir.join("also_ignored.tmp"),
1707            "this should not be indexed either",
1708        )
1709        .unwrap();
1710
1711        // First, search from parent to create the index
1712        let parent_options = SearchOptions {
1713            mode: SearchMode::Semantic,
1714            query: "searchable".to_string(),
1715            path: parent.to_path_buf(),
1716            top_k: Some(10),
1717            threshold: Some(0.1),
1718            ..Default::default()
1719        };
1720
1721        let _ = search(&parent_options).await;
1722
1723        // Give indexing a moment to complete
1724        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
1725
1726        // Now search from SUBDIRECTORY - this is where the bug occurs
1727        // The engine should find parent .ck index and use parent .ckignore
1728        // But currently it loads .ckignore from subdir (doesn't exist)
1729        let subdir_options = SearchOptions {
1730            mode: SearchMode::Semantic,
1731            query: "content".to_string(),
1732            path: subdir.clone(),
1733            top_k: Some(10),
1734            threshold: Some(0.1),
1735            ..Default::default()
1736        };
1737
1738        let results = search(&subdir_options).await.unwrap();
1739
1740        // ASSERTION 1: .tmp files should be excluded (currently FAILS due to bug)
1741        let tmp_files: Vec<_> = results
1742            .iter()
1743            .filter(|r| r.file.to_string_lossy().ends_with(".tmp"))
1744            .collect();
1745        assert!(
1746            tmp_files.is_empty(),
1747            "Bug: .tmp files were indexed despite parent .ckignore. Found {} .tmp files: {:?}",
1748            tmp_files.len(),
1749            tmp_files.iter().map(|r| &r.file).collect::<Vec<_>>()
1750        );
1751
1752        // ASSERTION 2: Should find .txt files in subdirectory
1753        let txt_in_subdir = results.iter().any(|r| r.file.ends_with("nested.txt"));
1754        assert!(txt_in_subdir, "Should find nested.txt in subdirectory");
1755
1756        // ASSERTION 3: No .ck directory should be created in subdirectory
1757        assert!(
1758            !subdir.join(".ck").exists(),
1759            "Should not create .ck directory in subdirectory"
1760        );
1761    }
1762
1763    #[tokio::test]
1764    async fn test_multiple_ckignore_files_merge_correctly() {
1765        // Test that multiple .ckignore files in the hierarchy are all applied
1766        use std::fs;
1767        use tempfile::TempDir;
1768
1769        let temp_dir = TempDir::new().unwrap();
1770        let parent = temp_dir.path();
1771        let subdir = parent.join("subdir");
1772        let deeper = subdir.join("deeper");
1773        fs::create_dir(&subdir).unwrap();
1774        fs::create_dir(&deeper).unwrap();
1775
1776        // Create hierarchical .ckignore files
1777        fs::write(parent.join(".ckignore"), "*.log\n").unwrap();
1778        fs::write(subdir.join(".ckignore"), "*.tmp\n").unwrap();
1779        fs::write(deeper.join(".ckignore"), "*.cache\n").unwrap();
1780
1781        // Create test files at each level
1782        fs::write(parent.join("root.txt"), "searchable").unwrap();
1783        fs::write(parent.join("root.log"), "should be ignored").unwrap();
1784
1785        fs::write(subdir.join("mid.txt"), "searchable").unwrap();
1786        fs::write(subdir.join("mid.log"), "should be ignored by parent").unwrap();
1787        fs::write(subdir.join("mid.tmp"), "should be ignored by local").unwrap();
1788
1789        fs::write(deeper.join("deep.txt"), "searchable").unwrap();
1790        fs::write(deeper.join("deep.log"), "should be ignored by grandparent").unwrap();
1791        fs::write(deeper.join("deep.tmp"), "should be ignored by parent").unwrap();
1792        fs::write(deeper.join("deep.cache"), "should be ignored by local").unwrap();
1793
1794        // Index from parent
1795        let parent_options = SearchOptions {
1796            mode: SearchMode::Semantic,
1797            query: "searchable".to_string(),
1798            path: parent.to_path_buf(),
1799            top_k: Some(20),
1800            threshold: Some(0.1),
1801            ..Default::default()
1802        };
1803
1804        let _ = search(&parent_options).await;
1805        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
1806
1807        // Search from deeper directory - should respect ALL three .ckignore files
1808        let deeper_options = SearchOptions {
1809            mode: SearchMode::Semantic,
1810            query: "ignored".to_string(),
1811            path: deeper.clone(),
1812            top_k: Some(20),
1813            threshold: Some(0.1),
1814            ..Default::default()
1815        };
1816
1817        let results = search(&deeper_options).await.unwrap();
1818
1819        // All ignored files should be excluded
1820        let has_log = results
1821            .iter()
1822            .any(|r| r.file.to_string_lossy().ends_with(".log"));
1823        let has_tmp = results
1824            .iter()
1825            .any(|r| r.file.to_string_lossy().ends_with(".tmp"));
1826        let has_cache = results
1827            .iter()
1828            .any(|r| r.file.to_string_lossy().ends_with(".cache"));
1829
1830        assert!(
1831            !has_log,
1832            "*.log files should be excluded by parent .ckignore"
1833        );
1834        assert!(
1835            !has_tmp,
1836            "*.tmp files should be excluded by subdir .ckignore"
1837        );
1838        assert!(
1839            !has_cache,
1840            "*.cache files should be excluded by deeper .ckignore"
1841        );
1842
1843        // Should still find .txt files
1844        let has_txt = results
1845            .iter()
1846            .any(|r| r.file.to_string_lossy().ends_with(".txt"));
1847        assert!(has_txt, "Should find .txt files (not ignored)");
1848    }
1849}
ck_engine/lib.rs

ck_engine/
lib.rs