ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use rayon::prelude::*;
5use regex::{Regex, RegexBuilder};
6use std::collections::HashMap;
7use std::fs;
8use std::path::PathBuf as StdPathBuf;
9use std::path::{Path, PathBuf};
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{STORED, Schema, TEXT, Value};
13use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
14use walkdir::WalkDir;
15
16mod semantic_v3;
17pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
18
/// Boxed, thread-safe (`Send + Sync`) callback that receives a status message
/// (`&str`) while a search runs, e.g. "Running regex search...".
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed, thread-safe (`Send + Sync`) callback that receives a status message
/// (`&str`); it is forwarded to the index-update step that precedes non-regex searches.
pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed, thread-safe (`Send + Sync`) callback that receives a structured
/// `ck_index::EmbeddingProgress` value; it is forwarded to the same index-update step.
pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
22
23/// Resolve the actual file path to read content from
24/// For PDFs: returns cache path and validates it exists
25/// For regular files: returns original path
26fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
27    if ck_core::pdf::is_pdf_file(file_path) {
28        // PDFs: Read from cached extracted text
29        let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
30        if !cache_path.exists() {
31            return Err(anyhow::anyhow!(
32                "PDF not preprocessed. Run 'ck --index' first."
33            ));
34        }
35        Ok(cache_path)
36    } else {
37        // Regular files: Read from original source
38        Ok(file_path.to_path_buf())
39    }
40}
41
42/// Read content from file for search result extraction
43/// Regular files: read directly from source
44/// PDFs: read from preprocessed cache
45fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
46    let content_path = resolve_content_path(file_path, repo_root)?;
47    Ok(fs::read_to_string(content_path)?)
48}
49
50/// Extract content from a file using a span (streaming version)
51async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
52    // Find repo root to locate cache
53    let repo_root = find_nearest_index_root(file_path)
54        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
55
56    // Use centralized path resolution
57    let content_path = resolve_content_path(file_path, &repo_root)?;
58
59    // Stream only the needed lines
60    extract_lines_from_file(&content_path, span.line_start, span.line_end)
61}
62
/// Read lines `line_start..=line_end` (1-based, inclusive) from `file_path`
/// without loading the whole file.
///
/// The selected lines are joined with `\n`. An empty string is returned when
/// `line_start` is 0 or when the range lies entirely past the end of the file.
/// I/O or UTF-8 errors on any line up to `line_end` are propagated.
fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
    use std::io::{BufRead, BufReader};

    // Line numbers are 1-based, so 0 never names a real line.
    if line_start == 0 {
        return Ok(String::new());
    }

    // Translate to 0-based positions.
    let first = line_start - 1;
    let last = line_end.saturating_sub(1);

    let reader = BufReader::new(fs::File::open(file_path)?);
    let mut wanted: Vec<String> = Vec::new();

    // `take` stops the scan right after the last requested line.
    for (position, line) in reader.lines().take(last + 1).enumerate() {
        let text = line?;
        if position >= first {
            wanted.push(text);
        }
    }

    // Joining an empty list yields "", which covers out-of-range requests.
    Ok(wanted.join("\n"))
}
98
/// Walk upwards from `path` and return the first directory that contains a
/// `.ck` entry, or `None` if no ancestor has one.
///
/// When `path` is a file the walk starts at its parent directory.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start` itself first, then each parent in turn.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
115
/// The embedding model chosen for an index, as produced by `resolve_model_from_root`.
#[derive(Debug, Clone)]
pub struct ResolvedModel {
    /// Model name: the registry config's `name`, or the name stored in the index manifest.
    pub canonical_name: String,
    /// Registry key for the model; falls back to the manifest value when the
    /// model is not present in the registry.
    pub alias: String,
    /// Embedding dimensions for the model.
    pub dimensions: usize,
}
122
123fn find_model_entry<'a>(
124    registry: &'a ck_models::ModelRegistry,
125    key: &str,
126) -> Option<(String, &'a ck_models::ModelConfig)> {
127    if let Some(config) = registry.get_model(key) {
128        return Some((key.to_string(), config));
129    }
130
131    registry
132        .models
133        .iter()
134        .find(|(_, config)| config.name == key)
135        .map(|(alias, config)| (alias.clone(), config))
136}
137
138pub(crate) fn resolve_model_from_root(
139    index_root: &Path,
140    cli_model: Option<&str>,
141) -> Result<ResolvedModel> {
142    use ck_models::ModelRegistry;
143
144    let registry = ModelRegistry::default();
145    let index_dir = index_root.join(".ck");
146    let manifest_path = index_dir.join("manifest.json");
147
148    if manifest_path.exists() {
149        let data = std::fs::read(&manifest_path)?;
150        let manifest: ck_index::IndexManifest = serde_json::from_slice(&data)?;
151
152        if let Some(existing_model) = manifest.embedding_model {
153            let (alias, config_opt) = find_model_entry(&registry, &existing_model)
154                .map(|(alias, config)| (alias, Some(config)))
155                .unwrap_or_else(|| (existing_model.clone(), None));
156
157            let dims = manifest
158                .embedding_dimensions
159                .or_else(|| config_opt.map(|c| c.dimensions))
160                .unwrap_or(384);
161
162            if let Some(requested) = cli_model {
163                let (_, requested_config) =
164                    find_model_entry(&registry, requested).ok_or_else(|| {
165                        CkError::Embedding(format!(
166                            "Unknown model '{}'. Available models: {}",
167                            requested,
168                            registry
169                                .models
170                                .keys()
171                                .cloned()
172                                .collect::<Vec<_>>()
173                                .join(", ")
174                        ))
175                    })?;
176
177                if requested_config.name != existing_model {
178                    let suggested_alias = alias.clone();
179                    return Err(CkError::Embedding(format!(
180                        "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
181                        existing_model,
182                        suggested_alias,
183                        requested,
184                        requested,
185                        suggested_alias
186                    ))
187                    .into());
188                }
189            }
190
191            return Ok(ResolvedModel {
192                canonical_name: existing_model,
193                alias,
194                dimensions: dims,
195            });
196        }
197    }
198
199    let (alias, config) = if let Some(requested) = cli_model {
200        find_model_entry(&registry, requested).ok_or_else(|| {
201            CkError::Embedding(format!(
202                "Unknown model '{}'. Available models: {}",
203                requested,
204                registry
205                    .models
206                    .keys()
207                    .cloned()
208                    .collect::<Vec<_>>()
209                    .join(", ")
210            ))
211        })?
212    } else {
213        let alias = registry.default_model.clone();
214        let config = registry.get_default_model().ok_or_else(|| {
215            CkError::Embedding("No default embedding model configured".to_string())
216        })?;
217        (alias, config)
218    };
219
220    Ok(ResolvedModel {
221        canonical_name: config.name.clone(),
222        alias,
223        dimensions: config.dimensions,
224    })
225}
226
227pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
228    let index_root = find_nearest_index_root(path).unwrap_or_else(|| {
229        if path.is_file() {
230            path.parent().unwrap_or(path).to_path_buf()
231        } else {
232            path.to_path_buf()
233        }
234    });
235    resolve_model_from_root(&index_root, cli_model)
236}
237
238pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
239    let results = search_enhanced(options).await?;
240    Ok(results.matches)
241}
242
243pub async fn search_with_progress(
244    options: &SearchOptions,
245    progress_callback: Option<SearchProgressCallback>,
246) -> Result<Vec<SearchResult>> {
247    let results = search_enhanced_with_progress(options, progress_callback).await?;
248    Ok(results.matches)
249}
250
251/// Enhanced search that includes near-miss information for threshold queries
252pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
253    search_enhanced_with_progress(options, None).await
254}
255
256/// Enhanced search with progress callback that includes near-miss information
257pub async fn search_enhanced_with_progress(
258    options: &SearchOptions,
259    progress_callback: Option<SearchProgressCallback>,
260) -> Result<ck_core::SearchResults> {
261    search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
262}
263
264/// Enhanced search with both search and indexing progress callbacks
265pub async fn search_enhanced_with_indexing_progress(
266    options: &SearchOptions,
267    progress_callback: Option<SearchProgressCallback>,
268    indexing_progress_callback: Option<IndexingProgressCallback>,
269    detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
270) -> Result<ck_core::SearchResults> {
271    // Validate that the search path exists
272    if !options.path.exists() {
273        return Err(ck_core::CkError::Search(format!(
274            "Path does not exist: {}",
275            options.path.display()
276        ))
277        .into());
278    }
279
280    // Auto-update index if needed (unless it's regex-only mode)
281    if !matches!(options.mode, SearchMode::Regex) {
282        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
283        ensure_index_updated_with_progress(
284            &options.path,
285            options.reindex,
286            need_embeddings,
287            indexing_progress_callback,
288            detailed_indexing_progress_callback,
289            options.respect_gitignore,
290            &options.exclude_patterns,
291            options.embedding_model.as_deref(),
292        )
293        .await?;
294    }
295
296    let search_results = match options.mode {
297        SearchMode::Regex => {
298            let matches = regex_search(options)?;
299            ck_core::SearchResults {
300                matches,
301                closest_below_threshold: None,
302            }
303        }
304        SearchMode::Lexical => {
305            let matches = lexical_search(options).await?;
306            ck_core::SearchResults {
307                matches,
308                closest_below_threshold: None,
309            }
310        }
311        SearchMode::Semantic => {
312            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
313            semantic_search_v3_with_progress(options, progress_callback).await?
314        }
315        SearchMode::Hybrid => {
316            let matches = hybrid_search_with_progress(options, progress_callback).await?;
317            ck_core::SearchResults {
318                matches,
319                closest_below_threshold: None,
320            }
321        }
322    };
323
324    Ok(search_results)
325}
326
327fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
328    let pattern = if options.fixed_string {
329        regex::escape(&options.query)
330    } else if options.whole_word {
331        format!(r"\b{}\b", regex::escape(&options.query))
332    } else {
333        options.query.clone()
334    };
335
336    let regex = RegexBuilder::new(&pattern)
337        .case_insensitive(options.case_insensitive)
338        .build()
339        .map_err(CkError::Regex)?;
340
341    // Default to recursive for directories (like grep) to maintain compatibility
342    let should_recurse = options.path.is_dir() || options.recursive;
343    let files = if should_recurse {
344        // Use ck_index's collect_files which respects gitignore
345        ck_index::collect_files(
346            &options.path,
347            options.respect_gitignore,
348            &options.exclude_patterns,
349        )?
350    } else {
351        // For non-recursive, use the local collect_files
352        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
353    };
354
355    let results: Vec<Vec<SearchResult>> = files
356        .par_iter()
357        .filter_map(|file_path| match search_file(&regex, file_path, options) {
358            Ok(matches) => {
359                if matches.is_empty() {
360                    None
361                } else {
362                    Some(matches)
363                }
364            }
365            Err(e) => {
366                tracing::debug!("Error searching {:?}: {}", file_path, e);
367                None
368            }
369        })
370        .collect();
371
372    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
373    // Deterministic ordering: file path, then line number
374    all_results.sort_by(|a, b| {
375        let path_cmp = a.file.cmp(&b.file);
376        if path_cmp != std::cmp::Ordering::Equal {
377            return path_cmp;
378        }
379        a.span.line_start.cmp(&b.span.line_start)
380    });
381
382    if let Some(top_k) = options.top_k {
383        all_results.truncate(top_k);
384    }
385
386    Ok(all_results)
387}
388
389fn search_file(
390    regex: &Regex,
391    file_path: &Path,
392    options: &SearchOptions,
393) -> Result<Vec<SearchResult>> {
394    // Find repo root to locate cache
395    let repo_root = find_nearest_index_root(file_path)
396        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
397
398    // For full_section mode, we need the entire content for parsing
399    // For context previews, we need all lines for surrounding context
400    // So we'll load content when needed, but optimize for the common case
401    if options.full_section || options.context_lines > 0 {
402        // Load full content when we need section parsing or context
403        let content = read_file_content(file_path, &repo_root)?;
404        let lines: Vec<String> = content.lines().map(|s| s.to_string()).collect();
405
406        // If full_section is enabled, try to parse the file and find code sections
407        let code_sections = if options.full_section {
408            extract_code_sections(file_path, &content)
409        } else {
410            None
411        };
412
413        search_file_in_memory(regex, file_path, options, &lines, &code_sections)
414    } else {
415        // Streaming search (simple case)
416        search_file_streaming(regex, file_path, &repo_root, options)
417    }
418}
419
420/// In-memory search for cases requiring context or code sections
421fn search_file_in_memory(
422    regex: &Regex,
423    file_path: &Path,
424    options: &SearchOptions,
425    lines: &[String],
426    code_sections: &Option<Vec<(usize, usize, String)>>,
427) -> Result<Vec<SearchResult>> {
428    let mut results = Vec::new();
429    let mut byte_offset = 0;
430
431    for (line_idx, line) in lines.iter().enumerate() {
432        let line_number = line_idx + 1;
433
434        // Special handling for empty pattern - match the entire line once
435        // An empty regex pattern will match at every position, so we need to handle it specially
436        if regex.as_str().is_empty() {
437            // Empty pattern matches the whole line once (grep compatibility)
438            let preview = if options.full_section {
439                // Try to find the containing code section
440                if let Some(sections) = code_sections {
441                    if let Some(section) = find_containing_section(sections, line_idx) {
442                        section.clone()
443                    } else {
444                        // Fall back to context lines if no section found
445                        get_context_preview(lines, line_idx, options)
446                    }
447                } else {
448                    get_context_preview(lines, line_idx, options)
449                }
450            } else {
451                get_context_preview(lines, line_idx, options)
452            };
453
454            results.push(SearchResult {
455                file: file_path.to_path_buf(),
456                span: Span {
457                    byte_start: byte_offset,
458                    byte_end: byte_offset + line.len(),
459                    line_start: line_number,
460                    line_end: line_number,
461                },
462                score: 1.0,
463                preview,
464                lang: ck_core::Language::from_path(file_path),
465                symbol: None,
466                chunk_hash: None,
467                index_epoch: None,
468            });
469        } else {
470            // Find all matches in the line with their positions
471            for mat in regex.find_iter(line) {
472                let preview = if options.full_section {
473                    // Try to find the containing code section
474                    if let Some(sections) = code_sections {
475                        if let Some(section) = find_containing_section(sections, line_idx) {
476                            section.clone()
477                        } else {
478                            // Fall back to context lines if no section found
479                            get_context_preview(lines, line_idx, options)
480                        }
481                    } else {
482                        get_context_preview(lines, line_idx, options)
483                    }
484                } else {
485                    get_context_preview(lines, line_idx, options)
486                };
487
488                results.push(SearchResult {
489                    file: file_path.to_path_buf(),
490                    span: Span {
491                        byte_start: byte_offset + mat.start(),
492                        byte_end: byte_offset + mat.end(),
493                        line_start: line_number,
494                        line_end: line_number,
495                    },
496                    score: 1.0,
497                    preview,
498                    lang: ck_core::Language::from_path(file_path),
499                    symbol: None,
500                    chunk_hash: None,
501                    index_epoch: None,
502                });
503            }
504        }
505
506        // Update byte offset for next line (add line length + newline character)
507        byte_offset += line.len();
508        if line_idx < lines.len() - 1 {
509            byte_offset += 1; // Add 1 for the newline character
510        }
511    }
512
513    Ok(results)
514}
515
516/// Streaming search for simple cases without context or code sections
517fn search_file_streaming(
518    regex: &Regex,
519    file_path: &Path,
520    repo_root: &Path,
521    _options: &SearchOptions,
522) -> Result<Vec<SearchResult>> {
523    use std::io::{BufRead, BufReader};
524
525    let content_path = resolve_content_path(file_path, repo_root)?;
526    let file = std::fs::File::open(&content_path)?;
527    let reader = BufReader::new(file);
528
529    let mut results = Vec::new();
530    let mut byte_offset = 0;
531
532    for (line_idx, line_result) in reader.lines().enumerate() {
533        let line = line_result?;
534        let line_number = line_idx + 1;
535
536        // Special handling for empty pattern - match the entire line once
537        if regex.as_str().is_empty() {
538            results.push(SearchResult {
539                file: file_path.to_path_buf(),
540                span: Span {
541                    byte_start: byte_offset,
542                    byte_end: byte_offset + line.len(),
543                    line_start: line_number,
544                    line_end: line_number,
545                },
546                score: 1.0,
547                preview: line.clone(), // Simple preview: just the line itself
548                lang: ck_core::Language::from_path(file_path),
549                symbol: None,
550                chunk_hash: None,
551                index_epoch: None,
552            });
553        } else {
554            // Find all matches in the line with their positions
555            for mat in regex.find_iter(&line) {
556                results.push(SearchResult {
557                    file: file_path.to_path_buf(),
558                    span: Span {
559                        byte_start: byte_offset + mat.start(),
560                        byte_end: byte_offset + mat.end(),
561                        line_start: line_number,
562                        line_end: line_number,
563                    },
564                    score: 1.0,
565                    preview: line.clone(), // Simple preview: just the line itself
566                    lang: ck_core::Language::from_path(file_path),
567                    symbol: None,
568                    chunk_hash: None,
569                    index_epoch: None,
570                });
571            }
572        }
573
574        // Update byte offset for next line (add line length + newline character)
575        byte_offset += line.len() + 1; // +1 for newline
576    }
577
578    Ok(results)
579}
580
581async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
582    // Handle both files and directories and reuse nearest existing .ck index up the tree
583    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
584        if options.path.is_file() {
585            options.path.parent().unwrap_or(&options.path).to_path_buf()
586        } else {
587            options.path.clone()
588        }
589    });
590
591    let index_dir = index_root.join(".ck");
592    if !index_dir.exists() {
593        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
594    }
595
596    let tantivy_index_path = index_dir.join("tantivy_index");
597
598    if !tantivy_index_path.exists() {
599        return build_tantivy_index(options).await;
600    }
601
602    let mut schema_builder = Schema::builder();
603    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
604    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
605    let _schema = schema_builder.build();
606
607    let index = Index::open_in_dir(&tantivy_index_path)
608        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
609
610    let reader = index
611        .reader_builder()
612        .reload_policy(ReloadPolicy::OnCommitWithDelay)
613        .try_into()
614        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
615
616    let searcher = reader.searcher();
617    let query_parser = QueryParser::for_index(&index, vec![content_field]);
618
619    let query = query_parser
620        .parse_query(&options.query)
621        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
622
623    let top_docs = if let Some(top_k) = options.top_k {
624        searcher.search(&query, &TopDocs::with_limit(top_k))?
625    } else {
626        searcher.search(&query, &TopDocs::with_limit(100))?
627    };
628
629    // First, collect all results with raw scores
630    let mut raw_results = Vec::new();
631    for (_score, doc_address) in top_docs {
632        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
633        let path_text = retrieved_doc
634            .get_first(path_field)
635            .map(|field_value| field_value.as_str().unwrap_or(""))
636            .unwrap_or("");
637        let content_text = retrieved_doc
638            .get_first(content_field)
639            .map(|field_value| field_value.as_str().unwrap_or(""))
640            .unwrap_or("");
641
642        let file_path = PathBuf::from(path_text);
643        let preview = if options.full_section {
644            content_text.to_string()
645        } else {
646            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
647        };
648
649        raw_results.push((
650            _score,
651            SearchResult {
652                file: file_path,
653                span: Span {
654                    byte_start: 0,
655                    byte_end: content_text.len(),
656                    line_start: 1,
657                    line_end: content_text.lines().count(),
658                },
659                score: _score,
660                preview,
661                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
662                symbol: None,
663                chunk_hash: None,
664                index_epoch: None,
665            },
666        ));
667    }
668
669    // Normalize scores to 0-1 range and apply threshold
670    let mut results = Vec::new();
671    if !raw_results.is_empty() {
672        let max_score = raw_results
673            .iter()
674            .map(|(score, _)| *score)
675            .fold(0.0f32, f32::max);
676        if max_score > 0.0 {
677            for (raw_score, mut result) in raw_results {
678                let normalized_score = raw_score / max_score;
679
680                // Apply threshold filtering with normalized score
681                if let Some(threshold) = options.threshold
682                    && normalized_score < threshold
683                {
684                    continue;
685                }
686
687                result.score = normalized_score;
688                results.push(result);
689            }
690        }
691    }
692
693    Ok(results)
694}
695
696async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
697    // Handle both files and directories by finding the appropriate directory for indexing
698    let index_root = if options.path.is_file() {
699        options.path.parent().unwrap_or(&options.path)
700    } else {
701        &options.path
702    };
703
704    let index_dir = index_root.join(".ck");
705    let tantivy_index_path = index_dir.join("tantivy_index");
706
707    fs::create_dir_all(&tantivy_index_path)?;
708
709    let mut schema_builder = Schema::builder();
710    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
711    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
712    let schema = schema_builder.build();
713
714    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
715        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
716
717    let mut index_writer = index
718        .writer(50_000_000)
719        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
720
721    let files = collect_files(index_root, true, &options.exclude_patterns)?;
722
723    for file_path in &files {
724        if let Ok(content) = fs::read_to_string(file_path) {
725            let doc = doc!(
726                content_field => content,
727                path_field => file_path.display().to_string()
728            );
729            index_writer.add_document(doc)?;
730        }
731    }
732
733    index_writer
734        .commit()
735        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
736
737    // After building, search again with the same options
738    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
739    let mut schema_builder = Schema::builder();
740    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
741    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
742    let _schema = schema_builder.build();
743
744    let index = Index::open_in_dir(&tantivy_index_path)
745        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
746
747    let reader = index
748        .reader_builder()
749        .reload_policy(ReloadPolicy::OnCommitWithDelay)
750        .try_into()
751        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
752
753    let searcher = reader.searcher();
754    let query_parser = QueryParser::for_index(&index, vec![content_field]);
755
756    let query = query_parser
757        .parse_query(&options.query)
758        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
759
760    let top_docs = if let Some(top_k) = options.top_k {
761        searcher.search(&query, &TopDocs::with_limit(top_k))?
762    } else {
763        searcher.search(&query, &TopDocs::with_limit(100))?
764    };
765
766    // First, collect all results with raw scores
767    let mut raw_results = Vec::new();
768    for (_score, doc_address) in top_docs {
769        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
770        let path_text = retrieved_doc
771            .get_first(path_field)
772            .map(|field_value| field_value.as_str().unwrap_or(""))
773            .unwrap_or("");
774        let content_text = retrieved_doc
775            .get_first(content_field)
776            .map(|field_value| field_value.as_str().unwrap_or(""))
777            .unwrap_or("");
778
779        let file_path = PathBuf::from(path_text);
780        let preview = if options.full_section {
781            content_text.to_string()
782        } else {
783            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
784        };
785
786        raw_results.push((
787            _score,
788            SearchResult {
789                file: file_path,
790                span: Span {
791                    byte_start: 0,
792                    byte_end: content_text.len(),
793                    line_start: 1,
794                    line_end: content_text.lines().count(),
795                },
796                score: _score,
797                preview,
798                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
799                symbol: None,
800                chunk_hash: None,
801                index_epoch: None,
802            },
803        ));
804    }
805
806    // Normalize scores to 0-1 range and apply threshold
807    let mut results = Vec::new();
808    if !raw_results.is_empty() {
809        let max_score = raw_results
810            .iter()
811            .map(|(score, _)| *score)
812            .fold(0.0f32, f32::max);
813        if max_score > 0.0 {
814            for (raw_score, mut result) in raw_results {
815                let normalized_score = raw_score / max_score;
816
817                // Apply threshold filtering with normalized score
818                if let Some(threshold) = options.threshold
819                    && normalized_score < threshold
820                {
821                    continue;
822                }
823
824                result.score = normalized_score;
825                results.push(result);
826            }
827        }
828    }
829
830    Ok(results)
831}
832
833#[allow(dead_code)]
834async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
835    hybrid_search_with_progress(options, None).await
836}
837
838async fn hybrid_search_with_progress(
839    options: &SearchOptions,
840    progress_callback: Option<SearchProgressCallback>,
841) -> Result<Vec<SearchResult>> {
842    if let Some(ref callback) = progress_callback {
843        callback("Running regex search...");
844    }
845    let regex_results = regex_search(options)?;
846
847    if let Some(ref callback) = progress_callback {
848        callback("Running semantic search...");
849    }
850    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
851
852    let mut combined = HashMap::new();
853
854    for (rank, result) in regex_results.iter().enumerate() {
855        let key = format!("{}:{}", result.file.display(), result.span.line_start);
856        combined
857            .entry(key)
858            .or_insert(Vec::new())
859            .push((rank + 1, result.clone()));
860    }
861
862    for (rank, result) in semantic_results.matches.iter().enumerate() {
863        let key = format!("{}:{}", result.file.display(), result.span.line_start);
864        combined
865            .entry(key)
866            .or_insert(Vec::new())
867            .push((rank + 1, result.clone()));
868    }
869
870    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
871    let mut rrf_results: Vec<SearchResult> = combined
872        .into_values()
873        .map(|ranks| {
874            let mut result = ranks[0].1.clone();
875            let rrf_score = ranks
876                .iter()
877                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
878                .sum();
879            result.score = rrf_score;
880            result
881        })
882        .filter(|result| {
883            // Apply threshold filtering to raw RRF scores
884            if let Some(threshold) = options.threshold {
885                result.score >= threshold
886            } else {
887                true
888            }
889        })
890        .collect();
891
892    // Sort by RRF score (highest first)
893    rrf_results.sort_by(|a, b| {
894        b.score
895            .partial_cmp(&a.score)
896            .unwrap_or(std::cmp::Ordering::Equal)
897    });
898
899    if let Some(top_k) = options.top_k {
900        rrf_results.truncate(top_k);
901    }
902
903    Ok(rrf_results)
904}
905
906fn build_globset(patterns: &[String]) -> GlobSet {
907    let mut builder = GlobSetBuilder::new();
908    for pat in patterns {
909        // Treat patterns as filename or directory globs
910        if let Ok(glob) = Glob::new(pat) {
911            builder.add(glob);
912        }
913    }
914    builder.build().unwrap_or_else(|_| GlobSet::empty())
915}
916
917fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
918    let globset = build_globset(exclude_patterns);
919    // Match against each path component and the full path
920    if globset.is_match(path) {
921        return true;
922    }
923    for component in path.components() {
924        if let std::path::Component::Normal(name) = component
925            && globset.is_match(name)
926        {
927            return true;
928        }
929    }
930    false
931}
932
933fn collect_files(
934    path: &Path,
935    recursive: bool,
936    exclude_patterns: &[String],
937) -> Result<Vec<PathBuf>> {
938    let mut files = Vec::new();
939    let globset = build_globset(exclude_patterns);
940
941    if path.is_file() {
942        // Always add single files, even if they're excluded (user explicitly requested)
943        files.push(path.to_path_buf());
944    } else if recursive {
945        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
946            // Skip excluded directories entirely for efficiency
947            let name = e.file_name();
948            !globset.is_match(e.path()) && !globset.is_match(name)
949        }) {
950            match entry {
951                Ok(entry) => {
952                    if entry.file_type().is_file()
953                        && !should_exclude_path(entry.path(), exclude_patterns)
954                    {
955                        files.push(entry.path().to_path_buf());
956                    }
957                }
958                Err(e) => {
959                    // Log directory traversal errors but continue processing
960                    tracing::debug!("Skipping path due to error: {}", e);
961                    continue;
962                }
963            }
964        }
965    } else {
966        match fs::read_dir(path) {
967            Ok(read_dir) => {
968                for entry in read_dir {
969                    match entry {
970                        Ok(entry) => {
971                            let path = entry.path();
972                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
973                                files.push(path);
974                            }
975                        }
976                        Err(e) => {
977                            tracing::debug!("Skipping directory entry due to error: {}", e);
978                            continue;
979                        }
980                    }
981                }
982            }
983            Err(e) => {
984                tracing::debug!("Cannot read directory {:?}: {}", path, e);
985                return Err(e.into());
986            }
987        }
988    }
989
990    Ok(files)
991}
992
993#[allow(clippy::too_many_arguments)]
994async fn ensure_index_updated_with_progress(
995    path: &Path,
996    force_reindex: bool,
997    need_embeddings: bool,
998    progress_callback: Option<ck_index::ProgressCallback>,
999    detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
1000    respect_gitignore: bool,
1001    exclude_patterns: &[String],
1002    model_override: Option<&str>,
1003) -> Result<()> {
1004    // Handle both files and directories and reuse nearest existing .ck index up the tree
1005    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
1006        if path.is_file() {
1007            path.parent().unwrap_or(path).to_path_buf()
1008        } else {
1009            path.to_path_buf()
1010        }
1011    });
1012    let index_root = &index_root_buf;
1013
1014    // If force reindex is requested, always update
1015    if force_reindex {
1016        let stats = ck_index::smart_update_index_with_detailed_progress(
1017            index_root,
1018            false,
1019            progress_callback,
1020            detailed_progress_callback,
1021            need_embeddings,
1022            respect_gitignore,
1023            exclude_patterns, // Use search-specific exclude patterns
1024            model_override,
1025        )
1026        .await?;
1027        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1028            tracing::info!(
1029                "Index updated: {} files indexed, {} orphaned files removed",
1030                stats.files_indexed,
1031                stats.orphaned_files_removed
1032            );
1033        }
1034        return Ok(());
1035    }
1036
1037    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
1038    let stats = ck_index::smart_update_index_with_detailed_progress(
1039        index_root,
1040        false,
1041        progress_callback,
1042        detailed_progress_callback,
1043        need_embeddings,
1044        respect_gitignore,
1045        exclude_patterns,
1046        model_override,
1047    )
1048    .await?;
1049    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1050        tracing::info!(
1051            "Index updated: {} files indexed, {} orphaned files removed",
1052            stats.files_indexed,
1053            stats.orphaned_files_removed
1054        );
1055    }
1056
1057    Ok(())
1058}
1059
1060fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
1061    let before = options.before_context_lines.max(options.context_lines);
1062    let after = options.after_context_lines.max(options.context_lines);
1063
1064    if before > 0 || after > 0 {
1065        let start_idx = line_idx.saturating_sub(before);
1066        let end_idx = (line_idx + after + 1).min(lines.len());
1067        lines[start_idx..end_idx].join("\n")
1068    } else {
1069        lines[line_idx].to_string()
1070    }
1071}
1072
1073fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1074    let lang = ck_core::Language::from_path(file_path)?;
1075
1076    // Parse the file with tree-sitter and extract function/class sections
1077    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1078        let sections: Vec<(usize, usize, String)> = chunks
1079            .into_iter()
1080            .filter(|chunk| {
1081                matches!(
1082                    chunk.chunk_type,
1083                    ck_chunk::ChunkType::Function
1084                        | ck_chunk::ChunkType::Class
1085                        | ck_chunk::ChunkType::Method
1086                )
1087            })
1088            .map(|chunk| {
1089                (
1090                    chunk.span.line_start - 1, // Convert to 0-based index
1091                    chunk.span.line_end - 1,
1092                    chunk.text,
1093                )
1094            })
1095            .collect();
1096
1097        if sections.is_empty() {
1098            None
1099        } else {
1100            Some(sections)
1101        }
1102    } else {
1103        None
1104    }
1105}
1106
/// Look up the section that covers `line_idx`.
///
/// Each section is `(start, end, text)` with inclusive bounds. The text of the
/// first section, in slice order, whose range contains `line_idx` is returned;
/// `None` if no section covers that line.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(start, end, _)| (*start..=*end).contains(&line_idx))
        .map(|(_, _, text)| text)
}
1118
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Write the four small fixture files into `dir` and return their paths in
    /// the order they were created.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .iter()
            .map(|(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Recursive regex-mode options for `query` rooted at `path`; everything
    /// else is left at its default.
    fn regex_options(query: &str, path: &std::path::Path) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: path.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_extract_lines_from_file() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test_lines.txt");

        // A ten-line fixture.
        fs::write(
            &target,
            "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10",
        )
        .unwrap();

        // (line_start, line_end, expected) with 1-based indexing.
        let cases = [
            // A range in the middle of the file
            (3, 5, "Line 3\nLine 4\nLine 5"),
            // A single line
            (7, 7, "Line 7"),
            // From line 8 to past the end of the file
            (8, 100, "Line 8\nLine 9\nLine 10"),
            // line_start == 0 should return empty
            (0, 5, ""),
            // line_start beyond the file length should return empty
            (20, 25, ""),
        ];

        for (line_start, line_end, expected) in cases {
            let extracted = extract_lines_from_file(&target, line_start, line_end).unwrap();
            assert_eq!(extracted, expected);
        }
    }

    #[tokio::test]
    async fn test_extract_content_from_span() {
        let workspace = TempDir::new().unwrap();
        let source = workspace.path().join("code.rs");

        // Three small functions separated by blank lines.
        fs::write(
            &source,
            "fn first() {\n    println!(\"First\");\n}\n\nfn second() {\n    println!(\"Second\");\n}\n\nfn third() {\n    println!(\"Third\");\n}",
        )
        .unwrap();

        // Byte offsets are not used in line extraction, so they stay at zero.
        let line_span = |line_start: usize, line_end: usize| ck_core::Span {
            byte_start: 0,
            byte_end: 0,
            line_start,
            line_end,
        };

        // The second function occupies lines 5-7.
        let second_fn = extract_content_from_span(&source, &line_span(5, 7))
            .await
            .unwrap();
        assert_eq!(second_fn, "fn second() {\n    println!(\"Second\");\n}");

        // A single line.
        let single_line = extract_content_from_span(&source, &line_span(2, 2))
            .await
            .unwrap();
        assert_eq!(single_line, "    println!(\"First\");");
    }

    #[test]
    fn test_collect_files() {
        let workspace = TempDir::new().unwrap();
        let created = create_test_files(workspace.path());

        // Non-recursive listing of the directory
        let flat = collect_files(workspace.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Recursive walk of the same directory
        let walked = collect_files(workspace.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file path is returned as-is
        let single = collect_files(&created[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], created[0]);
    }

    #[test]
    fn test_regex_search() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = regex_options("rust", workspace.path());
        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        // At least one hit must come from content containing "rust"
        let has_rust_hit = results
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust"));
        assert!(has_rust_hit);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("HELLO", workspace.path())
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options("fn main()", workspace.path())
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let workspace = TempDir::new().unwrap();
        let fixture = workspace.path().join("word_test.txt");
        fs::write(fixture, "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options("rust", workspace.path())
        };

        // Intended to match "rust" as a whole word only, not "rusty" or
        // "rustacean"; the assertion itself only requires at least one hit.
        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let workspace = TempDir::new().unwrap();

        // Ten files that all contain a match
        (0..10).for_each(|i| {
            let file = workspace.path().join(format!("file{}.txt", i));
            fs::write(file, "test content").unwrap();
        });

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options("test", workspace.path())
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Span offsets must be computed per match, including several matches on one line
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("spans.txt");
        fs::write(&target, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            recursive: false,
            ..regex_options("test", &target)
        };
        let results = regex_search(&options).unwrap();

        // Five matches in total
        assert_eq!(results.len(), 5);

        // Byte offsets of the matches that start on a given line, in result order
        let starts_on_line = |line: usize| -> Vec<usize> {
            results
                .iter()
                .filter(|hit| hit.span.line_start == line)
                .map(|hit| hit.span.byte_start)
                .collect()
        };

        // First line: three matches at bytes 0, 5 and 10
        assert_eq!(starts_on_line(1), vec![0, 5, 10]);

        // Second line: "test test test\n" = 15 bytes, "line two " = 9 bytes
        assert_eq!(starts_on_line(2), vec![24]);

        // Every match has its own byte offset
        let distinct_starts: std::collections::BTreeSet<_> =
            results.iter().map(|hit| hit.span.byte_start).collect();
        assert_eq!(distinct_starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(
            &target,
            "line 1: hello\nline 2: world\nline 3: rust programming",
        )
        .unwrap();

        let pattern = Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(results.len(), 1);

        let hit = &results[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(&target, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(results.len(), 1);

        println!("Preview: '{}'", results[0].preview);

        // The match is on line 3; one line of context on each side gives lines 2-4
        for expected in ["line 2", "target line", "line 4"] {
            assert!(results[0].preview.contains(expected));
        }
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("hello", workspace.path())
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}