ck_engine/
lib.rs

1use anyhow::Result;
2use ck_ann::AnnIndex;
3use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
4use globset::{Glob, GlobSet, GlobSetBuilder};
5use rayon::prelude::*;
6use regex::{Regex, RegexBuilder};
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf as StdPathBuf;
10use std::path::{Path, PathBuf};
11use tantivy::collector::TopDocs;
12use tantivy::query::QueryParser;
13use tantivy::schema::{STORED, Schema, TEXT, Value};
14use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
15use walkdir::WalkDir;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
20pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = tokio::fs::read_to_string(file_path).await?;
25    let lines: Vec<&str> = content.lines().collect();
26
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Walk upwards from `path` and return the closest directory that contains a
/// `.ck` entry, or `None` when no ancestor has one.
///
/// When `path` is a file the walk starts at its parent directory (falling
/// back to `path` itself if it has no parent); otherwise it starts at `path`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = match path.parent() {
        Some(parent) if path.is_file() => parent,
        _ => path,
    };

    // `ancestors` yields `start` first, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
57
58pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
59    search_with_progress(options, None).await
60}
61
62pub async fn search_with_progress(
63    options: &SearchOptions,
64    progress_callback: Option<SearchProgressCallback>,
65) -> Result<Vec<SearchResult>> {
66    // Validate that the search path exists
67    if !options.path.exists() {
68        return Err(ck_core::CkError::Search(format!(
69            "Path does not exist: {}",
70            options.path.display()
71        ))
72        .into());
73    }
74
75    // Auto-update index if needed (unless it's regex-only mode)
76    if !matches!(options.mode, SearchMode::Regex) {
77        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
78        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
79    }
80
81    match options.mode {
82        SearchMode::Regex => regex_search(options),
83        SearchMode::Lexical => lexical_search(options).await,
84        SearchMode::Semantic => {
85            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
86            semantic_search_v3_with_progress(options, progress_callback).await
87        }
88        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
89    }
90}
91
92fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
93    let pattern = if options.fixed_string {
94        regex::escape(&options.query)
95    } else if options.whole_word {
96        format!(r"\b{}\b", regex::escape(&options.query))
97    } else {
98        options.query.clone()
99    };
100
101    let regex = RegexBuilder::new(&pattern)
102        .case_insensitive(options.case_insensitive)
103        .build()
104        .map_err(CkError::Regex)?;
105
106    // Default to recursive for directories (like grep) to maintain compatibility
107    let should_recurse = options.path.is_dir() || options.recursive;
108    let files = if should_recurse {
109        // Use ck_index's collect_files which respects gitignore
110        ck_index::collect_files(
111            &options.path,
112            options.respect_gitignore,
113            &options.exclude_patterns,
114        )?
115    } else {
116        // For non-recursive, use the local collect_files
117        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
118    };
119
120    let results: Vec<Vec<SearchResult>> = files
121        .par_iter()
122        .filter_map(|file_path| match search_file(&regex, file_path, options) {
123            Ok(matches) => {
124                if matches.is_empty() {
125                    None
126                } else {
127                    Some(matches)
128                }
129            }
130            Err(e) => {
131                tracing::debug!("Error searching {:?}: {}", file_path, e);
132                None
133            }
134        })
135        .collect();
136
137    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
138    // Deterministic ordering: file path, then line number
139    all_results.sort_by(|a, b| {
140        let path_cmp = a.file.cmp(&b.file);
141        if path_cmp != std::cmp::Ordering::Equal {
142            return path_cmp;
143        }
144        a.span.line_start.cmp(&b.span.line_start)
145    });
146
147    if let Some(top_k) = options.top_k {
148        all_results.truncate(top_k);
149    }
150
151    Ok(all_results)
152}
153
154fn search_file(
155    regex: &Regex,
156    file_path: &Path,
157    options: &SearchOptions,
158) -> Result<Vec<SearchResult>> {
159    let content = fs::read_to_string(file_path)?;
160    let lines: Vec<&str> = content.lines().collect();
161    let mut results = Vec::new();
162
163    // If full_section is enabled, try to parse the file and find code sections
164    let code_sections = if options.full_section {
165        extract_code_sections(file_path, &content)
166    } else {
167        None
168    };
169
170    // Track byte offset as we iterate through lines
171    let mut byte_offset = 0;
172
173    for (line_idx, line) in lines.iter().enumerate() {
174        let line_number = line_idx + 1;
175
176        // Special handling for empty pattern - match the entire line once
177        // An empty regex pattern will match at every position, so we need to handle it specially
178        if regex.as_str().is_empty() {
179            // Empty pattern matches the whole line once (grep compatibility)
180            let preview = if options.full_section {
181                // Try to find the containing code section
182                if let Some(ref sections) = code_sections {
183                    if let Some(section) = find_containing_section(sections, line_idx) {
184                        section.clone()
185                    } else {
186                        // Fall back to context lines if no section found
187                        get_context_preview(&lines, line_idx, options)
188                    }
189                } else {
190                    get_context_preview(&lines, line_idx, options)
191                }
192            } else {
193                get_context_preview(&lines, line_idx, options)
194            };
195
196            results.push(SearchResult {
197                file: file_path.to_path_buf(),
198                span: Span {
199                    byte_start: byte_offset,
200                    byte_end: byte_offset + line.len(),
201                    line_start: line_number,
202                    line_end: line_number,
203                },
204                score: 1.0,
205                preview,
206                lang: ck_core::Language::from_path(file_path),
207                symbol: None,
208                chunk_hash: None,
209                index_epoch: None,
210            });
211        } else {
212            // Find all matches in the line with their positions
213            for mat in regex.find_iter(line) {
214                let preview = if options.full_section {
215                    // Try to find the containing code section
216                    if let Some(ref sections) = code_sections {
217                        if let Some(section) = find_containing_section(sections, line_idx) {
218                            section.clone()
219                        } else {
220                            // Fall back to context lines if no section found
221                            get_context_preview(&lines, line_idx, options)
222                        }
223                    } else {
224                        get_context_preview(&lines, line_idx, options)
225                    }
226                } else {
227                    get_context_preview(&lines, line_idx, options)
228                };
229
230                results.push(SearchResult {
231                    file: file_path.to_path_buf(),
232                    span: Span {
233                        byte_start: byte_offset + mat.start(),
234                        byte_end: byte_offset + mat.end(),
235                        line_start: line_number,
236                        line_end: line_number,
237                    },
238                    score: 1.0,
239                    preview,
240                    lang: ck_core::Language::from_path(file_path),
241                    symbol: None,
242                    chunk_hash: None,
243                    index_epoch: None,
244                });
245            }
246        }
247
248        // Update byte offset for next line (add line length + newline character)
249        byte_offset += line.len();
250        if line_idx < lines.len() - 1 {
251            byte_offset += 1; // Add 1 for the newline character
252        }
253    }
254
255    Ok(results)
256}
257
258async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
259    // Handle both files and directories and reuse nearest existing .ck index up the tree
260    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
261        if options.path.is_file() {
262            options.path.parent().unwrap_or(&options.path).to_path_buf()
263        } else {
264            options.path.clone()
265        }
266    });
267
268    let index_dir = index_root.join(".ck");
269    if !index_dir.exists() {
270        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
271    }
272
273    let tantivy_index_path = index_dir.join("tantivy_index");
274
275    if !tantivy_index_path.exists() {
276        return build_tantivy_index(options).await;
277    }
278
279    let mut schema_builder = Schema::builder();
280    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
281    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
282    let _schema = schema_builder.build();
283
284    let index = Index::open_in_dir(&tantivy_index_path)
285        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
286
287    let reader = index
288        .reader_builder()
289        .reload_policy(ReloadPolicy::OnCommitWithDelay)
290        .try_into()
291        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
292
293    let searcher = reader.searcher();
294    let query_parser = QueryParser::for_index(&index, vec![content_field]);
295
296    let query = query_parser
297        .parse_query(&options.query)
298        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
299
300    let top_docs = if let Some(top_k) = options.top_k {
301        searcher.search(&query, &TopDocs::with_limit(top_k))?
302    } else {
303        searcher.search(&query, &TopDocs::with_limit(100))?
304    };
305
306    // First, collect all results with raw scores
307    let mut raw_results = Vec::new();
308    for (_score, doc_address) in top_docs {
309        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
310        let path_text = retrieved_doc
311            .get_first(path_field)
312            .map(|field_value| field_value.as_str().unwrap_or(""))
313            .unwrap_or("");
314        let content_text = retrieved_doc
315            .get_first(content_field)
316            .map(|field_value| field_value.as_str().unwrap_or(""))
317            .unwrap_or("");
318
319        let file_path = PathBuf::from(path_text);
320        let preview = if options.full_section {
321            content_text.to_string()
322        } else {
323            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
324        };
325
326        raw_results.push((
327            _score,
328            SearchResult {
329                file: file_path,
330                span: Span {
331                    byte_start: 0,
332                    byte_end: content_text.len(),
333                    line_start: 1,
334                    line_end: content_text.lines().count(),
335                },
336                score: _score,
337                preview,
338                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
339                symbol: None,
340                chunk_hash: None,
341                index_epoch: None,
342            },
343        ));
344    }
345
346    // Normalize scores to 0-1 range and apply threshold
347    let mut results = Vec::new();
348    if !raw_results.is_empty() {
349        let max_score = raw_results
350            .iter()
351            .map(|(score, _)| *score)
352            .fold(0.0f32, f32::max);
353        if max_score > 0.0 {
354            for (raw_score, mut result) in raw_results {
355                let normalized_score = raw_score / max_score;
356
357                // Apply threshold filtering with normalized score
358                if let Some(threshold) = options.threshold
359                    && normalized_score < threshold
360                {
361                    continue;
362                }
363
364                result.score = normalized_score;
365                results.push(result);
366            }
367        }
368    }
369
370    Ok(results)
371}
372
373async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
374    // Handle both files and directories by finding the appropriate directory for indexing
375    let index_root = if options.path.is_file() {
376        options.path.parent().unwrap_or(&options.path)
377    } else {
378        &options.path
379    };
380
381    let index_dir = index_root.join(".ck");
382    let tantivy_index_path = index_dir.join("tantivy_index");
383
384    fs::create_dir_all(&tantivy_index_path)?;
385
386    let mut schema_builder = Schema::builder();
387    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
388    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
389    let schema = schema_builder.build();
390
391    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
392        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
393
394    let mut index_writer = index
395        .writer(50_000_000)
396        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
397
398    let files = collect_files(index_root, true, &options.exclude_patterns)?;
399
400    for file_path in &files {
401        if let Ok(content) = fs::read_to_string(file_path) {
402            let doc = doc!(
403                content_field => content,
404                path_field => file_path.display().to_string()
405            );
406            index_writer.add_document(doc)?;
407        }
408    }
409
410    index_writer
411        .commit()
412        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
413
414    // After building, search again with the same options
415    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
416    let mut schema_builder = Schema::builder();
417    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
418    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
419    let _schema = schema_builder.build();
420
421    let index = Index::open_in_dir(&tantivy_index_path)
422        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
423
424    let reader = index
425        .reader_builder()
426        .reload_policy(ReloadPolicy::OnCommitWithDelay)
427        .try_into()
428        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
429
430    let searcher = reader.searcher();
431    let query_parser = QueryParser::for_index(&index, vec![content_field]);
432
433    let query = query_parser
434        .parse_query(&options.query)
435        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
436
437    let top_docs = if let Some(top_k) = options.top_k {
438        searcher.search(&query, &TopDocs::with_limit(top_k))?
439    } else {
440        searcher.search(&query, &TopDocs::with_limit(100))?
441    };
442
443    // First, collect all results with raw scores
444    let mut raw_results = Vec::new();
445    for (_score, doc_address) in top_docs {
446        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
447        let path_text = retrieved_doc
448            .get_first(path_field)
449            .map(|field_value| field_value.as_str().unwrap_or(""))
450            .unwrap_or("");
451        let content_text = retrieved_doc
452            .get_first(content_field)
453            .map(|field_value| field_value.as_str().unwrap_or(""))
454            .unwrap_or("");
455
456        let file_path = PathBuf::from(path_text);
457        let preview = if options.full_section {
458            content_text.to_string()
459        } else {
460            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
461        };
462
463        raw_results.push((
464            _score,
465            SearchResult {
466                file: file_path,
467                span: Span {
468                    byte_start: 0,
469                    byte_end: content_text.len(),
470                    line_start: 1,
471                    line_end: content_text.lines().count(),
472                },
473                score: _score,
474                preview,
475                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
476                symbol: None,
477                chunk_hash: None,
478                index_epoch: None,
479            },
480        ));
481    }
482
483    // Normalize scores to 0-1 range and apply threshold
484    let mut results = Vec::new();
485    if !raw_results.is_empty() {
486        let max_score = raw_results
487            .iter()
488            .map(|(score, _)| *score)
489            .fold(0.0f32, f32::max);
490        if max_score > 0.0 {
491            for (raw_score, mut result) in raw_results {
492                let normalized_score = raw_score / max_score;
493
494                // Apply threshold filtering with normalized score
495                if let Some(threshold) = options.threshold
496                    && normalized_score < threshold
497                {
498                    continue;
499                }
500
501                result.score = normalized_score;
502                results.push(result);
503            }
504        }
505    }
506
507    Ok(results)
508}
509
510#[allow(dead_code)]
511async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
512    semantic_search_with_progress(options, None).await
513}
514
515async fn semantic_search_with_progress(
516    options: &SearchOptions,
517    progress_callback: Option<SearchProgressCallback>,
518) -> Result<Vec<SearchResult>> {
519    // Handle both files and directories and reuse nearest existing .ck index up the tree
520    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
521        if options.path.is_file() {
522            options.path.parent().unwrap_or(&options.path).to_path_buf()
523        } else {
524            options.path.clone()
525        }
526    });
527
528    let index_dir = index_root.join(".ck");
529    if !index_dir.exists() {
530        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
531    }
532
533    let ann_index_path = index_dir.join("ann_index.bin");
534    let embeddings_path = index_dir.join("embeddings.json");
535
536    if !ann_index_path.exists() || !embeddings_path.exists() {
537        return build_semantic_index_with_progress(options, progress_callback).await;
538    }
539
540    // Load the ANN index
541    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
542
543    // Load file metadata
544    let embeddings_data = fs::read_to_string(&embeddings_path)?;
545    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
546
547    // Create embedder and embed the query
548    if let Some(ref callback) = progress_callback {
549        callback("Loading embedding model...");
550    }
551
552    let mut embedder = if let Some(ref callback) = progress_callback {
553        let _cb = callback.as_ref();
554        let model_cb = Box::new(|msg: &str| {
555            // Note: We can't directly use the callback here due to lifetime issues
556            // For now, we'll just use eprintln! until we can restructure this better
557            eprintln!("Model: {}", msg);
558        }) as ck_embed::ModelDownloadCallback;
559        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
560    } else {
561        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
562    };
563    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
564
565    if query_embeddings.is_empty() {
566        return Ok(Vec::new());
567    }
568
569    let query_embedding = &query_embeddings[0];
570
571    // Search using ANN
572    let top_k = options.top_k.unwrap_or(10);
573    let similar_docs = ann_index.search(query_embedding, top_k);
574
575    let mut results = Vec::new();
576
577    // Check if we're searching a specific file vs. a directory
578    let filter_by_file = options.path.is_file();
579    let target_file = if filter_by_file {
580        Some(
581            options
582                .path
583                .canonicalize()
584                .unwrap_or_else(|_| options.path.clone()),
585        )
586    } else {
587        None
588    };
589
590    for (doc_id, similarity) in similar_docs {
591        // Apply threshold filtering
592        if let Some(threshold) = options.threshold
593            && similarity < threshold
594        {
595            continue;
596        }
597
598        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
599            // Filter by target file if specified
600            if let Some(target) = &target_file {
601                let canonical_result = file_path
602                    .canonicalize()
603                    .unwrap_or_else(|_| file_path.clone());
604                if canonical_result != *target {
605                    continue; // Skip this result if it doesn't match the target file
606                }
607            }
608
609            // If full_section is enabled and this is a code section, return the full content
610            let preview = if options.full_section {
611                content.clone()
612            } else {
613                content.lines().take(3).collect::<Vec<_>>().join("\n")
614            };
615
616            results.push(SearchResult {
617                file: file_path.clone(),
618                span: Span {
619                    byte_start: 0,
620                    byte_end: content.len(),
621                    line_start: 1,
622                    line_end: content.lines().count(),
623                },
624                score: similarity,
625                preview,
626                lang: ck_core::Language::from_path(file_path),
627                symbol: None,
628                chunk_hash: None,
629                index_epoch: None,
630            });
631        }
632    }
633
634    Ok(results)
635}
636
637#[allow(dead_code)]
638async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
639    build_semantic_index_with_progress(options, None).await
640}
641
642async fn build_semantic_index_with_progress(
643    options: &SearchOptions,
644    progress_callback: Option<SearchProgressCallback>,
645) -> Result<Vec<SearchResult>> {
646    // Handle both files and directories by finding the appropriate directory for indexing
647    let index_root = if options.path.is_file() {
648        options.path.parent().unwrap_or(&options.path)
649    } else {
650        &options.path
651    };
652
653    let index_dir = index_root.join(".ck");
654    let ann_index_path = index_dir.join("ann_index.bin");
655    let embeddings_path = index_dir.join("embeddings.json");
656
657    fs::create_dir_all(&index_dir)?;
658
659    if let Some(ref callback) = progress_callback {
660        callback("Building semantic index (no index found)...");
661    }
662
663    // Always print this important message, even in quiet mode for indexing operations
664    eprintln!("Building semantic index (no existing index found)...");
665
666    // Collect files and their content
667    let files = collect_files(index_root, true, &options.exclude_patterns)?;
668
669    if let Some(ref callback) = progress_callback {
670        callback(&format!("Found {} files to index", files.len()));
671    }
672    eprintln!("Found {} files to embed and index", files.len());
673
674    let mut file_embeddings = Vec::new();
675    let mut embeddings = Vec::new();
676
677    // Create embedder with progress callback
678    if let Some(ref callback) = progress_callback {
679        callback("Loading embedding model...");
680    }
681
682    let model_callback = if progress_callback.is_some() {
683        Some(Box::new(|msg: &str| {
684            eprintln!("Model: {}", msg);
685        }) as ck_embed::ModelDownloadCallback)
686    } else {
687        None
688    };
689
690    let mut embedder =
691        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
692
693    if let Some(ref callback) = progress_callback {
694        callback("Generating embeddings for code chunks...");
695    }
696
697    for (file_idx, file_path) in files.iter().enumerate() {
698        if let Ok(content) = fs::read_to_string(file_path) {
699            if let Some(ref callback) = progress_callback {
700                let file_name = file_path
701                    .file_name()
702                    .map(|n| n.to_string_lossy().to_string())
703                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
704                callback(&format!(
705                    "Processing {}/{}: {}",
706                    file_idx + 1,
707                    files.len(),
708                    file_name
709                ));
710            }
711
712            // Chunk the content for better embeddings
713            let chunks = ck_chunk::chunk_text(&content, ck_core::Language::from_path(file_path))?;
714
715            for chunk in chunks {
716                let chunk_embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
717                if !chunk_embeddings.is_empty() {
718                    embeddings.push(chunk_embeddings[0].clone());
719                    file_embeddings.push((file_path.clone(), chunk.text));
720                }
721            }
722        }
723    }
724
725    if let Some(ref callback) = progress_callback {
726        callback(&format!(
727            "Built {} embeddings, creating search index...",
728            embeddings.len()
729        ));
730    }
731    eprintln!(
732        "Generated {} embeddings, building search index...",
733        embeddings.len()
734    );
735
736    // Build ANN index
737    let index = ck_ann::SimpleIndex::build(&embeddings)?;
738    index.save(&ann_index_path)?;
739
740    // Save file embeddings metadata
741    let embeddings_json = serde_json::to_string(&file_embeddings)?;
742    fs::write(&embeddings_path, embeddings_json)?;
743
744    if let Some(ref callback) = progress_callback {
745        callback("Semantic index built successfully, running search...");
746    }
747    eprintln!("Semantic index built successfully!");
748
749    // After building, search again - inline to avoid recursion
750    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
751
752    // Load file metadata
753    let embeddings_data = fs::read_to_string(&embeddings_path)?;
754    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
755
756    // Create embedder and embed the query
757    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
758    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
759
760    if query_embeddings.is_empty() {
761        return Ok(Vec::new());
762    }
763
764    let query_embedding = &query_embeddings[0];
765
766    // Search using ANN
767    let top_k = options.top_k.unwrap_or(10);
768    let similar_docs = ann_index.search(query_embedding, top_k);
769
770    let mut results = Vec::new();
771
772    // Check if we're searching a specific file vs. a directory
773    let filter_by_file = options.path.is_file();
774    let target_file = if filter_by_file {
775        Some(
776            options
777                .path
778                .canonicalize()
779                .unwrap_or_else(|_| options.path.clone()),
780        )
781    } else {
782        None
783    };
784
785    for (doc_id, similarity) in similar_docs {
786        // Apply threshold filtering
787        if let Some(threshold) = options.threshold
788            && similarity < threshold
789        {
790            continue;
791        }
792
793        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
794            // Filter by target file if specified
795            if let Some(target) = &target_file {
796                let canonical_result = file_path
797                    .canonicalize()
798                    .unwrap_or_else(|_| file_path.clone());
799                if canonical_result != *target {
800                    continue; // Skip this result if it doesn't match the target file
801                }
802            }
803
804            // If full_section is enabled and this is a code section, return the full content
805            let preview = if options.full_section {
806                content.clone()
807            } else {
808                content.lines().take(3).collect::<Vec<_>>().join("\n")
809            };
810
811            results.push(SearchResult {
812                file: file_path.clone(),
813                span: Span {
814                    byte_start: 0,
815                    byte_end: content.len(),
816                    line_start: 1,
817                    line_end: content.lines().count(),
818                },
819                score: similarity,
820                preview,
821                lang: ck_core::Language::from_path(file_path),
822                symbol: None,
823                chunk_hash: None,
824                index_epoch: None,
825            });
826        }
827    }
828
829    Ok(results)
830}
831
832#[allow(dead_code)]
833async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
834    hybrid_search_with_progress(options, None).await
835}
836
837async fn hybrid_search_with_progress(
838    options: &SearchOptions,
839    progress_callback: Option<SearchProgressCallback>,
840) -> Result<Vec<SearchResult>> {
841    if let Some(ref callback) = progress_callback {
842        callback("Running regex search...");
843    }
844    let regex_results = regex_search(options)?;
845
846    if let Some(ref callback) = progress_callback {
847        callback("Running semantic search...");
848    }
849    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
850
851    let mut combined = HashMap::new();
852
853    for (rank, result) in regex_results.iter().enumerate() {
854        let key = format!("{}:{}", result.file.display(), result.span.line_start);
855        combined
856            .entry(key)
857            .or_insert(Vec::new())
858            .push((rank + 1, result.clone()));
859    }
860
861    for (rank, result) in semantic_results.iter().enumerate() {
862        let key = format!("{}:{}", result.file.display(), result.span.line_start);
863        combined
864            .entry(key)
865            .or_insert(Vec::new())
866            .push((rank + 1, result.clone()));
867    }
868
869    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
870    let mut rrf_results: Vec<SearchResult> = combined
871        .into_values()
872        .map(|ranks| {
873            let mut result = ranks[0].1.clone();
874            let rrf_score = ranks
875                .iter()
876                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
877                .sum();
878            result.score = rrf_score;
879            result
880        })
881        .filter(|result| {
882            // Apply threshold filtering to raw RRF scores
883            if let Some(threshold) = options.threshold {
884                result.score >= threshold
885            } else {
886                true
887            }
888        })
889        .collect();
890
891    // Sort by RRF score (highest first)
892    rrf_results.sort_by(|a, b| {
893        b.score
894            .partial_cmp(&a.score)
895            .unwrap_or(std::cmp::Ordering::Equal)
896    });
897
898    if let Some(top_k) = options.top_k {
899        rrf_results.truncate(top_k);
900    }
901
902    Ok(rrf_results)
903}
904
905fn build_globset(patterns: &[String]) -> GlobSet {
906    let mut builder = GlobSetBuilder::new();
907    for pat in patterns {
908        // Treat patterns as filename or directory globs
909        if let Ok(glob) = Glob::new(pat) {
910            builder.add(glob);
911        }
912    }
913    builder.build().unwrap_or_else(|_| GlobSet::empty())
914}
915
916fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
917    let globset = build_globset(exclude_patterns);
918    // Match against each path component and the full path
919    if globset.is_match(path) {
920        return true;
921    }
922    for component in path.components() {
923        if let std::path::Component::Normal(name) = component
924            && globset.is_match(name)
925        {
926            return true;
927        }
928    }
929    false
930}
931
932fn collect_files(
933    path: &Path,
934    recursive: bool,
935    exclude_patterns: &[String],
936) -> Result<Vec<PathBuf>> {
937    let mut files = Vec::new();
938    let globset = build_globset(exclude_patterns);
939
940    if path.is_file() {
941        // Always add single files, even if they're excluded (user explicitly requested)
942        files.push(path.to_path_buf());
943    } else if recursive {
944        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
945            // Skip excluded directories entirely for efficiency
946            let name = e.file_name();
947            !globset.is_match(e.path()) && !globset.is_match(name)
948        }) {
949            match entry {
950                Ok(entry) => {
951                    if entry.file_type().is_file()
952                        && !should_exclude_path(entry.path(), exclude_patterns)
953                    {
954                        files.push(entry.path().to_path_buf());
955                    }
956                }
957                Err(e) => {
958                    // Log directory traversal errors but continue processing
959                    tracing::debug!("Skipping path due to error: {}", e);
960                    continue;
961                }
962            }
963        }
964    } else {
965        match fs::read_dir(path) {
966            Ok(read_dir) => {
967                for entry in read_dir {
968                    match entry {
969                        Ok(entry) => {
970                            let path = entry.path();
971                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
972                                files.push(path);
973                            }
974                        }
975                        Err(e) => {
976                            tracing::debug!("Skipping directory entry due to error: {}", e);
977                            continue;
978                        }
979                    }
980                }
981            }
982            Err(e) => {
983                tracing::debug!("Cannot read directory {:?}: {}", path, e);
984                return Err(e.into());
985            }
986        }
987    }
988
989    Ok(files)
990}
991
992async fn ensure_index_updated(
993    path: &Path,
994    force_reindex: bool,
995    need_embeddings: bool,
996) -> Result<()> {
997    // Handle both files and directories and reuse nearest existing .ck index up the tree
998    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
999        if path.is_file() {
1000            path.parent().unwrap_or(path).to_path_buf()
1001        } else {
1002            path.to_path_buf()
1003        }
1004    });
1005    let index_root = &index_root_buf;
1006
1007    // If force reindex is requested, always update
1008    if force_reindex {
1009        let stats = ck_index::smart_update_index_with_progress(
1010            index_root,
1011            false,
1012            None,
1013            need_embeddings,
1014            true,
1015            &[],  // Empty exclude patterns for internal engine use
1016            None, // model - use existing from index
1017        )
1018        .await?;
1019        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1020            tracing::info!(
1021                "Index updated: {} files indexed, {} orphaned files removed",
1022                stats.files_indexed,
1023                stats.orphaned_files_removed
1024            );
1025        }
1026        return Ok(());
1027    }
1028
1029    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
1030    let stats = ck_index::smart_update_index_with_progress(
1031        index_root,
1032        false,
1033        None,
1034        need_embeddings,
1035        true,
1036        &[],
1037        None, // model - use existing from index
1038    )
1039    .await?;
1040    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1041        tracing::info!(
1042            "Index updated: {} files indexed, {} orphaned files removed",
1043            stats.files_indexed,
1044            stats.orphaned_files_removed
1045        );
1046    }
1047
1048    Ok(())
1049}
1050
1051fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
1052    let before = options.before_context_lines.max(options.context_lines);
1053    let after = options.after_context_lines.max(options.context_lines);
1054
1055    if before > 0 || after > 0 {
1056        let start_idx = line_idx.saturating_sub(before);
1057        let end_idx = (line_idx + after + 1).min(lines.len());
1058        lines[start_idx..end_idx].join("\n")
1059    } else {
1060        lines[line_idx].to_string()
1061    }
1062}
1063
1064fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1065    let lang = ck_core::Language::from_path(file_path)?;
1066
1067    // Parse the file with tree-sitter and extract function/class sections
1068    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1069        let sections: Vec<(usize, usize, String)> = chunks
1070            .into_iter()
1071            .filter(|chunk| {
1072                matches!(
1073                    chunk.chunk_type,
1074                    ck_chunk::ChunkType::Function
1075                        | ck_chunk::ChunkType::Class
1076                        | ck_chunk::ChunkType::Method
1077                )
1078            })
1079            .map(|chunk| {
1080                (
1081                    chunk.span.line_start - 1, // Convert to 0-based index
1082                    chunk.span.line_end - 1,
1083                    chunk.text,
1084                )
1085            })
1086            .collect();
1087
1088        if sections.is_empty() {
1089            None
1090        } else {
1091            Some(sections)
1092        }
1093    } else {
1094        None
1095    }
1096}
1097
/// Looks up the section whose inclusive `[start, end]` line range contains
/// `line_idx`.
///
/// Sections are checked in slice order and the text of the first match is
/// returned; `None` means no section covers the line.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first_line, last_line, _)| (*first_line..=*last_line).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1109
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes a small fixed set of fixture files into `dir` and returns their
    /// paths in creation order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .into_iter()
            .map(|(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Regex-mode options for `query` that search `root` recursively; every other
    /// field keeps its default value.
    fn regex_options(root: &std::path::Path, query: &str) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: root.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_collect_files() {
        let workspace = TempDir::new().unwrap();
        let fixture_paths = create_test_files(workspace.path());

        // Non-recursive listing sees the four fixture files.
        let flat = collect_files(workspace.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Recursive walk sees the same four (there are no subdirectories).
        let walked = collect_files(workspace.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file path yields exactly that file.
        let single = collect_files(&fixture_paths[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], fixture_paths[0]);
    }

    #[test]
    fn test_regex_search() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = regex_options(workspace.path(), "rust");

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        // At least one preview must actually contain "rust" (case-insensitively).
        let has_rust_preview = results
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust"));
        assert!(has_rust_preview);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options(workspace.path(), "HELLO")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options(workspace.path(), "fn main()")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let workspace = TempDir::new().unwrap();
        let word_file = workspace.path().join("word_test.txt");
        fs::write(word_file, "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options(workspace.path(), "rust")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
        // Intent: only "rust" as a whole word should match, not "rusty" or
        // "rustacean". NOTE(review): that is not asserted here; only non-emptiness
        // is checked.
    }

    #[test]
    fn test_regex_search_top_k() {
        let workspace = TempDir::new().unwrap();

        // Ten files that each contain a match.
        (0..10).for_each(|i| {
            let file = workspace.path().join(format!("file{}.txt", i));
            fs::write(file, "test content").unwrap();
        });

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options(workspace.path(), "test")
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Span offsets must be correct when a line holds several matches.
        let workspace = TempDir::new().unwrap();
        let spans_file = workspace.path().join("spans.txt");
        fs::write(&spans_file, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            recursive: false,
            ..regex_options(&spans_file, "test")
        };

        let results = regex_search(&options).unwrap();

        // Five matches in total.
        assert_eq!(results.len(), 5);

        let matches_on_line = |line: usize| -> Vec<&SearchResult> {
            results
                .iter()
                .filter(|hit| hit.span.line_start == line)
                .collect()
        };

        // Line 1: three matches at byte offsets 0, 5 and 10.
        let first_line = matches_on_line(1);
        assert_eq!(first_line.len(), 3);
        assert_eq!(first_line[0].span.byte_start, 0);
        assert_eq!(first_line[1].span.byte_start, 5);
        assert_eq!(first_line[2].span.byte_start, 10);

        // Line 2: one match. "test test test\n" = 15 bytes, "line two " = 9 bytes.
        let second_line = matches_on_line(2);
        assert_eq!(second_line.len(), 1);
        assert_eq!(second_line[0].span.byte_start, 24);

        // Every match must start at a distinct byte offset.
        let mut offsets: Vec<_> = results.iter().map(|hit| hit.span.byte_start).collect();
        offsets.sort();
        offsets.dedup();
        assert_eq!(offsets.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(
            &target,
            "line 1: hello\nline 2: world\nline 3: rust programming",
        )
        .unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(results.len(), 1);

        let hit = &results[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(&target, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(results.len(), 1);

        let preview = &results[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one line of context on each side gives
        // lines 2, 3 and 4.
        assert!(preview.contains("line 2"));
        assert!(preview.contains("target line"));
        assert!(preview.contains("line 4"));
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options(workspace.path(), "hello")
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}