ck_engine/
lib.rs

1use anyhow::Result;
2use ck_ann::AnnIndex;
3use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
4use globset::{Glob, GlobSet, GlobSetBuilder};
5use rayon::prelude::*;
6use regex::{Regex, RegexBuilder};
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf as StdPathBuf;
10use std::path::{Path, PathBuf};
11use tantivy::collector::TopDocs;
12use tantivy::query::QueryParser;
13use tantivy::schema::{STORED, Schema, TEXT, Value};
14use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
15use walkdir::WalkDir;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
/// Boxed callback used to report human-readable progress messages while a
/// search (or an index build triggered by a search) is running.
///
/// The `Send + Sync` bounds allow the callback to be shared across threads.
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = tokio::fs::read_to_string(file_path).await?;
25    let lines: Vec<&str> = content.lines().collect();
26
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Searches upward from `path` for the closest directory that contains a `.ck`
/// entry and returns that directory.
///
/// When `path` is a file the search starts at its parent directory; otherwise
/// it starts at `path` itself. Returns `None` when no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start`, then its parent, and so on up to the root.
    start
        .ancestors()
        .find(|candidate| candidate.join(".ck").exists())
        .map(Path::to_path_buf)
}
57
58pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
59    search_with_progress(options, None).await
60}
61
62pub async fn search_with_progress(
63    options: &SearchOptions,
64    progress_callback: Option<SearchProgressCallback>,
65) -> Result<Vec<SearchResult>> {
66    // Validate that the search path exists
67    if !options.path.exists() {
68        return Err(ck_core::CkError::Search(format!(
69            "Path does not exist: {}",
70            options.path.display()
71        ))
72        .into());
73    }
74
75    // Auto-update index if needed (unless it's regex-only mode)
76    if !matches!(options.mode, SearchMode::Regex) {
77        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
78        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
79    }
80
81    match options.mode {
82        SearchMode::Regex => regex_search(options),
83        SearchMode::Lexical => lexical_search(options).await,
84        SearchMode::Semantic => {
85            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
86            semantic_search_v3_with_progress(options, progress_callback).await
87        }
88        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
89    }
90}
91
92fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
93    let pattern = if options.fixed_string {
94        regex::escape(&options.query)
95    } else if options.whole_word {
96        format!(r"\b{}\b", regex::escape(&options.query))
97    } else {
98        options.query.clone()
99    };
100
101    let regex = RegexBuilder::new(&pattern)
102        .case_insensitive(options.case_insensitive)
103        .build()
104        .map_err(CkError::Regex)?;
105
106    // Default to recursive for directories (like grep) to maintain compatibility
107    let should_recurse = options.path.is_dir() || options.recursive;
108    let files = if should_recurse {
109        // Use ck_index's collect_files which respects gitignore
110        ck_index::collect_files(
111            &options.path,
112            options.respect_gitignore,
113            &options.exclude_patterns,
114        )?
115    } else {
116        // For non-recursive, use the local collect_files
117        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
118    };
119
120    let results: Vec<Vec<SearchResult>> = files
121        .par_iter()
122        .filter_map(|file_path| match search_file(&regex, file_path, options) {
123            Ok(matches) => {
124                if matches.is_empty() {
125                    None
126                } else {
127                    Some(matches)
128                }
129            }
130            Err(e) => {
131                tracing::debug!("Error searching {:?}: {}", file_path, e);
132                None
133            }
134        })
135        .collect();
136
137    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
138    // Deterministic ordering: file path, then line number
139    all_results.sort_by(|a, b| {
140        let path_cmp = a.file.cmp(&b.file);
141        if path_cmp != std::cmp::Ordering::Equal {
142            return path_cmp;
143        }
144        a.span.line_start.cmp(&b.span.line_start)
145    });
146
147    if let Some(top_k) = options.top_k {
148        all_results.truncate(top_k);
149    }
150
151    Ok(all_results)
152}
153
154fn search_file(
155    regex: &Regex,
156    file_path: &Path,
157    options: &SearchOptions,
158) -> Result<Vec<SearchResult>> {
159    let content = fs::read_to_string(file_path)?;
160    let lines: Vec<&str> = content.lines().collect();
161    let mut results = Vec::new();
162
163    // If full_section is enabled, try to parse the file and find code sections
164    let code_sections = if options.full_section {
165        extract_code_sections(file_path, &content)
166    } else {
167        None
168    };
169
170    // Track byte offset as we iterate through lines
171    let mut byte_offset = 0;
172
173    for (line_idx, line) in lines.iter().enumerate() {
174        let line_number = line_idx + 1;
175
176        // Find all matches in the line with their positions
177        for mat in regex.find_iter(line) {
178            let preview = if options.full_section {
179                // Try to find the containing code section
180                if let Some(ref sections) = code_sections {
181                    if let Some(section) = find_containing_section(sections, line_idx) {
182                        section.clone()
183                    } else {
184                        // Fall back to context lines if no section found
185                        get_context_preview(&lines, line_idx, options)
186                    }
187                } else {
188                    get_context_preview(&lines, line_idx, options)
189                }
190            } else {
191                get_context_preview(&lines, line_idx, options)
192            };
193
194            results.push(SearchResult {
195                file: file_path.to_path_buf(),
196                span: Span {
197                    byte_start: byte_offset + mat.start(),
198                    byte_end: byte_offset + mat.end(),
199                    line_start: line_number,
200                    line_end: line_number,
201                },
202                score: 1.0,
203                preview,
204                lang: ck_core::Language::from_path(file_path),
205                symbol: None,
206                chunk_hash: None,
207                index_epoch: None,
208            });
209        }
210
211        // Update byte offset for next line (add line length + newline character)
212        byte_offset += line.len();
213        if line_idx < lines.len() - 1 {
214            byte_offset += 1; // Add 1 for the newline character
215        }
216    }
217
218    Ok(results)
219}
220
221async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
222    // Handle both files and directories and reuse nearest existing .ck index up the tree
223    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
224        if options.path.is_file() {
225            options.path.parent().unwrap_or(&options.path).to_path_buf()
226        } else {
227            options.path.clone()
228        }
229    });
230
231    let index_dir = index_root.join(".ck");
232    if !index_dir.exists() {
233        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
234    }
235
236    let tantivy_index_path = index_dir.join("tantivy_index");
237
238    if !tantivy_index_path.exists() {
239        return build_tantivy_index(options).await;
240    }
241
242    let mut schema_builder = Schema::builder();
243    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
244    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
245    let _schema = schema_builder.build();
246
247    let index = Index::open_in_dir(&tantivy_index_path)
248        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
249
250    let reader = index
251        .reader_builder()
252        .reload_policy(ReloadPolicy::OnCommitWithDelay)
253        .try_into()
254        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
255
256    let searcher = reader.searcher();
257    let query_parser = QueryParser::for_index(&index, vec![content_field]);
258
259    let query = query_parser
260        .parse_query(&options.query)
261        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
262
263    let top_docs = if let Some(top_k) = options.top_k {
264        searcher.search(&query, &TopDocs::with_limit(top_k))?
265    } else {
266        searcher.search(&query, &TopDocs::with_limit(100))?
267    };
268
269    // First, collect all results with raw scores
270    let mut raw_results = Vec::new();
271    for (_score, doc_address) in top_docs {
272        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
273        let path_text = retrieved_doc
274            .get_first(path_field)
275            .map(|field_value| field_value.as_str().unwrap_or(""))
276            .unwrap_or("");
277        let content_text = retrieved_doc
278            .get_first(content_field)
279            .map(|field_value| field_value.as_str().unwrap_or(""))
280            .unwrap_or("");
281
282        let file_path = PathBuf::from(path_text);
283        let preview = if options.full_section {
284            content_text.to_string()
285        } else {
286            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
287        };
288
289        raw_results.push((
290            _score,
291            SearchResult {
292                file: file_path,
293                span: Span {
294                    byte_start: 0,
295                    byte_end: content_text.len(),
296                    line_start: 1,
297                    line_end: content_text.lines().count(),
298                },
299                score: _score,
300                preview,
301                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
302                symbol: None,
303                chunk_hash: None,
304                index_epoch: None,
305            },
306        ));
307    }
308
309    // Normalize scores to 0-1 range and apply threshold
310    let mut results = Vec::new();
311    if !raw_results.is_empty() {
312        let max_score = raw_results
313            .iter()
314            .map(|(score, _)| *score)
315            .fold(0.0f32, f32::max);
316        if max_score > 0.0 {
317            for (raw_score, mut result) in raw_results {
318                let normalized_score = raw_score / max_score;
319
320                // Apply threshold filtering with normalized score
321                if let Some(threshold) = options.threshold
322                    && normalized_score < threshold
323                {
324                    continue;
325                }
326
327                result.score = normalized_score;
328                results.push(result);
329            }
330        }
331    }
332
333    Ok(results)
334}
335
336async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
337    // Handle both files and directories by finding the appropriate directory for indexing
338    let index_root = if options.path.is_file() {
339        options.path.parent().unwrap_or(&options.path)
340    } else {
341        &options.path
342    };
343
344    let index_dir = index_root.join(".ck");
345    let tantivy_index_path = index_dir.join("tantivy_index");
346
347    fs::create_dir_all(&tantivy_index_path)?;
348
349    let mut schema_builder = Schema::builder();
350    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
351    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
352    let schema = schema_builder.build();
353
354    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
355        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
356
357    let mut index_writer = index
358        .writer(50_000_000)
359        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
360
361    let files = collect_files(index_root, true, &options.exclude_patterns)?;
362
363    for file_path in &files {
364        if let Ok(content) = fs::read_to_string(file_path) {
365            let doc = doc!(
366                content_field => content,
367                path_field => file_path.display().to_string()
368            );
369            index_writer.add_document(doc)?;
370        }
371    }
372
373    index_writer
374        .commit()
375        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
376
377    // After building, search again with the same options
378    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
379    let mut schema_builder = Schema::builder();
380    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
381    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
382    let _schema = schema_builder.build();
383
384    let index = Index::open_in_dir(&tantivy_index_path)
385        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
386
387    let reader = index
388        .reader_builder()
389        .reload_policy(ReloadPolicy::OnCommitWithDelay)
390        .try_into()
391        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
392
393    let searcher = reader.searcher();
394    let query_parser = QueryParser::for_index(&index, vec![content_field]);
395
396    let query = query_parser
397        .parse_query(&options.query)
398        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
399
400    let top_docs = if let Some(top_k) = options.top_k {
401        searcher.search(&query, &TopDocs::with_limit(top_k))?
402    } else {
403        searcher.search(&query, &TopDocs::with_limit(100))?
404    };
405
406    // First, collect all results with raw scores
407    let mut raw_results = Vec::new();
408    for (_score, doc_address) in top_docs {
409        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
410        let path_text = retrieved_doc
411            .get_first(path_field)
412            .map(|field_value| field_value.as_str().unwrap_or(""))
413            .unwrap_or("");
414        let content_text = retrieved_doc
415            .get_first(content_field)
416            .map(|field_value| field_value.as_str().unwrap_or(""))
417            .unwrap_or("");
418
419        let file_path = PathBuf::from(path_text);
420        let preview = if options.full_section {
421            content_text.to_string()
422        } else {
423            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
424        };
425
426        raw_results.push((
427            _score,
428            SearchResult {
429                file: file_path,
430                span: Span {
431                    byte_start: 0,
432                    byte_end: content_text.len(),
433                    line_start: 1,
434                    line_end: content_text.lines().count(),
435                },
436                score: _score,
437                preview,
438                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
439                symbol: None,
440                chunk_hash: None,
441                index_epoch: None,
442            },
443        ));
444    }
445
446    // Normalize scores to 0-1 range and apply threshold
447    let mut results = Vec::new();
448    if !raw_results.is_empty() {
449        let max_score = raw_results
450            .iter()
451            .map(|(score, _)| *score)
452            .fold(0.0f32, f32::max);
453        if max_score > 0.0 {
454            for (raw_score, mut result) in raw_results {
455                let normalized_score = raw_score / max_score;
456
457                // Apply threshold filtering with normalized score
458                if let Some(threshold) = options.threshold
459                    && normalized_score < threshold
460                {
461                    continue;
462                }
463
464                result.score = normalized_score;
465                results.push(result);
466            }
467        }
468    }
469
470    Ok(results)
471}
472
473#[allow(dead_code)]
474async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
475    semantic_search_with_progress(options, None).await
476}
477
478async fn semantic_search_with_progress(
479    options: &SearchOptions,
480    progress_callback: Option<SearchProgressCallback>,
481) -> Result<Vec<SearchResult>> {
482    // Handle both files and directories and reuse nearest existing .ck index up the tree
483    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
484        if options.path.is_file() {
485            options.path.parent().unwrap_or(&options.path).to_path_buf()
486        } else {
487            options.path.clone()
488        }
489    });
490
491    let index_dir = index_root.join(".ck");
492    if !index_dir.exists() {
493        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
494    }
495
496    let ann_index_path = index_dir.join("ann_index.bin");
497    let embeddings_path = index_dir.join("embeddings.json");
498
499    if !ann_index_path.exists() || !embeddings_path.exists() {
500        return build_semantic_index_with_progress(options, progress_callback).await;
501    }
502
503    // Load the ANN index
504    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
505
506    // Load file metadata
507    let embeddings_data = fs::read_to_string(&embeddings_path)?;
508    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
509
510    // Create embedder and embed the query
511    if let Some(ref callback) = progress_callback {
512        callback("Loading embedding model...");
513    }
514
515    let mut embedder = if let Some(ref callback) = progress_callback {
516        let _cb = callback.as_ref();
517        let model_cb = Box::new(|msg: &str| {
518            // Note: We can't directly use the callback here due to lifetime issues
519            // For now, we'll just use eprintln! until we can restructure this better
520            eprintln!("Model: {}", msg);
521        }) as ck_embed::ModelDownloadCallback;
522        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
523    } else {
524        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
525    };
526    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
527
528    if query_embeddings.is_empty() {
529        return Ok(Vec::new());
530    }
531
532    let query_embedding = &query_embeddings[0];
533
534    // Search using ANN
535    let top_k = options.top_k.unwrap_or(10);
536    let similar_docs = ann_index.search(query_embedding, top_k);
537
538    let mut results = Vec::new();
539
540    // Check if we're searching a specific file vs. a directory
541    let filter_by_file = options.path.is_file();
542    let target_file = if filter_by_file {
543        Some(
544            options
545                .path
546                .canonicalize()
547                .unwrap_or_else(|_| options.path.clone()),
548        )
549    } else {
550        None
551    };
552
553    for (doc_id, similarity) in similar_docs {
554        // Apply threshold filtering
555        if let Some(threshold) = options.threshold
556            && similarity < threshold
557        {
558            continue;
559        }
560
561        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
562            // Filter by target file if specified
563            if let Some(target) = &target_file {
564                let canonical_result = file_path
565                    .canonicalize()
566                    .unwrap_or_else(|_| file_path.clone());
567                if canonical_result != *target {
568                    continue; // Skip this result if it doesn't match the target file
569                }
570            }
571
572            // If full_section is enabled and this is a code section, return the full content
573            let preview = if options.full_section {
574                content.clone()
575            } else {
576                content.lines().take(3).collect::<Vec<_>>().join("\n")
577            };
578
579            results.push(SearchResult {
580                file: file_path.clone(),
581                span: Span {
582                    byte_start: 0,
583                    byte_end: content.len(),
584                    line_start: 1,
585                    line_end: content.lines().count(),
586                },
587                score: similarity,
588                preview,
589                lang: ck_core::Language::from_path(file_path),
590                symbol: None,
591                chunk_hash: None,
592                index_epoch: None,
593            });
594        }
595    }
596
597    Ok(results)
598}
599
600#[allow(dead_code)]
601async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
602    build_semantic_index_with_progress(options, None).await
603}
604
605async fn build_semantic_index_with_progress(
606    options: &SearchOptions,
607    progress_callback: Option<SearchProgressCallback>,
608) -> Result<Vec<SearchResult>> {
609    // Handle both files and directories by finding the appropriate directory for indexing
610    let index_root = if options.path.is_file() {
611        options.path.parent().unwrap_or(&options.path)
612    } else {
613        &options.path
614    };
615
616    let index_dir = index_root.join(".ck");
617    let ann_index_path = index_dir.join("ann_index.bin");
618    let embeddings_path = index_dir.join("embeddings.json");
619
620    fs::create_dir_all(&index_dir)?;
621
622    if let Some(ref callback) = progress_callback {
623        callback("Building semantic index (no index found)...");
624    }
625
626    // Always print this important message, even in quiet mode for indexing operations
627    eprintln!("Building semantic index (no existing index found)...");
628
629    // Collect files and their content
630    let files = collect_files(index_root, true, &options.exclude_patterns)?;
631
632    if let Some(ref callback) = progress_callback {
633        callback(&format!("Found {} files to index", files.len()));
634    }
635    eprintln!("Found {} files to embed and index", files.len());
636
637    let mut file_embeddings = Vec::new();
638    let mut embeddings = Vec::new();
639
640    // Create embedder with progress callback
641    if let Some(ref callback) = progress_callback {
642        callback("Loading embedding model...");
643    }
644
645    let model_callback = if progress_callback.is_some() {
646        Some(Box::new(|msg: &str| {
647            eprintln!("Model: {}", msg);
648        }) as ck_embed::ModelDownloadCallback)
649    } else {
650        None
651    };
652
653    let mut embedder =
654        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
655
656    if let Some(ref callback) = progress_callback {
657        callback("Generating embeddings for code chunks...");
658    }
659
660    for (file_idx, file_path) in files.iter().enumerate() {
661        if let Ok(content) = fs::read_to_string(file_path) {
662            if let Some(ref callback) = progress_callback {
663                let file_name = file_path
664                    .file_name()
665                    .map(|n| n.to_string_lossy().to_string())
666                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
667                callback(&format!(
668                    "Processing {}/{}: {}",
669                    file_idx + 1,
670                    files.len(),
671                    file_name
672                ));
673            }
674
675            // Chunk the content for better embeddings
676            let chunks = ck_chunk::chunk_text(&content, ck_core::Language::from_path(file_path))?;
677
678            for chunk in chunks {
679                let chunk_embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
680                if !chunk_embeddings.is_empty() {
681                    embeddings.push(chunk_embeddings[0].clone());
682                    file_embeddings.push((file_path.clone(), chunk.text));
683                }
684            }
685        }
686    }
687
688    if let Some(ref callback) = progress_callback {
689        callback(&format!(
690            "Built {} embeddings, creating search index...",
691            embeddings.len()
692        ));
693    }
694    eprintln!(
695        "Generated {} embeddings, building search index...",
696        embeddings.len()
697    );
698
699    // Build ANN index
700    let index = ck_ann::SimpleIndex::build(&embeddings)?;
701    index.save(&ann_index_path)?;
702
703    // Save file embeddings metadata
704    let embeddings_json = serde_json::to_string(&file_embeddings)?;
705    fs::write(&embeddings_path, embeddings_json)?;
706
707    if let Some(ref callback) = progress_callback {
708        callback("Semantic index built successfully, running search...");
709    }
710    eprintln!("Semantic index built successfully!");
711
712    // After building, search again - inline to avoid recursion
713    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
714
715    // Load file metadata
716    let embeddings_data = fs::read_to_string(&embeddings_path)?;
717    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
718
719    // Create embedder and embed the query
720    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
721    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
722
723    if query_embeddings.is_empty() {
724        return Ok(Vec::new());
725    }
726
727    let query_embedding = &query_embeddings[0];
728
729    // Search using ANN
730    let top_k = options.top_k.unwrap_or(10);
731    let similar_docs = ann_index.search(query_embedding, top_k);
732
733    let mut results = Vec::new();
734
735    // Check if we're searching a specific file vs. a directory
736    let filter_by_file = options.path.is_file();
737    let target_file = if filter_by_file {
738        Some(
739            options
740                .path
741                .canonicalize()
742                .unwrap_or_else(|_| options.path.clone()),
743        )
744    } else {
745        None
746    };
747
748    for (doc_id, similarity) in similar_docs {
749        // Apply threshold filtering
750        if let Some(threshold) = options.threshold
751            && similarity < threshold
752        {
753            continue;
754        }
755
756        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
757            // Filter by target file if specified
758            if let Some(target) = &target_file {
759                let canonical_result = file_path
760                    .canonicalize()
761                    .unwrap_or_else(|_| file_path.clone());
762                if canonical_result != *target {
763                    continue; // Skip this result if it doesn't match the target file
764                }
765            }
766
767            // If full_section is enabled and this is a code section, return the full content
768            let preview = if options.full_section {
769                content.clone()
770            } else {
771                content.lines().take(3).collect::<Vec<_>>().join("\n")
772            };
773
774            results.push(SearchResult {
775                file: file_path.clone(),
776                span: Span {
777                    byte_start: 0,
778                    byte_end: content.len(),
779                    line_start: 1,
780                    line_end: content.lines().count(),
781                },
782                score: similarity,
783                preview,
784                lang: ck_core::Language::from_path(file_path),
785                symbol: None,
786                chunk_hash: None,
787                index_epoch: None,
788            });
789        }
790    }
791
792    Ok(results)
793}
794
795#[allow(dead_code)]
796async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
797    hybrid_search_with_progress(options, None).await
798}
799
800async fn hybrid_search_with_progress(
801    options: &SearchOptions,
802    progress_callback: Option<SearchProgressCallback>,
803) -> Result<Vec<SearchResult>> {
804    if let Some(ref callback) = progress_callback {
805        callback("Running regex search...");
806    }
807    let regex_results = regex_search(options)?;
808
809    if let Some(ref callback) = progress_callback {
810        callback("Running semantic search...");
811    }
812    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
813
814    let mut combined = HashMap::new();
815
816    for (rank, result) in regex_results.iter().enumerate() {
817        let key = format!("{}:{}", result.file.display(), result.span.line_start);
818        combined
819            .entry(key)
820            .or_insert(Vec::new())
821            .push((rank + 1, result.clone()));
822    }
823
824    for (rank, result) in semantic_results.iter().enumerate() {
825        let key = format!("{}:{}", result.file.display(), result.span.line_start);
826        combined
827            .entry(key)
828            .or_insert(Vec::new())
829            .push((rank + 1, result.clone()));
830    }
831
832    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
833    let mut rrf_results: Vec<SearchResult> = combined
834        .into_values()
835        .map(|ranks| {
836            let mut result = ranks[0].1.clone();
837            let rrf_score = ranks
838                .iter()
839                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
840                .sum();
841            result.score = rrf_score;
842            result
843        })
844        .filter(|result| {
845            // Apply threshold filtering to raw RRF scores
846            if let Some(threshold) = options.threshold {
847                result.score >= threshold
848            } else {
849                true
850            }
851        })
852        .collect();
853
854    // Sort by RRF score (highest first)
855    rrf_results.sort_by(|a, b| {
856        b.score
857            .partial_cmp(&a.score)
858            .unwrap_or(std::cmp::Ordering::Equal)
859    });
860
861    if let Some(top_k) = options.top_k {
862        rrf_results.truncate(top_k);
863    }
864
865    Ok(rrf_results)
866}
867
868fn build_globset(patterns: &[String]) -> GlobSet {
869    let mut builder = GlobSetBuilder::new();
870    for pat in patterns {
871        // Treat patterns as filename or directory globs
872        if let Ok(glob) = Glob::new(pat) {
873            builder.add(glob);
874        }
875    }
876    builder.build().unwrap_or_else(|_| GlobSet::empty())
877}
878
879fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
880    let globset = build_globset(exclude_patterns);
881    // Match against each path component and the full path
882    if globset.is_match(path) {
883        return true;
884    }
885    for component in path.components() {
886        if let std::path::Component::Normal(name) = component
887            && globset.is_match(name)
888        {
889            return true;
890        }
891    }
892    false
893}
894
895fn collect_files(
896    path: &Path,
897    recursive: bool,
898    exclude_patterns: &[String],
899) -> Result<Vec<PathBuf>> {
900    let mut files = Vec::new();
901    let globset = build_globset(exclude_patterns);
902
903    if path.is_file() {
904        // Always add single files, even if they're excluded (user explicitly requested)
905        files.push(path.to_path_buf());
906    } else if recursive {
907        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
908            // Skip excluded directories entirely for efficiency
909            let name = e.file_name();
910            !globset.is_match(e.path()) && !globset.is_match(name)
911        }) {
912            match entry {
913                Ok(entry) => {
914                    if entry.file_type().is_file()
915                        && !should_exclude_path(entry.path(), exclude_patterns)
916                    {
917                        files.push(entry.path().to_path_buf());
918                    }
919                }
920                Err(e) => {
921                    // Log directory traversal errors but continue processing
922                    tracing::debug!("Skipping path due to error: {}", e);
923                    continue;
924                }
925            }
926        }
927    } else {
928        match fs::read_dir(path) {
929            Ok(read_dir) => {
930                for entry in read_dir {
931                    match entry {
932                        Ok(entry) => {
933                            let path = entry.path();
934                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
935                                files.push(path);
936                            }
937                        }
938                        Err(e) => {
939                            tracing::debug!("Skipping directory entry due to error: {}", e);
940                            continue;
941                        }
942                    }
943                }
944            }
945            Err(e) => {
946                tracing::debug!("Cannot read directory {:?}: {}", path, e);
947                return Err(e.into());
948            }
949        }
950    }
951
952    Ok(files)
953}
954
955async fn ensure_index_updated(
956    path: &Path,
957    force_reindex: bool,
958    need_embeddings: bool,
959) -> Result<()> {
960    // Handle both files and directories and reuse nearest existing .ck index up the tree
961    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
962        if path.is_file() {
963            path.parent().unwrap_or(path).to_path_buf()
964        } else {
965            path.to_path_buf()
966        }
967    });
968    let index_root = &index_root_buf;
969
970    // If force reindex is requested, always update
971    if force_reindex {
972        let stats = ck_index::smart_update_index_with_progress(
973            index_root,
974            false,
975            None,
976            need_embeddings,
977            true,
978            &[],  // Empty exclude patterns for internal engine use
979            None, // model - use existing from index
980        )
981        .await?;
982        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
983            tracing::info!(
984                "Index updated: {} files indexed, {} orphaned files removed",
985                stats.files_indexed,
986                stats.orphaned_files_removed
987            );
988        }
989        return Ok(());
990    }
991
992    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
993    let stats = ck_index::smart_update_index_with_progress(
994        index_root,
995        false,
996        None,
997        need_embeddings,
998        true,
999        &[],
1000        None, // model - use existing from index
1001    )
1002    .await?;
1003    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1004        tracing::info!(
1005            "Index updated: {} files indexed, {} orphaned files removed",
1006            stats.files_indexed,
1007            stats.orphaned_files_removed
1008        );
1009    }
1010
1011    Ok(())
1012}
1013
1014fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
1015    let before = options.before_context_lines.max(options.context_lines);
1016    let after = options.after_context_lines.max(options.context_lines);
1017
1018    if before > 0 || after > 0 {
1019        let start_idx = line_idx.saturating_sub(before);
1020        let end_idx = (line_idx + after + 1).min(lines.len());
1021        lines[start_idx..end_idx].join("\n")
1022    } else {
1023        lines[line_idx].to_string()
1024    }
1025}
1026
1027fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1028    let lang = ck_core::Language::from_path(file_path)?;
1029
1030    // Parse the file with tree-sitter and extract function/class sections
1031    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1032        let sections: Vec<(usize, usize, String)> = chunks
1033            .into_iter()
1034            .filter(|chunk| {
1035                matches!(
1036                    chunk.chunk_type,
1037                    ck_chunk::ChunkType::Function
1038                        | ck_chunk::ChunkType::Class
1039                        | ck_chunk::ChunkType::Method
1040                )
1041            })
1042            .map(|chunk| {
1043                (
1044                    chunk.span.line_start - 1, // Convert to 0-based index
1045                    chunk.span.line_end - 1,
1046                    chunk.text,
1047                )
1048            })
1049            .collect();
1050
1051        if sections.is_empty() {
1052            None
1053        } else {
1054            Some(sections)
1055        }
1056    } else {
1057        None
1058    }
1059}
1060
/// Return the text of the first section whose inclusive `[start, end]` line
/// range contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are checked in slice order, so with overlapping ranges the earliest
/// entry wins.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1072
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Write the four standard fixture files into `dir` and return their paths
    /// in creation order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ]
        .into_iter()
        .map(|(name, body)| {
            let target = dir.join(name);
            fs::write(&target, body).unwrap();
            target
        })
        .collect()
    }

    /// Recursive regex-mode options for `query` rooted at `root`; every other
    /// field keeps its default value.
    fn regex_options(query: &str, root: &std::path::Path) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: root.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_collect_files() {
        let sandbox = TempDir::new().unwrap();
        let fixtures = create_test_files(sandbox.path());

        // Direct children only
        let flat = collect_files(sandbox.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Full tree walk
        let walked = collect_files(sandbox.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A file path yields exactly that file
        let single = collect_files(&fixtures[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], fixtures[0]);
    }

    #[test]
    fn test_regex_search() {
        let sandbox = TempDir::new().unwrap();
        create_test_files(sandbox.path());

        let hits = regex_search(&regex_options("rust", sandbox.path())).unwrap();
        assert!(!hits.is_empty());

        // At least one preview must mention "rust" (in any letter case)
        let mentions_rust = hits
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust"));
        assert!(mentions_rust);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let sandbox = TempDir::new().unwrap();
        create_test_files(sandbox.path());

        let opts = SearchOptions {
            case_insensitive: true,
            ..regex_options("HELLO", sandbox.path())
        };

        let hits = regex_search(&opts).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let sandbox = TempDir::new().unwrap();
        create_test_files(sandbox.path());

        let opts = SearchOptions {
            fixed_string: true,
            ..regex_options("fn main()", sandbox.path())
        };

        let hits = regex_search(&opts).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let sandbox = TempDir::new().unwrap();
        let word_file = sandbox.path().join("word_test.txt");
        fs::write(word_file, "rust rusty rustacean").unwrap();

        let opts = SearchOptions {
            whole_word: true,
            ..regex_options("rust", sandbox.path())
        };

        // Intended: only the standalone word "rust" matches, not "rusty" or
        // "rustacean". This test only checks that something matched.
        let hits = regex_search(&opts).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let sandbox = TempDir::new().unwrap();

        // Ten files that each contain a match
        (0..10).for_each(|i| {
            let target = sandbox.path().join(format!("file{}.txt", i));
            fs::write(target, "test content").unwrap();
        });

        let opts = SearchOptions {
            top_k: Some(5),
            ..regex_options("test", sandbox.path())
        };

        let hits = regex_search(&opts).unwrap();
        assert!(hits.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Several matches on one line must each report their own byte offset
        let sandbox = TempDir::new().unwrap();
        let spans_file = sandbox.path().join("spans.txt");
        fs::write(&spans_file, "test test test\nline two test\ntest end").unwrap();

        let opts = SearchOptions {
            recursive: false,
            ..regex_options("test", &spans_file)
        };

        let hits = regex_search(&opts).unwrap();

        // Five matches in total
        assert_eq!(hits.len(), 5);

        // First line: three matches at byte offsets 0, 5 and 10
        let on_line_one: Vec<_> = hits.iter().filter(|h| h.span.line_start == 1).collect();
        assert_eq!(on_line_one.len(), 3);
        assert_eq!(on_line_one[0].span.byte_start, 0);
        assert_eq!(on_line_one[1].span.byte_start, 5);
        assert_eq!(on_line_one[2].span.byte_start, 10);

        // Second line: "test test test\n" = 15 bytes, "line two " = 9 bytes
        let on_line_two: Vec<_> = hits.iter().filter(|h| h.span.line_start == 2).collect();
        assert_eq!(on_line_two.len(), 1);
        assert_eq!(on_line_two[0].span.byte_start, 24);

        // No two matches may share a byte offset
        let distinct_starts: std::collections::BTreeSet<_> =
            hits.iter().map(|h| h.span.byte_start).collect();
        assert_eq!(distinct_starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let sandbox = TempDir::new().unwrap();
        let target = sandbox.path().join("test.txt");
        fs::write(
            &target,
            "line 1: hello\nline 2: world\nline 3: rust programming",
        )
        .unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let opts = SearchOptions::default();

        let hits = search_file(&pattern, &target, &opts).unwrap();
        assert_eq!(hits.len(), 1);

        let only_hit = &hits[0];
        assert_eq!(only_hit.span.line_start, 3);
        assert!(only_hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let sandbox = TempDir::new().unwrap();
        let target = sandbox.path().join("test.txt");
        fs::write(&target, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let opts = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let hits = search_file(&pattern, &target, &opts).unwrap();
        assert_eq!(hits.len(), 1);

        let preview = &hits[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one line of context each side gives lines 2, 3 and 4
        for expected in ["line 2", "target line", "line 4"] {
            assert!(preview.contains(expected));
        }
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let sandbox = TempDir::new().unwrap();
        create_test_files(sandbox.path());

        let opts = SearchOptions {
            case_insensitive: true,
            ..regex_options("hello", sandbox.path())
        };

        let hits = search(&opts).await.unwrap();
        assert!(!hits.is_empty());
    }
}