ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::fs;
7use std::path::{Path, PathBuf};
8use walkdir::WalkDir;
9use rayon::prelude::*;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{Schema, STORED, TEXT, Value};
13use tantivy::{doc, Index, ReloadPolicy, TantivyDocument};
14use ck_ann::AnnIndex;
15use std::path::PathBuf as StdPathBuf;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
/// Callback that receives human-readable status messages while a search runs.
///
/// The search entry points in this file accept it as
/// `Option<SearchProgressCallback>` and invoke it with short strings such as
/// "Loading embedding model...".
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = fs::read_to_string(file_path)?;
25    let lines: Vec<&str> = content.lines().collect();
26    
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30    
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33    
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Walk upward from `path` and return the closest directory containing a `.ck` entry.
///
/// The walk starts at the parent directory when `path` is a file, and at
/// `path` itself otherwise. Returns `None` when no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = match path.parent() {
        Some(parent) if path.is_file() => parent,
        _ => path,
    };

    // `ancestors` yields `start` first, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
53
54pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
55    search_with_progress(options, None).await
56}
57
58pub async fn search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
59    // Validate that the search path exists
60    if !options.path.exists() {
61        return Err(ck_core::CkError::Search(format!("Path does not exist: {}", options.path.display())).into());
62    }
63    
64    // Auto-update index if needed (unless it's regex-only mode)
65    if !matches!(options.mode, SearchMode::Regex) {
66        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
67        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
68    }
69    
70    match options.mode {
71        SearchMode::Regex => regex_search(options),
72        SearchMode::Lexical => lexical_search(options).await,
73        SearchMode::Semantic => {
74            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
75            semantic_search_v3_with_progress(options, progress_callback).await
76        },
77        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
78    }
79}
80
81fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
82    let pattern = if options.fixed_string {
83        regex::escape(&options.query)
84    } else if options.whole_word {
85        format!(r"\b{}\b", regex::escape(&options.query))
86    } else {
87        options.query.clone()
88    };
89    
90    let regex = RegexBuilder::new(&pattern)
91        .case_insensitive(options.case_insensitive)
92        .build()
93        .map_err(|e| CkError::Regex(e))?;
94    
95    // Default to recursive for directories (like grep) to maintain compatibility
96    let should_recurse = options.path.is_dir() || options.recursive;
97    let files = collect_files(&options.path, should_recurse, &options.exclude_patterns)?;
98    
99    let results: Vec<Vec<SearchResult>> = files
100        .par_iter()
101        .filter_map(|file_path| {
102            match search_file(&regex, file_path, options) {
103                Ok(matches) => {
104                    if matches.is_empty() {
105                        None
106                    } else {
107                        Some(matches)
108                    }
109                }
110                Err(e) => {
111                    tracing::debug!("Error searching {:?}: {}", file_path, e);
112                    None
113                }
114            }
115        })
116        .collect();
117    
118    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
119    // Deterministic ordering: file path, then line number
120    all_results.sort_by(|a, b| {
121        let path_cmp = a.file.cmp(&b.file);
122        if path_cmp != std::cmp::Ordering::Equal {
123            return path_cmp;
124        }
125        a.span.line_start.cmp(&b.span.line_start)
126    });
127    
128    if let Some(top_k) = options.top_k {
129        all_results.truncate(top_k);
130    }
131    
132    Ok(all_results)
133}
134
135fn search_file(regex: &Regex, file_path: &Path, options: &SearchOptions) -> Result<Vec<SearchResult>> {
136    let content = fs::read_to_string(file_path)?;
137    let lines: Vec<&str> = content.lines().collect();
138    let mut results = Vec::new();
139    
140    // If full_section is enabled, try to parse the file and find code sections
141    let code_sections = if options.full_section {
142        extract_code_sections(file_path, &content)
143    } else {
144        None
145    };
146    
147    for (line_idx, line) in lines.iter().enumerate() {
148        let line_number = line_idx + 1;
149        
150        if regex.is_match(line) {
151            let preview = if options.full_section {
152                // Try to find the containing code section
153                if let Some(ref sections) = code_sections {
154                    if let Some(section) = find_containing_section(sections, line_idx) {
155                        section.clone()
156                    } else {
157                        // Fall back to context lines if no section found
158                        get_context_preview(&lines, line_idx, options)
159                    }
160                } else {
161                    get_context_preview(&lines, line_idx, options)
162                }
163            } else {
164                get_context_preview(&lines, line_idx, options)
165            };
166            
167            results.push(SearchResult {
168                file: file_path.to_path_buf(),
169                span: Span {
170                    byte_start: 0,
171                    byte_end: line.len(),
172                    line_start: line_number,
173                    line_end: line_number,
174                },
175                score: 1.0,
176                preview,
177                lang: detect_language(file_path),
178                symbol: None,
179            });
180        }
181    }
182    
183    Ok(results)
184}
185
186async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
187    // Handle both files and directories and reuse nearest existing .ck index up the tree
188    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
189        if options.path.is_file() {
190            options.path.parent().unwrap_or(&options.path).to_path_buf()
191        } else {
192            options.path.clone()
193        }
194    });
195    
196    let index_dir = index_root.join(".ck");
197    if !index_dir.exists() {
198        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
199    }
200    
201    let tantivy_index_path = index_dir.join("tantivy_index");
202    
203    if !tantivy_index_path.exists() {
204        return build_tantivy_index(options).await;
205    }
206    
207    let mut schema_builder = Schema::builder();
208    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
209    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
210    let _schema = schema_builder.build();
211    
212    let index = Index::open_in_dir(&tantivy_index_path)
213        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
214    
215    let reader = index
216        .reader_builder()
217        .reload_policy(ReloadPolicy::OnCommitWithDelay)
218        .try_into()
219        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
220    
221    let searcher = reader.searcher();
222    let query_parser = QueryParser::for_index(&index, vec![content_field]);
223    
224    let query = query_parser
225        .parse_query(&options.query)
226        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
227    
228    let top_docs = if let Some(top_k) = options.top_k {
229        searcher.search(&query, &TopDocs::with_limit(top_k))?
230    } else {
231        searcher.search(&query, &TopDocs::with_limit(100))?
232    };
233    
234    // First, collect all results with raw scores
235    let mut raw_results = Vec::new();
236    for (_score, doc_address) in top_docs {
237        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
238        let path_text = retrieved_doc
239            .get_first(path_field)
240            .map(|field_value| field_value.as_str().unwrap_or(""))
241            .unwrap_or("");
242        let content_text = retrieved_doc
243            .get_first(content_field)
244            .map(|field_value| field_value.as_str().unwrap_or(""))
245            .unwrap_or("");
246        
247        let file_path = PathBuf::from(path_text);
248        let preview = if options.full_section {
249            content_text.to_string()
250        } else {
251            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
252        };
253        
254        raw_results.push((_score, SearchResult {
255            file: file_path,
256            span: Span {
257                byte_start: 0,
258                byte_end: content_text.len(),
259                line_start: 1,
260                line_end: content_text.lines().count(),
261            },
262            score: _score,
263            preview,
264            lang: detect_language(&PathBuf::from(path_text)),
265            symbol: None,
266        }));
267    }
268    
269    // Normalize scores to 0-1 range and apply threshold
270    let mut results = Vec::new();
271    if !raw_results.is_empty() {
272        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
273        if max_score > 0.0 {
274            for (raw_score, mut result) in raw_results {
275                let normalized_score = raw_score / max_score;
276                
277                // Apply threshold filtering with normalized score
278                if let Some(threshold) = options.threshold {
279                    if normalized_score < threshold {
280                        continue;
281                    }
282                }
283                
284                result.score = normalized_score;
285                results.push(result);
286            }
287        }
288    }
289    
290    Ok(results)
291}
292
293async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
294    // Handle both files and directories by finding the appropriate directory for indexing
295    let index_root = if options.path.is_file() {
296        options.path.parent().unwrap_or(&options.path)
297    } else {
298        &options.path
299    };
300    
301    let index_dir = index_root.join(".ck");
302    let tantivy_index_path = index_dir.join("tantivy_index");
303    
304    fs::create_dir_all(&tantivy_index_path)?;
305    
306    let mut schema_builder = Schema::builder();
307    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
308    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
309    let schema = schema_builder.build();
310    
311    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
312        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
313    
314    let mut index_writer = index.writer(50_000_000)
315        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
316    
317    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
318    
319    for file_path in &files {
320        if let Ok(content) = fs::read_to_string(file_path) {
321            let doc = doc!(
322                content_field => content,
323                path_field => file_path.display().to_string()
324            );
325            index_writer.add_document(doc)?;
326        }
327    }
328    
329    index_writer.commit()
330        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
331    
332    // After building, search again with the same options  
333    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
334    let mut schema_builder = Schema::builder();
335    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
336    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
337    let _schema = schema_builder.build();
338    
339    let index = Index::open_in_dir(&tantivy_index_path)
340        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
341    
342    let reader = index
343        .reader_builder()
344        .reload_policy(ReloadPolicy::OnCommitWithDelay)
345        .try_into()
346        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
347    
348    let searcher = reader.searcher();
349    let query_parser = QueryParser::for_index(&index, vec![content_field]);
350    
351    let query = query_parser
352        .parse_query(&options.query)
353        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
354    
355    let top_docs = if let Some(top_k) = options.top_k {
356        searcher.search(&query, &TopDocs::with_limit(top_k))?
357    } else {
358        searcher.search(&query, &TopDocs::with_limit(100))?
359    };
360    
361    // First, collect all results with raw scores
362    let mut raw_results = Vec::new();
363    for (_score, doc_address) in top_docs {
364        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
365        let path_text = retrieved_doc
366            .get_first(path_field)
367            .map(|field_value| field_value.as_str().unwrap_or(""))
368            .unwrap_or("");
369        let content_text = retrieved_doc
370            .get_first(content_field)
371            .map(|field_value| field_value.as_str().unwrap_or(""))
372            .unwrap_or("");
373        
374        let file_path = PathBuf::from(path_text);
375        let preview = if options.full_section {
376            content_text.to_string()
377        } else {
378            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
379        };
380        
381        raw_results.push((_score, SearchResult {
382            file: file_path,
383            span: Span {
384                byte_start: 0,
385                byte_end: content_text.len(),
386                line_start: 1,
387                line_end: content_text.lines().count(),
388            },
389            score: _score,
390            preview,
391            lang: detect_language(&PathBuf::from(path_text)),
392            symbol: None,
393        }));
394    }
395    
396    // Normalize scores to 0-1 range and apply threshold
397    let mut results = Vec::new();
398    if !raw_results.is_empty() {
399        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
400        if max_score > 0.0 {
401            for (raw_score, mut result) in raw_results {
402                let normalized_score = raw_score / max_score;
403                
404                // Apply threshold filtering with normalized score
405                if let Some(threshold) = options.threshold {
406                    if normalized_score < threshold {
407                        continue;
408                    }
409                }
410                
411                result.score = normalized_score;
412                results.push(result);
413            }
414        }
415    }
416    
417    Ok(results)
418}
419
420#[allow(dead_code)]
421async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
422    semantic_search_with_progress(options, None).await
423}
424
425async fn semantic_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
426    // Handle both files and directories and reuse nearest existing .ck index up the tree
427    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
428        if options.path.is_file() {
429            options.path.parent().unwrap_or(&options.path).to_path_buf()
430        } else {
431            options.path.clone()
432        }
433    });
434    
435    let index_dir = index_root.join(".ck");
436    if !index_dir.exists() {
437        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
438    }
439    
440    let ann_index_path = index_dir.join("ann_index.bin");
441    let embeddings_path = index_dir.join("embeddings.json");
442    
443    if !ann_index_path.exists() || !embeddings_path.exists() {
444        return build_semantic_index_with_progress(options, progress_callback).await;
445    }
446    
447    // Load the ANN index
448    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
449    
450    // Load file metadata
451    let embeddings_data = fs::read_to_string(&embeddings_path)?;
452    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
453    
454    // Create embedder and embed the query
455    if let Some(ref callback) = progress_callback {
456        callback("Loading embedding model...");
457    }
458    
459    let mut embedder = if let Some(ref callback) = progress_callback {
460        let _cb = callback.as_ref();
461        let model_cb = Box::new(|msg: &str| {
462            // Note: We can't directly use the callback here due to lifetime issues
463            // For now, we'll just use eprintln! until we can restructure this better
464            eprintln!("Model: {}", msg);
465        }) as ck_embed::ModelDownloadCallback;
466        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
467    } else {
468        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
469    };
470    let query_embeddings = embedder.embed(&[options.query.clone()])?;
471    
472    if query_embeddings.is_empty() {
473        return Ok(Vec::new());
474    }
475    
476    let query_embedding = &query_embeddings[0];
477    
478    // Search using ANN
479    let top_k = options.top_k.unwrap_or(10);
480    let similar_docs = ann_index.search(query_embedding, top_k);
481    
482    let mut results = Vec::new();
483    
484    // Check if we're searching a specific file vs. a directory
485    let filter_by_file = options.path.is_file();
486    let target_file = if filter_by_file {
487        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
488    } else {
489        None
490    };
491    
492    for (doc_id, similarity) in similar_docs {
493        // Apply threshold filtering
494        if let Some(threshold) = options.threshold {
495            if similarity < threshold {
496                continue;
497            }
498        }
499        
500        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
501            // Filter by target file if specified
502            if let Some(target) = &target_file {
503                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
504                if canonical_result != *target {
505                    continue; // Skip this result if it doesn't match the target file
506                }
507            }
508            
509            // If full_section is enabled and this is a code section, return the full content
510            let preview = if options.full_section {
511                content.clone()
512            } else {
513                content.lines().take(3).collect::<Vec<_>>().join("\n")
514            };
515            
516            results.push(SearchResult {
517                file: file_path.clone(),
518                span: Span {
519                    byte_start: 0,
520                    byte_end: content.len(),
521                    line_start: 1,
522                    line_end: content.lines().count(),
523                },
524                score: similarity,
525                preview,
526                lang: detect_language(file_path),
527                symbol: None,
528            });
529        }
530    }
531    
532    Ok(results)
533}
534
535#[allow(dead_code)]
536async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
537    build_semantic_index_with_progress(options, None).await
538}
539
540async fn build_semantic_index_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
541    // Handle both files and directories by finding the appropriate directory for indexing
542    let index_root = if options.path.is_file() {
543        options.path.parent().unwrap_or(&options.path)
544    } else {
545        &options.path
546    };
547    
548    let index_dir = index_root.join(".ck");
549    let ann_index_path = index_dir.join("ann_index.bin");
550    let embeddings_path = index_dir.join("embeddings.json");
551    
552    fs::create_dir_all(&index_dir)?;
553    
554    if let Some(ref callback) = progress_callback {
555        callback("Building semantic index (no index found)...");
556    }
557    
558    // Always print this important message, even in quiet mode for indexing operations
559    eprintln!("Building semantic index (no existing index found)...");
560    
561    // Collect files and their content
562    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
563    
564    if let Some(ref callback) = progress_callback {
565        callback(&format!("Found {} files to index", files.len()));
566    }
567    eprintln!("Found {} files to embed and index", files.len());
568    
569    let mut file_embeddings = Vec::new();
570    let mut embeddings = Vec::new();
571    
572    // Create embedder with progress callback
573    if let Some(ref callback) = progress_callback {
574        callback("Loading embedding model...");
575    }
576    
577    let model_callback = if progress_callback.is_some() {
578        Some(Box::new(|msg: &str| {
579            eprintln!("Model: {}", msg);
580        }) as ck_embed::ModelDownloadCallback)
581    } else {
582        None
583    };
584    
585    let mut embedder = ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
586    
587    if let Some(ref callback) = progress_callback {
588        callback("Generating embeddings for code chunks...");
589    }
590    
591    for (file_idx, file_path) in files.iter().enumerate() {
592        if let Ok(content) = fs::read_to_string(file_path) {
593            if let Some(ref callback) = progress_callback {
594                let file_name = file_path.file_name()
595                    .map(|n| n.to_string_lossy().to_string())
596                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
597                callback(&format!("Processing {}/{}: {}", file_idx + 1, files.len(), file_name));
598            }
599            
600            // Chunk the content for better embeddings
601            let chunks = ck_chunk::chunk_text(&content, detect_language(file_path).as_deref())?;
602            
603            for chunk in chunks {
604                let chunk_embeddings = embedder.embed(&[chunk.text.clone()])?;
605                if !chunk_embeddings.is_empty() {
606                    embeddings.push(chunk_embeddings[0].clone());
607                    file_embeddings.push((file_path.clone(), chunk.text));
608                }
609            }
610        }
611    }
612    
613    if let Some(ref callback) = progress_callback {
614        callback(&format!("Built {} embeddings, creating search index...", embeddings.len()));
615    }
616    eprintln!("Generated {} embeddings, building search index...", embeddings.len());
617    
618    // Build ANN index
619    let index = ck_ann::SimpleIndex::build(&embeddings)?;
620    index.save(&ann_index_path)?;
621    
622    // Save file embeddings metadata
623    let embeddings_json = serde_json::to_string(&file_embeddings)?;
624    fs::write(&embeddings_path, embeddings_json)?;
625    
626    if let Some(ref callback) = progress_callback {
627        callback("Semantic index built successfully, running search...");
628    }
629    eprintln!("Semantic index built successfully!");
630    
631    // After building, search again - inline to avoid recursion
632    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
633    
634    // Load file metadata
635    let embeddings_data = fs::read_to_string(&embeddings_path)?;
636    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
637    
638    // Create embedder and embed the query
639    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
640    let query_embeddings = embedder.embed(&[options.query.clone()])?;
641    
642    if query_embeddings.is_empty() {
643        return Ok(Vec::new());
644    }
645    
646    let query_embedding = &query_embeddings[0];
647    
648    // Search using ANN
649    let top_k = options.top_k.unwrap_or(10);
650    let similar_docs = ann_index.search(query_embedding, top_k);
651    
652    let mut results = Vec::new();
653    
654    // Check if we're searching a specific file vs. a directory
655    let filter_by_file = options.path.is_file();
656    let target_file = if filter_by_file {
657        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
658    } else {
659        None
660    };
661    
662    for (doc_id, similarity) in similar_docs {
663        // Apply threshold filtering
664        if let Some(threshold) = options.threshold {
665            if similarity < threshold {
666                continue;
667            }
668        }
669        
670        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
671            // Filter by target file if specified
672            if let Some(target) = &target_file {
673                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
674                if canonical_result != *target {
675                    continue; // Skip this result if it doesn't match the target file
676                }
677            }
678            
679            // If full_section is enabled and this is a code section, return the full content
680            let preview = if options.full_section {
681                content.clone()
682            } else {
683                content.lines().take(3).collect::<Vec<_>>().join("\n")
684            };
685            
686            results.push(SearchResult {
687                file: file_path.clone(),
688                span: Span {
689                    byte_start: 0,
690                    byte_end: content.len(),
691                    line_start: 1,
692                    line_end: content.lines().count(),
693                },
694                score: similarity,
695                preview,
696                lang: detect_language(file_path),
697                symbol: None,
698            });
699        }
700    }
701    
702    Ok(results)
703}
704
705#[allow(dead_code)]
706async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
707    hybrid_search_with_progress(options, None).await
708}
709
710async fn hybrid_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
711    if let Some(ref callback) = progress_callback {
712        callback("Running regex search...");
713    }
714    let regex_results = regex_search(options)?;
715    
716    if let Some(ref callback) = progress_callback {
717        callback("Running semantic search...");
718    }
719    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
720    
721    let mut combined = HashMap::new();
722    
723    for (rank, result) in regex_results.iter().enumerate() {
724        let key = format!("{}:{}", result.file.display(), result.span.line_start);
725        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
726    }
727    
728    for (rank, result) in semantic_results.iter().enumerate() {
729        let key = format!("{}:{}", result.file.display(), result.span.line_start);
730        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
731    }
732    
733    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
734    let mut rrf_results: Vec<SearchResult> = combined
735        .into_iter()
736        .map(|(_, ranks)| {
737            let mut result = ranks[0].1.clone();
738            let rrf_score = ranks.iter().map(|(rank, _)| 1.0 / (60.0 + *rank as f32)).sum();
739            result.score = rrf_score;
740            result
741        })
742        .filter(|result| {
743            // Apply threshold filtering to raw RRF scores
744            if let Some(threshold) = options.threshold {
745                result.score >= threshold
746            } else {
747                true
748            }
749        })
750        .collect();
751    
752    // Sort by RRF score (highest first)
753    rrf_results.sort_by(|a, b| {
754        b.score
755            .partial_cmp(&a.score)
756            .unwrap_or(std::cmp::Ordering::Equal)
757    });
758    
759    if let Some(top_k) = options.top_k {
760        rrf_results.truncate(top_k);
761    }
762    
763    Ok(rrf_results)
764}
765
766fn build_globset(patterns: &[String]) -> GlobSet {
767    let mut builder = GlobSetBuilder::new();
768    for pat in patterns {
769        // Treat patterns as filename or directory globs
770        if let Ok(glob) = Glob::new(pat) {
771            builder.add(glob);
772        }
773    }
774    builder.build().unwrap_or_else(|_| GlobSet::empty())
775}
776
777fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
778    let globset = build_globset(exclude_patterns);
779    // Match against each path component and the full path
780    if globset.is_match(path) {
781        return true;
782    }
783    for component in path.components() {
784        if let std::path::Component::Normal(name) = component {
785            if globset.is_match(name) {
786                return true;
787            }
788        }
789    }
790    false
791}
792
793fn collect_files(path: &Path, recursive: bool, exclude_patterns: &[String]) -> Result<Vec<PathBuf>> {
794    let mut files = Vec::new();
795    let globset = build_globset(exclude_patterns);
796    
797    if path.is_file() {
798        // Always add single files, even if they're excluded (user explicitly requested)
799        files.push(path.to_path_buf());
800    } else if recursive {
801        for entry in WalkDir::new(path)
802            .into_iter()
803            .filter_entry(|e| {
804                // Skip excluded directories entirely for efficiency
805                let name = e.file_name();
806                !globset.is_match(e.path()) && !globset.is_match(name)
807            }) {
808            match entry {
809                Ok(entry) => {
810                    if entry.file_type().is_file() && !should_exclude_path(entry.path(), exclude_patterns) {
811                        files.push(entry.path().to_path_buf());
812                    }
813                }
814                Err(e) => {
815                    // Log directory traversal errors but continue processing
816                    tracing::debug!("Skipping path due to error: {}", e);
817                    continue;
818                }
819            }
820        }
821    } else {
822        match fs::read_dir(path) {
823            Ok(read_dir) => {
824                for entry in read_dir {
825                    match entry {
826                        Ok(entry) => {
827                            let path = entry.path();
828                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
829                                files.push(path);
830                            }
831                        }
832                        Err(e) => {
833                            tracing::debug!("Skipping directory entry due to error: {}", e);
834                            continue;
835                        }
836                    }
837                }
838            }
839            Err(e) => {
840                tracing::debug!("Cannot read directory {:?}: {}", path, e);
841                return Err(e.into());
842            }
843        }
844    }
845    
846    Ok(files)
847}
848
/// Derive a language name from the extension of `path`.
///
/// Known extensions are mapped to a canonical lowercase language name
/// (for example `rs` -> `"rust"`, `cc` -> `"cpp"`). `tsx` is reported as
/// `"typescript"`, matching how `extract_code_sections` in this file treats
/// that extension. Any other extension is returned unchanged.
///
/// Returns `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn detect_language(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?;

    let language = match ext {
        "rs" => "rust",
        "py" => "python",
        "js" => "javascript",
        "ts" | "tsx" => "typescript",
        "go" => "go",
        "java" => "java",
        "c" => "c",
        // Headers are reported as C++ together with the C++ source extensions.
        "cpp" | "cc" | "cxx" | "h" | "hpp" => "cpp",
        "cs" => "csharp",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" => "kotlin",
        // Unknown extensions are passed through verbatim.
        other => other,
    };

    Some(language.to_owned())
}
871
872async fn ensure_index_updated(path: &Path, force_reindex: bool, need_embeddings: bool) -> Result<()> {
873    
874    // Handle both files and directories and reuse nearest existing .ck index up the tree
875    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
876        if path.is_file() {
877            path.parent().unwrap_or(path).to_path_buf()
878        } else {
879            path.to_path_buf()
880        }
881    });
882    let index_root = &index_root_buf;
883    
884    // If force reindex is requested, always update
885    if force_reindex {
886        let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings).await?;
887        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
888            tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
889                          stats.files_indexed, stats.orphaned_files_removed);
890        }
891        return Ok(());
892    }
893    
894    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
895    let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings).await?;
896    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
897        tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
898                      stats.files_indexed, stats.orphaned_files_removed);
899    }
900    
901    Ok(())
902}
903
904fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
905    let before = options.before_context_lines.max(options.context_lines);
906    let after = options.after_context_lines.max(options.context_lines);
907    
908    if before > 0 || after > 0 {
909        let start_idx = line_idx.saturating_sub(before);
910        let end_idx = (line_idx + after + 1).min(lines.len());
911        lines[start_idx..end_idx].join("\n")
912    } else {
913        lines[line_idx].to_string()
914    }
915}
916
917fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
918    // Detect language for tree-sitter parsing
919    let lang = match file_path.extension().and_then(|s| s.to_str()) {
920        Some("py") => Some("python"),
921        Some("js") => Some("javascript"),
922        Some("ts") | Some("tsx") => Some("typescript"),
923        _ => return None,
924    };
925    
926    // Parse the file with tree-sitter and extract function/class sections
927    if let Ok(chunks) = ck_chunk::chunk_text(content, lang) {
928        let sections: Vec<(usize, usize, String)> = chunks
929            .into_iter()
930            .filter(|chunk| matches!(
931                chunk.chunk_type,
932                ck_chunk::ChunkType::Function | 
933                ck_chunk::ChunkType::Class | 
934                ck_chunk::ChunkType::Method
935            ))
936            .map(|chunk| {
937                (
938                    chunk.span.line_start - 1,  // Convert to 0-based index
939                    chunk.span.line_end - 1,
940                    chunk.text,
941                )
942            })
943            .collect();
944        
945        if sections.is_empty() {
946            None
947        } else {
948            Some(sections)
949        }
950    } else {
951        None
952    }
953}
954
/// Return the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are tested in slice order, so with overlapping ranges the
/// earliest entry wins.
fn find_containing_section(sections: &[(usize, usize, String)], line_idx: usize) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
963
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Name and content of every fixture file written by `create_test_files`.
    const FIXTURES: [(&str, &str); 4] = [
        ("test1.txt", "hello world rust programming"),
        ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
        ("test3.py", "print('Hello Python')"),
        ("test4.txt", "machine learning artificial intelligence"),
    ];

    /// Write the fixture files into `dir` and return their paths in fixture order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        FIXTURES
            .iter()
            .map(|&(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Recursive regex-mode options for `query` rooted at `root`; every other
    /// field keeps its default so tests can override just what they need.
    fn recursive_regex_options(root: &std::path::Path, query: &str) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: root.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_detect_language() {
        let cases = [
            ("test.rs", Some("rust")),
            ("test.py", Some("python")),
            ("test.js", Some("javascript")),
            ("test.unknown", Some("unknown")),
            ("noext", None),
        ];

        for (file_name, expected) in cases.iter() {
            assert_eq!(
                detect_language(&PathBuf::from(file_name)),
                expected.map(String::from)
            );
        }
    }

    #[test]
    fn test_collect_files() {
        let workspace = TempDir::new().unwrap();
        let fixture_paths = create_test_files(workspace.path());

        // Non-recursive listing sees all four fixtures.
        let flat = collect_files(workspace.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Recursive walk sees the same four files (there are no subdirectories).
        let walked = collect_files(workspace.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file path is returned as-is.
        let single = collect_files(&fixture_paths[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], fixture_paths[0]);
    }

    #[test]
    fn test_regex_search() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = recursive_regex_options(workspace.path(), "rust");

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        // Should find matches in files containing "rust"
        let has_rust_preview = results
            .iter()
            .any(|r| r.preview.to_lowercase().contains("rust"));
        assert!(has_rust_preview);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..recursive_regex_options(workspace.path(), "HELLO")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            fixed_string: true,
            ..recursive_regex_options(workspace.path(), "fn main()")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let workspace = TempDir::new().unwrap();
        fs::write(workspace.path().join("word_test.txt"), "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..recursive_regex_options(workspace.path(), "rust")
        };

        let results = regex_search(&options).unwrap();
        // Should only match "rust" as a whole word, not "rusty" or "rustacean";
        // this test only checks that at least one result is produced.
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let workspace = TempDir::new().unwrap();

        // Ten files that all match, so the limit has something to cut.
        (0..10).for_each(|i| {
            fs::write(workspace.path().join(format!("file{}.txt", i)), "test content").unwrap();
        });

        let options = SearchOptions {
            top_k: Some(5),
            ..recursive_regex_options(workspace.path(), "test")
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_search_file() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(&target, "line 1: hello\nline 2: world\nline 3: rust programming").unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(results.len(), 1);

        let hit = &results[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(&target, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(results.len(), 1);

        let preview = &results[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one line of context either side gives lines 2-4.
        assert!(preview.contains("line 2"));
        assert!(preview.contains("target line"));
        assert!(preview.contains("line 4"));
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..recursive_regex_options(workspace.path(), "hello")
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}