ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::fs;
7use std::path::{Path, PathBuf};
8use walkdir::WalkDir;
9use rayon::prelude::*;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{Schema, STORED, TEXT, Value};
13use tantivy::{doc, Index, ReloadPolicy, TantivyDocument};
14use ck_ann::AnnIndex;
15use std::path::PathBuf as StdPathBuf;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
/// Boxed, thread-safe (`Send + Sync`) callback that receives a status message as `&str`.
///
/// The search entry points in this file accept an optional callback of this type and
/// invoke it with messages such as "Running regex search...".
20pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = fs::read_to_string(file_path)?;
25    let lines: Vec<&str> = content.lines().collect();
26    
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30    
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33    
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Finds the closest directory, starting at `path` and moving towards the filesystem
/// root, that contains a `.ck` entry.
///
/// When `path` is a file the walk starts at its parent directory. Returns `None` when
/// no directory on the way up contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors()` yields `start` itself first, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
53
54pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
55    search_with_progress(options, None).await
56}
57
58pub async fn search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
59    // Validate that the search path exists
60    if !options.path.exists() {
61        return Err(ck_core::CkError::Search(format!("Path does not exist: {}", options.path.display())).into());
62    }
63    
64    // Auto-update index if needed (unless it's regex-only mode)
65    if !matches!(options.mode, SearchMode::Regex) {
66        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
67        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
68    }
69    
70    match options.mode {
71        SearchMode::Regex => regex_search(options),
72        SearchMode::Lexical => lexical_search(options).await,
73        SearchMode::Semantic => {
74            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
75            semantic_search_v3_with_progress(options, progress_callback).await
76        },
77        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
78    }
79}
80
81fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
82    let pattern = if options.fixed_string {
83        regex::escape(&options.query)
84    } else if options.whole_word {
85        format!(r"\b{}\b", regex::escape(&options.query))
86    } else {
87        options.query.clone()
88    };
89    
90    let regex = RegexBuilder::new(&pattern)
91        .case_insensitive(options.case_insensitive)
92        .build()
93        .map_err(|e| CkError::Regex(e))?;
94    
95    let files = collect_files(&options.path, options.recursive, &options.exclude_patterns)?;
96    
97    let results: Vec<Vec<SearchResult>> = files
98        .par_iter()
99        .filter_map(|file_path| {
100            match search_file(&regex, file_path, options) {
101                Ok(matches) => {
102                    if matches.is_empty() {
103                        None
104                    } else {
105                        Some(matches)
106                    }
107                }
108                Err(e) => {
109                    tracing::debug!("Error searching {:?}: {}", file_path, e);
110                    None
111                }
112            }
113        })
114        .collect();
115    
116    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
117    // Deterministic ordering: file path, then line number
118    all_results.sort_by(|a, b| {
119        let path_cmp = a.file.cmp(&b.file);
120        if path_cmp != std::cmp::Ordering::Equal {
121            return path_cmp;
122        }
123        a.span.line_start.cmp(&b.span.line_start)
124    });
125    
126    if let Some(top_k) = options.top_k {
127        all_results.truncate(top_k);
128    }
129    
130    Ok(all_results)
131}
132
133fn search_file(regex: &Regex, file_path: &Path, options: &SearchOptions) -> Result<Vec<SearchResult>> {
134    let content = fs::read_to_string(file_path)?;
135    let lines: Vec<&str> = content.lines().collect();
136    let mut results = Vec::new();
137    
138    // If full_section is enabled, try to parse the file and find code sections
139    let code_sections = if options.full_section {
140        extract_code_sections(file_path, &content)
141    } else {
142        None
143    };
144    
145    for (line_idx, line) in lines.iter().enumerate() {
146        let line_number = line_idx + 1;
147        
148        if regex.is_match(line) {
149            let preview = if options.full_section {
150                // Try to find the containing code section
151                if let Some(ref sections) = code_sections {
152                    if let Some(section) = find_containing_section(sections, line_idx) {
153                        section.clone()
154                    } else {
155                        // Fall back to context lines if no section found
156                        get_context_preview(&lines, line_idx, options)
157                    }
158                } else {
159                    get_context_preview(&lines, line_idx, options)
160                }
161            } else {
162                get_context_preview(&lines, line_idx, options)
163            };
164            
165            results.push(SearchResult {
166                file: file_path.to_path_buf(),
167                span: Span {
168                    byte_start: 0,
169                    byte_end: line.len(),
170                    line_start: line_number,
171                    line_end: line_number,
172                },
173                score: 1.0,
174                preview,
175                lang: detect_language(file_path),
176                symbol: None,
177            });
178        }
179    }
180    
181    Ok(results)
182}
183
184async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
185    // Handle both files and directories and reuse nearest existing .ck index up the tree
186    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
187        if options.path.is_file() {
188            options.path.parent().unwrap_or(&options.path).to_path_buf()
189        } else {
190            options.path.clone()
191        }
192    });
193    
194    let index_dir = index_root.join(".ck");
195    if !index_dir.exists() {
196        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
197    }
198    
199    let tantivy_index_path = index_dir.join("tantivy_index");
200    
201    if !tantivy_index_path.exists() {
202        return build_tantivy_index(options).await;
203    }
204    
205    let mut schema_builder = Schema::builder();
206    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
207    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
208    let _schema = schema_builder.build();
209    
210    let index = Index::open_in_dir(&tantivy_index_path)
211        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
212    
213    let reader = index
214        .reader_builder()
215        .reload_policy(ReloadPolicy::OnCommitWithDelay)
216        .try_into()
217        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
218    
219    let searcher = reader.searcher();
220    let query_parser = QueryParser::for_index(&index, vec![content_field]);
221    
222    let query = query_parser
223        .parse_query(&options.query)
224        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
225    
226    let top_docs = if let Some(top_k) = options.top_k {
227        searcher.search(&query, &TopDocs::with_limit(top_k))?
228    } else {
229        searcher.search(&query, &TopDocs::with_limit(100))?
230    };
231    
232    // First, collect all results with raw scores
233    let mut raw_results = Vec::new();
234    for (_score, doc_address) in top_docs {
235        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
236        let path_text = retrieved_doc
237            .get_first(path_field)
238            .map(|field_value| field_value.as_str().unwrap_or(""))
239            .unwrap_or("");
240        let content_text = retrieved_doc
241            .get_first(content_field)
242            .map(|field_value| field_value.as_str().unwrap_or(""))
243            .unwrap_or("");
244        
245        let file_path = PathBuf::from(path_text);
246        let preview = if options.full_section {
247            content_text.to_string()
248        } else {
249            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
250        };
251        
252        raw_results.push((_score, SearchResult {
253            file: file_path,
254            span: Span {
255                byte_start: 0,
256                byte_end: content_text.len(),
257                line_start: 1,
258                line_end: content_text.lines().count(),
259            },
260            score: _score,
261            preview,
262            lang: detect_language(&PathBuf::from(path_text)),
263            symbol: None,
264        }));
265    }
266    
267    // Normalize scores to 0-1 range and apply threshold
268    let mut results = Vec::new();
269    if !raw_results.is_empty() {
270        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
271        if max_score > 0.0 {
272            for (raw_score, mut result) in raw_results {
273                let normalized_score = raw_score / max_score;
274                
275                // Apply threshold filtering with normalized score
276                if let Some(threshold) = options.threshold {
277                    if normalized_score < threshold {
278                        continue;
279                    }
280                }
281                
282                result.score = normalized_score;
283                results.push(result);
284            }
285        }
286    }
287    
288    Ok(results)
289}
290
291async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
292    // Handle both files and directories by finding the appropriate directory for indexing
293    let index_root = if options.path.is_file() {
294        options.path.parent().unwrap_or(&options.path)
295    } else {
296        &options.path
297    };
298    
299    let index_dir = index_root.join(".ck");
300    let tantivy_index_path = index_dir.join("tantivy_index");
301    
302    fs::create_dir_all(&tantivy_index_path)?;
303    
304    let mut schema_builder = Schema::builder();
305    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
306    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
307    let schema = schema_builder.build();
308    
309    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
310        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
311    
312    let mut index_writer = index.writer(50_000_000)
313        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
314    
315    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
316    
317    for file_path in &files {
318        if let Ok(content) = fs::read_to_string(file_path) {
319            let doc = doc!(
320                content_field => content,
321                path_field => file_path.display().to_string()
322            );
323            index_writer.add_document(doc)?;
324        }
325    }
326    
327    index_writer.commit()
328        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
329    
330    // After building, search again with the same options  
331    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
332    let mut schema_builder = Schema::builder();
333    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
334    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
335    let _schema = schema_builder.build();
336    
337    let index = Index::open_in_dir(&tantivy_index_path)
338        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
339    
340    let reader = index
341        .reader_builder()
342        .reload_policy(ReloadPolicy::OnCommitWithDelay)
343        .try_into()
344        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
345    
346    let searcher = reader.searcher();
347    let query_parser = QueryParser::for_index(&index, vec![content_field]);
348    
349    let query = query_parser
350        .parse_query(&options.query)
351        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
352    
353    let top_docs = if let Some(top_k) = options.top_k {
354        searcher.search(&query, &TopDocs::with_limit(top_k))?
355    } else {
356        searcher.search(&query, &TopDocs::with_limit(100))?
357    };
358    
359    // First, collect all results with raw scores
360    let mut raw_results = Vec::new();
361    for (_score, doc_address) in top_docs {
362        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
363        let path_text = retrieved_doc
364            .get_first(path_field)
365            .map(|field_value| field_value.as_str().unwrap_or(""))
366            .unwrap_or("");
367        let content_text = retrieved_doc
368            .get_first(content_field)
369            .map(|field_value| field_value.as_str().unwrap_or(""))
370            .unwrap_or("");
371        
372        let file_path = PathBuf::from(path_text);
373        let preview = if options.full_section {
374            content_text.to_string()
375        } else {
376            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
377        };
378        
379        raw_results.push((_score, SearchResult {
380            file: file_path,
381            span: Span {
382                byte_start: 0,
383                byte_end: content_text.len(),
384                line_start: 1,
385                line_end: content_text.lines().count(),
386            },
387            score: _score,
388            preview,
389            lang: detect_language(&PathBuf::from(path_text)),
390            symbol: None,
391        }));
392    }
393    
394    // Normalize scores to 0-1 range and apply threshold
395    let mut results = Vec::new();
396    if !raw_results.is_empty() {
397        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
398        if max_score > 0.0 {
399            for (raw_score, mut result) in raw_results {
400                let normalized_score = raw_score / max_score;
401                
402                // Apply threshold filtering with normalized score
403                if let Some(threshold) = options.threshold {
404                    if normalized_score < threshold {
405                        continue;
406                    }
407                }
408                
409                result.score = normalized_score;
410                results.push(result);
411            }
412        }
413    }
414    
415    Ok(results)
416}
417
418#[allow(dead_code)]
419async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
420    semantic_search_with_progress(options, None).await
421}
422
423async fn semantic_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
424    // Handle both files and directories and reuse nearest existing .ck index up the tree
425    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
426        if options.path.is_file() {
427            options.path.parent().unwrap_or(&options.path).to_path_buf()
428        } else {
429            options.path.clone()
430        }
431    });
432    
433    let index_dir = index_root.join(".ck");
434    if !index_dir.exists() {
435        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
436    }
437    
438    let ann_index_path = index_dir.join("ann_index.bin");
439    let embeddings_path = index_dir.join("embeddings.json");
440    
441    if !ann_index_path.exists() || !embeddings_path.exists() {
442        return build_semantic_index_with_progress(options, progress_callback).await;
443    }
444    
445    // Load the ANN index
446    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
447    
448    // Load file metadata
449    let embeddings_data = fs::read_to_string(&embeddings_path)?;
450    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
451    
452    // Create embedder and embed the query
453    if let Some(ref callback) = progress_callback {
454        callback("Loading embedding model...");
455    }
456    
457    let mut embedder = if let Some(ref callback) = progress_callback {
458        let _cb = callback.as_ref();
459        let model_cb = Box::new(|msg: &str| {
460            // Note: We can't directly use the callback here due to lifetime issues
461            // For now, we'll just use eprintln! until we can restructure this better
462            eprintln!("Model: {}", msg);
463        }) as ck_embed::ModelDownloadCallback;
464        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
465    } else {
466        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
467    };
468    let query_embeddings = embedder.embed(&[options.query.clone()])?;
469    
470    if query_embeddings.is_empty() {
471        return Ok(Vec::new());
472    }
473    
474    let query_embedding = &query_embeddings[0];
475    
476    // Search using ANN
477    let top_k = options.top_k.unwrap_or(10);
478    let similar_docs = ann_index.search(query_embedding, top_k);
479    
480    let mut results = Vec::new();
481    
482    // Check if we're searching a specific file vs. a directory
483    let filter_by_file = options.path.is_file();
484    let target_file = if filter_by_file {
485        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
486    } else {
487        None
488    };
489    
490    for (doc_id, similarity) in similar_docs {
491        // Apply threshold filtering
492        if let Some(threshold) = options.threshold {
493            if similarity < threshold {
494                continue;
495            }
496        }
497        
498        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
499            // Filter by target file if specified
500            if let Some(target) = &target_file {
501                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
502                if canonical_result != *target {
503                    continue; // Skip this result if it doesn't match the target file
504                }
505            }
506            
507            // If full_section is enabled and this is a code section, return the full content
508            let preview = if options.full_section {
509                content.clone()
510            } else {
511                content.lines().take(3).collect::<Vec<_>>().join("\n")
512            };
513            
514            results.push(SearchResult {
515                file: file_path.clone(),
516                span: Span {
517                    byte_start: 0,
518                    byte_end: content.len(),
519                    line_start: 1,
520                    line_end: content.lines().count(),
521                },
522                score: similarity,
523                preview,
524                lang: detect_language(file_path),
525                symbol: None,
526            });
527        }
528    }
529    
530    Ok(results)
531}
532
533#[allow(dead_code)]
534async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
535    build_semantic_index_with_progress(options, None).await
536}
537
538async fn build_semantic_index_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
539    // Handle both files and directories by finding the appropriate directory for indexing
540    let index_root = if options.path.is_file() {
541        options.path.parent().unwrap_or(&options.path)
542    } else {
543        &options.path
544    };
545    
546    let index_dir = index_root.join(".ck");
547    let ann_index_path = index_dir.join("ann_index.bin");
548    let embeddings_path = index_dir.join("embeddings.json");
549    
550    fs::create_dir_all(&index_dir)?;
551    
552    if let Some(ref callback) = progress_callback {
553        callback("Building semantic index (no index found)...");
554    }
555    
556    // Always print this important message, even in quiet mode for indexing operations
557    eprintln!("Building semantic index (no existing index found)...");
558    
559    // Collect files and their content
560    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
561    
562    if let Some(ref callback) = progress_callback {
563        callback(&format!("Found {} files to index", files.len()));
564    }
565    eprintln!("Found {} files to embed and index", files.len());
566    
567    let mut file_embeddings = Vec::new();
568    let mut embeddings = Vec::new();
569    
570    // Create embedder with progress callback
571    if let Some(ref callback) = progress_callback {
572        callback("Loading embedding model...");
573    }
574    
575    let model_callback = if progress_callback.is_some() {
576        Some(Box::new(|msg: &str| {
577            eprintln!("Model: {}", msg);
578        }) as ck_embed::ModelDownloadCallback)
579    } else {
580        None
581    };
582    
583    let mut embedder = ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
584    
585    if let Some(ref callback) = progress_callback {
586        callback("Generating embeddings for code chunks...");
587    }
588    
589    for (file_idx, file_path) in files.iter().enumerate() {
590        if let Ok(content) = fs::read_to_string(file_path) {
591            if let Some(ref callback) = progress_callback {
592                let file_name = file_path.file_name()
593                    .map(|n| n.to_string_lossy().to_string())
594                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
595                callback(&format!("Processing {}/{}: {}", file_idx + 1, files.len(), file_name));
596            }
597            
598            // Chunk the content for better embeddings
599            let chunks = ck_chunk::chunk_text(&content, detect_language(file_path).as_deref())?;
600            
601            for chunk in chunks {
602                let chunk_embeddings = embedder.embed(&[chunk.text.clone()])?;
603                if !chunk_embeddings.is_empty() {
604                    embeddings.push(chunk_embeddings[0].clone());
605                    file_embeddings.push((file_path.clone(), chunk.text));
606                }
607            }
608        }
609    }
610    
611    if let Some(ref callback) = progress_callback {
612        callback(&format!("Built {} embeddings, creating search index...", embeddings.len()));
613    }
614    eprintln!("Generated {} embeddings, building search index...", embeddings.len());
615    
616    // Build ANN index
617    let index = ck_ann::SimpleIndex::build(&embeddings)?;
618    index.save(&ann_index_path)?;
619    
620    // Save file embeddings metadata
621    let embeddings_json = serde_json::to_string(&file_embeddings)?;
622    fs::write(&embeddings_path, embeddings_json)?;
623    
624    if let Some(ref callback) = progress_callback {
625        callback("Semantic index built successfully, running search...");
626    }
627    eprintln!("Semantic index built successfully!");
628    
629    // After building, search again - inline to avoid recursion
630    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
631    
632    // Load file metadata
633    let embeddings_data = fs::read_to_string(&embeddings_path)?;
634    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
635    
636    // Create embedder and embed the query
637    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
638    let query_embeddings = embedder.embed(&[options.query.clone()])?;
639    
640    if query_embeddings.is_empty() {
641        return Ok(Vec::new());
642    }
643    
644    let query_embedding = &query_embeddings[0];
645    
646    // Search using ANN
647    let top_k = options.top_k.unwrap_or(10);
648    let similar_docs = ann_index.search(query_embedding, top_k);
649    
650    let mut results = Vec::new();
651    
652    // Check if we're searching a specific file vs. a directory
653    let filter_by_file = options.path.is_file();
654    let target_file = if filter_by_file {
655        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
656    } else {
657        None
658    };
659    
660    for (doc_id, similarity) in similar_docs {
661        // Apply threshold filtering
662        if let Some(threshold) = options.threshold {
663            if similarity < threshold {
664                continue;
665            }
666        }
667        
668        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
669            // Filter by target file if specified
670            if let Some(target) = &target_file {
671                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
672                if canonical_result != *target {
673                    continue; // Skip this result if it doesn't match the target file
674                }
675            }
676            
677            // If full_section is enabled and this is a code section, return the full content
678            let preview = if options.full_section {
679                content.clone()
680            } else {
681                content.lines().take(3).collect::<Vec<_>>().join("\n")
682            };
683            
684            results.push(SearchResult {
685                file: file_path.clone(),
686                span: Span {
687                    byte_start: 0,
688                    byte_end: content.len(),
689                    line_start: 1,
690                    line_end: content.lines().count(),
691                },
692                score: similarity,
693                preview,
694                lang: detect_language(file_path),
695                symbol: None,
696            });
697        }
698    }
699    
700    Ok(results)
701}
702
703#[allow(dead_code)]
704async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
705    hybrid_search_with_progress(options, None).await
706}
707
708async fn hybrid_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
709    if let Some(ref callback) = progress_callback {
710        callback("Running regex search...");
711    }
712    let regex_results = regex_search(options)?;
713    
714    if let Some(ref callback) = progress_callback {
715        callback("Running semantic search...");
716    }
717    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
718    
719    let mut combined = HashMap::new();
720    
721    for (rank, result) in regex_results.iter().enumerate() {
722        let key = format!("{}:{}", result.file.display(), result.span.line_start);
723        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
724    }
725    
726    for (rank, result) in semantic_results.iter().enumerate() {
727        let key = format!("{}:{}", result.file.display(), result.span.line_start);
728        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
729    }
730    
731    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
732    let mut rrf_results: Vec<SearchResult> = combined
733        .into_iter()
734        .map(|(_, ranks)| {
735            let mut result = ranks[0].1.clone();
736            let rrf_score = ranks.iter().map(|(rank, _)| 1.0 / (60.0 + *rank as f32)).sum();
737            result.score = rrf_score;
738            result
739        })
740        .filter(|result| {
741            // Apply threshold filtering to raw RRF scores
742            if let Some(threshold) = options.threshold {
743                result.score >= threshold
744            } else {
745                true
746            }
747        })
748        .collect();
749    
750    // Sort by RRF score (highest first)
751    rrf_results.sort_by(|a, b| {
752        b.score
753            .partial_cmp(&a.score)
754            .unwrap_or(std::cmp::Ordering::Equal)
755    });
756    
757    if let Some(top_k) = options.top_k {
758        rrf_results.truncate(top_k);
759    }
760    
761    Ok(rrf_results)
762}
763
764fn build_globset(patterns: &[String]) -> GlobSet {
765    let mut builder = GlobSetBuilder::new();
766    for pat in patterns {
767        // Treat patterns as filename or directory globs
768        if let Ok(glob) = Glob::new(pat) {
769            builder.add(glob);
770        }
771    }
772    builder.build().unwrap_or_else(|_| GlobSet::empty())
773}
774
775fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
776    let globset = build_globset(exclude_patterns);
777    // Match against each path component and the full path
778    if globset.is_match(path) {
779        return true;
780    }
781    for component in path.components() {
782        if let std::path::Component::Normal(name) = component {
783            if globset.is_match(name) {
784                return true;
785            }
786        }
787    }
788    false
789}
790
791fn collect_files(path: &Path, recursive: bool, exclude_patterns: &[String]) -> Result<Vec<PathBuf>> {
792    let mut files = Vec::new();
793    let globset = build_globset(exclude_patterns);
794    
795    if path.is_file() {
796        // Always add single files, even if they're excluded (user explicitly requested)
797        files.push(path.to_path_buf());
798    } else if recursive {
799        for entry in WalkDir::new(path)
800            .into_iter()
801            .filter_entry(|e| {
802                // Skip excluded directories entirely for efficiency
803                let name = e.file_name();
804                !globset.is_match(e.path()) && !globset.is_match(name)
805            }) {
806            match entry {
807                Ok(entry) => {
808                    if entry.file_type().is_file() && !should_exclude_path(entry.path(), exclude_patterns) {
809                        files.push(entry.path().to_path_buf());
810                    }
811                }
812                Err(e) => {
813                    // Log directory traversal errors but continue processing
814                    tracing::debug!("Skipping path due to error: {}", e);
815                    continue;
816                }
817            }
818        }
819    } else {
820        match fs::read_dir(path) {
821            Ok(read_dir) => {
822                for entry in read_dir {
823                    match entry {
824                        Ok(entry) => {
825                            let path = entry.path();
826                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
827                                files.push(path);
828                            }
829                        }
830                        Err(e) => {
831                            tracing::debug!("Skipping directory entry due to error: {}", e);
832                            continue;
833                        }
834                    }
835                }
836            }
837            Err(e) => {
838                tracing::debug!("Cannot read directory {:?}: {}", path, e);
839                return Err(e.into());
840            }
841        }
842    }
843    
844    Ok(files)
845}
846
/// Maps a file's extension to a language name.
///
/// Known extensions are translated to a canonical name (for example `rs`
/// becomes `rust`); any other extension is returned unchanged. Returns `None`
/// when the path has no extension or the extension is not valid UTF-8.
fn detect_language(path: &Path) -> Option<String> {
    let extension = path.extension()?.to_str()?;

    let language = match extension {
        "rs" => "rust",
        "py" => "python",
        "js" => "javascript",
        "ts" => "typescript",
        "go" => "go",
        "java" => "java",
        "c" => "c",
        // C++ sources and headers share one language name.
        "cpp" | "cc" | "cxx" | "h" | "hpp" => "cpp",
        "cs" => "csharp",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" => "kotlin",
        // Unrecognised extensions are passed through as-is.
        other => other,
    };

    Some(language.to_string())
}
869
870async fn ensure_index_updated(path: &Path, force_reindex: bool, need_embeddings: bool) -> Result<()> {
871    
872    // Handle both files and directories and reuse nearest existing .ck index up the tree
873    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
874        if path.is_file() {
875            path.parent().unwrap_or(path).to_path_buf()
876        } else {
877            path.to_path_buf()
878        }
879    });
880    let index_root = &index_root_buf;
881    
882    // If force reindex is requested, always update
883    if force_reindex {
884        let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings).await?;
885        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
886            tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
887                          stats.files_indexed, stats.orphaned_files_removed);
888        }
889        return Ok(());
890    }
891    
892    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
893    let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings).await?;
894    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
895        tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
896                      stats.files_indexed, stats.orphaned_files_removed);
897    }
898    
899    Ok(())
900}
901
902fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
903    let before = options.before_context_lines.max(options.context_lines);
904    let after = options.after_context_lines.max(options.context_lines);
905    
906    if before > 0 || after > 0 {
907        let start_idx = line_idx.saturating_sub(before);
908        let end_idx = (line_idx + after + 1).min(lines.len());
909        lines[start_idx..end_idx].join("\n")
910    } else {
911        lines[line_idx].to_string()
912    }
913}
914
915fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
916    // Detect language for tree-sitter parsing
917    let lang = match file_path.extension().and_then(|s| s.to_str()) {
918        Some("py") => Some("python"),
919        Some("js") => Some("javascript"),
920        Some("ts") | Some("tsx") => Some("typescript"),
921        _ => return None,
922    };
923    
924    // Parse the file with tree-sitter and extract function/class sections
925    if let Ok(chunks) = ck_chunk::chunk_text(content, lang) {
926        let sections: Vec<(usize, usize, String)> = chunks
927            .into_iter()
928            .filter(|chunk| matches!(
929                chunk.chunk_type,
930                ck_chunk::ChunkType::Function | 
931                ck_chunk::ChunkType::Class | 
932                ck_chunk::ChunkType::Method
933            ))
934            .map(|chunk| {
935                (
936                    chunk.span.line_start - 1,  // Convert to 0-based index
937                    chunk.span.line_end - 1,
938                    chunk.text,
939                )
940            })
941            .collect();
942        
943        if sections.is_empty() {
944            None
945        } else {
946            Some(sections)
947        }
948    } else {
949        None
950    }
951}
952
/// Returns the text of the first section whose inclusive `(start, end)` range
/// contains `line_idx`, or `None` when no section covers that line.
fn find_containing_section(sections: &[(usize, usize, String)], line_idx: usize) -> Option<&String> {
    sections
        .iter()
        .find(|(first_line, last_line, _)| (*first_line..=*last_line).contains(&line_idx))
        .map(|(_, _, body)| body)
}
961
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes a fixed set of four sample files into `dir` and returns their
    /// paths in the order they were created.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .iter()
            .map(|&(name, content)| {
                let target = dir.join(name);
                fs::write(&target, content).unwrap();
                target
            })
            .collect()
    }

    /// Known extensions map to language names, unknown ones pass through,
    /// and a path without an extension yields `None`.
    #[test]
    fn test_detect_language() {
        let cases = [
            ("test.rs", Some("rust")),
            ("test.py", Some("python")),
            ("test.js", Some("javascript")),
            ("test.unknown", Some("unknown")),
            ("noext", None),
        ];

        for &(file_name, expected) in cases.iter() {
            assert_eq!(
                detect_language(&PathBuf::from(file_name)),
                expected.map(String::from)
            );
        }
    }

    /// File collection works for a flat listing, a recursive walk and a
    /// single explicitly named file.
    #[test]
    fn test_collect_files() {
        let workspace = TempDir::new().unwrap();
        let created = create_test_files(workspace.path());

        // Flat (non-recursive) listing
        let flat = collect_files(workspace.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Recursive walk
        let walked = collect_files(workspace.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file passed directly
        let single = collect_files(&created[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], created[0]);
    }

    /// A plain regex query finds at least one preview containing the term.
    #[test]
    fn test_regex_search() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "rust".to_string(),
            path: workspace.path().to_path_buf(),
            recursive: true,
            ..Default::default()
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());

        // At least one hit must come from a line mentioning "rust"
        let has_rust_preview = hits
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust"));
        assert!(has_rust_preview);
    }

    /// An upper-case query still matches when case-insensitivity is enabled.
    #[test]
    fn test_regex_search_case_insensitive() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "HELLO".to_string(),
            path: workspace.path().to_path_buf(),
            recursive: true,
            case_insensitive: true,
            ..Default::default()
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    /// A query containing regex metacharacters matches with `fixed_string` set.
    #[test]
    fn test_regex_search_fixed_string() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "fn main()".to_string(),
            path: workspace.path().to_path_buf(),
            recursive: true,
            fixed_string: true,
            ..Default::default()
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    /// Whole-word mode still produces a result for a line containing the word.
    #[test]
    fn test_regex_search_whole_word() {
        let workspace = TempDir::new().unwrap();
        let sample = workspace.path().join("word_test.txt");
        fs::write(sample, "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "rust".to_string(),
            path: workspace.path().to_path_buf(),
            recursive: true,
            whole_word: true,
            ..Default::default()
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
        // Should only match "rust" as a whole word, not "rusty" or "rustacean"
    }

    /// `top_k` caps the number of returned results.
    #[test]
    fn test_regex_search_top_k() {
        let workspace = TempDir::new().unwrap();

        // Ten files that all contain a match
        (0..10).for_each(|i| {
            let file = workspace.path().join(format!("file{}.txt", i));
            fs::write(file, "test content").unwrap();
        });

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: workspace.path().to_path_buf(),
            recursive: true,
            top_k: Some(5),
            ..Default::default()
        };

        let hits = regex_search(&options).unwrap();
        assert!(hits.len() <= 5);
    }

    /// Searching one file reports the 1-based line of the single match.
    #[test]
    fn test_search_file() {
        let workspace = TempDir::new().unwrap();
        let file_path = workspace.path().join("test.txt");
        fs::write(&file_path, "line 1: hello\nline 2: world\nline 3: rust programming").unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let hits = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(hits.len(), 1);

        let hit = &hits[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    /// With one line of context, the preview spans the neighbouring lines.
    #[test]
    fn test_search_file_with_context() {
        let workspace = TempDir::new().unwrap();
        let file_path = workspace.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let hits = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(hits.len(), 1);

        let preview = &hits[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one context line on each side gives lines 2-4
        for expected in ["line 2", "target line", "line 4"].iter() {
            assert!(preview.contains(expected));
        }
    }

    /// The public async `search` entry point works end to end in regex mode.
    #[tokio::test]
    async fn test_search_main_function() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "hello".to_string(),
            path: workspace.path().to_path_buf(),
            recursive: true,
            case_insensitive: true,
            ..Default::default()
        };

        let hits = search(&options).await.unwrap();
        assert!(!hits.is_empty());
    }
}