ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::fs;
7use std::path::{Path, PathBuf};
8use walkdir::WalkDir;
9use rayon::prelude::*;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{Schema, STORED, TEXT, Value};
13use tantivy::{doc, Index, ReloadPolicy, TantivyDocument};
14use ck_ann::AnnIndex;
15use std::path::PathBuf as StdPathBuf;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
20pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = fs::read_to_string(file_path)?;
25    let lines: Vec<&str> = content.lines().collect();
26    
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30    
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33    
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Find the closest directory at or above `path` that contains a `.ck` entry.
///
/// When `path` is an existing file the walk starts at its parent directory;
/// otherwise it starts at `path` itself. Returns `None` if no ancestor
/// contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start`, then its parent, and so on up to the root.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
53
54pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
55    search_with_progress(options, None).await
56}
57
58pub async fn search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
59    // Validate that the search path exists
60    if !options.path.exists() {
61        return Err(ck_core::CkError::Search(format!("Path does not exist: {}", options.path.display())).into());
62    }
63    
64    // Auto-update index if needed (unless it's regex-only mode)
65    if !matches!(options.mode, SearchMode::Regex) {
66        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
67        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
68    }
69    
70    match options.mode {
71        SearchMode::Regex => regex_search(options),
72        SearchMode::Lexical => lexical_search(options).await,
73        SearchMode::Semantic => {
74            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
75            semantic_search_v3_with_progress(options, progress_callback).await
76        },
77        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
78    }
79}
80
81fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
82    let pattern = if options.fixed_string {
83        regex::escape(&options.query)
84    } else if options.whole_word {
85        format!(r"\b{}\b", regex::escape(&options.query))
86    } else {
87        options.query.clone()
88    };
89    
90    let regex = RegexBuilder::new(&pattern)
91        .case_insensitive(options.case_insensitive)
92        .build()
93        .map_err(|e| CkError::Regex(e))?;
94    
95    // Default to recursive for directories (like grep) to maintain compatibility
96    let should_recurse = options.path.is_dir() || options.recursive;
97    let files = if should_recurse {
98        // Use ck_index's collect_files which respects gitignore
99        ck_index::collect_files(&options.path, options.respect_gitignore)
100    } else {
101        // For non-recursive, use the local collect_files
102        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
103    };
104    
105    let results: Vec<Vec<SearchResult>> = files
106        .par_iter()
107        .filter_map(|file_path| {
108            match search_file(&regex, file_path, options) {
109                Ok(matches) => {
110                    if matches.is_empty() {
111                        None
112                    } else {
113                        Some(matches)
114                    }
115                }
116                Err(e) => {
117                    tracing::debug!("Error searching {:?}: {}", file_path, e);
118                    None
119                }
120            }
121        })
122        .collect();
123    
124    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
125    // Deterministic ordering: file path, then line number
126    all_results.sort_by(|a, b| {
127        let path_cmp = a.file.cmp(&b.file);
128        if path_cmp != std::cmp::Ordering::Equal {
129            return path_cmp;
130        }
131        a.span.line_start.cmp(&b.span.line_start)
132    });
133    
134    if let Some(top_k) = options.top_k {
135        all_results.truncate(top_k);
136    }
137    
138    Ok(all_results)
139}
140
141fn search_file(regex: &Regex, file_path: &Path, options: &SearchOptions) -> Result<Vec<SearchResult>> {
142    let content = fs::read_to_string(file_path)?;
143    let lines: Vec<&str> = content.lines().collect();
144    let mut results = Vec::new();
145    
146    // If full_section is enabled, try to parse the file and find code sections
147    let code_sections = if options.full_section {
148        extract_code_sections(file_path, &content)
149    } else {
150        None
151    };
152    
153    // Track byte offset as we iterate through lines
154    let mut byte_offset = 0;
155    
156    for (line_idx, line) in lines.iter().enumerate() {
157        let line_number = line_idx + 1;
158        
159        // Find all matches in the line with their positions
160        for mat in regex.find_iter(line) {
161            let preview = if options.full_section {
162                // Try to find the containing code section
163                if let Some(ref sections) = code_sections {
164                    if let Some(section) = find_containing_section(sections, line_idx) {
165                        section.clone()
166                    } else {
167                        // Fall back to context lines if no section found
168                        get_context_preview(&lines, line_idx, options)
169                    }
170                } else {
171                    get_context_preview(&lines, line_idx, options)
172                }
173            } else {
174                get_context_preview(&lines, line_idx, options)
175            };
176            
177            results.push(SearchResult {
178                file: file_path.to_path_buf(),
179                span: Span {
180                    byte_start: byte_offset + mat.start(),
181                    byte_end: byte_offset + mat.end(),
182                    line_start: line_number,
183                    line_end: line_number,
184                },
185                score: 1.0,
186                preview,
187                lang: detect_language(file_path),
188                symbol: None,
189            });
190        }
191        
192        // Update byte offset for next line (add line length + newline character)
193        byte_offset += line.len();
194        if line_idx < lines.len() - 1 {
195            byte_offset += 1; // Add 1 for the newline character
196        }
197    }
198    
199    Ok(results)
200}
201
202async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
203    // Handle both files and directories and reuse nearest existing .ck index up the tree
204    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
205        if options.path.is_file() {
206            options.path.parent().unwrap_or(&options.path).to_path_buf()
207        } else {
208            options.path.clone()
209        }
210    });
211    
212    let index_dir = index_root.join(".ck");
213    if !index_dir.exists() {
214        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
215    }
216    
217    let tantivy_index_path = index_dir.join("tantivy_index");
218    
219    if !tantivy_index_path.exists() {
220        return build_tantivy_index(options).await;
221    }
222    
223    let mut schema_builder = Schema::builder();
224    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
225    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
226    let _schema = schema_builder.build();
227    
228    let index = Index::open_in_dir(&tantivy_index_path)
229        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
230    
231    let reader = index
232        .reader_builder()
233        .reload_policy(ReloadPolicy::OnCommitWithDelay)
234        .try_into()
235        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
236    
237    let searcher = reader.searcher();
238    let query_parser = QueryParser::for_index(&index, vec![content_field]);
239    
240    let query = query_parser
241        .parse_query(&options.query)
242        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
243    
244    let top_docs = if let Some(top_k) = options.top_k {
245        searcher.search(&query, &TopDocs::with_limit(top_k))?
246    } else {
247        searcher.search(&query, &TopDocs::with_limit(100))?
248    };
249    
250    // First, collect all results with raw scores
251    let mut raw_results = Vec::new();
252    for (_score, doc_address) in top_docs {
253        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
254        let path_text = retrieved_doc
255            .get_first(path_field)
256            .map(|field_value| field_value.as_str().unwrap_or(""))
257            .unwrap_or("");
258        let content_text = retrieved_doc
259            .get_first(content_field)
260            .map(|field_value| field_value.as_str().unwrap_or(""))
261            .unwrap_or("");
262        
263        let file_path = PathBuf::from(path_text);
264        let preview = if options.full_section {
265            content_text.to_string()
266        } else {
267            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
268        };
269        
270        raw_results.push((_score, SearchResult {
271            file: file_path,
272            span: Span {
273                byte_start: 0,
274                byte_end: content_text.len(),
275                line_start: 1,
276                line_end: content_text.lines().count(),
277            },
278            score: _score,
279            preview,
280            lang: detect_language(&PathBuf::from(path_text)),
281            symbol: None,
282        }));
283    }
284    
285    // Normalize scores to 0-1 range and apply threshold
286    let mut results = Vec::new();
287    if !raw_results.is_empty() {
288        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
289        if max_score > 0.0 {
290            for (raw_score, mut result) in raw_results {
291                let normalized_score = raw_score / max_score;
292                
293                // Apply threshold filtering with normalized score
294                if let Some(threshold) = options.threshold {
295                    if normalized_score < threshold {
296                        continue;
297                    }
298                }
299                
300                result.score = normalized_score;
301                results.push(result);
302            }
303        }
304    }
305    
306    Ok(results)
307}
308
309async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
310    // Handle both files and directories by finding the appropriate directory for indexing
311    let index_root = if options.path.is_file() {
312        options.path.parent().unwrap_or(&options.path)
313    } else {
314        &options.path
315    };
316    
317    let index_dir = index_root.join(".ck");
318    let tantivy_index_path = index_dir.join("tantivy_index");
319    
320    fs::create_dir_all(&tantivy_index_path)?;
321    
322    let mut schema_builder = Schema::builder();
323    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
324    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
325    let schema = schema_builder.build();
326    
327    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
328        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
329    
330    let mut index_writer = index.writer(50_000_000)
331        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
332    
333    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
334    
335    for file_path in &files {
336        if let Ok(content) = fs::read_to_string(file_path) {
337            let doc = doc!(
338                content_field => content,
339                path_field => file_path.display().to_string()
340            );
341            index_writer.add_document(doc)?;
342        }
343    }
344    
345    index_writer.commit()
346        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
347    
348    // After building, search again with the same options  
349    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
350    let mut schema_builder = Schema::builder();
351    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
352    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
353    let _schema = schema_builder.build();
354    
355    let index = Index::open_in_dir(&tantivy_index_path)
356        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
357    
358    let reader = index
359        .reader_builder()
360        .reload_policy(ReloadPolicy::OnCommitWithDelay)
361        .try_into()
362        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
363    
364    let searcher = reader.searcher();
365    let query_parser = QueryParser::for_index(&index, vec![content_field]);
366    
367    let query = query_parser
368        .parse_query(&options.query)
369        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
370    
371    let top_docs = if let Some(top_k) = options.top_k {
372        searcher.search(&query, &TopDocs::with_limit(top_k))?
373    } else {
374        searcher.search(&query, &TopDocs::with_limit(100))?
375    };
376    
377    // First, collect all results with raw scores
378    let mut raw_results = Vec::new();
379    for (_score, doc_address) in top_docs {
380        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
381        let path_text = retrieved_doc
382            .get_first(path_field)
383            .map(|field_value| field_value.as_str().unwrap_or(""))
384            .unwrap_or("");
385        let content_text = retrieved_doc
386            .get_first(content_field)
387            .map(|field_value| field_value.as_str().unwrap_or(""))
388            .unwrap_or("");
389        
390        let file_path = PathBuf::from(path_text);
391        let preview = if options.full_section {
392            content_text.to_string()
393        } else {
394            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
395        };
396        
397        raw_results.push((_score, SearchResult {
398            file: file_path,
399            span: Span {
400                byte_start: 0,
401                byte_end: content_text.len(),
402                line_start: 1,
403                line_end: content_text.lines().count(),
404            },
405            score: _score,
406            preview,
407            lang: detect_language(&PathBuf::from(path_text)),
408            symbol: None,
409        }));
410    }
411    
412    // Normalize scores to 0-1 range and apply threshold
413    let mut results = Vec::new();
414    if !raw_results.is_empty() {
415        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
416        if max_score > 0.0 {
417            for (raw_score, mut result) in raw_results {
418                let normalized_score = raw_score / max_score;
419                
420                // Apply threshold filtering with normalized score
421                if let Some(threshold) = options.threshold {
422                    if normalized_score < threshold {
423                        continue;
424                    }
425                }
426                
427                result.score = normalized_score;
428                results.push(result);
429            }
430        }
431    }
432    
433    Ok(results)
434}
435
436#[allow(dead_code)]
437async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
438    semantic_search_with_progress(options, None).await
439}
440
441async fn semantic_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
442    // Handle both files and directories and reuse nearest existing .ck index up the tree
443    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
444        if options.path.is_file() {
445            options.path.parent().unwrap_or(&options.path).to_path_buf()
446        } else {
447            options.path.clone()
448        }
449    });
450    
451    let index_dir = index_root.join(".ck");
452    if !index_dir.exists() {
453        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
454    }
455    
456    let ann_index_path = index_dir.join("ann_index.bin");
457    let embeddings_path = index_dir.join("embeddings.json");
458    
459    if !ann_index_path.exists() || !embeddings_path.exists() {
460        return build_semantic_index_with_progress(options, progress_callback).await;
461    }
462    
463    // Load the ANN index
464    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
465    
466    // Load file metadata
467    let embeddings_data = fs::read_to_string(&embeddings_path)?;
468    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
469    
470    // Create embedder and embed the query
471    if let Some(ref callback) = progress_callback {
472        callback("Loading embedding model...");
473    }
474    
475    let mut embedder = if let Some(ref callback) = progress_callback {
476        let _cb = callback.as_ref();
477        let model_cb = Box::new(|msg: &str| {
478            // Note: We can't directly use the callback here due to lifetime issues
479            // For now, we'll just use eprintln! until we can restructure this better
480            eprintln!("Model: {}", msg);
481        }) as ck_embed::ModelDownloadCallback;
482        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
483    } else {
484        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
485    };
486    let query_embeddings = embedder.embed(&[options.query.clone()])?;
487    
488    if query_embeddings.is_empty() {
489        return Ok(Vec::new());
490    }
491    
492    let query_embedding = &query_embeddings[0];
493    
494    // Search using ANN
495    let top_k = options.top_k.unwrap_or(10);
496    let similar_docs = ann_index.search(query_embedding, top_k);
497    
498    let mut results = Vec::new();
499    
500    // Check if we're searching a specific file vs. a directory
501    let filter_by_file = options.path.is_file();
502    let target_file = if filter_by_file {
503        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
504    } else {
505        None
506    };
507    
508    for (doc_id, similarity) in similar_docs {
509        // Apply threshold filtering
510        if let Some(threshold) = options.threshold {
511            if similarity < threshold {
512                continue;
513            }
514        }
515        
516        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
517            // Filter by target file if specified
518            if let Some(target) = &target_file {
519                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
520                if canonical_result != *target {
521                    continue; // Skip this result if it doesn't match the target file
522                }
523            }
524            
525            // If full_section is enabled and this is a code section, return the full content
526            let preview = if options.full_section {
527                content.clone()
528            } else {
529                content.lines().take(3).collect::<Vec<_>>().join("\n")
530            };
531            
532            results.push(SearchResult {
533                file: file_path.clone(),
534                span: Span {
535                    byte_start: 0,
536                    byte_end: content.len(),
537                    line_start: 1,
538                    line_end: content.lines().count(),
539                },
540                score: similarity,
541                preview,
542                lang: detect_language(file_path),
543                symbol: None,
544            });
545        }
546    }
547    
548    Ok(results)
549}
550
551#[allow(dead_code)]
552async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
553    build_semantic_index_with_progress(options, None).await
554}
555
556async fn build_semantic_index_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
557    // Handle both files and directories by finding the appropriate directory for indexing
558    let index_root = if options.path.is_file() {
559        options.path.parent().unwrap_or(&options.path)
560    } else {
561        &options.path
562    };
563    
564    let index_dir = index_root.join(".ck");
565    let ann_index_path = index_dir.join("ann_index.bin");
566    let embeddings_path = index_dir.join("embeddings.json");
567    
568    fs::create_dir_all(&index_dir)?;
569    
570    if let Some(ref callback) = progress_callback {
571        callback("Building semantic index (no index found)...");
572    }
573    
574    // Always print this important message, even in quiet mode for indexing operations
575    eprintln!("Building semantic index (no existing index found)...");
576    
577    // Collect files and their content
578    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
579    
580    if let Some(ref callback) = progress_callback {
581        callback(&format!("Found {} files to index", files.len()));
582    }
583    eprintln!("Found {} files to embed and index", files.len());
584    
585    let mut file_embeddings = Vec::new();
586    let mut embeddings = Vec::new();
587    
588    // Create embedder with progress callback
589    if let Some(ref callback) = progress_callback {
590        callback("Loading embedding model...");
591    }
592    
593    let model_callback = if progress_callback.is_some() {
594        Some(Box::new(|msg: &str| {
595            eprintln!("Model: {}", msg);
596        }) as ck_embed::ModelDownloadCallback)
597    } else {
598        None
599    };
600    
601    let mut embedder = ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
602    
603    if let Some(ref callback) = progress_callback {
604        callback("Generating embeddings for code chunks...");
605    }
606    
607    for (file_idx, file_path) in files.iter().enumerate() {
608        if let Ok(content) = fs::read_to_string(file_path) {
609            if let Some(ref callback) = progress_callback {
610                let file_name = file_path.file_name()
611                    .map(|n| n.to_string_lossy().to_string())
612                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
613                callback(&format!("Processing {}/{}: {}", file_idx + 1, files.len(), file_name));
614            }
615            
616            // Chunk the content for better embeddings
617            let chunks = ck_chunk::chunk_text(&content, detect_language(file_path).as_deref())?;
618            
619            for chunk in chunks {
620                let chunk_embeddings = embedder.embed(&[chunk.text.clone()])?;
621                if !chunk_embeddings.is_empty() {
622                    embeddings.push(chunk_embeddings[0].clone());
623                    file_embeddings.push((file_path.clone(), chunk.text));
624                }
625            }
626        }
627    }
628    
629    if let Some(ref callback) = progress_callback {
630        callback(&format!("Built {} embeddings, creating search index...", embeddings.len()));
631    }
632    eprintln!("Generated {} embeddings, building search index...", embeddings.len());
633    
634    // Build ANN index
635    let index = ck_ann::SimpleIndex::build(&embeddings)?;
636    index.save(&ann_index_path)?;
637    
638    // Save file embeddings metadata
639    let embeddings_json = serde_json::to_string(&file_embeddings)?;
640    fs::write(&embeddings_path, embeddings_json)?;
641    
642    if let Some(ref callback) = progress_callback {
643        callback("Semantic index built successfully, running search...");
644    }
645    eprintln!("Semantic index built successfully!");
646    
647    // After building, search again - inline to avoid recursion
648    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
649    
650    // Load file metadata
651    let embeddings_data = fs::read_to_string(&embeddings_path)?;
652    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
653    
654    // Create embedder and embed the query
655    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
656    let query_embeddings = embedder.embed(&[options.query.clone()])?;
657    
658    if query_embeddings.is_empty() {
659        return Ok(Vec::new());
660    }
661    
662    let query_embedding = &query_embeddings[0];
663    
664    // Search using ANN
665    let top_k = options.top_k.unwrap_or(10);
666    let similar_docs = ann_index.search(query_embedding, top_k);
667    
668    let mut results = Vec::new();
669    
670    // Check if we're searching a specific file vs. a directory
671    let filter_by_file = options.path.is_file();
672    let target_file = if filter_by_file {
673        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
674    } else {
675        None
676    };
677    
678    for (doc_id, similarity) in similar_docs {
679        // Apply threshold filtering
680        if let Some(threshold) = options.threshold {
681            if similarity < threshold {
682                continue;
683            }
684        }
685        
686        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
687            // Filter by target file if specified
688            if let Some(target) = &target_file {
689                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
690                if canonical_result != *target {
691                    continue; // Skip this result if it doesn't match the target file
692                }
693            }
694            
695            // If full_section is enabled and this is a code section, return the full content
696            let preview = if options.full_section {
697                content.clone()
698            } else {
699                content.lines().take(3).collect::<Vec<_>>().join("\n")
700            };
701            
702            results.push(SearchResult {
703                file: file_path.clone(),
704                span: Span {
705                    byte_start: 0,
706                    byte_end: content.len(),
707                    line_start: 1,
708                    line_end: content.lines().count(),
709                },
710                score: similarity,
711                preview,
712                lang: detect_language(file_path),
713                symbol: None,
714            });
715        }
716    }
717    
718    Ok(results)
719}
720
721#[allow(dead_code)]
722async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
723    hybrid_search_with_progress(options, None).await
724}
725
726async fn hybrid_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
727    if let Some(ref callback) = progress_callback {
728        callback("Running regex search...");
729    }
730    let regex_results = regex_search(options)?;
731    
732    if let Some(ref callback) = progress_callback {
733        callback("Running semantic search...");
734    }
735    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
736    
737    let mut combined = HashMap::new();
738    
739    for (rank, result) in regex_results.iter().enumerate() {
740        let key = format!("{}:{}", result.file.display(), result.span.line_start);
741        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
742    }
743    
744    for (rank, result) in semantic_results.iter().enumerate() {
745        let key = format!("{}:{}", result.file.display(), result.span.line_start);
746        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
747    }
748    
749    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
750    let mut rrf_results: Vec<SearchResult> = combined
751        .into_iter()
752        .map(|(_, ranks)| {
753            let mut result = ranks[0].1.clone();
754            let rrf_score = ranks.iter().map(|(rank, _)| 1.0 / (60.0 + *rank as f32)).sum();
755            result.score = rrf_score;
756            result
757        })
758        .filter(|result| {
759            // Apply threshold filtering to raw RRF scores
760            if let Some(threshold) = options.threshold {
761                result.score >= threshold
762            } else {
763                true
764            }
765        })
766        .collect();
767    
768    // Sort by RRF score (highest first)
769    rrf_results.sort_by(|a, b| {
770        b.score
771            .partial_cmp(&a.score)
772            .unwrap_or(std::cmp::Ordering::Equal)
773    });
774    
775    if let Some(top_k) = options.top_k {
776        rrf_results.truncate(top_k);
777    }
778    
779    Ok(rrf_results)
780}
781
782fn build_globset(patterns: &[String]) -> GlobSet {
783    let mut builder = GlobSetBuilder::new();
784    for pat in patterns {
785        // Treat patterns as filename or directory globs
786        if let Ok(glob) = Glob::new(pat) {
787            builder.add(glob);
788        }
789    }
790    builder.build().unwrap_or_else(|_| GlobSet::empty())
791}
792
793fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
794    let globset = build_globset(exclude_patterns);
795    // Match against each path component and the full path
796    if globset.is_match(path) {
797        return true;
798    }
799    for component in path.components() {
800        if let std::path::Component::Normal(name) = component {
801            if globset.is_match(name) {
802                return true;
803            }
804        }
805    }
806    false
807}
808
809fn collect_files(path: &Path, recursive: bool, exclude_patterns: &[String]) -> Result<Vec<PathBuf>> {
810    let mut files = Vec::new();
811    let globset = build_globset(exclude_patterns);
812    
813    if path.is_file() {
814        // Always add single files, even if they're excluded (user explicitly requested)
815        files.push(path.to_path_buf());
816    } else if recursive {
817        for entry in WalkDir::new(path)
818            .into_iter()
819            .filter_entry(|e| {
820                // Skip excluded directories entirely for efficiency
821                let name = e.file_name();
822                !globset.is_match(e.path()) && !globset.is_match(name)
823            }) {
824            match entry {
825                Ok(entry) => {
826                    if entry.file_type().is_file() && !should_exclude_path(entry.path(), exclude_patterns) {
827                        files.push(entry.path().to_path_buf());
828                    }
829                }
830                Err(e) => {
831                    // Log directory traversal errors but continue processing
832                    tracing::debug!("Skipping path due to error: {}", e);
833                    continue;
834                }
835            }
836        }
837    } else {
838        match fs::read_dir(path) {
839            Ok(read_dir) => {
840                for entry in read_dir {
841                    match entry {
842                        Ok(entry) => {
843                            let path = entry.path();
844                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
845                                files.push(path);
846                            }
847                        }
848                        Err(e) => {
849                            tracing::debug!("Skipping directory entry due to error: {}", e);
850                            continue;
851                        }
852                    }
853                }
854            }
855            Err(e) => {
856                tracing::debug!("Cannot read directory {:?}: {}", path, e);
857                return Err(e.into());
858            }
859        }
860    }
861    
862    Ok(files)
863}
864
/// Derive a language label from the file extension of `path`.
///
/// Known extensions are translated to a canonical language name; any other
/// extension is returned unchanged. Returns `None` when the path has no
/// extension or the extension is not valid UTF-8.
fn detect_language(path: &Path) -> Option<String> {
    let extension = path.extension()?.to_str()?;

    let language = match extension {
        "rs" => "rust",
        "py" => "python",
        "js" => "javascript",
        "ts" => "typescript",
        "hs" | "lhs" => "haskell",
        "go" => "go",
        "java" => "java",
        "c" => "c",
        // C++ sources and C-family headers share one label
        "cpp" | "cc" | "cxx" | "h" | "hpp" => "cpp",
        "cs" => "csharp",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" => "kotlin",
        other => other,
    };

    Some(language.to_owned())
}
888
889async fn ensure_index_updated(path: &Path, force_reindex: bool, need_embeddings: bool) -> Result<()> {
890    
891    // Handle both files and directories and reuse nearest existing .ck index up the tree
892    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
893        if path.is_file() {
894            path.parent().unwrap_or(path).to_path_buf()
895        } else {
896            path.to_path_buf()
897        }
898    });
899    let index_root = &index_root_buf;
900    
901    // If force reindex is requested, always update
902    if force_reindex {
903        let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings, true).await?;
904        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
905            tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
906                          stats.files_indexed, stats.orphaned_files_removed);
907        }
908        return Ok(());
909    }
910    
911    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
912    let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings, true).await?;
913    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
914        tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
915                      stats.files_indexed, stats.orphaned_files_removed);
916    }
917    
918    Ok(())
919}
920
921fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
922    let before = options.before_context_lines.max(options.context_lines);
923    let after = options.after_context_lines.max(options.context_lines);
924    
925    if before > 0 || after > 0 {
926        let start_idx = line_idx.saturating_sub(before);
927        let end_idx = (line_idx + after + 1).min(lines.len());
928        lines[start_idx..end_idx].join("\n")
929    } else {
930        lines[line_idx].to_string()
931    }
932}
933
934fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
935    // Detect language for tree-sitter parsing
936    let lang = match file_path.extension().and_then(|s| s.to_str()) {
937        Some("py") => Some("python"),
938        Some("js") => Some("javascript"),
939        Some("ts") | Some("tsx") => Some("typescript"),
940        Some("hs") | Some("lhs") => Some("haskell"),
941        _ => return None,
942    };
943    
944    // Parse the file with tree-sitter and extract function/class sections
945    if let Ok(chunks) = ck_chunk::chunk_text(content, lang) {
946        let sections: Vec<(usize, usize, String)> = chunks
947            .into_iter()
948            .filter(|chunk| matches!(
949                chunk.chunk_type,
950                ck_chunk::ChunkType::Function | 
951                ck_chunk::ChunkType::Class | 
952                ck_chunk::ChunkType::Method
953            ))
954            .map(|chunk| {
955                (
956                    chunk.span.line_start - 1,  // Convert to 0-based index
957                    chunk.span.line_end - 1,
958                    chunk.text,
959                )
960            })
961            .collect();
962        
963        if sections.is_empty() {
964            None
965        } else {
966            Some(sections)
967        }
968    } else {
969        None
970    }
971}
972
/// Return the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
fn find_containing_section(sections: &[(usize, usize, String)], line_idx: usize) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
981
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Write a small, fixed set of sample files into `dir` and return their paths.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let files = vec![
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        let mut paths = Vec::new();
        for (name, content) in files {
            let path = dir.join(name);
            fs::write(&path, content).unwrap();
            paths.push(path);
        }
        paths
    }

    #[test]
    fn test_detect_language() {
        assert_eq!(detect_language(&PathBuf::from("test.rs")), Some("rust".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.py")), Some("python".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.js")), Some("javascript".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.hs")), Some("haskell".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.lhs")), Some("haskell".to_string()));
        assert_eq!(detect_language(&PathBuf::from("test.unknown")), Some("unknown".to_string()));
        assert_eq!(detect_language(&PathBuf::from("noext")), None);
    }

    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let test_files = create_test_files(temp_dir.path());

        // Test non-recursive
        let files = collect_files(temp_dir.path(), false, &[]).unwrap();
        assert_eq!(files.len(), 4);

        // Test recursive
        let files = collect_files(temp_dir.path(), true, &[]).unwrap();
        assert_eq!(files.len(), 4);

        // Test single file
        let files = collect_files(&test_files[0], false, &[]).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0], test_files[0]);
    }

    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "rust".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        // Should find matches in files containing "rust"
        let rust_matches: Vec<_> = results.iter()
            .filter(|r| r.preview.to_lowercase().contains("rust"))
            .collect();
        assert!(!rust_matches.is_empty());
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "HELLO".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            case_insensitive: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "fn main()".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            fixed_string: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        fs::write(temp_dir.path().join("word_test.txt"), "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "rust".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            whole_word: true,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
        // Should only match "rust" as a whole word, not "rusty" or "rustacean".
        // The file holds exactly one standalone "rust", so exactly one result is expected.
        assert_eq!(
            results.len(),
            1,
            "whole-word search must not match inside \"rusty\" or \"rustacean\""
        );
    }

    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        // Create multiple files with matches
        for i in 0..10 {
            fs::write(temp_dir.path().join(format!("file{}.txt", i)), "test content").unwrap();
        }

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            top_k: Some(5),
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Test that span offsets are correctly calculated for multiple matches on a line
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: test_file.clone(),
            recursive: false,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();

        // Should find 5 matches total
        assert_eq!(results.len(), 5);

        // Check first line has 3 matches with correct byte offsets
        let line1_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 1).collect();
        assert_eq!(line1_matches.len(), 3);
        assert_eq!(line1_matches[0].span.byte_start, 0);
        assert_eq!(line1_matches[1].span.byte_start, 5);
        assert_eq!(line1_matches[2].span.byte_start, 10);

        // Check second line match
        let line2_matches: Vec<_> = results.iter().filter(|r| r.span.line_start == 2).collect();
        assert_eq!(line2_matches.len(), 1);
        assert_eq!(line2_matches[0].span.byte_start, 24); // "test test test\n" = 15 bytes, "line two " = 9 bytes

        // Each match should have different byte offsets
        let mut byte_starts: Vec<_> = results.iter().map(|r| r.span.byte_start).collect();
        byte_starts.sort();
        byte_starts.dedup();
        assert_eq!(byte_starts.len(), 5); // All byte_starts should be unique
    }

    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1: hello\nline 2: world\nline 3: rust programming").unwrap();

        let regex = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].span.line_start, 3);
        assert!(results[0].preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let regex = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&regex, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        // The target line is line 3, with 1 context line before and after
        // So we should get lines 2, 3, 4
        assert!(results[0].preview.contains("line 2"));
        assert!(results[0].preview.contains("target line"));
        assert!(results[0].preview.contains("line 4"));
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "hello".to_string(),
            path: temp_dir.path().to_path_buf(),
            recursive: true,
            case_insensitive: true,
            ..Default::default()
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}