ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::fs;
7use std::path::{Path, PathBuf};
8use walkdir::WalkDir;
9use rayon::prelude::*;
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{Schema, STORED, TEXT, Value};
13use tantivy::{doc, Index, ReloadPolicy, TantivyDocument};
14use ck_ann::AnnIndex;
15use std::path::PathBuf as StdPathBuf;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
20pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = fs::read_to_string(file_path)?;
25    let lines: Vec<&str> = content.lines().collect();
26    
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30    
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33    
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Find the closest directory at or above `path` that contains a `.ck` entry.
///
/// When `path` is a file the search starts at its parent directory. The path
/// is first walked exactly as given, so a hit is returned in the same
/// (possibly relative) form the caller supplied. A relative path runs out of
/// components at the working directory, so when that first walk finds nothing
/// the lookup is retried on the canonicalized path to reach ancestors of the
/// working directory as well; a hit from the retry is an absolute path.
///
/// Returns `None` when no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    // Walk `start` (or its parent, for a file) and each ancestor in turn.
    fn walk_up(start: &Path) -> Option<StdPathBuf> {
        let first = if start.is_file() {
            start.parent().unwrap_or(start)
        } else {
            start
        };
        first
            .ancestors()
            .find(|dir| dir.join(".ck").exists())
            .map(Path::to_path_buf)
    }

    walk_up(path).or_else(|| {
        if path.is_absolute() {
            // Every ancestor was already visited by the first walk.
            return None;
        }
        let absolute = path.canonicalize().ok()?;
        walk_up(&absolute)
    })
}
53
54pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
55    search_with_progress(options, None).await
56}
57
58pub async fn search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
59    // Validate that the search path exists
60    if !options.path.exists() {
61        return Err(ck_core::CkError::Search(format!("Path does not exist: {}", options.path.display())).into());
62    }
63    
64    // Auto-update index if needed (unless it's regex-only mode)
65    if !matches!(options.mode, SearchMode::Regex) {
66        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
67        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
68    }
69    
70    match options.mode {
71        SearchMode::Regex => regex_search(options),
72        SearchMode::Lexical => lexical_search(options).await,
73        SearchMode::Semantic => {
74            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
75            semantic_search_v3_with_progress(options, progress_callback).await
76        },
77        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
78    }
79}
80
81fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
82    let pattern = if options.fixed_string {
83        regex::escape(&options.query)
84    } else if options.whole_word {
85        format!(r"\b{}\b", regex::escape(&options.query))
86    } else {
87        options.query.clone()
88    };
89    
90    let regex = RegexBuilder::new(&pattern)
91        .case_insensitive(options.case_insensitive)
92        .build()
93        .map_err(|e| CkError::Regex(e))?;
94    
95    // Default to recursive for directories (like grep) to maintain compatibility
96    let should_recurse = options.path.is_dir() || options.recursive;
97    let files = collect_files(&options.path, should_recurse, &options.exclude_patterns)?;
98    
99    let results: Vec<Vec<SearchResult>> = files
100        .par_iter()
101        .filter_map(|file_path| {
102            match search_file(&regex, file_path, options) {
103                Ok(matches) => {
104                    if matches.is_empty() {
105                        None
106                    } else {
107                        Some(matches)
108                    }
109                }
110                Err(e) => {
111                    tracing::debug!("Error searching {:?}: {}", file_path, e);
112                    None
113                }
114            }
115        })
116        .collect();
117    
118    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
119    // Deterministic ordering: file path, then line number
120    all_results.sort_by(|a, b| {
121        let path_cmp = a.file.cmp(&b.file);
122        if path_cmp != std::cmp::Ordering::Equal {
123            return path_cmp;
124        }
125        a.span.line_start.cmp(&b.span.line_start)
126    });
127    
128    if let Some(top_k) = options.top_k {
129        all_results.truncate(top_k);
130    }
131    
132    Ok(all_results)
133}
134
135fn search_file(regex: &Regex, file_path: &Path, options: &SearchOptions) -> Result<Vec<SearchResult>> {
136    let content = fs::read_to_string(file_path)?;
137    let lines: Vec<&str> = content.lines().collect();
138    let mut results = Vec::new();
139    
140    // If full_section is enabled, try to parse the file and find code sections
141    let code_sections = if options.full_section {
142        extract_code_sections(file_path, &content)
143    } else {
144        None
145    };
146    
147    // Track byte offset as we iterate through lines
148    let mut byte_offset = 0;
149    
150    for (line_idx, line) in lines.iter().enumerate() {
151        let line_number = line_idx + 1;
152        
153        // Find all matches in the line with their positions
154        for mat in regex.find_iter(line) {
155            let preview = if options.full_section {
156                // Try to find the containing code section
157                if let Some(ref sections) = code_sections {
158                    if let Some(section) = find_containing_section(sections, line_idx) {
159                        section.clone()
160                    } else {
161                        // Fall back to context lines if no section found
162                        get_context_preview(&lines, line_idx, options)
163                    }
164                } else {
165                    get_context_preview(&lines, line_idx, options)
166                }
167            } else {
168                get_context_preview(&lines, line_idx, options)
169            };
170            
171            results.push(SearchResult {
172                file: file_path.to_path_buf(),
173                span: Span {
174                    byte_start: byte_offset + mat.start(),
175                    byte_end: byte_offset + mat.end(),
176                    line_start: line_number,
177                    line_end: line_number,
178                },
179                score: 1.0,
180                preview,
181                lang: detect_language(file_path),
182                symbol: None,
183            });
184        }
185        
186        // Update byte offset for next line (add line length + newline character)
187        byte_offset += line.len();
188        if line_idx < lines.len() - 1 {
189            byte_offset += 1; // Add 1 for the newline character
190        }
191    }
192    
193    Ok(results)
194}
195
196async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
197    // Handle both files and directories and reuse nearest existing .ck index up the tree
198    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
199        if options.path.is_file() {
200            options.path.parent().unwrap_or(&options.path).to_path_buf()
201        } else {
202            options.path.clone()
203        }
204    });
205    
206    let index_dir = index_root.join(".ck");
207    if !index_dir.exists() {
208        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
209    }
210    
211    let tantivy_index_path = index_dir.join("tantivy_index");
212    
213    if !tantivy_index_path.exists() {
214        return build_tantivy_index(options).await;
215    }
216    
217    let mut schema_builder = Schema::builder();
218    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
219    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
220    let _schema = schema_builder.build();
221    
222    let index = Index::open_in_dir(&tantivy_index_path)
223        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
224    
225    let reader = index
226        .reader_builder()
227        .reload_policy(ReloadPolicy::OnCommitWithDelay)
228        .try_into()
229        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
230    
231    let searcher = reader.searcher();
232    let query_parser = QueryParser::for_index(&index, vec![content_field]);
233    
234    let query = query_parser
235        .parse_query(&options.query)
236        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
237    
238    let top_docs = if let Some(top_k) = options.top_k {
239        searcher.search(&query, &TopDocs::with_limit(top_k))?
240    } else {
241        searcher.search(&query, &TopDocs::with_limit(100))?
242    };
243    
244    // First, collect all results with raw scores
245    let mut raw_results = Vec::new();
246    for (_score, doc_address) in top_docs {
247        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
248        let path_text = retrieved_doc
249            .get_first(path_field)
250            .map(|field_value| field_value.as_str().unwrap_or(""))
251            .unwrap_or("");
252        let content_text = retrieved_doc
253            .get_first(content_field)
254            .map(|field_value| field_value.as_str().unwrap_or(""))
255            .unwrap_or("");
256        
257        let file_path = PathBuf::from(path_text);
258        let preview = if options.full_section {
259            content_text.to_string()
260        } else {
261            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
262        };
263        
264        raw_results.push((_score, SearchResult {
265            file: file_path,
266            span: Span {
267                byte_start: 0,
268                byte_end: content_text.len(),
269                line_start: 1,
270                line_end: content_text.lines().count(),
271            },
272            score: _score,
273            preview,
274            lang: detect_language(&PathBuf::from(path_text)),
275            symbol: None,
276        }));
277    }
278    
279    // Normalize scores to 0-1 range and apply threshold
280    let mut results = Vec::new();
281    if !raw_results.is_empty() {
282        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
283        if max_score > 0.0 {
284            for (raw_score, mut result) in raw_results {
285                let normalized_score = raw_score / max_score;
286                
287                // Apply threshold filtering with normalized score
288                if let Some(threshold) = options.threshold {
289                    if normalized_score < threshold {
290                        continue;
291                    }
292                }
293                
294                result.score = normalized_score;
295                results.push(result);
296            }
297        }
298    }
299    
300    Ok(results)
301}
302
303async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
304    // Handle both files and directories by finding the appropriate directory for indexing
305    let index_root = if options.path.is_file() {
306        options.path.parent().unwrap_or(&options.path)
307    } else {
308        &options.path
309    };
310    
311    let index_dir = index_root.join(".ck");
312    let tantivy_index_path = index_dir.join("tantivy_index");
313    
314    fs::create_dir_all(&tantivy_index_path)?;
315    
316    let mut schema_builder = Schema::builder();
317    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
318    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
319    let schema = schema_builder.build();
320    
321    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
322        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
323    
324    let mut index_writer = index.writer(50_000_000)
325        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
326    
327    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
328    
329    for file_path in &files {
330        if let Ok(content) = fs::read_to_string(file_path) {
331            let doc = doc!(
332                content_field => content,
333                path_field => file_path.display().to_string()
334            );
335            index_writer.add_document(doc)?;
336        }
337    }
338    
339    index_writer.commit()
340        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
341    
342    // After building, search again with the same options  
343    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
344    let mut schema_builder = Schema::builder();
345    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
346    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
347    let _schema = schema_builder.build();
348    
349    let index = Index::open_in_dir(&tantivy_index_path)
350        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
351    
352    let reader = index
353        .reader_builder()
354        .reload_policy(ReloadPolicy::OnCommitWithDelay)
355        .try_into()
356        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
357    
358    let searcher = reader.searcher();
359    let query_parser = QueryParser::for_index(&index, vec![content_field]);
360    
361    let query = query_parser
362        .parse_query(&options.query)
363        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
364    
365    let top_docs = if let Some(top_k) = options.top_k {
366        searcher.search(&query, &TopDocs::with_limit(top_k))?
367    } else {
368        searcher.search(&query, &TopDocs::with_limit(100))?
369    };
370    
371    // First, collect all results with raw scores
372    let mut raw_results = Vec::new();
373    for (_score, doc_address) in top_docs {
374        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
375        let path_text = retrieved_doc
376            .get_first(path_field)
377            .map(|field_value| field_value.as_str().unwrap_or(""))
378            .unwrap_or("");
379        let content_text = retrieved_doc
380            .get_first(content_field)
381            .map(|field_value| field_value.as_str().unwrap_or(""))
382            .unwrap_or("");
383        
384        let file_path = PathBuf::from(path_text);
385        let preview = if options.full_section {
386            content_text.to_string()
387        } else {
388            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
389        };
390        
391        raw_results.push((_score, SearchResult {
392            file: file_path,
393            span: Span {
394                byte_start: 0,
395                byte_end: content_text.len(),
396                line_start: 1,
397                line_end: content_text.lines().count(),
398            },
399            score: _score,
400            preview,
401            lang: detect_language(&PathBuf::from(path_text)),
402            symbol: None,
403        }));
404    }
405    
406    // Normalize scores to 0-1 range and apply threshold
407    let mut results = Vec::new();
408    if !raw_results.is_empty() {
409        let max_score = raw_results.iter().map(|(score, _)| *score).fold(0.0f32, f32::max);
410        if max_score > 0.0 {
411            for (raw_score, mut result) in raw_results {
412                let normalized_score = raw_score / max_score;
413                
414                // Apply threshold filtering with normalized score
415                if let Some(threshold) = options.threshold {
416                    if normalized_score < threshold {
417                        continue;
418                    }
419                }
420                
421                result.score = normalized_score;
422                results.push(result);
423            }
424        }
425    }
426    
427    Ok(results)
428}
429
430#[allow(dead_code)]
431async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
432    semantic_search_with_progress(options, None).await
433}
434
435async fn semantic_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
436    // Handle both files and directories and reuse nearest existing .ck index up the tree
437    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
438        if options.path.is_file() {
439            options.path.parent().unwrap_or(&options.path).to_path_buf()
440        } else {
441            options.path.clone()
442        }
443    });
444    
445    let index_dir = index_root.join(".ck");
446    if !index_dir.exists() {
447        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
448    }
449    
450    let ann_index_path = index_dir.join("ann_index.bin");
451    let embeddings_path = index_dir.join("embeddings.json");
452    
453    if !ann_index_path.exists() || !embeddings_path.exists() {
454        return build_semantic_index_with_progress(options, progress_callback).await;
455    }
456    
457    // Load the ANN index
458    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
459    
460    // Load file metadata
461    let embeddings_data = fs::read_to_string(&embeddings_path)?;
462    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
463    
464    // Create embedder and embed the query
465    if let Some(ref callback) = progress_callback {
466        callback("Loading embedding model...");
467    }
468    
469    let mut embedder = if let Some(ref callback) = progress_callback {
470        let _cb = callback.as_ref();
471        let model_cb = Box::new(|msg: &str| {
472            // Note: We can't directly use the callback here due to lifetime issues
473            // For now, we'll just use eprintln! until we can restructure this better
474            eprintln!("Model: {}", msg);
475        }) as ck_embed::ModelDownloadCallback;
476        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
477    } else {
478        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
479    };
480    let query_embeddings = embedder.embed(&[options.query.clone()])?;
481    
482    if query_embeddings.is_empty() {
483        return Ok(Vec::new());
484    }
485    
486    let query_embedding = &query_embeddings[0];
487    
488    // Search using ANN
489    let top_k = options.top_k.unwrap_or(10);
490    let similar_docs = ann_index.search(query_embedding, top_k);
491    
492    let mut results = Vec::new();
493    
494    // Check if we're searching a specific file vs. a directory
495    let filter_by_file = options.path.is_file();
496    let target_file = if filter_by_file {
497        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
498    } else {
499        None
500    };
501    
502    for (doc_id, similarity) in similar_docs {
503        // Apply threshold filtering
504        if let Some(threshold) = options.threshold {
505            if similarity < threshold {
506                continue;
507            }
508        }
509        
510        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
511            // Filter by target file if specified
512            if let Some(target) = &target_file {
513                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
514                if canonical_result != *target {
515                    continue; // Skip this result if it doesn't match the target file
516                }
517            }
518            
519            // If full_section is enabled and this is a code section, return the full content
520            let preview = if options.full_section {
521                content.clone()
522            } else {
523                content.lines().take(3).collect::<Vec<_>>().join("\n")
524            };
525            
526            results.push(SearchResult {
527                file: file_path.clone(),
528                span: Span {
529                    byte_start: 0,
530                    byte_end: content.len(),
531                    line_start: 1,
532                    line_end: content.lines().count(),
533                },
534                score: similarity,
535                preview,
536                lang: detect_language(file_path),
537                symbol: None,
538            });
539        }
540    }
541    
542    Ok(results)
543}
544
545#[allow(dead_code)]
546async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
547    build_semantic_index_with_progress(options, None).await
548}
549
550async fn build_semantic_index_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
551    // Handle both files and directories by finding the appropriate directory for indexing
552    let index_root = if options.path.is_file() {
553        options.path.parent().unwrap_or(&options.path)
554    } else {
555        &options.path
556    };
557    
558    let index_dir = index_root.join(".ck");
559    let ann_index_path = index_dir.join("ann_index.bin");
560    let embeddings_path = index_dir.join("embeddings.json");
561    
562    fs::create_dir_all(&index_dir)?;
563    
564    if let Some(ref callback) = progress_callback {
565        callback("Building semantic index (no index found)...");
566    }
567    
568    // Always print this important message, even in quiet mode for indexing operations
569    eprintln!("Building semantic index (no existing index found)...");
570    
571    // Collect files and their content
572    let files = collect_files(&index_root, true, &options.exclude_patterns)?;
573    
574    if let Some(ref callback) = progress_callback {
575        callback(&format!("Found {} files to index", files.len()));
576    }
577    eprintln!("Found {} files to embed and index", files.len());
578    
579    let mut file_embeddings = Vec::new();
580    let mut embeddings = Vec::new();
581    
582    // Create embedder with progress callback
583    if let Some(ref callback) = progress_callback {
584        callback("Loading embedding model...");
585    }
586    
587    let model_callback = if progress_callback.is_some() {
588        Some(Box::new(|msg: &str| {
589            eprintln!("Model: {}", msg);
590        }) as ck_embed::ModelDownloadCallback)
591    } else {
592        None
593    };
594    
595    let mut embedder = ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
596    
597    if let Some(ref callback) = progress_callback {
598        callback("Generating embeddings for code chunks...");
599    }
600    
601    for (file_idx, file_path) in files.iter().enumerate() {
602        if let Ok(content) = fs::read_to_string(file_path) {
603            if let Some(ref callback) = progress_callback {
604                let file_name = file_path.file_name()
605                    .map(|n| n.to_string_lossy().to_string())
606                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
607                callback(&format!("Processing {}/{}: {}", file_idx + 1, files.len(), file_name));
608            }
609            
610            // Chunk the content for better embeddings
611            let chunks = ck_chunk::chunk_text(&content, detect_language(file_path).as_deref())?;
612            
613            for chunk in chunks {
614                let chunk_embeddings = embedder.embed(&[chunk.text.clone()])?;
615                if !chunk_embeddings.is_empty() {
616                    embeddings.push(chunk_embeddings[0].clone());
617                    file_embeddings.push((file_path.clone(), chunk.text));
618                }
619            }
620        }
621    }
622    
623    if let Some(ref callback) = progress_callback {
624        callback(&format!("Built {} embeddings, creating search index...", embeddings.len()));
625    }
626    eprintln!("Generated {} embeddings, building search index...", embeddings.len());
627    
628    // Build ANN index
629    let index = ck_ann::SimpleIndex::build(&embeddings)?;
630    index.save(&ann_index_path)?;
631    
632    // Save file embeddings metadata
633    let embeddings_json = serde_json::to_string(&file_embeddings)?;
634    fs::write(&embeddings_path, embeddings_json)?;
635    
636    if let Some(ref callback) = progress_callback {
637        callback("Semantic index built successfully, running search...");
638    }
639    eprintln!("Semantic index built successfully!");
640    
641    // After building, search again - inline to avoid recursion
642    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
643    
644    // Load file metadata
645    let embeddings_data = fs::read_to_string(&embeddings_path)?;
646    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
647    
648    // Create embedder and embed the query
649    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
650    let query_embeddings = embedder.embed(&[options.query.clone()])?;
651    
652    if query_embeddings.is_empty() {
653        return Ok(Vec::new());
654    }
655    
656    let query_embedding = &query_embeddings[0];
657    
658    // Search using ANN
659    let top_k = options.top_k.unwrap_or(10);
660    let similar_docs = ann_index.search(query_embedding, top_k);
661    
662    let mut results = Vec::new();
663    
664    // Check if we're searching a specific file vs. a directory
665    let filter_by_file = options.path.is_file();
666    let target_file = if filter_by_file {
667        Some(options.path.canonicalize().unwrap_or_else(|_| options.path.clone()))
668    } else {
669        None
670    };
671    
672    for (doc_id, similarity) in similar_docs {
673        // Apply threshold filtering
674        if let Some(threshold) = options.threshold {
675            if similarity < threshold {
676                continue;
677            }
678        }
679        
680        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
681            // Filter by target file if specified
682            if let Some(target) = &target_file {
683                let canonical_result = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
684                if canonical_result != *target {
685                    continue; // Skip this result if it doesn't match the target file
686                }
687            }
688            
689            // If full_section is enabled and this is a code section, return the full content
690            let preview = if options.full_section {
691                content.clone()
692            } else {
693                content.lines().take(3).collect::<Vec<_>>().join("\n")
694            };
695            
696            results.push(SearchResult {
697                file: file_path.clone(),
698                span: Span {
699                    byte_start: 0,
700                    byte_end: content.len(),
701                    line_start: 1,
702                    line_end: content.lines().count(),
703                },
704                score: similarity,
705                preview,
706                lang: detect_language(file_path),
707                symbol: None,
708            });
709        }
710    }
711    
712    Ok(results)
713}
714
715#[allow(dead_code)]
716async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
717    hybrid_search_with_progress(options, None).await
718}
719
720async fn hybrid_search_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
721    if let Some(ref callback) = progress_callback {
722        callback("Running regex search...");
723    }
724    let regex_results = regex_search(options)?;
725    
726    if let Some(ref callback) = progress_callback {
727        callback("Running semantic search...");
728    }
729    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
730    
731    let mut combined = HashMap::new();
732    
733    for (rank, result) in regex_results.iter().enumerate() {
734        let key = format!("{}:{}", result.file.display(), result.span.line_start);
735        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
736    }
737    
738    for (rank, result) in semantic_results.iter().enumerate() {
739        let key = format!("{}:{}", result.file.display(), result.span.line_start);
740        combined.entry(key).or_insert(Vec::new()).push((rank + 1, result.clone()));
741    }
742    
743    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
744    let mut rrf_results: Vec<SearchResult> = combined
745        .into_iter()
746        .map(|(_, ranks)| {
747            let mut result = ranks[0].1.clone();
748            let rrf_score = ranks.iter().map(|(rank, _)| 1.0 / (60.0 + *rank as f32)).sum();
749            result.score = rrf_score;
750            result
751        })
752        .filter(|result| {
753            // Apply threshold filtering to raw RRF scores
754            if let Some(threshold) = options.threshold {
755                result.score >= threshold
756            } else {
757                true
758            }
759        })
760        .collect();
761    
762    // Sort by RRF score (highest first)
763    rrf_results.sort_by(|a, b| {
764        b.score
765            .partial_cmp(&a.score)
766            .unwrap_or(std::cmp::Ordering::Equal)
767    });
768    
769    if let Some(top_k) = options.top_k {
770        rrf_results.truncate(top_k);
771    }
772    
773    Ok(rrf_results)
774}
775
776fn build_globset(patterns: &[String]) -> GlobSet {
777    let mut builder = GlobSetBuilder::new();
778    for pat in patterns {
779        // Treat patterns as filename or directory globs
780        if let Ok(glob) = Glob::new(pat) {
781            builder.add(glob);
782        }
783    }
784    builder.build().unwrap_or_else(|_| GlobSet::empty())
785}
786
787fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
788    let globset = build_globset(exclude_patterns);
789    // Match against each path component and the full path
790    if globset.is_match(path) {
791        return true;
792    }
793    for component in path.components() {
794        if let std::path::Component::Normal(name) = component {
795            if globset.is_match(name) {
796                return true;
797            }
798        }
799    }
800    false
801}
802
803fn collect_files(path: &Path, recursive: bool, exclude_patterns: &[String]) -> Result<Vec<PathBuf>> {
804    let mut files = Vec::new();
805    let globset = build_globset(exclude_patterns);
806    
807    if path.is_file() {
808        // Always add single files, even if they're excluded (user explicitly requested)
809        files.push(path.to_path_buf());
810    } else if recursive {
811        for entry in WalkDir::new(path)
812            .into_iter()
813            .filter_entry(|e| {
814                // Skip excluded directories entirely for efficiency
815                let name = e.file_name();
816                !globset.is_match(e.path()) && !globset.is_match(name)
817            }) {
818            match entry {
819                Ok(entry) => {
820                    if entry.file_type().is_file() && !should_exclude_path(entry.path(), exclude_patterns) {
821                        files.push(entry.path().to_path_buf());
822                    }
823                }
824                Err(e) => {
825                    // Log directory traversal errors but continue processing
826                    tracing::debug!("Skipping path due to error: {}", e);
827                    continue;
828                }
829            }
830        }
831    } else {
832        match fs::read_dir(path) {
833            Ok(read_dir) => {
834                for entry in read_dir {
835                    match entry {
836                        Ok(entry) => {
837                            let path = entry.path();
838                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
839                                files.push(path);
840                            }
841                        }
842                        Err(e) => {
843                            tracing::debug!("Skipping directory entry due to error: {}", e);
844                            continue;
845                        }
846                    }
847                }
848            }
849            Err(e) => {
850                tracing::debug!("Cannot read directory {:?}: {}", path, e);
851                return Err(e.into());
852            }
853        }
854    }
855    
856    Ok(files)
857}
858
/// Maps a file's extension to a language identifier.
///
/// Known extensions are translated to a canonical language name (for example
/// `rs` becomes `rust`, and `h`/`hpp` become `cpp`). Any other extension is
/// returned unchanged. Matching is case-sensitive.
///
/// Returns `None` when the path has no extension or the extension is not
/// valid UTF-8.
fn detect_language(path: &Path) -> Option<String> {
    let extension = path.extension()?.to_str()?;

    let language = match extension {
        "rs" => "rust",
        "py" => "python",
        "js" => "javascript",
        "ts" => "typescript",
        "hs" | "lhs" => "haskell",
        "go" => "go",
        "java" => "java",
        "c" => "c",
        "cpp" | "cc" | "cxx" | "h" | "hpp" => "cpp",
        "cs" => "csharp",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" => "kotlin",
        other => other,
    };

    Some(language.to_owned())
}
882
883async fn ensure_index_updated(path: &Path, force_reindex: bool, need_embeddings: bool) -> Result<()> {
884    
885    // Handle both files and directories and reuse nearest existing .ck index up the tree
886    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
887        if path.is_file() {
888            path.parent().unwrap_or(path).to_path_buf()
889        } else {
890            path.to_path_buf()
891        }
892    });
893    let index_root = &index_root_buf;
894    
895    // If force reindex is requested, always update
896    if force_reindex {
897        let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings).await?;
898        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
899            tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
900                          stats.files_indexed, stats.orphaned_files_removed);
901        }
902        return Ok(());
903    }
904    
905    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
906    let stats = ck_index::smart_update_index_with_progress(index_root, false, None, need_embeddings).await?;
907    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
908        tracing::info!("Index updated: {} files indexed, {} orphaned files removed", 
909                      stats.files_indexed, stats.orphaned_files_removed);
910    }
911    
912    Ok(())
913}
914
915fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
916    let before = options.before_context_lines.max(options.context_lines);
917    let after = options.after_context_lines.max(options.context_lines);
918    
919    if before > 0 || after > 0 {
920        let start_idx = line_idx.saturating_sub(before);
921        let end_idx = (line_idx + after + 1).min(lines.len());
922        lines[start_idx..end_idx].join("\n")
923    } else {
924        lines[line_idx].to_string()
925    }
926}
927
928fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
929    // Detect language for tree-sitter parsing
930    let lang = match file_path.extension().and_then(|s| s.to_str()) {
931        Some("py") => Some("python"),
932        Some("js") => Some("javascript"),
933        Some("ts") | Some("tsx") => Some("typescript"),
934        Some("hs") | Some("lhs") => Some("haskell"),
935        _ => return None,
936    };
937    
938    // Parse the file with tree-sitter and extract function/class sections
939    if let Ok(chunks) = ck_chunk::chunk_text(content, lang) {
940        let sections: Vec<(usize, usize, String)> = chunks
941            .into_iter()
942            .filter(|chunk| matches!(
943                chunk.chunk_type,
944                ck_chunk::ChunkType::Function | 
945                ck_chunk::ChunkType::Class | 
946                ck_chunk::ChunkType::Method
947            ))
948            .map(|chunk| {
949                (
950                    chunk.span.line_start - 1,  // Convert to 0-based index
951                    chunk.span.line_end - 1,
952                    chunk.text,
953                )
954            })
955            .collect();
956        
957        if sections.is_empty() {
958            None
959        } else {
960            Some(sections)
961        }
962    } else {
963        None
964    }
965}
966
/// Returns the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are tested in slice order, so when ranges overlap the earliest
/// entry wins.
fn find_containing_section(sections: &[(usize, usize, String)], line_idx: usize) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
975
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes a small fixed set of fixture files directly into `dir` and
    /// returns their paths in creation order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        const FIXTURES: [(&str, &str); 4] = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        FIXTURES
            .iter()
            .map(|&(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Recursive regex-mode options for `query`, rooted at `root`; all other
    /// fields keep their defaults.
    fn regex_options(root: &std::path::Path, query: &str) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: root.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_detect_language() {
        let expectations = [
            ("test.rs", Some("rust")),
            ("test.py", Some("python")),
            ("test.js", Some("javascript")),
            ("test.hs", Some("haskell")),
            ("test.lhs", Some("haskell")),
            ("test.unknown", Some("unknown")),
            ("noext", None),
        ];

        for &(file_name, expected) in expectations.iter() {
            assert_eq!(
                detect_language(&PathBuf::from(file_name)),
                expected.map(String::from)
            );
        }
    }

    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let fixtures = create_test_files(temp_dir.path());

        // The fixture has no subdirectories, so the flat and the recursive
        // listing both see the same four files.
        for &recursive in &[false, true] {
            let found = collect_files(temp_dir.path(), recursive, &[]).unwrap();
            assert_eq!(found.len(), 4);
        }

        // A single file is returned as-is.
        let single = collect_files(&fixtures[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], fixtures[0]);
    }

    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let hits = regex_search(&regex_options(temp_dir.path(), "rust")).unwrap();
        assert!(!hits.is_empty());

        // At least one preview must come from a file containing "rust".
        assert!(hits
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust")));
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options(temp_dir.path(), "HELLO")
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options(temp_dir.path(), "fn main()")
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        fs::write(temp_dir.path().join("word_test.txt"), "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options(temp_dir.path(), "rust")
        };

        // Intent: only the standalone word "rust" should match, not "rusty" or
        // "rustacean". Only non-emptiness is asserted here.
        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        // Ten files that all contain a match.
        (0..10).for_each(|i| {
            fs::write(temp_dir.path().join(format!("file{}.txt", i)), "test content").unwrap();
        });

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options(temp_dir.path(), "test")
        };

        let hits = regex_search(&options).unwrap();
        assert!(hits.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Span offsets must be computed per match, including several matches
        // on the same line.
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: test_file.clone(),
            recursive: false,
            ..Default::default()
        };

        let hits = regex_search(&options).unwrap();

        // Five matches overall.
        assert_eq!(hits.len(), 5);

        // Line 1 holds three matches at byte offsets 0, 5 and 10.
        let first_line: Vec<_> = hits.iter().filter(|hit| hit.span.line_start == 1).collect();
        assert_eq!(first_line.len(), 3);
        for (hit, &expected_start) in first_line.iter().zip([0, 5, 10].iter()) {
            assert_eq!(hit.span.byte_start, expected_start);
        }

        // Line 2 holds one match.
        // "test test test\n" = 15 bytes, "line two " = 9 bytes
        let second_line: Vec<_> = hits.iter().filter(|hit| hit.span.line_start == 2).collect();
        assert_eq!(second_line.len(), 1);
        assert_eq!(second_line[0].span.byte_start, 24);

        // Every match must start at a distinct byte offset.
        let mut starts: Vec<_> = hits.iter().map(|hit| hit.span.byte_start).collect();
        starts.sort();
        starts.dedup();
        assert_eq!(starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1: hello\nline 2: world\nline 3: rust programming").unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let hits = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(hits.len(), 1);

        let hit = &hits[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let hits = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(hits.len(), 1);

        let preview = &hits[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one line of context on each side means the
        // preview spans lines 2 through 4.
        for expected in &["line 2", "target line", "line 4"] {
            assert!(preview.contains(expected));
        }
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options(temp_dir.path(), "hello")
        };

        let hits = search(&options).await.unwrap();
        assert!(!hits.is_empty());
    }
}