ck_engine/
lib.rs

1use anyhow::Result;
2use ck_ann::AnnIndex;
3use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
4use globset::{Glob, GlobSet, GlobSetBuilder};
5use rayon::prelude::*;
6use regex::{Regex, RegexBuilder};
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf as StdPathBuf;
10use std::path::{Path, PathBuf};
11use tantivy::collector::TopDocs;
12use tantivy::query::QueryParser;
13use tantivy::schema::{STORED, Schema, TEXT, Value};
14use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
15use walkdir::WalkDir;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
/// Boxed, thread-safe (`Send + Sync`) callback that receives progress messages as `&str`.
///
/// The search functions in this module invoke it with short status strings such as
/// `"Running regex search..."` while they work.
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = tokio::fs::read_to_string(file_path).await?;
25    let lines: Vec<&str> = content.lines().collect();
26
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Walks upward from `path` and returns the first directory that contains a `.ck` entry.
///
/// If `path` is an existing file the walk starts at its parent directory (or at `path`
/// itself when it has no parent); otherwise it starts at `path`. Returns `None` when no
/// directory on the way up contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = match path.parent() {
        Some(parent) if path.is_file() => parent,
        _ => path,
    };

    // `ancestors` yields `start` first and then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
57
/// Runs the search described by `options` without progress reporting.
///
/// Equivalent to calling [`search_with_progress`] with `None` as the callback.
pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    search_with_progress(options, None).await
}
61
62pub async fn search_with_progress(
63    options: &SearchOptions,
64    progress_callback: Option<SearchProgressCallback>,
65) -> Result<Vec<SearchResult>> {
66    // Validate that the search path exists
67    if !options.path.exists() {
68        return Err(ck_core::CkError::Search(format!(
69            "Path does not exist: {}",
70            options.path.display()
71        ))
72        .into());
73    }
74
75    // Auto-update index if needed (unless it's regex-only mode)
76    if !matches!(options.mode, SearchMode::Regex) {
77        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
78        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
79    }
80
81    match options.mode {
82        SearchMode::Regex => regex_search(options),
83        SearchMode::Lexical => lexical_search(options).await,
84        SearchMode::Semantic => {
85            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
86            semantic_search_v3_with_progress(options, progress_callback).await
87        }
88        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
89    }
90}
91
92fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
93    let pattern = if options.fixed_string {
94        regex::escape(&options.query)
95    } else if options.whole_word {
96        format!(r"\b{}\b", regex::escape(&options.query))
97    } else {
98        options.query.clone()
99    };
100
101    let regex = RegexBuilder::new(&pattern)
102        .case_insensitive(options.case_insensitive)
103        .build()
104        .map_err(CkError::Regex)?;
105
106    // Default to recursive for directories (like grep) to maintain compatibility
107    let should_recurse = options.path.is_dir() || options.recursive;
108    let files = if should_recurse {
109        // Use ck_index's collect_files which respects gitignore
110        ck_index::collect_files(
111            &options.path,
112            options.respect_gitignore,
113            &options.exclude_patterns,
114        )?
115    } else {
116        // For non-recursive, use the local collect_files
117        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
118    };
119
120    let results: Vec<Vec<SearchResult>> = files
121        .par_iter()
122        .filter_map(|file_path| match search_file(&regex, file_path, options) {
123            Ok(matches) => {
124                if matches.is_empty() {
125                    None
126                } else {
127                    Some(matches)
128                }
129            }
130            Err(e) => {
131                tracing::debug!("Error searching {:?}: {}", file_path, e);
132                None
133            }
134        })
135        .collect();
136
137    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
138    // Deterministic ordering: file path, then line number
139    all_results.sort_by(|a, b| {
140        let path_cmp = a.file.cmp(&b.file);
141        if path_cmp != std::cmp::Ordering::Equal {
142            return path_cmp;
143        }
144        a.span.line_start.cmp(&b.span.line_start)
145    });
146
147    if let Some(top_k) = options.top_k {
148        all_results.truncate(top_k);
149    }
150
151    Ok(all_results)
152}
153
154fn search_file(
155    regex: &Regex,
156    file_path: &Path,
157    options: &SearchOptions,
158) -> Result<Vec<SearchResult>> {
159    let content = fs::read_to_string(file_path)?;
160    let lines: Vec<&str> = content.lines().collect();
161    let mut results = Vec::new();
162
163    // If full_section is enabled, try to parse the file and find code sections
164    let code_sections = if options.full_section {
165        extract_code_sections(file_path, &content)
166    } else {
167        None
168    };
169
170    // Track byte offset as we iterate through lines
171    let mut byte_offset = 0;
172
173    for (line_idx, line) in lines.iter().enumerate() {
174        let line_number = line_idx + 1;
175
176        // Find all matches in the line with their positions
177        for mat in regex.find_iter(line) {
178            let preview = if options.full_section {
179                // Try to find the containing code section
180                if let Some(ref sections) = code_sections {
181                    if let Some(section) = find_containing_section(sections, line_idx) {
182                        section.clone()
183                    } else {
184                        // Fall back to context lines if no section found
185                        get_context_preview(&lines, line_idx, options)
186                    }
187                } else {
188                    get_context_preview(&lines, line_idx, options)
189                }
190            } else {
191                get_context_preview(&lines, line_idx, options)
192            };
193
194            results.push(SearchResult {
195                file: file_path.to_path_buf(),
196                span: Span {
197                    byte_start: byte_offset + mat.start(),
198                    byte_end: byte_offset + mat.end(),
199                    line_start: line_number,
200                    line_end: line_number,
201                },
202                score: 1.0,
203                preview,
204                lang: ck_core::Language::from_path(file_path),
205                symbol: None,
206                chunk_hash: None,
207                index_epoch: None,
208            });
209        }
210
211        // Update byte offset for next line (add line length + newline character)
212        byte_offset += line.len();
213        if line_idx < lines.len() - 1 {
214            byte_offset += 1; // Add 1 for the newline character
215        }
216    }
217
218    Ok(results)
219}
220
221async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
222    // Handle both files and directories and reuse nearest existing .ck index up the tree
223    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
224        if options.path.is_file() {
225            options.path.parent().unwrap_or(&options.path).to_path_buf()
226        } else {
227            options.path.clone()
228        }
229    });
230
231    let index_dir = index_root.join(".ck");
232    if !index_dir.exists() {
233        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
234    }
235
236    let tantivy_index_path = index_dir.join("tantivy_index");
237
238    if !tantivy_index_path.exists() {
239        return build_tantivy_index(options).await;
240    }
241
242    let mut schema_builder = Schema::builder();
243    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
244    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
245    let _schema = schema_builder.build();
246
247    let index = Index::open_in_dir(&tantivy_index_path)
248        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
249
250    let reader = index
251        .reader_builder()
252        .reload_policy(ReloadPolicy::OnCommitWithDelay)
253        .try_into()
254        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
255
256    let searcher = reader.searcher();
257    let query_parser = QueryParser::for_index(&index, vec![content_field]);
258
259    let query = query_parser
260        .parse_query(&options.query)
261        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
262
263    let top_docs = if let Some(top_k) = options.top_k {
264        searcher.search(&query, &TopDocs::with_limit(top_k))?
265    } else {
266        searcher.search(&query, &TopDocs::with_limit(100))?
267    };
268
269    // First, collect all results with raw scores
270    let mut raw_results = Vec::new();
271    for (_score, doc_address) in top_docs {
272        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
273        let path_text = retrieved_doc
274            .get_first(path_field)
275            .map(|field_value| field_value.as_str().unwrap_or(""))
276            .unwrap_or("");
277        let content_text = retrieved_doc
278            .get_first(content_field)
279            .map(|field_value| field_value.as_str().unwrap_or(""))
280            .unwrap_or("");
281
282        let file_path = PathBuf::from(path_text);
283        let preview = if options.full_section {
284            content_text.to_string()
285        } else {
286            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
287        };
288
289        raw_results.push((
290            _score,
291            SearchResult {
292                file: file_path,
293                span: Span {
294                    byte_start: 0,
295                    byte_end: content_text.len(),
296                    line_start: 1,
297                    line_end: content_text.lines().count(),
298                },
299                score: _score,
300                preview,
301                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
302                symbol: None,
303                chunk_hash: None,
304                index_epoch: None,
305            },
306        ));
307    }
308
309    // Normalize scores to 0-1 range and apply threshold
310    let mut results = Vec::new();
311    if !raw_results.is_empty() {
312        let max_score = raw_results
313            .iter()
314            .map(|(score, _)| *score)
315            .fold(0.0f32, f32::max);
316        if max_score > 0.0 {
317            for (raw_score, mut result) in raw_results {
318                let normalized_score = raw_score / max_score;
319
320                // Apply threshold filtering with normalized score
321                if let Some(threshold) = options.threshold
322                    && normalized_score < threshold
323                {
324                    continue;
325                }
326
327                result.score = normalized_score;
328                results.push(result);
329            }
330        }
331    }
332
333    Ok(results)
334}
335
336async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
337    // Handle both files and directories by finding the appropriate directory for indexing
338    let index_root = if options.path.is_file() {
339        options.path.parent().unwrap_or(&options.path)
340    } else {
341        &options.path
342    };
343
344    let index_dir = index_root.join(".ck");
345    let tantivy_index_path = index_dir.join("tantivy_index");
346
347    fs::create_dir_all(&tantivy_index_path)?;
348
349    let mut schema_builder = Schema::builder();
350    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
351    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
352    let schema = schema_builder.build();
353
354    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
355        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
356
357    let mut index_writer = index
358        .writer(50_000_000)
359        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
360
361    let files = collect_files(index_root, true, &options.exclude_patterns)?;
362
363    for file_path in &files {
364        if let Ok(content) = fs::read_to_string(file_path) {
365            let doc = doc!(
366                content_field => content,
367                path_field => file_path.display().to_string()
368            );
369            index_writer.add_document(doc)?;
370        }
371    }
372
373    index_writer
374        .commit()
375        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
376
377    // After building, search again with the same options
378    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
379    let mut schema_builder = Schema::builder();
380    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
381    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
382    let _schema = schema_builder.build();
383
384    let index = Index::open_in_dir(&tantivy_index_path)
385        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
386
387    let reader = index
388        .reader_builder()
389        .reload_policy(ReloadPolicy::OnCommitWithDelay)
390        .try_into()
391        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
392
393    let searcher = reader.searcher();
394    let query_parser = QueryParser::for_index(&index, vec![content_field]);
395
396    let query = query_parser
397        .parse_query(&options.query)
398        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
399
400    let top_docs = if let Some(top_k) = options.top_k {
401        searcher.search(&query, &TopDocs::with_limit(top_k))?
402    } else {
403        searcher.search(&query, &TopDocs::with_limit(100))?
404    };
405
406    // First, collect all results with raw scores
407    let mut raw_results = Vec::new();
408    for (_score, doc_address) in top_docs {
409        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
410        let path_text = retrieved_doc
411            .get_first(path_field)
412            .map(|field_value| field_value.as_str().unwrap_or(""))
413            .unwrap_or("");
414        let content_text = retrieved_doc
415            .get_first(content_field)
416            .map(|field_value| field_value.as_str().unwrap_or(""))
417            .unwrap_or("");
418
419        let file_path = PathBuf::from(path_text);
420        let preview = if options.full_section {
421            content_text.to_string()
422        } else {
423            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
424        };
425
426        raw_results.push((
427            _score,
428            SearchResult {
429                file: file_path,
430                span: Span {
431                    byte_start: 0,
432                    byte_end: content_text.len(),
433                    line_start: 1,
434                    line_end: content_text.lines().count(),
435                },
436                score: _score,
437                preview,
438                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
439                symbol: None,
440                chunk_hash: None,
441                index_epoch: None,
442            },
443        ));
444    }
445
446    // Normalize scores to 0-1 range and apply threshold
447    let mut results = Vec::new();
448    if !raw_results.is_empty() {
449        let max_score = raw_results
450            .iter()
451            .map(|(score, _)| *score)
452            .fold(0.0f32, f32::max);
453        if max_score > 0.0 {
454            for (raw_score, mut result) in raw_results {
455                let normalized_score = raw_score / max_score;
456
457                // Apply threshold filtering with normalized score
458                if let Some(threshold) = options.threshold
459                    && normalized_score < threshold
460                {
461                    continue;
462                }
463
464                result.score = normalized_score;
465                results.push(result);
466            }
467        }
468    }
469
470    Ok(results)
471}
472
/// Runs the ANN-file based semantic search without progress reporting.
///
/// Thin wrapper around [`semantic_search_with_progress`] with `None` as the callback.
// NOTE(review): `search_with_progress` dispatches semantic mode to the v3 implementation,
// so this wrapper appears unused in the visible part of the file — hence `dead_code`.
#[allow(dead_code)]
async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    semantic_search_with_progress(options, None).await
}
477
478async fn semantic_search_with_progress(
479    options: &SearchOptions,
480    progress_callback: Option<SearchProgressCallback>,
481) -> Result<Vec<SearchResult>> {
482    // Handle both files and directories and reuse nearest existing .ck index up the tree
483    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
484        if options.path.is_file() {
485            options.path.parent().unwrap_or(&options.path).to_path_buf()
486        } else {
487            options.path.clone()
488        }
489    });
490
491    let index_dir = index_root.join(".ck");
492    if !index_dir.exists() {
493        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
494    }
495
496    let ann_index_path = index_dir.join("ann_index.bin");
497    let embeddings_path = index_dir.join("embeddings.json");
498
499    if !ann_index_path.exists() || !embeddings_path.exists() {
500        return build_semantic_index_with_progress(options, progress_callback).await;
501    }
502
503    // Load the ANN index
504    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
505
506    // Load file metadata
507    let embeddings_data = fs::read_to_string(&embeddings_path)?;
508    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
509
510    // Create embedder and embed the query
511    if let Some(ref callback) = progress_callback {
512        callback("Loading embedding model...");
513    }
514
515    let mut embedder = if let Some(ref callback) = progress_callback {
516        let _cb = callback.as_ref();
517        let model_cb = Box::new(|msg: &str| {
518            // Note: We can't directly use the callback here due to lifetime issues
519            // For now, we'll just use eprintln! until we can restructure this better
520            eprintln!("Model: {}", msg);
521        }) as ck_embed::ModelDownloadCallback;
522        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
523    } else {
524        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
525    };
526    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
527
528    if query_embeddings.is_empty() {
529        return Ok(Vec::new());
530    }
531
532    let query_embedding = &query_embeddings[0];
533
534    // Search using ANN
535    let top_k = options.top_k.unwrap_or(10);
536    let similar_docs = ann_index.search(query_embedding, top_k);
537
538    let mut results = Vec::new();
539
540    // Check if we're searching a specific file vs. a directory
541    let filter_by_file = options.path.is_file();
542    let target_file = if filter_by_file {
543        Some(
544            options
545                .path
546                .canonicalize()
547                .unwrap_or_else(|_| options.path.clone()),
548        )
549    } else {
550        None
551    };
552
553    for (doc_id, similarity) in similar_docs {
554        // Apply threshold filtering
555        if let Some(threshold) = options.threshold
556            && similarity < threshold
557        {
558            continue;
559        }
560
561        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
562            // Filter by target file if specified
563            if let Some(target) = &target_file {
564                let canonical_result = file_path
565                    .canonicalize()
566                    .unwrap_or_else(|_| file_path.clone());
567                if canonical_result != *target {
568                    continue; // Skip this result if it doesn't match the target file
569                }
570            }
571
572            // If full_section is enabled and this is a code section, return the full content
573            let preview = if options.full_section {
574                content.clone()
575            } else {
576                content.lines().take(3).collect::<Vec<_>>().join("\n")
577            };
578
579            results.push(SearchResult {
580                file: file_path.clone(),
581                span: Span {
582                    byte_start: 0,
583                    byte_end: content.len(),
584                    line_start: 1,
585                    line_end: content.lines().count(),
586                },
587                score: similarity,
588                preview,
589                lang: ck_core::Language::from_path(file_path),
590                symbol: None,
591                chunk_hash: None,
592                index_epoch: None,
593            });
594        }
595    }
596
597    Ok(results)
598}
599
/// Builds the ANN-file based semantic index and runs the query, without progress
/// reporting.
///
/// Thin wrapper around [`build_semantic_index_with_progress`] with `None` as the callback.
// NOTE(review): no caller is visible in this part of the file — hence `dead_code`.
#[allow(dead_code)]
async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    build_semantic_index_with_progress(options, None).await
}
604
605async fn build_semantic_index_with_progress(
606    options: &SearchOptions,
607    progress_callback: Option<SearchProgressCallback>,
608) -> Result<Vec<SearchResult>> {
609    // Handle both files and directories by finding the appropriate directory for indexing
610    let index_root = if options.path.is_file() {
611        options.path.parent().unwrap_or(&options.path)
612    } else {
613        &options.path
614    };
615
616    let index_dir = index_root.join(".ck");
617    let ann_index_path = index_dir.join("ann_index.bin");
618    let embeddings_path = index_dir.join("embeddings.json");
619
620    fs::create_dir_all(&index_dir)?;
621
622    if let Some(ref callback) = progress_callback {
623        callback("Building semantic index (no index found)...");
624    }
625
626    // Always print this important message, even in quiet mode for indexing operations
627    eprintln!("Building semantic index (no existing index found)...");
628
629    // Collect files and their content
630    let files = collect_files(index_root, true, &options.exclude_patterns)?;
631
632    if let Some(ref callback) = progress_callback {
633        callback(&format!("Found {} files to index", files.len()));
634    }
635    eprintln!("Found {} files to embed and index", files.len());
636
637    let mut file_embeddings = Vec::new();
638    let mut embeddings = Vec::new();
639
640    // Create embedder with progress callback
641    if let Some(ref callback) = progress_callback {
642        callback("Loading embedding model...");
643    }
644
645    let model_callback = if progress_callback.is_some() {
646        Some(Box::new(|msg: &str| {
647            eprintln!("Model: {}", msg);
648        }) as ck_embed::ModelDownloadCallback)
649    } else {
650        None
651    };
652
653    let mut embedder =
654        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
655
656    if let Some(ref callback) = progress_callback {
657        callback("Generating embeddings for code chunks...");
658    }
659
660    for (file_idx, file_path) in files.iter().enumerate() {
661        if let Ok(content) = fs::read_to_string(file_path) {
662            if let Some(ref callback) = progress_callback {
663                let file_name = file_path
664                    .file_name()
665                    .map(|n| n.to_string_lossy().to_string())
666                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
667                callback(&format!(
668                    "Processing {}/{}: {}",
669                    file_idx + 1,
670                    files.len(),
671                    file_name
672                ));
673            }
674
675            // Chunk the content for better embeddings
676            let chunks = ck_chunk::chunk_text(&content, ck_core::Language::from_path(file_path))?;
677
678            for chunk in chunks {
679                let chunk_embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
680                if !chunk_embeddings.is_empty() {
681                    embeddings.push(chunk_embeddings[0].clone());
682                    file_embeddings.push((file_path.clone(), chunk.text));
683                }
684            }
685        }
686    }
687
688    if let Some(ref callback) = progress_callback {
689        callback(&format!(
690            "Built {} embeddings, creating search index...",
691            embeddings.len()
692        ));
693    }
694    eprintln!(
695        "Generated {} embeddings, building search index...",
696        embeddings.len()
697    );
698
699    // Build ANN index
700    let index = ck_ann::SimpleIndex::build(&embeddings)?;
701    index.save(&ann_index_path)?;
702
703    // Save file embeddings metadata
704    let embeddings_json = serde_json::to_string(&file_embeddings)?;
705    fs::write(&embeddings_path, embeddings_json)?;
706
707    if let Some(ref callback) = progress_callback {
708        callback("Semantic index built successfully, running search...");
709    }
710    eprintln!("Semantic index built successfully!");
711
712    // After building, search again - inline to avoid recursion
713    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
714
715    // Load file metadata
716    let embeddings_data = fs::read_to_string(&embeddings_path)?;
717    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
718
719    // Create embedder and embed the query
720    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
721    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
722
723    if query_embeddings.is_empty() {
724        return Ok(Vec::new());
725    }
726
727    let query_embedding = &query_embeddings[0];
728
729    // Search using ANN
730    let top_k = options.top_k.unwrap_or(10);
731    let similar_docs = ann_index.search(query_embedding, top_k);
732
733    let mut results = Vec::new();
734
735    // Check if we're searching a specific file vs. a directory
736    let filter_by_file = options.path.is_file();
737    let target_file = if filter_by_file {
738        Some(
739            options
740                .path
741                .canonicalize()
742                .unwrap_or_else(|_| options.path.clone()),
743        )
744    } else {
745        None
746    };
747
748    for (doc_id, similarity) in similar_docs {
749        // Apply threshold filtering
750        if let Some(threshold) = options.threshold
751            && similarity < threshold
752        {
753            continue;
754        }
755
756        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
757            // Filter by target file if specified
758            if let Some(target) = &target_file {
759                let canonical_result = file_path
760                    .canonicalize()
761                    .unwrap_or_else(|_| file_path.clone());
762                if canonical_result != *target {
763                    continue; // Skip this result if it doesn't match the target file
764                }
765            }
766
767            // If full_section is enabled and this is a code section, return the full content
768            let preview = if options.full_section {
769                content.clone()
770            } else {
771                content.lines().take(3).collect::<Vec<_>>().join("\n")
772            };
773
774            results.push(SearchResult {
775                file: file_path.clone(),
776                span: Span {
777                    byte_start: 0,
778                    byte_end: content.len(),
779                    line_start: 1,
780                    line_end: content.lines().count(),
781                },
782                score: similarity,
783                preview,
784                lang: ck_core::Language::from_path(file_path),
785                symbol: None,
786                chunk_hash: None,
787                index_epoch: None,
788            });
789        }
790    }
791
792    Ok(results)
793}
794
/// Runs the hybrid (regex + semantic) search without progress reporting.
///
/// Thin wrapper around [`hybrid_search_with_progress`] with `None` as the callback.
// NOTE(review): `search_with_progress` calls the `_with_progress` variant directly, so
// this wrapper appears unused in the visible part of the file — hence `dead_code`.
#[allow(dead_code)]
async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    hybrid_search_with_progress(options, None).await
}
799
800async fn hybrid_search_with_progress(
801    options: &SearchOptions,
802    progress_callback: Option<SearchProgressCallback>,
803) -> Result<Vec<SearchResult>> {
804    if let Some(ref callback) = progress_callback {
805        callback("Running regex search...");
806    }
807    let regex_results = regex_search(options)?;
808
809    if let Some(ref callback) = progress_callback {
810        callback("Running semantic search...");
811    }
812    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
813
814    let mut combined = HashMap::new();
815
816    for (rank, result) in regex_results.iter().enumerate() {
817        let key = format!("{}:{}", result.file.display(), result.span.line_start);
818        combined
819            .entry(key)
820            .or_insert(Vec::new())
821            .push((rank + 1, result.clone()));
822    }
823
824    for (rank, result) in semantic_results.iter().enumerate() {
825        let key = format!("{}:{}", result.file.display(), result.span.line_start);
826        combined
827            .entry(key)
828            .or_insert(Vec::new())
829            .push((rank + 1, result.clone()));
830    }
831
832    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
833    let mut rrf_results: Vec<SearchResult> = combined
834        .into_values()
835        .map(|ranks| {
836            let mut result = ranks[0].1.clone();
837            let rrf_score = ranks
838                .iter()
839                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
840                .sum();
841            result.score = rrf_score;
842            result
843        })
844        .filter(|result| {
845            // Apply threshold filtering to raw RRF scores
846            if let Some(threshold) = options.threshold {
847                result.score >= threshold
848            } else {
849                true
850            }
851        })
852        .collect();
853
854    // Sort by RRF score (highest first)
855    rrf_results.sort_by(|a, b| {
856        b.score
857            .partial_cmp(&a.score)
858            .unwrap_or(std::cmp::Ordering::Equal)
859    });
860
861    if let Some(top_k) = options.top_k {
862        rrf_results.truncate(top_k);
863    }
864
865    Ok(rrf_results)
866}
867
868fn build_globset(patterns: &[String]) -> GlobSet {
869    let mut builder = GlobSetBuilder::new();
870    for pat in patterns {
871        // Treat patterns as filename or directory globs
872        if let Ok(glob) = Glob::new(pat) {
873            builder.add(glob);
874        }
875    }
876    builder.build().unwrap_or_else(|_| GlobSet::empty())
877}
878
879fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
880    let globset = build_globset(exclude_patterns);
881    // Match against each path component and the full path
882    if globset.is_match(path) {
883        return true;
884    }
885    for component in path.components() {
886        if let std::path::Component::Normal(name) = component
887            && globset.is_match(name)
888        {
889            return true;
890        }
891    }
892    false
893}
894
895fn collect_files(
896    path: &Path,
897    recursive: bool,
898    exclude_patterns: &[String],
899) -> Result<Vec<PathBuf>> {
900    let mut files = Vec::new();
901    let globset = build_globset(exclude_patterns);
902
903    if path.is_file() {
904        // Always add single files, even if they're excluded (user explicitly requested)
905        files.push(path.to_path_buf());
906    } else if recursive {
907        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
908            // Skip excluded directories entirely for efficiency
909            let name = e.file_name();
910            !globset.is_match(e.path()) && !globset.is_match(name)
911        }) {
912            match entry {
913                Ok(entry) => {
914                    if entry.file_type().is_file()
915                        && !should_exclude_path(entry.path(), exclude_patterns)
916                    {
917                        files.push(entry.path().to_path_buf());
918                    }
919                }
920                Err(e) => {
921                    // Log directory traversal errors but continue processing
922                    tracing::debug!("Skipping path due to error: {}", e);
923                    continue;
924                }
925            }
926        }
927    } else {
928        match fs::read_dir(path) {
929            Ok(read_dir) => {
930                for entry in read_dir {
931                    match entry {
932                        Ok(entry) => {
933                            let path = entry.path();
934                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
935                                files.push(path);
936                            }
937                        }
938                        Err(e) => {
939                            tracing::debug!("Skipping directory entry due to error: {}", e);
940                            continue;
941                        }
942                    }
943                }
944            }
945            Err(e) => {
946                tracing::debug!("Cannot read directory {:?}: {}", path, e);
947                return Err(e.into());
948            }
949        }
950    }
951
952    Ok(files)
953}
954
955async fn ensure_index_updated(
956    path: &Path,
957    force_reindex: bool,
958    need_embeddings: bool,
959) -> Result<()> {
960    // Handle both files and directories and reuse nearest existing .ck index up the tree
961    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
962        if path.is_file() {
963            path.parent().unwrap_or(path).to_path_buf()
964        } else {
965            path.to_path_buf()
966        }
967    });
968    let index_root = &index_root_buf;
969
970    // If force reindex is requested, always update
971    if force_reindex {
972        let stats = ck_index::smart_update_index_with_progress(
973            index_root,
974            false,
975            None,
976            need_embeddings,
977            true,
978            &[], // Empty exclude patterns for internal engine use
979        )
980        .await?;
981        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
982            tracing::info!(
983                "Index updated: {} files indexed, {} orphaned files removed",
984                stats.files_indexed,
985                stats.orphaned_files_removed
986            );
987        }
988        return Ok(());
989    }
990
991    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
992    let stats = ck_index::smart_update_index_with_progress(
993        index_root,
994        false,
995        None,
996        need_embeddings,
997        true,
998        &[],
999    )
1000    .await?;
1001    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1002        tracing::info!(
1003            "Index updated: {} files indexed, {} orphaned files removed",
1004            stats.files_indexed,
1005            stats.orphaned_files_removed
1006        );
1007    }
1008
1009    Ok(())
1010}
1011
1012fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
1013    let before = options.before_context_lines.max(options.context_lines);
1014    let after = options.after_context_lines.max(options.context_lines);
1015
1016    if before > 0 || after > 0 {
1017        let start_idx = line_idx.saturating_sub(before);
1018        let end_idx = (line_idx + after + 1).min(lines.len());
1019        lines[start_idx..end_idx].join("\n")
1020    } else {
1021        lines[line_idx].to_string()
1022    }
1023}
1024
1025fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1026    let lang = ck_core::Language::from_path(file_path)?;
1027
1028    // Parse the file with tree-sitter and extract function/class sections
1029    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1030        let sections: Vec<(usize, usize, String)> = chunks
1031            .into_iter()
1032            .filter(|chunk| {
1033                matches!(
1034                    chunk.chunk_type,
1035                    ck_chunk::ChunkType::Function
1036                        | ck_chunk::ChunkType::Class
1037                        | ck_chunk::ChunkType::Method
1038                )
1039            })
1040            .map(|chunk| {
1041                (
1042                    chunk.span.line_start - 1, // Convert to 0-based index
1043                    chunk.span.line_end - 1,
1044                    chunk.text,
1045                )
1046            })
1047            .collect();
1048
1049        if sections.is_empty() {
1050            None
1051        } else {
1052            Some(sections)
1053        }
1054    } else {
1055        None
1056    }
1057}
1058
/// Returns the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are checked in slice order, so when ranges overlap the earliest
/// entry wins.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1070
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes the four-file fixture into `dir` and returns the created paths
    /// in the order they were written.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ]
        .into_iter()
        .map(|(name, content)| {
            let path = dir.join(name);
            fs::write(&path, content).unwrap();
            path
        })
        .collect()
    }

    /// Recursive regex-mode options for `query` rooted at `root`; every other
    /// field keeps its default value.
    fn recursive_regex_options(root: &std::path::Path, query: &str) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: root.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();
        let test_files = create_test_files(root);

        // The fixture directory is flat, so the non-recursive and the
        // recursive listing both see the same four files.
        for recursive in [false, true] {
            let files = collect_files(root, recursive, &[]).unwrap();
            assert_eq!(files.len(), 4);
        }

        // A path that is itself a file comes back as the only result.
        let files = collect_files(&test_files[0], false, &[]).unwrap();
        assert_eq!(files.len(), 1);
        assert_eq!(files[0], test_files[0]);
    }

    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = recursive_regex_options(temp_dir.path(), "rust");
        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        // At least one preview must mention "rust", ignoring case.
        let any_rust_preview = results
            .iter()
            .any(|r| r.preview.to_lowercase().contains("rust"));
        assert!(any_rust_preview);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..recursive_regex_options(temp_dir.path(), "HELLO")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            fixed_string: true,
            ..recursive_regex_options(temp_dir.path(), "fn main()")
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        let word_file = temp_dir.path().join("word_test.txt");
        fs::write(word_file, "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..recursive_regex_options(temp_dir.path(), "rust")
        };

        // Only the standalone word "rust" is expected to match, not "rusty"
        // or "rustacean"; this test only checks that something is found.
        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        // Ten files that each contain one match.
        for i in 0..10 {
            let file = temp_dir.path().join(format!("file{}.txt", i));
            fs::write(file, "test content").unwrap();
        }

        let options = SearchOptions {
            top_k: Some(5),
            ..recursive_regex_options(temp_dir.path(), "test")
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Several matches on one line must each report their own byte offset.
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            mode: SearchMode::Regex,
            query: "test".to_string(),
            path: test_file,
            recursive: false,
            ..Default::default()
        };

        let results = regex_search(&options).unwrap();

        // Three on line 1, one on line 2, one on line 3.
        assert_eq!(results.len(), 5);

        // Line 1: three matches at byte offsets 0, 5 and 10.
        let line1_starts: Vec<_> = results
            .iter()
            .filter(|r| r.span.line_start == 1)
            .map(|r| r.span.byte_start)
            .collect();
        assert_eq!(line1_starts, [0, 5, 10]);

        // Line 2: "test test test\n" is 15 bytes and "line two " is 9 bytes.
        let line2_starts: Vec<_> = results
            .iter()
            .filter(|r| r.span.line_start == 2)
            .map(|r| r.span.byte_start)
            .collect();
        assert_eq!(line2_starts, [24]);

        // No two matches may share a byte offset.
        let distinct_starts: std::collections::BTreeSet<_> =
            results.iter().map(|r| r.span.byte_start).collect();
        assert_eq!(distinct_starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(
            &file_path,
            "line 1: hello\nline 2: world\nline 3: rust programming",
        )
        .unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        let only_hit = &results[0];
        assert_eq!(only_hit.span.line_start, 3);
        assert!(only_hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        let preview = &results[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one line of context on each side means the
        // preview spans lines 2 through 4.
        for expected in ["line 2", "target line", "line 4"] {
            assert!(preview.contains(expected));
        }
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..recursive_regex_options(temp_dir.path(), "hello")
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }
}