ck_engine/
lib.rs

1use anyhow::Result;
2use ck_ann::AnnIndex;
3use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
4use globset::{Glob, GlobSet, GlobSetBuilder};
5use rayon::prelude::*;
6use regex::{Regex, RegexBuilder};
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf as StdPathBuf;
10use std::path::{Path, PathBuf};
11use tantivy::collector::TopDocs;
12use tantivy::query::QueryParser;
13use tantivy::schema::{STORED, Schema, TEXT, Value};
14use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
15use walkdir::WalkDir;
16
17mod semantic_v3;
18pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
19
20pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
21
22/// Extract content from a file using a span
23async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
24    let content = tokio::fs::read_to_string(file_path).await?;
25    let lines: Vec<&str> = content.lines().collect();
26
27    if span.line_start == 0 || span.line_start > lines.len() {
28        return Ok(String::new());
29    }
30
31    let start_idx = span.line_start - 1; // Convert to 0-based
32    let end_idx = (span.line_end - 1).min(lines.len().saturating_sub(1));
33
34    if start_idx <= end_idx {
35        Ok(lines[start_idx..=end_idx].join("\n"))
36    } else {
37        Ok(lines[start_idx].to_string())
38    }
39}
40
/// Finds the closest directory, at or above `path`, that contains a `.ck` entry.
///
/// When `path` is an existing file the walk starts at its parent directory;
/// otherwise it starts at `path` itself. Returns `None` if no directory on the
/// way up has a `.ck` entry.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    // A file cannot hold an index directory, so begin from the directory it lives in.
    let start = match path.parent() {
        Some(parent) if path.is_file() => parent,
        _ => path,
    };

    // `ancestors` yields `start`, then each successive parent.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
57
58pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
59    search_with_progress(options, None).await
60}
61
62pub async fn search_with_progress(
63    options: &SearchOptions,
64    progress_callback: Option<SearchProgressCallback>,
65) -> Result<Vec<SearchResult>> {
66    // Validate that the search path exists
67    if !options.path.exists() {
68        return Err(ck_core::CkError::Search(format!(
69            "Path does not exist: {}",
70            options.path.display()
71        ))
72        .into());
73    }
74
75    // Auto-update index if needed (unless it's regex-only mode)
76    if !matches!(options.mode, SearchMode::Regex) {
77        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
78        ensure_index_updated(&options.path, options.reindex, need_embeddings).await?;
79    }
80
81    match options.mode {
82        SearchMode::Regex => regex_search(options),
83        SearchMode::Lexical => lexical_search(options).await,
84        SearchMode::Semantic => {
85            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
86            semantic_search_v3_with_progress(options, progress_callback).await
87        }
88        SearchMode::Hybrid => hybrid_search_with_progress(options, progress_callback).await,
89    }
90}
91
92fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
93    let pattern = if options.fixed_string {
94        regex::escape(&options.query)
95    } else if options.whole_word {
96        format!(r"\b{}\b", regex::escape(&options.query))
97    } else {
98        options.query.clone()
99    };
100
101    let regex = RegexBuilder::new(&pattern)
102        .case_insensitive(options.case_insensitive)
103        .build()
104        .map_err(CkError::Regex)?;
105
106    // Default to recursive for directories (like grep) to maintain compatibility
107    let should_recurse = options.path.is_dir() || options.recursive;
108    let files = if should_recurse {
109        // Use ck_index's collect_files which respects gitignore
110        ck_index::collect_files(
111            &options.path,
112            options.respect_gitignore,
113            &options.exclude_patterns,
114        )?
115    } else {
116        // For non-recursive, use the local collect_files
117        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
118    };
119
120    let results: Vec<Vec<SearchResult>> = files
121        .par_iter()
122        .filter_map(|file_path| match search_file(&regex, file_path, options) {
123            Ok(matches) => {
124                if matches.is_empty() {
125                    None
126                } else {
127                    Some(matches)
128                }
129            }
130            Err(e) => {
131                tracing::debug!("Error searching {:?}: {}", file_path, e);
132                None
133            }
134        })
135        .collect();
136
137    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
138    // Deterministic ordering: file path, then line number
139    all_results.sort_by(|a, b| {
140        let path_cmp = a.file.cmp(&b.file);
141        if path_cmp != std::cmp::Ordering::Equal {
142            return path_cmp;
143        }
144        a.span.line_start.cmp(&b.span.line_start)
145    });
146
147    if let Some(top_k) = options.top_k {
148        all_results.truncate(top_k);
149    }
150
151    Ok(all_results)
152}
153
154fn search_file(
155    regex: &Regex,
156    file_path: &Path,
157    options: &SearchOptions,
158) -> Result<Vec<SearchResult>> {
159    let content = fs::read_to_string(file_path)?;
160    let lines: Vec<&str> = content.lines().collect();
161    let mut results = Vec::new();
162
163    // If full_section is enabled, try to parse the file and find code sections
164    let code_sections = if options.full_section {
165        extract_code_sections(file_path, &content)
166    } else {
167        None
168    };
169
170    // Track byte offset as we iterate through lines
171    let mut byte_offset = 0;
172
173    for (line_idx, line) in lines.iter().enumerate() {
174        let line_number = line_idx + 1;
175
176        // Find all matches in the line with their positions
177        for mat in regex.find_iter(line) {
178            let preview = if options.full_section {
179                // Try to find the containing code section
180                if let Some(ref sections) = code_sections {
181                    if let Some(section) = find_containing_section(sections, line_idx) {
182                        section.clone()
183                    } else {
184                        // Fall back to context lines if no section found
185                        get_context_preview(&lines, line_idx, options)
186                    }
187                } else {
188                    get_context_preview(&lines, line_idx, options)
189                }
190            } else {
191                get_context_preview(&lines, line_idx, options)
192            };
193
194            results.push(SearchResult {
195                file: file_path.to_path_buf(),
196                span: Span {
197                    byte_start: byte_offset + mat.start(),
198                    byte_end: byte_offset + mat.end(),
199                    line_start: line_number,
200                    line_end: line_number,
201                },
202                score: 1.0,
203                preview,
204                lang: ck_core::Language::from_path(file_path),
205                symbol: None,
206            });
207        }
208
209        // Update byte offset for next line (add line length + newline character)
210        byte_offset += line.len();
211        if line_idx < lines.len() - 1 {
212            byte_offset += 1; // Add 1 for the newline character
213        }
214    }
215
216    Ok(results)
217}
218
219async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
220    // Handle both files and directories and reuse nearest existing .ck index up the tree
221    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
222        if options.path.is_file() {
223            options.path.parent().unwrap_or(&options.path).to_path_buf()
224        } else {
225            options.path.clone()
226        }
227    });
228
229    let index_dir = index_root.join(".ck");
230    if !index_dir.exists() {
231        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
232    }
233
234    let tantivy_index_path = index_dir.join("tantivy_index");
235
236    if !tantivy_index_path.exists() {
237        return build_tantivy_index(options).await;
238    }
239
240    let mut schema_builder = Schema::builder();
241    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
242    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
243    let _schema = schema_builder.build();
244
245    let index = Index::open_in_dir(&tantivy_index_path)
246        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
247
248    let reader = index
249        .reader_builder()
250        .reload_policy(ReloadPolicy::OnCommitWithDelay)
251        .try_into()
252        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
253
254    let searcher = reader.searcher();
255    let query_parser = QueryParser::for_index(&index, vec![content_field]);
256
257    let query = query_parser
258        .parse_query(&options.query)
259        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
260
261    let top_docs = if let Some(top_k) = options.top_k {
262        searcher.search(&query, &TopDocs::with_limit(top_k))?
263    } else {
264        searcher.search(&query, &TopDocs::with_limit(100))?
265    };
266
267    // First, collect all results with raw scores
268    let mut raw_results = Vec::new();
269    for (_score, doc_address) in top_docs {
270        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
271        let path_text = retrieved_doc
272            .get_first(path_field)
273            .map(|field_value| field_value.as_str().unwrap_or(""))
274            .unwrap_or("");
275        let content_text = retrieved_doc
276            .get_first(content_field)
277            .map(|field_value| field_value.as_str().unwrap_or(""))
278            .unwrap_or("");
279
280        let file_path = PathBuf::from(path_text);
281        let preview = if options.full_section {
282            content_text.to_string()
283        } else {
284            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
285        };
286
287        raw_results.push((
288            _score,
289            SearchResult {
290                file: file_path,
291                span: Span {
292                    byte_start: 0,
293                    byte_end: content_text.len(),
294                    line_start: 1,
295                    line_end: content_text.lines().count(),
296                },
297                score: _score,
298                preview,
299                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
300                symbol: None,
301            },
302        ));
303    }
304
305    // Normalize scores to 0-1 range and apply threshold
306    let mut results = Vec::new();
307    if !raw_results.is_empty() {
308        let max_score = raw_results
309            .iter()
310            .map(|(score, _)| *score)
311            .fold(0.0f32, f32::max);
312        if max_score > 0.0 {
313            for (raw_score, mut result) in raw_results {
314                let normalized_score = raw_score / max_score;
315
316                // Apply threshold filtering with normalized score
317                if let Some(threshold) = options.threshold
318                    && normalized_score < threshold
319                {
320                    continue;
321                }
322
323                result.score = normalized_score;
324                results.push(result);
325            }
326        }
327    }
328
329    Ok(results)
330}
331
332async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
333    // Handle both files and directories by finding the appropriate directory for indexing
334    let index_root = if options.path.is_file() {
335        options.path.parent().unwrap_or(&options.path)
336    } else {
337        &options.path
338    };
339
340    let index_dir = index_root.join(".ck");
341    let tantivy_index_path = index_dir.join("tantivy_index");
342
343    fs::create_dir_all(&tantivy_index_path)?;
344
345    let mut schema_builder = Schema::builder();
346    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
347    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
348    let schema = schema_builder.build();
349
350    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
351        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
352
353    let mut index_writer = index
354        .writer(50_000_000)
355        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
356
357    let files = collect_files(index_root, true, &options.exclude_patterns)?;
358
359    for file_path in &files {
360        if let Ok(content) = fs::read_to_string(file_path) {
361            let doc = doc!(
362                content_field => content,
363                path_field => file_path.display().to_string()
364            );
365            index_writer.add_document(doc)?;
366        }
367    }
368
369    index_writer
370        .commit()
371        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
372
373    // After building, search again with the same options
374    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
375    let mut schema_builder = Schema::builder();
376    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
377    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
378    let _schema = schema_builder.build();
379
380    let index = Index::open_in_dir(&tantivy_index_path)
381        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
382
383    let reader = index
384        .reader_builder()
385        .reload_policy(ReloadPolicy::OnCommitWithDelay)
386        .try_into()
387        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
388
389    let searcher = reader.searcher();
390    let query_parser = QueryParser::for_index(&index, vec![content_field]);
391
392    let query = query_parser
393        .parse_query(&options.query)
394        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
395
396    let top_docs = if let Some(top_k) = options.top_k {
397        searcher.search(&query, &TopDocs::with_limit(top_k))?
398    } else {
399        searcher.search(&query, &TopDocs::with_limit(100))?
400    };
401
402    // First, collect all results with raw scores
403    let mut raw_results = Vec::new();
404    for (_score, doc_address) in top_docs {
405        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
406        let path_text = retrieved_doc
407            .get_first(path_field)
408            .map(|field_value| field_value.as_str().unwrap_or(""))
409            .unwrap_or("");
410        let content_text = retrieved_doc
411            .get_first(content_field)
412            .map(|field_value| field_value.as_str().unwrap_or(""))
413            .unwrap_or("");
414
415        let file_path = PathBuf::from(path_text);
416        let preview = if options.full_section {
417            content_text.to_string()
418        } else {
419            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
420        };
421
422        raw_results.push((
423            _score,
424            SearchResult {
425                file: file_path,
426                span: Span {
427                    byte_start: 0,
428                    byte_end: content_text.len(),
429                    line_start: 1,
430                    line_end: content_text.lines().count(),
431                },
432                score: _score,
433                preview,
434                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
435                symbol: None,
436            },
437        ));
438    }
439
440    // Normalize scores to 0-1 range and apply threshold
441    let mut results = Vec::new();
442    if !raw_results.is_empty() {
443        let max_score = raw_results
444            .iter()
445            .map(|(score, _)| *score)
446            .fold(0.0f32, f32::max);
447        if max_score > 0.0 {
448            for (raw_score, mut result) in raw_results {
449                let normalized_score = raw_score / max_score;
450
451                // Apply threshold filtering with normalized score
452                if let Some(threshold) = options.threshold
453                    && normalized_score < threshold
454                {
455                    continue;
456                }
457
458                result.score = normalized_score;
459                results.push(result);
460            }
461        }
462    }
463
464    Ok(results)
465}
466
467#[allow(dead_code)]
468async fn semantic_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
469    semantic_search_with_progress(options, None).await
470}
471
472async fn semantic_search_with_progress(
473    options: &SearchOptions,
474    progress_callback: Option<SearchProgressCallback>,
475) -> Result<Vec<SearchResult>> {
476    // Handle both files and directories and reuse nearest existing .ck index up the tree
477    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
478        if options.path.is_file() {
479            options.path.parent().unwrap_or(&options.path).to_path_buf()
480        } else {
481            options.path.clone()
482        }
483    });
484
485    let index_dir = index_root.join(".ck");
486    if !index_dir.exists() {
487        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
488    }
489
490    let ann_index_path = index_dir.join("ann_index.bin");
491    let embeddings_path = index_dir.join("embeddings.json");
492
493    if !ann_index_path.exists() || !embeddings_path.exists() {
494        return build_semantic_index_with_progress(options, progress_callback).await;
495    }
496
497    // Load the ANN index
498    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
499
500    // Load file metadata
501    let embeddings_data = fs::read_to_string(&embeddings_path)?;
502    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
503
504    // Create embedder and embed the query
505    if let Some(ref callback) = progress_callback {
506        callback("Loading embedding model...");
507    }
508
509    let mut embedder = if let Some(ref callback) = progress_callback {
510        let _cb = callback.as_ref();
511        let model_cb = Box::new(|msg: &str| {
512            // Note: We can't directly use the callback here due to lifetime issues
513            // For now, we'll just use eprintln! until we can restructure this better
514            eprintln!("Model: {}", msg);
515        }) as ck_embed::ModelDownloadCallback;
516        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), Some(model_cb))?
517    } else {
518        ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?
519    };
520    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
521
522    if query_embeddings.is_empty() {
523        return Ok(Vec::new());
524    }
525
526    let query_embedding = &query_embeddings[0];
527
528    // Search using ANN
529    let top_k = options.top_k.unwrap_or(10);
530    let similar_docs = ann_index.search(query_embedding, top_k);
531
532    let mut results = Vec::new();
533
534    // Check if we're searching a specific file vs. a directory
535    let filter_by_file = options.path.is_file();
536    let target_file = if filter_by_file {
537        Some(
538            options
539                .path
540                .canonicalize()
541                .unwrap_or_else(|_| options.path.clone()),
542        )
543    } else {
544        None
545    };
546
547    for (doc_id, similarity) in similar_docs {
548        // Apply threshold filtering
549        if let Some(threshold) = options.threshold
550            && similarity < threshold
551        {
552            continue;
553        }
554
555        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
556            // Filter by target file if specified
557            if let Some(target) = &target_file {
558                let canonical_result = file_path
559                    .canonicalize()
560                    .unwrap_or_else(|_| file_path.clone());
561                if canonical_result != *target {
562                    continue; // Skip this result if it doesn't match the target file
563                }
564            }
565
566            // If full_section is enabled and this is a code section, return the full content
567            let preview = if options.full_section {
568                content.clone()
569            } else {
570                content.lines().take(3).collect::<Vec<_>>().join("\n")
571            };
572
573            results.push(SearchResult {
574                file: file_path.clone(),
575                span: Span {
576                    byte_start: 0,
577                    byte_end: content.len(),
578                    line_start: 1,
579                    line_end: content.lines().count(),
580                },
581                score: similarity,
582                preview,
583                lang: ck_core::Language::from_path(file_path),
584                symbol: None,
585            });
586        }
587    }
588
589    Ok(results)
590}
591
592#[allow(dead_code)]
593async fn build_semantic_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
594    build_semantic_index_with_progress(options, None).await
595}
596
597async fn build_semantic_index_with_progress(
598    options: &SearchOptions,
599    progress_callback: Option<SearchProgressCallback>,
600) -> Result<Vec<SearchResult>> {
601    // Handle both files and directories by finding the appropriate directory for indexing
602    let index_root = if options.path.is_file() {
603        options.path.parent().unwrap_or(&options.path)
604    } else {
605        &options.path
606    };
607
608    let index_dir = index_root.join(".ck");
609    let ann_index_path = index_dir.join("ann_index.bin");
610    let embeddings_path = index_dir.join("embeddings.json");
611
612    fs::create_dir_all(&index_dir)?;
613
614    if let Some(ref callback) = progress_callback {
615        callback("Building semantic index (no index found)...");
616    }
617
618    // Always print this important message, even in quiet mode for indexing operations
619    eprintln!("Building semantic index (no existing index found)...");
620
621    // Collect files and their content
622    let files = collect_files(index_root, true, &options.exclude_patterns)?;
623
624    if let Some(ref callback) = progress_callback {
625        callback(&format!("Found {} files to index", files.len()));
626    }
627    eprintln!("Found {} files to embed and index", files.len());
628
629    let mut file_embeddings = Vec::new();
630    let mut embeddings = Vec::new();
631
632    // Create embedder with progress callback
633    if let Some(ref callback) = progress_callback {
634        callback("Loading embedding model...");
635    }
636
637    let model_callback = if progress_callback.is_some() {
638        Some(Box::new(|msg: &str| {
639            eprintln!("Model: {}", msg);
640        }) as ck_embed::ModelDownloadCallback)
641    } else {
642        None
643    };
644
645    let mut embedder =
646        ck_embed::create_embedder_with_progress(Some("BAAI/bge-small-en-v1.5"), model_callback)?;
647
648    if let Some(ref callback) = progress_callback {
649        callback("Generating embeddings for code chunks...");
650    }
651
652    for (file_idx, file_path) in files.iter().enumerate() {
653        if let Ok(content) = fs::read_to_string(file_path) {
654            if let Some(ref callback) = progress_callback {
655                let file_name = file_path
656                    .file_name()
657                    .map(|n| n.to_string_lossy().to_string())
658                    .unwrap_or_else(|| file_path.to_string_lossy().to_string());
659                callback(&format!(
660                    "Processing {}/{}: {}",
661                    file_idx + 1,
662                    files.len(),
663                    file_name
664                ));
665            }
666
667            // Chunk the content for better embeddings
668            let chunks = ck_chunk::chunk_text(&content, ck_core::Language::from_path(file_path))?;
669
670            for chunk in chunks {
671                let chunk_embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
672                if !chunk_embeddings.is_empty() {
673                    embeddings.push(chunk_embeddings[0].clone());
674                    file_embeddings.push((file_path.clone(), chunk.text));
675                }
676            }
677        }
678    }
679
680    if let Some(ref callback) = progress_callback {
681        callback(&format!(
682            "Built {} embeddings, creating search index...",
683            embeddings.len()
684        ));
685    }
686    eprintln!(
687        "Generated {} embeddings, building search index...",
688        embeddings.len()
689    );
690
691    // Build ANN index
692    let index = ck_ann::SimpleIndex::build(&embeddings)?;
693    index.save(&ann_index_path)?;
694
695    // Save file embeddings metadata
696    let embeddings_json = serde_json::to_string(&file_embeddings)?;
697    fs::write(&embeddings_path, embeddings_json)?;
698
699    if let Some(ref callback) = progress_callback {
700        callback("Semantic index built successfully, running search...");
701    }
702    eprintln!("Semantic index built successfully!");
703
704    // After building, search again - inline to avoid recursion
705    let ann_index = ck_ann::SimpleIndex::load(&ann_index_path)?;
706
707    // Load file metadata
708    let embeddings_data = fs::read_to_string(&embeddings_path)?;
709    let file_embeddings: Vec<(PathBuf, String)> = serde_json::from_str(&embeddings_data)?;
710
711    // Create embedder and embed the query
712    let mut embedder = ck_embed::create_embedder(Some("BAAI/bge-small-en-v1.5"))?;
713    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
714
715    if query_embeddings.is_empty() {
716        return Ok(Vec::new());
717    }
718
719    let query_embedding = &query_embeddings[0];
720
721    // Search using ANN
722    let top_k = options.top_k.unwrap_or(10);
723    let similar_docs = ann_index.search(query_embedding, top_k);
724
725    let mut results = Vec::new();
726
727    // Check if we're searching a specific file vs. a directory
728    let filter_by_file = options.path.is_file();
729    let target_file = if filter_by_file {
730        Some(
731            options
732                .path
733                .canonicalize()
734                .unwrap_or_else(|_| options.path.clone()),
735        )
736    } else {
737        None
738    };
739
740    for (doc_id, similarity) in similar_docs {
741        // Apply threshold filtering
742        if let Some(threshold) = options.threshold
743            && similarity < threshold
744        {
745            continue;
746        }
747
748        if let Some((file_path, content)) = file_embeddings.get(doc_id as usize) {
749            // Filter by target file if specified
750            if let Some(target) = &target_file {
751                let canonical_result = file_path
752                    .canonicalize()
753                    .unwrap_or_else(|_| file_path.clone());
754                if canonical_result != *target {
755                    continue; // Skip this result if it doesn't match the target file
756                }
757            }
758
759            // If full_section is enabled and this is a code section, return the full content
760            let preview = if options.full_section {
761                content.clone()
762            } else {
763                content.lines().take(3).collect::<Vec<_>>().join("\n")
764            };
765
766            results.push(SearchResult {
767                file: file_path.clone(),
768                span: Span {
769                    byte_start: 0,
770                    byte_end: content.len(),
771                    line_start: 1,
772                    line_end: content.lines().count(),
773                },
774                score: similarity,
775                preview,
776                lang: ck_core::Language::from_path(file_path),
777                symbol: None,
778            });
779        }
780    }
781
782    Ok(results)
783}
784
785#[allow(dead_code)]
786async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
787    hybrid_search_with_progress(options, None).await
788}
789
790async fn hybrid_search_with_progress(
791    options: &SearchOptions,
792    progress_callback: Option<SearchProgressCallback>,
793) -> Result<Vec<SearchResult>> {
794    if let Some(ref callback) = progress_callback {
795        callback("Running regex search...");
796    }
797    let regex_results = regex_search(options)?;
798
799    if let Some(ref callback) = progress_callback {
800        callback("Running semantic search...");
801    }
802    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
803
804    let mut combined = HashMap::new();
805
806    for (rank, result) in regex_results.iter().enumerate() {
807        let key = format!("{}:{}", result.file.display(), result.span.line_start);
808        combined
809            .entry(key)
810            .or_insert(Vec::new())
811            .push((rank + 1, result.clone()));
812    }
813
814    for (rank, result) in semantic_results.iter().enumerate() {
815        let key = format!("{}:{}", result.file.display(), result.span.line_start);
816        combined
817            .entry(key)
818            .or_insert(Vec::new())
819            .push((rank + 1, result.clone()));
820    }
821
822    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
823    let mut rrf_results: Vec<SearchResult> = combined
824        .into_values()
825        .map(|ranks| {
826            let mut result = ranks[0].1.clone();
827            let rrf_score = ranks
828                .iter()
829                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
830                .sum();
831            result.score = rrf_score;
832            result
833        })
834        .filter(|result| {
835            // Apply threshold filtering to raw RRF scores
836            if let Some(threshold) = options.threshold {
837                result.score >= threshold
838            } else {
839                true
840            }
841        })
842        .collect();
843
844    // Sort by RRF score (highest first)
845    rrf_results.sort_by(|a, b| {
846        b.score
847            .partial_cmp(&a.score)
848            .unwrap_or(std::cmp::Ordering::Equal)
849    });
850
851    if let Some(top_k) = options.top_k {
852        rrf_results.truncate(top_k);
853    }
854
855    Ok(rrf_results)
856}
857
858fn build_globset(patterns: &[String]) -> GlobSet {
859    let mut builder = GlobSetBuilder::new();
860    for pat in patterns {
861        // Treat patterns as filename or directory globs
862        if let Ok(glob) = Glob::new(pat) {
863            builder.add(glob);
864        }
865    }
866    builder.build().unwrap_or_else(|_| GlobSet::empty())
867}
868
869fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
870    let globset = build_globset(exclude_patterns);
871    // Match against each path component and the full path
872    if globset.is_match(path) {
873        return true;
874    }
875    for component in path.components() {
876        if let std::path::Component::Normal(name) = component
877            && globset.is_match(name)
878        {
879            return true;
880        }
881    }
882    false
883}
884
885fn collect_files(
886    path: &Path,
887    recursive: bool,
888    exclude_patterns: &[String],
889) -> Result<Vec<PathBuf>> {
890    let mut files = Vec::new();
891    let globset = build_globset(exclude_patterns);
892
893    if path.is_file() {
894        // Always add single files, even if they're excluded (user explicitly requested)
895        files.push(path.to_path_buf());
896    } else if recursive {
897        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
898            // Skip excluded directories entirely for efficiency
899            let name = e.file_name();
900            !globset.is_match(e.path()) && !globset.is_match(name)
901        }) {
902            match entry {
903                Ok(entry) => {
904                    if entry.file_type().is_file()
905                        && !should_exclude_path(entry.path(), exclude_patterns)
906                    {
907                        files.push(entry.path().to_path_buf());
908                    }
909                }
910                Err(e) => {
911                    // Log directory traversal errors but continue processing
912                    tracing::debug!("Skipping path due to error: {}", e);
913                    continue;
914                }
915            }
916        }
917    } else {
918        match fs::read_dir(path) {
919            Ok(read_dir) => {
920                for entry in read_dir {
921                    match entry {
922                        Ok(entry) => {
923                            let path = entry.path();
924                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
925                                files.push(path);
926                            }
927                        }
928                        Err(e) => {
929                            tracing::debug!("Skipping directory entry due to error: {}", e);
930                            continue;
931                        }
932                    }
933                }
934            }
935            Err(e) => {
936                tracing::debug!("Cannot read directory {:?}: {}", path, e);
937                return Err(e.into());
938            }
939        }
940    }
941
942    Ok(files)
943}
944
945async fn ensure_index_updated(
946    path: &Path,
947    force_reindex: bool,
948    need_embeddings: bool,
949) -> Result<()> {
950    // Handle both files and directories and reuse nearest existing .ck index up the tree
951    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
952        if path.is_file() {
953            path.parent().unwrap_or(path).to_path_buf()
954        } else {
955            path.to_path_buf()
956        }
957    });
958    let index_root = &index_root_buf;
959
960    // If force reindex is requested, always update
961    if force_reindex {
962        let stats = ck_index::smart_update_index_with_progress(
963            index_root,
964            false,
965            None,
966            need_embeddings,
967            true,
968            &[], // Empty exclude patterns for internal engine use
969        )
970        .await?;
971        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
972            tracing::info!(
973                "Index updated: {} files indexed, {} orphaned files removed",
974                stats.files_indexed,
975                stats.orphaned_files_removed
976            );
977        }
978        return Ok(());
979    }
980
981    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
982    let stats = ck_index::smart_update_index_with_progress(
983        index_root,
984        false,
985        None,
986        need_embeddings,
987        true,
988        &[],
989    )
990    .await?;
991    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
992        tracing::info!(
993            "Index updated: {} files indexed, {} orphaned files removed",
994            stats.files_indexed,
995            stats.orphaned_files_removed
996        );
997    }
998
999    Ok(())
1000}
1001
1002fn get_context_preview(lines: &[&str], line_idx: usize, options: &SearchOptions) -> String {
1003    let before = options.before_context_lines.max(options.context_lines);
1004    let after = options.after_context_lines.max(options.context_lines);
1005
1006    if before > 0 || after > 0 {
1007        let start_idx = line_idx.saturating_sub(before);
1008        let end_idx = (line_idx + after + 1).min(lines.len());
1009        lines[start_idx..end_idx].join("\n")
1010    } else {
1011        lines[line_idx].to_string()
1012    }
1013}
1014
1015fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1016    let lang = ck_core::Language::from_path(file_path)?;
1017
1018    // Parse the file with tree-sitter and extract function/class sections
1019    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1020        let sections: Vec<(usize, usize, String)> = chunks
1021            .into_iter()
1022            .filter(|chunk| {
1023                matches!(
1024                    chunk.chunk_type,
1025                    ck_chunk::ChunkType::Function
1026                        | ck_chunk::ChunkType::Class
1027                        | ck_chunk::ChunkType::Method
1028                )
1029            })
1030            .map(|chunk| {
1031                (
1032                    chunk.span.line_start - 1, // Convert to 0-based index
1033                    chunk.span.line_end - 1,
1034                    chunk.text,
1035                )
1036            })
1037            .collect();
1038
1039        if sections.is_empty() {
1040            None
1041        } else {
1042            Some(sections)
1043        }
1044    } else {
1045        None
1046    }
1047}
1048
/// Returns the text of the first section whose inclusive `(start, end)` line
/// range contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are examined in slice order, so when ranges overlap the earliest
/// entry wins.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1060
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes four small fixture files into `dir` and returns their paths in
    /// the order they were created.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .into_iter()
            .map(|(name, body)| {
                let target = dir.join(name);
                fs::write(&target, body).unwrap();
                target
            })
            .collect()
    }

    /// Baseline options shared by the regex tests: regex mode, recursive,
    /// searching for `query` under `root`, everything else at its default.
    fn regex_options(query: &str, root: &std::path::Path) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path: root.to_path_buf(),
            recursive: true,
            ..Default::default()
        }
    }

    #[test]
    fn test_collect_files() {
        let workspace = TempDir::new().unwrap();
        let fixtures = create_test_files(workspace.path());

        // Flat listing of the directory
        let flat = collect_files(workspace.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        // Recursive walk of the same directory
        let walked = collect_files(workspace.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file path is returned as-is
        let single = collect_files(&fixtures[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], fixtures[0]);
    }

    #[test]
    fn test_regex_search() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let hits = regex_search(&regex_options("rust", workspace.path())).unwrap();
        assert!(!hits.is_empty());

        // At least one preview must mention "rust", ignoring case
        let mentions_rust = hits
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust"));
        assert!(mentions_rust);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("HELLO", workspace.path())
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options("fn main()", workspace.path())
        };

        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let workspace = TempDir::new().unwrap();
        let fixture = workspace.path().join("word_test.txt");
        fs::write(fixture, "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options("rust", workspace.path())
        };

        // Intended: only the standalone word "rust" counts, not "rusty" or
        // "rustacean". Only the presence of a match is asserted here.
        let hits = regex_search(&options).unwrap();
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let workspace = TempDir::new().unwrap();

        // Ten files that all contain a match
        (0..10).for_each(|i| {
            let target = workspace.path().join(format!("file{}.txt", i));
            fs::write(target, "test content").unwrap();
        });

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options("test", workspace.path())
        };

        let hits = regex_search(&options).unwrap();
        assert!(hits.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Several matches on one line must each get their own byte offset
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("spans.txt");
        fs::write(&target, "test test test\nline two test\ntest end").unwrap();

        let options = SearchOptions {
            recursive: false,
            ..regex_options("test", &target)
        };

        let hits = regex_search(&options).unwrap();

        // Five matches overall
        assert_eq!(hits.len(), 5);

        // Byte offsets of the matches reported for a given 1-based line
        let starts_on = |line: usize| -> Vec<_> {
            hits.iter()
                .filter(|hit| hit.span.line_start == line)
                .map(|hit| hit.span.byte_start)
                .collect()
        };

        // First line: three matches at bytes 0, 5 and 10
        assert_eq!(starts_on(1), vec![0, 5, 10]);

        // Second line: "test test test\n" = 15 bytes, "line two " = 9 bytes
        assert_eq!(starts_on(2), vec![24]);

        // Every match has a distinct byte offset
        let distinct: std::collections::BTreeSet<_> =
            hits.iter().map(|hit| hit.span.byte_start).collect();
        assert_eq!(distinct.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        let body = "line 1: hello\nline 2: world\nline 3: rust programming";
        fs::write(&target, body).unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let defaults = SearchOptions::default();

        let hits = search_file(&pattern, &target, &defaults).unwrap();
        assert_eq!(hits.len(), 1);

        let only = &hits[0];
        assert_eq!(only.span.line_start, 3);
        assert!(only.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        fs::write(&target, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let hits = search_file(&pattern, &target, &options).unwrap();
        assert_eq!(hits.len(), 1);

        let preview = &hits[0].preview;
        println!("Preview: '{}'", preview);

        // The match is on line 3; one line of context on each side means the
        // preview spans lines 2 through 4
        for expected in ["line 2", "target line", "line 4"] {
            assert!(preview.contains(expected));
        }
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let workspace = TempDir::new().unwrap();
        create_test_files(workspace.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("hello", workspace.path())
        };

        let hits = search(&options).await.unwrap();
        assert!(!hits.is_empty());
    }
}