ck_engine/
lib.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchMode, SearchOptions, SearchResult, Span};
3use globset::{Glob, GlobSet, GlobSetBuilder};
4use rayon::prelude::*;
5use regex::{Regex, RegexBuilder};
6use std::collections::HashMap;
7use std::fs;
8use std::path::PathBuf as StdPathBuf;
9use std::path::{Path, PathBuf};
10use tantivy::collector::TopDocs;
11use tantivy::query::QueryParser;
12use tantivy::schema::{STORED, Schema, TEXT, Value};
13use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
14use walkdir::WalkDir;
15
16mod semantic_v3;
17pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
18
/// Boxed `Send + Sync` callback taking a text message; this is the type of the
/// `progress_callback` argument accepted by the search entry points below.
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed `Send + Sync` callback taking a text message; this is the type of the
/// `indexing_progress_callback` argument of
/// `search_enhanced_with_indexing_progress`.
pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Boxed `Send + Sync` callback taking a `ck_index::EmbeddingProgress` value;
/// this is the type of the `detailed_indexing_progress_callback` argument of
/// `search_enhanced_with_indexing_progress`.
pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
22
23/// Resolve the actual file path to read content from
24/// For PDFs: returns cache path and validates it exists
25/// For regular files: returns original path
26fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
27    if ck_core::pdf::is_pdf_file(file_path) {
28        // PDFs: Read from cached extracted text
29        let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
30        if !cache_path.exists() {
31            return Err(anyhow::anyhow!(
32                "PDF not preprocessed. Run 'ck --index' first."
33            ));
34        }
35        Ok(cache_path)
36    } else {
37        // Regular files: Read from original source
38        Ok(file_path.to_path_buf())
39    }
40}
41
42/// Read content from file for search result extraction
43/// Regular files: read directly from source
44/// PDFs: read from preprocessed cache
45fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
46    let content_path = resolve_content_path(file_path, repo_root)?;
47    Ok(fs::read_to_string(content_path)?)
48}
49
50/// Extract content from a file using a span (streaming version)
51async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
52    // Find repo root to locate cache
53    let repo_root = find_nearest_index_root(file_path)
54        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
55
56    // Use centralized path resolution
57    let content_path = resolve_content_path(file_path, &repo_root)?;
58
59    // Stream only the needed lines
60    extract_lines_from_file(&content_path, span.line_start, span.line_end)
61}
62
/// Stream-read lines `line_start..=line_end` (1-based, inclusive) from
/// `file_path` without loading the whole file.
///
/// Returns the selected lines joined with `\n`; the original line terminators
/// are not preserved. An empty string is returned when:
///
/// * `line_start` is `0` (line numbers are 1-based),
/// * the range is inverted (`line_end < line_start`), or
/// * the file has fewer than `line_start` lines.
///
/// # Errors
///
/// Fails if the file cannot be opened, or if any line up to and including
/// `line_end` cannot be read (for example because it is not valid UTF-8).
fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
    use std::io::{BufRead, BufReader};

    // Reject empty/invalid ranges up front. Without the inverted-range check a
    // request such as (1, 0) would return line 1 while (5, 3) returns nothing.
    if line_start == 0 || line_end < line_start {
        return Ok(String::new());
    }

    let reader = BufReader::new(fs::File::open(file_path)?);
    let mut selected = Vec::new();

    // `take(line_end)` stops reading as soon as the last wanted line is seen.
    for (idx, line) in reader.lines().enumerate().take(line_end) {
        let line = line?;
        if idx + 1 >= line_start {
            selected.push(line);
        }
    }

    // Joining an empty selection yields "", which covers ranges past EOF.
    Ok(selected.join("\n"))
}
98
/// Break `content` into lines and record, for each line, how many bytes its
/// terminator occupied.
///
/// Recognised terminators are `\n` (1 byte), `\r\n` (2 bytes) and a bare `\r`
/// (1 byte). A final line without a terminator is reported with a length of
/// `0`. The two returned vectors always have the same length, and the line
/// strings never contain their terminator.
fn split_lines_with_endings(content: &str) -> (Vec<String>, Vec<usize>) {
    let raw = content.as_bytes();
    let mut lines = Vec::new();
    let mut endings = Vec::new();
    let mut cursor = 0usize;

    // Jump from terminator to terminator. Both terminator bytes are ASCII, so
    // every slice boundary below is a valid UTF-8 character boundary.
    while let Some(rel) = raw[cursor..]
        .iter()
        .position(|&b| b == b'\n' || b == b'\r')
    {
        let term = cursor + rel;
        let width = if raw[term] == b'\r' && raw.get(term + 1) == Some(&b'\n') {
            2
        } else {
            1
        };

        lines.push(content[cursor..term].to_string());
        endings.push(width);
        cursor = term + width;
    }

    // Whatever follows the last terminator is an unterminated final line.
    if cursor < raw.len() {
        lines.push(content[cursor..].to_string());
        endings.push(0);
    }

    (lines, endings)
}
141
/// Walk upwards from `path` and return the first directory that contains a
/// `.ck` entry, or `None` when no ancestor has one.
///
/// When `path` is an existing file the walk starts at its parent directory;
/// otherwise it starts at `path` itself.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors()` yields `start`, then each successive `parent()`.
    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
158
/// Outcome of embedding-model resolution for an index root.
#[derive(Debug, Clone)]
pub struct ResolvedModel {
    /// Full model name: either the name recorded in the index manifest or the
    /// `name` of the registry entry that was selected.
    pub canonical_name: String,
    /// Registry key for the model. When a manifest names a model the registry
    /// does not know, this holds the manifest's model name instead.
    pub alias: String,
    /// Embedding vector width for the model.
    pub dimensions: usize,
}
165
166fn find_model_entry<'a>(
167    registry: &'a ck_models::ModelRegistry,
168    key: &str,
169) -> Option<(String, &'a ck_models::ModelConfig)> {
170    if let Some(config) = registry.get_model(key) {
171        return Some((key.to_string(), config));
172    }
173
174    registry
175        .models
176        .iter()
177        .find(|(_, config)| config.name == key)
178        .map(|(alias, config)| (alias.clone(), config))
179}
180
181pub(crate) fn resolve_model_from_root(
182    index_root: &Path,
183    cli_model: Option<&str>,
184) -> Result<ResolvedModel> {
185    use ck_models::ModelRegistry;
186
187    let registry = ModelRegistry::default();
188    let index_dir = index_root.join(".ck");
189    let manifest_path = index_dir.join("manifest.json");
190
191    if manifest_path.exists() {
192        let data = std::fs::read(&manifest_path)?;
193        let manifest: ck_index::IndexManifest = serde_json::from_slice(&data)?;
194
195        if let Some(existing_model) = manifest.embedding_model {
196            let (alias, config_opt) = find_model_entry(&registry, &existing_model)
197                .map(|(alias, config)| (alias, Some(config)))
198                .unwrap_or_else(|| (existing_model.clone(), None));
199
200            let dims = manifest
201                .embedding_dimensions
202                .or_else(|| config_opt.map(|c| c.dimensions))
203                .unwrap_or(384);
204
205            if let Some(requested) = cli_model {
206                let (_, requested_config) =
207                    find_model_entry(&registry, requested).ok_or_else(|| {
208                        CkError::Embedding(format!(
209                            "Unknown model '{}'. Available models: {}",
210                            requested,
211                            registry
212                                .models
213                                .keys()
214                                .cloned()
215                                .collect::<Vec<_>>()
216                                .join(", ")
217                        ))
218                    })?;
219
220                if requested_config.name != existing_model {
221                    let suggested_alias = alias.clone();
222                    return Err(CkError::Embedding(format!(
223                        "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
224                        existing_model,
225                        suggested_alias,
226                        requested,
227                        requested,
228                        suggested_alias
229                    ))
230                    .into());
231                }
232            }
233
234            return Ok(ResolvedModel {
235                canonical_name: existing_model,
236                alias,
237                dimensions: dims,
238            });
239        }
240    }
241
242    let (alias, config) = if let Some(requested) = cli_model {
243        find_model_entry(&registry, requested).ok_or_else(|| {
244            CkError::Embedding(format!(
245                "Unknown model '{}'. Available models: {}",
246                requested,
247                registry
248                    .models
249                    .keys()
250                    .cloned()
251                    .collect::<Vec<_>>()
252                    .join(", ")
253            ))
254        })?
255    } else {
256        let alias = registry.default_model.clone();
257        let config = registry.get_default_model().ok_or_else(|| {
258            CkError::Embedding("No default embedding model configured".to_string())
259        })?;
260        (alias, config)
261    };
262
263    Ok(ResolvedModel {
264        canonical_name: config.name.clone(),
265        alias,
266        dimensions: config.dimensions,
267    })
268}
269
270pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
271    let index_root = find_nearest_index_root(path).unwrap_or_else(|| {
272        if path.is_file() {
273            path.parent().unwrap_or(path).to_path_buf()
274        } else {
275            path.to_path_buf()
276        }
277    });
278    resolve_model_from_root(&index_root, cli_model)
279}
280
281pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
282    let results = search_enhanced(options).await?;
283    Ok(results.matches)
284}
285
286pub async fn search_with_progress(
287    options: &SearchOptions,
288    progress_callback: Option<SearchProgressCallback>,
289) -> Result<Vec<SearchResult>> {
290    let results = search_enhanced_with_progress(options, progress_callback).await?;
291    Ok(results.matches)
292}
293
294/// Enhanced search that includes near-miss information for threshold queries
295pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
296    search_enhanced_with_progress(options, None).await
297}
298
299/// Enhanced search with progress callback that includes near-miss information
300pub async fn search_enhanced_with_progress(
301    options: &SearchOptions,
302    progress_callback: Option<SearchProgressCallback>,
303) -> Result<ck_core::SearchResults> {
304    search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
305}
306
307/// Enhanced search with both search and indexing progress callbacks
308pub async fn search_enhanced_with_indexing_progress(
309    options: &SearchOptions,
310    progress_callback: Option<SearchProgressCallback>,
311    indexing_progress_callback: Option<IndexingProgressCallback>,
312    detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
313) -> Result<ck_core::SearchResults> {
314    // Validate that the search path exists
315    if !options.path.exists() {
316        return Err(ck_core::CkError::Search(format!(
317            "Path does not exist: {}",
318            options.path.display()
319        ))
320        .into());
321    }
322
323    // Auto-update index if needed (unless it's regex-only mode)
324    if !matches!(options.mode, SearchMode::Regex) {
325        let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
326        ensure_index_updated_with_progress(
327            &options.path,
328            options.reindex,
329            need_embeddings,
330            indexing_progress_callback,
331            detailed_indexing_progress_callback,
332            options.respect_gitignore,
333            &options.exclude_patterns,
334            options.embedding_model.as_deref(),
335        )
336        .await?;
337    }
338
339    let search_results = match options.mode {
340        SearchMode::Regex => {
341            let matches = regex_search(options)?;
342            ck_core::SearchResults {
343                matches,
344                closest_below_threshold: None,
345            }
346        }
347        SearchMode::Lexical => {
348            let matches = lexical_search(options).await?;
349            ck_core::SearchResults {
350                matches,
351                closest_below_threshold: None,
352            }
353        }
354        SearchMode::Semantic => {
355            // Use v3 semantic search (reads pre-computed embeddings from sidecars using spans)
356            semantic_search_v3_with_progress(options, progress_callback).await?
357        }
358        SearchMode::Hybrid => {
359            let matches = hybrid_search_with_progress(options, progress_callback).await?;
360            ck_core::SearchResults {
361                matches,
362                closest_below_threshold: None,
363            }
364        }
365    };
366
367    Ok(search_results)
368}
369
370fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
371    let pattern = if options.fixed_string {
372        regex::escape(&options.query)
373    } else if options.whole_word {
374        format!(r"\b{}\b", regex::escape(&options.query))
375    } else {
376        options.query.clone()
377    };
378
379    let regex = RegexBuilder::new(&pattern)
380        .case_insensitive(options.case_insensitive)
381        .build()
382        .map_err(CkError::Regex)?;
383
384    // Default to recursive for directories (like grep) to maintain compatibility
385    let should_recurse = options.path.is_dir() || options.recursive;
386    let files = if should_recurse {
387        // Use ck_index's collect_files which respects gitignore
388        ck_index::collect_files(
389            &options.path,
390            options.respect_gitignore,
391            &options.exclude_patterns,
392        )?
393    } else {
394        // For non-recursive, use the local collect_files
395        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
396    };
397
398    let results: Vec<Vec<SearchResult>> = files
399        .par_iter()
400        .filter_map(|file_path| match search_file(&regex, file_path, options) {
401            Ok(matches) => {
402                if matches.is_empty() {
403                    None
404                } else {
405                    Some(matches)
406                }
407            }
408            Err(e) => {
409                tracing::debug!("Error searching {:?}: {}", file_path, e);
410                None
411            }
412        })
413        .collect();
414
415    let mut all_results: Vec<SearchResult> = results.into_iter().flatten().collect();
416    // Deterministic ordering: file path, then line number
417    all_results.sort_by(|a, b| {
418        let path_cmp = a.file.cmp(&b.file);
419        if path_cmp != std::cmp::Ordering::Equal {
420            return path_cmp;
421        }
422        a.span.line_start.cmp(&b.span.line_start)
423    });
424
425    if let Some(top_k) = options.top_k {
426        all_results.truncate(top_k);
427    }
428
429    Ok(all_results)
430}
431
432fn search_file(
433    regex: &Regex,
434    file_path: &Path,
435    options: &SearchOptions,
436) -> Result<Vec<SearchResult>> {
437    // Find repo root to locate cache
438    let repo_root = find_nearest_index_root(file_path)
439        .unwrap_or_else(|| file_path.parent().unwrap_or(file_path).to_path_buf());
440
441    // For full_section mode, we need the entire content for parsing
442    // For context previews, we need all lines for surrounding context
443    // So we'll load content when needed, but optimize for the common case
444    if options.full_section || options.context_lines > 0 {
445        // Load full content when we need section parsing or context
446        let content = read_file_content(file_path, &repo_root)?;
447        let (lines, line_ending_lengths) = split_lines_with_endings(&content);
448
449        // If full_section is enabled, try to parse the file and find code sections
450        let code_sections = if options.full_section {
451            extract_code_sections(file_path, &content)
452        } else {
453            None
454        };
455
456        search_file_in_memory(
457            regex,
458            file_path,
459            options,
460            &lines,
461            &code_sections,
462            &line_ending_lengths,
463        )
464    } else {
465        // Streaming search (simple case)
466        search_file_streaming(regex, file_path, &repo_root, options)
467    }
468}
469
470/// In-memory search for cases requiring context or code sections
471fn search_file_in_memory(
472    regex: &Regex,
473    file_path: &Path,
474    options: &SearchOptions,
475    lines: &[String],
476    code_sections: &Option<Vec<(usize, usize, String)>>,
477    line_ending_lengths: &[usize],
478) -> Result<Vec<SearchResult>> {
479    let mut results = Vec::new();
480    let mut byte_offset = 0;
481
482    for (line_idx, line) in lines.iter().enumerate() {
483        let line_number = line_idx + 1;
484
485        // Special handling for empty pattern - match the entire line once
486        // An empty regex pattern will match at every position, so we need to handle it specially
487        if regex.as_str().is_empty() {
488            // Empty pattern matches the whole line once (grep compatibility)
489            let preview = if options.full_section {
490                // Try to find the containing code section
491                if let Some(sections) = code_sections {
492                    if let Some(section) = find_containing_section(sections, line_idx) {
493                        section.clone()
494                    } else {
495                        // Fall back to context lines if no section found
496                        get_context_preview(lines, line_idx, options)
497                    }
498                } else {
499                    get_context_preview(lines, line_idx, options)
500                }
501            } else {
502                get_context_preview(lines, line_idx, options)
503            };
504
505            results.push(SearchResult {
506                file: file_path.to_path_buf(),
507                span: Span {
508                    byte_start: byte_offset,
509                    byte_end: byte_offset + line.len(),
510                    line_start: line_number,
511                    line_end: line_number,
512                },
513                score: 1.0,
514                preview,
515                lang: ck_core::Language::from_path(file_path),
516                symbol: None,
517                chunk_hash: None,
518                index_epoch: None,
519            });
520        } else {
521            // Find all matches in the line with their positions
522            for mat in regex.find_iter(line) {
523                let preview = if options.full_section {
524                    // Try to find the containing code section
525                    if let Some(sections) = code_sections {
526                        if let Some(section) = find_containing_section(sections, line_idx) {
527                            section.clone()
528                        } else {
529                            // Fall back to context lines if no section found
530                            get_context_preview(lines, line_idx, options)
531                        }
532                    } else {
533                        get_context_preview(lines, line_idx, options)
534                    }
535                } else {
536                    get_context_preview(lines, line_idx, options)
537                };
538
539                results.push(SearchResult {
540                    file: file_path.to_path_buf(),
541                    span: Span {
542                        byte_start: byte_offset + mat.start(),
543                        byte_end: byte_offset + mat.end(),
544                        line_start: line_number,
545                        line_end: line_number,
546                    },
547                    score: 1.0,
548                    preview,
549                    lang: ck_core::Language::from_path(file_path),
550                    symbol: None,
551                    chunk_hash: None,
552                    index_epoch: None,
553                });
554            }
555        }
556
557        // Update byte offset for next line (add line length + actual line ending length)
558        byte_offset += line.len();
559        byte_offset += line_ending_lengths.get(line_idx).copied().unwrap_or(0);
560    }
561
562    Ok(results)
563}
564
565/// Streaming search for simple cases without context or code sections
566fn search_file_streaming(
567    regex: &Regex,
568    file_path: &Path,
569    repo_root: &Path,
570    _options: &SearchOptions,
571) -> Result<Vec<SearchResult>> {
572    use std::io::{BufRead, BufReader};
573
574    let content_path = resolve_content_path(file_path, repo_root)?;
575    let file = std::fs::File::open(&content_path)?;
576    let mut reader = BufReader::new(file);
577
578    let mut results = Vec::new();
579    let mut line = String::new();
580    let mut byte_offset = 0usize;
581    let mut line_number = 1usize;
582
583    loop {
584        line.clear();
585        let bytes_read = reader.read_line(&mut line)?;
586        if bytes_read == 0 {
587            break;
588        }
589
590        // Determine the length of the trailing line ending (if any) and
591        // normalise the line buffer so it no longer contains newline bytes.
592        let mut newline_len = 0usize;
593        if line.ends_with("\r\n") {
594            line.pop(); // remove \n
595            line.pop(); // remove \r
596            newline_len = 2;
597        } else if line.ends_with(['\n', '\r']) {
598            line.pop();
599            newline_len = 1;
600        }
601
602        // Old Mac-style files may use bare carriage returns as separators.
603        // When the trimmed line still contains '\r' characters, treat them as
604        // record separators so the byte offsets remain accurate.
605        let treat_cr_as_newline = line.contains('\r');
606
607        if treat_cr_as_newline {
608            let bytes = line.as_bytes();
609            let mut segment_start = 0usize;
610            while segment_start <= bytes.len() {
611                match bytes[segment_start..].iter().position(|&b| b == b'\r') {
612                    Some(rel_idx) => {
613                        let idx = segment_start + rel_idx;
614                        let segment_bytes = &bytes[segment_start..idx];
615                        let segment_str = std::str::from_utf8(segment_bytes)?;
616                        process_streaming_line(
617                            regex,
618                            file_path,
619                            segment_str,
620                            line_number,
621                            byte_offset,
622                            &mut results,
623                        );
624                        byte_offset += segment_bytes.len() + 1; // account for \r
625                        line_number += 1;
626                        segment_start = idx + 1;
627                    }
628                    None => {
629                        let segment_bytes = &bytes[segment_start..];
630                        let segment_str = std::str::from_utf8(segment_bytes)?;
631                        process_streaming_line(
632                            regex,
633                            file_path,
634                            segment_str,
635                            line_number,
636                            byte_offset,
637                            &mut results,
638                        );
639                        byte_offset += segment_bytes.len();
640                        line_number += 1;
641                        break;
642                    }
643                }
644            }
645            byte_offset += newline_len;
646        } else {
647            let line_str = line.as_str();
648            process_streaming_line(
649                regex,
650                file_path,
651                line_str,
652                line_number,
653                byte_offset,
654                &mut results,
655            );
656            byte_offset += line_str.len() + newline_len;
657            line_number += 1;
658        }
659    }
660
661    Ok(results)
662}
663
664fn process_streaming_line(
665    regex: &Regex,
666    file_path: &Path,
667    line: &str,
668    line_number: usize,
669    byte_offset: usize,
670    results: &mut Vec<SearchResult>,
671) {
672    if regex.as_str().is_empty() {
673        results.push(SearchResult {
674            file: file_path.to_path_buf(),
675            span: Span {
676                byte_start: byte_offset,
677                byte_end: byte_offset + line.len(),
678                line_start: line_number,
679                line_end: line_number,
680            },
681            score: 1.0,
682            preview: line.to_string(),
683            lang: ck_core::Language::from_path(file_path),
684            symbol: None,
685            chunk_hash: None,
686            index_epoch: None,
687        });
688    } else {
689        for mat in regex.find_iter(line) {
690            results.push(SearchResult {
691                file: file_path.to_path_buf(),
692                span: Span {
693                    byte_start: byte_offset + mat.start(),
694                    byte_end: byte_offset + mat.end(),
695                    line_start: line_number,
696                    line_end: line_number,
697                },
698                score: 1.0,
699                preview: line.to_string(),
700                lang: ck_core::Language::from_path(file_path),
701                symbol: None,
702                chunk_hash: None,
703                index_epoch: None,
704            });
705        }
706    }
707}
708
709async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
710    // Handle both files and directories and reuse nearest existing .ck index up the tree
711    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
712        if options.path.is_file() {
713            options.path.parent().unwrap_or(&options.path).to_path_buf()
714        } else {
715            options.path.clone()
716        }
717    });
718
719    let index_dir = index_root.join(".ck");
720    if !index_dir.exists() {
721        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
722    }
723
724    let tantivy_index_path = index_dir.join("tantivy_index");
725
726    if !tantivy_index_path.exists() {
727        return build_tantivy_index(options).await;
728    }
729
730    let mut schema_builder = Schema::builder();
731    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
732    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
733    let _schema = schema_builder.build();
734
735    let index = Index::open_in_dir(&tantivy_index_path)
736        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
737
738    let reader = index
739        .reader_builder()
740        .reload_policy(ReloadPolicy::OnCommitWithDelay)
741        .try_into()
742        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
743
744    let searcher = reader.searcher();
745    let query_parser = QueryParser::for_index(&index, vec![content_field]);
746
747    let query = query_parser
748        .parse_query(&options.query)
749        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
750
751    let top_docs = if let Some(top_k) = options.top_k {
752        searcher.search(&query, &TopDocs::with_limit(top_k))?
753    } else {
754        searcher.search(&query, &TopDocs::with_limit(100))?
755    };
756
757    // First, collect all results with raw scores
758    let mut raw_results = Vec::new();
759    for (_score, doc_address) in top_docs {
760        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
761        let path_text = retrieved_doc
762            .get_first(path_field)
763            .map(|field_value| field_value.as_str().unwrap_or(""))
764            .unwrap_or("");
765        let content_text = retrieved_doc
766            .get_first(content_field)
767            .map(|field_value| field_value.as_str().unwrap_or(""))
768            .unwrap_or("");
769
770        let file_path = PathBuf::from(path_text);
771        let preview = if options.full_section {
772            content_text.to_string()
773        } else {
774            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
775        };
776
777        raw_results.push((
778            _score,
779            SearchResult {
780                file: file_path,
781                span: Span {
782                    byte_start: 0,
783                    byte_end: content_text.len(),
784                    line_start: 1,
785                    line_end: content_text.lines().count(),
786                },
787                score: _score,
788                preview,
789                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
790                symbol: None,
791                chunk_hash: None,
792                index_epoch: None,
793            },
794        ));
795    }
796
797    // Normalize scores to 0-1 range and apply threshold
798    let mut results = Vec::new();
799    if !raw_results.is_empty() {
800        let max_score = raw_results
801            .iter()
802            .map(|(score, _)| *score)
803            .fold(0.0f32, f32::max);
804        if max_score > 0.0 {
805            for (raw_score, mut result) in raw_results {
806                let normalized_score = raw_score / max_score;
807
808                // Apply threshold filtering with normalized score
809                if let Some(threshold) = options.threshold
810                    && normalized_score < threshold
811                {
812                    continue;
813                }
814
815                result.score = normalized_score;
816                results.push(result);
817            }
818        }
819    }
820
821    Ok(results)
822}
823
824async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
825    // Handle both files and directories by finding the appropriate directory for indexing
826    let index_root = if options.path.is_file() {
827        options.path.parent().unwrap_or(&options.path)
828    } else {
829        &options.path
830    };
831
832    let index_dir = index_root.join(".ck");
833    let tantivy_index_path = index_dir.join("tantivy_index");
834
835    fs::create_dir_all(&tantivy_index_path)?;
836
837    let mut schema_builder = Schema::builder();
838    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
839    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
840    let schema = schema_builder.build();
841
842    let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
843        .map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
844
845    let mut index_writer = index
846        .writer(50_000_000)
847        .map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
848
849    let files = collect_files(index_root, true, &options.exclude_patterns)?;
850
851    for file_path in &files {
852        if let Ok(content) = fs::read_to_string(file_path) {
853            let doc = doc!(
854                content_field => content,
855                path_field => file_path.display().to_string()
856            );
857            index_writer.add_document(doc)?;
858        }
859    }
860
861    index_writer
862        .commit()
863        .map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
864
865    // After building, search again with the same options
866    let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
867    let mut schema_builder = Schema::builder();
868    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
869    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
870    let _schema = schema_builder.build();
871
872    let index = Index::open_in_dir(&tantivy_index_path)
873        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
874
875    let reader = index
876        .reader_builder()
877        .reload_policy(ReloadPolicy::OnCommitWithDelay)
878        .try_into()
879        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
880
881    let searcher = reader.searcher();
882    let query_parser = QueryParser::for_index(&index, vec![content_field]);
883
884    let query = query_parser
885        .parse_query(&options.query)
886        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
887
888    let top_docs = if let Some(top_k) = options.top_k {
889        searcher.search(&query, &TopDocs::with_limit(top_k))?
890    } else {
891        searcher.search(&query, &TopDocs::with_limit(100))?
892    };
893
894    // First, collect all results with raw scores
895    let mut raw_results = Vec::new();
896    for (_score, doc_address) in top_docs {
897        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
898        let path_text = retrieved_doc
899            .get_first(path_field)
900            .map(|field_value| field_value.as_str().unwrap_or(""))
901            .unwrap_or("");
902        let content_text = retrieved_doc
903            .get_first(content_field)
904            .map(|field_value| field_value.as_str().unwrap_or(""))
905            .unwrap_or("");
906
907        let file_path = PathBuf::from(path_text);
908        let preview = if options.full_section {
909            content_text.to_string()
910        } else {
911            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
912        };
913
914        raw_results.push((
915            _score,
916            SearchResult {
917                file: file_path,
918                span: Span {
919                    byte_start: 0,
920                    byte_end: content_text.len(),
921                    line_start: 1,
922                    line_end: content_text.lines().count(),
923                },
924                score: _score,
925                preview,
926                lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
927                symbol: None,
928                chunk_hash: None,
929                index_epoch: None,
930            },
931        ));
932    }
933
934    // Normalize scores to 0-1 range and apply threshold
935    let mut results = Vec::new();
936    if !raw_results.is_empty() {
937        let max_score = raw_results
938            .iter()
939            .map(|(score, _)| *score)
940            .fold(0.0f32, f32::max);
941        if max_score > 0.0 {
942            for (raw_score, mut result) in raw_results {
943                let normalized_score = raw_score / max_score;
944
945                // Apply threshold filtering with normalized score
946                if let Some(threshold) = options.threshold
947                    && normalized_score < threshold
948                {
949                    continue;
950                }
951
952                result.score = normalized_score;
953                results.push(result);
954            }
955        }
956    }
957
958    Ok(results)
959}
960
961#[allow(dead_code)]
962async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
963    hybrid_search_with_progress(options, None).await
964}
965
966async fn hybrid_search_with_progress(
967    options: &SearchOptions,
968    progress_callback: Option<SearchProgressCallback>,
969) -> Result<Vec<SearchResult>> {
970    if let Some(ref callback) = progress_callback {
971        callback("Running regex search...");
972    }
973    let regex_results = regex_search(options)?;
974
975    if let Some(ref callback) = progress_callback {
976        callback("Running semantic search...");
977    }
978    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;
979
980    let mut combined = HashMap::new();
981
982    for (rank, result) in regex_results.iter().enumerate() {
983        let key = format!("{}:{}", result.file.display(), result.span.line_start);
984        combined
985            .entry(key)
986            .or_insert(Vec::new())
987            .push((rank + 1, result.clone()));
988    }
989
990    for (rank, result) in semantic_results.matches.iter().enumerate() {
991        let key = format!("{}:{}", result.file.display(), result.span.line_start);
992        combined
993            .entry(key)
994            .or_insert(Vec::new())
995            .push((rank + 1, result.clone()));
996    }
997
998    // Calculate RRF scores according to original paper: RRFscore(d) = Σ(r∈R) 1/(k + r(d))
999    let mut rrf_results: Vec<SearchResult> = combined
1000        .into_values()
1001        .map(|ranks| {
1002            let mut result = ranks[0].1.clone();
1003            let rrf_score = ranks
1004                .iter()
1005                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
1006                .sum();
1007            result.score = rrf_score;
1008            result
1009        })
1010        .filter(|result| {
1011            // Apply threshold filtering to raw RRF scores
1012            if let Some(threshold) = options.threshold {
1013                result.score >= threshold
1014            } else {
1015                true
1016            }
1017        })
1018        .collect();
1019
1020    // Sort by RRF score (highest first)
1021    rrf_results.sort_by(|a, b| {
1022        b.score
1023            .partial_cmp(&a.score)
1024            .unwrap_or(std::cmp::Ordering::Equal)
1025    });
1026
1027    if let Some(top_k) = options.top_k {
1028        rrf_results.truncate(top_k);
1029    }
1030
1031    Ok(rrf_results)
1032}
1033
1034fn build_globset(patterns: &[String]) -> GlobSet {
1035    let mut builder = GlobSetBuilder::new();
1036    for pat in patterns {
1037        // Treat patterns as filename or directory globs
1038        if let Ok(glob) = Glob::new(pat) {
1039            builder.add(glob);
1040        }
1041    }
1042    builder.build().unwrap_or_else(|_| GlobSet::empty())
1043}
1044
1045fn should_exclude_path(path: &Path, exclude_patterns: &[String]) -> bool {
1046    let globset = build_globset(exclude_patterns);
1047    // Match against each path component and the full path
1048    if globset.is_match(path) {
1049        return true;
1050    }
1051    for component in path.components() {
1052        if let std::path::Component::Normal(name) = component
1053            && globset.is_match(name)
1054        {
1055            return true;
1056        }
1057    }
1058    false
1059}
1060
1061fn collect_files(
1062    path: &Path,
1063    recursive: bool,
1064    exclude_patterns: &[String],
1065) -> Result<Vec<PathBuf>> {
1066    let mut files = Vec::new();
1067    let globset = build_globset(exclude_patterns);
1068
1069    if path.is_file() {
1070        // Always add single files, even if they're excluded (user explicitly requested)
1071        files.push(path.to_path_buf());
1072    } else if recursive {
1073        for entry in WalkDir::new(path).into_iter().filter_entry(|e| {
1074            // Skip excluded directories entirely for efficiency
1075            let name = e.file_name();
1076            !globset.is_match(e.path()) && !globset.is_match(name)
1077        }) {
1078            match entry {
1079                Ok(entry) => {
1080                    if entry.file_type().is_file()
1081                        && !should_exclude_path(entry.path(), exclude_patterns)
1082                    {
1083                        files.push(entry.path().to_path_buf());
1084                    }
1085                }
1086                Err(e) => {
1087                    // Log directory traversal errors but continue processing
1088                    tracing::debug!("Skipping path due to error: {}", e);
1089                    continue;
1090                }
1091            }
1092        }
1093    } else {
1094        match fs::read_dir(path) {
1095            Ok(read_dir) => {
1096                for entry in read_dir {
1097                    match entry {
1098                        Ok(entry) => {
1099                            let path = entry.path();
1100                            if path.is_file() && !should_exclude_path(&path, exclude_patterns) {
1101                                files.push(path);
1102                            }
1103                        }
1104                        Err(e) => {
1105                            tracing::debug!("Skipping directory entry due to error: {}", e);
1106                            continue;
1107                        }
1108                    }
1109                }
1110            }
1111            Err(e) => {
1112                tracing::debug!("Cannot read directory {:?}: {}", path, e);
1113                return Err(e.into());
1114            }
1115        }
1116    }
1117
1118    Ok(files)
1119}
1120
1121#[allow(clippy::too_many_arguments)]
1122async fn ensure_index_updated_with_progress(
1123    path: &Path,
1124    force_reindex: bool,
1125    need_embeddings: bool,
1126    progress_callback: Option<ck_index::ProgressCallback>,
1127    detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
1128    respect_gitignore: bool,
1129    exclude_patterns: &[String],
1130    model_override: Option<&str>,
1131) -> Result<()> {
1132    // Handle both files and directories and reuse nearest existing .ck index up the tree
1133    let index_root_buf = find_nearest_index_root(path).unwrap_or_else(|| {
1134        if path.is_file() {
1135            path.parent().unwrap_or(path).to_path_buf()
1136        } else {
1137            path.to_path_buf()
1138        }
1139    });
1140    let index_root = &index_root_buf;
1141
1142    // If force reindex is requested, always update
1143    if force_reindex {
1144        let stats = ck_index::smart_update_index_with_detailed_progress(
1145            index_root,
1146            true,
1147            progress_callback,
1148            detailed_progress_callback,
1149            need_embeddings,
1150            respect_gitignore,
1151            exclude_patterns, // Use search-specific exclude patterns
1152            model_override,
1153        )
1154        .await?;
1155        if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1156            tracing::info!(
1157                "Index updated: {} files indexed, {} orphaned files removed",
1158                stats.files_indexed,
1159                stats.orphaned_files_removed
1160            );
1161        }
1162        return Ok(());
1163    }
1164
1165    // Always use smart_update_index for incremental updates (handles both new and existing indexes)
1166    let stats = ck_index::smart_update_index_with_detailed_progress(
1167        index_root,
1168        false,
1169        progress_callback,
1170        detailed_progress_callback,
1171        need_embeddings,
1172        respect_gitignore,
1173        exclude_patterns,
1174        model_override,
1175    )
1176    .await?;
1177    if stats.files_indexed > 0 || stats.orphaned_files_removed > 0 {
1178        tracing::info!(
1179            "Index updated: {} files indexed, {} orphaned files removed",
1180            stats.files_indexed,
1181            stats.orphaned_files_removed
1182        );
1183    }
1184
1185    Ok(())
1186}
1187
1188fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
1189    let before = options.before_context_lines.max(options.context_lines);
1190    let after = options.after_context_lines.max(options.context_lines);
1191
1192    if before > 0 || after > 0 {
1193        let start_idx = line_idx.saturating_sub(before);
1194        let end_idx = (line_idx + after + 1).min(lines.len());
1195        lines[start_idx..end_idx].join("\n")
1196    } else {
1197        lines[line_idx].to_string()
1198    }
1199}
1200
1201fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
1202    let lang = ck_core::Language::from_path(file_path)?;
1203
1204    // Parse the file with tree-sitter and extract function/class sections
1205    if let Ok(chunks) = ck_chunk::chunk_text(content, Some(lang)) {
1206        let sections: Vec<(usize, usize, String)> = chunks
1207            .into_iter()
1208            .filter(|chunk| {
1209                matches!(
1210                    chunk.chunk_type,
1211                    ck_chunk::ChunkType::Function
1212                        | ck_chunk::ChunkType::Class
1213                        | ck_chunk::ChunkType::Method
1214                )
1215            })
1216            .map(|chunk| {
1217                (
1218                    chunk.span.line_start - 1, // Convert to 0-based index
1219                    chunk.span.line_end - 1,
1220                    chunk.text,
1221                )
1222            })
1223            .collect();
1224
1225        if sections.is_empty() {
1226            None
1227        } else {
1228            Some(sections)
1229        }
1230    } else {
1231        None
1232    }
1233}
1234
/// Returns the text of the first section whose inclusive line range
/// `first..=last` contains `line_idx`, or `None` when no section covers that line.
///
/// Sections are examined in slice order, so with overlapping ranges the earliest
/// entry wins.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, body)| body)
}
1246
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes four small fixture files into `dir` and returns their paths in
    /// creation order.
    fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
        let fixtures = [
            ("test1.txt", "hello world rust programming"),
            ("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
            ("test3.py", "print('Hello Python')"),
            ("test4.txt", "machine learning artificial intelligence"),
        ];

        fixtures
            .into_iter()
            .map(|(name, content)| {
                let path = dir.join(name);
                fs::write(&path, content).unwrap();
                path
            })
            .collect()
    }

    /// Regex-mode options for `query` over `path`; every other field keeps its default.
    fn regex_options(query: &str, path: PathBuf, recursive: bool) -> SearchOptions {
        SearchOptions {
            mode: SearchMode::Regex,
            query: query.to_string(),
            path,
            recursive,
            ..Default::default()
        }
    }

    #[test]
    fn test_extract_lines_from_file() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test_lines.txt");

        // Ten lines, "Line 1" through "Line 10", with no trailing newline.
        let content =
            "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10";
        fs::write(&test_file, content).unwrap();

        // (line_start, line_end, expected text); line numbers are 1-based.
        let cases = [
            // A range in the middle of the file
            (3, 5, "Line 3\nLine 4\nLine 5"),
            // A single line
            (7, 7, "Line 7"),
            // An end past the last line stops at the end of the file
            (8, 100, "Line 8\nLine 9\nLine 10"),
            // line_start == 0 yields nothing
            (0, 5, ""),
            // line_start beyond the file length yields nothing
            (20, 25, ""),
        ];

        for (line_start, line_end, expected) in cases {
            let extracted = extract_lines_from_file(&test_file, line_start, line_end).unwrap();
            assert_eq!(extracted, expected);
        }
    }

    #[tokio::test]
    async fn test_extract_content_from_span() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("code.rs");

        // Three small functions separated by blank lines.
        let content = "fn first() {\n    println!(\"First\");\n}\n\nfn second() {\n    println!(\"Second\");\n}\n\nfn third() {\n    println!(\"Third\");\n}";
        fs::write(&test_file, content).unwrap();

        // Byte offsets are not used by line extraction, so they stay at zero.
        let line_span = |line_start, line_end| ck_core::Span {
            byte_start: 0,
            byte_end: 0,
            line_start,
            line_end,
        };

        // The second function occupies lines 5-7.
        let second_fn = extract_content_from_span(&test_file, &line_span(5, 7))
            .await
            .unwrap();
        assert_eq!(second_fn, "fn second() {\n    println!(\"Second\");\n}");

        // A single line.
        let single_line = extract_content_from_span(&test_file, &line_span(2, 2))
            .await
            .unwrap();
        assert_eq!(single_line, "    println!(\"First\");");
    }

    #[test]
    fn test_collect_files() {
        let temp_dir = TempDir::new().unwrap();
        let fixture_paths = create_test_files(temp_dir.path());

        // Directory listing, non-recursive and recursive: all four fixtures each time.
        let flat = collect_files(temp_dir.path(), false, &[]).unwrap();
        assert_eq!(flat.len(), 4);

        let walked = collect_files(temp_dir.path(), true, &[]).unwrap();
        assert_eq!(walked.len(), 4);

        // A single file is returned as-is.
        let single = collect_files(&fixture_paths[0], false, &[]).unwrap();
        assert_eq!(single.len(), 1);
        assert_eq!(single[0], fixture_paths[0]);
    }

    #[test]
    fn test_regex_search() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = regex_options("rust", temp_dir.path().to_path_buf(), true);

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());

        // At least one preview must actually contain the search term.
        let has_rust_preview = results
            .iter()
            .any(|hit| hit.preview.to_lowercase().contains("rust"));
        assert!(has_rust_preview);
    }

    #[test]
    fn test_regex_search_case_insensitive() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("HELLO", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_fixed_string() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            fixed_string: true,
            ..regex_options("fn main()", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_whole_word() {
        let temp_dir = TempDir::new().unwrap();
        let word_file = temp_dir.path().join("word_test.txt");
        fs::write(word_file, "rust rusty rustacean").unwrap();

        let options = SearchOptions {
            whole_word: true,
            ..regex_options("rust", temp_dir.path().to_path_buf(), true)
        };

        // Intended to match "rust" only as a whole word, not "rusty" or "rustacean";
        // this test only checks that something is found.
        let results = regex_search(&options).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_regex_search_top_k() {
        let temp_dir = TempDir::new().unwrap();

        // Ten files that all match the query.
        for i in 0..10 {
            let path = temp_dir.path().join(format!("file{}.txt", i));
            fs::write(path, "test content").unwrap();
        }

        let options = SearchOptions {
            top_k: Some(5),
            ..regex_options("test", temp_dir.path().to_path_buf(), true)
        };

        let results = regex_search(&options).unwrap();
        assert!(results.len() <= 5);
    }

    #[test]
    fn test_regex_search_span_offsets() {
        // Span offsets must be calculated per match, including several matches on a line.
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("spans.txt");
        fs::write(&test_file, "test test test\nline two test\ntest end").unwrap();

        let options = regex_options("test", test_file.clone(), false);
        let results = regex_search(&options).unwrap();

        // Five matches in total.
        assert_eq!(results.len(), 5);

        // Byte offsets of the matches reported for a given line, in result order.
        let starts_on_line = |line: usize| -> Vec<usize> {
            results
                .iter()
                .filter(|hit| hit.span.line_start == line)
                .map(|hit| hit.span.byte_start)
                .collect()
        };

        // First line: three matches at bytes 0, 5 and 10.
        let line1_starts = starts_on_line(1);
        assert_eq!(line1_starts.len(), 3);
        assert_eq!(line1_starts[0], 0);
        assert_eq!(line1_starts[1], 5);
        assert_eq!(line1_starts[2], 10);

        // Second line: one match.
        let line2_starts = starts_on_line(2);
        assert_eq!(line2_starts.len(), 1);
        assert_eq!(line2_starts[0], 24); // "test test test\n" = 15 bytes, "line two " = 9 bytes

        // Every match has its own byte offset.
        let mut unique_starts: Vec<_> = results.iter().map(|hit| hit.span.byte_start).collect();
        unique_starts.sort();
        unique_starts.dedup();
        assert_eq!(unique_starts.len(), 5);
    }

    #[test]
    fn test_search_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        let content = "line 1: hello\nline 2: world\nline 3: rust programming";
        fs::write(&file_path, content).unwrap();

        let pattern = regex::Regex::new("rust").unwrap();
        let options = SearchOptions::default();

        let results = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        let hit = &results[0];
        assert_eq!(hit.span.line_start, 3);
        assert!(hit.preview.contains("rust"));
    }

    #[test]
    fn test_search_file_with_context() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("test.txt");
        fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

        let pattern = regex::Regex::new("target").unwrap();
        let options = SearchOptions {
            context_lines: 1,
            ..Default::default()
        };

        let results = search_file(&pattern, &file_path, &options).unwrap();
        assert_eq!(results.len(), 1);

        let preview = &results[0].preview;
        println!("Preview: '{}'", preview);

        // The target is line 3; one line of context on each side gives lines 2, 3, 4.
        for expected in ["line 2", "target line", "line 4"] {
            assert!(preview.contains(expected));
        }
    }

    #[tokio::test]
    async fn test_search_main_function() {
        let temp_dir = TempDir::new().unwrap();
        create_test_files(temp_dir.path());

        let options = SearchOptions {
            case_insensitive: true,
            ..regex_options("hello", temp_dir.path().to_path_buf(), true)
        };

        let results = search(&options).await.unwrap();
        assert!(!results.is_empty());
    }

    #[tokio::test]
    async fn test_regex_search_mixed_line_endings() {
        // Regression test for byte offset issues with different line endings
        let temp_dir = TempDir::new().unwrap();

        // Mixed Windows (\r\n) and Unix (\n) line endings in one file.
        let test_file = temp_dir.path().join("mixed_endings.txt");
        let content = "line1\r\nline2\nline3\r\npattern here\nline5\r\n";
        fs::write(&test_file, content).unwrap();

        let options = regex_options("pattern", test_file.clone(), false);

        let results = search(&options).await.unwrap();
        assert_eq!(results.len(), 1);
        let hit = &results[0];

        // The byte offset must point at the start of "pattern" in the file as written.
        let on_disk = fs::read_to_string(&test_file).unwrap();
        let pattern_start = on_disk.find("pattern").unwrap();

        assert_eq!(hit.span.byte_start, pattern_start);
        assert_eq!(hit.span.line_start, 4); // Fourth line
    }

    #[tokio::test]
    async fn test_regex_search_windows_line_endings() {
        // Regression test specifically for Windows \r\n line endings
        let temp_dir = TempDir::new().unwrap();

        let test_file = temp_dir.path().join("windows_endings.txt");
        let content = "first line\r\nsecond line\r\nmatch this\r\nfourth line\r\n";
        fs::write(&test_file, content).unwrap();

        let options = regex_options("match", test_file.clone(), false);

        let results = search(&options).await.unwrap();
        assert_eq!(results.len(), 1);
        let hit = &results[0];

        // The match sits on line 3.
        assert_eq!(hit.span.line_start, 3);

        // The byte offset accounts for the \r\n endings:
        // first line\r\n = 12 bytes, second line\r\n = 13 bytes, total = 25 bytes before "match"
        let expected_byte_start = 25;
        assert_eq!(hit.span.byte_start, expected_byte_start);
    }

    #[test]
    fn test_split_lines_with_endings_helper() {
        // Unix line endings
        let (lines, endings) = split_lines_with_endings("line1\nline2\nline3\n");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![1, 1, 1]);

        // Windows line endings
        let (lines, endings) = split_lines_with_endings("line1\r\nline2\r\nline3\r\n");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![2, 2, 2]);

        // Old Mac line endings
        let (lines, endings) = split_lines_with_endings("line1\rline2\rline3\r");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![1, 1, 1]);

        // Mixed endings
        let (lines, endings) = split_lines_with_endings("line1\nline2\r\nline3\r");
        assert_eq!(lines, vec!["line1", "line2", "line3"]);
        assert_eq!(endings, vec![1, 2, 1]);

        // No line endings
        let (lines, endings) = split_lines_with_endings("single line");
        assert_eq!(lines, vec!["single line"]);
        assert_eq!(endings, vec![0]);
    }
}