use anyhow::Result;
use ck_core::{CkError, IncludePattern, SearchMode, SearchOptions, SearchResult, Span};
use globset::{Glob, GlobSet, GlobSetBuilder};
use rayon::prelude::*;
use regex::{Regex, RegexBuilder};
use std::collections::HashMap;
use std::fs;
use std::path::PathBuf as StdPathBuf;
use std::path::{Path, PathBuf};
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{STORED, Schema, TEXT, Value};
use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
use walkdir::WalkDir;
mod semantic_v3;
pub use semantic_v3::{semantic_search_v3, semantic_search_v3_with_progress};
/// Callback handed a human-readable status message while a search runs
/// (for example `"Running regex search..."` during hybrid search).
pub type SearchProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Callback handed a human-readable status message; forwarded to the index
/// update step that runs before non-regex searches.
pub type IndexingProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
/// Callback handed structured [`ck_index::EmbeddingProgress`] values; forwarded
/// to the index update step that runs before non-regex searches.
pub type DetailedIndexingProgressCallback = Box<dyn Fn(ck_index::EmbeddingProgress) + Send + Sync>;
/// Returns the path whose text should be read for `file_path`.
///
/// Non-PDF files are read directly. For PDFs the content cache path under
/// `repo_root` is returned instead.
///
/// # Errors
///
/// Fails when `file_path` is a PDF whose cache file does not exist.
fn resolve_content_path(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
    if !ck_core::pdf::is_pdf_file(file_path) {
        return Ok(file_path.to_path_buf());
    }

    let cached = ck_core::pdf::get_content_cache_path(repo_root, file_path);
    if cached.exists() {
        Ok(cached)
    } else {
        Err(anyhow::anyhow!(
            "PDF not preprocessed. Run 'ck --index' first."
        ))
    }
}
/// Reads the full text for `file_path`, going through
/// `resolve_content_path` so PDFs are read from their content cache.
///
/// # Errors
///
/// Propagates path-resolution failures and any I/O or UTF-8 error from
/// reading the file.
fn read_file_content(file_path: &Path, repo_root: &Path) -> Result<String> {
    let text = fs::read_to_string(resolve_content_path(file_path, repo_root)?)?;
    Ok(text)
}
/// Returns the text of the lines covered by `span` in `file_path`.
///
/// The repository root is the nearest ancestor containing `.ck`; when none
/// exists the file's parent directory (or the path itself) is used.
///
/// # Errors
///
/// Propagates content-path resolution and file reading failures.
async fn extract_content_from_span(file_path: &Path, span: &ck_core::Span) -> Result<String> {
    let repo_root = match find_nearest_index_root(file_path) {
        Some(root) => root,
        None => file_path.parent().unwrap_or(file_path).to_path_buf(),
    };
    let source = resolve_content_path(file_path, &repo_root)?;
    extract_lines_from_file(&source, span.line_start, span.line_end)
}
/// Returns lines `line_start..=line_end` (1-based, inclusive) of `file_path`,
/// joined with `\n`.
///
/// A `line_start` of zero, or a range entirely past the end of the file,
/// yields an empty string. A `line_end` beyond the last line is clamped to
/// the end of the file.
///
/// # Errors
///
/// Fails if the file cannot be opened or if any line up to `line_end`
/// cannot be read (for example because it is not valid UTF-8).
fn extract_lines_from_file(file_path: &Path, line_start: usize, line_end: usize) -> Result<String> {
    use std::io::{BufRead, BufReader};

    if line_start == 0 {
        return Ok(String::new());
    }

    // Zero-based bounds of the requested window.
    let first = line_start - 1;
    let last = line_end.saturating_sub(1);

    let reader = BufReader::new(fs::File::open(file_path)?);
    let mut selected = Vec::new();
    // Stop after `last`; lines before `first` are still read so that read
    // errors in them are reported.
    for (idx, line) in reader.lines().enumerate().take(last + 1) {
        let text = line?;
        if idx >= first {
            selected.push(text);
        }
    }

    Ok(selected.join("\n"))
}
/// Splits `content` into lines, recognising `\n`, `\r\n` and a lone `\r` as
/// terminators.
///
/// Returns the lines (terminators removed) together with a parallel vector
/// holding the byte length of each line's terminator: `1`, `2`, or `0` for a
/// final line with no terminator. Text that ends with a terminator does not
/// produce a trailing empty line.
fn split_lines_with_endings(content: &str) -> (Vec<String>, Vec<usize>) {
    let raw = content.as_bytes();
    let mut segments = Vec::new();
    let mut terminator_lens = Vec::new();
    let mut line_begin = 0usize;
    let mut cursor = 0usize;

    while let Some(&byte) = raw.get(cursor) {
        let terminator: usize = match byte {
            b'\n' => 1,
            b'\r' if raw.get(cursor + 1) == Some(&b'\n') => 2,
            b'\r' => 1,
            _ => {
                cursor += 1;
                continue;
            }
        };
        // `\r` and `\n` are ASCII, so these offsets are char boundaries.
        segments.push(content[line_begin..cursor].to_string());
        terminator_lens.push(terminator);
        cursor += terminator;
        line_begin = cursor;
    }

    if line_begin < raw.len() {
        segments.push(content[line_begin..].to_string());
        terminator_lens.push(0);
    }

    (segments, terminator_lens)
}
/// Produces the best available absolute form of `path` for comparison with
/// include patterns.
///
/// Uses the canonical path when it can be resolved. Otherwise an absolute
/// path is returned unchanged and a relative one is joined onto the current
/// directory (or returned unchanged if the current directory is unknown).
fn canonicalize_for_matching(path: &Path) -> PathBuf {
    match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) if path.is_absolute() => path.to_path_buf(),
        Err(_) => match std::env::current_dir() {
            Ok(cwd) => cwd.join(path),
            Err(_) => path.to_path_buf(),
        },
    }
}
/// Reports whether `path` is selected by `include_patterns`.
///
/// An empty pattern list selects everything. Otherwise the path is first
/// normalised with `canonicalize_for_matching`; a directory pattern matches
/// any path beneath it and a file pattern matches only the identical path.
fn path_matches_include(path: &Path, include_patterns: &[IncludePattern]) -> bool {
    if include_patterns.is_empty() {
        return true;
    }

    let candidate = canonicalize_for_matching(path);
    for pattern in include_patterns {
        let selected = if pattern.is_dir {
            candidate.starts_with(&pattern.path)
        } else {
            candidate == pattern.path
        };
        if selected {
            return true;
        }
    }
    false
}
/// Keeps only the entries of `files` selected by `include_patterns`,
/// preserving their order. An empty pattern list returns `files` untouched.
fn filter_files_by_include(
    files: Vec<PathBuf>,
    include_patterns: &[IncludePattern],
) -> Vec<PathBuf> {
    let mut kept = files;
    if !include_patterns.is_empty() {
        kept.retain(|candidate| path_matches_include(candidate, include_patterns));
    }
    kept
}
/// Walks upward from `path` and returns the first directory that contains a
/// `.ck` entry.
///
/// When `path` is a file the walk starts at its parent directory. Returns
/// `None` if no ancestor contains `.ck`.
fn find_nearest_index_root(path: &Path) -> Option<StdPathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    start
        .ancestors()
        .find(|dir| dir.join(".ck").exists())
        .map(Path::to_path_buf)
}
/// An embedding model selection: the alias it was resolved under plus the
/// model configuration itself.
#[derive(Clone, Debug)]
pub struct ResolvedModel {
    /// Alias returned by the model registry, or the name recorded in the
    /// index manifest when the registry does not know the model.
    pub alias: String,
    /// Configuration of the resolved model.
    pub config: ck_models::ModelConfig,
}
impl ResolvedModel {
    /// Name stored in the model configuration (as opposed to the alias).
    pub fn canonical_name(&self) -> &str {
        let config = &self.config;
        &config.name
    }

    /// Embedding dimensionality stored in the model configuration.
    pub fn dimensions(&self) -> usize {
        let config = &self.config;
        config.dimensions
    }
}
/// Builds a placeholder configuration for a model name the registry cannot
/// resolve, using the given `name` and `dimensions` with a fixed provider,
/// token limit and description.
fn legacy_model_config(name: &str, dimensions: usize) -> ck_models::ModelConfig {
    let provider = String::from("fastembed");
    let description =
        String::from("Legacy ck embedding model preserved for backwards compatibility");

    ck_models::ModelConfig {
        name: name.to_owned(),
        provider,
        dimensions,
        max_tokens: 8192,
        description,
    }
}
/// Decides which embedding model to use for the index rooted at `index_root`.
///
/// If `.ck/manifest.json` exists and records an embedding model, that model
/// wins: a `cli_model` naming a different model is rejected, and a matching
/// one is returned under the alias the caller requested. A recorded model
/// unknown to the registry is represented by a legacy configuration using
/// the manifest's dimensions (384 when absent). Without a recorded model the
/// registry resolves `cli_model` directly.
///
/// # Errors
///
/// Fails when the manifest cannot be read or parsed, when the registry
/// cannot resolve the requested model, or when the requested model differs
/// from the one the index was built with.
pub(crate) fn resolve_model_from_root(
    index_root: &Path,
    cli_model: Option<&str>,
) -> Result<ResolvedModel> {
    use ck_models::ModelRegistry;

    let registry = ModelRegistry::default();
    let manifest_path = index_root.join(".ck").join("manifest.json");

    if manifest_path.exists() {
        let raw = std::fs::read(&manifest_path)?;
        let manifest: ck_index::IndexManifest = serde_json::from_slice(&raw)?;
        let fallback_dims = manifest.embedding_dimensions.unwrap_or(384);

        if let Some(recorded_name) = manifest.embedding_model {
            let indexed = registry
                .resolve(Some(recorded_name.as_str()))
                .map(|(alias, config)| ResolvedModel { alias, config })
                .unwrap_or_else(|_| ResolvedModel {
                    alias: recorded_name.clone(),
                    config: legacy_model_config(&recorded_name, fallback_dims),
                });

            // No explicit request: keep whatever the index was built with.
            let Some(requested) = cli_model else {
                return Ok(indexed);
            };

            let (requested_alias, requested_config) = registry
                .resolve(Some(requested))
                .map_err(|e| CkError::Embedding(e.to_string()))?;

            if requested_config.name == indexed.config.name {
                return Ok(ResolvedModel {
                    alias: requested_alias,
                    config: requested_config,
                });
            }

            let message = format!(
                "Index was built with embedding model '{}' (alias '{}'), but '--model {}' was requested. To switch models run `ck --clean .` then `ck --index --model {}`. To keep using this index rerun your command with '--model {}'.",
                indexed.config.name, indexed.alias, requested, requested, indexed.alias
            );
            return Err(CkError::Embedding(message).into());
        }
    }

    let (alias, config) = registry
        .resolve(cli_model)
        .map_err(|e| CkError::Embedding(e.to_string()))?;
    Ok(ResolvedModel { alias, config })
}
/// Resolves the embedding model for the index that governs `path`.
///
/// The index root is the nearest ancestor containing `.ck`; when none is
/// found, the parent directory of a file path, or the path itself, is used.
///
/// # Errors
///
/// Propagates every failure of `resolve_model_from_root`.
pub fn resolve_model_for_path(path: &Path, cli_model: Option<&str>) -> Result<ResolvedModel> {
    let index_root = match find_nearest_index_root(path) {
        Some(root) => root,
        None if path.is_file() => path.parent().unwrap_or(path).to_path_buf(),
        None => path.to_path_buf(),
    };
    resolve_model_from_root(&index_root, cli_model)
}
/// Runs a search described by `options` and returns only the matches.
///
/// # Errors
///
/// Propagates any failure from `search_enhanced`.
pub async fn search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    search_enhanced(options).await.map(|found| found.matches)
}
/// Runs a search described by `options`, reporting status through
/// `progress_callback`, and returns only the matches.
///
/// # Errors
///
/// Propagates any failure from `search_enhanced_with_progress`.
pub async fn search_with_progress(
    options: &SearchOptions,
    progress_callback: Option<SearchProgressCallback>,
) -> Result<Vec<SearchResult>> {
    search_enhanced_with_progress(options, progress_callback)
        .await
        .map(|found| found.matches)
}
/// Runs a search described by `options` without any progress reporting and
/// returns the full [`ck_core::SearchResults`].
pub async fn search_enhanced(options: &SearchOptions) -> Result<ck_core::SearchResults> {
    search_enhanced_with_progress(options, None).await
}
/// Runs a search described by `options`, reporting search status through
/// `progress_callback`. No indexing progress callbacks are supplied.
pub async fn search_enhanced_with_progress(
    options: &SearchOptions,
    progress_callback: Option<SearchProgressCallback>,
) -> Result<ck_core::SearchResults> {
    search_enhanced_with_indexing_progress(options, progress_callback, None, None).await
}
pub async fn search_enhanced_with_indexing_progress(
options: &SearchOptions,
progress_callback: Option<SearchProgressCallback>,
indexing_progress_callback: Option<IndexingProgressCallback>,
detailed_indexing_progress_callback: Option<DetailedIndexingProgressCallback>,
) -> Result<ck_core::SearchResults> {
if !options.path.exists() {
return Err(ck_core::CkError::Search(format!(
"Path does not exist: {}",
options.path.display()
))
.into());
}
if !matches!(options.mode, SearchMode::Regex) {
let need_embeddings = matches!(options.mode, SearchMode::Semantic | SearchMode::Hybrid);
let file_options = ck_core::FileCollectionOptions::from(options);
ensure_index_updated_with_progress(
&options.path,
options.reindex,
need_embeddings,
indexing_progress_callback,
detailed_indexing_progress_callback,
&file_options,
options.embedding_model.as_deref(),
)
.await?;
}
let search_results = match options.mode {
SearchMode::Regex => {
let matches = regex_search(options)?;
ck_core::SearchResults {
matches,
closest_below_threshold: None,
}
}
SearchMode::Lexical => {
let matches = lexical_search(options).await?;
ck_core::SearchResults {
matches,
closest_below_threshold: None,
}
}
SearchMode::Semantic => {
semantic_search_v3_with_progress(options, progress_callback).await?
}
SearchMode::Hybrid => {
let matches = hybrid_search_with_progress(options, progress_callback).await?;
ck_core::SearchResults {
matches,
closest_below_threshold: None,
}
}
};
Ok(search_results)
}
/// Searches files under `options.path` with a regular expression built from
/// `options.query`.
///
/// The query is escaped when `fixed_string` is set; otherwise, when
/// `whole_word` is set, it is escaped and wrapped in `\b` word boundaries;
/// otherwise it is used as a regex verbatim. Directories (or any path when
/// `recursive` is set) are collected through `ck_index::collect_files`, which
/// honours the ignore settings; other paths go through the local
/// `collect_files`. Files are searched in parallel, and files that fail to be
/// searched are logged at debug level and skipped.
///
/// Results are ordered by file path and then by line number, and truncated
/// to `options.top_k` when it is set.
///
/// # Errors
///
/// Fails when the pattern does not compile or when file collection fails.
fn regex_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    let pattern = if options.fixed_string {
        regex::escape(&options.query)
    } else if options.whole_word {
        format!(r"\b{}\b", regex::escape(&options.query))
    } else {
        options.query.clone()
    };
    let regex = RegexBuilder::new(&pattern)
        .case_insensitive(options.case_insensitive)
        .build()
        .map_err(CkError::Regex)?;

    let should_recurse = options.path.is_dir() || options.recursive;
    let collected = if should_recurse {
        let file_options = ck_core::FileCollectionOptions {
            respect_gitignore: options.respect_gitignore,
            use_ckignore: options.use_ckignore,
            exclude_patterns: options.exclude_patterns.clone(),
        };
        ck_index::collect_files(&options.path, &file_options)?
    } else {
        collect_files(&options.path, should_recurse, &options.exclude_patterns)?
    };
    let files = filter_files_by_include(collected, &options.include_patterns);

    let per_file: Vec<Vec<SearchResult>> = files
        .par_iter()
        .filter_map(|file_path| match search_file(&regex, file_path, options) {
            Ok(matches) => (!matches.is_empty()).then_some(matches),
            Err(e) => {
                // Unreadable or non-text files are expected; skip them quietly.
                tracing::debug!("Error searching {:?}: {}", file_path, e);
                None
            }
        })
        .collect();

    let mut all_results: Vec<SearchResult> = per_file.into_iter().flatten().collect();
    // Stable sort: several matches on one line keep their left-to-right order.
    all_results.sort_by(|a, b| {
        a.file
            .cmp(&b.file)
            .then_with(|| a.span.line_start.cmp(&b.span.line_start))
    });
    if let Some(top_k) = options.top_k {
        all_results.truncate(top_k);
    }
    Ok(all_results)
}
/// Searches a single file with `regex`.
///
/// When full sections or surrounding context lines are requested the file is
/// loaded into memory so previews can span several lines; otherwise it is
/// scanned line by line from disk.
///
/// # Errors
///
/// Propagates failures from reading the file or from either search routine.
fn search_file(
    regex: &Regex,
    file_path: &Path,
    options: &SearchOptions,
) -> Result<Vec<SearchResult>> {
    let repo_root = match find_nearest_index_root(file_path) {
        Some(root) => root,
        None => file_path.parent().unwrap_or(file_path).to_path_buf(),
    };

    let needs_whole_file = options.full_section || options.context_lines > 0;
    if !needs_whole_file {
        return search_file_streaming(regex, file_path, &repo_root, options);
    }

    let content = read_file_content(file_path, &repo_root)?;
    let (lines, line_ending_lengths) = split_lines_with_endings(&content);
    // Section extraction is only needed for full-section previews.
    let code_sections = match options.full_section {
        true => extract_code_sections(file_path, &content),
        false => None,
    };

    search_file_in_memory(
        regex,
        file_path,
        options,
        &lines,
        &code_sections,
        &line_ending_lengths,
    )
}
/// Searches pre-split `lines` of a file and builds one result per match.
///
/// An empty pattern yields one result per line spanning the whole line;
/// otherwise every regex match on a line yields its own result. Byte offsets
/// are relative to the start of the file and account for the terminator
/// lengths in `line_ending_lengths`. The preview is the enclosing code
/// section when `options.full_section` is set and one is found, and the
/// context preview otherwise.
fn search_file_in_memory(
    regex: &Regex,
    file_path: &Path,
    options: &SearchOptions,
    lines: &[String],
    code_sections: &Option<Vec<(usize, usize, String)>>,
    line_ending_lengths: &[usize],
) -> Result<Vec<SearchResult>> {
    let match_every_line = regex.as_str().is_empty();

    // Preview text for a hit on the given zero-based line.
    let preview_for = |line_idx: usize| -> String {
        let section = if options.full_section {
            code_sections
                .as_deref()
                .and_then(|sections| find_containing_section(sections, line_idx))
        } else {
            None
        };
        match section {
            Some(text) => text.clone(),
            None => get_context_preview(lines, line_idx, options),
        }
    };

    let build_hit = |byte_start: usize, byte_end: usize, line_idx: usize| SearchResult {
        file: file_path.to_path_buf(),
        span: Span {
            byte_start,
            byte_end,
            line_start: line_idx + 1,
            line_end: line_idx + 1,
        },
        score: 1.0,
        preview: preview_for(line_idx),
        lang: ck_core::Language::from_path(file_path),
        symbol: None,
        chunk_hash: None,
        index_epoch: None,
    };

    let mut hits = Vec::new();
    let mut line_offset = 0usize;
    for (line_idx, line) in lines.iter().enumerate() {
        if match_every_line {
            hits.push(build_hit(line_offset, line_offset + line.len(), line_idx));
        } else {
            hits.extend(regex.find_iter(line).map(|found| {
                build_hit(
                    line_offset + found.start(),
                    line_offset + found.end(),
                    line_idx,
                )
            }));
        }
        line_offset += line.len() + line_ending_lengths.get(line_idx).copied().unwrap_or(0);
    }
    Ok(hits)
}
/// Searches a file line by line without loading it fully into memory.
///
/// Lines are read up to each `\n`; a trailing `\r\n`, `\n` or `\r` is
/// stripped, and any `\r` remaining inside the text is treated as a further
/// line break. Line numbers and byte offsets advance accordingly, counting
/// one byte per interior `\r` plus the stripped terminator.
///
/// # Errors
///
/// Fails when the content path cannot be resolved, or the file cannot be
/// opened or read.
fn search_file_streaming(
    regex: &Regex,
    file_path: &Path,
    repo_root: &Path,
    _options: &SearchOptions,
) -> Result<Vec<SearchResult>> {
    use std::io::{BufRead, BufReader};

    let content_path = resolve_content_path(file_path, repo_root)?;
    let mut reader = BufReader::new(std::fs::File::open(&content_path)?);

    let mut results = Vec::new();
    let mut buffer = String::new();
    let mut byte_offset = 0usize;
    let mut line_number = 1usize;

    loop {
        buffer.clear();
        if reader.read_line(&mut buffer)? == 0 {
            break;
        }

        let terminator_len = if buffer.ends_with("\r\n") {
            2
        } else if buffer.ends_with(['\n', '\r']) {
            1
        } else {
            0
        };
        buffer.truncate(buffer.len() - terminator_len);

        // Splitting on '\r' yields the whole text as a single segment when no
        // carriage return is present, so both cases share this loop.
        let mut segments = buffer.split('\r').peekable();
        while let Some(segment) = segments.next() {
            process_streaming_line(
                regex,
                file_path,
                segment,
                line_number,
                byte_offset,
                &mut results,
            );
            line_number += 1;
            byte_offset += segment.len();
            if segments.peek().is_some() {
                // Account for the '\r' that separated this segment from the next.
                byte_offset += 1;
            }
        }
        byte_offset += terminator_len;
    }

    Ok(results)
}
/// Appends the results for one line of a streamed file to `results`.
///
/// An empty pattern produces a single result covering the whole line;
/// otherwise each regex match produces one result. `byte_offset` is the
/// offset of the start of `line` within the file, and the preview is always
/// the line itself.
fn process_streaming_line(
    regex: &Regex,
    file_path: &Path,
    line: &str,
    line_number: usize,
    byte_offset: usize,
    results: &mut Vec<SearchResult>,
) {
    let hit = |byte_start: usize, byte_end: usize| SearchResult {
        file: file_path.to_path_buf(),
        span: Span {
            byte_start,
            byte_end,
            line_start: line_number,
            line_end: line_number,
        },
        score: 1.0,
        preview: line.to_string(),
        lang: ck_core::Language::from_path(file_path),
        symbol: None,
        chunk_hash: None,
        index_epoch: None,
    };

    if regex.as_str().is_empty() {
        results.push(hit(byte_offset, byte_offset + line.len()));
        return;
    }

    results.extend(
        regex
            .find_iter(line)
            .map(|found| hit(byte_offset + found.start(), byte_offset + found.end())),
    );
}
/// Full-text search over the tantivy index stored in `.ck/tantivy_index`.
///
/// The index root is the nearest ancestor of `options.path` containing
/// `.ck`. If the tantivy index is missing it is built (and searched) by
/// `build_tantivy_index`. At most `options.top_k` documents are fetched
/// (100 when unset). Scores are normalised by the highest raw score among
/// the kept documents, and results below `options.threshold` are dropped.
///
/// # Errors
///
/// Fails when no `.ck` directory exists, when the index cannot be opened or
/// read, or when the query cannot be parsed.
async fn lexical_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    let index_root = match find_nearest_index_root(&options.path) {
        Some(root) => root,
        None if options.path.is_file() => {
            options.path.parent().unwrap_or(&options.path).to_path_buf()
        }
        None => options.path.clone(),
    };

    let index_dir = index_root.join(".ck");
    if !index_dir.exists() {
        return Err(CkError::Index("No index found. Run 'ck index' first.".to_string()).into());
    }
    let tantivy_index_path = index_dir.join("tantivy_index");
    if !tantivy_index_path.exists() {
        return build_tantivy_index(options).await;
    }

    // Field handles are derived from a schema declared in the same order as
    // the one used when the index was built.
    let mut schema_builder = Schema::builder();
    let content_field = schema_builder.add_text_field("content", TEXT | STORED);
    let path_field = schema_builder.add_text_field("path", TEXT | STORED);
    let _schema = schema_builder.build();

    let index = Index::open_in_dir(&tantivy_index_path)
        .map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommitWithDelay)
        .try_into()
        .map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
    let searcher = reader.searcher();

    let query = QueryParser::for_index(&index, vec![content_field])
        .parse_query(&options.query)
        .map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;

    let limit = options.top_k.unwrap_or(100);
    let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;

    let mut scored = Vec::new();
    for (raw_score, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        let path_text = retrieved_doc
            .get_first(path_field)
            .map(|field_value| field_value.as_str().unwrap_or(""))
            .unwrap_or("");
        let content_text = retrieved_doc
            .get_first(content_field)
            .map(|field_value| field_value.as_str().unwrap_or(""))
            .unwrap_or("");

        let file_path = PathBuf::from(path_text);
        if !path_matches_include(&file_path, &options.include_patterns) {
            continue;
        }

        let preview = if options.full_section {
            content_text.to_string()
        } else {
            content_text.lines().take(3).collect::<Vec<_>>().join("\n")
        };
        let lang = ck_core::Language::from_path(&file_path);

        scored.push((
            raw_score,
            SearchResult {
                file: file_path,
                span: Span {
                    byte_start: 0,
                    byte_end: content_text.len(),
                    line_start: 1,
                    line_end: content_text.lines().count(),
                },
                score: raw_score,
                preview,
                lang,
                symbol: None,
                chunk_hash: None,
                index_epoch: None,
            },
        ));
    }

    let max_score = scored
        .iter()
        .map(|(score, _)| *score)
        .fold(0.0f32, f32::max);
    // Covers both "no documents" and "no positive score": nothing to report.
    if max_score <= 0.0 {
        return Ok(Vec::new());
    }

    let results = scored
        .into_iter()
        .filter_map(|(raw_score, mut hit)| {
            let normalized = raw_score / max_score;
            if options.threshold.is_some_and(|minimum| normalized < minimum) {
                return None;
            }
            hit.score = normalized;
            Some(hit)
        })
        .collect();
    Ok(results)
}
async fn build_tantivy_index(options: &SearchOptions) -> Result<Vec<SearchResult>> {
let index_root = if options.path.is_file() {
options.path.parent().unwrap_or(&options.path)
} else {
&options.path
};
let index_dir = index_root.join(".ck");
let tantivy_index_path = index_dir.join("tantivy_index");
fs::create_dir_all(&tantivy_index_path)?;
let mut schema_builder = Schema::builder();
let content_field = schema_builder.add_text_field("content", TEXT | STORED);
let path_field = schema_builder.add_text_field("path", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_dir(&tantivy_index_path, schema.clone())
.map_err(|e| CkError::Index(format!("Failed to create tantivy index: {}", e)))?;
let mut index_writer = index
.writer(50_000_000)
.map_err(|e| CkError::Index(format!("Failed to create index writer: {}", e)))?;
let files = filter_files_by_include(
collect_files(index_root, true, &options.exclude_patterns)?,
&options.include_patterns,
);
for file_path in &files {
if let Ok(content) = fs::read_to_string(file_path) {
let doc = doc!(
content_field => content,
path_field => file_path.display().to_string()
);
index_writer.add_document(doc)?;
}
}
index_writer
.commit()
.map_err(|e| CkError::Index(format!("Failed to commit index: {}", e)))?;
let tantivy_index_path = index_root.join(".ck").join("tantivy_index");
let mut schema_builder = Schema::builder();
let content_field = schema_builder.add_text_field("content", TEXT | STORED);
let path_field = schema_builder.add_text_field("path", TEXT | STORED);
let _schema = schema_builder.build();
let index = Index::open_in_dir(&tantivy_index_path)
.map_err(|e| CkError::Index(format!("Failed to open tantivy index: {}", e)))?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()
.map_err(|e| CkError::Index(format!("Failed to create index reader: {}", e)))?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![content_field]);
let query = query_parser
.parse_query(&options.query)
.map_err(|e| CkError::Search(format!("Failed to parse query: {}", e)))?;
let top_docs = if let Some(top_k) = options.top_k {
searcher.search(&query, &TopDocs::with_limit(top_k))?
} else {
searcher.search(&query, &TopDocs::with_limit(100))?
};
let mut raw_results = Vec::new();
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
let path_text = retrieved_doc
.get_first(path_field)
.map(|field_value| field_value.as_str().unwrap_or(""))
.unwrap_or("");
let content_text = retrieved_doc
.get_first(content_field)
.map(|field_value| field_value.as_str().unwrap_or(""))
.unwrap_or("");
let file_path = PathBuf::from(path_text);
let preview = if options.full_section {
content_text.to_string()
} else {
content_text.lines().take(3).collect::<Vec<_>>().join("\n")
};
raw_results.push((
_score,
SearchResult {
file: file_path,
span: Span {
byte_start: 0,
byte_end: content_text.len(),
line_start: 1,
line_end: content_text.lines().count(),
},
score: _score,
preview,
lang: ck_core::Language::from_path(&PathBuf::from(path_text)),
symbol: None,
chunk_hash: None,
index_epoch: None,
},
));
}
let mut results = Vec::new();
if !raw_results.is_empty() {
let max_score = raw_results
.iter()
.map(|(score, _)| *score)
.fold(0.0f32, f32::max);
if max_score > 0.0 {
for (raw_score, mut result) in raw_results {
let normalized_score = raw_score / max_score;
if let Some(threshold) = options.threshold
&& normalized_score < threshold
{
continue;
}
result.score = normalized_score;
results.push(result);
}
}
}
Ok(results)
}
/// Runs a hybrid search without progress reporting by delegating to
/// `hybrid_search_with_progress`.
// Marked dead_code-allowed: nothing in the visible part of this file calls it.
#[allow(dead_code)]
async fn hybrid_search(options: &SearchOptions) -> Result<Vec<SearchResult>> {
    hybrid_search_with_progress(options, None).await
}
/// Combines regex and semantic search results with reciprocal rank fusion.
///
/// Both searches are run; results are grouped by `file:line_start`, and each
/// group scores the sum of `1 / (60 + rank)` over the 1-based ranks at which
/// it appeared. The representative result for a group is its first
/// occurrence (the regex hit when both searches found the line). Groups
/// below `options.threshold` or outside `options.include_patterns` are
/// dropped, the rest are ordered by descending score and truncated to
/// `options.top_k`.
///
/// Ties in score are broken by file path and then line number so that the
/// output, and in particular which results survive `top_k` truncation, does
/// not depend on hash-map iteration order.
///
/// # Errors
///
/// Propagates failures from either underlying search.
async fn hybrid_search_with_progress(
    options: &SearchOptions,
    progress_callback: Option<SearchProgressCallback>,
) -> Result<Vec<SearchResult>> {
    if let Some(ref callback) = progress_callback {
        callback("Running regex search...");
    }
    let regex_results = regex_search(options)?;

    if let Some(ref callback) = progress_callback {
        callback("Running semantic search...");
    }
    let semantic_results = semantic_search_v3_with_progress(options, progress_callback).await?;

    // Regex hits are inserted first so they become each group's representative.
    let ranked = regex_results
        .into_iter()
        .enumerate()
        .chain(semantic_results.matches.into_iter().enumerate());

    let mut combined: HashMap<String, Vec<(usize, SearchResult)>> = HashMap::new();
    for (rank, result) in ranked {
        let key = format!("{}:{}", result.file.display(), result.span.line_start);
        combined.entry(key).or_default().push((rank + 1, result));
    }

    let mut rrf_results: Vec<SearchResult> = combined
        .into_values()
        .filter_map(|ranks| {
            let rrf_score: f32 = ranks
                .iter()
                .map(|(rank, _)| 1.0 / (60.0 + *rank as f32))
                .sum();
            let (_, mut result) = ranks.into_iter().next()?;
            result.score = rrf_score;
            Some(result)
        })
        .filter(|result| {
            options
                .threshold
                .is_none_or(|threshold| result.score >= threshold)
        })
        .filter(|result| path_matches_include(&result.file, &options.include_patterns))
        .collect();

    rrf_results.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.file.cmp(&b.file))
            .then_with(|| a.span.line_start.cmp(&b.span.line_start))
    });
    if let Some(top_k) = options.top_k {
        rrf_results.truncate(top_k);
    }
    Ok(rrf_results)
}
/// Compiles `patterns` into a [`GlobSet`].
///
/// Patterns that fail to parse are ignored, and an empty set is returned if
/// the set as a whole cannot be built.
fn build_globset(patterns: &[String]) -> GlobSet {
    let mut builder = GlobSetBuilder::new();
    patterns
        .iter()
        .filter_map(|pattern| Glob::new(pattern).ok())
        .for_each(|glob| {
            builder.add(glob);
        });

    match builder.build() {
        Ok(set) => set,
        Err(_) => GlobSet::empty(),
    }
}
fn should_exclude_path(path: &Path, globset: &GlobSet) -> bool {
if globset.is_match(path) {
return true;
}
for component in path.components() {
if let std::path::Component::Normal(name) = component
&& globset.is_match(name)
{
return true;
}
}
false
}
/// Lists the files to search under `path`.
///
/// A file path is returned as-is, without consulting the exclude patterns.
/// For a directory, `recursive` selects a full walk that prunes excluded
/// entries; otherwise only the directory's immediate files are listed.
/// Entries that cannot be read during traversal are logged at debug level
/// and skipped.
///
/// # Errors
///
/// Fails only in the non-recursive case, when the directory itself cannot
/// be read.
fn collect_files(
    path: &Path,
    recursive: bool,
    exclude_patterns: &[String],
) -> Result<Vec<PathBuf>> {
    if path.is_file() {
        return Ok(vec![path.to_path_buf()]);
    }

    let globset = build_globset(exclude_patterns);
    let mut found = Vec::new();

    if recursive {
        // Prune excluded directories early so they are never descended into.
        let walker = WalkDir::new(path).into_iter().filter_entry(|candidate| {
            !(globset.is_match(candidate.path()) || globset.is_match(candidate.file_name()))
        });
        for item in walker {
            let entry = match item {
                Ok(entry) => entry,
                Err(e) => {
                    tracing::debug!("Skipping path due to error: {}", e);
                    continue;
                }
            };
            if entry.file_type().is_file() && !should_exclude_path(entry.path(), &globset) {
                found.push(entry.path().to_path_buf());
            }
        }
        return Ok(found);
    }

    let listing = fs::read_dir(path).map_err(|e| {
        tracing::debug!("Cannot read directory {:?}: {}", path, e);
        e
    })?;
    for item in listing {
        let entry = match item {
            Ok(entry) => entry,
            Err(e) => {
                tracing::debug!("Skipping directory entry due to error: {}", e);
                continue;
            }
        };
        let child = entry.path();
        if child.is_file() && !should_exclude_path(&child, &globset) {
            found.push(child);
        }
    }
    Ok(found)
}
/// Brings the index covering `path` up to date before a search.
///
/// Without `force_reindex`, a single file is indexed on its own with
/// `ck_index::index_file`. In every other case the index root (the nearest
/// ancestor containing `.ck`, else the path or its parent) goes through the
/// smart update, forced or incremental according to `force_reindex`. A
/// summary is logged when the update changed anything.
///
/// # Errors
///
/// Propagates failures from the indexing routines.
async fn ensure_index_updated_with_progress(
    path: &Path,
    force_reindex: bool,
    need_embeddings: bool,
    progress_callback: Option<ck_index::ProgressCallback>,
    detailed_progress_callback: Option<ck_index::DetailedProgressCallback>,
    file_options: &ck_core::FileCollectionOptions,
    model_override: Option<&str>,
) -> Result<()> {
    let index_root = match find_nearest_index_root(path) {
        Some(root) => root,
        None if path.is_file() => path.parent().unwrap_or(path).to_path_buf(),
        None => path.to_path_buf(),
    };

    // A lone file only needs its own entry refreshed unless a rebuild was asked for.
    if !force_reindex && path.is_file() {
        ck_index::index_file(path, need_embeddings).await?;
        return Ok(());
    }

    let stats = ck_index::smart_update_index_with_detailed_progress(
        &index_root,
        force_reindex,
        progress_callback,
        detailed_progress_callback,
        need_embeddings,
        file_options,
        model_override,
    )
    .await?;

    let changed = stats.files_indexed > 0 || stats.orphaned_files_removed > 0;
    if changed {
        tracing::info!(
            "Index updated: {} files indexed, {} orphaned files removed",
            stats.files_indexed,
            stats.orphaned_files_removed
        );
    }
    Ok(())
}
/// Builds the preview for a match on `lines[line_idx]`.
///
/// The number of lines shown before and after the match is the larger of
/// the directional setting and `context_lines`. With no context requested
/// the matching line alone is returned; otherwise the window, clamped to
/// the file, is joined with `\n`.
///
/// # Panics
///
/// Panics if `line_idx` is not a valid index into `lines`.
fn get_context_preview(lines: &[String], line_idx: usize, options: &SearchOptions) -> String {
    let lead = options.before_context_lines.max(options.context_lines);
    let trail = options.after_context_lines.max(options.context_lines);

    if lead == 0 && trail == 0 {
        return lines[line_idx].clone();
    }

    let from = line_idx.saturating_sub(lead);
    let to = lines.len().min(line_idx + trail + 1);
    lines[from..to].join("\n")
}
/// Chunks `content` and returns its function, class and method chunks as
/// `(first_line, last_line, text)` tuples, where the line numbers are the
/// chunk's span lines minus one.
///
/// Returns `None` when the language cannot be determined from `file_path`,
/// when chunking fails, or when no such chunk exists.
fn extract_code_sections(file_path: &Path, content: &str) -> Option<Vec<(usize, usize, String)>> {
    let lang = ck_core::Language::from_path(file_path)?;
    let chunks = ck_chunk::chunk_text(content, Some(lang)).ok()?;

    let mut sections = Vec::new();
    for chunk in chunks {
        let is_section = matches!(
            chunk.chunk_type,
            ck_chunk::ChunkType::Function
                | ck_chunk::ChunkType::Class
                | ck_chunk::ChunkType::Method
        );
        if is_section {
            sections.push((
                chunk.span.line_start - 1,
                chunk.span.line_end - 1,
                chunk.text,
            ));
        }
    }

    if sections.is_empty() {
        None
    } else {
        Some(sections)
    }
}
/// Returns the text of the first section whose inclusive line range
/// contains `line_idx`, or `None` when no section covers it.
fn find_containing_section(
    sections: &[(usize, usize, String)],
    line_idx: usize,
) -> Option<&String> {
    sections
        .iter()
        .find(|(first, last, _)| (*first..=*last).contains(&line_idx))
        .map(|(_, _, text)| text)
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
fn create_test_files(dir: &std::path::Path) -> Vec<PathBuf> {
let files = vec![
("test1.txt", "hello world rust programming"),
("test2.rs", "fn main() { println!(\"Hello Rust\"); }"),
("test3.py", "print('Hello Python')"),
("test4.txt", "machine learning artificial intelligence"),
];
let mut paths = Vec::new();
for (name, content) in files {
let path = dir.join(name);
fs::write(&path, content).unwrap();
paths.push(path);
}
paths
}
#[test]
fn test_extract_lines_from_file() {
let temp_dir = TempDir::new().unwrap();
let test_file = temp_dir.path().join("test_lines.txt");
let content =
"Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10";
fs::write(&test_file, content).unwrap();
let result = extract_lines_from_file(&test_file, 3, 5).unwrap();
assert_eq!(result, "Line 3\nLine 4\nLine 5");
let result = extract_lines_from_file(&test_file, 7, 7).unwrap();
assert_eq!(result, "Line 7");
let result = extract_lines_from_file(&test_file, 8, 100).unwrap();
assert_eq!(result, "Line 8\nLine 9\nLine 10");
let result = extract_lines_from_file(&test_file, 0, 5).unwrap();
assert_eq!(result, "");
let result = extract_lines_from_file(&test_file, 20, 25).unwrap();
assert_eq!(result, "");
}
#[tokio::test]
async fn test_extract_content_from_span() {
let temp_dir = TempDir::new().unwrap();
let test_file = temp_dir.path().join("code.rs");
let content = "fn first() {\n println!(\"First\");\n}\n\nfn second() {\n println!(\"Second\");\n}\n\nfn third() {\n println!(\"Third\");\n}";
fs::write(&test_file, content).unwrap();
let span = ck_core::Span {
byte_start: 0, byte_end: 0, line_start: 5,
line_end: 7,
};
let result = extract_content_from_span(&test_file, &span).await.unwrap();
assert_eq!(result, "fn second() {\n println!(\"Second\");\n}");
let span = ck_core::Span {
byte_start: 0,
byte_end: 0,
line_start: 2,
line_end: 2,
};
let result = extract_content_from_span(&test_file, &span).await.unwrap();
assert_eq!(result, " println!(\"First\");");
}
#[test]
fn test_collect_files() {
let temp_dir = TempDir::new().unwrap();
let test_files = create_test_files(temp_dir.path());
let files = collect_files(temp_dir.path(), false, &[]).unwrap();
assert_eq!(files.len(), 4);
let files = collect_files(temp_dir.path(), true, &[]).unwrap();
assert_eq!(files.len(), 4);
let files = collect_files(&test_files[0], false, &[]).unwrap();
assert_eq!(files.len(), 1);
assert_eq!(files[0], test_files[0]);
}
#[test]
fn test_regex_search() {
let temp_dir = TempDir::new().unwrap();
create_test_files(temp_dir.path());
let options = SearchOptions {
mode: SearchMode::Regex,
query: "rust".to_string(),
path: temp_dir.path().to_path_buf(),
recursive: true,
..Default::default()
};
let results = regex_search(&options).unwrap();
assert!(!results.is_empty());
let rust_matches: Vec<_> = results
.iter()
.filter(|r| r.preview.to_lowercase().contains("rust"))
.collect();
assert!(!rust_matches.is_empty());
}
#[test]
fn test_regex_search_case_insensitive() {
let temp_dir = TempDir::new().unwrap();
create_test_files(temp_dir.path());
let options = SearchOptions {
mode: SearchMode::Regex,
query: "HELLO".to_string(),
path: temp_dir.path().to_path_buf(),
recursive: true,
case_insensitive: true,
..Default::default()
};
let results = regex_search(&options).unwrap();
assert!(!results.is_empty());
}
/// Searches the fixture tree for "fn main()" with `fixed_string` enabled and
/// expects at least one result. The query contains parentheses, which would
/// be regex metacharacters if it were interpreted as a pattern.
#[test]
fn test_regex_search_fixed_string() {
    let workspace = TempDir::new().unwrap();
    create_test_files(workspace.path());

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("fn main()"),
        path: workspace.path().to_path_buf(),
        recursive: true,
        fixed_string: true,
        ..Default::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(!hits.is_empty());
}
/// Searches a one-line file containing "rust rusty rustacean" for "rust" with
/// `whole_word` enabled and expects at least one result.
///
/// NOTE(review): the assertion only checks that something matched; it does
/// not verify that "rusty" and "rustacean" are excluded.
#[test]
fn test_regex_search_whole_word() {
    let workspace = TempDir::new().unwrap();
    let sample = workspace.path().join("word_test.txt");
    fs::write(sample, "rust rusty rustacean").unwrap();

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("rust"),
        path: workspace.path().to_path_buf(),
        recursive: true,
        whole_word: true,
        ..Default::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(!hits.is_empty());
}
/// Writes ten files that all contain "test", searches with `top_k` set to 5,
/// and checks that no more than five results are returned.
///
/// NOTE(review): the bound is `<= 5`, so the test would also pass if fewer
/// than five results came back.
#[test]
fn test_regex_search_top_k() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path();

    (0..10).for_each(|index| {
        let target = root.join(format!("file{}.txt", index));
        fs::write(target, "test content").unwrap();
    });

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("test"),
        path: root.to_path_buf(),
        recursive: true,
        top_k: Some(5),
        ..Default::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert!(hits.len() <= 5);
}
/// Verifies the byte offsets reported for multiple matches in one file.
///
/// The file is "test test test\nline two test\ntest end": three matches on
/// line 1 at bytes 0, 5 and 10, one match on line 2 at byte 24 (15 bytes of
/// line 1 including its newline, plus 9 bytes of "line two "), and five
/// matches overall, each with a distinct `byte_start`.
#[test]
fn test_regex_search_span_offsets() {
    let workspace = TempDir::new().unwrap();
    let sample = workspace.path().join("spans.txt");
    fs::write(&sample, "test test test\nline two test\ntest end").unwrap();

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("test"),
        path: sample.clone(),
        recursive: false,
        ..Default::default()
    };

    let hits = regex_search(&opts).unwrap();
    assert_eq!(hits.len(), 5);

    // Start offsets of every hit on a given line, in result order.
    let starts_on_line = |line: usize| -> Vec<_> {
        hits.iter()
            .filter(|hit| hit.span.line_start == line)
            .map(|hit| hit.span.byte_start)
            .collect()
    };
    assert_eq!(starts_on_line(1), vec![0, 5, 10]);
    assert_eq!(starts_on_line(2), vec![24]);

    // Every hit must carry its own offset; duplicates would shrink the set.
    let distinct_starts: std::collections::BTreeSet<_> =
        hits.iter().map(|hit| hit.span.byte_start).collect();
    assert_eq!(distinct_starts.len(), 5);
}
/// Calls `search_file` directly on a three-line file and checks that the
/// pattern "rust" yields exactly one result, located on line 3, whose preview
/// contains the matched text.
#[test]
fn test_search_file() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.txt");
    fs::write(
        &file_path,
        "line 1: hello\nline 2: world\nline 3: rust programming",
    )
    .unwrap();

    let regex = regex::Regex::new("rust").unwrap();
    let options = SearchOptions::default();

    // The compiled pattern is passed by reference, like the path and options.
    let results = search_file(&regex, &file_path, &options).unwrap();

    assert_eq!(results.len(), 1);
    assert_eq!(results[0].span.line_start, 3);
    assert!(results[0].preview.contains("rust"));
}
/// Calls `search_file` with `context_lines` set to 1 and checks that the
/// single result's preview includes the matching line together with the line
/// before it and the line after it.
#[test]
fn test_search_file_with_context() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.txt");
    fs::write(&file_path, "line 1\nline 2\ntarget line\nline 4\nline 5").unwrap();

    let regex = regex::Regex::new("target").unwrap();
    let options = SearchOptions {
        context_lines: 1,
        ..Default::default()
    };

    // The compiled pattern is passed by reference, like the path and options.
    let results = search_file(&regex, &file_path, &options).unwrap();
    assert_eq!(results.len(), 1);

    // Report the preview in the failure message instead of printing it on
    // every run.
    let preview = &results[0].preview;
    for expected in ["line 2", "target line", "line 4"] {
        assert!(
            preview.contains(expected),
            "preview {:?} is missing {:?}",
            preview,
            expected
        );
    }
}
/// Drives the async `search` entry point in regex mode over the fixture tree
/// with a case-insensitive "hello" query and expects at least one result.
#[tokio::test]
async fn test_search_main_function() {
    let workspace = TempDir::new().unwrap();
    create_test_files(workspace.path());

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("hello"),
        path: workspace.path().to_path_buf(),
        recursive: true,
        case_insensitive: true,
        ..Default::default()
    };

    let hits = search(&opts).await.unwrap();
    assert!(!hits.is_empty());
}
/// Searches a file that mixes `\r\n` and `\n` terminators and checks that the
/// single match is reported on line 4 with a `byte_start` equal to the
/// pattern's offset in the file as read back from disk.
#[tokio::test]
async fn test_regex_search_mixed_line_endings() {
    let workspace = TempDir::new().unwrap();
    let sample = workspace.path().join("mixed_endings.txt");
    let body = "line1\r\nline2\nline3\r\npattern here\nline5\r\n";
    std::fs::write(&sample, body).unwrap();

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("pattern"),
        path: sample.clone(),
        recursive: false,
        ..Default::default()
    };

    let hits = search(&opts).await.unwrap();
    assert_eq!(hits.len(), 1);

    // The expected offset is derived from the bytes actually on disk.
    let on_disk = std::fs::read_to_string(&sample).unwrap();
    let expected_offset = on_disk.find("pattern").unwrap();

    let span = &hits[0].span;
    assert_eq!(span.byte_start, expected_offset);
    assert_eq!(span.line_start, 4);
}
/// Searches a file whose lines all end in `\r\n` and checks that the single
/// match is reported on line 3 at byte 25.
///
/// The offset counts both terminator bytes of the preceding lines:
/// "first line\r\n" is 12 bytes and "second line\r\n" is 13 bytes.
#[tokio::test]
async fn test_regex_search_windows_line_endings() {
    let workspace = TempDir::new().unwrap();
    let sample = workspace.path().join("windows_endings.txt");
    let body = "first line\r\nsecond line\r\nmatch this\r\nfourth line\r\n";
    std::fs::write(&sample, body).unwrap();

    let opts = SearchOptions {
        mode: SearchMode::Regex,
        query: String::from("match"),
        path: sample.clone(),
        recursive: false,
        ..Default::default()
    };

    let hits = search(&opts).await.unwrap();
    assert_eq!(hits.len(), 1);

    let span = &hits[0].span;
    assert_eq!(span.line_start, 3);

    let expected_byte_start = 25;
    assert_eq!(span.byte_start, expected_byte_start);
}
/// Table-driven check of `split_lines_with_endings`: for each input the
/// helper must return the lines without their terminators, plus one
/// terminator length per line (1 for `\n` or `\r`, 2 for `\r\n`, 0 when the
/// final line has no terminator).
#[test]
fn test_split_lines_with_endings_helper() {
    // (input, expected lines, expected terminator lengths)
    let cases = [
        // Unix
        (
            "line1\nline2\nline3\n",
            vec!["line1", "line2", "line3"],
            vec![1, 1, 1],
        ),
        // Windows
        (
            "line1\r\nline2\r\nline3\r\n",
            vec!["line1", "line2", "line3"],
            vec![2, 2, 2],
        ),
        // Bare carriage returns
        (
            "line1\rline2\rline3\r",
            vec!["line1", "line2", "line3"],
            vec![1, 1, 1],
        ),
        // Mixed terminators
        (
            "line1\nline2\r\nline3\r",
            vec!["line1", "line2", "line3"],
            vec![1, 2, 1],
        ),
        // No terminator at all
        ("single line", vec!["single line"], vec![0]),
    ];

    for (input, want_lines, want_endings) in &cases {
        let (got_lines, got_endings) = split_lines_with_endings(input);
        assert_eq!(got_lines, *want_lines);
        assert_eq!(got_endings, *want_endings);
    }
}
/// Semantic search started from a subdirectory must respect a `.ckignore`
/// that lives in the parent directory.
///
/// The parent holds a `.ckignore` with `*.tmp`, and both the parent and the
/// subdirectory contain a `.txt` and a `.tmp` file. A semantic search is run
/// on the parent first (its outcome is discarded; presumably this builds the
/// index — confirm against `search`), followed by a 100 ms pause, and then
/// the subdirectory is searched. The subdirectory results must contain no
/// `.tmp` files, must include `nested.txt`, and no `.ck` directory may exist
/// inside the subdirectory afterwards.
#[cfg(feature = "fastembed")]
#[tokio::test]
async fn test_subdirectory_search_uses_parent_ckignore() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path();
    let subdir = root.join("subproject");
    fs::create_dir(&subdir).unwrap();

    // Fixture files, written in this order.
    let fixtures = [
        (root.join(".ckignore"), "*.tmp\n"),
        (root.join("parent.txt"), "searchable content in parent"),
        (root.join("ignored.tmp"), "this should not be indexed"),
        (subdir.join("nested.txt"), "searchable content in subdir"),
        (
            subdir.join("also_ignored.tmp"),
            "this should not be indexed either",
        ),
    ];
    for (target, body) in &fixtures {
        fs::write(target, body).unwrap();
    }

    // First pass over the parent; the result is intentionally ignored.
    let root_query = SearchOptions {
        mode: SearchMode::Semantic,
        query: String::from("searchable"),
        path: root.to_path_buf(),
        top_k: Some(10),
        threshold: Some(0.1),
        ..Default::default()
    };
    let _ = search(&root_query).await;
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    // Second pass, rooted at the subdirectory.
    let subdir_query = SearchOptions {
        mode: SearchMode::Semantic,
        query: String::from("content"),
        path: subdir.clone(),
        top_k: Some(10),
        threshold: Some(0.1),
        ..Default::default()
    };
    let hits = search(&subdir_query).await.unwrap();

    let tmp_paths: Vec<_> = hits
        .iter()
        .map(|hit| &hit.file)
        .filter(|file| file.to_string_lossy().ends_with(".tmp"))
        .collect();
    assert!(
        tmp_paths.is_empty(),
        "Bug: .tmp files were indexed despite parent .ckignore. Found {} .tmp files: {:?}",
        tmp_paths.len(),
        tmp_paths
    );

    let found_nested = hits.iter().any(|hit| hit.file.ends_with("nested.txt"));
    assert!(found_nested, "Should find nested.txt in subdirectory");

    let local_index = subdir.join(".ck");
    assert!(
        !local_index.exists(),
        "Should not create .ck directory in subdirectory"
    );
}
/// `.ckignore` files at several directory levels must all apply to a search
/// started from the deepest directory.
///
/// Three nested directories each carry their own `.ckignore` (`*.log` at the
/// root, `*.tmp` in `subdir`, `*.cache` in `subdir/deeper`). A semantic
/// search is run on the root first (its outcome is discarded; presumably this
/// builds the index — confirm against `search`), followed by a 100 ms pause,
/// and then `deeper` is searched. The results must contain no `.log`, `.tmp`
/// or `.cache` files and at least one `.txt` file.
#[cfg(feature = "fastembed")]
#[tokio::test]
async fn test_multiple_ckignore_files_merge_correctly() {
    use std::fs;
    use tempfile::TempDir;

    let workspace = TempDir::new().unwrap();
    let root = workspace.path();
    let subdir = root.join("subdir");
    let deeper = subdir.join("deeper");
    for dir in [&subdir, &deeper] {
        fs::create_dir(dir).unwrap();
    }

    // One ignore file per level, each with a different pattern.
    let ignore_files = [
        (root.join(".ckignore"), "*.log\n"),
        (subdir.join(".ckignore"), "*.tmp\n"),
        (deeper.join(".ckignore"), "*.cache\n"),
    ];
    for (target, pattern) in &ignore_files {
        fs::write(target, pattern).unwrap();
    }

    // Content files, written in this order.
    let content_files = [
        (root.join("root.txt"), "searchable"),
        (root.join("root.log"), "should be ignored"),
        (subdir.join("mid.txt"), "searchable"),
        (subdir.join("mid.log"), "should be ignored by parent"),
        (subdir.join("mid.tmp"), "should be ignored by local"),
        (deeper.join("deep.txt"), "searchable"),
        (deeper.join("deep.log"), "should be ignored by grandparent"),
        (deeper.join("deep.tmp"), "should be ignored by parent"),
        (deeper.join("deep.cache"), "should be ignored by local"),
    ];
    for (target, body) in &content_files {
        fs::write(target, body).unwrap();
    }

    // First pass over the root; the result is intentionally ignored.
    let root_query = SearchOptions {
        mode: SearchMode::Semantic,
        query: String::from("searchable"),
        path: root.to_path_buf(),
        top_k: Some(20),
        threshold: Some(0.1),
        ..Default::default()
    };
    let _ = search(&root_query).await;
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    // Second pass, rooted at the deepest directory.
    let deeper_query = SearchOptions {
        mode: SearchMode::Semantic,
        query: String::from("ignored"),
        path: deeper.clone(),
        top_k: Some(20),
        threshold: Some(0.1),
        ..Default::default()
    };
    let hits = search(&deeper_query).await.unwrap();

    // True when any returned file path ends with the given suffix.
    let any_with_suffix = |suffix: &str| {
        hits.iter()
            .any(|hit| hit.file.to_string_lossy().ends_with(suffix))
    };

    let has_log = any_with_suffix(".log");
    let has_tmp = any_with_suffix(".tmp");
    let has_cache = any_with_suffix(".cache");
    assert!(
        !has_log,
        "*.log files should be excluded by parent .ckignore"
    );
    assert!(
        !has_tmp,
        "*.tmp files should be excluded by subdir .ckignore"
    );
    assert!(
        !has_cache,
        "*.cache files should be excluded by deeper .ckignore"
    );

    let has_txt = any_with_suffix(".txt");
    assert!(has_txt, "Should find .txt files (not ignored)");
}
}