Skip to main content

ck_engine/
semantic_v3.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchOptions, SearchResult};
3use std::path::Path;
4use walkdir::WalkDir;
5
6use super::{
7    SearchProgressCallback, extract_content_from_span, find_nearest_index_root,
8    resolve_model_from_root,
9};
10
11/// New semantic search implementation using span-based storage
12pub async fn semantic_search_v3(options: &SearchOptions) -> Result<ck_core::SearchResults> {
13    semantic_search_v3_with_progress(options, None).await
14}
15
16pub async fn semantic_search_v3_with_progress(
17    options: &SearchOptions,
18    progress_callback: Option<SearchProgressCallback>,
19) -> Result<ck_core::SearchResults> {
20    // Find the index root
21    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
22        if options.path.is_file() {
23            options.path.parent().unwrap_or(&options.path).to_path_buf()
24        } else {
25            options.path.clone()
26        }
27    });
28
29    let index_dir = index_root.join(".ck");
30    if !index_dir.exists() {
31        return Err(CkError::Index(
32            "Index creation failed. Please try running 'ck --index' explicitly.".to_string(),
33        )
34        .into());
35    }
36
37    if let Some(ref callback) = progress_callback {
38        callback("Loading embeddings from sidecar files...");
39    }
40
41    // Collect all sidecar files and their embeddings
42    let mut file_chunks: Vec<(std::path::PathBuf, ck_index::ChunkEntry)> = Vec::new();
43
44    for entry in WalkDir::new(&index_dir) {
45        let entry = entry?;
46        if entry.file_type().is_file() {
47            let path = entry.path();
48            if path.extension().and_then(|s| s.to_str()) == Some("ck") {
49                // Load the sidecar file
50                if let Ok(index_entry) = ck_index::load_index_entry(path) {
51                    let original_file = reconstruct_original_path(path, &index_dir, &index_root);
52                    if let Some(original_file) = original_file {
53                        if !super::path_matches_include(&original_file, &options.include_patterns) {
54                            continue;
55                        }
56                        for chunk in index_entry.chunks {
57                            if chunk.embedding.is_some() {
58                                file_chunks.push((original_file.clone(), chunk));
59                            }
60                        }
61                    }
62                }
63            }
64        }
65    }
66
67    if file_chunks.is_empty() {
68        return Err(CkError::Index(
69            "No embeddings found. Run 'ck --index' first with embeddings.".to_string(),
70        )
71        .into());
72    }
73
74    if let Some(ref callback) = progress_callback {
75        callback(&format!(
76            "Found {} chunks with embeddings",
77            file_chunks.len()
78        ));
79    }
80
81    // Create embedder and embed the query
82    if let Some(ref callback) = progress_callback {
83        callback("Loading embedding model...");
84    }
85
86    let resolved_model = resolve_model_from_root(&index_root, options.embedding_model.as_deref())?;
87    if let Some(ref callback) = progress_callback {
88        if resolved_model.alias == resolved_model.canonical_name() {
89            callback(&format!(
90                "Using embedding model {} ({} dims)",
91                resolved_model.canonical_name(),
92                resolved_model.dimensions()
93            ));
94        } else {
95            callback(&format!(
96                "Using embedding model {} (alias '{}', {} dims)",
97                resolved_model.canonical_name(),
98                resolved_model.alias,
99                resolved_model.dimensions()
100            ));
101        }
102    }
103
104    let mut embedder = ck_embed::create_embedder_for_config(&resolved_model.config, None)?;
105    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
106
107    if query_embeddings.is_empty() {
108        return Ok(ck_core::SearchResults {
109            matches: Vec::new(),
110            closest_below_threshold: None,
111        });
112    }
113
114    let query_embedding = &query_embeddings[0];
115
116    if let Some(ref callback) = progress_callback {
117        callback("Computing similarity scores...");
118    }
119
120    // Compute similarities
121    let mut similarities: Vec<(f32, &std::path::PathBuf, &ck_index::ChunkEntry)> = Vec::new();
122
123    for (file_path, chunk) in &file_chunks {
124        if let Some(ref embedding) = chunk.embedding {
125            let similarity = cosine_similarity(query_embedding, embedding);
126            similarities.push((similarity, file_path, chunk));
127        }
128    }
129
130    // Sort by similarity (highest first)
131    similarities.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
132
133    // Apply threshold and top_k filtering
134    let mut results = Vec::new();
135    let mut closest_below_threshold: Option<SearchResult> = None;
136    let limit = options.top_k.unwrap_or(similarities.len());
137
138    for (similarity, file_path, chunk) in similarities.into_iter().take(limit) {
139        let is_below_threshold = options
140            .threshold
141            .is_some_and(|threshold| similarity < threshold);
142
143        // Check if we're filtering by a specific file or directory (apply to both above/below threshold)
144        let passes_path_filter = if options.path.is_file() {
145            let target_file = options
146                .path
147                .canonicalize()
148                .unwrap_or_else(|_| options.path.clone());
149            let result_file = file_path
150                .canonicalize()
151                .unwrap_or_else(|_| file_path.clone());
152            result_file == target_file
153        } else if options.path != Path::new(".") {
154            // Filter by directory path - only include files within the specified directory
155            let target_dir = options
156                .path
157                .canonicalize()
158                .unwrap_or_else(|_| options.path.clone());
159            let result_file = file_path
160                .canonicalize()
161                .unwrap_or_else(|_| file_path.clone());
162            result_file.starts_with(&target_dir)
163        } else {
164            true
165        };
166
167        if !passes_path_filter {
168            continue;
169        }
170
171        // Extract content from the file using the span, skip if file doesn't exist
172        let content = if options.full_section {
173            match extract_content_from_span(file_path, &chunk.span).await {
174                Ok(content) => content,
175                Err(_) => {
176                    // Skip files that no longer exist (stale index entries)
177                    continue;
178                }
179            }
180        } else {
181            match extract_content_from_span(file_path, &chunk.span).await {
182                Ok(full_content) => {
183                    // Take first 3 lines for preview
184                    full_content.lines().take(3).collect::<Vec<_>>().join("\n")
185                }
186                Err(_) => {
187                    // Skip files that no longer exist (stale index entries)
188                    continue;
189                }
190            }
191        };
192
193        let search_result = SearchResult {
194            file: file_path.clone(),
195            span: chunk.span.clone(),
196            score: similarity,
197            preview: content,
198            lang: ck_core::Language::from_path(file_path),
199            symbol: None,
200            chunk_hash: None,
201            index_epoch: None,
202        };
203
204        if is_below_threshold {
205            // Track the closest below-threshold result (first one since sorted by highest first)
206            if closest_below_threshold.is_none() {
207                closest_below_threshold = Some(search_result);
208            }
209        } else {
210            // Add to main results if above threshold
211            results.push(search_result);
212        }
213    }
214
215    // Apply reranking if enabled
216    if options.rerank && !results.is_empty() {
217        if let Some(ref callback) = progress_callback {
218            callback("Reranking results for improved relevance...");
219        }
220
221        let rerank_registry = ck_models::RerankModelRegistry::default();
222        let (rerank_alias, rerank_config) = rerank_registry
223            .resolve(options.rerank_model.as_deref())
224            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
225
226        match ck_embed::create_reranker_for_config(&rerank_config, None) {
227            Ok(mut reranker) => {
228                if let Some(ref callback) = progress_callback {
229                    callback(&format!("Reranking results with model {}", rerank_alias));
230                }
231
232                let documents: Vec<String> = results.iter().map(|r| r.preview.clone()).collect();
233
234                match reranker.rerank(&options.query, &documents) {
235                    Ok(rerank_results) => {
236                        // Create a map from document text to indices for handling duplicates
237                        let mut doc_to_indices: std::collections::HashMap<String, Vec<usize>> =
238                            std::collections::HashMap::new();
239                        for (i, result) in results.iter().enumerate() {
240                            doc_to_indices
241                                .entry(result.preview.clone())
242                                .or_default()
243                                .push(i);
244                        }
245
246                        // Update results with reranked scores
247                        // The reranker returns results in reranked order, so we match by document text
248                        for rerank_result in rerank_results.iter() {
249                            if let Some(indices) = doc_to_indices.get_mut(&rerank_result.document)
250                                && let Some(idx) = indices.pop()
251                            {
252                                results[idx].score = rerank_result.score;
253                            }
254                        }
255
256                        // Re-sort by reranked scores
257                        results.sort_by(|a, b| {
258                            b.score
259                                .partial_cmp(&a.score)
260                                .unwrap_or(std::cmp::Ordering::Equal)
261                        });
262
263                        // Apply top_k limit again after reranking
264                        if let Some(limit) = options.top_k {
265                            results.truncate(limit);
266                        }
267                    }
268                    Err(e) => {
269                        tracing::warn!("Reranking failed, using original scores: {}", e);
270                    }
271                }
272            }
273            Err(e) => {
274                tracing::warn!("Failed to create reranker, using original scores: {}", e);
275            }
276        }
277    }
278
279    Ok(ck_core::SearchResults {
280        matches: results,
281        closest_below_threshold,
282    })
283}
284
/// Maps a sidecar file inside the index directory back to the source file it describes.
///
/// The sidecar's location relative to `index_dir` is taken, its final extension is
/// removed, and — if the remaining file name still ends in `.ck` — that suffix is
/// removed as well. The result is then anchored at `repo_root`.
///
/// Returns `None` when `sidecar_path` is not located under `index_dir`.
fn reconstruct_original_path(
    sidecar_path: &Path,
    index_dir: &Path,
    repo_root: &Path,
) -> Option<std::path::PathBuf> {
    // Relative location within the index, minus the trailing extension.
    let without_extension = sidecar_path
        .strip_prefix(index_dir)
        .ok()?
        .with_extension("");

    // A file name that still carries a `.ck` suffix gets that suffix trimmed too.
    let trimmed_name = without_extension
        .file_name()
        .and_then(|name| name.to_string_lossy().strip_suffix(".ck").map(str::to_owned));

    let source_relative = match trimmed_name {
        Some(name) => without_extension.with_file_name(name),
        None => without_extension,
    };

    Some(repo_root.join(source_relative))
}
306
/// Computes the cosine similarity between two vectors.
///
/// Returns `0.0` when the slices differ in length or when either vector has a
/// Euclidean norm of zero (which includes empty input); otherwise returns the dot
/// product divided by the product of the two norms.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Euclidean length of a vector.
    let magnitude = |v: &[f32]| -> f32 { v.iter().map(|c| c * c).sum::<f32>().sqrt() };
    let (magnitude_a, magnitude_b) = (magnitude(a), magnitude(b));

    // A zero-length vector has no direction; avoid dividing by zero.
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(lhs, rhs)| lhs * rhs).sum();
    dot / (magnitude_a * magnitude_b)
}