Skip to main content

ck_engine/
semantic_v3.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchOptions, SearchResult};
3use std::path::Path;
4use walkdir::WalkDir;
5
6use super::{
7    SearchProgressCallback, extract_content_from_span, find_nearest_index_root,
8    resolve_model_from_root,
9};
10
11/// New semantic search implementation using span-based storage
12pub async fn semantic_search_v3(options: &SearchOptions) -> Result<ck_core::SearchResults> {
13    semantic_search_v3_with_progress(options, None).await
14}
15
16pub async fn semantic_search_v3_with_progress(
17    options: &SearchOptions,
18    progress_callback: Option<SearchProgressCallback>,
19) -> Result<ck_core::SearchResults> {
20    // Find the index root
21    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
22        if options.path.is_file() {
23            options.path.parent().unwrap_or(&options.path).to_path_buf()
24        } else {
25            options.path.clone()
26        }
27    });
28
29    let index_dir = index_root.join(".ck");
30    if !index_dir.exists() {
31        return Err(CkError::Index(
32            "Index creation failed. Please try running 'ck --index' explicitly.".to_string(),
33        )
34        .into());
35    }
36
37    if let Some(ref callback) = progress_callback {
38        callback("Loading embeddings from sidecar files...");
39    }
40
41    // Build the path scope filter once, up front. Previously this was
42    // applied AFTER top_k inside the iteration loop, so a whole-codebase
43    // index plus a narrow `path=` query could return zero matches when
44    // the global top_k results all lived outside the requested scope.
45    // Filtering at collection time fixes that and skips embedding loads
46    // for chunks we'd discard anyway.
47    let scope = PathScope::new(&options.path);
48
49    // Collect all sidecar files and their embeddings
50    let mut file_chunks: Vec<(std::path::PathBuf, ck_index::ChunkEntry)> = Vec::new();
51
52    for entry in WalkDir::new(&index_dir) {
53        let entry = entry?;
54        if entry.file_type().is_file() {
55            let path = entry.path();
56            if path.extension().and_then(|s| s.to_str()) == Some("ck") {
57                // Load the sidecar file
58                if let Ok(index_entry) = ck_index::load_index_entry(path) {
59                    let original_file = reconstruct_original_path(path, &index_dir, &index_root);
60                    if let Some(original_file) = original_file {
61                        if !super::path_matches_include(&original_file, &options.include_patterns) {
62                            continue;
63                        }
64                        if !scope.contains(&original_file) {
65                            continue;
66                        }
67                        for chunk in index_entry.chunks {
68                            if chunk.embedding.is_some() {
69                                file_chunks.push((original_file.clone(), chunk));
70                            }
71                        }
72                    }
73                }
74            }
75        }
76    }
77
78    if file_chunks.is_empty() {
79        return Err(CkError::Index(
80            "No embeddings found. Run 'ck --index' first with embeddings.".to_string(),
81        )
82        .into());
83    }
84
85    if let Some(ref callback) = progress_callback {
86        callback(&format!(
87            "Found {} chunks with embeddings",
88            file_chunks.len()
89        ));
90    }
91
92    // Create embedder and embed the query
93    if let Some(ref callback) = progress_callback {
94        callback("Loading embedding model...");
95    }
96
97    let resolved_model = resolve_model_from_root(&index_root, options.embedding_model.as_deref())?;
98    if let Some(ref callback) = progress_callback {
99        if resolved_model.alias == resolved_model.canonical_name() {
100            callback(&format!(
101                "Using embedding model {} ({} dims)",
102                resolved_model.canonical_name(),
103                resolved_model.dimensions()
104            ));
105        } else {
106            callback(&format!(
107                "Using embedding model {} (alias '{}', {} dims)",
108                resolved_model.canonical_name(),
109                resolved_model.alias,
110                resolved_model.dimensions()
111            ));
112        }
113    }
114
115    let mut embedder = ck_embed::create_embedder_for_config(&resolved_model.config, None)?;
116    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
117
118    if query_embeddings.is_empty() {
119        return Ok(ck_core::SearchResults {
120            matches: Vec::new(),
121            closest_below_threshold: None,
122        });
123    }
124
125    let query_embedding = &query_embeddings[0];
126
127    if let Some(ref callback) = progress_callback {
128        callback("Computing similarity scores...");
129    }
130
131    // Compute similarities
132    let mut similarities: Vec<(f32, &std::path::PathBuf, &ck_index::ChunkEntry)> = Vec::new();
133
134    for (file_path, chunk) in &file_chunks {
135        if let Some(ref embedding) = chunk.embedding {
136            let similarity = cosine_similarity(query_embedding, embedding);
137            similarities.push((similarity, file_path, chunk));
138        }
139    }
140
141    // Sort by similarity (highest first)
142    similarities.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
143
144    // Apply threshold and top_k filtering
145    let mut results = Vec::new();
146    let mut closest_below_threshold: Option<SearchResult> = None;
147    let limit = options.top_k.unwrap_or(similarities.len());
148
149    for (similarity, file_path, chunk) in similarities.into_iter().take(limit) {
150        let is_below_threshold = options
151            .threshold
152            .is_some_and(|threshold| similarity < threshold);
153
154        // Extract content from the file using the span, skip if file doesn't exist
155        let content = if options.full_section {
156            match extract_content_from_span(file_path, &chunk.span).await {
157                Ok(content) => content,
158                Err(_) => {
159                    // Skip files that no longer exist (stale index entries)
160                    continue;
161                }
162            }
163        } else {
164            match extract_content_from_span(file_path, &chunk.span).await {
165                Ok(full_content) => {
166                    // Take first 3 lines for preview
167                    full_content.lines().take(3).collect::<Vec<_>>().join("\n")
168                }
169                Err(_) => {
170                    // Skip files that no longer exist (stale index entries)
171                    continue;
172                }
173            }
174        };
175
176        let search_result = SearchResult {
177            file: file_path.clone(),
178            span: chunk.span.clone(),
179            score: similarity,
180            preview: content,
181            lang: ck_core::Language::from_path(file_path),
182            symbol: None,
183            chunk_hash: None,
184            index_epoch: None,
185        };
186
187        if is_below_threshold {
188            // Track the closest below-threshold result (first one since sorted by highest first)
189            if closest_below_threshold.is_none() {
190                closest_below_threshold = Some(search_result);
191            }
192        } else {
193            // Add to main results if above threshold
194            results.push(search_result);
195        }
196    }
197
198    // Apply reranking if enabled
199    if options.rerank && !results.is_empty() {
200        if let Some(ref callback) = progress_callback {
201            callback("Reranking results for improved relevance...");
202        }
203
204        let rerank_registry = ck_models::RerankModelRegistry::default();
205        let (rerank_alias, rerank_config) = rerank_registry
206            .resolve(options.rerank_model.as_deref())
207            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
208
209        match ck_embed::create_reranker_for_config(&rerank_config, None) {
210            Ok(mut reranker) => {
211                if let Some(ref callback) = progress_callback {
212                    callback(&format!("Reranking results with model {rerank_alias}"));
213                }
214
215                let documents: Vec<String> = results.iter().map(|r| r.preview.clone()).collect();
216
217                match reranker.rerank(&options.query, &documents) {
218                    Ok(rerank_results) => {
219                        // Create a map from document text to indices for handling duplicates
220                        let mut doc_to_indices: std::collections::HashMap<String, Vec<usize>> =
221                            std::collections::HashMap::new();
222                        for (i, result) in results.iter().enumerate() {
223                            doc_to_indices
224                                .entry(result.preview.clone())
225                                .or_default()
226                                .push(i);
227                        }
228
229                        // Update results with reranked scores
230                        // The reranker returns results in reranked order, so we match by document text
231                        for rerank_result in rerank_results.iter() {
232                            if let Some(indices) = doc_to_indices.get_mut(&rerank_result.document)
233                                && let Some(idx) = indices.pop()
234                            {
235                                results[idx].score = rerank_result.score;
236                            }
237                        }
238
239                        // Re-sort by reranked scores
240                        results.sort_by(|a, b| {
241                            b.score
242                                .partial_cmp(&a.score)
243                                .unwrap_or(std::cmp::Ordering::Equal)
244                        });
245
246                        // Apply top_k limit again after reranking
247                        if let Some(limit) = options.top_k {
248                            results.truncate(limit);
249                        }
250                    }
251                    Err(e) => {
252                        tracing::warn!("Reranking failed, using original scores: {}", e);
253                    }
254                }
255            }
256            Err(e) => {
257                tracing::warn!("Failed to create reranker, using original scores: {}", e);
258            }
259        }
260    }
261
262    Ok(ck_core::SearchResults {
263        matches: results,
264        closest_below_threshold,
265    })
266}
267
/// Restricts a semantic query to a single file, a directory tree, or
/// nothing at all (the whole index).
///
/// The requested path is canonicalized once in [`PathScope::new`] and stored,
/// so membership checks only have to canonicalize the candidate file.
enum PathScope {
    /// No restriction: every file is in scope.
    All,
    /// Only the file whose (canonical) path equals the stored one.
    File(std::path::PathBuf),
    /// Any file whose (canonical) path starts with the stored directory.
    Dir(std::path::PathBuf),
}

impl PathScope {
    /// Builds the scope for `path`.
    ///
    /// `.` means "everything". Otherwise the path is canonicalized (kept
    /// as given if canonicalization fails) and becomes a `File` scope when
    /// it currently is a regular file, or a `Dir` scope in every other case.
    fn new(path: &Path) -> Self {
        if path == Path::new(".") {
            return Self::All;
        }

        let resolved = match path.canonicalize() {
            Ok(absolute) => absolute,
            Err(_) => path.to_path_buf(),
        };

        if path.is_file() {
            Self::File(resolved)
        } else {
            Self::Dir(resolved)
        }
    }

    /// Reports whether `file` falls inside this scope.
    ///
    /// The candidate is canonicalized before comparison; if that fails
    /// (for example because the file does not exist) it is compared as given.
    fn contains(&self, file: &Path) -> bool {
        let (target, exact_match) = match self {
            Self::All => return true,
            Self::File(target) => (target, true),
            Self::Dir(target) => (target, false),
        };

        let resolved = file.canonicalize().unwrap_or_else(|_| file.to_path_buf());
        if exact_match {
            resolved == *target
        } else {
            resolved.starts_with(target)
        }
    }
}
305
/// Maps a sidecar file inside the index directory back to the source file
/// it describes.
///
/// The sidecar path is made relative to `index_dir`, its trailing extension
/// (the sidecar's `.ck`) is removed exactly once, and the result is joined
/// onto `repo_root`. For example `<index_dir>/src/main.rs.ck` becomes
/// `<repo_root>/src/main.rs`.
///
/// Only one extension is stripped: a source file that is itself named
/// `notes.ck` has the sidecar `notes.ck.ck` and must map back to `notes.ck`,
/// not `notes`.
///
/// Returns `None` when `sidecar_path` does not lie under `index_dir`.
fn reconstruct_original_path(
    sidecar_path: &Path,
    index_dir: &Path,
    repo_root: &Path,
) -> Option<std::path::PathBuf> {
    let relative_path = sidecar_path.strip_prefix(index_dir).ok()?;
    // `with_extension("")` drops only the final extension, which is the
    // sidecar marker; whatever remains is the original relative path.
    Some(repo_root.join(relative_path.with_extension("")))
}
327
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero magnitude; otherwise the dot product divided by the product of the
/// two Euclidean norms.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Euclidean norm of a vector.
    let magnitude = |v: &[f32]| v.iter().map(|c| c * c).sum::<f32>().sqrt();
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));

    // Guard against dividing by zero for all-zero (or empty) vectors.
    if mag_a == 0.0 || mag_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (mag_a * mag_b)
}
343
/// Unit tests for `PathScope` membership checks.
#[cfg(test)]
mod path_scope_tests {
    use super::PathScope;
    use std::fs;
    use std::path::Path;
    use tempfile::TempDir;

    /// A scope built from `.` accepts any path, absolute or relative.
    #[test]
    fn all_matches_anything() {
        let everything = PathScope::new(Path::new("."));
        for candidate in ["/tmp/whatever", "./relative"] {
            assert!(everything.contains(Path::new(candidate)));
        }
    }

    /// A directory scope accepts files beneath it and rejects siblings.
    #[test]
    fn dir_matches_descendants_only() {
        let sandbox = TempDir::new().unwrap();
        let in_scope_dir = sandbox.path().join("inside");
        let sibling_dir = sandbox.path().join("outside");
        for dir in [&in_scope_dir, &sibling_dir] {
            fs::create_dir(dir).unwrap();
        }

        let member = in_scope_dir.join("a.txt");
        let stranger = sibling_dir.join("b.txt");
        fs::write(&member, "x").unwrap();
        fs::write(&stranger, "y").unwrap();

        let dir_scope = PathScope::new(&in_scope_dir);
        assert!(dir_scope.contains(&member));
        assert!(!dir_scope.contains(&stranger));
    }

    /// A file scope accepts only the file it was built from.
    #[test]
    fn file_matches_exactly_that_file() {
        let sandbox = TempDir::new().unwrap();
        let wanted = sandbox.path().join("target.txt");
        let unrelated = sandbox.path().join("other.txt");
        fs::write(&wanted, "x").unwrap();
        fs::write(&unrelated, "y").unwrap();

        let file_scope = PathScope::new(&wanted);
        assert!(file_scope.contains(&wanted));
        assert!(!file_scope.contains(&unrelated));
    }
}
387}