ck_engine/
semantic_v3.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchOptions, SearchResult};
3use std::path::Path;
4use walkdir::WalkDir;
5
6use super::{SearchProgressCallback, extract_content_from_span, find_nearest_index_root};
7
8/// New semantic search implementation using span-based storage
9pub async fn semantic_search_v3(options: &SearchOptions) -> Result<Vec<SearchResult>> {
10    semantic_search_v3_with_progress(options, None).await
11}
12
13pub async fn semantic_search_v3_with_progress(
14    options: &SearchOptions,
15    progress_callback: Option<SearchProgressCallback>,
16) -> Result<Vec<SearchResult>> {
17    // Find the index root
18    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
19        if options.path.is_file() {
20            options.path.parent().unwrap_or(&options.path).to_path_buf()
21        } else {
22            options.path.clone()
23        }
24    });
25
26    let index_dir = index_root.join(".ck");
27    if !index_dir.exists() {
28        return Err(CkError::Index(
29            "No index found. Run 'ck --index' first with embeddings.".to_string(),
30        )
31        .into());
32    }
33
34    if let Some(ref callback) = progress_callback {
35        callback("Loading embeddings from sidecar files...");
36    }
37
38    // Collect all sidecar files and their embeddings
39    let mut file_chunks: Vec<(std::path::PathBuf, ck_index::ChunkEntry)> = Vec::new();
40
41    for entry in WalkDir::new(&index_dir) {
42        let entry = entry?;
43        if entry.file_type().is_file() {
44            let path = entry.path();
45            if path.extension().and_then(|s| s.to_str()) == Some("ck") {
46                // Load the sidecar file
47                if let Ok(index_entry) = ck_index::load_index_entry(path) {
48                    let original_file = reconstruct_original_path(path, &index_dir, &index_root);
49                    if let Some(original_file) = original_file {
50                        for chunk in index_entry.chunks {
51                            if chunk.embedding.is_some() {
52                                file_chunks.push((original_file.clone(), chunk));
53                            }
54                        }
55                    }
56                }
57            }
58        }
59    }
60
61    if file_chunks.is_empty() {
62        return Err(CkError::Index(
63            "No embeddings found. Run 'ck --index' first with embeddings.".to_string(),
64        )
65        .into());
66    }
67
68    if let Some(ref callback) = progress_callback {
69        callback(&format!(
70            "Found {} chunks with embeddings",
71            file_chunks.len()
72        ));
73    }
74
75    // Create embedder and embed the query
76    if let Some(ref callback) = progress_callback {
77        callback("Loading embedding model...");
78    }
79
80    let mut embedder = ck_embed::create_embedder(None)?;
81    let query_embeddings = embedder.embed(std::slice::from_ref(&options.query))?;
82
83    if query_embeddings.is_empty() {
84        return Ok(Vec::new());
85    }
86
87    let query_embedding = &query_embeddings[0];
88
89    if let Some(ref callback) = progress_callback {
90        callback("Computing similarity scores...");
91    }
92
93    // Compute similarities
94    let mut similarities: Vec<(f32, &std::path::PathBuf, &ck_index::ChunkEntry)> = Vec::new();
95
96    for (file_path, chunk) in &file_chunks {
97        if let Some(ref embedding) = chunk.embedding {
98            let similarity = cosine_similarity(query_embedding, embedding);
99            similarities.push((similarity, file_path, chunk));
100        }
101    }
102
103    // Sort by similarity (highest first)
104    similarities.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
105
106    // Apply threshold and top_k filtering
107    let mut results = Vec::new();
108    let limit = options.top_k.unwrap_or(similarities.len());
109
110    for (similarity, file_path, chunk) in similarities.into_iter().take(limit) {
111        // Apply threshold filtering
112        if let Some(threshold) = options.threshold
113            && similarity < threshold
114        {
115            continue;
116        }
117
118        // Check if we're filtering by a specific file or directory
119        if options.path.is_file() {
120            let target_file = options
121                .path
122                .canonicalize()
123                .unwrap_or_else(|_| options.path.clone());
124            let result_file = file_path
125                .canonicalize()
126                .unwrap_or_else(|_| file_path.clone());
127            if result_file != target_file {
128                continue;
129            }
130        } else if options.path != Path::new(".") {
131            // Filter by directory path - only include files within the specified directory
132            let target_dir = options
133                .path
134                .canonicalize()
135                .unwrap_or_else(|_| options.path.clone());
136            let result_file = file_path
137                .canonicalize()
138                .unwrap_or_else(|_| file_path.clone());
139            if !result_file.starts_with(&target_dir) {
140                continue;
141            }
142        }
143
144        // Extract content from the file using the span
145        let content = if options.full_section {
146            extract_content_from_span(file_path, &chunk.span).await?
147        } else {
148            let full_content = extract_content_from_span(file_path, &chunk.span).await?;
149            // Take first 3 lines for preview
150            full_content.lines().take(3).collect::<Vec<_>>().join("\n")
151        };
152
153        results.push(SearchResult {
154            file: file_path.clone(),
155            span: chunk.span.clone(),
156            score: similarity,
157            preview: content,
158            lang: ck_core::Language::from_path(file_path),
159            symbol: None,
160            chunk_hash: None,
161            index_epoch: None,
162        });
163    }
164
165    Ok(results)
166}
167
/// Maps a sidecar file inside the index directory back to the source file it describes.
///
/// The sidecar's path relative to `index_dir` has its final extension dropped; if the
/// resulting file name still ends in `.ck`, that suffix is removed as well. The remaining
/// relative path is then joined onto `repo_root`.
///
/// Returns `None` when `sidecar_path` does not live under `index_dir`.
fn reconstruct_original_path(
    sidecar_path: &Path,
    index_dir: &Path,
    repo_root: &Path,
) -> Option<std::path::PathBuf> {
    let relative = match sidecar_path.strip_prefix(index_dir) {
        Ok(rel) => rel,
        Err(_) => return None,
    };

    // Dropping the last extension turns e.g. `src/main.rs.ck` into `src/main.rs`.
    let mut rebuilt = relative.with_extension("");

    // A file name that still carries a trailing `.ck` gets that suffix trimmed too.
    let trimmed_name = rebuilt.file_name().and_then(|name| {
        name.to_string_lossy()
            .strip_suffix(".ck")
            .map(str::to_owned)
    });
    if let Some(trimmed_name) = trimmed_name {
        rebuilt.set_file_name(trimmed_name);
    }

    Some(repo_root.join(rebuilt))
}
189
/// Cosine similarity between two vectors.
///
/// Returns `0.0` when the slices differ in length or when either vector has zero
/// magnitude (which includes empty input); otherwise returns the dot product divided
/// by the product of the two Euclidean norms.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Vectors of different dimensionality cannot be compared.
    if a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));

    // Guard against dividing by zero for all-zero (or empty) vectors.
    if mag_a == 0.0 || mag_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (mag_a * mag_b)
}