ck_engine/
semantic_v3.rs

1use anyhow::Result;
2use ck_core::{CkError, SearchOptions, SearchResult};
3use std::path::Path;
4use walkdir::WalkDir;
5
6use super::{SearchProgressCallback, extract_content_from_span, find_nearest_index_root, detect_language};
7
8/// New semantic search implementation using span-based storage
9pub async fn semantic_search_v3(options: &SearchOptions) -> Result<Vec<SearchResult>> {
10    semantic_search_v3_with_progress(options, None).await
11}
12
13pub async fn semantic_search_v3_with_progress(options: &SearchOptions, progress_callback: Option<SearchProgressCallback>) -> Result<Vec<SearchResult>> {
14    // Find the index root
15    let index_root = find_nearest_index_root(&options.path).unwrap_or_else(|| {
16        if options.path.is_file() {
17            options.path.parent().unwrap_or(&options.path).to_path_buf()
18        } else {
19            options.path.clone()
20        }
21    });
22    
23    let index_dir = index_root.join(".ck");
24    if !index_dir.exists() {
25        return Err(CkError::Index("No index found. Run 'ck --index' first with embeddings.".to_string()).into());
26    }
27
28    if let Some(ref callback) = progress_callback {
29        callback("Loading embeddings from sidecar files...");
30    }
31
32    // Collect all sidecar files and their embeddings
33    let mut file_chunks: Vec<(std::path::PathBuf, ck_index::ChunkEntry)> = Vec::new();
34    
35    for entry in WalkDir::new(&index_dir) {
36        let entry = entry?;
37        if entry.file_type().is_file() {
38            let path = entry.path();
39            if path.extension().and_then(|s| s.to_str()) == Some("ck") {
40                // Load the sidecar file
41                if let Ok(index_entry) = ck_index::load_index_entry(path) {
42                    let original_file = reconstruct_original_path(path, &index_dir, &index_root);
43                    if let Some(original_file) = original_file {
44                        for chunk in index_entry.chunks {
45                            if chunk.embedding.is_some() {
46                                file_chunks.push((original_file.clone(), chunk));
47                            }
48                        }
49                    }
50                }
51            }
52        }
53    }
54
55    if file_chunks.is_empty() {
56        return Err(CkError::Index("No embeddings found. Run 'ck --index' first with embeddings.".to_string()).into());
57    }
58
59    if let Some(ref callback) = progress_callback {
60        callback(&format!("Found {} chunks with embeddings", file_chunks.len()));
61    }
62
63    // Create embedder and embed the query
64    if let Some(ref callback) = progress_callback {
65        callback("Loading embedding model...");
66    }
67    
68    let mut embedder = ck_embed::create_embedder(None)?;
69    let query_embeddings = embedder.embed(&[options.query.clone()])?;
70    
71    if query_embeddings.is_empty() {
72        return Ok(Vec::new());
73    }
74    
75    let query_embedding = &query_embeddings[0];
76
77    if let Some(ref callback) = progress_callback {
78        callback("Computing similarity scores...");
79    }
80
81    // Compute similarities
82    let mut similarities: Vec<(f32, &std::path::PathBuf, &ck_index::ChunkEntry)> = Vec::new();
83    
84    for (file_path, chunk) in &file_chunks {
85        if let Some(ref embedding) = chunk.embedding {
86            let similarity = cosine_similarity(query_embedding, embedding);
87            similarities.push((similarity, file_path, chunk));
88        }
89    }
90
91    // Sort by similarity (highest first)
92    similarities.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
93
94    // Apply threshold and top_k filtering
95    let mut results = Vec::new();
96    let limit = options.top_k.unwrap_or(similarities.len());
97    
98    for (similarity, file_path, chunk) in similarities.into_iter().take(limit) {
99        // Apply threshold filtering
100        if let Some(threshold) = options.threshold {
101            if similarity < threshold {
102                continue;
103            }
104        }
105
106        // Check if we're filtering by a specific file or directory
107        if options.path.is_file() {
108            let target_file = options.path.canonicalize().unwrap_or_else(|_| options.path.clone());
109            let result_file = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
110            if result_file != target_file {
111                continue;
112            }
113        } else if options.path != Path::new(".") {
114            // Filter by directory path - only include files within the specified directory
115            let target_dir = options.path.canonicalize().unwrap_or_else(|_| options.path.clone());
116            let result_file = file_path.canonicalize().unwrap_or_else(|_| file_path.clone());
117            if !result_file.starts_with(&target_dir) {
118                continue;
119            }
120        }
121
122        // Extract content from the file using the span
123        let content = if options.full_section {
124            extract_content_from_span(file_path, &chunk.span)?
125        } else {
126            let full_content = extract_content_from_span(file_path, &chunk.span)?;
127            // Take first 3 lines for preview
128            full_content.lines().take(3).collect::<Vec<_>>().join("\n")
129        };
130
131        results.push(SearchResult {
132            file: file_path.clone(),
133            span: chunk.span.clone(),
134            score: similarity,
135            preview: content,
136            lang: detect_language(file_path),
137            symbol: None,
138        });
139    }
140
141    Ok(results)
142}
143
/// Maps a sidecar file inside the index directory back to the source file it describes.
///
/// The sidecar's location relative to `index_dir` is re-rooted under `repo_root`, and its
/// final extension is dropped. Callers only pass paths whose extension is `ck`, so this
/// removes exactly the sidecar suffix: `<index_dir>/src/main.rs.ck` becomes
/// `<repo_root>/src/main.rs`.
///
/// Only one extension is removed. A sidecar named `notes.ck.ck` therefore maps to
/// `notes.ck`; stripping a second `.ck` would point at a file that was never indexed.
/// NOTE(review): this assumes the indexer names sidecars by appending `.ck` to the source
/// file name — confirm against the indexing code.
///
/// Returns `None` when `sidecar_path` is not located under `index_dir`.
fn reconstruct_original_path(sidecar_path: &Path, index_dir: &Path, repo_root: &Path) -> Option<std::path::PathBuf> {
    let relative_sidecar = sidecar_path.strip_prefix(index_dir).ok()?;
    // `with_extension("")` drops only the last extension, i.e. the sidecar's `.ck`.
    Some(repo_root.join(relative_sidecar.with_extension("")))
}
161
/// Returns the cosine of the angle between vectors `a` and `b`.
///
/// The result is `0.0` when the slices differ in length or when either vector has zero
/// magnitude (which includes empty slices), so callers never divide by zero.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Euclidean length of a vector.
    let magnitude = |v: &[f32]| -> f32 { v.iter().map(|c| c * c).sum::<f32>().sqrt() };
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));

    if mag_a == 0.0 || mag_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (mag_a * mag_b)
}
176}