1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded for an index that ends up with no embedded entries.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): the size limits, byte sizes and header lengths below are not
// referenced in this part of the file; presumably they bound and describe the
// persisted index format — confirm against the (de)serialization code.
const MAX_ENTRIES: usize = 1_000_000;
const MAX_DIMENSION: usize = 1024;
/// Size in bytes of one `f32` vector component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Prefix of every "runtime missing" error; `is_onnx_runtime_unavailable`
/// recognises messages that start with its first sentence.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// NOTE(review): on-disk format version tags; what changed between versions is
// not visible in this part of the file.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the config leaves `timeout_ms` at 0
/// (applied to every backend, despite the name).
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the config leaves `max_batch_size` at 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which the oldest one is evicted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint's `base_url` when the config has no
/// usable base URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of HTTP attempts per embedding request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds slept before retry `i + 1`, indexed by the failed
/// attempt's index.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held while acquiring the semantic cache lock file.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
58
59pub struct SemanticIndexLock {
60 _guard: fs_lock::LockGuard,
61}
62
63impl SemanticIndexLock {
64 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
65 let dir = storage_dir.join("semantic").join(project_key);
66 fs::create_dir_all(&dir)?;
67 let path = dir.join("cache.lock");
68 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
69 .lock()
70 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
71 fs_lock::try_acquire(&path, Duration::from_secs(2))
72 .map(|guard| Self { _guard: guard })
73 .map_err(|error| match error {
74 fs_lock::AcquireError::Timeout => {
75 std::io::Error::other("timed out acquiring semantic cache lock")
76 }
77 fs_lock::AcquireError::Io(error) => error,
78 })
79 }
80}
81
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared as a string by
/// `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name from the config.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when the
    /// config has none; defaults to an empty string when absent on decode.
    #[serde(default)]
    pub base_url: String,
    /// Length of the vectors produced by the backend.
    pub dimension: usize,
    /// Chunking scheme version; filled from `default_chunking_version` when
    /// absent on decode.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
92
/// Chunking version written into new fingerprints and assumed for serialized
/// fingerprints that omit the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
96
97impl SemanticIndexFingerprint {
98 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
99 let base_url = config
102 .base_url
103 .as_ref()
104 .and_then(|u| normalize_base_url(u).ok())
105 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
106 Self {
107 backend: config.backend.as_str().to_string(),
108 model: config.model.clone(),
109 base_url,
110 dimension,
111 chunking_version: default_chunking_version(),
112 }
113 }
114
115 pub fn as_string(&self) -> String {
116 serde_json::to_string(self).unwrap_or_else(|_| String::new())
117 }
118
119 fn matches_expected(&self, expected: &str) -> bool {
120 let encoded = self.as_string();
121 !encoded.is_empty() && encoded == expected
122 }
123}
124
/// Backend-specific state needed to turn texts into vectors.
enum SemanticEmbeddingEngine {
    /// Local fastembed model.
    Fastembed(TextEmbedding),
    /// Remote server reached via `<base_url>/v1/embeddings`, with an optional
    /// bearer token.
    OpenAiCompatible {
        client: Client,
        model: String,
        base_url: String,
        api_key: Option<String>,
    },
    /// Ollama server reached via `<base_url>/api/embed`.
    Ollama {
        client: Client,
        model: String,
        base_url: String,
    },
}
139
/// Embedding model facade over the configured backend, with a small cache of
/// query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout after defaulting.
    timeout_ms: u64,
    /// Effective batch size after defaulting.
    max_batch_size: usize,
    /// Vector length once known; `None` until the backend has been probed or
    /// has answered a remote request.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of the cached queries, oldest at the front.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
153
/// Alternate public name for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
155
/// Checks that a backend answered with one vector per input and that every
/// vector in the batch has the same length as the first.
///
/// `context` is the prefix used in the error messages.
///
/// # Errors
/// Returns a description of the first violated condition: no vectors at all,
/// a count mismatch, or a vector whose length differs from vector 0.
fn validate_embedding_batch(
    vectors: &[Vec<f32>],
    expected_count: usize,
    context: &str,
) -> Result<(), String> {
    let actual_count = vectors.len();

    if actual_count == 0 && expected_count > 0 {
        return Err(format!(
            "{context} returned no vectors for {expected_count} inputs"
        ));
    }
    if actual_count != expected_count {
        return Err(format!(
            "{context} returned {} vectors for {} inputs",
            actual_count, expected_count
        ));
    }

    let expected_dimension = match vectors.first() {
        Some(first_vector) => first_vector.len(),
        None => return Ok(()),
    };

    match vectors
        .iter()
        .position(|vector| vector.len() != expected_dimension)
    {
        Some(index) => Err(format!(
            "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
            vectors[index].len()
        )),
        None => Ok(()),
    }
}
190
191fn normalize_base_url(raw: &str) -> Result<String, String> {
195 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
196 let scheme = parsed.scheme();
197 if scheme != "http" && scheme != "https" {
198 return Err(format!(
199 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
200 scheme
201 ));
202 }
203 Ok(parsed.to_string().trim_end_matches('/').to_string())
204}
205
206pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
221 use std::net::{IpAddr, ToSocketAddrs};
222
223 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
224
225 let host = parsed.host_str().unwrap_or("");
226
227 let is_loopback_host =
232 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
233 if is_loopback_host {
234 return Ok(());
235 }
236
237 if host.ends_with(".local") {
240 return Err(format!(
241 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
242 ));
243 }
244
245 let port = parsed.port_or_known_default().unwrap_or(443);
248 let addr_str = format!("{host}:{port}");
249 let addrs: Vec<IpAddr> = addr_str
250 .to_socket_addrs()
251 .map(|iter| iter.map(|sa| sa.ip()).collect())
252 .unwrap_or_default();
253 for ip in &addrs {
254 if is_private_non_loopback_ip(ip) {
255 return Err(format!(
256 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
257 ));
258 }
259 }
260
261 Ok(())
262}
263
/// Reports whether `ip` lies in a private or reserved range other than
/// loopback.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `100.64/10` and
/// `0/8`. IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses
/// (`::ffff:a.b.c.d`) whose embedded IPv4 address is itself in one of the
/// IPv4 ranges above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => match v4.octets() {
            [10, ..] | [192, 168, ..] | [169, 254, ..] | [0, ..] => true,
            [172, second, ..] => (16..=31).contains(&second),
            [100, second, ..] => (64..=127).contains(&second),
            _ => false,
        },
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            if leading & 0xffc0 == 0xfe80 || leading & 0xfe00 == 0xfc00 {
                return true;
            }
            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
305
306fn build_openai_embeddings_endpoint(base_url: &str) -> String {
307 if base_url.ends_with("/v1") {
308 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
309 } else {
310 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
311 }
312}
313
314fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
315 if base_url.ends_with("/api") {
316 format!("{base_url}/embed")
317 } else {
318 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
319 }
320}
321
/// Trims surrounding whitespace from an optional value and maps blank values
/// to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value?.trim().to_string();
    (!trimmed.is_empty()).then_some(trimmed)
}
332
333fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
334 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
335}
336
/// A transport error is retried only when reqwest classifies it as a
/// connection failure; every other kind of error is returned to the caller
/// immediately.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
340
341fn sleep_before_embedding_retry(attempt_index: usize) {
342 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
343 std::thread::sleep(Duration::from_millis(*delay_ms));
344 }
345}
346
347fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
348where
349 F: FnMut() -> reqwest::blocking::RequestBuilder,
350{
351 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
352 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
353
354 let response = match make_request().send() {
355 Ok(response) => response,
356 Err(error) => {
357 if !last_attempt && is_retryable_embedding_error(&error) {
358 sleep_before_embedding_retry(attempt_index);
359 continue;
360 }
361 return Err(format!("{backend_label} request failed: {error}"));
362 }
363 };
364
365 let status = response.status();
366 let raw = match response.text() {
367 Ok(raw) => raw,
368 Err(error) => {
369 if !last_attempt && is_retryable_embedding_error(&error) {
370 sleep_before_embedding_retry(attempt_index);
371 continue;
372 }
373 return Err(format!("{backend_label} response read failed: {error}"));
374 }
375 };
376
377 if status.is_success() {
378 return Ok(raw);
379 }
380
381 if !last_attempt && is_retryable_embedding_status(status) {
382 sleep_before_embedding_retry(attempt_index);
383 continue;
384 }
385
386 return Err(format!(
387 "{backend_label} request failed (HTTP {}): {}",
388 status, raw
389 ));
390 }
391
392 unreachable!("embedding request retries exhausted without returning")
393}
394
395impl SemanticEmbeddingModel {
396 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
397 let timeout_ms = if config.timeout_ms == 0 {
398 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
399 } else {
400 config.timeout_ms
401 };
402
403 let max_batch_size = if config.max_batch_size == 0 {
404 DEFAULT_MAX_BATCH_SIZE
405 } else {
406 config.max_batch_size
407 };
408
409 let api_key_env = normalize_api_key(config.api_key_env.clone());
410 let model = config.model.clone();
411
412 let client = Client::builder()
413 .timeout(Duration::from_millis(timeout_ms))
414 .redirect(reqwest::redirect::Policy::none())
415 .build()
416 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
417
418 let engine = match config.backend {
419 SemanticBackend::Fastembed => {
420 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
421 }
422 SemanticBackend::OpenAiCompatible => {
423 let raw = config.base_url.as_ref().ok_or_else(|| {
424 "base_url is required for openai_compatible backend".to_string()
425 })?;
426 let base_url = normalize_base_url(raw)?;
427
428 let api_key = match api_key_env {
429 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
430 format!("missing api_key_env '{var_name}' for openai_compatible backend")
431 })?),
432 None => None,
433 };
434
435 SemanticEmbeddingEngine::OpenAiCompatible {
436 client,
437 model,
438 base_url,
439 api_key,
440 }
441 }
442 SemanticBackend::Ollama => {
443 let raw = config
444 .base_url
445 .as_ref()
446 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
447 let base_url = normalize_base_url(raw)?;
448
449 SemanticEmbeddingEngine::Ollama {
450 client,
451 model,
452 base_url,
453 }
454 }
455 };
456
457 Ok(Self {
458 backend: config.backend,
459 model: config.model.clone(),
460 base_url: config.base_url.clone(),
461 timeout_ms,
462 max_batch_size,
463 dimension: None,
464 engine,
465 query_embedding_cache: HashMap::new(),
466 query_embedding_cache_order: VecDeque::new(),
467 query_embedding_cache_hits: 0,
468 query_embedding_cache_misses: 0,
469 })
470 }
471
/// Backend this model was configured with.
pub fn backend(&self) -> SemanticBackend {
    self.backend
}
475
/// Model name from the config.
pub fn model(&self) -> &str {
    &self.model
}
479
/// Base URL exactly as written in the config (not normalized), if any.
pub fn base_url(&self) -> Option<&str> {
    self.base_url.as_deref()
}
483
/// Effective batch size (the module default when the config value was 0).
pub fn max_batch_size(&self) -> usize {
    self.max_batch_size
}
487
/// Effective HTTP timeout in milliseconds (the module default when the
/// config value was 0).
pub fn timeout_ms(&self) -> u64 {
    self.timeout_ms
}
491
492 pub fn fingerprint(
493 &mut self,
494 config: &SemanticBackendConfig,
495 ) -> Result<SemanticIndexFingerprint, String> {
496 let dimension = self.dimension()?;
497 Ok(SemanticIndexFingerprint::from_config(config, dimension))
498 }
499
500 pub fn dimension(&mut self) -> Result<usize, String> {
501 if let Some(dimension) = self.dimension {
502 return Ok(dimension);
503 }
504
505 let dimension = match &mut self.engine {
506 SemanticEmbeddingEngine::Fastembed(model) => {
507 let vectors = model
508 .embed(vec!["semantic index fingerprint probe".to_string()], None)
509 .map_err(|error| format_embedding_init_error(error.to_string()))?;
510 vectors
511 .first()
512 .map(|v| v.len())
513 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
514 }
515 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
516 let vectors =
517 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
518 vectors
519 .first()
520 .map(|v| v.len())
521 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
522 }
523 SemanticEmbeddingEngine::Ollama { .. } => {
524 let vectors =
525 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
526 vectors
527 .first()
528 .map(|v| v.len())
529 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530 }
531 };
532
533 self.dimension = Some(dimension);
534 Ok(dimension)
535 }
536
/// Embeds `texts` with the configured backend, one vector per input text.
/// Public entry point that forwards to `embed_texts`.
pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    self.embed_texts(texts)
}
540
541 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
542 if let Some(vector) = self.query_embedding_cache.get(query) {
543 self.query_embedding_cache_hits += 1;
544 return Ok(vector.clone());
545 }
546
547 self.query_embedding_cache_misses += 1;
548 let embeddings = self.embed_texts(vec![query.to_string()])?;
549 let vector = embeddings
550 .first()
551 .cloned()
552 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
553
554 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
555 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
556 self.query_embedding_cache.remove(&oldest);
557 }
558 }
559 self.query_embedding_cache
560 .insert(query.to_string(), vector.clone());
561 self.query_embedding_cache_order
562 .push_back(query.to_string());
563
564 Ok(vector)
565 }
566
/// Query-cache statistics as `(hits, misses, entries currently cached)`.
pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
    (
        self.query_embedding_cache_hits,
        self.query_embedding_cache_misses,
        self.query_embedding_cache.len(),
    )
}
574
575 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
576 match &mut self.engine {
577 SemanticEmbeddingEngine::Fastembed(model) => model
578 .embed(texts, None::<usize>)
579 .map_err(|error| format_embedding_init_error(error.to_string()))
580 .map_err(|error| format!("failed to embed batch: {error}")),
581 SemanticEmbeddingEngine::OpenAiCompatible {
582 client,
583 model,
584 base_url,
585 api_key,
586 } => {
587 let expected_text_count = texts.len();
588 let endpoint = build_openai_embeddings_endpoint(base_url);
589 let body = serde_json::json!({
590 "input": texts,
591 "model": model,
592 });
593
594 let raw = send_embedding_request(
595 || {
596 let mut request = client.post(&endpoint).json(&body);
606
607 if let Some(api_key) = api_key {
608 request = request.header("Authorization", format!("Bearer {api_key}"));
609 }
610
611 request
612 },
613 "openai compatible",
614 )?;
615
616 #[derive(Deserialize)]
617 struct OpenAiResponse {
618 data: Vec<OpenAiEmbeddingResult>,
619 }
620
621 #[derive(Deserialize)]
622 struct OpenAiEmbeddingResult {
623 embedding: Vec<f32>,
624 index: Option<u32>,
625 }
626
627 let parsed: OpenAiResponse = serde_json::from_str(&raw)
628 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
629 if parsed.data.len() != expected_text_count {
630 return Err(format!(
631 "openai compatible response returned {} embeddings for {} inputs",
632 parsed.data.len(),
633 expected_text_count
634 ));
635 }
636
637 let mut vectors = vec![Vec::new(); parsed.data.len()];
638 for (i, item) in parsed.data.into_iter().enumerate() {
639 let index = item.index.unwrap_or(i as u32) as usize;
640 if index >= vectors.len() {
641 return Err(
642 "openai compatible response contains invalid vector index".to_string()
643 );
644 }
645 vectors[index] = item.embedding;
646 }
647
648 for vector in &vectors {
649 if vector.is_empty() {
650 return Err(
651 "openai compatible response contained missing vectors".to_string()
652 );
653 }
654 }
655
656 self.dimension = vectors.first().map(Vec::len);
657 Ok(vectors)
658 }
659 SemanticEmbeddingEngine::Ollama {
660 client,
661 model,
662 base_url,
663 } => {
664 let expected_text_count = texts.len();
665 let endpoint = build_ollama_embeddings_endpoint(base_url);
666
667 #[derive(Serialize)]
668 struct OllamaPayload<'a> {
669 model: &'a str,
670 input: Vec<String>,
671 }
672
673 let payload = OllamaPayload {
674 model,
675 input: texts,
676 };
677
678 let raw = send_embedding_request(
679 || {
680 client.post(&endpoint).json(&payload)
685 },
686 "ollama",
687 )?;
688
689 #[derive(Deserialize)]
690 struct OllamaResponse {
691 embeddings: Vec<Vec<f32>>,
692 }
693
694 let parsed: OllamaResponse = serde_json::from_str(&raw)
695 .map_err(|error| format!("invalid ollama response: {error}"))?;
696 if parsed.embeddings.is_empty() {
697 return Err("ollama response returned no embeddings".to_string());
698 }
699 if parsed.embeddings.len() != expected_text_count {
700 return Err(format!(
701 "ollama response returned {} embeddings for {} inputs",
702 parsed.embeddings.len(),
703 expected_text_count
704 ));
705 }
706
707 let vectors = parsed.embeddings;
708 for vector in &vectors {
709 if vector.is_empty() {
710 return Err("ollama response contained empty embeddings".to_string());
711 }
712 }
713
714 self.dimension = vectors.first().map(Vec::len);
715 Ok(vectors)
716 }
717 }
718 }
719}
720
721pub fn pre_validate_onnx_runtime() -> Result<(), String> {
725 let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
726
727 #[cfg(any(target_os = "linux", target_os = "macos"))]
728 {
729 #[cfg(target_os = "linux")]
730 let default_name = "libonnxruntime.so";
731 #[cfg(target_os = "macos")]
732 let default_name = "libonnxruntime.dylib";
733
734 let lib_name = dylib_path.as_deref().unwrap_or(default_name);
735
736 unsafe {
737 let c_name = std::ffi::CString::new(lib_name)
738 .map_err(|e| format!("invalid library path: {}", e))?;
739 let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
740 if handle.is_null() {
741 let err = libc::dlerror();
742 let msg = if err.is_null() {
743 "unknown dlopen error".to_string()
744 } else {
745 std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
746 };
747 return Err(format!(
748 "ONNX Runtime not found. dlopen('{}') failed: {}. \
749 Run `npx @cortexkit/aft doctor` to diagnose.",
750 lib_name, msg
751 ));
752 }
753
754 let detected_version = detect_ort_version_from_path(lib_name);
757
758 libc::dlclose(handle);
759
760 if let Some(ref version) = detected_version {
762 let parts: Vec<&str> = version.split('.').collect();
763 if let (Some(major), Some(minor)) = (
764 parts.first().and_then(|s| s.parse::<u32>().ok()),
765 parts.get(1).and_then(|s| s.parse::<u32>().ok()),
766 ) {
767 if major != 1 || minor < 20 {
768 return Err(format_ort_version_mismatch(version, lib_name));
769 }
770 }
771 }
772 }
773 }
774
775 #[cfg(target_os = "windows")]
776 {
777 let _ = dylib_path;
779 }
780
781 Ok(())
782}
783
784#[cfg(any(test, target_os = "linux", target_os = "macos"))]
787fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
788 let path = std::path::Path::new(lib_path);
789
790 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
792 .into_iter()
793 .flatten()
794 {
795 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
796 if let Some(version) = extract_version_from_filename(name) {
797 return Some(version);
798 }
799 }
800 }
801
802 if let Some(parent) = path.parent() {
804 if let Ok(entries) = std::fs::read_dir(parent) {
805 for entry in entries.flatten() {
806 if let Some(name) = entry.file_name().to_str() {
807 if name.starts_with("libonnxruntime") {
808 if let Some(version) = extract_version_from_filename(name) {
809 return Some(version);
810 }
811 }
812 }
813 }
814 }
815 }
816
817 None
818}
819
820#[cfg(any(test, target_os = "linux", target_os = "macos"))]
822fn extract_version_from_filename(name: &str) -> Option<String> {
823 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
825 re.find(name).map(|m| m.as_str().to_string())
826}
827
/// Shell command suggested to the user for removing an outdated ONNX Runtime
/// library.
///
/// Libraries under `/usr/local/lib`, or referred to only by the bare default
/// library name, get a platform-specific command; any other path gets a
/// plain `rm` of that path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return " Delete the ONNX Runtime DLL from your PATH".to_string();
    }

    format!(" rm '{}'", lib_path)
}
843
844#[cfg(any(test, target_os = "linux", target_os = "macos"))]
850pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
851 format!(
852 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
853 Solutions:\n\
854 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
855 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
856 configures the bridge to load it instead of the system library — no \
857 changes to '{}'.\n\
858 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
859 {}\n\
860 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
861 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
862 version,
863 lib_name,
864 lib_name,
865 suggest_removal_command(lib_name),
866 )
867}
868
869pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
870 pre_validate_onnx_runtime()?;
872
873 let selected_model = match model {
874 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
875 _ => {
876 return Err(format!(
877 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
878 model
879 ))
880 }
881 };
882
883 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
884}
885
/// Heuristically decides whether an error message means the ONNX Runtime
/// shared library could not be loaded.
///
/// A message that starts (after leading whitespace) with
/// `ONNX Runtime not found.` always qualifies. Otherwise the message,
/// compared case-insensitively, must mention both the runtime and a
/// dynamic-loading failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
911
912fn format_embedding_init_error(error: impl Display) -> String {
913 let message = error.to_string();
914
915 if is_onnx_runtime_unavailable(&message) {
916 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
917 }
918
919 format!("failed to initialize semantic embedding model: {message}")
920}
921
/// One embeddable unit of source code.
// NOTE(review): field meanings below are inferred from names and from how
// `build_from_chunks` uses them; the code that fills them in is not visible
// in this part of the file.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk comes from.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the chunk (presumably 1-based — confirm).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text handed to the embedding function for this chunk.
    pub embed_text: String,
    /// Snippet kept alongside the chunk (presumably for display in results).
    pub snippet: String,
}
941
/// A chunk together with the vector the backend produced for its
/// `embed_text`.
#[derive(Debug)]
struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
948
/// In-memory semantic index: embedded chunks plus the per-file metadata used
/// to detect stale files on refresh.
#[derive(Debug)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded for each indexed file.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size in bytes recorded for each indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// BLAKE3 content hash recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Length of the stored vectors (`DEFAULT_DIMENSION` when a build
    /// produced no entries).
    dimension: usize,
    /// Fingerprint of the embedding setup; `None` on a freshly built index.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; constructors debug-assert that it is absolute.
    project_root: PathBuf,
}
963
/// Freshness metadata captured for a file at indexing time.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}
970
/// Counts reported by an incremental index refresh.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Already-indexed files that were found stale.
    pub changed: usize,
    /// Files that were not in the index before.
    pub added: usize,
    /// Indexed files that are no longer present.
    pub deleted: usize,
    /// Number of distinct files considered by the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh changed, added and deleted nothing;
    /// `total_processed` is not taken into account.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
987
/// A single search hit: the chunk's location and description plus a score.
// NOTE(review): the code that builds results is not visible in this part of
// the file; the meanings of `score` and `source` should be confirmed there.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    /// Relevance score (presumably higher is better — confirm).
    pub score: f32,
    /// Static label identifying where the hit came from.
    pub source: &'static str,
}
1001
1002impl SemanticIndex {
/// Creates an empty index for `project_root` with the given vector
/// `dimension` and no fingerprint.
///
/// `project_root` is expected to be absolute (checked in debug builds only).
pub fn new(project_root: PathBuf, dimension: usize) -> Self {
    debug_assert!(project_root.is_absolute());
    Self {
        entries: Vec::new(),
        file_mtimes: HashMap::new(),
        file_sizes: HashMap::new(),
        file_hashes: HashMap::new(),
        dimension,
        fingerprint: None,
        project_root,
    }
}
1015
/// Number of embedded chunks stored in the index.
pub fn entry_count(&self) -> usize {
    self.entries.len()
}
1020
1021 pub fn status_label(&self) -> &'static str {
1023 if self.entries.is_empty() {
1024 "empty"
1025 } else {
1026 "ready"
1027 }
1028 }
1029
1030 fn collect_chunks(
1031 project_root: &Path,
1032 files: &[PathBuf],
1033 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1034 let per_file: Vec<(
1035 PathBuf,
1036 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1037 )> = files
1038 .par_iter()
1039 .map_init(HashMap::new, |parsers, file| {
1040 let result = collect_file_metadata(file).and_then(|metadata| {
1041 collect_file_chunks(project_root, file, parsers)
1042 .map(|chunks| (metadata, chunks))
1043 });
1044 (file.clone(), result)
1045 })
1046 .collect();
1047
1048 let mut chunks: Vec<SemanticChunk> = Vec::new();
1049 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1050
1051 for (file, result) in per_file {
1052 match result {
1053 Ok((metadata, file_chunks)) => {
1054 file_metadata.insert(file, metadata);
1055 chunks.extend(file_chunks);
1056 }
1057 Err(error) => {
1058 if error == "unsupported file extension" {
1064 continue;
1065 }
1066 slog_warn!(
1067 "failed to collect semantic chunks for {}: {}",
1068 file.display(),
1069 error
1070 );
1071 }
1072 }
1073 }
1074
1075 (chunks, file_metadata)
1076 }
1077
1078 fn build_from_chunks<F, P>(
1079 project_root: &Path,
1080 chunks: Vec<SemanticChunk>,
1081 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1082 embed_fn: &mut F,
1083 max_batch_size: usize,
1084 mut progress: Option<&mut P>,
1085 ) -> Result<Self, String>
1086 where
1087 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1088 P: FnMut(usize, usize),
1089 {
1090 debug_assert!(project_root.is_absolute());
1091 let total_chunks = chunks.len();
1092
1093 if chunks.is_empty() {
1094 return Ok(Self {
1095 entries: Vec::new(),
1096 file_mtimes: file_metadata
1097 .iter()
1098 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1099 .collect(),
1100 file_sizes: file_metadata
1101 .iter()
1102 .map(|(path, metadata)| (path.clone(), metadata.size))
1103 .collect(),
1104 file_hashes: file_metadata
1105 .into_iter()
1106 .map(|(path, metadata)| (path, metadata.content_hash))
1107 .collect(),
1108 dimension: DEFAULT_DIMENSION,
1109 fingerprint: None,
1110 project_root: project_root.to_path_buf(),
1111 });
1112 }
1113
1114 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1116 let mut expected_dimension: Option<usize> = None;
1117 let batch_size = max_batch_size.max(1);
1118 for batch_start in (0..chunks.len()).step_by(batch_size) {
1119 let batch_end = (batch_start + batch_size).min(chunks.len());
1120 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1121 .iter()
1122 .map(|c| c.embed_text.clone())
1123 .collect();
1124
1125 let vectors = embed_fn(batch_texts)?;
1126 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1127
1128 if let Some(dim) = vectors.first().map(|v| v.len()) {
1130 match expected_dimension {
1131 None => expected_dimension = Some(dim),
1132 Some(expected) if dim != expected => {
1133 return Err(format!(
1134 "embedding dimension changed across batches: expected {expected}, got {dim}"
1135 ));
1136 }
1137 _ => {}
1138 }
1139 }
1140
1141 for (i, vector) in vectors.into_iter().enumerate() {
1142 let chunk_idx = batch_start + i;
1143 entries.push(EmbeddingEntry {
1144 chunk: chunks[chunk_idx].clone(),
1145 vector,
1146 });
1147 }
1148
1149 if let Some(callback) = progress.as_mut() {
1150 callback(entries.len(), total_chunks);
1151 }
1152 }
1153
1154 let dimension = entries
1155 .first()
1156 .map(|e| e.vector.len())
1157 .unwrap_or(DEFAULT_DIMENSION);
1158
1159 Ok(Self {
1160 entries,
1161 file_mtimes: file_metadata
1162 .iter()
1163 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1164 .collect(),
1165 file_sizes: file_metadata
1166 .iter()
1167 .map(|(path, metadata)| (path.clone(), metadata.size))
1168 .collect(),
1169 file_hashes: file_metadata
1170 .into_iter()
1171 .map(|(path, metadata)| (path, metadata.content_hash))
1172 .collect(),
1173 dimension,
1174 fingerprint: None,
1175 project_root: project_root.to_path_buf(),
1176 })
1177 }
1178
1179 pub fn build<F>(
1182 project_root: &Path,
1183 files: &[PathBuf],
1184 embed_fn: &mut F,
1185 max_batch_size: usize,
1186 ) -> Result<Self, String>
1187 where
1188 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1189 {
1190 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1191 Self::build_from_chunks(
1192 project_root,
1193 chunks,
1194 file_mtimes,
1195 embed_fn,
1196 max_batch_size,
1197 Option::<&mut fn(usize, usize)>::None,
1198 )
1199 }
1200
1201 pub fn build_with_progress<F, P>(
1203 project_root: &Path,
1204 files: &[PathBuf],
1205 embed_fn: &mut F,
1206 max_batch_size: usize,
1207 progress: &mut P,
1208 ) -> Result<Self, String>
1209 where
1210 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1211 P: FnMut(usize, usize),
1212 {
1213 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1214 let total_chunks = chunks.len();
1215 progress(0, total_chunks);
1216 Self::build_from_chunks(
1217 project_root,
1218 chunks,
1219 file_mtimes,
1220 embed_fn,
1221 max_batch_size,
1222 Some(progress),
1223 )
1224 }
1225
/// Incrementally brings the index in line with `current_files`.
///
/// Indexed files are classified as deleted (no longer in `current_files`),
/// changed (freshness check says stale/deleted, or cached metadata is
/// incomplete) or untouched; files in `current_files` that are not indexed yet
/// are treated as added. Entries of deleted files are dropped, and changed plus
/// added files are re-chunked and re-embedded in batches of at most
/// `max_batch_size` (minimum 1).
///
/// `progress` receives `(done, total)` chunk counts; it is called with `(0, 0)`
/// whenever there is nothing to embed.
///
/// # Errors
/// Returns the error from `embed_fn` or `validate_embedding_batch`, or a
/// dimension-mismatch message when new vectors do not match the dimension of
/// the existing entries. Note that deletions and `ContentFresh` metadata
/// updates have already been applied to `self` when such an error is returned.
pub fn refresh_stale_files<F, P>(
    &mut self,
    project_root: &Path,
    current_files: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    progress: &mut P,
) -> Result<RefreshSummary, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    // Make sure every indexed file has a size entry before freshness checks,
    // which need mtime, size and hash together.
    self.backfill_missing_file_sizes();

    let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
    // Size of the union of "currently present" and "already indexed" files.
    let total_processed = current_set.len() + self.file_mtimes.len()
        - self
            .file_mtimes
            .keys()
            .filter(|path| current_set.contains(path.as_path()))
            .count();

    let mut deleted: Vec<PathBuf> = Vec::new();
    let mut changed: Vec<PathBuf> = Vec::new();
    // Snapshot the keys so the metadata maps can be updated inside the loop.
    let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
    for indexed_path in &indexed_paths {
        if !current_set.contains(indexed_path.as_path()) {
            deleted.push(indexed_path.clone());
            continue;
        }
        // A file only has usable cached freshness data when all three maps know it.
        let cached = match (
            self.file_mtimes.get(indexed_path),
            self.file_sizes.get(indexed_path),
            self.file_hashes.get(indexed_path),
        ) {
            (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                mtime: *mtime,
                size: *size,
                content_hash: *hash,
            }),
            _ => None,
        };
        match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
            Some(FreshnessVerdict::HotFresh) => {}
            // Content is unchanged: refresh the stored mtime/size, keep the embeddings.
            Some(FreshnessVerdict::ContentFresh {
                new_mtime,
                new_size,
            }) => {
                self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                self.file_sizes.insert(indexed_path.clone(), new_size);
            }
            // Missing cached metadata (`None`) is handled like a stale file.
            Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                changed.push(indexed_path.clone());
            }
        }
    }

    let mut added: Vec<PathBuf> = Vec::new();
    for path in current_files {
        if !self.file_mtimes.contains_key(path) {
            added.push(path.clone());
        }
    }

    // Fast path: nothing deleted, changed or added.
    if deleted.is_empty() && changed.is_empty() && added.is_empty() {
        progress(0, 0);
        return Ok(RefreshSummary {
            total_processed,
            ..RefreshSummary::default()
        });
    }

    // Drop entries and metadata of files that disappeared from `current_files`.
    if !deleted.is_empty() {
        let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
        self.entries
            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
        for path in &deleted {
            self.file_mtimes.remove(path);
            self.file_sizes.remove(path);
            self.file_hashes.remove(path);
        }
    }

    let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
    to_embed.extend(changed.iter().cloned());
    to_embed.extend(added.iter().cloned());

    // Only deletions happened.
    if to_embed.is_empty() {
        progress(0, 0);
        return Ok(RefreshSummary {
            changed: 0,
            added: 0,
            deleted: deleted.len(),
            total_processed,
        });
    }

    let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

    // No chunks were produced: there is nothing to embed, but files that
    // `collect_chunks` returned metadata for still get their old entries
    // removed and their metadata refreshed.
    if chunks.is_empty() {
        progress(0, 0);
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }
        // Only files present in `fresh_metadata` are counted in the summary.
        let changed_count = changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        let added_count = added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file.clone(), metadata.content_hash);
        }
        return Ok(RefreshSummary {
            changed: changed_count,
            added: added_count,
            deleted: deleted.len(),
            total_processed,
        });
    }

    let total_chunks = chunks.len();
    progress(0, total_chunks);
    // A batch size of 0 would make `step_by` panic, so clamp to at least 1.
    let batch_size = max_batch_size.max(1);
    // The stored dimension is only binding while the index still has entries.
    let existing_dimension = if self.entries.is_empty() {
        None
    } else {
        Some(self.dimension)
    };
    // New entries are staged here and merged into `self.entries` only after
    // every batch has been embedded successfully.
    let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
    let mut observed_dimension: Option<usize> = existing_dimension;

    for batch_start in (0..chunks.len()).step_by(batch_size) {
        let batch_end = (batch_start + batch_size).min(chunks.len());
        let batch_texts: Vec<String> = chunks[batch_start..batch_end]
            .iter()
            .map(|c| c.embed_text.clone())
            .collect();

        let vectors = embed_fn(batch_texts)?;
        // NOTE(review): presumably checks that the backend returned one vector
        // per input text; confirm against `validate_embedding_batch`.
        validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

        // Only the first vector of each batch is compared against the expected dimension.
        if let Some(dim) = vectors.first().map(|v| v.len()) {
            match observed_dimension {
                None => observed_dimension = Some(dim),
                Some(expected) if dim != expected => {
                    return Err(format!(
                        "embedding dimension changed during incremental refresh: \
                         cached index uses {expected}, new vectors use {dim}"
                    ));
                }
                _ => {}
            }
        }

        for (i, vector) in vectors.into_iter().enumerate() {
            let chunk_idx = batch_start + i;
            new_entries.push(EmbeddingEntry {
                chunk: chunks[chunk_idx].clone(),
                vector,
            });
        }

        progress(new_entries.len(), total_chunks);
    }

    // Replace the old entries of every file that `collect_chunks` returned
    // metadata for with the freshly embedded ones.
    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    if !successful_files.is_empty() {
        self.entries
            .retain(|entry| !successful_files.contains(&entry.chunk.file));
    }

    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        self.file_mtimes.insert(file.clone(), metadata.mtime);
        self.file_sizes.insert(file.clone(), metadata.size);
        self.file_hashes.insert(file, metadata.content_hash);
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    // Only files present in `fresh_metadata` are counted as changed/added.
    Ok(RefreshSummary {
        changed: changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        added: added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        deleted: deleted.len(),
        total_processed,
    })
}
1450
1451 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1453 if self.entries.is_empty() || query_vector.len() != self.dimension {
1454 return Vec::new();
1455 }
1456
1457 let mut scored: Vec<(f32, usize)> = self
1458 .entries
1459 .iter()
1460 .enumerate()
1461 .map(|(i, entry)| {
1462 let mut score = cosine_similarity(query_vector, &entry.vector);
1463 if entry.chunk.exported {
1464 score *= 1.1;
1465 }
1466 (score, i)
1467 })
1468 .collect();
1469
1470 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1472
1473 scored
1474 .into_iter()
1475 .take(top_k)
1476 .map(|(score, idx)| {
1480 let entry = &self.entries[idx];
1481 SemanticResult {
1482 file: entry.chunk.file.clone(),
1483 name: entry.chunk.name.clone(),
1484 kind: entry.chunk.kind.clone(),
1485 start_line: entry.chunk.start_line,
1486 end_line: entry.chunk.end_line,
1487 exported: entry.chunk.exported,
1488 snippet: entry.chunk.snippet.clone(),
1489 score,
1490 source: "semantic",
1491 }
1492 })
1493 .collect()
1494 }
1495
/// Returns the number of embedding entries currently held by the index.
pub fn len(&self) -> usize {
    self.entries.len()
}
1500
1501 pub fn is_file_stale(&self, file: &Path) -> bool {
1503 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1504 return true;
1505 };
1506 let Some(stored_size) = self.file_sizes.get(file) else {
1507 return true;
1508 };
1509 let Some(stored_hash) = self.file_hashes.get(file) else {
1510 return true;
1511 };
1512 let cached = FileFreshness {
1513 mtime: *stored_mtime,
1514 size: *stored_size,
1515 content_hash: *stored_hash,
1516 };
1517 match cache_freshness::verify_file(file, &cached) {
1518 FreshnessVerdict::HotFresh => false,
1519 FreshnessVerdict::ContentFresh { .. } => false,
1520 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1521 }
1522 }
1523
1524 fn backfill_missing_file_sizes(&mut self) {
1525 for path in self.file_mtimes.keys() {
1526 if self.file_sizes.contains_key(path) {
1527 continue;
1528 }
1529 if let Ok(metadata) = fs::metadata(path) {
1530 self.file_sizes.insert(path.clone(), metadata.len());
1531 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1532 self.file_hashes.insert(path.clone(), hash);
1533 }
1534 }
1535 }
1536 }
1537
/// Removes `file` from the index; delegates to [`Self::invalidate_file`].
pub fn remove_file(&mut self, file: &Path) {
    self.invalidate_file(file);
}
1542
1543 pub fn invalidate_file(&mut self, file: &Path) {
1544 self.entries.retain(|e| e.chunk.file != file);
1545 self.file_mtimes.remove(file);
1546 self.file_sizes.remove(file);
1547 self.file_hashes.remove(file);
1548 }
1549
/// Returns the embedding dimension recorded for this index.
pub fn dimension(&self) -> usize {
    self.dimension
}
1554
/// Returns the fingerprint attached to this index, if one has been set.
pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
    self.fingerprint.as_ref()
}
1558
1559 pub fn backend_label(&self) -> Option<&str> {
1560 self.fingerprint.as_ref().map(|f| f.backend.as_str())
1561 }
1562
1563 pub fn model_label(&self) -> Option<&str> {
1564 self.fingerprint.as_ref().map(|f| f.model.as_str())
1565 }
1566
/// Attaches `fingerprint` to the index, replacing any previous one.
pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
    self.fingerprint = Some(fingerprint);
}
1570
1571 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1573 if self.entries.is_empty() {
1576 slog_info!("skipping semantic index persistence (0 entries)");
1577 return;
1578 }
1579 let dir = storage_dir.join("semantic").join(project_key);
1580 if let Err(e) = fs::create_dir_all(&dir) {
1581 slog_warn!("failed to create semantic cache dir: {}", e);
1582 return;
1583 }
1584 let data_path = dir.join("semantic.bin");
1585 let tmp_path = dir.join(format!(
1586 "semantic.bin.tmp.{}.{}",
1587 std::process::id(),
1588 SystemTime::now()
1589 .duration_since(SystemTime::UNIX_EPOCH)
1590 .unwrap_or(Duration::ZERO)
1591 .as_nanos()
1592 ));
1593 let bytes = self.to_bytes();
1594 let write_result = (|| -> std::io::Result<()> {
1595 use std::io::Write;
1596 let mut file = fs::File::create(&tmp_path)?;
1597 file.write_all(&bytes)?;
1598 file.sync_all()?;
1599 Ok(())
1600 })();
1601 if let Err(e) = write_result {
1602 slog_warn!("failed to write semantic index: {}", e);
1603 let _ = fs::remove_file(&tmp_path);
1604 return;
1605 }
1606 if let Err(e) = fs::rename(&tmp_path, &data_path) {
1607 slog_warn!("failed to rename semantic index: {}", e);
1608 let _ = fs::remove_file(&tmp_path);
1609 return;
1610 }
1611 slog_info!(
1612 "semantic index persisted: {} entries, {:.1} KB",
1613 self.entries.len(),
1614 bytes.len() as f64 / 1024.0
1615 );
1616 }
1617
1618 pub fn read_from_disk(
1620 storage_dir: &Path,
1621 project_key: &str,
1622 current_canonical_root: &Path,
1623 is_worktree_bridge: bool,
1624 expected_fingerprint: Option<&str>,
1625 ) -> Option<Self> {
1626 debug_assert!(current_canonical_root.is_absolute());
1627 let data_path = storage_dir
1628 .join("semantic")
1629 .join(project_key)
1630 .join("semantic.bin");
1631 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1632 if file_len < HEADER_BYTES_V1 {
1633 slog_warn!(
1634 "corrupt semantic index (too small: {} bytes), removing",
1635 file_len
1636 );
1637 if !is_worktree_bridge {
1638 let _ = fs::remove_file(&data_path);
1639 }
1640 return None;
1641 }
1642
1643 let bytes = fs::read(&data_path).ok()?;
1644 let version = bytes[0];
1645 if version != SEMANTIC_INDEX_VERSION_V6 {
1646 slog_info!(
1647 "cached semantic index version {} is older than {}, rebuilding",
1648 version,
1649 SEMANTIC_INDEX_VERSION_V6
1650 );
1651 if !is_worktree_bridge {
1652 let _ = fs::remove_file(&data_path);
1653 }
1654 return None;
1655 }
1656 match Self::from_bytes(&bytes, current_canonical_root) {
1657 Ok(index) => {
1658 if index.entries.is_empty() {
1659 slog_info!("cached semantic index is empty, will rebuild");
1660 if !is_worktree_bridge {
1661 let _ = fs::remove_file(&data_path);
1662 }
1663 return None;
1664 }
1665 if let Some(expected) = expected_fingerprint {
1666 let matches = index
1667 .fingerprint()
1668 .map(|fingerprint| fingerprint.matches_expected(expected))
1669 .unwrap_or(false);
1670 if !matches {
1671 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1672 if !is_worktree_bridge {
1673 let _ = fs::remove_file(&data_path);
1674 }
1675 return None;
1676 }
1677 }
1678 slog_info!(
1679 "loaded semantic index from disk: {} entries",
1680 index.entries.len()
1681 );
1682 Some(index)
1683 }
1684 Err(e) => {
1685 slog_warn!("corrupt semantic index, rebuilding: {}", e);
1686 if !is_worktree_bridge {
1687 let _ = fs::remove_file(&data_path);
1688 }
1689 None
1690 }
1691 }
1692 }
1693
1694 pub fn to_bytes(&self) -> Vec<u8> {
1696 let mut buf = Vec::new();
1697 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1698 let encoded = fingerprint.as_string();
1699 if encoded.is_empty() {
1700 None
1701 } else {
1702 Some(encoded.into_bytes())
1703 }
1704 });
1705 let file_mtimes: Vec<_> = self
1706 .file_mtimes
1707 .iter()
1708 .filter_map(|(path, mtime)| {
1709 cache_relative_path(&self.project_root, path)
1710 .map(|relative| (relative, path, mtime))
1711 })
1712 .collect();
1713 let entries: Vec<_> = self
1714 .entries
1715 .iter()
1716 .filter_map(|entry| {
1717 cache_relative_path(&self.project_root, &entry.chunk.file)
1718 .map(|relative| (relative, entry))
1719 })
1720 .collect();
1721
1722 let version = SEMANTIC_INDEX_VERSION_V6;
1735 buf.push(version);
1736 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1737 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1738 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1739 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1740 buf.extend_from_slice(fp_bytes_ref);
1741
1742 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1745 for (relative, path, mtime) in &file_mtimes {
1746 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1747 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1748 buf.extend_from_slice(&path_bytes);
1749 let duration = mtime
1750 .duration_since(SystemTime::UNIX_EPOCH)
1751 .unwrap_or_default();
1752 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1753 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1754 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1755 buf.extend_from_slice(&size.to_le_bytes());
1756 let hash = self
1757 .file_hashes
1758 .get(*path)
1759 .copied()
1760 .unwrap_or_else(cache_freshness::zero_hash);
1761 buf.extend_from_slice(hash.as_bytes());
1762 }
1763
1764 for (relative, entry) in &entries {
1766 let c = &entry.chunk;
1767
1768 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1770 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1771 buf.extend_from_slice(&file_bytes);
1772
1773 let name_bytes = c.name.as_bytes();
1775 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1776 buf.extend_from_slice(name_bytes);
1777
1778 buf.push(symbol_kind_to_u8(&c.kind));
1780
1781 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1783 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1784 buf.push(c.exported as u8);
1785
1786 let snippet_bytes = c.snippet.as_bytes();
1788 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1789 buf.extend_from_slice(snippet_bytes);
1790
1791 let embed_bytes = c.embed_text.as_bytes();
1793 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1794 buf.extend_from_slice(embed_bytes);
1795
1796 for &val in &entry.vector {
1798 buf.extend_from_slice(&val.to_le_bytes());
1799 }
1800 }
1801
1802 buf
1803 }
1804
/// Decodes an index from its binary representation.
///
/// Accepts format versions 1 through 6 (all integers little-endian). Read
/// order: version byte, dimension (`u32`), entry count (`u32`), fingerprint
/// (V2+ only: `u32` length followed by JSON), file-record count (`u32`), the
/// file records, then the entry records. Version-dependent file-record fields:
/// mtime nanoseconds (V3+), size (V5+) and a 32-byte content hash (V6 only);
/// fields absent from older versions default to 0 / `zero_hash()`.
///
/// For V6 every stored path is resolved through `cached_path_under_root`
/// against `current_canonical_root`; older versions keep the path as stored.
/// `current_canonical_root` must be absolute (checked in debug builds) and
/// becomes the `project_root` of the returned index.
///
/// # Errors
/// Returns a descriptive string when the data is truncated, uses an
/// unsupported version, declares a dimension of 0 or above `MAX_DIMENSION`,
/// declares more than `MAX_ENTRIES` entries or file records, contains an
/// invalid fingerprint or mtime, contains a V6 path that
/// `cached_path_under_root` rejects, or contains an entry whose file has no
/// file record.
pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
    debug_assert!(current_canonical_root.is_absolute());
    // Read cursor into `data`; every read helper advances it.
    let mut pos = 0;

    if data.len() < HEADER_BYTES_V1 {
        return Err("data too short".to_string());
    }

    let version = data[pos];
    pos += 1;
    if version != SEMANTIC_INDEX_VERSION_V1
        && version != SEMANTIC_INDEX_VERSION_V2
        && version != SEMANTIC_INDEX_VERSION_V3
        && version != SEMANTIC_INDEX_VERSION_V4
        && version != SEMANTIC_INDEX_VERSION_V5
        && version != SEMANTIC_INDEX_VERSION_V6
    {
        return Err(format!("unsupported version: {}", version));
    }
    // V2 and later carry an extra `u32` fingerprint length in the header.
    if (version == SEMANTIC_INDEX_VERSION_V2
        || version == SEMANTIC_INDEX_VERSION_V3
        || version == SEMANTIC_INDEX_VERSION_V4
        || version == SEMANTIC_INDEX_VERSION_V5
        || version == SEMANTIC_INDEX_VERSION_V6)
        && data.len() < HEADER_BYTES_V2
    {
        return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
    }

    let dimension = read_u32(data, &mut pos)? as usize;
    let entry_count = read_u32(data, &mut pos)? as usize;
    if dimension == 0 || dimension > MAX_DIMENSION {
        return Err(format!("invalid embedding dimension: {}", dimension));
    }
    if entry_count > MAX_ENTRIES {
        return Err(format!("too many semantic index entries: {}", entry_count));
    }

    let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
        || version == SEMANTIC_INDEX_VERSION_V3
        || version == SEMANTIC_INDEX_VERSION_V4
        || version == SEMANTIC_INDEX_VERSION_V5
        || version == SEMANTIC_INDEX_VERSION_V6;
    let fingerprint = if has_fingerprint_field {
        let fingerprint_len = read_u32(data, &mut pos)? as usize;
        if pos + fingerprint_len > data.len() {
            return Err("unexpected end of data reading fingerprint".to_string());
        }
        // A zero-length fingerprint means "no fingerprint stored".
        if fingerprint_len == 0 {
            None
        } else {
            let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
            pos += fingerprint_len;
            Some(
                serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                    .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
            )
        }
    } else {
        None
    };

    let mtime_count = read_u32(data, &mut pos)? as usize;
    if mtime_count > MAX_ENTRIES {
        return Err(format!("too many semantic file mtimes: {}", mtime_count));
    }

    // Reject headers that promise more vector data than remains in the buffer
    // before any map or entry storage is allocated from those counts.
    let vector_bytes = entry_count
        .checked_mul(dimension)
        .and_then(|count| count.checked_mul(F32_BYTES))
        .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
    if vector_bytes > data.len().saturating_sub(pos) {
        return Err("semantic index vectors exceed available data".to_string());
    }

    let mut file_mtimes = HashMap::with_capacity(mtime_count);
    let mut file_sizes = HashMap::with_capacity(mtime_count);
    let mut file_hashes = HashMap::with_capacity(mtime_count);
    for _ in 0..mtime_count {
        let path = read_string(data, &mut pos)?;
        let secs = read_u64(data, &mut pos)?;
        // Sub-second mtime precision exists from V3 onwards.
        let nanos = if version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6
        {
            read_u32(data, &mut pos)?
        } else {
            0
        };
        // File sizes exist from V5 onwards.
        let size =
            if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                read_u64(data, &mut pos)?
            } else {
                0
            };
        // Content hashes exist in V6 only; older files get the zero hash.
        let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
            if pos + 32 > data.len() {
                return Err("unexpected end of data reading content hash".to_string());
            }
            let mut hash_bytes = [0u8; 32];
            hash_bytes.copy_from_slice(&data[pos..pos + 32]);
            pos += 32;
            blake3::Hash::from_bytes(hash_bytes)
        } else {
            cache_freshness::zero_hash()
        };
        // Checked explicitly and reported as an error rather than passing an
        // out-of-range nanosecond count to `Duration::new`.
        if nanos >= 1_000_000_000 {
            return Err(format!(
                "invalid semantic mtime: nanos {} >= 1_000_000_000",
                nanos
            ));
        }
        let duration = std::time::Duration::new(secs, nanos);
        let mtime = SystemTime::UNIX_EPOCH
            .checked_add(duration)
            .ok_or_else(|| {
                format!(
                    "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                    secs, nanos
                )
            })?;
        // V6 paths go through `cached_path_under_root`; `None` is reported as
        // a path escaping the project root.
        let path = if version == SEMANTIC_INDEX_VERSION_V6 {
            cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
        } else {
            PathBuf::from(path)
        };
        file_mtimes.insert(path.clone(), mtime);
        file_sizes.insert(path.clone(), size);
        file_hashes.insert(path, content_hash);
    }

    let mut entries = Vec::with_capacity(entry_count);
    for _ in 0..entry_count {
        let raw_file = PathBuf::from(read_string(data, &mut pos)?);
        let file = if version == SEMANTIC_INDEX_VERSION_V6 {
            cached_path_under_root(current_canonical_root, &raw_file)
                .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
        } else {
            raw_file
        };
        let name = read_string(data, &mut pos)?;

        if pos >= data.len() {
            return Err("unexpected end of data".to_string());
        }
        let kind = u8_to_symbol_kind(data[pos]);
        pos += 1;

        let start_line = read_u32(data, &mut pos)?;
        let end_line = read_u32(data, &mut pos)?;

        if pos >= data.len() {
            return Err("unexpected end of data".to_string());
        }
        // Any non-zero byte counts as "exported".
        let exported = data[pos] != 0;
        pos += 1;

        let snippet = read_string(data, &mut pos)?;
        let embed_text = read_string(data, &mut pos)?;

        // Each entry stores exactly `dimension` little-endian `f32` values.
        let vec_bytes = dimension
            .checked_mul(F32_BYTES)
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if pos + vec_bytes > data.len() {
            return Err("unexpected end of data reading vector".to_string());
        }
        let mut vector = Vec::with_capacity(dimension);
        for _ in 0..dimension {
            let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
            vector.push(f32::from_le_bytes(bytes));
            pos += 4;
        }

        entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file,
                name,
                kind,
                start_line,
                end_line,
                exported,
                embed_text,
                snippet,
            },
            vector,
        });
    }

    // Defensive consistency check: the loop above pushes exactly one entry per
    // iteration or returns early.
    if entries.len() != entry_count {
        return Err(format!(
            "semantic cache entry count drift: header={} decoded={}",
            entry_count,
            entries.len()
        ));
    }
    // Every entry must belong to a file that has a file record.
    for entry in &entries {
        if !file_mtimes.contains_key(&entry.chunk.file) {
            return Err(format!(
                "semantic cache metadata missing for entry file {}",
                entry.chunk.file.display()
            ));
        }
    }

    Ok(Self {
        entries,
        file_mtimes,
        file_sizes,
        file_hashes,
        dimension,
        fingerprint,
        project_root: current_canonical_root.to_path_buf(),
    })
}
2044}
2045
2046fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2048 let relative = file
2049 .strip_prefix(project_root)
2050 .unwrap_or(file)
2051 .to_string_lossy();
2052
2053 let kind_label = match &symbol.kind {
2054 SymbolKind::Function => "function",
2055 SymbolKind::Class => "class",
2056 SymbolKind::Method => "method",
2057 SymbolKind::Struct => "struct",
2058 SymbolKind::Interface => "interface",
2059 SymbolKind::Enum => "enum",
2060 SymbolKind::TypeAlias => "type",
2061 SymbolKind::Variable => "variable",
2062 SymbolKind::Heading => "heading",
2063 SymbolKind::FileSummary => "file-summary",
2064 };
2065
2066 let name = &symbol.name;
2068 let mut text = format!(
2069 "name:{name} file:{} kind:{} name:{name}",
2070 relative, kind_label
2071 );
2072
2073 if let Some(sig) = &symbol.signature {
2074 text.push_str(&format!(" signature:{}", sig));
2075 }
2076
2077 let lines: Vec<&str> = source.lines().collect();
2079 let start = (symbol.range.start_line as usize).min(lines.len());
2080 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2082 if start < end {
2083 let body: String = lines[start..end]
2084 .iter()
2085 .take(15) .copied()
2087 .collect::<Vec<&str>>()
2088 .join("\n");
2089 let snippet = if body.len() > 300 {
2090 format!("{}...", &body[..body.floor_char_boundary(300)])
2091 } else {
2092 body
2093 };
2094 text.push_str(&format!(" body:{}", snippet));
2095 }
2096
2097 text
2098}
2099
/// Returns the first `max_chars` characters of `value` as an owned string.
///
/// Counts Unicode scalar values, not bytes; a shorter `value` is returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // Byte offset of the first character that must be dropped, if any.
    let cut = value.char_indices().nth(max_chars).map(|(offset, _)| offset);
    match cut {
        Some(offset) => value[..offset].to_string(),
        None => value.to_string(),
    }
}
2103
2104fn first_leading_doc_comment(source: &str) -> String {
2105 let lines: Vec<&str> = source.lines().collect();
2106 let Some((start, first)) = lines
2107 .iter()
2108 .enumerate()
2109 .find(|(_, line)| !line.trim().is_empty())
2110 else {
2111 return String::new();
2112 };
2113
2114 let trimmed = first.trim_start();
2115 if trimmed.starts_with("/**") {
2116 let mut comment = Vec::new();
2117 for line in lines.iter().skip(start) {
2118 comment.push(*line);
2119 if line.contains("*/") {
2120 break;
2121 }
2122 }
2123 return truncate_chars(&comment.join("\n"), 200);
2124 }
2125
2126 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2127 let comment = lines
2128 .iter()
2129 .skip(start)
2130 .take_while(|line| {
2131 let trimmed = line.trim_start();
2132 trimmed.starts_with("///") || trimmed.starts_with("//!")
2133 })
2134 .copied()
2135 .collect::<Vec<_>>()
2136 .join("\n");
2137 return truncate_chars(&comment, 200);
2138 }
2139
2140 String::new()
2141}
2142
2143pub fn build_file_summary_chunk(
2144 file: &Path,
2145 project_root: &Path,
2146 source: &str,
2147 top_exports: &[&str],
2148 top_export_signatures: &[Option<&str>],
2149) -> SemanticChunk {
2150 let relative = file.strip_prefix(project_root).unwrap_or(file);
2151 let rel_path = relative.to_string_lossy();
2152 let parent_dir = relative
2153 .parent()
2154 .map(|parent| parent.to_string_lossy().to_string())
2155 .unwrap_or_default();
2156 let name = file
2157 .file_stem()
2158 .map(|stem| stem.to_string_lossy().to_string())
2159 .unwrap_or_default();
2160 let doc = first_leading_doc_comment(source);
2161 let exports = top_exports
2162 .iter()
2163 .take(5)
2164 .copied()
2165 .collect::<Vec<_>>()
2166 .join(",");
2167 let snippet = if doc.is_empty() {
2168 top_export_signatures
2169 .first()
2170 .and_then(|signature| signature.as_deref())
2171 .map(|signature| truncate_chars(signature, 200))
2172 .unwrap_or_default()
2173 } else {
2174 doc.clone()
2175 };
2176
2177 SemanticChunk {
2178 file: file.to_path_buf(),
2179 name,
2180 kind: SymbolKind::FileSummary,
2181 start_line: 0,
2182 end_line: 0,
2183 exported: false,
2184 embed_text: format!(
2185 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2186 file.file_stem()
2187 .map(|stem| stem.to_string_lossy().to_string())
2188 .unwrap_or_default()
2189 ),
2190 snippet,
2191 }
2192}
2193
2194fn parser_for(
2195 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2196 lang: crate::parser::LangId,
2197) -> Result<&mut Parser, String> {
2198 use std::collections::hash_map::Entry;
2199
2200 match parsers.entry(lang) {
2201 Entry::Occupied(entry) => Ok(entry.into_mut()),
2202 Entry::Vacant(entry) => {
2203 let grammar = grammar_for(lang);
2204 let mut parser = Parser::new();
2205 parser
2206 .set_language(&grammar)
2207 .map_err(|error| error.to_string())?;
2208 Ok(entry.insert(parser))
2209 }
2210 }
2211}
2212
/// Reports whether `path` has one of the file extensions accepted for
/// semantic indexing.
///
/// The comparison is case-sensitive; paths without an extension or with a
/// non-UTF-8 extension are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    let Some(extension) = path.extension().and_then(|extension| extension.to_str()) else {
        return false;
    };
    INDEXED_EXTENSIONS.contains(&extension)
}
2240
2241fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2242 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2243 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2244 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2245 .map_err(|error| error.to_string())?
2246 .unwrap_or_else(cache_freshness::zero_hash);
2247 Ok(IndexedFileMetadata {
2248 mtime,
2249 size: metadata.len(),
2250 content_hash,
2251 })
2252}
2253
2254fn collect_file_chunks(
2255 project_root: &Path,
2256 file: &Path,
2257 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2258) -> Result<Vec<SemanticChunk>, String> {
2259 if !is_semantic_indexed_extension(file) {
2260 return Err("unsupported file extension".to_string());
2261 }
2262 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2263 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2264 let tree = parser_for(parsers, lang)?
2265 .parse(&source, None)
2266 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2267 let symbols =
2268 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2269
2270 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2271}
2272
2273fn build_snippet(symbol: &Symbol, source: &str) -> String {
2275 let lines: Vec<&str> = source.lines().collect();
2276 let start = (symbol.range.start_line as usize).min(lines.len());
2277 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2279 if start < end {
2280 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2281 let mut snippet = snippet_lines.join("\n");
2282 if end - start > 5 {
2283 snippet.push_str("\n ...");
2284 }
2285 if snippet.len() > 300 {
2286 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2287 }
2288 snippet
2289 } else {
2290 String::new()
2291 }
2292}
2293
2294fn symbols_to_chunks(
2296 file: &Path,
2297 symbols: &[Symbol],
2298 source: &str,
2299 project_root: &Path,
2300) -> Vec<SemanticChunk> {
2301 let mut chunks = Vec::new();
2302 let top_exports_with_signatures = symbols
2303 .iter()
2304 .filter(|symbol| {
2305 symbol.exported
2306 && symbol.parent.is_none()
2307 && !matches!(symbol.kind, SymbolKind::Heading)
2308 })
2309 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2310 .collect::<Vec<_>>();
2311
2312 let has_only_headings = !symbols.is_empty()
2313 && symbols
2314 .iter()
2315 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2316 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2317 let top_exports = top_exports_with_signatures
2318 .iter()
2319 .map(|(name, _)| *name)
2320 .collect::<Vec<_>>();
2321 let top_export_signatures = top_exports_with_signatures
2322 .iter()
2323 .map(|(_, signature)| *signature)
2324 .collect::<Vec<_>>();
2325 chunks.push(build_file_summary_chunk(
2326 file,
2327 project_root,
2328 source,
2329 &top_exports,
2330 &top_export_signatures,
2331 ));
2332 }
2333
2334 for symbol in symbols {
2335 if matches!(symbol.kind, SymbolKind::Heading) {
2340 continue;
2341 }
2342
2343 let line_count = symbol
2345 .range
2346 .end_line
2347 .saturating_sub(symbol.range.start_line)
2348 + 1;
2349 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2350 continue;
2351 }
2352
2353 let embed_text = build_embed_text(symbol, source, file, project_root);
2354 let snippet = build_snippet(symbol, source);
2355
2356 chunks.push(SemanticChunk {
2357 file: file.to_path_buf(),
2358 name: symbol.name.clone(),
2359 kind: symbol.kind.clone(),
2360 start_line: symbol.range.start_line,
2361 end_line: symbol.range.end_line,
2362 exported: symbol.exported,
2363 embed_text,
2364 snippet,
2365 });
2366
2367 }
2370
2371 chunks
2372}
2373
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero norm (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2397
2398fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2400 match kind {
2401 SymbolKind::Function => 0,
2402 SymbolKind::Class => 1,
2403 SymbolKind::Method => 2,
2404 SymbolKind::Struct => 3,
2405 SymbolKind::Interface => 4,
2406 SymbolKind::Enum => 5,
2407 SymbolKind::TypeAlias => 6,
2408 SymbolKind::Variable => 7,
2409 SymbolKind::Heading => 8,
2410 SymbolKind::FileSummary => 9,
2411 }
2412}
2413
2414fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2415 match v {
2416 0 => SymbolKind::Function,
2417 1 => SymbolKind::Class,
2418 2 => SymbolKind::Method,
2419 3 => SymbolKind::Struct,
2420 4 => SymbolKind::Interface,
2421 5 => SymbolKind::Enum,
2422 6 => SymbolKind::TypeAlias,
2423 7 => SymbolKind::Variable,
2424 8 => SymbolKind::Heading,
2425 9 => SymbolKind::FileSummary,
2426 _ => SymbolKind::Heading,
2427 }
2428}
2429
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by 4.
///
/// # Errors
///
/// Returns an error and leaves `*pos` untouched when fewer than four bytes
/// are available at `*pos`. This includes a cursor that already lies past the
/// end of `data` or is so large that `*pos + 4` would overflow `usize`.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    // `checked_add` + `get` keep a corrupt cursor from overflowing the bounds
    // arithmetic or panicking on the slice operation.
    let window = pos
        .checked_add(4)
        .and_then(|end| data.get(*pos..end))
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut raw = [0u8; 4];
    raw.copy_from_slice(window);
    *pos += 4;
    Ok(u32::from_le_bytes(raw))
}
2438
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by 8.
///
/// # Errors
///
/// Returns an error and leaves `*pos` untouched when fewer than eight bytes
/// are available at `*pos`. This includes a cursor that already lies past the
/// end of `data` or is so large that `*pos + 8` would overflow `usize`.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    // `checked_add` + `get` keep a corrupt cursor from overflowing the bounds
    // arithmetic or panicking on the slice operation.
    let window = pos
        .checked_add(8)
        .and_then(|end| data.get(*pos..end))
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut raw = [0u8; 8];
    raw.copy_from_slice(window);
    *pos += 8;
    Ok(u64::from_le_bytes(raw))
}
2447
2448fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2449 let len = read_u32(data, pos)? as usize;
2450 if *pos + len > data.len() {
2451 return Err("unexpected end of data reading string".to_string());
2452 }
2453 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2454 *pos += len;
2455 Ok(s)
2456}
2457
2458#[cfg(test)]
2459mod tests {
2460 use super::*;
2461 use crate::config::{SemanticBackend, SemanticBackendConfig};
2462 use crate::parser::FileParser;
2463 use std::io::{Read, Write};
2464 use std::net::TcpListener;
2465 use std::thread;
2466
2467 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2468 where
2469 F: Fn(String, String, String) -> String + Send + 'static,
2470 {
2471 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2472 let addr = listener.local_addr().expect("local addr");
2473 let handle = thread::spawn(move || {
2474 let (mut stream, _) = listener.accept().expect("accept request");
2475 let mut buf = Vec::new();
2476 let mut chunk = [0u8; 4096];
2477 let mut header_end = None;
2478 let mut content_length = 0usize;
2479 loop {
2480 let n = stream.read(&mut chunk).expect("read request");
2481 if n == 0 {
2482 break;
2483 }
2484 buf.extend_from_slice(&chunk[..n]);
2485 if header_end.is_none() {
2486 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2487 header_end = Some(pos + 4);
2488 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2489 for line in headers.lines() {
2490 if let Some(value) = line.strip_prefix("Content-Length:") {
2491 content_length = value.trim().parse::<usize>().unwrap_or(0);
2492 }
2493 }
2494 }
2495 }
2496 if let Some(end) = header_end {
2497 if buf.len() >= end + content_length {
2498 break;
2499 }
2500 }
2501 }
2502
2503 let end = header_end.expect("header terminator");
2504 let request = String::from_utf8_lossy(&buf[..end]).to_string();
2505 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2506 let mut lines = request.lines();
2507 let request_line = lines.next().expect("request line").to_string();
2508 let path = request_line
2509 .split_whitespace()
2510 .nth(1)
2511 .expect("request path")
2512 .to_string();
2513 let response_body = handler(request_line, path, body);
2514 let response = format!(
2515 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2516 response_body.len(),
2517 response_body
2518 );
2519 stream
2520 .write_all(response.as_bytes())
2521 .expect("write response");
2522 });
2523
2524 (format!("http://{}", addr), handle)
2525 }
2526
2527 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2528 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2529 }
2530
2531 fn write_rust_file(path: &Path, function_name: &str) {
2532 fs::write(
2533 path,
2534 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
2535 )
2536 .unwrap();
2537 }
2538
2539 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2540 let mut embed = test_vector_for_texts;
2541 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2542 }
2543
2544 fn test_project_root() -> PathBuf {
2545 std::env::current_dir().unwrap()
2546 }
2547
2548 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2549 index.file_mtimes.insert(file.to_path_buf(), mtime);
2550 index.file_sizes.insert(file.to_path_buf(), size);
2551 index
2552 .file_hashes
2553 .insert(file.to_path_buf(), cache_freshness::zero_hash());
2554 }
2555
2556 #[test]
2557 fn semantic_cache_serialization_skips_paths_outside_project_root() {
2558 let dir = tempfile::tempdir().expect("create temp dir");
2559 let project = fs::canonicalize(dir.path()).expect("canonical project");
2560 let outside = project.join("..").join("outside.rs");
2561 let mut index = SemanticIndex::new(project.clone(), 3);
2562 index
2563 .file_mtimes
2564 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2565 index.file_sizes.insert(outside.clone(), 1);
2566 index
2567 .file_hashes
2568 .insert(outside.clone(), cache_freshness::zero_hash());
2569 index.entries.push(EmbeddingEntry {
2570 chunk: SemanticChunk {
2571 file: outside,
2572 name: "outside".to_string(),
2573 kind: SymbolKind::Function,
2574 start_line: 0,
2575 end_line: 0,
2576 exported: false,
2577 embed_text: "outside".to_string(),
2578 snippet: "outside".to_string(),
2579 },
2580 vector: vec![1.0, 0.0, 0.0],
2581 });
2582
2583 let bytes = index.to_bytes();
2584 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2585 assert_eq!(loaded.entries.len(), 0);
2586 assert!(loaded.file_mtimes.is_empty());
2587 }
2588
2589 #[test]
2590 fn test_cosine_similarity_identical() {
2591 let a = vec![1.0, 0.0, 0.0];
2592 let b = vec![1.0, 0.0, 0.0];
2593 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2594 }
2595
2596 #[test]
2597 fn test_cosine_similarity_orthogonal() {
2598 let a = vec![1.0, 0.0, 0.0];
2599 let b = vec![0.0, 1.0, 0.0];
2600 assert!(cosine_similarity(&a, &b).abs() < 0.001);
2601 }
2602
2603 #[test]
2604 fn test_cosine_similarity_opposite() {
2605 let a = vec![1.0, 0.0, 0.0];
2606 let b = vec![-1.0, 0.0, 0.0];
2607 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2608 }
2609
2610 #[test]
2611 fn test_serialization_roundtrip() {
2612 let project_root = test_project_root();
2613 let file = project_root.join("src/main.rs");
2614 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2615 index.entries.push(EmbeddingEntry {
2616 chunk: SemanticChunk {
2617 file: file.clone(),
2618 name: "handle_request".to_string(),
2619 kind: SymbolKind::Function,
2620 start_line: 10,
2621 end_line: 25,
2622 exported: true,
2623 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2624 snippet: "fn handle_request() {\n // ...\n}".to_string(),
2625 },
2626 vector: vec![0.1, 0.2, 0.3, 0.4],
2627 });
2628 index.dimension = 4;
2629 index
2630 .file_mtimes
2631 .insert(file.clone(), SystemTime::UNIX_EPOCH);
2632 index.file_sizes.insert(file, 0);
2633 index.set_fingerprint(SemanticIndexFingerprint {
2634 backend: "fastembed".to_string(),
2635 model: "all-MiniLM-L6-v2".to_string(),
2636 base_url: FALLBACK_BACKEND.to_string(),
2637 dimension: 4,
2638 chunking_version: default_chunking_version(),
2639 });
2640
2641 let bytes = index.to_bytes();
2642 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2643
2644 assert_eq!(restored.entries.len(), 1);
2645 assert_eq!(restored.entries[0].chunk.name, "handle_request");
2646 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2647 assert_eq!(restored.dimension, 4);
2648 assert_eq!(restored.backend_label(), Some("fastembed"));
2649 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2650 }
2651
2652 #[test]
2653 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2654 let cases = [
2655 (SymbolKind::Function, 0),
2656 (SymbolKind::Class, 1),
2657 (SymbolKind::Method, 2),
2658 (SymbolKind::Struct, 3),
2659 (SymbolKind::Interface, 4),
2660 (SymbolKind::Enum, 5),
2661 (SymbolKind::TypeAlias, 6),
2662 (SymbolKind::Variable, 7),
2663 (SymbolKind::Heading, 8),
2664 (SymbolKind::FileSummary, 9),
2665 ];
2666
2667 for (kind, encoded) in cases {
2668 assert_eq!(symbol_kind_to_u8(&kind), encoded);
2669 assert_eq!(u8_to_symbol_kind(encoded), kind);
2670 }
2671 }
2672
2673 #[test]
2674 fn test_search_top_k() {
2675 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2676 index.dimension = 3;
2677
2678 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2680 let mut vec = vec![0.0f32; 3];
2681 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
2683 chunk: SemanticChunk {
2684 file: PathBuf::from("/src/lib.rs"),
2685 name: name.to_string(),
2686 kind: SymbolKind::Function,
2687 start_line: (i * 10 + 1) as u32,
2688 end_line: (i * 10 + 5) as u32,
2689 exported: true,
2690 embed_text: format!("kind:function name:{}", name),
2691 snippet: format!("fn {}() {{}}", name),
2692 },
2693 vector: vec,
2694 });
2695 }
2696
2697 let query = vec![0.9, 0.1, 0.0];
2699 let results = index.search(&query, 2);
2700
2701 assert_eq!(results.len(), 2);
2702 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
2704 }
2705
2706 #[test]
2707 fn test_empty_index_search() {
2708 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2709 let results = index.search(&[0.1, 0.2, 0.3], 10);
2710 assert!(results.is_empty());
2711 }
2712
2713 #[test]
2714 fn single_line_symbol_builds_non_empty_snippet() {
2715 let symbol = Symbol {
2716 name: "answer".to_string(),
2717 kind: SymbolKind::Variable,
2718 range: crate::symbols::Range {
2719 start_line: 0,
2720 start_col: 0,
2721 end_line: 0,
2722 end_col: 24,
2723 },
2724 signature: Some("const answer = 42".to_string()),
2725 scope_chain: Vec::new(),
2726 exported: true,
2727 parent: None,
2728 };
2729 let source = "export const answer = 42;\n";
2730
2731 let snippet = build_snippet(&symbol, source);
2732
2733 assert_eq!(snippet, "export const answer = 42;");
2734 }
2735
2736 #[test]
2737 fn optimized_file_chunk_collection_matches_file_parser_path() {
2738 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2739 let file = project_root.join("src/semantic_index.rs");
2740 let source = std::fs::read_to_string(&file).unwrap();
2741
2742 let mut legacy_parser = FileParser::new();
2743 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2744 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2745
2746 let mut parsers = HashMap::new();
2747 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2748
2749 assert_eq!(
2750 chunk_fingerprint(&optimized_chunks),
2751 chunk_fingerprint(&legacy_chunks)
2752 );
2753 }
2754
2755 fn chunk_fingerprint(
2756 chunks: &[SemanticChunk],
2757 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2758 chunks
2759 .iter()
2760 .map(|chunk| {
2761 (
2762 chunk.name.clone(),
2763 chunk.kind.clone(),
2764 chunk.start_line,
2765 chunk.end_line,
2766 chunk.exported,
2767 chunk.embed_text.clone(),
2768 chunk.snippet.clone(),
2769 )
2770 })
2771 .collect()
2772 }
2773
2774 #[test]
2775 fn rejects_oversized_dimension_during_deserialization() {
2776 let mut bytes = Vec::new();
2777 bytes.push(1u8);
2778 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2779 bytes.extend_from_slice(&0u32.to_le_bytes());
2780 bytes.extend_from_slice(&0u32.to_le_bytes());
2781
2782 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2783 }
2784
2785 #[test]
2786 fn rejects_oversized_entry_count_during_deserialization() {
2787 let mut bytes = Vec::new();
2788 bytes.push(1u8);
2789 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2790 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2791 bytes.extend_from_slice(&0u32.to_le_bytes());
2792
2793 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2794 }
2795
2796 #[test]
2797 fn invalidate_file_removes_entries_and_mtime() {
2798 let target = PathBuf::from("/src/main.rs");
2799 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2800 index.entries.push(EmbeddingEntry {
2801 chunk: SemanticChunk {
2802 file: target.clone(),
2803 name: "main".to_string(),
2804 kind: SymbolKind::Function,
2805 start_line: 0,
2806 end_line: 1,
2807 exported: false,
2808 embed_text: "main".to_string(),
2809 snippet: "fn main() {}".to_string(),
2810 },
2811 vector: vec![1.0; DEFAULT_DIMENSION],
2812 });
2813 index
2814 .file_mtimes
2815 .insert(target.clone(), SystemTime::UNIX_EPOCH);
2816 index.file_sizes.insert(target.clone(), 0);
2817
2818 index.invalidate_file(&target);
2819
2820 assert!(index.entries.is_empty());
2821 assert!(!index.file_mtimes.contains_key(&target));
2822 assert!(!index.file_sizes.contains_key(&target));
2823 }
2824
2825 #[test]
2826 fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2827 let temp = tempfile::tempdir().unwrap();
2828 let project_root = temp.path();
2829 let file = project_root.join("src/lib.rs");
2830 fs::create_dir_all(file.parent().unwrap()).unwrap();
2831 write_rust_file(&file, "kept_symbol");
2832
2833 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2834 let original_entry_count = index.entries.len();
2835 let original_mtime = *index.file_mtimes.get(&file).unwrap();
2836 let original_size = *index.file_sizes.get(&file).unwrap();
2837
2838 let stale_mtime = SystemTime::UNIX_EPOCH;
2839 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2840 fs::remove_file(&file).unwrap();
2841
2842 let mut embed = test_vector_for_texts;
2843 let mut progress = |_done: usize, _total: usize| {};
2844 let summary = index
2845 .refresh_stale_files(
2846 project_root,
2847 std::slice::from_ref(&file),
2848 &mut embed,
2849 8,
2850 &mut progress,
2851 )
2852 .unwrap();
2853
2854 assert_eq!(summary.changed, 0);
2855 assert_eq!(summary.added, 0);
2856 assert_eq!(summary.deleted, 0);
2857 assert_eq!(index.entries.len(), original_entry_count);
2858 assert!(index
2859 .entries
2860 .iter()
2861 .any(|entry| entry.chunk.name == "kept_symbol"));
2862 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2863 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2864 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2865 }
2866
2867 #[test]
2868 fn refresh_never_indexed_file_error_does_not_record_mtime() {
2869 let temp = tempfile::tempdir().unwrap();
2870 let project_root = temp.path();
2871 let missing = project_root.join("src/missing.rs");
2872 fs::create_dir_all(missing.parent().unwrap()).unwrap();
2873
2874 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2875 let mut embed = test_vector_for_texts;
2876 let mut progress = |_done: usize, _total: usize| {};
2877 let summary = index
2878 .refresh_stale_files(
2879 project_root,
2880 std::slice::from_ref(&missing),
2881 &mut embed,
2882 8,
2883 &mut progress,
2884 )
2885 .unwrap();
2886
2887 assert_eq!(summary.added, 0);
2888 assert_eq!(summary.changed, 0);
2889 assert_eq!(summary.deleted, 0);
2890 assert!(!index.file_mtimes.contains_key(&missing));
2891 assert!(!index.file_sizes.contains_key(&missing));
2892 assert!(index.entries.is_empty());
2893 }
2894
2895 #[test]
2896 fn refresh_reports_added_for_new_files() {
2897 let temp = tempfile::tempdir().unwrap();
2898 let project_root = temp.path();
2899 let existing = project_root.join("src/lib.rs");
2900 let added = project_root.join("src/new.rs");
2901 fs::create_dir_all(existing.parent().unwrap()).unwrap();
2902 write_rust_file(&existing, "existing_symbol");
2903 write_rust_file(&added, "added_symbol");
2904
2905 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2906 let mut embed = test_vector_for_texts;
2907 let mut progress = |_done: usize, _total: usize| {};
2908 let summary = index
2909 .refresh_stale_files(
2910 project_root,
2911 &[existing.clone(), added.clone()],
2912 &mut embed,
2913 8,
2914 &mut progress,
2915 )
2916 .unwrap();
2917
2918 assert_eq!(summary.added, 1);
2919 assert_eq!(summary.changed, 0);
2920 assert_eq!(summary.deleted, 0);
2921 assert_eq!(summary.total_processed, 2);
2922 assert!(index.file_mtimes.contains_key(&added));
2923 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2924 }
2925
2926 #[test]
2927 fn refresh_reports_deleted_for_removed_files() {
2928 let temp = tempfile::tempdir().unwrap();
2929 let project_root = temp.path();
2930 let deleted = project_root.join("src/deleted.rs");
2931 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2932 write_rust_file(&deleted, "deleted_symbol");
2933
2934 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2935 fs::remove_file(&deleted).unwrap();
2936
2937 let mut embed = test_vector_for_texts;
2938 let mut progress = |_done: usize, _total: usize| {};
2939 let summary = index
2940 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2941 .unwrap();
2942
2943 assert_eq!(summary.deleted, 1);
2944 assert_eq!(summary.changed, 0);
2945 assert_eq!(summary.added, 0);
2946 assert_eq!(summary.total_processed, 1);
2947 assert!(!index.file_mtimes.contains_key(&deleted));
2948 assert!(index.entries.is_empty());
2949 }
2950
2951 #[test]
2952 fn refresh_reports_changed_for_modified_files() {
2953 let temp = tempfile::tempdir().unwrap();
2954 let project_root = temp.path();
2955 let file = project_root.join("src/lib.rs");
2956 fs::create_dir_all(file.parent().unwrap()).unwrap();
2957 write_rust_file(&file, "old_symbol");
2958
2959 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2960 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2961 write_rust_file(&file, "new_symbol");
2962
2963 let mut embed = test_vector_for_texts;
2964 let mut progress = |_done: usize, _total: usize| {};
2965 let summary = index
2966 .refresh_stale_files(
2967 project_root,
2968 std::slice::from_ref(&file),
2969 &mut embed,
2970 8,
2971 &mut progress,
2972 )
2973 .unwrap();
2974
2975 assert_eq!(summary.changed, 1);
2976 assert_eq!(summary.added, 0);
2977 assert_eq!(summary.deleted, 0);
2978 assert_eq!(summary.total_processed, 1);
2979 assert!(index
2980 .entries
2981 .iter()
2982 .any(|entry| entry.chunk.name == "new_symbol"));
2983 assert!(!index
2984 .entries
2985 .iter()
2986 .any(|entry| entry.chunk.name == "old_symbol"));
2987 }
2988
2989 #[test]
2990 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
2991 let temp = tempfile::tempdir().unwrap();
2992 let project_root = temp.path();
2993 let file = project_root.join("src/lib.rs");
2994 fs::create_dir_all(file.parent().unwrap()).unwrap();
2995 write_rust_file(&file, "clean_symbol");
2996
2997 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2998 let original_entries = index.entries.len();
2999 let mut embed_called = false;
3000 let mut embed = |texts: Vec<String>| {
3001 embed_called = true;
3002 test_vector_for_texts(texts)
3003 };
3004 let mut progress = |_done: usize, _total: usize| {};
3005 let summary = index
3006 .refresh_stale_files(
3007 project_root,
3008 std::slice::from_ref(&file),
3009 &mut embed,
3010 8,
3011 &mut progress,
3012 )
3013 .unwrap();
3014
3015 assert!(summary.is_noop());
3016 assert_eq!(summary.total_processed, 1);
3017 assert!(!embed_called);
3018 assert_eq!(index.entries.len(), original_entries);
3019 }
3020
3021 #[test]
3022 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3023 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3024
3025 assert!(is_onnx_runtime_unavailable(message));
3026 }
3027
3028 #[test]
3029 fn formats_missing_onnx_runtime_with_install_hint() {
3030 let message = format_embedding_init_error(
3031 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3032 );
3033
3034 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3035 assert!(message.contains("Original error:"));
3036 }
3037
3038 #[test]
3039 fn openai_compatible_backend_embeds_with_mock_server() {
3040 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3041 assert!(request_line.starts_with("POST "));
3042 assert_eq!(path, "/v1/embeddings");
3043 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3044 });
3045
3046 let config = SemanticBackendConfig {
3047 backend: SemanticBackend::OpenAiCompatible,
3048 model: "test-embedding".to_string(),
3049 base_url: Some(base_url),
3050 api_key_env: None,
3051 timeout_ms: 5_000,
3052 max_batch_size: 64,
3053 };
3054
3055 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3056 let vectors = model
3057 .embed(vec!["hello".to_string(), "world".to_string()])
3058 .unwrap();
3059
3060 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3061 handle.join().unwrap();
3062 }
3063
3064 #[test]
3074 fn openai_compatible_request_has_single_content_type_header() {
3075 use std::sync::{Arc, Mutex};
3076 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3077 let captured_for_thread = Arc::clone(&captured);
3078
3079 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3080 let addr = listener.local_addr().expect("local addr");
3081 let handle = thread::spawn(move || {
3082 let (mut stream, _) = listener.accept().expect("accept");
3083 let mut buf = Vec::new();
3084 let mut chunk = [0u8; 4096];
3085 let mut header_end = None;
3086 let mut content_length = 0usize;
3087 loop {
3088 let n = stream.read(&mut chunk).expect("read");
3089 if n == 0 {
3090 break;
3091 }
3092 buf.extend_from_slice(&chunk[..n]);
3093 if header_end.is_none() {
3094 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3095 header_end = Some(pos + 4);
3096 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3097 if let Some(value) = line.strip_prefix("Content-Length:") {
3098 content_length = value.trim().parse::<usize>().unwrap_or(0);
3099 }
3100 }
3101 }
3102 }
3103 if let Some(end) = header_end {
3104 if buf.len() >= end + content_length {
3105 break;
3106 }
3107 }
3108 }
3109 *captured_for_thread.lock().unwrap() = buf;
3110 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3111 let response = format!(
3112 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3113 body.len(),
3114 body
3115 );
3116 let _ = stream.write_all(response.as_bytes());
3117 });
3118
3119 let config = SemanticBackendConfig {
3120 backend: SemanticBackend::OpenAiCompatible,
3121 model: "text-embedding-3-small".to_string(),
3122 base_url: Some(format!("http://{}", addr)),
3123 api_key_env: None,
3124 timeout_ms: 5_000,
3125 max_batch_size: 64,
3126 };
3127 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3128 let _ = model.embed(vec!["probe".to_string()]).unwrap();
3129 handle.join().unwrap();
3130
3131 let bytes = captured.lock().unwrap().clone();
3132 let request = String::from_utf8_lossy(&bytes);
3133
3134 let content_type_lines = request
3137 .lines()
3138 .filter(|line| {
3139 let lower = line.to_ascii_lowercase();
3140 lower.starts_with("content-type:")
3141 })
3142 .count();
3143 assert_eq!(
3144 content_type_lines, 1,
3145 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3146 );
3147
3148 assert!(
3151 request.contains(r#""model":"text-embedding-3-small""#),
3152 "request body should contain model field; full request:\n{request}",
3153 );
3154 }
3155
3156 #[test]
3157 fn ollama_backend_embeds_with_mock_server() {
3158 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3159 assert!(request_line.starts_with("POST "));
3160 assert_eq!(path, "/api/embed");
3161 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3162 });
3163
3164 let config = SemanticBackendConfig {
3165 backend: SemanticBackend::Ollama,
3166 model: "embeddinggemma".to_string(),
3167 base_url: Some(base_url),
3168 api_key_env: None,
3169 timeout_ms: 5_000,
3170 max_batch_size: 64,
3171 };
3172
3173 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3174 let vectors = model
3175 .embed(vec!["hello".to_string(), "world".to_string()])
3176 .unwrap();
3177
3178 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3179 handle.join().unwrap();
3180 }
3181
3182 #[test]
3183 fn read_from_disk_rejects_fingerprint_mismatch() {
3184 let storage = tempfile::tempdir().unwrap();
3185 let project_key = "proj";
3186
3187 let project_root = test_project_root();
3188 let file = project_root.join("src/main.rs");
3189 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3190 index.entries.push(EmbeddingEntry {
3191 chunk: SemanticChunk {
3192 file: file.clone(),
3193 name: "handle_request".to_string(),
3194 kind: SymbolKind::Function,
3195 start_line: 10,
3196 end_line: 25,
3197 exported: true,
3198 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3199 snippet: "fn handle_request() {}".to_string(),
3200 },
3201 vector: vec![0.1, 0.2, 0.3],
3202 });
3203 index.dimension = 3;
3204 index
3205 .file_mtimes
3206 .insert(file.clone(), SystemTime::UNIX_EPOCH);
3207 index.file_sizes.insert(file, 0);
3208 index.set_fingerprint(SemanticIndexFingerprint {
3209 backend: "openai_compatible".to_string(),
3210 model: "test-embedding".to_string(),
3211 base_url: "http://127.0.0.1:1234/v1".to_string(),
3212 dimension: 3,
3213 chunking_version: default_chunking_version(),
3214 });
3215 index.write_to_disk(storage.path(), project_key);
3216
3217 let matching = index.fingerprint().unwrap().as_string();
3218 assert!(SemanticIndex::read_from_disk(
3219 storage.path(),
3220 project_key,
3221 &project_root,
3222 false,
3223 Some(&matching),
3224 )
3225 .is_some());
3226
3227 let mismatched = SemanticIndexFingerprint {
3228 backend: "ollama".to_string(),
3229 model: "embeddinggemma".to_string(),
3230 base_url: "http://127.0.0.1:11434".to_string(),
3231 dimension: 3,
3232 chunking_version: default_chunking_version(),
3233 }
3234 .as_string();
3235 assert!(SemanticIndex::read_from_disk(
3236 storage.path(),
3237 project_key,
3238 &project_root,
3239 false,
3240 Some(&mismatched),
3241 )
3242 .is_none());
3243 }
3244
3245 #[test]
3246 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3247 let storage = tempfile::tempdir().unwrap();
3248 let project_key = "proj-v3";
3249 let dir = storage.path().join("semantic").join(project_key);
3250 fs::create_dir_all(&dir).unwrap();
3251
3252 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3253 index.entries.push(EmbeddingEntry {
3254 chunk: SemanticChunk {
3255 file: PathBuf::from("/src/main.rs"),
3256 name: "handle_request".to_string(),
3257 kind: SymbolKind::Function,
3258 start_line: 0,
3259 end_line: 0,
3260 exported: true,
3261 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3262 snippet: "fn handle_request() {}".to_string(),
3263 },
3264 vector: vec![0.1, 0.2, 0.3],
3265 });
3266 index.dimension = 3;
3267 index
3268 .file_mtimes
3269 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3270 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3271 let fingerprint = SemanticIndexFingerprint {
3272 backend: "fastembed".to_string(),
3273 model: "test".to_string(),
3274 base_url: FALLBACK_BACKEND.to_string(),
3275 dimension: 3,
3276 chunking_version: default_chunking_version(),
3277 };
3278 index.set_fingerprint(fingerprint.clone());
3279
3280 let mut bytes = index.to_bytes();
3281 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3282 fs::write(dir.join("semantic.bin"), bytes).unwrap();
3283
3284 assert!(SemanticIndex::read_from_disk(
3285 storage.path(),
3286 project_key,
3287 &test_project_root(),
3288 false,
3289 Some(&fingerprint.as_string())
3290 )
3291 .is_none());
3292 assert!(!dir.join("semantic.bin").exists());
3293 }
3294
3295 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3296 crate::symbols::Symbol {
3297 name: name.to_string(),
3298 kind,
3299 range: crate::symbols::Range {
3300 start_line: start,
3301 start_col: 0,
3302 end_line: end,
3303 end_col: 0,
3304 },
3305 signature: None,
3306 scope_chain: Vec::new(),
3307 exported: false,
3308 parent: None,
3309 }
3310 }
3311
3312 #[test]
3317 fn symbols_to_chunks_skips_heading_symbols() {
3318 let project_root = PathBuf::from("/proj");
3319 let file = project_root.join("README.md");
3320 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3321
3322 let symbols = vec![
3323 make_symbol(SymbolKind::Heading, "Title", 0, 2),
3324 make_symbol(SymbolKind::Heading, "Section", 4, 6),
3325 ];
3326
3327 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3328 assert!(
3329 chunks.is_empty(),
3330 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3331 chunks.len()
3332 );
3333 }
3334
3335 #[test]
3339 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3340 let project_root = PathBuf::from("/proj");
3341 let file = project_root.join("src/lib.rs");
3342 let source = "pub fn handle_request() -> bool {\n true\n}\n";
3343
3344 let symbols = vec![
3345 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3347 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3348 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3349 ];
3350
3351 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3352 assert_eq!(
3353 chunks.len(),
3354 3,
3355 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3356 chunks.len()
3357 );
3358 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3359 assert!(chunks
3360 .iter()
3361 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3362 assert!(names.contains(&"handle_request"));
3363 assert!(names.contains(&"AuthService"));
3364 assert!(
3365 !names.contains(&"doc heading"),
3366 "Heading symbol leaked into chunks: {names:?}"
3367 );
3368 }
3369
3370 #[test]
3371 fn validate_ssrf_allows_loopback_hostnames() {
3372 for host in &[
3375 "http://localhost",
3376 "http://localhost:8080",
3377 "http://localhost:11434", "http://localhost.localdomain",
3379 "http://foo.localhost",
3380 ] {
3381 assert!(
3382 validate_base_url_no_ssrf(host).is_ok(),
3383 "Expected {host} to be allowed (loopback), got: {:?}",
3384 validate_base_url_no_ssrf(host)
3385 );
3386 }
3387 }
3388
3389 #[test]
3390 fn validate_ssrf_allows_loopback_ips() {
3391 for url in &[
3394 "http://127.0.0.1",
3395 "http://127.0.0.1:11434", "http://127.0.0.1:8080",
3397 "http://127.1.2.3",
3398 ] {
3399 let result = validate_base_url_no_ssrf(url);
3400 assert!(
3401 result.is_ok(),
3402 "Expected {url} to be allowed (loopback), got: {:?}",
3403 result
3404 );
3405 }
3406 }
3407
3408 #[test]
3409 fn validate_ssrf_rejects_private_non_loopback_ips() {
3410 for url in &[
3415 "http://192.168.1.1",
3416 "http://10.0.0.1",
3417 "http://172.16.0.1",
3418 "http://169.254.169.254",
3419 "http://100.64.0.1",
3420 ] {
3421 let result = validate_base_url_no_ssrf(url);
3422 assert!(
3423 result.is_err(),
3424 "Expected {url} to be rejected (non-loopback private), got: {:?}",
3425 result
3426 );
3427 }
3428 }
3429
3430 #[test]
3431 fn validate_ssrf_rejects_mdns_local_hostnames() {
3432 for host in &[
3435 "http://printer.local",
3436 "http://nas.local:8080",
3437 "http://homelab.local",
3438 ] {
3439 let result = validate_base_url_no_ssrf(host);
3440 assert!(
3441 result.is_err(),
3442 "Expected {host} to be rejected (mDNS), got: {:?}",
3443 result
3444 );
3445 }
3446 }
3447
3448 #[test]
3449 fn normalize_base_url_allows_localhost_for_tests() {
3450 assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3453 assert!(normalize_base_url("http://localhost:8080").is_ok());
3454 }
3455
3456 #[test]
3463 fn ort_mismatch_message_recommends_auto_fix_first() {
3464 let msg =
3465 format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3466
3467 assert!(
3469 msg.contains("v1.9.0"),
3470 "should report detected version: {msg}"
3471 );
3472 assert!(
3473 msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3474 "should report system path: {msg}"
3475 );
3476 assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3477
3478 let auto_fix_pos = msg
3480 .find("Auto-fix")
3481 .expect("Auto-fix solution missing — users won't discover --fix");
3482 let remove_pos = msg
3483 .find("Remove the old library")
3484 .expect("system-rm solution missing");
3485 assert!(
3486 auto_fix_pos < remove_pos,
3487 "Auto-fix must come before manual rm — see PR comment thread"
3488 );
3489
3490 assert!(
3492 msg.contains("npx @cortexkit/aft doctor --fix"),
3493 "auto-fix command must be present and copy-pasteable: {msg}"
3494 );
3495 }
3496
3497 #[test]
3501 fn ort_mismatch_message_handles_macos_dylib_path() {
3502 let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3503 assert!(msg.contains("v1.9.0"));
3504 assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3505 assert!(
3509 msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3510 "system path should be quoted in the auto-fix sentence: {msg}"
3511 );
3512 }
3513}