1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded on an index that holds no embedding entries.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): not referenced in the visible part of the file; presumably an
// upper bound on entries accepted when loading a persisted index — confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding vector length accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): header sizes of the persisted index formats, judging by the
// names; the readers/writers are outside this view — confirm.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Prefix used by `format_embedding_init_error` when the ONNX Runtime shared
/// library cannot be loaded.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// Version tags of the persisted semantic index format.
// NOTE(review): what changed between versions is not visible here — confirm
// against the serialization code.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of query embeddings kept before the oldest one is evicted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// (or no valid) base URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts made for one embedding HTTP request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before retry N (index = zero-based failed attempt).
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held while the on-disk cache lock is being acquired, so acquisitions made
/// through `SemanticIndexLock::acquire` in this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
60
61pub struct SemanticIndexLock {
62 _guard: fs_lock::LockGuard,
63}
64
65impl SemanticIndexLock {
66 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
67 let dir = storage_dir.join("semantic").join(project_key);
68 fs::create_dir_all(&dir)?;
69 let path = dir.join("cache.lock");
70 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
71 .lock()
72 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
73 fs_lock::try_acquire(&path, Duration::from_secs(2))
74 .map(|guard| Self { _guard: guard })
75 .map_err(|error| match error {
76 fs_lock::AcquireError::Timeout => {
77 std::io::Error::other("timed out acquiring semantic cache lock")
78 }
79 fs_lock::AcquireError::Io(error) => error,
80 })
81 }
82}
83
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct SemanticIndexFingerprint {
86 pub backend: String,
87 pub model: String,
88 #[serde(default)]
89 pub base_url: String,
90 pub dimension: usize,
91 #[serde(default = "default_chunking_version")]
92 pub chunking_version: u32,
93}
94
95fn default_chunking_version() -> u32 {
96 2
97}
98
99impl SemanticIndexFingerprint {
100 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
101 let base_url = config
104 .base_url
105 .as_ref()
106 .and_then(|u| normalize_base_url(u).ok())
107 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
108 Self {
109 backend: config.backend.as_str().to_string(),
110 model: config.model.clone(),
111 base_url,
112 dimension,
113 chunking_version: default_chunking_version(),
114 }
115 }
116
117 pub fn as_string(&self) -> String {
118 serde_json::to_string(self).unwrap_or_else(|_| String::new())
119 }
120
121 fn matches_expected(&self, expected: &str) -> bool {
122 let encoded = self.as_string();
123 !encoded.is_empty() && encoded == expected
124 }
125}
126
/// Concrete embedding backend held by a `SemanticEmbeddingModel`.
enum SemanticEmbeddingEngine {
    /// In-process model provided by `fastembed`.
    Fastembed(TextEmbedding),
    /// Remote endpoint reached via `build_openai_embeddings_endpoint`; the
    /// optional API key is sent as a bearer token.
    OpenAiCompatible {
        client: Client,
        model: String,
        base_url: String,
        api_key: Option<String>,
    },
    /// Remote endpoint reached via `build_ollama_embeddings_endpoint`.
    Ollama {
        client: Client,
        model: String,
        base_url: String,
    },
}
141
/// Embedding model wrapper: backend settings, the engine itself, the cached
/// vector dimension and a bounded cache of query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    // Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    timeout_ms: u64,
    max_batch_size: usize,
    // `None` until a dimension has been probed or observed in a response.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    // Query text -> embedding; insertion order is tracked in
    // `query_embedding_cache_order` so the oldest entry can be evicted.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
157
158fn validate_embedding_batch(
159 vectors: &[Vec<f32>],
160 expected_count: usize,
161 context: &str,
162) -> Result<(), String> {
163 if expected_count > 0 && vectors.is_empty() {
164 return Err(format!(
165 "{context} returned no vectors for {expected_count} inputs"
166 ));
167 }
168
169 if vectors.len() != expected_count {
170 return Err(format!(
171 "{context} returned {} vectors for {} inputs",
172 vectors.len(),
173 expected_count
174 ));
175 }
176
177 let Some(first_vector) = vectors.first() else {
178 return Ok(());
179 };
180 let expected_dimension = first_vector.len();
181 validate_embedding_dimension(expected_dimension)
182 .map_err(|error| format!("{context} returned {error}"))?;
183 for (index, vector) in vectors.iter().enumerate() {
184 if vector.len() != expected_dimension {
185 return Err(format!(
186 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
187 vector.len()
188 ));
189 }
190 }
191
192 Ok(())
193}
194
195fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
196 if dimension == 0 || dimension > MAX_DIMENSION {
197 return Err(format!(
198 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
199 ));
200 }
201
202 Ok(())
203}
204
205fn normalize_base_url(raw: &str) -> Result<String, String> {
209 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
210 let scheme = parsed.scheme();
211 if scheme != "http" && scheme != "https" {
212 return Err(format!(
213 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
214 scheme
215 ));
216 }
217 Ok(parsed.to_string().trim_end_matches('/').to_string())
218}
219
220pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
235 use std::net::{IpAddr, ToSocketAddrs};
236
237 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
238
239 let host = parsed.host_str().unwrap_or("");
240
241 let is_loopback_host =
246 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
247 if is_loopback_host {
248 return Ok(());
249 }
250
251 if host.ends_with(".local") {
254 return Err(format!(
255 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
256 ));
257 }
258
259 let port = parsed.port_or_known_default().unwrap_or(443);
262 let addr_str = format!("{host}:{port}");
263 let addrs: Vec<IpAddr> = addr_str
264 .to_socket_addrs()
265 .map(|iter| iter.map(|sa| sa.ip()).collect())
266 .unwrap_or_default();
267 for ip in &addrs {
268 if is_private_non_loopback_ip(ip) {
269 return Err(format!(
270 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
271 ));
272 }
273 }
274
275 Ok(())
276}
277
/// Reports whether `ip` lies in a private or reserved range other than loopback.
///
/// IPv4: 10/8, 172.16/12, 192.168/16, 169.254/16, 100.64/10 and 0/8.
/// IPv6: fe80::/10, fc00::/7, and IPv4-mapped addresses whose embedded IPv4
/// address matches one of the IPv4 ranges above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => matches!(
            v4.octets(),
            [10, ..]
                | [172, 16..=31, ..]
                | [192, 168, ..]
                | [169, 254, ..]
                | [100, 64..=127, ..]
                | [0, ..]
        ),
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            if (leading & 0xffc0) == 0xfe80 || (leading & 0xfe00) == 0xfc00 {
                return true;
            }
            // ::ffff:a.b.c.d — judge the embedded IPv4 address.
            match v6.to_ipv4_mapped() {
                Some(mapped) => is_private_non_loopback_ip(&IpAddr::V4(mapped)),
                None => false,
            }
        }
    }
}
319
320fn build_openai_embeddings_endpoint(base_url: &str) -> String {
321 if base_url.ends_with("/v1") {
322 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
323 } else {
324 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
325 }
326}
327
328fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
329 if base_url.ends_with("/api") {
330 format!("{base_url}/embed")
331 } else {
332 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
333 }
334}
335
/// Trims surrounding whitespace from an optional value and maps blank input
/// to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    let trimmed = raw.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
346
347fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
348 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
349}
350
/// A transport error is retried only when `reqwest` classifies it as a
/// connection error; every other kind of error is returned to the caller.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
354
355fn sleep_before_embedding_retry(attempt_index: usize) {
356 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
357 std::thread::sleep(Duration::from_millis(*delay_ms));
358 }
359}
360
/// Sends an embedding HTTP request, retrying transient failures, and returns
/// the response body of the first successful (2xx) attempt.
///
/// `make_request` is called once per attempt to build a fresh request;
/// `backend_label` is prefixed to error messages. Up to
/// `EMBEDDING_REQUEST_MAX_ATTEMPTS` attempts are made, sleeping via
/// `sleep_before_embedding_retry` between them.
///
/// # Errors
/// Returns a message when sending or reading fails with a non-retryable error,
/// when a non-retryable HTTP status is received, or when the final attempt
/// fails for any reason.
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                // Retry transport failures only while attempts remain.
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} request failed: {error}"));
            }
        };

        // Capture the status first: `text()` consumes the response.
        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} response read failed: {error}"));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        if !last_attempt && is_retryable_embedding_status(status) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        // Non-retryable status, or retries exhausted: surface status and body.
        return Err(format!(
            "{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    // Every path through the final iteration returns, so the loop cannot fall through.
    unreachable!("embedding request retries exhausted without returning")
}
408
409impl SemanticEmbeddingModel {
410 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
411 let timeout_ms = if config.timeout_ms == 0 {
412 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
413 } else {
414 config.timeout_ms
415 };
416
417 let max_batch_size = if config.max_batch_size == 0 {
418 DEFAULT_MAX_BATCH_SIZE
419 } else {
420 config.max_batch_size
421 };
422
423 let api_key_env = normalize_api_key(config.api_key_env.clone());
424 let model = config.model.clone();
425
426 let client = Client::builder()
427 .timeout(Duration::from_millis(timeout_ms))
428 .redirect(reqwest::redirect::Policy::none())
429 .build()
430 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
431
432 let engine = match config.backend {
433 SemanticBackend::Fastembed => {
434 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
435 }
436 SemanticBackend::OpenAiCompatible => {
437 let raw = config.base_url.as_ref().ok_or_else(|| {
438 "base_url is required for openai_compatible backend".to_string()
439 })?;
440 let base_url = normalize_base_url(raw)?;
441
442 let api_key = match api_key_env {
443 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
444 format!("missing api_key_env '{var_name}' for openai_compatible backend")
445 })?),
446 None => None,
447 };
448
449 SemanticEmbeddingEngine::OpenAiCompatible {
450 client,
451 model,
452 base_url,
453 api_key,
454 }
455 }
456 SemanticBackend::Ollama => {
457 let raw = config
458 .base_url
459 .as_ref()
460 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
461 let base_url = normalize_base_url(raw)?;
462
463 SemanticEmbeddingEngine::Ollama {
464 client,
465 model,
466 base_url,
467 }
468 }
469 };
470
471 Ok(Self {
472 backend: config.backend,
473 model: config.model.clone(),
474 base_url: config.base_url.clone(),
475 timeout_ms,
476 max_batch_size,
477 dimension: None,
478 engine,
479 query_embedding_cache: HashMap::new(),
480 query_embedding_cache_order: VecDeque::new(),
481 query_embedding_cache_hits: 0,
482 query_embedding_cache_misses: 0,
483 })
484 }
485
/// Backend kind this model was configured with.
pub fn backend(&self) -> SemanticBackend {
    self.backend
}

/// Model name taken from the configuration.
pub fn model(&self) -> &str {
    &self.model
}

/// Base URL as given in the configuration, if any.
pub fn base_url(&self) -> Option<&str> {
    self.base_url.as_deref()
}

/// Effective batch size (configured value, or the default when it was 0).
pub fn max_batch_size(&self) -> usize {
    self.max_batch_size
}

/// Effective HTTP timeout in milliseconds (configured value, or the default
/// when it was 0).
pub fn timeout_ms(&self) -> u64 {
    self.timeout_ms
}
505
506 pub fn fingerprint(
507 &mut self,
508 config: &SemanticBackendConfig,
509 ) -> Result<SemanticIndexFingerprint, String> {
510 let dimension = self.dimension()?;
511 Ok(SemanticIndexFingerprint::from_config(config, dimension))
512 }
513
514 pub fn dimension(&mut self) -> Result<usize, String> {
515 if let Some(dimension) = self.dimension {
516 return Ok(dimension);
517 }
518
519 let dimension = match &mut self.engine {
520 SemanticEmbeddingEngine::Fastembed(model) => {
521 let vectors = model
522 .embed(vec!["semantic index fingerprint probe".to_string()], None)
523 .map_err(|error| format_embedding_init_error(error.to_string()))?;
524 vectors
525 .first()
526 .map(|v| v.len())
527 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
528 }
529 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
530 let vectors =
531 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
532 vectors
533 .first()
534 .map(|v| v.len())
535 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
536 }
537 SemanticEmbeddingEngine::Ollama { .. } => {
538 let vectors =
539 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
540 vectors
541 .first()
542 .map(|v| v.len())
543 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
544 }
545 };
546
547 self.dimension = Some(dimension);
548 Ok(dimension)
549 }
550
/// Embeds `texts` with the configured backend, returning one vector per input.
///
/// Public entry point that forwards to `embed_texts`; errors are returned
/// unchanged.
pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    self.embed_texts(texts)
}
554
555 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
556 if let Some(vector) = self.query_embedding_cache.get(query) {
557 self.query_embedding_cache_hits += 1;
558 return Ok(vector.clone());
559 }
560
561 self.query_embedding_cache_misses += 1;
562 let embeddings = self.embed_texts(vec![query.to_string()])?;
563 let vector = embeddings
564 .first()
565 .cloned()
566 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
567
568 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
569 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
570 self.query_embedding_cache.remove(&oldest);
571 }
572 }
573 self.query_embedding_cache
574 .insert(query.to_string(), vector.clone());
575 self.query_embedding_cache_order
576 .push_back(query.to_string());
577
578 Ok(vector)
579 }
580
581 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
582 (
583 self.query_embedding_cache_hits,
584 self.query_embedding_cache_misses,
585 self.query_embedding_cache.len(),
586 )
587 }
588
/// Embeds `texts` with whichever engine this model holds.
///
/// * Fastembed: runs the local model.
/// * OpenAI-compatible: POSTs `{"input": texts, "model": model}` to the
///   endpoint from `build_openai_embeddings_endpoint`, with a bearer token
///   when an API key is set, and places each returned embedding by its `index`
///   field (falling back to response order).
/// * Ollama: POSTs `{model, input}` to the endpoint from
///   `build_ollama_embeddings_endpoint`.
///
/// For the two remote engines the cached dimension is overwritten with the
/// length of the first returned vector.
///
/// # Errors
/// Returns a message when the backend call fails, the response cannot be
/// parsed, the number of embeddings differs from the number of inputs, an
/// index is out of range, or any embedding is empty.
fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    match &mut self.engine {
        SemanticEmbeddingEngine::Fastembed(model) => model
            .embed(texts, None::<usize>)
            .map_err(|error| format_embedding_init_error(error.to_string()))
            .map_err(|error| format!("failed to embed batch: {error}")),
        SemanticEmbeddingEngine::OpenAiCompatible {
            client,
            model,
            base_url,
            api_key,
        } => {
            let expected_text_count = texts.len();
            let endpoint = build_openai_embeddings_endpoint(base_url);
            let body = serde_json::json!({
                "input": texts,
                "model": model,
            });

            // The closure is invoked once per attempt, so every retry gets a
            // freshly built request.
            let raw = send_embedding_request(
                || {
                    let mut request = client.post(&endpoint).json(&body);

                    if let Some(api_key) = api_key {
                        request = request.header("Authorization", format!("Bearer {api_key}"));
                    }

                    request
                },
                "openai compatible",
            )?;

            #[derive(Deserialize)]
            struct OpenAiResponse {
                data: Vec<OpenAiEmbeddingResult>,
            }

            #[derive(Deserialize)]
            struct OpenAiEmbeddingResult {
                embedding: Vec<f32>,
                index: Option<u32>,
            }

            let parsed: OpenAiResponse = serde_json::from_str(&raw)
                .map_err(|error| format!("invalid openai compatible response: {error}"))?;
            if parsed.data.len() != expected_text_count {
                return Err(format!(
                    "openai compatible response returned {} embeddings for {} inputs",
                    parsed.data.len(),
                    expected_text_count
                ));
            }

            // Slot each embedding by its reported index; a slot left empty
            // (e.g. because two items reported the same index) is caught by
            // the emptiness check below.
            let mut vectors = vec![Vec::new(); parsed.data.len()];
            for (i, item) in parsed.data.into_iter().enumerate() {
                let index = item.index.unwrap_or(i as u32) as usize;
                if index >= vectors.len() {
                    return Err(
                        "openai compatible response contains invalid vector index".to_string()
                    );
                }
                vectors[index] = item.embedding;
            }

            for vector in &vectors {
                if vector.is_empty() {
                    return Err(
                        "openai compatible response contained missing vectors".to_string()
                    );
                }
            }

            // NOTE(review): when `texts` is empty and the server returns an
            // empty `data` array, this resets a previously cached dimension
            // to `None` — confirm whether that is intended.
            self.dimension = vectors.first().map(Vec::len);
            Ok(vectors)
        }
        SemanticEmbeddingEngine::Ollama {
            client,
            model,
            base_url,
        } => {
            let expected_text_count = texts.len();
            let endpoint = build_ollama_embeddings_endpoint(base_url);

            #[derive(Serialize)]
            struct OllamaPayload<'a> {
                model: &'a str,
                input: Vec<String>,
            }

            let payload = OllamaPayload {
                model,
                input: texts,
            };

            let raw = send_embedding_request(
                || {
                    client.post(&endpoint).json(&payload)
                },
                "ollama",
            )?;

            #[derive(Deserialize)]
            struct OllamaResponse {
                embeddings: Vec<Vec<f32>>,
            }

            let parsed: OllamaResponse = serde_json::from_str(&raw)
                .map_err(|error| format!("invalid ollama response: {error}"))?;
            // An empty embedding list is always an error here, even when no
            // inputs were sent.
            if parsed.embeddings.is_empty() {
                return Err("ollama response returned no embeddings".to_string());
            }
            if parsed.embeddings.len() != expected_text_count {
                return Err(format!(
                    "ollama response returned {} embeddings for {} inputs",
                    parsed.embeddings.len(),
                    expected_text_count
                ));
            }

            let vectors = parsed.embeddings;
            for vector in &vectors {
                if vector.is_empty() {
                    return Err("ollama response contained empty embeddings".to_string());
                }
            }

            self.dimension = vectors.first().map(Vec::len);
            Ok(vectors)
        }
    }
}
733}
734
/// Checks that an ONNX Runtime shared library can be loaded and, when a
/// version can be inferred from its file name, that the version is 1.20 or
/// newer within major version 1.
///
/// The library path comes from `ORT_DYLIB_PATH`, falling back to the
/// platform's default library name. The check runs on Linux and macOS only; on
/// other targets the function returns `Ok(())` without probing.
///
/// # Errors
/// Returns a message when the path contains an interior NUL byte, `dlopen`
/// fails, or the detected version is outside the supported range.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a valid NUL-terminated string that outlives the
        // `dlopen` call; `dlerror`'s result is checked for NULL before it is
        // read; `dlclose` is called once, on the non-NULL handle from `dlopen`.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version is inferred from file names, not from the loaded
            // library itself.
            let detected_version = detect_ort_version_from_path(lib_name);

            // The handle was only needed to prove the library loads.
            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // A version whose major/minor parts do not parse is not rejected.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Nothing is probed on Windows; this only marks the variable as used.
        let _ = dylib_path;
    }

    Ok(())
}
797
798#[cfg(any(test, target_os = "linux", target_os = "macos"))]
801fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
802 let path = std::path::Path::new(lib_path);
803
804 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
806 .into_iter()
807 .flatten()
808 {
809 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
810 if let Some(version) = extract_version_from_filename(name) {
811 return Some(version);
812 }
813 }
814 }
815
816 if let Some(parent) = path.parent() {
818 if let Ok(entries) = std::fs::read_dir(parent) {
819 for entry in entries.flatten() {
820 if let Some(name) = entry.file_name().to_str() {
821 if name.starts_with("libonnxruntime") {
822 if let Some(version) = extract_version_from_filename(name) {
823 return Some(version);
824 }
825 }
826 }
827 }
828 }
829 }
830
831 None
832}
833
834#[cfg(any(test, target_os = "linux", target_os = "macos"))]
836fn extract_version_from_filename(name: &str) -> Option<String> {
837 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
839 re.find(name).map(|m| m.as_str().to_string())
840}
841
/// Suggests a shell command for removing an outdated ONNX Runtime library.
///
/// For libraries under `/usr/local/lib`, or referenced only by their default
/// bare name, a platform-specific system-wide removal command is returned on
/// Linux, macOS and Windows. Otherwise a plain `rm` of `lib_path` is suggested.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_copy = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_copy {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return " Delete the ONNX Runtime DLL from your PATH".to_string();
    }

    format!(" rm '{}'", lib_path)
}
857
/// Builds the user-facing message shown when the detected ONNX Runtime
/// version is not supported.
///
/// `version` is the detected version string and `lib_name` the library path or
/// name that was loaded; the message lists remediation options and embeds the
/// command from `suggest_removal_command(lib_name)`.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
    // A trailing `\` inside the literal joins lines and skips the next line's
    // leading whitespace, so source indentation does not reach the output.
    format!(
        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
         Solutions:\n\
         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
         configures the bridge to load it instead of the system library — no \
         changes to '{}'.\n\
         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
         {}\n\
         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
        version,
        lib_name,
        lib_name,
        suggest_removal_command(lib_name),
    )
}
882
883pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
884 pre_validate_onnx_runtime()?;
886
887 let selected_model = match model {
888 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
889 _ => {
890 return Err(format!(
891 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
892 model
893 ))
894 }
895 };
896
897 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
898}
899
/// Heuristically decides whether an error message means the ONNX Runtime
/// shared library could not be loaded.
///
/// True when the message starts (after leading whitespace) with
/// `ONNX Runtime not found.`, or when, case-insensitively, it mentions the
/// runtime and also contains a phrase typical of a dynamic-load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let contains_any =
        |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    contains_any(&RUNTIME_MARKERS) && contains_any(&LOAD_FAILURE_MARKERS)
}
925
926fn format_embedding_init_error(error: impl Display) -> String {
927 let message = error.to_string();
928
929 if is_onnx_runtime_unavailable(&message) {
930 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
931 }
932
933 format!("failed to initialize semantic embedding model: {message}")
934}
935
/// One unit of source code prepared for embedding.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk was taken from.
    pub file: PathBuf,
    /// Name associated with the chunk (presumably the symbol name — confirm
    /// against the chunk collector).
    pub name: String,
    /// Kind of symbol the chunk represents.
    pub kind: SymbolKind,
    /// First line of the chunk (base of the numbering not visible here — confirm).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Presumably whether the symbol is exported/public — confirm.
    pub exported: bool,
    /// Text handed to the embedding function when the index is built.
    pub embed_text: String,
    /// Source excerpt kept alongside the embedding.
    pub snippet: String,
}
955
/// A chunk paired with the embedding vector produced for its `embed_text`.
#[derive(Debug)]
struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
962
/// In-memory semantic index: embedded chunks plus the per-file metadata used
/// to detect stale files.
#[derive(Debug)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    // Per-file modification time, size and content hash captured at indexing
    // time; the three maps are filled from the same file metadata.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    file_sizes: HashMap<PathBuf, u64>,
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    // Length of the stored vectors (`DEFAULT_DIMENSION` when built with no entries).
    dimension: usize,
    // `None` on a freshly built index.
    fingerprint: Option<SemanticIndexFingerprint>,
    // Expected to be absolute (checked with `debug_assert!` on construction).
    project_root: PathBuf,
}
977
/// Modification time, size and content hash recorded for a file when it is indexed.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}
984
/// Counts describing the outcome of an index refresh.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Files reported as changed.
    pub changed: usize,
    /// Files reported as added.
    pub added: usize,
    /// Files reported as deleted.
    pub deleted: usize,
    /// Total number of files considered; does not affect [`Self::is_noop`].
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when nothing was changed, added or deleted.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1001
/// A single semantic search hit.
///
/// The location and symbol fields have the same names and types as those of
/// `SemanticChunk`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    /// Relevance score (presumably higher is better — scoring code is outside
    /// this view, confirm).
    pub score: f32,
    /// Static label naming where the result came from.
    pub source: &'static str,
}
1015
1016impl SemanticIndex {
/// Creates an empty index for `project_root` with the given vector dimension.
///
/// `project_root` is expected to be absolute; this is only checked in debug
/// builds. The fingerprint starts out unset.
pub fn new(project_root: PathBuf, dimension: usize) -> Self {
    debug_assert!(project_root.is_absolute());
    Self {
        entries: Vec::new(),
        file_mtimes: HashMap::new(),
        file_sizes: HashMap::new(),
        file_hashes: HashMap::new(),
        dimension,
        fingerprint: None,
        project_root,
    }
}
1029
/// Number of embedded chunks currently held by the index.
pub fn entry_count(&self) -> usize {
    self.entries.len()
}
1034
1035 pub fn status_label(&self) -> &'static str {
1037 if self.entries.is_empty() {
1038 "empty"
1039 } else {
1040 "ready"
1041 }
1042 }
1043
1044 fn collect_chunks(
1045 project_root: &Path,
1046 files: &[PathBuf],
1047 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1048 let per_file: Vec<(
1049 PathBuf,
1050 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1051 )> = files
1052 .par_iter()
1053 .map_init(HashMap::new, |parsers, file| {
1054 let result = collect_file_metadata(file).and_then(|metadata| {
1055 collect_file_chunks(project_root, file, parsers)
1056 .map(|chunks| (metadata, chunks))
1057 });
1058 (file.clone(), result)
1059 })
1060 .collect();
1061
1062 let mut chunks: Vec<SemanticChunk> = Vec::new();
1063 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1064
1065 for (file, result) in per_file {
1066 match result {
1067 Ok((metadata, file_chunks)) => {
1068 file_metadata.insert(file, metadata);
1069 chunks.extend(file_chunks);
1070 }
1071 Err(error) => {
1072 if error == "unsupported file extension" {
1078 continue;
1079 }
1080 slog_warn!(
1081 "failed to collect semantic chunks for {}: {}",
1082 file.display(),
1083 error
1084 );
1085 }
1086 }
1087 }
1088
1089 (chunks, file_metadata)
1090 }
1091
1092 fn build_from_chunks<F, P>(
1093 project_root: &Path,
1094 chunks: Vec<SemanticChunk>,
1095 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1096 embed_fn: &mut F,
1097 max_batch_size: usize,
1098 mut progress: Option<&mut P>,
1099 ) -> Result<Self, String>
1100 where
1101 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1102 P: FnMut(usize, usize),
1103 {
1104 debug_assert!(project_root.is_absolute());
1105 let total_chunks = chunks.len();
1106
1107 if chunks.is_empty() {
1108 return Ok(Self {
1109 entries: Vec::new(),
1110 file_mtimes: file_metadata
1111 .iter()
1112 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1113 .collect(),
1114 file_sizes: file_metadata
1115 .iter()
1116 .map(|(path, metadata)| (path.clone(), metadata.size))
1117 .collect(),
1118 file_hashes: file_metadata
1119 .into_iter()
1120 .map(|(path, metadata)| (path, metadata.content_hash))
1121 .collect(),
1122 dimension: DEFAULT_DIMENSION,
1123 fingerprint: None,
1124 project_root: project_root.to_path_buf(),
1125 });
1126 }
1127
1128 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1130 let mut expected_dimension: Option<usize> = None;
1131 let batch_size = max_batch_size.max(1);
1132 for batch_start in (0..chunks.len()).step_by(batch_size) {
1133 let batch_end = (batch_start + batch_size).min(chunks.len());
1134 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1135 .iter()
1136 .map(|c| c.embed_text.clone())
1137 .collect();
1138
1139 let vectors = embed_fn(batch_texts)?;
1140 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1141
1142 if let Some(dim) = vectors.first().map(|v| v.len()) {
1144 match expected_dimension {
1145 None => expected_dimension = Some(dim),
1146 Some(expected) if dim != expected => {
1147 return Err(format!(
1148 "embedding dimension changed across batches: expected {expected}, got {dim}"
1149 ));
1150 }
1151 _ => {}
1152 }
1153 }
1154
1155 for (i, vector) in vectors.into_iter().enumerate() {
1156 let chunk_idx = batch_start + i;
1157 entries.push(EmbeddingEntry {
1158 chunk: chunks[chunk_idx].clone(),
1159 vector,
1160 });
1161 }
1162
1163 if let Some(callback) = progress.as_mut() {
1164 callback(entries.len(), total_chunks);
1165 }
1166 }
1167
1168 let dimension = entries
1169 .first()
1170 .map(|e| e.vector.len())
1171 .unwrap_or(DEFAULT_DIMENSION);
1172
1173 Ok(Self {
1174 entries,
1175 file_mtimes: file_metadata
1176 .iter()
1177 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1178 .collect(),
1179 file_sizes: file_metadata
1180 .iter()
1181 .map(|(path, metadata)| (path.clone(), metadata.size))
1182 .collect(),
1183 file_hashes: file_metadata
1184 .into_iter()
1185 .map(|(path, metadata)| (path, metadata.content_hash))
1186 .collect(),
1187 dimension,
1188 fingerprint: None,
1189 project_root: project_root.to_path_buf(),
1190 })
1191 }
1192
1193 pub fn build<F>(
1196 project_root: &Path,
1197 files: &[PathBuf],
1198 embed_fn: &mut F,
1199 max_batch_size: usize,
1200 ) -> Result<Self, String>
1201 where
1202 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1203 {
1204 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1205 Self::build_from_chunks(
1206 project_root,
1207 chunks,
1208 file_mtimes,
1209 embed_fn,
1210 max_batch_size,
1211 Option::<&mut fn(usize, usize)>::None,
1212 )
1213 }
1214
1215 pub fn build_with_progress<F, P>(
1217 project_root: &Path,
1218 files: &[PathBuf],
1219 embed_fn: &mut F,
1220 max_batch_size: usize,
1221 progress: &mut P,
1222 ) -> Result<Self, String>
1223 where
1224 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1225 P: FnMut(usize, usize),
1226 {
1227 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1228 let total_chunks = chunks.len();
1229 progress(0, total_chunks);
1230 Self::build_from_chunks(
1231 project_root,
1232 chunks,
1233 file_mtimes,
1234 embed_fn,
1235 max_batch_size,
1236 Some(progress),
1237 )
1238 }
1239
    /// Incrementally reconciles the index with `current_files`.
    ///
    /// Indexed files that are no longer in `current_files` are dropped, indexed
    /// files whose freshness check fails are re-embedded, and files that are in
    /// `current_files` but not yet indexed are embedded for the first time.
    ///
    /// `progress` is called with `(0, 0)` when there is nothing to embed,
    /// otherwise with `(0, total_chunks)` up front and `(done, total_chunks)`
    /// after every batch.
    ///
    /// # Errors
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when the new vectors' dimension differs from the dimension already
    /// stored in the index. Deletions and metadata touch-ups performed before
    /// the failing batch are NOT rolled back.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Size of the union of "currently on disk" and "already indexed":
        // |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Pass 1: classify every indexed file as deleted, changed, or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Snapshot the keys so the maps can be mutated inside the loop.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A file missing any of the three cached attributes cannot be
            // verified and is treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
                Some(FreshnessVerdict::HotFresh) => {}
                // Content is unchanged: only refresh the cached mtime/size.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                // NOTE(review): a `Deleted` verdict for a path that is still listed in
                // `current_files` is queued for re-embedding rather than removal — confirm
                // that this is intended.
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Pass 2: files present now that the index has never seen.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Nothing to do: report the scan size only.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // Drop entries and metadata of files that disappeared.
        if !deleted.is_empty() {
            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
            self.entries
                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
            for path in &deleted {
                self.file_mtimes.remove(path);
                self.file_sizes.remove(path);
                self.file_hashes.remove(path);
            }
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        // Only deletions happened.
        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // `fresh_metadata` is keyed by the files `collect_chunks` returned metadata for;
        // presumably those are the files it could process — verify against `collect_chunks`.
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

        // The files yielded no chunks at all: still purge their old entries and
        // record their new metadata, but skip the embedding backend entirely.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        // Guard against a zero batch size, which would make `step_by` panic.
        let batch_size = max_batch_size.max(1);
        // Only constrain the dimension when surviving entries exist to compare against.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // The first vector of each batch is compared with the dimension seen so far.
            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Old entries are replaced only after every batch succeeded, so an
        // embedding failure above leaves the previous vectors in place.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        // Counts only include files that appear in `fresh_metadata`.
        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1464
1465 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1467 if self.entries.is_empty() || query_vector.len() != self.dimension {
1468 return Vec::new();
1469 }
1470
1471 let mut scored: Vec<(f32, usize)> = self
1472 .entries
1473 .iter()
1474 .enumerate()
1475 .map(|(i, entry)| {
1476 let mut score = cosine_similarity(query_vector, &entry.vector);
1477 if entry.chunk.exported {
1478 score *= 1.1;
1479 }
1480 (score, i)
1481 })
1482 .collect();
1483
1484 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1486
1487 scored
1488 .into_iter()
1489 .take(top_k)
1490 .map(|(score, idx)| {
1494 let entry = &self.entries[idx];
1495 SemanticResult {
1496 file: entry.chunk.file.clone(),
1497 name: entry.chunk.name.clone(),
1498 kind: entry.chunk.kind.clone(),
1499 start_line: entry.chunk.start_line,
1500 end_line: entry.chunk.end_line,
1501 exported: entry.chunk.exported,
1502 snippet: entry.chunk.snippet.clone(),
1503 score,
1504 source: "semantic",
1505 }
1506 })
1507 .collect()
1508 }
1509
    /// Number of embedding entries currently held by the index.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
1514
1515 pub fn is_file_stale(&self, file: &Path) -> bool {
1517 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1518 return true;
1519 };
1520 let Some(stored_size) = self.file_sizes.get(file) else {
1521 return true;
1522 };
1523 let Some(stored_hash) = self.file_hashes.get(file) else {
1524 return true;
1525 };
1526 let cached = FileFreshness {
1527 mtime: *stored_mtime,
1528 size: *stored_size,
1529 content_hash: *stored_hash,
1530 };
1531 match cache_freshness::verify_file(file, &cached) {
1532 FreshnessVerdict::HotFresh => false,
1533 FreshnessVerdict::ContentFresh { .. } => false,
1534 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1535 }
1536 }
1537
1538 fn backfill_missing_file_sizes(&mut self) {
1539 for path in self.file_mtimes.keys() {
1540 if self.file_sizes.contains_key(path) {
1541 continue;
1542 }
1543 if let Ok(metadata) = fs::metadata(path) {
1544 self.file_sizes.insert(path.clone(), metadata.len());
1545 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1546 self.file_hashes.insert(path.clone(), hash);
1547 }
1548 }
1549 }
1550 }
1551
    /// Removes `file` from the index; simply delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
1556
1557 pub fn invalidate_file(&mut self, file: &Path) {
1558 self.entries.retain(|e| e.chunk.file != file);
1559 self.file_mtimes.remove(file);
1560 self.file_sizes.remove(file);
1561 self.file_hashes.remove(file);
1562 }
1563
    /// Length of the embedding vectors this index was built with.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
1568
    /// Fingerprint attached to this index, or `None` when none has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
1572
1573 pub fn backend_label(&self) -> Option<&str> {
1574 self.fingerprint.as_ref().map(|f| f.backend.as_str())
1575 }
1576
1577 pub fn model_label(&self) -> Option<&str> {
1578 self.fingerprint.as_ref().map(|f| f.model.as_str())
1579 }
1580
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
1584
1585 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1587 if self.entries.is_empty() {
1590 slog_info!("skipping semantic index persistence (0 entries)");
1591 return;
1592 }
1593 let dir = storage_dir.join("semantic").join(project_key);
1594 if let Err(e) = fs::create_dir_all(&dir) {
1595 slog_warn!("failed to create semantic cache dir: {}", e);
1596 return;
1597 }
1598 let data_path = dir.join("semantic.bin");
1599 let tmp_path = dir.join(format!(
1600 "semantic.bin.tmp.{}.{}",
1601 std::process::id(),
1602 SystemTime::now()
1603 .duration_since(SystemTime::UNIX_EPOCH)
1604 .unwrap_or(Duration::ZERO)
1605 .as_nanos()
1606 ));
1607 let bytes = self.to_bytes();
1608 let write_result = (|| -> std::io::Result<()> {
1609 use std::io::Write;
1610 let mut file = fs::File::create(&tmp_path)?;
1611 file.write_all(&bytes)?;
1612 file.sync_all()?;
1613 Ok(())
1614 })();
1615 if let Err(e) = write_result {
1616 slog_warn!("failed to write semantic index: {}", e);
1617 let _ = fs::remove_file(&tmp_path);
1618 return;
1619 }
1620 if let Err(e) = fs::rename(&tmp_path, &data_path) {
1621 slog_warn!("failed to rename semantic index: {}", e);
1622 let _ = fs::remove_file(&tmp_path);
1623 return;
1624 }
1625 slog_info!(
1626 "semantic index persisted: {} entries, {:.1} KB",
1627 self.entries.len(),
1628 bytes.len() as f64 / 1024.0
1629 );
1630 }
1631
1632 pub fn read_from_disk(
1634 storage_dir: &Path,
1635 project_key: &str,
1636 current_canonical_root: &Path,
1637 is_worktree_bridge: bool,
1638 expected_fingerprint: Option<&str>,
1639 ) -> Option<Self> {
1640 debug_assert!(current_canonical_root.is_absolute());
1641 let data_path = storage_dir
1642 .join("semantic")
1643 .join(project_key)
1644 .join("semantic.bin");
1645 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1646 if file_len < HEADER_BYTES_V1 {
1647 slog_warn!(
1648 "corrupt semantic index (too small: {} bytes), removing",
1649 file_len
1650 );
1651 if !is_worktree_bridge {
1652 let _ = fs::remove_file(&data_path);
1653 }
1654 return None;
1655 }
1656
1657 let bytes = fs::read(&data_path).ok()?;
1658 let version = bytes[0];
1659 if version != SEMANTIC_INDEX_VERSION_V6 {
1660 slog_info!(
1661 "cached semantic index version {} is older than {}, rebuilding",
1662 version,
1663 SEMANTIC_INDEX_VERSION_V6
1664 );
1665 if !is_worktree_bridge {
1666 let _ = fs::remove_file(&data_path);
1667 }
1668 return None;
1669 }
1670 match Self::from_bytes(&bytes, current_canonical_root) {
1671 Ok(index) => {
1672 if index.entries.is_empty() {
1673 slog_info!("cached semantic index is empty, will rebuild");
1674 if !is_worktree_bridge {
1675 let _ = fs::remove_file(&data_path);
1676 }
1677 return None;
1678 }
1679 if let Some(expected) = expected_fingerprint {
1680 let matches = index
1681 .fingerprint()
1682 .map(|fingerprint| fingerprint.matches_expected(expected))
1683 .unwrap_or(false);
1684 if !matches {
1685 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1686 if !is_worktree_bridge {
1687 let _ = fs::remove_file(&data_path);
1688 }
1689 return None;
1690 }
1691 }
1692 slog_info!(
1693 "loaded semantic index from disk: {} entries",
1694 index.entries.len()
1695 );
1696 Some(index)
1697 }
1698 Err(e) => {
1699 slog_warn!("corrupt semantic index, rebuilding: {}", e);
1700 if !is_worktree_bridge {
1701 let _ = fs::remove_file(&data_path);
1702 }
1703 None
1704 }
1705 }
1706 }
1707
1708 pub fn to_bytes(&self) -> Vec<u8> {
1710 let mut buf = Vec::new();
1711 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1712 let encoded = fingerprint.as_string();
1713 if encoded.is_empty() {
1714 None
1715 } else {
1716 Some(encoded.into_bytes())
1717 }
1718 });
1719 let file_mtimes: Vec<_> = self
1720 .file_mtimes
1721 .iter()
1722 .filter_map(|(path, mtime)| {
1723 cache_relative_path(&self.project_root, path)
1724 .map(|relative| (relative, path, mtime))
1725 })
1726 .collect();
1727 let entries: Vec<_> = self
1728 .entries
1729 .iter()
1730 .filter_map(|entry| {
1731 cache_relative_path(&self.project_root, &entry.chunk.file)
1732 .map(|relative| (relative, entry))
1733 })
1734 .collect();
1735
1736 let version = SEMANTIC_INDEX_VERSION_V6;
1749 buf.push(version);
1750 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1751 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1752 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1753 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1754 buf.extend_from_slice(fp_bytes_ref);
1755
1756 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1759 for (relative, path, mtime) in &file_mtimes {
1760 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1761 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1762 buf.extend_from_slice(&path_bytes);
1763 let duration = mtime
1764 .duration_since(SystemTime::UNIX_EPOCH)
1765 .unwrap_or_default();
1766 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1767 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1768 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1769 buf.extend_from_slice(&size.to_le_bytes());
1770 let hash = self
1771 .file_hashes
1772 .get(*path)
1773 .copied()
1774 .unwrap_or_else(cache_freshness::zero_hash);
1775 buf.extend_from_slice(hash.as_bytes());
1776 }
1777
1778 for (relative, entry) in &entries {
1780 let c = &entry.chunk;
1781
1782 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1784 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1785 buf.extend_from_slice(&file_bytes);
1786
1787 let name_bytes = c.name.as_bytes();
1789 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1790 buf.extend_from_slice(name_bytes);
1791
1792 buf.push(symbol_kind_to_u8(&c.kind));
1794
1795 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1797 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1798 buf.push(c.exported as u8);
1799
1800 let snippet_bytes = c.snippet.as_bytes();
1802 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1803 buf.extend_from_slice(snippet_bytes);
1804
1805 let embed_bytes = c.embed_text.as_bytes();
1807 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1808 buf.extend_from_slice(embed_bytes);
1809
1810 for &val in &entry.vector {
1812 buf.extend_from_slice(&val.to_le_bytes());
1813 }
1814 }
1815
1816 buf
1817 }
1818
    /// Decodes an index from its serialized form.
    ///
    /// Format versions 1 through 6 are accepted; which fields are present
    /// depends on the version:
    /// - v2 and later carry a length-prefixed fingerprint after the header,
    /// - v3 and later store mtime nanoseconds,
    /// - v5 and later store the file size,
    /// - v6 stores a 32-byte content hash and paths that are resolved against
    ///   `current_canonical_root` through `cached_path_under_root`.
    ///
    /// Fields that an older version lacks are filled with `0` / the zero hash.
    ///
    /// # Errors
    /// Returns a description of the problem when the data is truncated, the
    /// version is unknown, a count or dimension is out of bounds, the
    /// fingerprint is not valid JSON, an mtime is not representable, a v6 path
    /// escapes the project root, or an entry references a file without a
    /// metadata record.
    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        // v2+ headers are four bytes longer because of the fingerprint length.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        validate_embedding_dimension(dimension)?;
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            // A zero-length fingerprint means "no fingerprint recorded".
            if fingerprint_len == 0 {
                None
            } else {
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // Reject headers that promise more vector data than the buffer can hold
        // before any count-sized allocation below takes place.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        // Per-file freshness records.
        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64(data, &mut pos)?
                } else {
                    0
                };
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                if pos + 32 > data.len() {
                    return Err("unexpected end of data reading content hash".to_string());
                }
                let mut hash_bytes = [0u8; 32];
                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
                pos += 32;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            // Rejected up front with a descriptive error rather than passed to
            // `Duration::new` with an out-of-range nanosecond value.
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // Only v6 paths go through `cached_path_under_root`; older versions
            // use the stored path as-is.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        // Embedding entries.
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &raw_file)
                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
            } else {
                raw_file
            };
            let name = read_string(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            // Any non-zero byte counts as "exported".
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        // Consistency checks on the decoded data.
        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must belong to a file that has a freshness record.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            // Decoded paths are interpreted relative to the caller-supplied root.
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
        })
    }
2056}
2057
2058fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2060 let relative = file
2061 .strip_prefix(project_root)
2062 .unwrap_or(file)
2063 .to_string_lossy();
2064
2065 let kind_label = match &symbol.kind {
2066 SymbolKind::Function => "function",
2067 SymbolKind::Class => "class",
2068 SymbolKind::Method => "method",
2069 SymbolKind::Struct => "struct",
2070 SymbolKind::Interface => "interface",
2071 SymbolKind::Enum => "enum",
2072 SymbolKind::TypeAlias => "type",
2073 SymbolKind::Variable => "variable",
2074 SymbolKind::Heading => "heading",
2075 SymbolKind::FileSummary => "file-summary",
2076 };
2077
2078 let name = &symbol.name;
2080 let mut text = format!(
2081 "name:{name} file:{} kind:{} name:{name}",
2082 relative, kind_label
2083 );
2084
2085 if let Some(sig) = &symbol.signature {
2086 text.push_str(&format!(" signature:{}", sig));
2087 }
2088
2089 let lines: Vec<&str> = source.lines().collect();
2091 let start = (symbol.range.start_line as usize).min(lines.len());
2092 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2094 if start < end {
2095 let body: String = lines[start..end]
2096 .iter()
2097 .take(15) .copied()
2099 .collect::<Vec<&str>>()
2100 .join("\n");
2101 let snippet = if body.len() > 300 {
2102 format!("{}...", &body[..body.floor_char_boundary(300)])
2103 } else {
2104 body
2105 };
2106 text.push_str(&format!(" body:{}", snippet));
2107 }
2108
2109 text
2110}
2111
/// Returns the first `max_chars` characters of `value` (the whole string when
/// it is shorter). Counting is per `char`, so multi-byte characters are never
/// split.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the first character that must be dropped, if any.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2115
2116fn first_leading_doc_comment(source: &str) -> String {
2117 let lines: Vec<&str> = source.lines().collect();
2118 let Some((start, first)) = lines
2119 .iter()
2120 .enumerate()
2121 .find(|(_, line)| !line.trim().is_empty())
2122 else {
2123 return String::new();
2124 };
2125
2126 let trimmed = first.trim_start();
2127 if trimmed.starts_with("/**") {
2128 let mut comment = Vec::new();
2129 for line in lines.iter().skip(start) {
2130 comment.push(*line);
2131 if line.contains("*/") {
2132 break;
2133 }
2134 }
2135 return truncate_chars(&comment.join("\n"), 200);
2136 }
2137
2138 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2139 let comment = lines
2140 .iter()
2141 .skip(start)
2142 .take_while(|line| {
2143 let trimmed = line.trim_start();
2144 trimmed.starts_with("///") || trimmed.starts_with("//!")
2145 })
2146 .copied()
2147 .collect::<Vec<_>>()
2148 .join("\n");
2149 return truncate_chars(&comment, 200);
2150 }
2151
2152 String::new()
2153}
2154
2155pub fn build_file_summary_chunk(
2156 file: &Path,
2157 project_root: &Path,
2158 source: &str,
2159 top_exports: &[&str],
2160 top_export_signatures: &[Option<&str>],
2161) -> SemanticChunk {
2162 let relative = file.strip_prefix(project_root).unwrap_or(file);
2163 let rel_path = relative.to_string_lossy();
2164 let parent_dir = relative
2165 .parent()
2166 .map(|parent| parent.to_string_lossy().to_string())
2167 .unwrap_or_default();
2168 let name = file
2169 .file_stem()
2170 .map(|stem| stem.to_string_lossy().to_string())
2171 .unwrap_or_default();
2172 let doc = first_leading_doc_comment(source);
2173 let exports = top_exports
2174 .iter()
2175 .take(5)
2176 .copied()
2177 .collect::<Vec<_>>()
2178 .join(",");
2179 let snippet = if doc.is_empty() {
2180 top_export_signatures
2181 .first()
2182 .and_then(|signature| signature.as_deref())
2183 .map(|signature| truncate_chars(signature, 200))
2184 .unwrap_or_default()
2185 } else {
2186 doc.clone()
2187 };
2188
2189 SemanticChunk {
2190 file: file.to_path_buf(),
2191 name,
2192 kind: SymbolKind::FileSummary,
2193 start_line: 0,
2194 end_line: 0,
2195 exported: false,
2196 embed_text: format!(
2197 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2198 file.file_stem()
2199 .map(|stem| stem.to_string_lossy().to_string())
2200 .unwrap_or_default()
2201 ),
2202 snippet,
2203 }
2204}
2205
2206fn parser_for(
2207 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2208 lang: crate::parser::LangId,
2209) -> Result<&mut Parser, String> {
2210 use std::collections::hash_map::Entry;
2211
2212 match parsers.entry(lang) {
2213 Entry::Occupied(entry) => Ok(entry.into_mut()),
2214 Entry::Vacant(entry) => {
2215 let grammar = grammar_for(lang);
2216 let mut parser = Parser::new();
2217 parser
2218 .set_language(&grammar)
2219 .map_err(|error| error.to_string())?;
2220 Ok(entry.insert(parser))
2221 }
2222 }
2223}
2224
/// Reports whether `path` has one of the file extensions that are semantically
/// indexed.
///
/// The comparison is case-sensitive and a path without a (UTF-8) extension is
/// never indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2252
2253fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2254 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2255 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2256 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2257 .map_err(|error| error.to_string())?
2258 .unwrap_or_else(cache_freshness::zero_hash);
2259 Ok(IndexedFileMetadata {
2260 mtime,
2261 size: metadata.len(),
2262 content_hash,
2263 })
2264}
2265
2266fn collect_file_chunks(
2267 project_root: &Path,
2268 file: &Path,
2269 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2270) -> Result<Vec<SemanticChunk>, String> {
2271 if !is_semantic_indexed_extension(file) {
2272 return Err("unsupported file extension".to_string());
2273 }
2274 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2275 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2276 let tree = parser_for(parsers, lang)?
2277 .parse(&source, None)
2278 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2279 let symbols =
2280 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2281
2282 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2283}
2284
2285fn build_snippet(symbol: &Symbol, source: &str) -> String {
2287 let lines: Vec<&str> = source.lines().collect();
2288 let start = (symbol.range.start_line as usize).min(lines.len());
2289 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2291 if start < end {
2292 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2293 let mut snippet = snippet_lines.join("\n");
2294 if end - start > 5 {
2295 snippet.push_str("\n ...");
2296 }
2297 if snippet.len() > 300 {
2298 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2299 }
2300 snippet
2301 } else {
2302 String::new()
2303 }
2304}
2305
2306fn symbols_to_chunks(
2308 file: &Path,
2309 symbols: &[Symbol],
2310 source: &str,
2311 project_root: &Path,
2312) -> Vec<SemanticChunk> {
2313 let mut chunks = Vec::new();
2314 let top_exports_with_signatures = symbols
2315 .iter()
2316 .filter(|symbol| {
2317 symbol.exported
2318 && symbol.parent.is_none()
2319 && !matches!(symbol.kind, SymbolKind::Heading)
2320 })
2321 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2322 .collect::<Vec<_>>();
2323
2324 let has_only_headings = !symbols.is_empty()
2325 && symbols
2326 .iter()
2327 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2328 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2329 let top_exports = top_exports_with_signatures
2330 .iter()
2331 .map(|(name, _)| *name)
2332 .collect::<Vec<_>>();
2333 let top_export_signatures = top_exports_with_signatures
2334 .iter()
2335 .map(|(_, signature)| *signature)
2336 .collect::<Vec<_>>();
2337 chunks.push(build_file_summary_chunk(
2338 file,
2339 project_root,
2340 source,
2341 &top_exports,
2342 &top_export_signatures,
2343 ));
2344 }
2345
2346 for symbol in symbols {
2347 if matches!(symbol.kind, SymbolKind::Heading) {
2352 continue;
2353 }
2354
2355 let line_count = symbol
2357 .range
2358 .end_line
2359 .saturating_sub(symbol.range.start_line)
2360 + 1;
2361 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2362 continue;
2363 }
2364
2365 let embed_text = build_embed_text(symbol, source, file, project_root);
2366 let snippet = build_snippet(symbol, source);
2367
2368 chunks.push(SemanticChunk {
2369 file: file.to_path_buf(),
2370 name: symbol.name.clone(),
2371 kind: symbol.kind.clone(),
2372 start_line: symbol.range.start_line,
2373 end_line: symbol.range.end_line,
2374 exported: symbol.exported,
2375 embed_text,
2376 snippet,
2377 });
2378
2379 }
2382
2383 chunks
2384}
2385
/// Cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let (mut dot, mut sq_a, mut sq_b) = (0.0f32, 0.0f32, 0.0f32);
    for (&x, &y) in a.iter().zip(b) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let denom = sq_a.sqrt() * sq_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2409
2410fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2412 match kind {
2413 SymbolKind::Function => 0,
2414 SymbolKind::Class => 1,
2415 SymbolKind::Method => 2,
2416 SymbolKind::Struct => 3,
2417 SymbolKind::Interface => 4,
2418 SymbolKind::Enum => 5,
2419 SymbolKind::TypeAlias => 6,
2420 SymbolKind::Variable => 7,
2421 SymbolKind::Heading => 8,
2422 SymbolKind::FileSummary => 9,
2423 }
2424}
2425
2426fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2427 match v {
2428 0 => SymbolKind::Function,
2429 1 => SymbolKind::Class,
2430 2 => SymbolKind::Method,
2431 3 => SymbolKind::Struct,
2432 4 => SymbolKind::Interface,
2433 5 => SymbolKind::Enum,
2434 6 => SymbolKind::TypeAlias,
2435 7 => SymbolKind::Variable,
2436 8 => SymbolKind::Heading,
2437 9 => SymbolKind::FileSummary,
2438 _ => SymbolKind::Heading,
2439 }
2440}
2441
/// Reads a little-endian `u32` from `data` starting at `*pos`.
///
/// On success `*pos` is advanced by four bytes.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes are
/// available at `*pos`. An offset so large that `*pos + 4` would overflow is
/// reported the same way instead of panicking.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    // `checked_add` keeps a bogus offset from overflowing the bounds check.
    let end = pos
        .checked_add(4)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut bytes = [0u8; 4];
    bytes.copy_from_slice(&data[*pos..end]);
    *pos = end;
    Ok(u32::from_le_bytes(bytes))
}
2450
/// Reads a little-endian `u64` from `data` starting at `*pos`.
///
/// On success `*pos` is advanced by eight bytes.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes are
/// available at `*pos`. An offset so large that `*pos + 8` would overflow is
/// reported the same way instead of panicking.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    // `checked_add` keeps a bogus offset from overflowing the bounds check.
    let end = pos
        .checked_add(8)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut bytes = [0u8; 8];
    bytes.copy_from_slice(&data[*pos..end]);
    *pos = end;
    Ok(u64::from_le_bytes(bytes))
}
2459
2460fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2461 let len = read_u32(data, pos)? as usize;
2462 if *pos + len > data.len() {
2463 return Err("unexpected end of data reading string".to_string());
2464 }
2465 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2466 *pos += len;
2467 Ok(s)
2468}
2469
2470#[cfg(test)]
2471mod tests {
2472 use super::*;
2473 use crate::config::{SemanticBackend, SemanticBackendConfig};
2474 use crate::parser::FileParser;
2475 use std::io::{Read, Write};
2476 use std::net::TcpListener;
2477 use std::thread;
2478
2479 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2480 where
2481 F: Fn(String, String, String) -> String + Send + 'static,
2482 {
2483 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2484 let addr = listener.local_addr().expect("local addr");
2485 let handle = thread::spawn(move || {
2486 let (mut stream, _) = listener.accept().expect("accept request");
2487 let mut buf = Vec::new();
2488 let mut chunk = [0u8; 4096];
2489 let mut header_end = None;
2490 let mut content_length = 0usize;
2491 loop {
2492 let n = stream.read(&mut chunk).expect("read request");
2493 if n == 0 {
2494 break;
2495 }
2496 buf.extend_from_slice(&chunk[..n]);
2497 if header_end.is_none() {
2498 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2499 header_end = Some(pos + 4);
2500 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2501 for line in headers.lines() {
2502 if let Some(value) = line.strip_prefix("Content-Length:") {
2503 content_length = value.trim().parse::<usize>().unwrap_or(0);
2504 }
2505 }
2506 }
2507 }
2508 if let Some(end) = header_end {
2509 if buf.len() >= end + content_length {
2510 break;
2511 }
2512 }
2513 }
2514
2515 let end = header_end.expect("header terminator");
2516 let request = String::from_utf8_lossy(&buf[..end]).to_string();
2517 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2518 let mut lines = request.lines();
2519 let request_line = lines.next().expect("request line").to_string();
2520 let path = request_line
2521 .split_whitespace()
2522 .nth(1)
2523 .expect("request path")
2524 .to_string();
2525 let response_body = handler(request_line, path, body);
2526 let response = format!(
2527 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2528 response_body.len(),
2529 response_body
2530 );
2531 stream
2532 .write_all(response.as_bytes())
2533 .expect("write response");
2534 });
2535
2536 (format!("http://{}", addr), handle)
2537 }
2538
2539 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2540 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2541 }
2542
2543 fn write_rust_file(path: &Path, function_name: &str) {
2544 fs::write(
2545 path,
2546 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
2547 )
2548 .unwrap();
2549 }
2550
2551 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2552 let mut embed = test_vector_for_texts;
2553 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2554 }
2555
2556 fn test_project_root() -> PathBuf {
2557 std::env::current_dir().unwrap()
2558 }
2559
2560 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2561 index.file_mtimes.insert(file.to_path_buf(), mtime);
2562 index.file_sizes.insert(file.to_path_buf(), size);
2563 index
2564 .file_hashes
2565 .insert(file.to_path_buf(), cache_freshness::zero_hash());
2566 }
2567
2568 #[test]
2569 fn semantic_cache_serialization_skips_paths_outside_project_root() {
2570 let dir = tempfile::tempdir().expect("create temp dir");
2571 let project = fs::canonicalize(dir.path()).expect("canonical project");
2572 let outside = project.join("..").join("outside.rs");
2573 let mut index = SemanticIndex::new(project.clone(), 3);
2574 index
2575 .file_mtimes
2576 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2577 index.file_sizes.insert(outside.clone(), 1);
2578 index
2579 .file_hashes
2580 .insert(outside.clone(), cache_freshness::zero_hash());
2581 index.entries.push(EmbeddingEntry {
2582 chunk: SemanticChunk {
2583 file: outside,
2584 name: "outside".to_string(),
2585 kind: SymbolKind::Function,
2586 start_line: 0,
2587 end_line: 0,
2588 exported: false,
2589 embed_text: "outside".to_string(),
2590 snippet: "outside".to_string(),
2591 },
2592 vector: vec![1.0, 0.0, 0.0],
2593 });
2594
2595 let bytes = index.to_bytes();
2596 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2597 assert_eq!(loaded.entries.len(), 0);
2598 assert!(loaded.file_mtimes.is_empty());
2599 }
2600
2601 #[test]
2602 fn test_cosine_similarity_identical() {
2603 let a = vec![1.0, 0.0, 0.0];
2604 let b = vec![1.0, 0.0, 0.0];
2605 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2606 }
2607
2608 #[test]
2609 fn test_cosine_similarity_orthogonal() {
2610 let a = vec![1.0, 0.0, 0.0];
2611 let b = vec![0.0, 1.0, 0.0];
2612 assert!(cosine_similarity(&a, &b).abs() < 0.001);
2613 }
2614
2615 #[test]
2616 fn test_cosine_similarity_opposite() {
2617 let a = vec![1.0, 0.0, 0.0];
2618 let b = vec![-1.0, 0.0, 0.0];
2619 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2620 }
2621
2622 #[test]
2623 fn test_serialization_roundtrip() {
2624 let project_root = test_project_root();
2625 let file = project_root.join("src/main.rs");
2626 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2627 index.entries.push(EmbeddingEntry {
2628 chunk: SemanticChunk {
2629 file: file.clone(),
2630 name: "handle_request".to_string(),
2631 kind: SymbolKind::Function,
2632 start_line: 10,
2633 end_line: 25,
2634 exported: true,
2635 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2636 snippet: "fn handle_request() {\n // ...\n}".to_string(),
2637 },
2638 vector: vec![0.1, 0.2, 0.3, 0.4],
2639 });
2640 index.dimension = 4;
2641 index
2642 .file_mtimes
2643 .insert(file.clone(), SystemTime::UNIX_EPOCH);
2644 index.file_sizes.insert(file, 0);
2645 index.set_fingerprint(SemanticIndexFingerprint {
2646 backend: "fastembed".to_string(),
2647 model: "all-MiniLM-L6-v2".to_string(),
2648 base_url: FALLBACK_BACKEND.to_string(),
2649 dimension: 4,
2650 chunking_version: default_chunking_version(),
2651 });
2652
2653 let bytes = index.to_bytes();
2654 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2655
2656 assert_eq!(restored.entries.len(), 1);
2657 assert_eq!(restored.entries[0].chunk.name, "handle_request");
2658 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2659 assert_eq!(restored.dimension, 4);
2660 assert_eq!(restored.backend_label(), Some("fastembed"));
2661 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2662 }
2663
2664 #[test]
2665 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2666 let cases = [
2667 (SymbolKind::Function, 0),
2668 (SymbolKind::Class, 1),
2669 (SymbolKind::Method, 2),
2670 (SymbolKind::Struct, 3),
2671 (SymbolKind::Interface, 4),
2672 (SymbolKind::Enum, 5),
2673 (SymbolKind::TypeAlias, 6),
2674 (SymbolKind::Variable, 7),
2675 (SymbolKind::Heading, 8),
2676 (SymbolKind::FileSummary, 9),
2677 ];
2678
2679 for (kind, encoded) in cases {
2680 assert_eq!(symbol_kind_to_u8(&kind), encoded);
2681 assert_eq!(u8_to_symbol_kind(encoded), kind);
2682 }
2683 }
2684
2685 #[test]
2686 fn test_search_top_k() {
2687 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2688 index.dimension = 3;
2689
2690 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2692 let mut vec = vec![0.0f32; 3];
2693 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
2695 chunk: SemanticChunk {
2696 file: PathBuf::from("/src/lib.rs"),
2697 name: name.to_string(),
2698 kind: SymbolKind::Function,
2699 start_line: (i * 10 + 1) as u32,
2700 end_line: (i * 10 + 5) as u32,
2701 exported: true,
2702 embed_text: format!("kind:function name:{}", name),
2703 snippet: format!("fn {}() {{}}", name),
2704 },
2705 vector: vec,
2706 });
2707 }
2708
2709 let query = vec![0.9, 0.1, 0.0];
2711 let results = index.search(&query, 2);
2712
2713 assert_eq!(results.len(), 2);
2714 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
2716 }
2717
2718 #[test]
2719 fn test_empty_index_search() {
2720 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2721 let results = index.search(&[0.1, 0.2, 0.3], 10);
2722 assert!(results.is_empty());
2723 }
2724
2725 #[test]
2726 fn single_line_symbol_builds_non_empty_snippet() {
2727 let symbol = Symbol {
2728 name: "answer".to_string(),
2729 kind: SymbolKind::Variable,
2730 range: crate::symbols::Range {
2731 start_line: 0,
2732 start_col: 0,
2733 end_line: 0,
2734 end_col: 24,
2735 },
2736 signature: Some("const answer = 42".to_string()),
2737 scope_chain: Vec::new(),
2738 exported: true,
2739 parent: None,
2740 };
2741 let source = "export const answer = 42;\n";
2742
2743 let snippet = build_snippet(&symbol, source);
2744
2745 assert_eq!(snippet, "export const answer = 42;");
2746 }
2747
2748 #[test]
2749 fn optimized_file_chunk_collection_matches_file_parser_path() {
2750 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2751 let file = project_root.join("src/semantic_index.rs");
2752 let source = std::fs::read_to_string(&file).unwrap();
2753
2754 let mut legacy_parser = FileParser::new();
2755 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2756 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2757
2758 let mut parsers = HashMap::new();
2759 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2760
2761 assert_eq!(
2762 chunk_fingerprint(&optimized_chunks),
2763 chunk_fingerprint(&legacy_chunks)
2764 );
2765 }
2766
2767 fn chunk_fingerprint(
2768 chunks: &[SemanticChunk],
2769 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2770 chunks
2771 .iter()
2772 .map(|chunk| {
2773 (
2774 chunk.name.clone(),
2775 chunk.kind.clone(),
2776 chunk.start_line,
2777 chunk.end_line,
2778 chunk.exported,
2779 chunk.embed_text.clone(),
2780 chunk.snippet.clone(),
2781 )
2782 })
2783 .collect()
2784 }
2785
2786 #[test]
2787 fn rejects_oversized_dimension_during_deserialization() {
2788 let mut bytes = Vec::new();
2789 bytes.push(1u8);
2790 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2791 bytes.extend_from_slice(&0u32.to_le_bytes());
2792 bytes.extend_from_slice(&0u32.to_le_bytes());
2793
2794 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2795 }
2796
2797 #[test]
2798 fn rejects_oversized_entry_count_during_deserialization() {
2799 let mut bytes = Vec::new();
2800 bytes.push(1u8);
2801 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2802 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2803 bytes.extend_from_slice(&0u32.to_le_bytes());
2804
2805 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2806 }
2807
2808 #[test]
2809 fn invalidate_file_removes_entries_and_mtime() {
2810 let target = PathBuf::from("/src/main.rs");
2811 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2812 index.entries.push(EmbeddingEntry {
2813 chunk: SemanticChunk {
2814 file: target.clone(),
2815 name: "main".to_string(),
2816 kind: SymbolKind::Function,
2817 start_line: 0,
2818 end_line: 1,
2819 exported: false,
2820 embed_text: "main".to_string(),
2821 snippet: "fn main() {}".to_string(),
2822 },
2823 vector: vec![1.0; DEFAULT_DIMENSION],
2824 });
2825 index
2826 .file_mtimes
2827 .insert(target.clone(), SystemTime::UNIX_EPOCH);
2828 index.file_sizes.insert(target.clone(), 0);
2829
2830 index.invalidate_file(&target);
2831
2832 assert!(index.entries.is_empty());
2833 assert!(!index.file_mtimes.contains_key(&target));
2834 assert!(!index.file_sizes.contains_key(&target));
2835 }
2836
2837 #[test]
2838 fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2839 let temp = tempfile::tempdir().unwrap();
2840 let project_root = temp.path();
2841 let file = project_root.join("src/lib.rs");
2842 fs::create_dir_all(file.parent().unwrap()).unwrap();
2843 write_rust_file(&file, "kept_symbol");
2844
2845 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2846 let original_entry_count = index.entries.len();
2847 let original_mtime = *index.file_mtimes.get(&file).unwrap();
2848 let original_size = *index.file_sizes.get(&file).unwrap();
2849
2850 let stale_mtime = SystemTime::UNIX_EPOCH;
2851 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2852 fs::remove_file(&file).unwrap();
2853
2854 let mut embed = test_vector_for_texts;
2855 let mut progress = |_done: usize, _total: usize| {};
2856 let summary = index
2857 .refresh_stale_files(
2858 project_root,
2859 std::slice::from_ref(&file),
2860 &mut embed,
2861 8,
2862 &mut progress,
2863 )
2864 .unwrap();
2865
2866 assert_eq!(summary.changed, 0);
2867 assert_eq!(summary.added, 0);
2868 assert_eq!(summary.deleted, 0);
2869 assert_eq!(index.entries.len(), original_entry_count);
2870 assert!(index
2871 .entries
2872 .iter()
2873 .any(|entry| entry.chunk.name == "kept_symbol"));
2874 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2875 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2876 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2877 }
2878
2879 #[test]
2880 fn refresh_never_indexed_file_error_does_not_record_mtime() {
2881 let temp = tempfile::tempdir().unwrap();
2882 let project_root = temp.path();
2883 let missing = project_root.join("src/missing.rs");
2884 fs::create_dir_all(missing.parent().unwrap()).unwrap();
2885
2886 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2887 let mut embed = test_vector_for_texts;
2888 let mut progress = |_done: usize, _total: usize| {};
2889 let summary = index
2890 .refresh_stale_files(
2891 project_root,
2892 std::slice::from_ref(&missing),
2893 &mut embed,
2894 8,
2895 &mut progress,
2896 )
2897 .unwrap();
2898
2899 assert_eq!(summary.added, 0);
2900 assert_eq!(summary.changed, 0);
2901 assert_eq!(summary.deleted, 0);
2902 assert!(!index.file_mtimes.contains_key(&missing));
2903 assert!(!index.file_sizes.contains_key(&missing));
2904 assert!(index.entries.is_empty());
2905 }
2906
2907 #[test]
2908 fn refresh_reports_added_for_new_files() {
2909 let temp = tempfile::tempdir().unwrap();
2910 let project_root = temp.path();
2911 let existing = project_root.join("src/lib.rs");
2912 let added = project_root.join("src/new.rs");
2913 fs::create_dir_all(existing.parent().unwrap()).unwrap();
2914 write_rust_file(&existing, "existing_symbol");
2915 write_rust_file(&added, "added_symbol");
2916
2917 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2918 let mut embed = test_vector_for_texts;
2919 let mut progress = |_done: usize, _total: usize| {};
2920 let summary = index
2921 .refresh_stale_files(
2922 project_root,
2923 &[existing.clone(), added.clone()],
2924 &mut embed,
2925 8,
2926 &mut progress,
2927 )
2928 .unwrap();
2929
2930 assert_eq!(summary.added, 1);
2931 assert_eq!(summary.changed, 0);
2932 assert_eq!(summary.deleted, 0);
2933 assert_eq!(summary.total_processed, 2);
2934 assert!(index.file_mtimes.contains_key(&added));
2935 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2936 }
2937
2938 #[test]
2939 fn refresh_reports_deleted_for_removed_files() {
2940 let temp = tempfile::tempdir().unwrap();
2941 let project_root = temp.path();
2942 let deleted = project_root.join("src/deleted.rs");
2943 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2944 write_rust_file(&deleted, "deleted_symbol");
2945
2946 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2947 fs::remove_file(&deleted).unwrap();
2948
2949 let mut embed = test_vector_for_texts;
2950 let mut progress = |_done: usize, _total: usize| {};
2951 let summary = index
2952 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2953 .unwrap();
2954
2955 assert_eq!(summary.deleted, 1);
2956 assert_eq!(summary.changed, 0);
2957 assert_eq!(summary.added, 0);
2958 assert_eq!(summary.total_processed, 1);
2959 assert!(!index.file_mtimes.contains_key(&deleted));
2960 assert!(index.entries.is_empty());
2961 }
2962
2963 #[test]
2964 fn refresh_reports_changed_for_modified_files() {
2965 let temp = tempfile::tempdir().unwrap();
2966 let project_root = temp.path();
2967 let file = project_root.join("src/lib.rs");
2968 fs::create_dir_all(file.parent().unwrap()).unwrap();
2969 write_rust_file(&file, "old_symbol");
2970
2971 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2972 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2973 write_rust_file(&file, "new_symbol");
2974
2975 let mut embed = test_vector_for_texts;
2976 let mut progress = |_done: usize, _total: usize| {};
2977 let summary = index
2978 .refresh_stale_files(
2979 project_root,
2980 std::slice::from_ref(&file),
2981 &mut embed,
2982 8,
2983 &mut progress,
2984 )
2985 .unwrap();
2986
2987 assert_eq!(summary.changed, 1);
2988 assert_eq!(summary.added, 0);
2989 assert_eq!(summary.deleted, 0);
2990 assert_eq!(summary.total_processed, 1);
2991 assert!(index
2992 .entries
2993 .iter()
2994 .any(|entry| entry.chunk.name == "new_symbol"));
2995 assert!(!index
2996 .entries
2997 .iter()
2998 .any(|entry| entry.chunk.name == "old_symbol"));
2999 }
3000
3001 #[test]
3002 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
3003 let temp = tempfile::tempdir().unwrap();
3004 let project_root = temp.path();
3005 let file = project_root.join("src/lib.rs");
3006 fs::create_dir_all(file.parent().unwrap()).unwrap();
3007 write_rust_file(&file, "clean_symbol");
3008
3009 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3010 let original_entries = index.entries.len();
3011 let mut embed_called = false;
3012 let mut embed = |texts: Vec<String>| {
3013 embed_called = true;
3014 test_vector_for_texts(texts)
3015 };
3016 let mut progress = |_done: usize, _total: usize| {};
3017 let summary = index
3018 .refresh_stale_files(
3019 project_root,
3020 std::slice::from_ref(&file),
3021 &mut embed,
3022 8,
3023 &mut progress,
3024 )
3025 .unwrap();
3026
3027 assert!(summary.is_noop());
3028 assert_eq!(summary.total_processed, 1);
3029 assert!(!embed_called);
3030 assert_eq!(index.entries.len(), original_entries);
3031 }
3032
3033 #[test]
3034 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3035 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3036
3037 assert!(is_onnx_runtime_unavailable(message));
3038 }
3039
3040 #[test]
3041 fn formats_missing_onnx_runtime_with_install_hint() {
3042 let message = format_embedding_init_error(
3043 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3044 );
3045
3046 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3047 assert!(message.contains("Original error:"));
3048 }
3049
3050 #[test]
3051 fn openai_compatible_backend_embeds_with_mock_server() {
3052 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3053 assert!(request_line.starts_with("POST "));
3054 assert_eq!(path, "/v1/embeddings");
3055 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3056 });
3057
3058 let config = SemanticBackendConfig {
3059 backend: SemanticBackend::OpenAiCompatible,
3060 model: "test-embedding".to_string(),
3061 base_url: Some(base_url),
3062 api_key_env: None,
3063 timeout_ms: 5_000,
3064 max_batch_size: 64,
3065 };
3066
3067 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3068 let vectors = model
3069 .embed(vec!["hello".to_string(), "world".to_string()])
3070 .unwrap();
3071
3072 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3073 handle.join().unwrap();
3074 }
3075
3076 #[test]
3086 fn openai_compatible_request_has_single_content_type_header() {
3087 use std::sync::{Arc, Mutex};
3088 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3089 let captured_for_thread = Arc::clone(&captured);
3090
3091 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3092 let addr = listener.local_addr().expect("local addr");
3093 let handle = thread::spawn(move || {
3094 let (mut stream, _) = listener.accept().expect("accept");
3095 let mut buf = Vec::new();
3096 let mut chunk = [0u8; 4096];
3097 let mut header_end = None;
3098 let mut content_length = 0usize;
3099 loop {
3100 let n = stream.read(&mut chunk).expect("read");
3101 if n == 0 {
3102 break;
3103 }
3104 buf.extend_from_slice(&chunk[..n]);
3105 if header_end.is_none() {
3106 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3107 header_end = Some(pos + 4);
3108 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3109 if let Some(value) = line.strip_prefix("Content-Length:") {
3110 content_length = value.trim().parse::<usize>().unwrap_or(0);
3111 }
3112 }
3113 }
3114 }
3115 if let Some(end) = header_end {
3116 if buf.len() >= end + content_length {
3117 break;
3118 }
3119 }
3120 }
3121 *captured_for_thread.lock().unwrap() = buf;
3122 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3123 let response = format!(
3124 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3125 body.len(),
3126 body
3127 );
3128 let _ = stream.write_all(response.as_bytes());
3129 });
3130
3131 let config = SemanticBackendConfig {
3132 backend: SemanticBackend::OpenAiCompatible,
3133 model: "text-embedding-3-small".to_string(),
3134 base_url: Some(format!("http://{}", addr)),
3135 api_key_env: None,
3136 timeout_ms: 5_000,
3137 max_batch_size: 64,
3138 };
3139 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3140 let _ = model.embed(vec!["probe".to_string()]).unwrap();
3141 handle.join().unwrap();
3142
3143 let bytes = captured.lock().unwrap().clone();
3144 let request = String::from_utf8_lossy(&bytes);
3145
3146 let content_type_lines = request
3149 .lines()
3150 .filter(|line| {
3151 let lower = line.to_ascii_lowercase();
3152 lower.starts_with("content-type:")
3153 })
3154 .count();
3155 assert_eq!(
3156 content_type_lines, 1,
3157 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3158 );
3159
3160 assert!(
3163 request.contains(r#""model":"text-embedding-3-small""#),
3164 "request body should contain model field; full request:\n{request}",
3165 );
3166 }
3167
3168 #[test]
3169 fn ollama_backend_embeds_with_mock_server() {
3170 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3171 assert!(request_line.starts_with("POST "));
3172 assert_eq!(path, "/api/embed");
3173 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3174 });
3175
3176 let config = SemanticBackendConfig {
3177 backend: SemanticBackend::Ollama,
3178 model: "embeddinggemma".to_string(),
3179 base_url: Some(base_url),
3180 api_key_env: None,
3181 timeout_ms: 5_000,
3182 max_batch_size: 64,
3183 };
3184
3185 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3186 let vectors = model
3187 .embed(vec!["hello".to_string(), "world".to_string()])
3188 .unwrap();
3189
3190 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3191 handle.join().unwrap();
3192 }
3193
3194 #[test]
3195 fn read_from_disk_rejects_fingerprint_mismatch() {
3196 let storage = tempfile::tempdir().unwrap();
3197 let project_key = "proj";
3198
3199 let project_root = test_project_root();
3200 let file = project_root.join("src/main.rs");
3201 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3202 index.entries.push(EmbeddingEntry {
3203 chunk: SemanticChunk {
3204 file: file.clone(),
3205 name: "handle_request".to_string(),
3206 kind: SymbolKind::Function,
3207 start_line: 10,
3208 end_line: 25,
3209 exported: true,
3210 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3211 snippet: "fn handle_request() {}".to_string(),
3212 },
3213 vector: vec![0.1, 0.2, 0.3],
3214 });
3215 index.dimension = 3;
3216 index
3217 .file_mtimes
3218 .insert(file.clone(), SystemTime::UNIX_EPOCH);
3219 index.file_sizes.insert(file, 0);
3220 index.set_fingerprint(SemanticIndexFingerprint {
3221 backend: "openai_compatible".to_string(),
3222 model: "test-embedding".to_string(),
3223 base_url: "http://127.0.0.1:1234/v1".to_string(),
3224 dimension: 3,
3225 chunking_version: default_chunking_version(),
3226 });
3227 index.write_to_disk(storage.path(), project_key);
3228
3229 let matching = index.fingerprint().unwrap().as_string();
3230 assert!(SemanticIndex::read_from_disk(
3231 storage.path(),
3232 project_key,
3233 &project_root,
3234 false,
3235 Some(&matching),
3236 )
3237 .is_some());
3238
3239 let mismatched = SemanticIndexFingerprint {
3240 backend: "ollama".to_string(),
3241 model: "embeddinggemma".to_string(),
3242 base_url: "http://127.0.0.1:11434".to_string(),
3243 dimension: 3,
3244 chunking_version: default_chunking_version(),
3245 }
3246 .as_string();
3247 assert!(SemanticIndex::read_from_disk(
3248 storage.path(),
3249 project_key,
3250 &project_root,
3251 false,
3252 Some(&mismatched),
3253 )
3254 .is_none());
3255 }
3256
3257 #[test]
3258 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3259 let storage = tempfile::tempdir().unwrap();
3260 let project_key = "proj-v3";
3261 let dir = storage.path().join("semantic").join(project_key);
3262 fs::create_dir_all(&dir).unwrap();
3263
3264 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3265 index.entries.push(EmbeddingEntry {
3266 chunk: SemanticChunk {
3267 file: PathBuf::from("/src/main.rs"),
3268 name: "handle_request".to_string(),
3269 kind: SymbolKind::Function,
3270 start_line: 0,
3271 end_line: 0,
3272 exported: true,
3273 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3274 snippet: "fn handle_request() {}".to_string(),
3275 },
3276 vector: vec![0.1, 0.2, 0.3],
3277 });
3278 index.dimension = 3;
3279 index
3280 .file_mtimes
3281 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3282 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3283 let fingerprint = SemanticIndexFingerprint {
3284 backend: "fastembed".to_string(),
3285 model: "test".to_string(),
3286 base_url: FALLBACK_BACKEND.to_string(),
3287 dimension: 3,
3288 chunking_version: default_chunking_version(),
3289 };
3290 index.set_fingerprint(fingerprint.clone());
3291
3292 let mut bytes = index.to_bytes();
3293 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3294 fs::write(dir.join("semantic.bin"), bytes).unwrap();
3295
3296 assert!(SemanticIndex::read_from_disk(
3297 storage.path(),
3298 project_key,
3299 &test_project_root(),
3300 false,
3301 Some(&fingerprint.as_string())
3302 )
3303 .is_none());
3304 assert!(!dir.join("semantic.bin").exists());
3305 }
3306
3307 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3308 crate::symbols::Symbol {
3309 name: name.to_string(),
3310 kind,
3311 range: crate::symbols::Range {
3312 start_line: start,
3313 start_col: 0,
3314 end_line: end,
3315 end_col: 0,
3316 },
3317 signature: None,
3318 scope_chain: Vec::new(),
3319 exported: false,
3320 parent: None,
3321 }
3322 }
3323
3324 #[test]
3329 fn symbols_to_chunks_skips_heading_symbols() {
3330 let project_root = PathBuf::from("/proj");
3331 let file = project_root.join("README.md");
3332 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3333
3334 let symbols = vec![
3335 make_symbol(SymbolKind::Heading, "Title", 0, 2),
3336 make_symbol(SymbolKind::Heading, "Section", 4, 6),
3337 ];
3338
3339 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3340 assert!(
3341 chunks.is_empty(),
3342 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3343 chunks.len()
3344 );
3345 }
3346
3347 #[test]
3351 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3352 let project_root = PathBuf::from("/proj");
3353 let file = project_root.join("src/lib.rs");
3354 let source = "pub fn handle_request() -> bool {\n true\n}\n";
3355
3356 let symbols = vec![
3357 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3359 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3360 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3361 ];
3362
3363 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3364 assert_eq!(
3365 chunks.len(),
3366 3,
3367 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3368 chunks.len()
3369 );
3370 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3371 assert!(chunks
3372 .iter()
3373 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3374 assert!(names.contains(&"handle_request"));
3375 assert!(names.contains(&"AuthService"));
3376 assert!(
3377 !names.contains(&"doc heading"),
3378 "Heading symbol leaked into chunks: {names:?}"
3379 );
3380 }
3381
3382 #[test]
3383 fn validate_ssrf_allows_loopback_hostnames() {
3384 for host in &[
3387 "http://localhost",
3388 "http://localhost:8080",
3389 "http://localhost:11434", "http://localhost.localdomain",
3391 "http://foo.localhost",
3392 ] {
3393 assert!(
3394 validate_base_url_no_ssrf(host).is_ok(),
3395 "Expected {host} to be allowed (loopback), got: {:?}",
3396 validate_base_url_no_ssrf(host)
3397 );
3398 }
3399 }
3400
/// Literal loopback IPv4 addresses, with and without ports, must pass the
/// SSRF check.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls {
        let outcome = validate_base_url_no_ssrf(url);
        let allowed = outcome.is_ok();
        assert!(
            allowed,
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
3419
/// Non-loopback addresses in the listed private, link-local, and shared
/// ranges must be refused by the SSRF check.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let blocked_urls: [&str; 5] = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    blocked_urls.iter().for_each(|url| {
        let outcome = validate_base_url_no_ssrf(url);
        let rejected = outcome.is_err();
        assert!(
            rejected,
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    });
}
3441
/// Hostnames under the `.local` suffix must be refused by the SSRF check.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for &host in &mdns_hosts {
        let outcome = validate_base_url_no_ssrf(host);
        if outcome.is_err() {
            continue;
        }
        panic!("Expected {host} to be rejected (mDNS), got: {:?}", outcome);
    }
}
3459
/// `normalize_base_url` must accept both a loopback IP and the `localhost`
/// hostname.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    // Loopback IP with an explicit port.
    let loopback_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(loopback_ip.is_ok());

    // `localhost` hostname with an explicit port.
    let loopback_name = normalize_base_url("http://localhost:8080");
    assert!(loopback_name.is_ok());
}
3467
/// The ONNX Runtime version-mismatch message must report the detected
/// version, the library path, and the version requirement, and must list the
/// auto-fix option before the manual-removal option.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let detected_version = "1.9.0";
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch(detected_version, system_lib);

    // Diagnostic facts: detected version, offending path, requirement.
    let reports_version = msg.contains("v1.9.0");
    assert!(reports_version, "should report detected version: {msg}");
    let reports_path = msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
    assert!(reports_path, "should report system path: {msg}");
    let states_requirement = msg.contains("v1.20+");
    assert!(states_requirement, "should state requirement: {msg}");

    // Ordering: the auto-fix option must be listed ahead of manual removal.
    let auto_fix_pos = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let remove_pos = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    let auto_fix_listed_first = auto_fix_pos < remove_pos;
    assert!(
        auto_fix_listed_first,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    // The exact command has to appear verbatim.
    let has_fix_command = msg.contains("npx @cortexkit/aft doctor --fix");
    assert!(
        has_fix_command,
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
3508
/// The mismatch message built for a Homebrew `.dylib` path must contain the
/// detected version and the path, including a single-quoted form of the path.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib_path = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib_path);

    let reports_version = msg.contains("v1.9.0");
    assert!(reports_version);

    let reports_path = msg.contains("/opt/homebrew/lib/libonnxruntime.dylib");
    assert!(reports_path);

    let path_is_quoted = msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'");
    assert!(
        path_is_quoted,
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
3525}