1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded for an index that ends up with no embedding entries.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): not referenced in the visible part of the file — presumably an
// upper bound on stored entries; confirm at the use site.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size of one `f32` in bytes.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): the two header sizes are not referenced in the visible part of
// the file — presumably byte lengths of on-disk index headers; confirm.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Hint prepended to errors that `is_onnx_runtime_unavailable` classifies as a
/// missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// NOTE(review): the version tags below are not referenced in the visible part
// of the file — presumably persisted-index format versions; confirm.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended after the `/v1` segment by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is zero.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is zero.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which the oldest one is evicted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Value stored as the fingerprint's `base_url` when the config has no usable
/// base URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds slept before retry `i + 1`, indexed by the failed
/// attempt `i`.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held while `SemanticIndexLock::acquire` tries to take
/// the on-disk lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
60
61pub struct SemanticIndexLock {
62 _guard: fs_lock::LockGuard,
63}
64
65impl SemanticIndexLock {
66 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
67 let dir = storage_dir.join("semantic").join(project_key);
68 fs::create_dir_all(&dir)?;
69 let path = dir.join("cache.lock");
70 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
71 .lock()
72 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
73 fs_lock::try_acquire(&path, Duration::from_secs(2))
74 .map(|guard| Self { _guard: guard })
75 .map_err(|error| match error {
76 fs_lock::AcquireError::Timeout => {
77 std::io::Error::other("timed out acquiring semantic cache lock")
78 }
79 fs_lock::AcquireError::Io(error) => error,
80 })
81 }
82}
83
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared as a string by
/// `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Model name as given in the config.
    pub model: String,
    /// Normalized base URL, or `FALLBACK_BACKEND` when none is usable.
    /// Deserializes to an empty string when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the vectors produced by the backend.
    pub dimension: usize,
    /// Deserializes to `default_chunking_version()` when the field is absent.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
94
/// Chunking version written into new fingerprints and assumed for serialized
/// fingerprints that omit the field.
fn default_chunking_version() -> u32 {
    const CHUNKING_VERSION: u32 = 2;
    CHUNKING_VERSION
}
98
99impl SemanticIndexFingerprint {
100 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
101 let base_url = config
104 .base_url
105 .as_ref()
106 .and_then(|u| normalize_base_url(u).ok())
107 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
108 Self {
109 backend: config.backend.as_str().to_string(),
110 model: config.model.clone(),
111 base_url,
112 dimension,
113 chunking_version: default_chunking_version(),
114 }
115 }
116
117 pub fn as_string(&self) -> String {
118 serde_json::to_string(self).unwrap_or_else(|_| String::new())
119 }
120
121 fn matches_expected(&self, expected: &str) -> bool {
122 let encoded = self.as_string();
123 !encoded.is_empty() && encoded == expected
124 }
125}
126
/// Backend-specific state needed to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process model provided by the `fastembed` crate.
    Fastembed(TextEmbedding),
    /// Remote endpoint queried through `build_openai_embeddings_endpoint`.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
        /// Sent as a `Bearer` token in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote endpoint queried through `build_ollama_embeddings_endpoint`.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
    },
}
141
/// An embedding backend plus its effective settings and a small query cache.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as supplied in the config (not normalized).
    base_url: Option<String>,
    /// Effective request timeout after defaulting.
    timeout_ms: u64,
    /// Effective batch size after defaulting.
    max_batch_size: usize,
    /// Vector length, once known from a probe or a remote embedding response.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of the cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}

/// Shorter alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
157
158fn validate_embedding_batch(
159 vectors: &[Vec<f32>],
160 expected_count: usize,
161 context: &str,
162) -> Result<(), String> {
163 if expected_count > 0 && vectors.is_empty() {
164 return Err(format!(
165 "{context} returned no vectors for {expected_count} inputs"
166 ));
167 }
168
169 if vectors.len() != expected_count {
170 return Err(format!(
171 "{context} returned {} vectors for {} inputs",
172 vectors.len(),
173 expected_count
174 ));
175 }
176
177 let Some(first_vector) = vectors.first() else {
178 return Ok(());
179 };
180 let expected_dimension = first_vector.len();
181 validate_embedding_dimension(expected_dimension)
182 .map_err(|error| format!("{context} returned {error}"))?;
183 for (index, vector) in vectors.iter().enumerate() {
184 if vector.len() != expected_dimension {
185 return Err(format!(
186 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
187 vector.len()
188 ));
189 }
190 }
191
192 Ok(())
193}
194
195fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
196 if dimension == 0 || dimension > MAX_DIMENSION {
197 return Err(format!(
198 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
199 ));
200 }
201
202 Ok(())
203}
204
205fn normalize_base_url(raw: &str) -> Result<String, String> {
209 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
210 let scheme = parsed.scheme();
211 if scheme != "http" && scheme != "https" {
212 return Err(format!(
213 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
214 scheme
215 ));
216 }
217 Ok(parsed.to_string().trim_end_matches('/').to_string())
218}
219
/// Rejects base URLs that point at private or reserved (non-loopback) hosts.
///
/// Loopback host names are accepted without resolution, `.local` names are
/// rejected outright, and every other host is resolved and each resulting
/// address is checked with `is_private_non_loopback_ip`.
///
/// # Errors
///
/// Returns a message when `raw` does not parse as a URL, when the host ends in
/// `.local`, or when any resolved address is private/reserved.
pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
    use std::net::{IpAddr, ToSocketAddrs};

    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;

    // A URL without a host is treated as an empty host name.
    let host = parsed.host_str().unwrap_or("");

    let is_loopback_host =
        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
    if is_loopback_host {
        return Ok(());
    }

    if host.ends_with(".local") {
        return Err(format!(
            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
        ));
    }

    // Falls back to port 443 when the URL has neither an explicit port nor a
    // scheme with a known default.
    let port = parsed.port_or_known_default().unwrap_or(443);
    let addr_str = format!("{host}:{port}");
    // A host that fails to resolve yields an empty list and is therefore
    // accepted by this check.
    let addrs: Vec<IpAddr> = addr_str
        .to_socket_addrs()
        .map(|iter| iter.map(|sa| sa.ip()).collect())
        .unwrap_or_default();
    for ip in &addrs {
        if is_private_non_loopback_ip(ip) {
            return Err(format!(
                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
            ));
        }
    }

    Ok(())
}
277
/// Returns `true` for addresses in private or reserved ranges other than
/// loopback.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `100.64/10` and
/// `0/8`. IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses
/// (`::ffff:a.b.c.d`) whose embedded IPv4 address matches the IPv4 rules.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::{IpAddr, Ipv4Addr};

    match ip {
        IpAddr::V4(v4) => match v4.octets() {
            [10, ..] | [0, ..] => true,
            [192, 168, ..] | [169, 254, ..] => true,
            [172, second, ..] => (16..=31).contains(&second),
            [100, second, ..] => (64..=127).contains(&second),
            _ => false,
        },
        IpAddr::V6(v6) => {
            let segments = v6.segments();
            let link_local = segments[0] & 0xffc0 == 0xfe80;
            let unique_local = segments[0] & 0xfe00 == 0xfc00;
            if link_local || unique_local {
                return true;
            }

            match segments {
                // IPv4-mapped: judge the embedded IPv4 address instead.
                [0, 0, 0, 0, 0, 0xffff, high, low] => {
                    let [a, b] = high.to_be_bytes();
                    let [c, d] = low.to_be_bytes();
                    is_private_non_loopback_ip(&IpAddr::V4(Ipv4Addr::new(a, b, c, d)))
                }
                _ => false,
            }
        }
    }
}
319
320fn build_openai_embeddings_endpoint(base_url: &str) -> String {
321 if base_url.ends_with("/v1") {
322 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
323 } else {
324 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
325 }
326}
327
328fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
329 if base_url.ends_with("/api") {
330 format!("{base_url}/embed")
331 } else {
332 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
333 }
334}
335
/// Trims surrounding whitespace from an optional value and maps an empty or
/// whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    let trimmed = raw.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
346
347fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
348 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
349}
350
/// Returns `true` when a transport error is worth retrying.
///
/// Only connection errors (`reqwest::Error::is_connect`) qualify; every other
/// error kind is reported to the caller immediately.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
354
355fn sleep_before_embedding_retry(attempt_index: usize) {
356 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
357 std::thread::sleep(Duration::from_millis(*delay_ms));
358 }
359}
360
/// Sends an embedding request with bounded retries and returns the response
/// body of the first successful (2xx) attempt.
///
/// `make_request` is called once per attempt to build a fresh request, and
/// `backend_label` prefixes every error message. Connection errors and
/// retryable statuses (see `is_retryable_embedding_status`) are retried after
/// a backoff sleep, up to `EMBEDDING_REQUEST_MAX_ATTEMPTS` attempts in total.
///
/// # Errors
///
/// Returns a message when sending fails, when the body cannot be read, or when
/// the final response has a non-success status (the body is included).
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} request failed: {error}"));
            }
        };

        // Capture the status first: reading the body consumes the response.
        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} response read failed: {error}"));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        if !last_attempt && is_retryable_embedding_status(status) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        return Err(format!(
            "{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    // Every path through the final attempt returns, so the loop cannot fall through.
    unreachable!("embedding request retries exhausted without returning")
}
408
409impl SemanticEmbeddingModel {
410 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
411 let timeout_ms = if config.timeout_ms == 0 {
412 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
413 } else {
414 config.timeout_ms
415 };
416
417 let max_batch_size = if config.max_batch_size == 0 {
418 DEFAULT_MAX_BATCH_SIZE
419 } else {
420 config.max_batch_size
421 };
422
423 let api_key_env = normalize_api_key(config.api_key_env.clone());
424 let model = config.model.clone();
425
426 let client = Client::builder()
427 .timeout(Duration::from_millis(timeout_ms))
428 .redirect(reqwest::redirect::Policy::none())
429 .build()
430 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
431
432 let engine = match config.backend {
433 SemanticBackend::Fastembed => {
434 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
435 }
436 SemanticBackend::OpenAiCompatible => {
437 let raw = config.base_url.as_ref().ok_or_else(|| {
438 "base_url is required for openai_compatible backend".to_string()
439 })?;
440 let base_url = normalize_base_url(raw)?;
441
442 let api_key = match api_key_env {
443 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
444 format!("missing api_key_env '{var_name}' for openai_compatible backend")
445 })?),
446 None => None,
447 };
448
449 SemanticEmbeddingEngine::OpenAiCompatible {
450 client,
451 model,
452 base_url,
453 api_key,
454 }
455 }
456 SemanticBackend::Ollama => {
457 let raw = config
458 .base_url
459 .as_ref()
460 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
461 let base_url = normalize_base_url(raw)?;
462
463 SemanticEmbeddingEngine::Ollama {
464 client,
465 model,
466 base_url,
467 }
468 }
469 };
470
471 Ok(Self {
472 backend: config.backend,
473 model: config.model.clone(),
474 base_url: config.base_url.clone(),
475 timeout_ms,
476 max_batch_size,
477 dimension: None,
478 engine,
479 query_embedding_cache: HashMap::new(),
480 query_embedding_cache_order: VecDeque::new(),
481 query_embedding_cache_hits: 0,
482 query_embedding_cache_misses: 0,
483 })
484 }
485
    /// Backend this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
489
    /// Model name as given in the config.
    pub fn model(&self) -> &str {
        &self.model
    }
493
    /// Base URL as stored on the model, if one was configured.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
497
    /// Batch size stored on the model.
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
501
    /// Request timeout in milliseconds stored on the model.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
505
506 pub fn fingerprint(
507 &mut self,
508 config: &SemanticBackendConfig,
509 ) -> Result<SemanticIndexFingerprint, String> {
510 let dimension = self.dimension()?;
511 Ok(SemanticIndexFingerprint::from_config(config, dimension))
512 }
513
514 pub fn dimension(&mut self) -> Result<usize, String> {
515 if let Some(dimension) = self.dimension {
516 return Ok(dimension);
517 }
518
519 let dimension = match &mut self.engine {
520 SemanticEmbeddingEngine::Fastembed(model) => {
521 let vectors = model
522 .embed(vec!["semantic index fingerprint probe".to_string()], None)
523 .map_err(|error| format_embedding_init_error(error.to_string()))?;
524 vectors
525 .first()
526 .map(|v| v.len())
527 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
528 }
529 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
530 let vectors =
531 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
532 vectors
533 .first()
534 .map(|v| v.len())
535 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
536 }
537 SemanticEmbeddingEngine::Ollama { .. } => {
538 let vectors =
539 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
540 vectors
541 .first()
542 .map(|v| v.len())
543 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
544 }
545 };
546
547 self.dimension = Some(dimension);
548 Ok(dimension)
549 }
550
    /// Embeds `texts` with the configured backend; public entry point that
    /// forwards to `embed_texts`.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
554
555 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
556 if let Some(vector) = self.query_embedding_cache.get(query) {
557 self.query_embedding_cache_hits += 1;
558 return Ok(vector.clone());
559 }
560
561 self.query_embedding_cache_misses += 1;
562 let embeddings = self.embed_texts(vec![query.to_string()])?;
563 let vector = embeddings
564 .first()
565 .cloned()
566 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
567
568 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
569 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
570 self.query_embedding_cache.remove(&oldest);
571 }
572 }
573 self.query_embedding_cache
574 .insert(query.to_string(), vector.clone());
575 self.query_embedding_cache_order
576 .push_back(query.to_string());
577
578 Ok(vector)
579 }
580
    /// Returns `(hits, misses, cached_entries)` for the query embedding cache.
    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
        (
            self.query_embedding_cache_hits,
            self.query_embedding_cache_misses,
            self.query_embedding_cache.len(),
        )
    }
588
    /// Embeds `texts` with the configured engine, returning one vector per input.
    ///
    /// The fastembed engine runs in-process. The two remote engines POST a JSON
    /// body via `send_embedding_request`, check the reply, and store the length
    /// of the first returned vector in `self.dimension`.
    ///
    /// # Errors
    ///
    /// Fails when the backend call fails, when the response cannot be parsed,
    /// when the number of returned vectors differs from the number of inputs,
    /// or when any returned vector is empty.
    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        match &mut self.engine {
            SemanticEmbeddingEngine::Fastembed(model) => model
                .embed(texts, None::<usize>)
                .map_err(|error| format_embedding_init_error(error.to_string()))
                .map_err(|error| format!("failed to embed batch: {error}")),
            SemanticEmbeddingEngine::OpenAiCompatible {
                client,
                model,
                base_url,
                api_key,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_openai_embeddings_endpoint(base_url);
                let body = serde_json::json!({
                    "input": texts,
                    "model": model,
                });

                let raw = send_embedding_request(
                    || {
                        // Rebuilt on every attempt so each retry sends a fresh request.
                        let mut request = client.post(&endpoint).json(&body);

                        if let Some(api_key) = api_key {
                            request = request.header("Authorization", format!("Bearer {api_key}"));
                        }

                        request
                    },
                    "openai compatible",
                )?;

                #[derive(Deserialize)]
                struct OpenAiResponse {
                    data: Vec<OpenAiEmbeddingResult>,
                }

                #[derive(Deserialize)]
                struct OpenAiEmbeddingResult {
                    embedding: Vec<f32>,
                    index: Option<u32>,
                }

                let parsed: OpenAiResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
                if parsed.data.len() != expected_text_count {
                    return Err(format!(
                        "openai compatible response returned {} embeddings for {} inputs",
                        parsed.data.len(),
                        expected_text_count
                    ));
                }

                // Place each embedding at its reported `index`, falling back to
                // its position in the response when the field is absent.
                let mut vectors = vec![Vec::new(); parsed.data.len()];
                for (i, item) in parsed.data.into_iter().enumerate() {
                    let index = item.index.unwrap_or(i as u32) as usize;
                    if index >= vectors.len() {
                        return Err(
                            "openai compatible response contains invalid vector index".to_string()
                        );
                    }
                    vectors[index] = item.embedding;
                }

                // A slot still empty here was never filled (e.g. a duplicated
                // index) or was filled with an empty embedding.
                for vector in &vectors {
                    if vector.is_empty() {
                        return Err(
                            "openai compatible response contained missing vectors".to_string()
                        );
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
            SemanticEmbeddingEngine::Ollama {
                client,
                model,
                base_url,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_ollama_embeddings_endpoint(base_url);

                #[derive(Serialize)]
                struct OllamaPayload<'a> {
                    model: &'a str,
                    input: Vec<String>,
                }

                let payload = OllamaPayload {
                    model,
                    input: texts,
                };

                let raw = send_embedding_request(
                    || {
                        client.post(&endpoint).json(&payload)
                    },
                    "ollama",
                )?;

                #[derive(Deserialize)]
                struct OllamaResponse {
                    embeddings: Vec<Vec<f32>>,
                }

                let parsed: OllamaResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid ollama response: {error}"))?;
                if parsed.embeddings.is_empty() {
                    return Err("ollama response returned no embeddings".to_string());
                }
                if parsed.embeddings.len() != expected_text_count {
                    return Err(format!(
                        "ollama response returned {} embeddings for {} inputs",
                        parsed.embeddings.len(),
                        expected_text_count
                    ));
                }

                let vectors = parsed.embeddings;
                for vector in &vectors {
                    if vector.is_empty() {
                        return Err("ollama response contained empty embeddings".to_string());
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
        }
    }
733}
734
/// Checks that an ONNX Runtime shared library can be loaded before the
/// embedding model is initialized.
///
/// On Linux and macOS the library named by `ORT_DYLIB_PATH` (or the platform
/// default name) is opened with `dlopen` and closed again. If a version can be
/// derived from the library's file name, anything other than 1.x with
/// x >= 20 is rejected. No check is performed on Windows.
///
/// # Errors
///
/// Returns a message when the library name contains an interior NUL byte,
/// when `dlopen` fails, or when the detected version is unsupported.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a valid NUL-terminated string that outlives the
        // `dlopen` call, the pointer returned by `dlerror` is checked for null
        // before it is read, and `handle` is non-null when passed to `dlclose`.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version comes from file names only; the loaded library
            // itself is not queried.
            let detected_version = detect_ort_version_from_path(lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // A version whose major/minor parts do not parse is not rejected.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Nothing is validated on Windows; this only marks the variable as used.
        let _ = dylib_path;
    }

    Ok(())
}
797
798#[cfg(any(test, target_os = "linux", target_os = "macos"))]
801fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
802 let path = std::path::Path::new(lib_path);
803
804 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
806 .into_iter()
807 .flatten()
808 {
809 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
810 if let Some(version) = extract_version_from_filename(name) {
811 return Some(version);
812 }
813 }
814 }
815
816 if let Some(parent) = path.parent() {
818 if let Ok(entries) = std::fs::read_dir(parent) {
819 for entry in entries.flatten() {
820 if let Some(name) = entry.file_name().to_str() {
821 if name.starts_with("libonnxruntime") {
822 if let Some(version) = extract_version_from_filename(name) {
823 return Some(version);
824 }
825 }
826 }
827 }
828 }
829 }
830
831 None
832}
833
834#[cfg(any(test, target_os = "linux", target_os = "macos"))]
836fn extract_version_from_filename(name: &str) -> Option<String> {
837 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
839 re.find(name).map(|m| m.as_str().to_string())
840}
841
/// Suggests a shell command for removing an outdated ONNX Runtime library.
///
/// Libraries under `/usr/local/lib`, or referenced only by their default
/// name, get a platform-specific system-wide command where one is defined for
/// the target OS. Everything else gets a plain `rm` of the given path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return " Delete the ONNX Runtime DLL from your PATH".to_string();
    }

    format!(" rm '{}'", lib_path)
}
857
/// Builds the user-facing message for an unsupported ONNX Runtime version.
///
/// `version` is the detected version string and `lib_name` the library path or
/// name that was loaded. The message lists remediation options, including the
/// command from `suggest_removal_command(lib_name)`.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
    format!(
        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
         Solutions:\n\
         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
         configures the bridge to load it instead of the system library — no \
         changes to '{}'.\n\
         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
         {}\n\
         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
        version,
        lib_name,
        lib_name,
        suggest_removal_command(lib_name),
    )
}
882
883pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
884 pre_validate_onnx_runtime()?;
886
887 let selected_model = match model {
888 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
889 _ => {
890 return Err(format!(
891 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
892 model
893 ))
894 }
895 };
896
897 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
898}
899
/// Heuristically decides whether an error message means the ONNX Runtime
/// shared library could not be loaded.
///
/// A message qualifies when it starts (ignoring leading whitespace) with
/// `ONNX Runtime not found.`, or when, compared case-insensitively, it
/// mentions the runtime and also contains a load-failure phrase.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
925
926fn format_embedding_init_error(error: impl Display) -> String {
927 let message = error.to_string();
928
929 if is_onnx_runtime_unavailable(&message) {
930 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
931 }
932
933 format!("failed to initialize semantic embedding model: {message}")
934}
935
/// One unit of source that is embedded and can be returned by a search.
// NOTE(review): field meanings below are inferred from names and from how
// `build_from_chunks` uses them — confirm against the chunk collector.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to.
    pub file: PathBuf,
    /// Name associated with the chunk (presumably the symbol name).
    pub name: String,
    pub kind: SymbolKind,
    /// Line range of the chunk; whether it is 0- or 1-based is not visible here.
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    /// Text handed to the embedding function when the index is built.
    pub embed_text: String,
    /// Text carried alongside the chunk (presumably for display in results).
    pub snippet: String,
}
955
/// A chunk paired with the embedding vector computed for it.
#[derive(Debug)]
struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
962
/// In-memory semantic index: embedded chunks plus per-file freshness data.
#[derive(Debug)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded for each indexed file.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size recorded for each indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Length of the stored vectors; `DEFAULT_DIMENSION` for an index built
    /// without entries.
    dimension: usize,
    /// Left as `None` by `new` and by the build functions.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Root directory of the indexed project.
    project_root: PathBuf,
}
977
/// Freshness data captured for one file when its chunks are collected.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}
984
/// Counters describing what an incremental index refresh did.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when no file was changed, added, or deleted.
    ///
    /// `total_processed` is not taken into account.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
1001
/// A single search hit. Mirrors the location fields of `SemanticChunk` and
/// adds a score and a source label.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    // NOTE(review): the scoring code is outside the visible part of the file —
    // presumably a similarity score where higher is better; confirm.
    pub score: f32,
    // NOTE(review): presumably a label for the search path that produced the
    // hit; confirm at the construction site.
    pub source: &'static str,
}
1015
1016impl SemanticIndex {
    /// Creates an empty index for `project_root` with the given vector
    /// `dimension` and no fingerprint.
    ///
    /// Debug builds assert that `project_root` is an absolute path.
    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
        debug_assert!(project_root.is_absolute());
        Self {
            entries: Vec::new(),
            file_mtimes: HashMap::new(),
            file_sizes: HashMap::new(),
            file_hashes: HashMap::new(),
            dimension,
            fingerprint: None,
            project_root,
        }
    }
1029
    /// Number of embedded chunks stored in the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1034
1035 pub fn status_label(&self) -> &'static str {
1037 if self.entries.is_empty() {
1038 "empty"
1039 } else {
1040 "ready"
1041 }
1042 }
1043
1044 fn collect_chunks(
1045 project_root: &Path,
1046 files: &[PathBuf],
1047 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1048 let per_file: Vec<(
1049 PathBuf,
1050 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1051 )> = files
1052 .par_iter()
1053 .map_init(HashMap::new, |parsers, file| {
1054 let result = collect_file_metadata(file).and_then(|metadata| {
1055 collect_file_chunks(project_root, file, parsers)
1056 .map(|chunks| (metadata, chunks))
1057 });
1058 (file.clone(), result)
1059 })
1060 .collect();
1061
1062 let mut chunks: Vec<SemanticChunk> = Vec::new();
1063 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1064
1065 for (file, result) in per_file {
1066 match result {
1067 Ok((metadata, file_chunks)) => {
1068 file_metadata.insert(file, metadata);
1069 chunks.extend(file_chunks);
1070 }
1071 Err(error) => {
1072 if error == "unsupported file extension" {
1078 continue;
1079 }
1080 slog_warn!(
1081 "failed to collect semantic chunks for {}: {}",
1082 file.display(),
1083 error
1084 );
1085 }
1086 }
1087 }
1088
1089 (chunks, file_metadata)
1090 }
1091
1092 fn build_from_chunks<F, P>(
1093 project_root: &Path,
1094 chunks: Vec<SemanticChunk>,
1095 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1096 embed_fn: &mut F,
1097 max_batch_size: usize,
1098 mut progress: Option<&mut P>,
1099 ) -> Result<Self, String>
1100 where
1101 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1102 P: FnMut(usize, usize),
1103 {
1104 debug_assert!(project_root.is_absolute());
1105 let total_chunks = chunks.len();
1106
1107 if chunks.is_empty() {
1108 return Ok(Self {
1109 entries: Vec::new(),
1110 file_mtimes: file_metadata
1111 .iter()
1112 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1113 .collect(),
1114 file_sizes: file_metadata
1115 .iter()
1116 .map(|(path, metadata)| (path.clone(), metadata.size))
1117 .collect(),
1118 file_hashes: file_metadata
1119 .into_iter()
1120 .map(|(path, metadata)| (path, metadata.content_hash))
1121 .collect(),
1122 dimension: DEFAULT_DIMENSION,
1123 fingerprint: None,
1124 project_root: project_root.to_path_buf(),
1125 });
1126 }
1127
1128 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1130 let mut expected_dimension: Option<usize> = None;
1131 let batch_size = max_batch_size.max(1);
1132 for batch_start in (0..chunks.len()).step_by(batch_size) {
1133 let batch_end = (batch_start + batch_size).min(chunks.len());
1134 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1135 .iter()
1136 .map(|c| c.embed_text.clone())
1137 .collect();
1138
1139 let vectors = embed_fn(batch_texts)?;
1140 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1141
1142 if let Some(dim) = vectors.first().map(|v| v.len()) {
1144 match expected_dimension {
1145 None => expected_dimension = Some(dim),
1146 Some(expected) if dim != expected => {
1147 return Err(format!(
1148 "embedding dimension changed across batches: expected {expected}, got {dim}"
1149 ));
1150 }
1151 _ => {}
1152 }
1153 }
1154
1155 for (i, vector) in vectors.into_iter().enumerate() {
1156 let chunk_idx = batch_start + i;
1157 entries.push(EmbeddingEntry {
1158 chunk: chunks[chunk_idx].clone(),
1159 vector,
1160 });
1161 }
1162
1163 if let Some(callback) = progress.as_mut() {
1164 callback(entries.len(), total_chunks);
1165 }
1166 }
1167
1168 let dimension = entries
1169 .first()
1170 .map(|e| e.vector.len())
1171 .unwrap_or(DEFAULT_DIMENSION);
1172
1173 Ok(Self {
1174 entries,
1175 file_mtimes: file_metadata
1176 .iter()
1177 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1178 .collect(),
1179 file_sizes: file_metadata
1180 .iter()
1181 .map(|(path, metadata)| (path.clone(), metadata.size))
1182 .collect(),
1183 file_hashes: file_metadata
1184 .into_iter()
1185 .map(|(path, metadata)| (path, metadata.content_hash))
1186 .collect(),
1187 dimension,
1188 fingerprint: None,
1189 project_root: project_root.to_path_buf(),
1190 })
1191 }
1192
1193 pub fn build<F>(
1196 project_root: &Path,
1197 files: &[PathBuf],
1198 embed_fn: &mut F,
1199 max_batch_size: usize,
1200 ) -> Result<Self, String>
1201 where
1202 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1203 {
1204 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1205 Self::build_from_chunks(
1206 project_root,
1207 chunks,
1208 file_mtimes,
1209 embed_fn,
1210 max_batch_size,
1211 Option::<&mut fn(usize, usize)>::None,
1212 )
1213 }
1214
1215 pub fn build_with_progress<F, P>(
1217 project_root: &Path,
1218 files: &[PathBuf],
1219 embed_fn: &mut F,
1220 max_batch_size: usize,
1221 progress: &mut P,
1222 ) -> Result<Self, String>
1223 where
1224 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1225 P: FnMut(usize, usize),
1226 {
1227 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1228 let total_chunks = chunks.len();
1229 progress(0, total_chunks);
1230 Self::build_from_chunks(
1231 project_root,
1232 chunks,
1233 file_mtimes,
1234 embed_fn,
1235 max_batch_size,
1236 Some(progress),
1237 )
1238 }
1239
1240 pub fn refresh_stale_files<F, P>(
1251 &mut self,
1252 project_root: &Path,
1253 current_files: &[PathBuf],
1254 embed_fn: &mut F,
1255 max_batch_size: usize,
1256 progress: &mut P,
1257 ) -> Result<RefreshSummary, String>
1258 where
1259 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1260 P: FnMut(usize, usize),
1261 {
1262 self.backfill_missing_file_sizes();
1263
1264 let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1266 let total_processed = current_set.len() + self.file_mtimes.len()
1267 - self
1268 .file_mtimes
1269 .keys()
1270 .filter(|path| current_set.contains(path.as_path()))
1271 .count();
1272
1273 let mut deleted: Vec<PathBuf> = Vec::new();
1276 let mut changed: Vec<PathBuf> = Vec::new();
1277 let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1278 for indexed_path in &indexed_paths {
1279 if !current_set.contains(indexed_path.as_path()) {
1280 deleted.push(indexed_path.clone());
1281 continue;
1282 }
1283 let cached = match (
1284 self.file_mtimes.get(indexed_path),
1285 self.file_sizes.get(indexed_path),
1286 self.file_hashes.get(indexed_path),
1287 ) {
1288 (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1289 mtime: *mtime,
1290 size: *size,
1291 content_hash: *hash,
1292 }),
1293 _ => None,
1294 };
1295 match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
1296 Some(FreshnessVerdict::HotFresh) => {}
1297 Some(FreshnessVerdict::ContentFresh {
1298 new_mtime,
1299 new_size,
1300 }) => {
1301 self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1302 self.file_sizes.insert(indexed_path.clone(), new_size);
1303 }
1304 Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1305 changed.push(indexed_path.clone());
1306 }
1307 }
1308 }
1309
1310 let mut added: Vec<PathBuf> = Vec::new();
1312 for path in current_files {
1313 if !self.file_mtimes.contains_key(path) {
1314 added.push(path.clone());
1315 }
1316 }
1317
1318 if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1320 progress(0, 0);
1321 return Ok(RefreshSummary {
1322 total_processed,
1323 ..RefreshSummary::default()
1324 });
1325 }
1326
1327 if !deleted.is_empty() {
1331 self.remove_indexed_files(&deleted);
1332 }
1333
1334 let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1336 to_embed.extend(changed.iter().cloned());
1337 to_embed.extend(added.iter().cloned());
1338
1339 if to_embed.is_empty() {
1340 progress(0, 0);
1342 return Ok(RefreshSummary {
1343 changed: 0,
1344 added: 0,
1345 deleted: deleted.len(),
1346 total_processed,
1347 });
1348 }
1349
1350 let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1351 let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1352 let vanished = to_embed
1353 .iter()
1354 .filter(|path| {
1355 changed_set.contains(path.as_path())
1356 && !fresh_metadata.contains_key(*path)
1357 && !path.exists()
1358 })
1359 .cloned()
1360 .collect::<Vec<_>>();
1361 if !vanished.is_empty() {
1362 self.remove_indexed_files(&vanished);
1363 deleted.extend(vanished);
1364 }
1365
1366 if chunks.is_empty() {
1367 progress(0, 0);
1368 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1369 if !successful_files.is_empty() {
1370 self.entries
1371 .retain(|entry| !successful_files.contains(&entry.chunk.file));
1372 }
1373 let changed_count = changed
1374 .iter()
1375 .filter(|path| successful_files.contains(*path))
1376 .count();
1377 let added_count = added
1378 .iter()
1379 .filter(|path| successful_files.contains(*path))
1380 .count();
1381 for (file, metadata) in fresh_metadata {
1382 self.file_mtimes.insert(file.clone(), metadata.mtime);
1383 self.file_sizes.insert(file.clone(), metadata.size);
1384 self.file_hashes.insert(file.clone(), metadata.content_hash);
1385 }
1386 return Ok(RefreshSummary {
1387 changed: changed_count,
1388 added: added_count,
1389 deleted: deleted.len(),
1390 total_processed,
1391 });
1392 }
1393
1394 let total_chunks = chunks.len();
1396 progress(0, total_chunks);
1397 let batch_size = max_batch_size.max(1);
1398 let existing_dimension = if self.entries.is_empty() {
1399 None
1400 } else {
1401 Some(self.dimension)
1402 };
1403 let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1404 let mut observed_dimension: Option<usize> = existing_dimension;
1405
1406 for batch_start in (0..chunks.len()).step_by(batch_size) {
1407 let batch_end = (batch_start + batch_size).min(chunks.len());
1408 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1409 .iter()
1410 .map(|c| c.embed_text.clone())
1411 .collect();
1412
1413 let vectors = embed_fn(batch_texts)?;
1414 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1415
1416 if let Some(dim) = vectors.first().map(|v| v.len()) {
1417 match observed_dimension {
1418 None => observed_dimension = Some(dim),
1419 Some(expected) if dim != expected => {
1420 return Err(format!(
1423 "embedding dimension changed during incremental refresh: \
1424 cached index uses {expected}, new vectors use {dim}"
1425 ));
1426 }
1427 _ => {}
1428 }
1429 }
1430
1431 for (i, vector) in vectors.into_iter().enumerate() {
1432 let chunk_idx = batch_start + i;
1433 new_entries.push(EmbeddingEntry {
1434 chunk: chunks[chunk_idx].clone(),
1435 vector,
1436 });
1437 }
1438
1439 progress(new_entries.len(), total_chunks);
1440 }
1441
1442 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1443 if !successful_files.is_empty() {
1444 self.entries
1445 .retain(|entry| !successful_files.contains(&entry.chunk.file));
1446 }
1447
1448 self.entries.extend(new_entries);
1449 for (file, metadata) in fresh_metadata {
1450 self.file_mtimes.insert(file.clone(), metadata.mtime);
1451 self.file_sizes.insert(file.clone(), metadata.size);
1452 self.file_hashes.insert(file, metadata.content_hash);
1453 }
1454 if let Some(dim) = observed_dimension {
1455 self.dimension = dim;
1456 }
1457
1458 Ok(RefreshSummary {
1459 changed: changed
1460 .iter()
1461 .filter(|path| successful_files.contains(*path))
1462 .count(),
1463 added: added
1464 .iter()
1465 .filter(|path| successful_files.contains(*path))
1466 .count(),
1467 deleted: deleted.len(),
1468 total_processed,
1469 })
1470 }
1471
1472 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1473 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1474 self.entries
1475 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1476 for path in files {
1477 self.file_mtimes.remove(path);
1478 self.file_sizes.remove(path);
1479 self.file_hashes.remove(path);
1480 }
1481 }
1482
1483 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1485 if self.entries.is_empty() || query_vector.len() != self.dimension {
1486 return Vec::new();
1487 }
1488
1489 let mut scored: Vec<(f32, usize)> = self
1490 .entries
1491 .iter()
1492 .enumerate()
1493 .map(|(i, entry)| {
1494 let mut score = cosine_similarity(query_vector, &entry.vector);
1495 if entry.chunk.exported {
1496 score *= 1.1;
1497 }
1498 (score, i)
1499 })
1500 .collect();
1501
1502 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1504
1505 scored
1506 .into_iter()
1507 .take(top_k)
1508 .map(|(score, idx)| {
1512 let entry = &self.entries[idx];
1513 SemanticResult {
1514 file: entry.chunk.file.clone(),
1515 name: entry.chunk.name.clone(),
1516 kind: entry.chunk.kind.clone(),
1517 start_line: entry.chunk.start_line,
1518 end_line: entry.chunk.end_line,
1519 exported: entry.chunk.exported,
1520 snippet: entry.chunk.snippet.clone(),
1521 score,
1522 source: "semantic",
1523 }
1524 })
1525 .collect()
1526 }
1527
1528 pub fn len(&self) -> usize {
1530 self.entries.len()
1531 }
1532
1533 pub fn is_file_stale(&self, file: &Path) -> bool {
1535 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1536 return true;
1537 };
1538 let Some(stored_size) = self.file_sizes.get(file) else {
1539 return true;
1540 };
1541 let Some(stored_hash) = self.file_hashes.get(file) else {
1542 return true;
1543 };
1544 let cached = FileFreshness {
1545 mtime: *stored_mtime,
1546 size: *stored_size,
1547 content_hash: *stored_hash,
1548 };
1549 match cache_freshness::verify_file(file, &cached) {
1550 FreshnessVerdict::HotFresh => false,
1551 FreshnessVerdict::ContentFresh { .. } => false,
1552 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1553 }
1554 }
1555
1556 fn backfill_missing_file_sizes(&mut self) {
1557 for path in self.file_mtimes.keys() {
1558 if self.file_sizes.contains_key(path) {
1559 continue;
1560 }
1561 if let Ok(metadata) = fs::metadata(path) {
1562 self.file_sizes.insert(path.clone(), metadata.len());
1563 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1564 self.file_hashes.insert(path.clone(), hash);
1565 }
1566 }
1567 }
1568 }
1569
1570 pub fn remove_file(&mut self, file: &Path) {
1572 self.invalidate_file(file);
1573 }
1574
1575 pub fn invalidate_file(&mut self, file: &Path) {
1576 self.entries.retain(|e| e.chunk.file != file);
1577 self.file_mtimes.remove(file);
1578 self.file_sizes.remove(file);
1579 self.file_hashes.remove(file);
1580 }
1581
1582 pub fn dimension(&self) -> usize {
1584 self.dimension
1585 }
1586
1587 pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
1588 self.fingerprint.as_ref()
1589 }
1590
1591 pub fn backend_label(&self) -> Option<&str> {
1592 self.fingerprint.as_ref().map(|f| f.backend.as_str())
1593 }
1594
1595 pub fn model_label(&self) -> Option<&str> {
1596 self.fingerprint.as_ref().map(|f| f.model.as_str())
1597 }
1598
1599 pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
1600 self.fingerprint = Some(fingerprint);
1601 }
1602
1603 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1605 if self.entries.is_empty() {
1608 slog_info!("skipping semantic index persistence (0 entries)");
1609 return;
1610 }
1611 let dir = storage_dir.join("semantic").join(project_key);
1612 if let Err(e) = fs::create_dir_all(&dir) {
1613 slog_warn!("failed to create semantic cache dir: {}", e);
1614 return;
1615 }
1616 let data_path = dir.join("semantic.bin");
1617 let tmp_path = dir.join(format!(
1618 "semantic.bin.tmp.{}.{}",
1619 std::process::id(),
1620 SystemTime::now()
1621 .duration_since(SystemTime::UNIX_EPOCH)
1622 .unwrap_or(Duration::ZERO)
1623 .as_nanos()
1624 ));
1625 let bytes = self.to_bytes();
1626 let write_result = (|| -> std::io::Result<()> {
1627 use std::io::Write;
1628 let mut file = fs::File::create(&tmp_path)?;
1629 file.write_all(&bytes)?;
1630 file.sync_all()?;
1631 Ok(())
1632 })();
1633 if let Err(e) = write_result {
1634 slog_warn!("failed to write semantic index: {}", e);
1635 let _ = fs::remove_file(&tmp_path);
1636 return;
1637 }
1638 if let Err(e) = fs::rename(&tmp_path, &data_path) {
1639 slog_warn!("failed to rename semantic index: {}", e);
1640 let _ = fs::remove_file(&tmp_path);
1641 return;
1642 }
1643 slog_info!(
1644 "semantic index persisted: {} entries, {:.1} KB",
1645 self.entries.len(),
1646 bytes.len() as f64 / 1024.0
1647 );
1648 }
1649
1650 pub fn read_from_disk(
1652 storage_dir: &Path,
1653 project_key: &str,
1654 current_canonical_root: &Path,
1655 is_worktree_bridge: bool,
1656 expected_fingerprint: Option<&str>,
1657 ) -> Option<Self> {
1658 debug_assert!(current_canonical_root.is_absolute());
1659 let data_path = storage_dir
1660 .join("semantic")
1661 .join(project_key)
1662 .join("semantic.bin");
1663 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1664 if file_len < HEADER_BYTES_V1 {
1665 slog_warn!(
1666 "corrupt semantic index (too small: {} bytes), removing",
1667 file_len
1668 );
1669 if !is_worktree_bridge {
1670 let _ = fs::remove_file(&data_path);
1671 }
1672 return None;
1673 }
1674
1675 let bytes = fs::read(&data_path).ok()?;
1676 let version = bytes[0];
1677 if version != SEMANTIC_INDEX_VERSION_V6 {
1678 slog_info!(
1679 "cached semantic index version {} is older than {}, rebuilding",
1680 version,
1681 SEMANTIC_INDEX_VERSION_V6
1682 );
1683 if !is_worktree_bridge {
1684 let _ = fs::remove_file(&data_path);
1685 }
1686 return None;
1687 }
1688 match Self::from_bytes(&bytes, current_canonical_root) {
1689 Ok(index) => {
1690 if index.entries.is_empty() {
1691 slog_info!("cached semantic index is empty, will rebuild");
1692 if !is_worktree_bridge {
1693 let _ = fs::remove_file(&data_path);
1694 }
1695 return None;
1696 }
1697 if let Some(expected) = expected_fingerprint {
1698 let matches = index
1699 .fingerprint()
1700 .map(|fingerprint| fingerprint.matches_expected(expected))
1701 .unwrap_or(false);
1702 if !matches {
1703 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1704 if !is_worktree_bridge {
1705 let _ = fs::remove_file(&data_path);
1706 }
1707 return None;
1708 }
1709 }
1710 slog_info!(
1711 "loaded semantic index from disk: {} entries",
1712 index.entries.len()
1713 );
1714 Some(index)
1715 }
1716 Err(e) => {
1717 slog_warn!("corrupt semantic index, rebuilding: {}", e);
1718 if !is_worktree_bridge {
1719 let _ = fs::remove_file(&data_path);
1720 }
1721 None
1722 }
1723 }
1724 }
1725
1726 pub fn to_bytes(&self) -> Vec<u8> {
1728 let mut buf = Vec::new();
1729 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1730 let encoded = fingerprint.as_string();
1731 if encoded.is_empty() {
1732 None
1733 } else {
1734 Some(encoded.into_bytes())
1735 }
1736 });
1737 let file_mtimes: Vec<_> = self
1738 .file_mtimes
1739 .iter()
1740 .filter_map(|(path, mtime)| {
1741 cache_relative_path(&self.project_root, path)
1742 .map(|relative| (relative, path, mtime))
1743 })
1744 .collect();
1745 let entries: Vec<_> = self
1746 .entries
1747 .iter()
1748 .filter_map(|entry| {
1749 cache_relative_path(&self.project_root, &entry.chunk.file)
1750 .map(|relative| (relative, entry))
1751 })
1752 .collect();
1753
1754 let version = SEMANTIC_INDEX_VERSION_V6;
1767 buf.push(version);
1768 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1769 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1770 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1771 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1772 buf.extend_from_slice(fp_bytes_ref);
1773
1774 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1777 for (relative, path, mtime) in &file_mtimes {
1778 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1779 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1780 buf.extend_from_slice(&path_bytes);
1781 let duration = mtime
1782 .duration_since(SystemTime::UNIX_EPOCH)
1783 .unwrap_or_default();
1784 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1785 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1786 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1787 buf.extend_from_slice(&size.to_le_bytes());
1788 let hash = self
1789 .file_hashes
1790 .get(*path)
1791 .copied()
1792 .unwrap_or_else(cache_freshness::zero_hash);
1793 buf.extend_from_slice(hash.as_bytes());
1794 }
1795
1796 for (relative, entry) in &entries {
1798 let c = &entry.chunk;
1799
1800 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1802 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1803 buf.extend_from_slice(&file_bytes);
1804
1805 let name_bytes = c.name.as_bytes();
1807 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1808 buf.extend_from_slice(name_bytes);
1809
1810 buf.push(symbol_kind_to_u8(&c.kind));
1812
1813 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1815 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1816 buf.push(c.exported as u8);
1817
1818 let snippet_bytes = c.snippet.as_bytes();
1820 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1821 buf.extend_from_slice(snippet_bytes);
1822
1823 let embed_bytes = c.embed_text.as_bytes();
1825 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1826 buf.extend_from_slice(embed_bytes);
1827
1828 for &val in &entry.vector {
1830 buf.extend_from_slice(&val.to_le_bytes());
1831 }
1832 }
1833
1834 buf
1835 }
1836
1837 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
1839 debug_assert!(current_canonical_root.is_absolute());
1840 let mut pos = 0;
1841
1842 if data.len() < HEADER_BYTES_V1 {
1843 return Err("data too short".to_string());
1844 }
1845
1846 let version = data[pos];
1847 pos += 1;
1848 if version != SEMANTIC_INDEX_VERSION_V1
1849 && version != SEMANTIC_INDEX_VERSION_V2
1850 && version != SEMANTIC_INDEX_VERSION_V3
1851 && version != SEMANTIC_INDEX_VERSION_V4
1852 && version != SEMANTIC_INDEX_VERSION_V5
1853 && version != SEMANTIC_INDEX_VERSION_V6
1854 {
1855 return Err(format!("unsupported version: {}", version));
1856 }
1857 if (version == SEMANTIC_INDEX_VERSION_V2
1861 || version == SEMANTIC_INDEX_VERSION_V3
1862 || version == SEMANTIC_INDEX_VERSION_V4
1863 || version == SEMANTIC_INDEX_VERSION_V5
1864 || version == SEMANTIC_INDEX_VERSION_V6)
1865 && data.len() < HEADER_BYTES_V2
1866 {
1867 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
1868 }
1869
1870 let dimension = read_u32(data, &mut pos)? as usize;
1871 let entry_count = read_u32(data, &mut pos)? as usize;
1872 validate_embedding_dimension(dimension)?;
1873 if entry_count > MAX_ENTRIES {
1874 return Err(format!("too many semantic index entries: {}", entry_count));
1875 }
1876
1877 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
1883 || version == SEMANTIC_INDEX_VERSION_V3
1884 || version == SEMANTIC_INDEX_VERSION_V4
1885 || version == SEMANTIC_INDEX_VERSION_V5
1886 || version == SEMANTIC_INDEX_VERSION_V6;
1887 let fingerprint = if has_fingerprint_field {
1888 let fingerprint_len = read_u32(data, &mut pos)? as usize;
1889 if pos + fingerprint_len > data.len() {
1890 return Err("unexpected end of data reading fingerprint".to_string());
1891 }
1892 if fingerprint_len == 0 {
1893 None
1894 } else {
1895 let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
1896 pos += fingerprint_len;
1897 Some(
1898 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
1899 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
1900 )
1901 }
1902 } else {
1903 None
1904 };
1905
1906 let mtime_count = read_u32(data, &mut pos)? as usize;
1908 if mtime_count > MAX_ENTRIES {
1909 return Err(format!("too many semantic file mtimes: {}", mtime_count));
1910 }
1911
1912 let vector_bytes = entry_count
1913 .checked_mul(dimension)
1914 .and_then(|count| count.checked_mul(F32_BYTES))
1915 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1916 if vector_bytes > data.len().saturating_sub(pos) {
1917 return Err("semantic index vectors exceed available data".to_string());
1918 }
1919
1920 let mut file_mtimes = HashMap::with_capacity(mtime_count);
1921 let mut file_sizes = HashMap::with_capacity(mtime_count);
1922 let mut file_hashes = HashMap::with_capacity(mtime_count);
1923 for _ in 0..mtime_count {
1924 let path = read_string(data, &mut pos)?;
1925 let secs = read_u64(data, &mut pos)?;
1926 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
1932 || version == SEMANTIC_INDEX_VERSION_V4
1933 || version == SEMANTIC_INDEX_VERSION_V5
1934 || version == SEMANTIC_INDEX_VERSION_V6
1935 {
1936 read_u32(data, &mut pos)?
1937 } else {
1938 0
1939 };
1940 let size =
1941 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
1942 read_u64(data, &mut pos)?
1943 } else {
1944 0
1945 };
1946 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
1947 if pos + 32 > data.len() {
1948 return Err("unexpected end of data reading content hash".to_string());
1949 }
1950 let mut hash_bytes = [0u8; 32];
1951 hash_bytes.copy_from_slice(&data[pos..pos + 32]);
1952 pos += 32;
1953 blake3::Hash::from_bytes(hash_bytes)
1954 } else {
1955 cache_freshness::zero_hash()
1956 };
1957 if nanos >= 1_000_000_000 {
1964 return Err(format!(
1965 "invalid semantic mtime: nanos {} >= 1_000_000_000",
1966 nanos
1967 ));
1968 }
1969 let duration = std::time::Duration::new(secs, nanos);
1970 let mtime = SystemTime::UNIX_EPOCH
1971 .checked_add(duration)
1972 .ok_or_else(|| {
1973 format!(
1974 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
1975 secs, nanos
1976 )
1977 })?;
1978 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
1979 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
1980 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
1981 } else {
1982 PathBuf::from(path)
1983 };
1984 file_mtimes.insert(path.clone(), mtime);
1985 file_sizes.insert(path.clone(), size);
1986 file_hashes.insert(path, content_hash);
1987 }
1988
1989 let mut entries = Vec::with_capacity(entry_count);
1991 for _ in 0..entry_count {
1992 let raw_file = PathBuf::from(read_string(data, &mut pos)?);
1993 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
1994 cached_path_under_root(current_canonical_root, &raw_file)
1995 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
1996 } else {
1997 raw_file
1998 };
1999 let name = read_string(data, &mut pos)?;
2000
2001 if pos >= data.len() {
2002 return Err("unexpected end of data".to_string());
2003 }
2004 let kind = u8_to_symbol_kind(data[pos]);
2005 pos += 1;
2006
2007 let start_line = read_u32(data, &mut pos)?;
2008 let end_line = read_u32(data, &mut pos)?;
2009
2010 if pos >= data.len() {
2011 return Err("unexpected end of data".to_string());
2012 }
2013 let exported = data[pos] != 0;
2014 pos += 1;
2015
2016 let snippet = read_string(data, &mut pos)?;
2017 let embed_text = read_string(data, &mut pos)?;
2018
2019 let vec_bytes = dimension
2021 .checked_mul(F32_BYTES)
2022 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2023 if pos + vec_bytes > data.len() {
2024 return Err("unexpected end of data reading vector".to_string());
2025 }
2026 let mut vector = Vec::with_capacity(dimension);
2027 for _ in 0..dimension {
2028 let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2029 vector.push(f32::from_le_bytes(bytes));
2030 pos += 4;
2031 }
2032
2033 entries.push(EmbeddingEntry {
2034 chunk: SemanticChunk {
2035 file,
2036 name,
2037 kind,
2038 start_line,
2039 end_line,
2040 exported,
2041 embed_text,
2042 snippet,
2043 },
2044 vector,
2045 });
2046 }
2047
2048 if entries.len() != entry_count {
2049 return Err(format!(
2050 "semantic cache entry count drift: header={} decoded={}",
2051 entry_count,
2052 entries.len()
2053 ));
2054 }
2055 for entry in &entries {
2056 if !file_mtimes.contains_key(&entry.chunk.file) {
2057 return Err(format!(
2058 "semantic cache metadata missing for entry file {}",
2059 entry.chunk.file.display()
2060 ));
2061 }
2062 }
2063
2064 Ok(Self {
2065 entries,
2066 file_mtimes,
2067 file_sizes,
2068 file_hashes,
2069 dimension,
2070 fingerprint,
2071 project_root: current_canonical_root.to_path_buf(),
2072 })
2073 }
2074}
2075
2076fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2078 let relative = file
2079 .strip_prefix(project_root)
2080 .unwrap_or(file)
2081 .to_string_lossy();
2082
2083 let kind_label = match &symbol.kind {
2084 SymbolKind::Function => "function",
2085 SymbolKind::Class => "class",
2086 SymbolKind::Method => "method",
2087 SymbolKind::Struct => "struct",
2088 SymbolKind::Interface => "interface",
2089 SymbolKind::Enum => "enum",
2090 SymbolKind::TypeAlias => "type",
2091 SymbolKind::Variable => "variable",
2092 SymbolKind::Heading => "heading",
2093 SymbolKind::FileSummary => "file-summary",
2094 };
2095
2096 let name = &symbol.name;
2098 let mut text = format!(
2099 "name:{name} file:{} kind:{} name:{name}",
2100 relative, kind_label
2101 );
2102
2103 if let Some(sig) = &symbol.signature {
2104 text.push_str(&format!(" signature:{}", sig));
2105 }
2106
2107 let lines: Vec<&str> = source.lines().collect();
2109 let start = (symbol.range.start_line as usize).min(lines.len());
2110 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2112 if start < end {
2113 let body: String = lines[start..end]
2114 .iter()
2115 .take(15) .copied()
2117 .collect::<Vec<&str>>()
2118 .join("\n");
2119 let snippet = if body.len() > 300 {
2120 format!("{}...", &body[..body.floor_char_boundary(300)])
2121 } else {
2122 body
2123 };
2124 text.push_str(&format!(" body:{}", snippet));
2125 }
2126
2127 text
2128}
2129
/// Returns the first `max_chars` characters of `value` (all of it when it is
/// shorter). Counting is per `char`, so multi-byte characters are never split.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        // `cut` is the byte offset of the first character to drop.
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2133
2134fn first_leading_doc_comment(source: &str) -> String {
2135 let lines: Vec<&str> = source.lines().collect();
2136 let Some((start, first)) = lines
2137 .iter()
2138 .enumerate()
2139 .find(|(_, line)| !line.trim().is_empty())
2140 else {
2141 return String::new();
2142 };
2143
2144 let trimmed = first.trim_start();
2145 if trimmed.starts_with("/**") {
2146 let mut comment = Vec::new();
2147 for line in lines.iter().skip(start) {
2148 comment.push(*line);
2149 if line.contains("*/") {
2150 break;
2151 }
2152 }
2153 return truncate_chars(&comment.join("\n"), 200);
2154 }
2155
2156 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2157 let comment = lines
2158 .iter()
2159 .skip(start)
2160 .take_while(|line| {
2161 let trimmed = line.trim_start();
2162 trimmed.starts_with("///") || trimmed.starts_with("//!")
2163 })
2164 .copied()
2165 .collect::<Vec<_>>()
2166 .join("\n");
2167 return truncate_chars(&comment, 200);
2168 }
2169
2170 String::new()
2171}
2172
2173pub fn build_file_summary_chunk(
2174 file: &Path,
2175 project_root: &Path,
2176 source: &str,
2177 top_exports: &[&str],
2178 top_export_signatures: &[Option<&str>],
2179) -> SemanticChunk {
2180 let relative = file.strip_prefix(project_root).unwrap_or(file);
2181 let rel_path = relative.to_string_lossy();
2182 let parent_dir = relative
2183 .parent()
2184 .map(|parent| parent.to_string_lossy().to_string())
2185 .unwrap_or_default();
2186 let name = file
2187 .file_stem()
2188 .map(|stem| stem.to_string_lossy().to_string())
2189 .unwrap_or_default();
2190 let doc = first_leading_doc_comment(source);
2191 let exports = top_exports
2192 .iter()
2193 .take(5)
2194 .copied()
2195 .collect::<Vec<_>>()
2196 .join(",");
2197 let snippet = if doc.is_empty() {
2198 top_export_signatures
2199 .first()
2200 .and_then(|signature| signature.as_deref())
2201 .map(|signature| truncate_chars(signature, 200))
2202 .unwrap_or_default()
2203 } else {
2204 doc.clone()
2205 };
2206
2207 SemanticChunk {
2208 file: file.to_path_buf(),
2209 name,
2210 kind: SymbolKind::FileSummary,
2211 start_line: 0,
2212 end_line: 0,
2213 exported: false,
2214 embed_text: format!(
2215 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2216 file.file_stem()
2217 .map(|stem| stem.to_string_lossy().to_string())
2218 .unwrap_or_default()
2219 ),
2220 snippet,
2221 }
2222}
2223
2224fn parser_for(
2225 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2226 lang: crate::parser::LangId,
2227) -> Result<&mut Parser, String> {
2228 use std::collections::hash_map::Entry;
2229
2230 match parsers.entry(lang) {
2231 Entry::Occupied(entry) => Ok(entry.into_mut()),
2232 Entry::Vacant(entry) => {
2233 let grammar = grammar_for(lang);
2234 let mut parser = Parser::new();
2235 parser
2236 .set_language(&grammar)
2237 .map_err(|error| error.to_string())?;
2238 Ok(entry.insert(parser))
2239 }
2240 }
2241}
2242
/// Tells whether `path` has one of the file extensions that get semantically
/// indexed. The comparison is case-sensitive; paths without an extension, or
/// with a non-UTF-8 extension, are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2270
2271fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2272 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2273 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2274 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2275 .map_err(|error| error.to_string())?
2276 .unwrap_or_else(cache_freshness::zero_hash);
2277 Ok(IndexedFileMetadata {
2278 mtime,
2279 size: metadata.len(),
2280 content_hash,
2281 })
2282}
2283
2284fn collect_file_chunks(
2285 project_root: &Path,
2286 file: &Path,
2287 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2288) -> Result<Vec<SemanticChunk>, String> {
2289 if !is_semantic_indexed_extension(file) {
2290 return Err("unsupported file extension".to_string());
2291 }
2292 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2293 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2294 let tree = parser_for(parsers, lang)?
2295 .parse(&source, None)
2296 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2297 let symbols =
2298 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2299
2300 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2301}
2302
2303fn build_snippet(symbol: &Symbol, source: &str) -> String {
2305 let lines: Vec<&str> = source.lines().collect();
2306 let start = (symbol.range.start_line as usize).min(lines.len());
2307 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2309 if start < end {
2310 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2311 let mut snippet = snippet_lines.join("\n");
2312 if end - start > 5 {
2313 snippet.push_str("\n ...");
2314 }
2315 if snippet.len() > 300 {
2316 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2317 }
2318 snippet
2319 } else {
2320 String::new()
2321 }
2322}
2323
2324fn symbols_to_chunks(
2326 file: &Path,
2327 symbols: &[Symbol],
2328 source: &str,
2329 project_root: &Path,
2330) -> Vec<SemanticChunk> {
2331 let mut chunks = Vec::new();
2332 let top_exports_with_signatures = symbols
2333 .iter()
2334 .filter(|symbol| {
2335 symbol.exported
2336 && symbol.parent.is_none()
2337 && !matches!(symbol.kind, SymbolKind::Heading)
2338 })
2339 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2340 .collect::<Vec<_>>();
2341
2342 let has_only_headings = !symbols.is_empty()
2343 && symbols
2344 .iter()
2345 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2346 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2347 let top_exports = top_exports_with_signatures
2348 .iter()
2349 .map(|(name, _)| *name)
2350 .collect::<Vec<_>>();
2351 let top_export_signatures = top_exports_with_signatures
2352 .iter()
2353 .map(|(_, signature)| *signature)
2354 .collect::<Vec<_>>();
2355 chunks.push(build_file_summary_chunk(
2356 file,
2357 project_root,
2358 source,
2359 &top_exports,
2360 &top_export_signatures,
2361 ));
2362 }
2363
2364 for symbol in symbols {
2365 if matches!(symbol.kind, SymbolKind::Heading) {
2370 continue;
2371 }
2372
2373 let line_count = symbol
2375 .range
2376 .end_line
2377 .saturating_sub(symbol.range.start_line)
2378 + 1;
2379 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2380 continue;
2381 }
2382
2383 let embed_text = build_embed_text(symbol, source, file, project_root);
2384 let snippet = build_snippet(symbol, source);
2385
2386 chunks.push(SemanticChunk {
2387 file: file.to_path_buf(),
2388 name: symbol.name.clone(),
2389 kind: symbol.kind.clone(),
2390 start_line: symbol.range.start_line,
2391 end_line: symbol.range.end_line,
2392 exported: symbol.exported,
2393 embed_text,
2394 snippet,
2395 });
2396
2397 }
2400
2401 chunks
2402}
2403
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the slices differ in length or when the product of
/// their norms is zero (including when both are empty).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    match norm_a.sqrt() * norm_b.sqrt() {
        denom if denom == 0.0 => 0.0,
        denom => dot / denom,
    }
}
2427
2428fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2430 match kind {
2431 SymbolKind::Function => 0,
2432 SymbolKind::Class => 1,
2433 SymbolKind::Method => 2,
2434 SymbolKind::Struct => 3,
2435 SymbolKind::Interface => 4,
2436 SymbolKind::Enum => 5,
2437 SymbolKind::TypeAlias => 6,
2438 SymbolKind::Variable => 7,
2439 SymbolKind::Heading => 8,
2440 SymbolKind::FileSummary => 9,
2441 }
2442}
2443
2444fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2445 match v {
2446 0 => SymbolKind::Function,
2447 1 => SymbolKind::Class,
2448 2 => SymbolKind::Method,
2449 3 => SymbolKind::Struct,
2450 4 => SymbolKind::Interface,
2451 5 => SymbolKind::Enum,
2452 6 => SymbolKind::TypeAlias,
2453 7 => SymbolKind::Variable,
2454 8 => SymbolKind::Heading,
2455 9 => SymbolKind::FileSummary,
2456 _ => SymbolKind::Heading,
2457 }
2458}
2459
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by 4.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes are
/// available at `*pos`. A cursor so large that `*pos + 4` would overflow is
/// reported the same way instead of panicking or wrapping.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    const WIDTH: usize = std::mem::size_of::<u32>();

    let start = *pos;
    // `checked_add` guards the cursor arithmetic; `get` guards the slice bounds.
    let field = start
        .checked_add(WIDTH)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut bytes = [0u8; WIDTH];
    bytes.copy_from_slice(field);
    *pos = start + WIDTH;
    Ok(u32::from_le_bytes(bytes))
}
2468
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by 8.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes
/// are available at `*pos`. A cursor so large that `*pos + 8` would overflow
/// is reported the same way instead of panicking or wrapping.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    const WIDTH: usize = std::mem::size_of::<u64>();

    let start = *pos;
    // `checked_add` guards the cursor arithmetic; `get` guards the slice bounds.
    let field = start
        .checked_add(WIDTH)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut bytes = [0u8; WIDTH];
    bytes.copy_from_slice(field);
    *pos = start + WIDTH;
    Ok(u64::from_le_bytes(bytes))
}
2477
2478fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2479 let len = read_u32(data, pos)? as usize;
2480 if *pos + len > data.len() {
2481 return Err("unexpected end of data reading string".to_string());
2482 }
2483 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2484 *pos += len;
2485 Ok(s)
2486}
2487
2488#[cfg(test)]
2489mod tests {
2490 use super::*;
2491 use crate::config::{SemanticBackend, SemanticBackendConfig};
2492 use crate::parser::FileParser;
2493 use std::io::{Read, Write};
2494 use std::net::TcpListener;
2495 use std::thread;
2496
2497 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2498 where
2499 F: Fn(String, String, String) -> String + Send + 'static,
2500 {
2501 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2502 let addr = listener.local_addr().expect("local addr");
2503 let handle = thread::spawn(move || {
2504 let (mut stream, _) = listener.accept().expect("accept request");
2505 let mut buf = Vec::new();
2506 let mut chunk = [0u8; 4096];
2507 let mut header_end = None;
2508 let mut content_length = 0usize;
2509 loop {
2510 let n = stream.read(&mut chunk).expect("read request");
2511 if n == 0 {
2512 break;
2513 }
2514 buf.extend_from_slice(&chunk[..n]);
2515 if header_end.is_none() {
2516 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2517 header_end = Some(pos + 4);
2518 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2519 for line in headers.lines() {
2520 if let Some(value) = line.strip_prefix("Content-Length:") {
2521 content_length = value.trim().parse::<usize>().unwrap_or(0);
2522 }
2523 }
2524 }
2525 }
2526 if let Some(end) = header_end {
2527 if buf.len() >= end + content_length {
2528 break;
2529 }
2530 }
2531 }
2532
2533 let end = header_end.expect("header terminator");
2534 let request = String::from_utf8_lossy(&buf[..end]).to_string();
2535 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2536 let mut lines = request.lines();
2537 let request_line = lines.next().expect("request line").to_string();
2538 let path = request_line
2539 .split_whitespace()
2540 .nth(1)
2541 .expect("request path")
2542 .to_string();
2543 let response_body = handler(request_line, path, body);
2544 let response = format!(
2545 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2546 response_body.len(),
2547 response_body
2548 );
2549 stream
2550 .write_all(response.as_bytes())
2551 .expect("write response");
2552 });
2553
2554 (format!("http://{}", addr), handle)
2555 }
2556
2557 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2558 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2559 }
2560
2561 fn write_rust_file(path: &Path, function_name: &str) {
2562 fs::write(
2563 path,
2564 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
2565 )
2566 .unwrap();
2567 }
2568
2569 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2570 let mut embed = test_vector_for_texts;
2571 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2572 }
2573
2574 fn test_project_root() -> PathBuf {
2575 std::env::current_dir().unwrap()
2576 }
2577
2578 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2579 index.file_mtimes.insert(file.to_path_buf(), mtime);
2580 index.file_sizes.insert(file.to_path_buf(), size);
2581 index
2582 .file_hashes
2583 .insert(file.to_path_buf(), cache_freshness::zero_hash());
2584 }
2585
2586 #[test]
2587 fn semantic_cache_serialization_skips_paths_outside_project_root() {
2588 let dir = tempfile::tempdir().expect("create temp dir");
2589 let project = fs::canonicalize(dir.path()).expect("canonical project");
2590 let outside = project.join("..").join("outside.rs");
2591 let mut index = SemanticIndex::new(project.clone(), 3);
2592 index
2593 .file_mtimes
2594 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2595 index.file_sizes.insert(outside.clone(), 1);
2596 index
2597 .file_hashes
2598 .insert(outside.clone(), cache_freshness::zero_hash());
2599 index.entries.push(EmbeddingEntry {
2600 chunk: SemanticChunk {
2601 file: outside,
2602 name: "outside".to_string(),
2603 kind: SymbolKind::Function,
2604 start_line: 0,
2605 end_line: 0,
2606 exported: false,
2607 embed_text: "outside".to_string(),
2608 snippet: "outside".to_string(),
2609 },
2610 vector: vec![1.0, 0.0, 0.0],
2611 });
2612
2613 let bytes = index.to_bytes();
2614 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2615 assert_eq!(loaded.entries.len(), 0);
2616 assert!(loaded.file_mtimes.is_empty());
2617 }
2618
2619 #[test]
2620 fn test_cosine_similarity_identical() {
2621 let a = vec![1.0, 0.0, 0.0];
2622 let b = vec![1.0, 0.0, 0.0];
2623 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2624 }
2625
2626 #[test]
2627 fn test_cosine_similarity_orthogonal() {
2628 let a = vec![1.0, 0.0, 0.0];
2629 let b = vec![0.0, 1.0, 0.0];
2630 assert!(cosine_similarity(&a, &b).abs() < 0.001);
2631 }
2632
2633 #[test]
2634 fn test_cosine_similarity_opposite() {
2635 let a = vec![1.0, 0.0, 0.0];
2636 let b = vec![-1.0, 0.0, 0.0];
2637 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2638 }
2639
2640 #[test]
2641 fn test_serialization_roundtrip() {
2642 let project_root = test_project_root();
2643 let file = project_root.join("src/main.rs");
2644 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2645 index.entries.push(EmbeddingEntry {
2646 chunk: SemanticChunk {
2647 file: file.clone(),
2648 name: "handle_request".to_string(),
2649 kind: SymbolKind::Function,
2650 start_line: 10,
2651 end_line: 25,
2652 exported: true,
2653 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2654 snippet: "fn handle_request() {\n // ...\n}".to_string(),
2655 },
2656 vector: vec![0.1, 0.2, 0.3, 0.4],
2657 });
2658 index.dimension = 4;
2659 index
2660 .file_mtimes
2661 .insert(file.clone(), SystemTime::UNIX_EPOCH);
2662 index.file_sizes.insert(file, 0);
2663 index.set_fingerprint(SemanticIndexFingerprint {
2664 backend: "fastembed".to_string(),
2665 model: "all-MiniLM-L6-v2".to_string(),
2666 base_url: FALLBACK_BACKEND.to_string(),
2667 dimension: 4,
2668 chunking_version: default_chunking_version(),
2669 });
2670
2671 let bytes = index.to_bytes();
2672 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2673
2674 assert_eq!(restored.entries.len(), 1);
2675 assert_eq!(restored.entries[0].chunk.name, "handle_request");
2676 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2677 assert_eq!(restored.dimension, 4);
2678 assert_eq!(restored.backend_label(), Some("fastembed"));
2679 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2680 }
2681
2682 #[test]
2683 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2684 let cases = [
2685 (SymbolKind::Function, 0),
2686 (SymbolKind::Class, 1),
2687 (SymbolKind::Method, 2),
2688 (SymbolKind::Struct, 3),
2689 (SymbolKind::Interface, 4),
2690 (SymbolKind::Enum, 5),
2691 (SymbolKind::TypeAlias, 6),
2692 (SymbolKind::Variable, 7),
2693 (SymbolKind::Heading, 8),
2694 (SymbolKind::FileSummary, 9),
2695 ];
2696
2697 for (kind, encoded) in cases {
2698 assert_eq!(symbol_kind_to_u8(&kind), encoded);
2699 assert_eq!(u8_to_symbol_kind(encoded), kind);
2700 }
2701 }
2702
2703 #[test]
2704 fn test_search_top_k() {
2705 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2706 index.dimension = 3;
2707
2708 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2710 let mut vec = vec![0.0f32; 3];
2711 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
2713 chunk: SemanticChunk {
2714 file: PathBuf::from("/src/lib.rs"),
2715 name: name.to_string(),
2716 kind: SymbolKind::Function,
2717 start_line: (i * 10 + 1) as u32,
2718 end_line: (i * 10 + 5) as u32,
2719 exported: true,
2720 embed_text: format!("kind:function name:{}", name),
2721 snippet: format!("fn {}() {{}}", name),
2722 },
2723 vector: vec,
2724 });
2725 }
2726
2727 let query = vec![0.9, 0.1, 0.0];
2729 let results = index.search(&query, 2);
2730
2731 assert_eq!(results.len(), 2);
2732 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
2734 }
2735
2736 #[test]
2737 fn test_empty_index_search() {
2738 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2739 let results = index.search(&[0.1, 0.2, 0.3], 10);
2740 assert!(results.is_empty());
2741 }
2742
2743 #[test]
2744 fn single_line_symbol_builds_non_empty_snippet() {
2745 let symbol = Symbol {
2746 name: "answer".to_string(),
2747 kind: SymbolKind::Variable,
2748 range: crate::symbols::Range {
2749 start_line: 0,
2750 start_col: 0,
2751 end_line: 0,
2752 end_col: 24,
2753 },
2754 signature: Some("const answer = 42".to_string()),
2755 scope_chain: Vec::new(),
2756 exported: true,
2757 parent: None,
2758 };
2759 let source = "export const answer = 42;\n";
2760
2761 let snippet = build_snippet(&symbol, source);
2762
2763 assert_eq!(snippet, "export const answer = 42;");
2764 }
2765
2766 #[test]
2767 fn optimized_file_chunk_collection_matches_file_parser_path() {
2768 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2769 let file = project_root.join("src/semantic_index.rs");
2770 let source = std::fs::read_to_string(&file).unwrap();
2771
2772 let mut legacy_parser = FileParser::new();
2773 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2774 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2775
2776 let mut parsers = HashMap::new();
2777 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2778
2779 assert_eq!(
2780 chunk_fingerprint(&optimized_chunks),
2781 chunk_fingerprint(&legacy_chunks)
2782 );
2783 }
2784
2785 fn chunk_fingerprint(
2786 chunks: &[SemanticChunk],
2787 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2788 chunks
2789 .iter()
2790 .map(|chunk| {
2791 (
2792 chunk.name.clone(),
2793 chunk.kind.clone(),
2794 chunk.start_line,
2795 chunk.end_line,
2796 chunk.exported,
2797 chunk.embed_text.clone(),
2798 chunk.snippet.clone(),
2799 )
2800 })
2801 .collect()
2802 }
2803
2804 #[test]
2805 fn rejects_oversized_dimension_during_deserialization() {
2806 let mut bytes = Vec::new();
2807 bytes.push(1u8);
2808 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2809 bytes.extend_from_slice(&0u32.to_le_bytes());
2810 bytes.extend_from_slice(&0u32.to_le_bytes());
2811
2812 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2813 }
2814
2815 #[test]
2816 fn rejects_oversized_entry_count_during_deserialization() {
2817 let mut bytes = Vec::new();
2818 bytes.push(1u8);
2819 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2820 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2821 bytes.extend_from_slice(&0u32.to_le_bytes());
2822
2823 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2824 }
2825
2826 #[test]
2827 fn invalidate_file_removes_entries_and_mtime() {
2828 let target = PathBuf::from("/src/main.rs");
2829 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2830 index.entries.push(EmbeddingEntry {
2831 chunk: SemanticChunk {
2832 file: target.clone(),
2833 name: "main".to_string(),
2834 kind: SymbolKind::Function,
2835 start_line: 0,
2836 end_line: 1,
2837 exported: false,
2838 embed_text: "main".to_string(),
2839 snippet: "fn main() {}".to_string(),
2840 },
2841 vector: vec![1.0; DEFAULT_DIMENSION],
2842 });
2843 index
2844 .file_mtimes
2845 .insert(target.clone(), SystemTime::UNIX_EPOCH);
2846 index.file_sizes.insert(target.clone(), 0);
2847
2848 index.invalidate_file(&target);
2849
2850 assert!(index.entries.is_empty());
2851 assert!(!index.file_mtimes.contains_key(&target));
2852 assert!(!index.file_sizes.contains_key(&target));
2853 }
2854
2855 #[test]
2856 fn refresh_missing_changed_file_is_purged_after_collect() {
2857 let temp = tempfile::tempdir().unwrap();
2858 let project_root = temp.path();
2859 let file = project_root.join("src/lib.rs");
2860 fs::create_dir_all(file.parent().unwrap()).unwrap();
2861 write_rust_file(&file, "vanished_symbol");
2862
2863 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2864 let original_size = *index.file_sizes.get(&file).unwrap();
2865 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
2866 fs::remove_file(&file).unwrap();
2867
2868 let mut embed = test_vector_for_texts;
2869 let mut progress = |_done: usize, _total: usize| {};
2870 let summary = index
2871 .refresh_stale_files(
2872 project_root,
2873 std::slice::from_ref(&file),
2874 &mut embed,
2875 8,
2876 &mut progress,
2877 )
2878 .unwrap();
2879
2880 assert_eq!(summary.changed, 0);
2881 assert_eq!(summary.added, 0);
2882 assert_eq!(summary.deleted, 1);
2883 assert!(index.entries.is_empty());
2884 assert!(!index.file_mtimes.contains_key(&file));
2885 assert!(!index.file_sizes.contains_key(&file));
2886 assert!(!index.file_hashes.contains_key(&file));
2887 }
2888
2889 #[test]
2890 fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
2891 let temp = tempfile::tempdir().unwrap();
2892 let project_root = temp.path();
2893 let file = project_root.join("src/lib.rs");
2894 fs::create_dir_all(file.parent().unwrap()).unwrap();
2895 write_rust_file(&file, "kept_symbol");
2896
2897 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2898 let original_entry_count = index.entries.len();
2899 let original_mtime = *index.file_mtimes.get(&file).unwrap();
2900 let original_size = *index.file_sizes.get(&file).unwrap();
2901
2902 let stale_mtime = SystemTime::UNIX_EPOCH;
2903 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2904 fs::remove_file(&file).unwrap();
2905 fs::create_dir(&file).unwrap();
2906
2907 let mut embed = test_vector_for_texts;
2908 let mut progress = |_done: usize, _total: usize| {};
2909 let summary = index
2910 .refresh_stale_files(
2911 project_root,
2912 std::slice::from_ref(&file),
2913 &mut embed,
2914 8,
2915 &mut progress,
2916 )
2917 .unwrap();
2918
2919 assert_eq!(summary.changed, 0);
2920 assert_eq!(summary.added, 0);
2921 assert_eq!(summary.deleted, 0);
2922 assert_eq!(index.entries.len(), original_entry_count);
2923 assert!(index
2924 .entries
2925 .iter()
2926 .any(|entry| entry.chunk.name == "kept_symbol"));
2927 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2928 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2929 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2930 }
2931
2932 #[test]
2933 fn refresh_never_indexed_file_error_does_not_record_mtime() {
2934 let temp = tempfile::tempdir().unwrap();
2935 let project_root = temp.path();
2936 let missing = project_root.join("src/missing.rs");
2937 fs::create_dir_all(missing.parent().unwrap()).unwrap();
2938
2939 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2940 let mut embed = test_vector_for_texts;
2941 let mut progress = |_done: usize, _total: usize| {};
2942 let summary = index
2943 .refresh_stale_files(
2944 project_root,
2945 std::slice::from_ref(&missing),
2946 &mut embed,
2947 8,
2948 &mut progress,
2949 )
2950 .unwrap();
2951
2952 assert_eq!(summary.added, 0);
2953 assert_eq!(summary.changed, 0);
2954 assert_eq!(summary.deleted, 0);
2955 assert!(!index.file_mtimes.contains_key(&missing));
2956 assert!(!index.file_sizes.contains_key(&missing));
2957 assert!(index.entries.is_empty());
2958 }
2959
2960 #[test]
2961 fn refresh_reports_added_for_new_files() {
2962 let temp = tempfile::tempdir().unwrap();
2963 let project_root = temp.path();
2964 let existing = project_root.join("src/lib.rs");
2965 let added = project_root.join("src/new.rs");
2966 fs::create_dir_all(existing.parent().unwrap()).unwrap();
2967 write_rust_file(&existing, "existing_symbol");
2968 write_rust_file(&added, "added_symbol");
2969
2970 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2971 let mut embed = test_vector_for_texts;
2972 let mut progress = |_done: usize, _total: usize| {};
2973 let summary = index
2974 .refresh_stale_files(
2975 project_root,
2976 &[existing.clone(), added.clone()],
2977 &mut embed,
2978 8,
2979 &mut progress,
2980 )
2981 .unwrap();
2982
2983 assert_eq!(summary.added, 1);
2984 assert_eq!(summary.changed, 0);
2985 assert_eq!(summary.deleted, 0);
2986 assert_eq!(summary.total_processed, 2);
2987 assert!(index.file_mtimes.contains_key(&added));
2988 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2989 }
2990
2991 #[test]
2992 fn refresh_reports_deleted_for_removed_files() {
2993 let temp = tempfile::tempdir().unwrap();
2994 let project_root = temp.path();
2995 let deleted = project_root.join("src/deleted.rs");
2996 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2997 write_rust_file(&deleted, "deleted_symbol");
2998
2999 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
3000 fs::remove_file(&deleted).unwrap();
3001
3002 let mut embed = test_vector_for_texts;
3003 let mut progress = |_done: usize, _total: usize| {};
3004 let summary = index
3005 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
3006 .unwrap();
3007
3008 assert_eq!(summary.deleted, 1);
3009 assert_eq!(summary.changed, 0);
3010 assert_eq!(summary.added, 0);
3011 assert_eq!(summary.total_processed, 1);
3012 assert!(!index.file_mtimes.contains_key(&deleted));
3013 assert!(index.entries.is_empty());
3014 }
3015
3016 #[test]
3017 fn refresh_reports_changed_for_modified_files() {
3018 let temp = tempfile::tempdir().unwrap();
3019 let project_root = temp.path();
3020 let file = project_root.join("src/lib.rs");
3021 fs::create_dir_all(file.parent().unwrap()).unwrap();
3022 write_rust_file(&file, "old_symbol");
3023
3024 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3025 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
3026 write_rust_file(&file, "new_symbol");
3027
3028 let mut embed = test_vector_for_texts;
3029 let mut progress = |_done: usize, _total: usize| {};
3030 let summary = index
3031 .refresh_stale_files(
3032 project_root,
3033 std::slice::from_ref(&file),
3034 &mut embed,
3035 8,
3036 &mut progress,
3037 )
3038 .unwrap();
3039
3040 assert_eq!(summary.changed, 1);
3041 assert_eq!(summary.added, 0);
3042 assert_eq!(summary.deleted, 0);
3043 assert_eq!(summary.total_processed, 1);
3044 assert!(index
3045 .entries
3046 .iter()
3047 .any(|entry| entry.chunk.name == "new_symbol"));
3048 assert!(!index
3049 .entries
3050 .iter()
3051 .any(|entry| entry.chunk.name == "old_symbol"));
3052 }
3053
3054 #[test]
3055 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
3056 let temp = tempfile::tempdir().unwrap();
3057 let project_root = temp.path();
3058 let file = project_root.join("src/lib.rs");
3059 fs::create_dir_all(file.parent().unwrap()).unwrap();
3060 write_rust_file(&file, "clean_symbol");
3061
3062 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3063 let original_entries = index.entries.len();
3064 let mut embed_called = false;
3065 let mut embed = |texts: Vec<String>| {
3066 embed_called = true;
3067 test_vector_for_texts(texts)
3068 };
3069 let mut progress = |_done: usize, _total: usize| {};
3070 let summary = index
3071 .refresh_stale_files(
3072 project_root,
3073 std::slice::from_ref(&file),
3074 &mut embed,
3075 8,
3076 &mut progress,
3077 )
3078 .unwrap();
3079
3080 assert!(summary.is_noop());
3081 assert_eq!(summary.total_processed, 1);
3082 assert!(!embed_called);
3083 assert_eq!(index.entries.len(), original_entries);
3084 }
3085
3086 #[test]
3087 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3088 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3089
3090 assert!(is_onnx_runtime_unavailable(message));
3091 }
3092
3093 #[test]
3094 fn formats_missing_onnx_runtime_with_install_hint() {
3095 let message = format_embedding_init_error(
3096 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3097 );
3098
3099 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3100 assert!(message.contains("Original error:"));
3101 }
3102
3103 #[test]
3104 fn openai_compatible_backend_embeds_with_mock_server() {
3105 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3106 assert!(request_line.starts_with("POST "));
3107 assert_eq!(path, "/v1/embeddings");
3108 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3109 });
3110
3111 let config = SemanticBackendConfig {
3112 backend: SemanticBackend::OpenAiCompatible,
3113 model: "test-embedding".to_string(),
3114 base_url: Some(base_url),
3115 api_key_env: None,
3116 timeout_ms: 5_000,
3117 max_batch_size: 64,
3118 };
3119
3120 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3121 let vectors = model
3122 .embed(vec!["hello".to_string(), "world".to_string()])
3123 .unwrap();
3124
3125 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3126 handle.join().unwrap();
3127 }
3128
3129 #[test]
3139 fn openai_compatible_request_has_single_content_type_header() {
3140 use std::sync::{Arc, Mutex};
3141 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3142 let captured_for_thread = Arc::clone(&captured);
3143
3144 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3145 let addr = listener.local_addr().expect("local addr");
3146 let handle = thread::spawn(move || {
3147 let (mut stream, _) = listener.accept().expect("accept");
3148 let mut buf = Vec::new();
3149 let mut chunk = [0u8; 4096];
3150 let mut header_end = None;
3151 let mut content_length = 0usize;
3152 loop {
3153 let n = stream.read(&mut chunk).expect("read");
3154 if n == 0 {
3155 break;
3156 }
3157 buf.extend_from_slice(&chunk[..n]);
3158 if header_end.is_none() {
3159 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3160 header_end = Some(pos + 4);
3161 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3162 if let Some(value) = line.strip_prefix("Content-Length:") {
3163 content_length = value.trim().parse::<usize>().unwrap_or(0);
3164 }
3165 }
3166 }
3167 }
3168 if let Some(end) = header_end {
3169 if buf.len() >= end + content_length {
3170 break;
3171 }
3172 }
3173 }
3174 *captured_for_thread.lock().unwrap() = buf;
3175 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3176 let response = format!(
3177 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3178 body.len(),
3179 body
3180 );
3181 let _ = stream.write_all(response.as_bytes());
3182 });
3183
3184 let config = SemanticBackendConfig {
3185 backend: SemanticBackend::OpenAiCompatible,
3186 model: "text-embedding-3-small".to_string(),
3187 base_url: Some(format!("http://{}", addr)),
3188 api_key_env: None,
3189 timeout_ms: 5_000,
3190 max_batch_size: 64,
3191 };
3192 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3193 let _ = model.embed(vec!["probe".to_string()]).unwrap();
3194 handle.join().unwrap();
3195
3196 let bytes = captured.lock().unwrap().clone();
3197 let request = String::from_utf8_lossy(&bytes);
3198
3199 let content_type_lines = request
3202 .lines()
3203 .filter(|line| {
3204 let lower = line.to_ascii_lowercase();
3205 lower.starts_with("content-type:")
3206 })
3207 .count();
3208 assert_eq!(
3209 content_type_lines, 1,
3210 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3211 );
3212
3213 assert!(
3216 request.contains(r#""model":"text-embedding-3-small""#),
3217 "request body should contain model field; full request:\n{request}",
3218 );
3219 }
3220
3221 #[test]
3222 fn ollama_backend_embeds_with_mock_server() {
3223 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3224 assert!(request_line.starts_with("POST "));
3225 assert_eq!(path, "/api/embed");
3226 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3227 });
3228
3229 let config = SemanticBackendConfig {
3230 backend: SemanticBackend::Ollama,
3231 model: "embeddinggemma".to_string(),
3232 base_url: Some(base_url),
3233 api_key_env: None,
3234 timeout_ms: 5_000,
3235 max_batch_size: 64,
3236 };
3237
3238 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3239 let vectors = model
3240 .embed(vec!["hello".to_string(), "world".to_string()])
3241 .unwrap();
3242
3243 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3244 handle.join().unwrap();
3245 }
3246
3247 #[test]
3248 fn read_from_disk_rejects_fingerprint_mismatch() {
3249 let storage = tempfile::tempdir().unwrap();
3250 let project_key = "proj";
3251
3252 let project_root = test_project_root();
3253 let file = project_root.join("src/main.rs");
3254 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3255 index.entries.push(EmbeddingEntry {
3256 chunk: SemanticChunk {
3257 file: file.clone(),
3258 name: "handle_request".to_string(),
3259 kind: SymbolKind::Function,
3260 start_line: 10,
3261 end_line: 25,
3262 exported: true,
3263 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3264 snippet: "fn handle_request() {}".to_string(),
3265 },
3266 vector: vec![0.1, 0.2, 0.3],
3267 });
3268 index.dimension = 3;
3269 index
3270 .file_mtimes
3271 .insert(file.clone(), SystemTime::UNIX_EPOCH);
3272 index.file_sizes.insert(file, 0);
3273 index.set_fingerprint(SemanticIndexFingerprint {
3274 backend: "openai_compatible".to_string(),
3275 model: "test-embedding".to_string(),
3276 base_url: "http://127.0.0.1:1234/v1".to_string(),
3277 dimension: 3,
3278 chunking_version: default_chunking_version(),
3279 });
3280 index.write_to_disk(storage.path(), project_key);
3281
3282 let matching = index.fingerprint().unwrap().as_string();
3283 assert!(SemanticIndex::read_from_disk(
3284 storage.path(),
3285 project_key,
3286 &project_root,
3287 false,
3288 Some(&matching),
3289 )
3290 .is_some());
3291
3292 let mismatched = SemanticIndexFingerprint {
3293 backend: "ollama".to_string(),
3294 model: "embeddinggemma".to_string(),
3295 base_url: "http://127.0.0.1:11434".to_string(),
3296 dimension: 3,
3297 chunking_version: default_chunking_version(),
3298 }
3299 .as_string();
3300 assert!(SemanticIndex::read_from_disk(
3301 storage.path(),
3302 project_key,
3303 &project_root,
3304 false,
3305 Some(&mismatched),
3306 )
3307 .is_none());
3308 }
3309
3310 #[test]
3311 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3312 let storage = tempfile::tempdir().unwrap();
3313 let project_key = "proj-v3";
3314 let dir = storage.path().join("semantic").join(project_key);
3315 fs::create_dir_all(&dir).unwrap();
3316
3317 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3318 index.entries.push(EmbeddingEntry {
3319 chunk: SemanticChunk {
3320 file: PathBuf::from("/src/main.rs"),
3321 name: "handle_request".to_string(),
3322 kind: SymbolKind::Function,
3323 start_line: 0,
3324 end_line: 0,
3325 exported: true,
3326 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3327 snippet: "fn handle_request() {}".to_string(),
3328 },
3329 vector: vec![0.1, 0.2, 0.3],
3330 });
3331 index.dimension = 3;
3332 index
3333 .file_mtimes
3334 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3335 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3336 let fingerprint = SemanticIndexFingerprint {
3337 backend: "fastembed".to_string(),
3338 model: "test".to_string(),
3339 base_url: FALLBACK_BACKEND.to_string(),
3340 dimension: 3,
3341 chunking_version: default_chunking_version(),
3342 };
3343 index.set_fingerprint(fingerprint.clone());
3344
3345 let mut bytes = index.to_bytes();
3346 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3347 fs::write(dir.join("semantic.bin"), bytes).unwrap();
3348
3349 assert!(SemanticIndex::read_from_disk(
3350 storage.path(),
3351 project_key,
3352 &test_project_root(),
3353 false,
3354 Some(&fingerprint.as_string())
3355 )
3356 .is_none());
3357 assert!(!dir.join("semantic.bin").exists());
3358 }
3359
3360 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3361 crate::symbols::Symbol {
3362 name: name.to_string(),
3363 kind,
3364 range: crate::symbols::Range {
3365 start_line: start,
3366 start_col: 0,
3367 end_line: end,
3368 end_col: 0,
3369 },
3370 signature: None,
3371 scope_chain: Vec::new(),
3372 exported: false,
3373 parent: None,
3374 }
3375 }
3376
3377 #[test]
3382 fn symbols_to_chunks_skips_heading_symbols() {
3383 let project_root = PathBuf::from("/proj");
3384 let file = project_root.join("README.md");
3385 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3386
3387 let symbols = vec![
3388 make_symbol(SymbolKind::Heading, "Title", 0, 2),
3389 make_symbol(SymbolKind::Heading, "Section", 4, 6),
3390 ];
3391
3392 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3393 assert!(
3394 chunks.is_empty(),
3395 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3396 chunks.len()
3397 );
3398 }
3399
3400 #[test]
3404 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3405 let project_root = PathBuf::from("/proj");
3406 let file = project_root.join("src/lib.rs");
3407 let source = "pub fn handle_request() -> bool {\n true\n}\n";
3408
3409 let symbols = vec![
3410 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3412 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3413 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3414 ];
3415
3416 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3417 assert_eq!(
3418 chunks.len(),
3419 3,
3420 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3421 chunks.len()
3422 );
3423 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3424 assert!(chunks
3425 .iter()
3426 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3427 assert!(names.contains(&"handle_request"));
3428 assert!(names.contains(&"AuthService"));
3429 assert!(
3430 !names.contains(&"doc heading"),
3431 "Heading symbol leaked into chunks: {names:?}"
3432 );
3433 }
3434
3435 #[test]
3436 fn validate_ssrf_allows_loopback_hostnames() {
3437 for host in &[
3440 "http://localhost",
3441 "http://localhost:8080",
3442 "http://localhost:11434", "http://localhost.localdomain",
3444 "http://foo.localhost",
3445 ] {
3446 assert!(
3447 validate_base_url_no_ssrf(host).is_ok(),
3448 "Expected {host} to be allowed (loopback), got: {:?}",
3449 validate_base_url_no_ssrf(host)
3450 );
3451 }
3452 }
3453
3454 #[test]
3455 fn validate_ssrf_allows_loopback_ips() {
3456 for url in &[
3459 "http://127.0.0.1",
3460 "http://127.0.0.1:11434", "http://127.0.0.1:8080",
3462 "http://127.1.2.3",
3463 ] {
3464 let result = validate_base_url_no_ssrf(url);
3465 assert!(
3466 result.is_ok(),
3467 "Expected {url} to be allowed (loopback), got: {:?}",
3468 result
3469 );
3470 }
3471 }
3472
3473 #[test]
3474 fn validate_ssrf_rejects_private_non_loopback_ips() {
3475 for url in &[
3480 "http://192.168.1.1",
3481 "http://10.0.0.1",
3482 "http://172.16.0.1",
3483 "http://169.254.169.254",
3484 "http://100.64.0.1",
3485 ] {
3486 let result = validate_base_url_no_ssrf(url);
3487 assert!(
3488 result.is_err(),
3489 "Expected {url} to be rejected (non-loopback private), got: {:?}",
3490 result
3491 );
3492 }
3493 }
3494
3495 #[test]
3496 fn validate_ssrf_rejects_mdns_local_hostnames() {
3497 for host in &[
3500 "http://printer.local",
3501 "http://nas.local:8080",
3502 "http://homelab.local",
3503 ] {
3504 let result = validate_base_url_no_ssrf(host);
3505 assert!(
3506 result.is_err(),
3507 "Expected {host} to be rejected (mDNS), got: {:?}",
3508 result
3509 );
3510 }
3511 }
3512
3513 #[test]
3514 fn normalize_base_url_allows_localhost_for_tests() {
3515 assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3518 assert!(normalize_base_url("http://localhost:8080").is_ok());
3519 }
3520
3521 #[test]
3528 fn ort_mismatch_message_recommends_auto_fix_first() {
3529 let msg =
3530 format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3531
3532 assert!(
3534 msg.contains("v1.9.0"),
3535 "should report detected version: {msg}"
3536 );
3537 assert!(
3538 msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3539 "should report system path: {msg}"
3540 );
3541 assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3542
3543 let auto_fix_pos = msg
3545 .find("Auto-fix")
3546 .expect("Auto-fix solution missing — users won't discover --fix");
3547 let remove_pos = msg
3548 .find("Remove the old library")
3549 .expect("system-rm solution missing");
3550 assert!(
3551 auto_fix_pos < remove_pos,
3552 "Auto-fix must come before manual rm — see PR comment thread"
3553 );
3554
3555 assert!(
3557 msg.contains("npx @cortexkit/aft doctor --fix"),
3558 "auto-fix command must be present and copy-pasteable: {msg}"
3559 );
3560 }
3561
3562 #[test]
3566 fn ort_mismatch_message_handles_macos_dylib_path() {
3567 let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3568 assert!(msg.contains("v1.9.0"));
3569 assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3570 assert!(
3574 msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3575 "system path should be quoted in the auto-fix sentence: {msg}"
3576 );
3577 }
3578}