1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded on an index that is built from zero chunks.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): presumably an upper bound on stored entries — usage is outside this section; confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): header sizes look tied to on-disk format versions 1 and 2 — usage is
// outside this section; confirm.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Hint prepended by `format_embedding_init_error` when ONNX Runtime could not be loaded.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags for the semantic index format.
// NOTE(review): what changed between versions is not visible in this section.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended to an OpenAI-compatible base URL (after `/v1`) to reach the embeddings endpoint.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is `0`.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is `0`.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which `embed_query_cached` starts evicting the oldest.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when none is configured or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes before giving up.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before each retry, indexed by the zero-based attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it takes the file lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
/// Holds the file lock on a project's semantic cache directory.
///
/// The wrapped `fs_lock::LockGuard` is stored but never read.
/// NOTE(review): presumably the lock is released when the guard is dropped — confirm
/// against `fs_lock`.
pub struct SemanticIndexLock {
    _guard: fs_lock::LockGuard,
}
66
67impl SemanticIndexLock {
68 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69 let dir = storage_dir.join("semantic").join(project_key);
70 fs::create_dir_all(&dir)?;
71 let path = dir.join("cache.lock");
72 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73 .lock()
74 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75 fs_lock::try_acquire(&path, Duration::from_secs(2))
76 .map(|guard| Self { _guard: guard })
77 .map_err(|error| match error {
78 fs_lock::AcquireError::Timeout => {
79 std::io::Error::other("timed out acquiring semantic cache lock")
80 }
81 fs_lock::AcquireError::Io(error) => error,
82 })
83 }
84}
85
/// Identity of the embedding setup an index was built with.
///
/// Serialized with serde; `as_string` renders it as JSON and `matches_expected`
/// compares that JSON text against a stored string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name from the configuration.
    pub model: String,
    /// Normalized base URL, or `"none"` when absent or invalid; defaults to an empty
    /// string when missing from serialized data.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; filled from `default_chunking_version` when missing
    /// from serialized data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
96
/// Chunking version written into new fingerprints and assumed for serialized
/// fingerprints that lack the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103 let base_url = config
106 .base_url
107 .as_ref()
108 .and_then(|u| normalize_base_url(u).ok())
109 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110 Self {
111 backend: config.backend.as_str().to_string(),
112 model: config.model.clone(),
113 base_url,
114 dimension,
115 chunking_version: default_chunking_version(),
116 }
117 }
118
119 pub fn as_string(&self) -> String {
120 serde_json::to_string(self).unwrap_or_else(|_| String::new())
121 }
122
123 fn matches_expected(&self, expected: &str) -> bool {
124 let encoded = self.as_string();
125 !encoded.is_empty() && encoded == expected
126 }
127}
128
/// Backend-specific state used to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedder, created with `LocalEmbedder::new`.
    Local(LocalEmbedder),
    /// Remote endpoint that accepts `{ "input": [...], "model": ... }` and returns a `data` array.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL; the endpoint is derived by `build_openai_embeddings_endpoint`.
        base_url: String,
        /// Sent as an `Authorization: Bearer …` header when present.
        api_key: Option<String>,
    },
    /// Remote Ollama endpoint that returns an `embeddings` array.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL; the endpoint is derived by `build_ollama_embeddings_endpoint`.
        base_url: String,
    },
}
145
/// Embedding model with its backend engine, effective settings, and a query cache.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (default applied when configured as 0).
    timeout_ms: u64,
    /// Effective batch size (default applied when configured as 0).
    max_batch_size: usize,
    /// Embedding dimension once known; `None` until the backend has been probed.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text → embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
159
/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163 vectors: &[Vec<f32>],
164 expected_count: usize,
165 context: &str,
166) -> Result<(), String> {
167 if expected_count > 0 && vectors.is_empty() {
168 return Err(format!(
169 "{context} returned no vectors for {expected_count} inputs"
170 ));
171 }
172
173 if vectors.len() != expected_count {
174 return Err(format!(
175 "{context} returned {} vectors for {} inputs",
176 vectors.len(),
177 expected_count
178 ));
179 }
180
181 let Some(first_vector) = vectors.first() else {
182 return Ok(());
183 };
184 let expected_dimension = first_vector.len();
185 validate_embedding_dimension(expected_dimension)
186 .map_err(|error| format!("{context} returned {error}"))?;
187 for (index, vector) in vectors.iter().enumerate() {
188 if vector.len() != expected_dimension {
189 return Err(format!(
190 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191 vector.len()
192 ));
193 }
194 }
195
196 Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200 if dimension == 0 || dimension > MAX_DIMENSION {
201 return Err(format!(
202 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203 ));
204 }
205
206 Ok(())
207}
208
209fn normalize_base_url(raw: &str) -> Result<String, String> {
213 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214 let scheme = parsed.scheme();
215 if scheme != "http" && scheme != "https" {
216 return Err(format!(
217 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218 scheme
219 ));
220 }
221 Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239 use std::net::{IpAddr, ToSocketAddrs};
240
241 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243 let host = parsed.host_str().unwrap_or("");
244
245 let is_loopback_host =
250 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251 if is_loopback_host {
252 return Ok(());
253 }
254
255 if host.ends_with(".local") {
258 return Err(format!(
259 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260 ));
261 }
262
263 let port = parsed.port_or_known_default().unwrap_or(443);
266 let addr_str = format!("{host}:{port}");
267 let addrs: Vec<IpAddr> = addr_str
268 .to_socket_addrs()
269 .map(|iter| iter.map(|sa| sa.ip()).collect())
270 .unwrap_or_default();
271 for ip in &addrs {
272 if is_private_non_loopback_ip(ip) {
273 return Err(format!(
274 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275 ));
276 }
277 }
278
279 Ok(())
280}
281
/// Returns `true` for addresses in private or reserved ranges other than loopback.
///
/// IPv4: `0.0.0.0/8`, `10.0.0.0/8`, `100.64.0.0/10`, `169.254.0.0/16`,
/// `172.16.0.0/12` and `192.168.0.0/16`.
/// IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses (`::ffff:a.b.c.d`)
/// whose embedded IPv4 address falls in one of the IPv4 ranges above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => match v4.octets() {
            [0, ..] | [10, ..] => true,
            [169, 254, ..] | [192, 168, ..] => true,
            [172, second, ..] => (16..=31).contains(&second),
            [100, second, ..] => (64..=127).contains(&second),
            _ => false,
        },
        IpAddr::V6(v6) => {
            let prefix = v6.segments()[0];
            if (prefix & 0xffc0) == 0xfe80 || (prefix & 0xfe00) == 0xfc00 {
                return true;
            }
            // IPv4-mapped addresses are judged by the IPv4 address they embed.
            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325 if base_url.ends_with("/v1") {
326 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327 } else {
328 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329 }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333 if base_url.ends_with("/api") {
334 format!("{base_url}/embed")
335 } else {
336 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337 }
338}
339
/// Trims surrounding whitespace from an optional value and maps blank values to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value.as_deref().map(str::trim)?;
    if trimmed.is_empty() {
        return None;
    }
    Some(trimmed.to_string())
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
/// Returns `true` when a response body contains, ignoring ASCII case, one of the
/// known "model is (un)loading" phrases.
fn embedding_response_body_is_transient(raw: &str) -> bool {
    const TRANSIENT_PHRASES: [&str; 6] = [
        "model was unloaded",
        "model is loading",
        "model not loaded",
        "loading model",
        "is currently loading",
        "model is being loaded",
    ];

    let haystack = raw.to_ascii_lowercase();
    TRANSIENT_PHRASES
        .iter()
        .any(|phrase| haystack.contains(phrase))
}
371
372fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
373 error.is_connect()
374}
375
376fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
382 error.is_connect() || error.is_timeout()
383}
384
/// Prefix that `send_embedding_request` puts on error messages it classifies as
/// transient; detect it with `embedding_failure_is_transient` and remove it with
/// `strip_transient_embedding_marker`.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
392
393pub fn embedding_failure_is_transient(error: &str) -> bool {
396 error.contains(TRANSIENT_EMBEDDING_MARKER)
397}
398
399pub fn strip_transient_embedding_marker(error: &str) -> String {
401 error.replace(TRANSIENT_EMBEDDING_MARKER, "")
402}
403
404fn sleep_before_embedding_retry(attempt_index: usize) {
405 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
406 std::thread::sleep(Duration::from_millis(*delay_ms));
407 }
408}
409
/// Sends an embedding request with retries and returns the raw response body.
///
/// `make_request` is called once per attempt to build a fresh request;
/// `backend_label` prefixes every error message. Up to
/// `EMBEDDING_REQUEST_MAX_ATTEMPTS` attempts are made, sleeping between attempts
/// according to `EMBEDDING_REQUEST_BACKOFF_MS`.
///
/// # Errors
///
/// Returns a message when sending fails, when the body cannot be read, or when the
/// final response has a non-success status. Messages for failures classified as
/// transient start with `TRANSIENT_EMBEDDING_MARKER`.
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                // Only connection failures are retried here; timeouts are not.
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                // Connection failures and timeouts are both reported as transient.
                let marker = if embedding_send_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!("{marker}{backend_label} request failed: {error}"));
            }
        };

        // Capture the status first: reading the body consumes the response.
        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                // Body read failures never carry the transient marker.
                return Err(format!("{backend_label} response read failed: {error}"));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        // A non-success response is retried when either its status or its body
        // indicates a temporary condition.
        let body_transient = embedding_response_body_is_transient(&raw);
        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        let marker = if is_retryable_embedding_status(status) || body_transient {
            TRANSIENT_EMBEDDING_MARKER
        } else {
            ""
        };
        return Err(format!(
            "{marker}{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    // Every path through the final attempt returns, so the loop cannot fall through.
    unreachable!("embedding request retries exhausted without returning")
}
479
480impl SemanticEmbeddingModel {
481 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
482 let timeout_ms = if config.timeout_ms == 0 {
483 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
484 } else {
485 config.timeout_ms
486 };
487
488 let max_batch_size = if config.max_batch_size == 0 {
489 DEFAULT_MAX_BATCH_SIZE
490 } else {
491 config.max_batch_size
492 };
493
494 let api_key_env = normalize_api_key(config.api_key_env.clone());
495 let model = config.model.clone();
496
497 let client = Client::builder()
498 .timeout(Duration::from_millis(timeout_ms))
499 .redirect(reqwest::redirect::Policy::none())
500 .build()
501 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
502
503 let engine = match config.backend {
504 SemanticBackend::Fastembed => {
505 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
506 }
507 SemanticBackend::OpenAiCompatible => {
508 let raw = config.base_url.as_ref().ok_or_else(|| {
509 "base_url is required for openai_compatible backend".to_string()
510 })?;
511 let base_url = normalize_base_url(raw)?;
512
513 let api_key = match api_key_env {
514 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
515 format!("missing api_key_env '{var_name}' for openai_compatible backend")
516 })?),
517 None => None,
518 };
519
520 SemanticEmbeddingEngine::OpenAiCompatible {
521 client,
522 model,
523 base_url,
524 api_key,
525 }
526 }
527 SemanticBackend::Ollama => {
528 let raw = config
529 .base_url
530 .as_ref()
531 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
532 let base_url = normalize_base_url(raw)?;
533
534 SemanticEmbeddingEngine::Ollama {
535 client,
536 model,
537 base_url,
538 }
539 }
540 };
541
542 Ok(Self {
543 backend: config.backend,
544 model: config.model.clone(),
545 base_url: config.base_url.clone(),
546 timeout_ms,
547 max_batch_size,
548 dimension: None,
549 engine,
550 query_embedding_cache: HashMap::new(),
551 query_embedding_cache_order: VecDeque::new(),
552 query_embedding_cache_hits: 0,
553 query_embedding_cache_misses: 0,
554 })
555 }
556
557 pub fn backend(&self) -> SemanticBackend {
558 self.backend
559 }
560
561 pub fn model(&self) -> &str {
562 &self.model
563 }
564
565 pub fn base_url(&self) -> Option<&str> {
566 self.base_url.as_deref()
567 }
568
569 pub fn max_batch_size(&self) -> usize {
570 self.max_batch_size
571 }
572
573 pub fn timeout_ms(&self) -> u64 {
574 self.timeout_ms
575 }
576
577 pub fn fingerprint(
578 &mut self,
579 config: &SemanticBackendConfig,
580 ) -> Result<SemanticIndexFingerprint, String> {
581 let dimension = self.dimension()?;
582 Ok(SemanticIndexFingerprint::from_config(config, dimension))
583 }
584
585 pub fn dimension(&mut self) -> Result<usize, String> {
586 if let Some(dimension) = self.dimension {
587 return Ok(dimension);
588 }
589
590 let dimension = match &mut self.engine {
591 SemanticEmbeddingEngine::Local(model) => {
592 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
593 vectors
594 .first()
595 .map(|v| v.len())
596 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
597 }
598 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
599 let vectors =
600 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
601 vectors
602 .first()
603 .map(|v| v.len())
604 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
605 }
606 SemanticEmbeddingEngine::Ollama { .. } => {
607 let vectors =
608 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
609 vectors
610 .first()
611 .map(|v| v.len())
612 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
613 }
614 };
615
616 self.dimension = Some(dimension);
617 Ok(dimension)
618 }
619
620 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
621 self.embed_texts(texts)
622 }
623
624 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
625 if let Some(vector) = self.query_embedding_cache.get(query) {
626 self.query_embedding_cache_hits += 1;
627 return Ok(vector.clone());
628 }
629
630 self.query_embedding_cache_misses += 1;
631 let embeddings = self.embed_texts(vec![query.to_string()])?;
632 let vector = embeddings
633 .first()
634 .cloned()
635 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
636
637 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
638 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
639 self.query_embedding_cache.remove(&oldest);
640 }
641 }
642 self.query_embedding_cache
643 .insert(query.to_string(), vector.clone());
644 self.query_embedding_cache_order
645 .push_back(query.to_string());
646
647 Ok(vector)
648 }
649
650 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
651 (
652 self.query_embedding_cache_hits,
653 self.query_embedding_cache_misses,
654 self.query_embedding_cache.len(),
655 )
656 }
657
658 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
659 match &mut self.engine {
660 SemanticEmbeddingEngine::Local(model) => model
661 .embed(&texts)
662 .map_err(|error| format!("failed to embed batch: {error}")),
663 SemanticEmbeddingEngine::OpenAiCompatible {
664 client,
665 model,
666 base_url,
667 api_key,
668 } => {
669 let expected_text_count = texts.len();
670 let endpoint = build_openai_embeddings_endpoint(base_url);
671 let body = serde_json::json!({
672 "input": texts,
673 "model": model,
674 });
675
676 let raw = send_embedding_request(
677 || {
678 let mut request = client.post(&endpoint).json(&body);
688
689 if let Some(api_key) = api_key {
690 request = request.header("Authorization", format!("Bearer {api_key}"));
691 }
692
693 request
694 },
695 "openai compatible",
696 )?;
697
698 #[derive(Deserialize)]
699 struct OpenAiResponse {
700 data: Vec<OpenAiEmbeddingResult>,
701 }
702
703 #[derive(Deserialize)]
704 struct OpenAiEmbeddingResult {
705 embedding: Vec<f32>,
706 index: Option<u32>,
707 }
708
709 let parsed: OpenAiResponse = serde_json::from_str(&raw)
710 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
711 if parsed.data.len() != expected_text_count {
712 return Err(format!(
713 "openai compatible response returned {} embeddings for {} inputs",
714 parsed.data.len(),
715 expected_text_count
716 ));
717 }
718
719 let mut vectors = vec![Vec::new(); parsed.data.len()];
720 for (i, item) in parsed.data.into_iter().enumerate() {
721 let index = item.index.unwrap_or(i as u32) as usize;
722 if index >= vectors.len() {
723 return Err(
724 "openai compatible response contains invalid vector index".to_string()
725 );
726 }
727 vectors[index] = item.embedding;
728 }
729
730 for vector in &vectors {
731 if vector.is_empty() {
732 return Err(
733 "openai compatible response contained missing vectors".to_string()
734 );
735 }
736 }
737
738 self.dimension = vectors.first().map(Vec::len);
739 Ok(vectors)
740 }
741 SemanticEmbeddingEngine::Ollama {
742 client,
743 model,
744 base_url,
745 } => {
746 let expected_text_count = texts.len();
747 let endpoint = build_ollama_embeddings_endpoint(base_url);
748
749 #[derive(Serialize)]
750 struct OllamaPayload<'a> {
751 model: &'a str,
752 input: Vec<String>,
753 }
754
755 let payload = OllamaPayload {
756 model,
757 input: texts,
758 };
759
760 let raw = send_embedding_request(
761 || {
762 client.post(&endpoint).json(&payload)
767 },
768 "ollama",
769 )?;
770
771 #[derive(Deserialize)]
772 struct OllamaResponse {
773 embeddings: Vec<Vec<f32>>,
774 }
775
776 let parsed: OllamaResponse = serde_json::from_str(&raw)
777 .map_err(|error| format!("invalid ollama response: {error}"))?;
778 if parsed.embeddings.is_empty() {
779 return Err("ollama response returned no embeddings".to_string());
780 }
781 if parsed.embeddings.len() != expected_text_count {
782 return Err(format!(
783 "ollama response returned {} embeddings for {} inputs",
784 parsed.embeddings.len(),
785 expected_text_count
786 ));
787 }
788
789 let vectors = parsed.embeddings;
790 for vector in &vectors {
791 if vector.is_empty() {
792 return Err("ollama response contained empty embeddings".to_string());
793 }
794 }
795
796 self.dimension = vectors.first().map(Vec::len);
797 Ok(vectors)
798 }
799 }
800 }
801}
802
/// Checks that an ONNX Runtime shared library can be loaded and, when a version
/// can be determined, that it is 1.20 or newer within major version 1.
///
/// The library path comes from the `ORT_DYLIB_PATH` environment variable, falling
/// back to the platform's default library name. On targets other than Linux,
/// macOS and Windows no check is performed and `Ok(())` is returned.
///
/// # Errors
///
/// Returns a message when the library cannot be loaded or when the detected
/// version is not supported. If no version can be detected the check passes.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated string that outlives the `dlopen`
        // call; `dlerror`'s result is only read when non-null; `handle` is non-null
        // when passed to `dlclose`.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version is inferred from file names near `lib_name`, not queried
            // from the loaded library.
            let detected_version = detect_ort_version_from_path(lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // Versions whose major/minor components do not parse are not rejected.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-info record; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: every pointer handed to the Win32 calls refers to a live,
        // NUL-terminated or correctly sized local buffer; `vs_info` is dereferenced
        // only after `VerQueryValueW` succeeded and returned a non-null pointer.
        // NOTE(review): `vs_len` is not compared against the struct size — confirm
        // that is acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 means "not detected".
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // Sub-block "\" selects the root fixed file-info record.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High word is the major version, low word the minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
990
991#[cfg(any(target_os = "linux", target_os = "macos"))]
994fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
995 let path = std::path::Path::new(lib_path);
996
997 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
999 .into_iter()
1000 .flatten()
1001 {
1002 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1003 if let Some(version) = extract_version_from_filename(name) {
1004 return Some(version);
1005 }
1006 }
1007 }
1008
1009 if let Some(parent) = path.parent() {
1011 if let Ok(entries) = std::fs::read_dir(parent) {
1012 for entry in entries.flatten() {
1013 if let Some(name) = entry.file_name().to_str() {
1014 if name.starts_with("libonnxruntime") {
1015 if let Some(version) = extract_version_from_filename(name) {
1016 return Some(version);
1017 }
1018 }
1019 }
1020 }
1021 }
1022 }
1023
1024 None
1025}
1026
1027#[cfg(any(target_os = "linux", target_os = "macos"))]
1029fn extract_version_from_filename(name: &str) -> Option<String> {
1030 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1032 re.find(name).map(|m| m.as_str().to_string())
1033}
1034
/// Suggests a shell command that removes the ONNX Runtime library at `lib_path`.
///
/// For the default library names and for paths under `/usr/local/lib`, Linux and
/// macOS builds return a fixed `sudo rm` command covering `/usr/local/lib`. In
/// every other case the command removes `lib_path` itself.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return String::from(" sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        #[cfg(target_os = "macos")]
        return String::from(" sudo rm /usr/local/lib/libonnxruntime*");
    }

    format!(" rm '{}'", lib_path)
}
1047
1048pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1054 format!(
1055 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1056 Solutions:\n\
1057 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1058 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1059 configures the bridge to load it instead of the system library — no \
1060 changes to '{}'.\n\
1061 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1062 {}\n\
1063 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1064 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1065 version,
1066 lib_name,
1067 lib_name,
1068 suggest_removal_command(lib_name),
1069 )
1070}
1071
/// Heuristically decides whether `message` reports that ONNX Runtime could not be loaded.
///
/// A message matches when, after leading whitespace, it starts with
/// `"ONNX Runtime not found."`, or when — ignoring ASCII case — it mentions both
/// the runtime and a dynamic-library loading failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1097
1098pub fn format_embedding_init_error(error: impl Display) -> String {
1099 let message = error.to_string();
1100
1101 if is_onnx_runtime_unavailable(&message) {
1102 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1103 }
1104
1105 format!("failed to initialize semantic embedding model: {message}")
1106}
1107
/// One unit of source code that is embedded and searched.
///
/// NOTE(review): field meanings below are inferred from their names and types;
/// the code that fills them is outside this section — confirm.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk comes from.
    pub file: PathBuf,
    /// Name associated with the chunk (presumably the symbol name).
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    /// Presumably the text handed to the embedding model.
    pub embed_text: String,
    /// Presumably the excerpt shown with search results.
    pub snippet: String,
}
1127
/// A chunk paired with its embedding vector.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
1134
/// In-memory semantic index: embedded chunks plus per-file bookkeeping.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded per indexed file; its length is the indexed file count.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size recorded per indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded per indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding vector length for this index.
    dimension: usize,
    /// Fingerprint of the embedding setup, when known.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; debug builds assert that it is absolute.
    project_root: PathBuf,
    // NOTE(review): purpose of `deferred_files` is not visible in this section.
    deferred_files: HashSet<PathBuf>,
}
1150
/// Per-file metadata captured while collecting chunks; it is later split into the
/// index's `file_mtimes`, `file_sizes` and `file_hashes` maps.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}
1157
/// Counters describing the outcome of an index refresh.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    /// Not consulted by `is_noop`.
    pub total_processed: usize,
}
1167
1168impl RefreshSummary {
1169 pub fn is_noop(&self) -> bool {
1171 self.changed == 0 && self.added == 0 && self.deleted == 0
1172 }
1173}
1174
/// Result bundle of refreshing a set of invalidated files.
///
/// NOTE(review): the producer of this struct is outside this section; field
/// descriptions are inferred from names and types — confirm.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Newly embedded entries.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh metadata per refreshed file.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose refresh finished.
    pub completed_paths: Vec<PathBuf>,
    pub summary: RefreshSummary,
}
1182
/// A single search hit.
///
/// NOTE(review): the code that builds results is outside this section; the first
/// seven fields presumably mirror the matching `SemanticChunk` — confirm.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    /// Match score for this hit.
    pub score: f32,
    /// Static label identifying what produced the hit.
    pub source: &'static str,
}
1196
1197impl SemanticIndex {
1198 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1199 debug_assert!(project_root.is_absolute());
1200 Self {
1201 entries: Vec::new(),
1202 file_mtimes: HashMap::new(),
1203 file_sizes: HashMap::new(),
1204 file_hashes: HashMap::new(),
1205 dimension,
1206 fingerprint: None,
1207 project_root,
1208 deferred_files: HashSet::new(),
1209 }
1210 }
1211
/// Returns the number of embedded entries in the index.
pub fn entry_count(&self) -> usize {
    self.entries.len()
}
1216
/// Returns the number of files with a recorded modification time.
pub fn indexed_file_count(&self) -> usize {
    self.file_mtimes.len()
}
1221
1222 pub fn status_label(&self) -> &'static str {
1224 if self.entries.is_empty() {
1225 "empty"
1226 } else {
1227 "ready"
1228 }
1229 }
1230
1231 fn collect_chunks(
1232 project_root: &Path,
1233 files: &[PathBuf],
1234 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1235 let collect_started = std::time::Instant::now();
1236 let per_file: Vec<(
1237 PathBuf,
1238 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1239 )> = files
1240 .par_iter()
1241 .map_init(HashMap::new, |parsers, file| {
1242 let result = collect_file_metadata(file).and_then(|metadata| {
1243 collect_file_chunks(project_root, file, parsers)
1244 .map(|chunks| (metadata, chunks))
1245 });
1246 (file.clone(), result)
1247 })
1248 .collect();
1249
1250 let mut chunks: Vec<SemanticChunk> = Vec::new();
1251 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1252
1253 for (file, result) in per_file {
1254 match result {
1255 Ok((metadata, file_chunks)) => {
1256 file_metadata.insert(file, metadata);
1257 chunks.extend(file_chunks);
1258 }
1259 Err(error) => {
1260 if error == "unsupported file extension" {
1266 continue;
1267 }
1268 slog_warn!(
1269 "failed to collect semantic chunks for {}: {}",
1270 file.display(),
1271 error
1272 );
1273 }
1274 }
1275 }
1276
1277 slog_info!(
1278 "semantic collect: {} chunks from {} files in {} ms",
1279 chunks.len(),
1280 file_metadata.len(),
1281 collect_started.elapsed().as_millis()
1282 );
1283
1284 (chunks, file_metadata)
1285 }
1286
1287 fn build_from_chunks<F, P>(
1288 project_root: &Path,
1289 chunks: Vec<SemanticChunk>,
1290 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1291 embed_fn: &mut F,
1292 max_batch_size: usize,
1293 mut progress: Option<&mut P>,
1294 ) -> Result<Self, String>
1295 where
1296 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1297 P: FnMut(usize, usize),
1298 {
1299 debug_assert!(project_root.is_absolute());
1300 let total_chunks = chunks.len();
1301
1302 if chunks.is_empty() {
1303 return Ok(Self {
1304 entries: Vec::new(),
1305 file_mtimes: file_metadata
1306 .iter()
1307 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1308 .collect(),
1309 file_sizes: file_metadata
1310 .iter()
1311 .map(|(path, metadata)| (path.clone(), metadata.size))
1312 .collect(),
1313 file_hashes: file_metadata
1314 .into_iter()
1315 .map(|(path, metadata)| (path, metadata.content_hash))
1316 .collect(),
1317 dimension: DEFAULT_DIMENSION,
1318 fingerprint: None,
1319 project_root: project_root.to_path_buf(),
1320 deferred_files: HashSet::new(),
1321 });
1322 }
1323
1324 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1326 let mut expected_dimension: Option<usize> = None;
1327 let batch_size = max_batch_size.max(1);
1328 let embed_started = std::time::Instant::now();
1329 let batch_count = total_chunks.div_ceil(batch_size);
1330 for batch_start in (0..chunks.len()).step_by(batch_size) {
1331 let batch_end = (batch_start + batch_size).min(chunks.len());
1332 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1333 .iter()
1334 .map(|c| c.embed_text.clone())
1335 .collect();
1336
1337 let vectors = embed_fn(batch_texts)?;
1338 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1339
1340 if let Some(dim) = vectors.first().map(|v| v.len()) {
1342 match expected_dimension {
1343 None => expected_dimension = Some(dim),
1344 Some(expected) if dim != expected => {
1345 return Err(format!(
1346 "embedding dimension changed across batches: expected {expected}, got {dim}"
1347 ));
1348 }
1349 _ => {}
1350 }
1351 }
1352
1353 for (i, vector) in vectors.into_iter().enumerate() {
1354 let chunk_idx = batch_start + i;
1355 entries.push(EmbeddingEntry {
1356 chunk: chunks[chunk_idx].clone(),
1357 vector,
1358 });
1359 }
1360
1361 if let Some(callback) = progress.as_mut() {
1362 callback(entries.len(), total_chunks);
1363 }
1364 }
1365
1366 let embed_ms = embed_started.elapsed().as_millis();
1367 let rate = (total_chunks as u128 * 1000)
1368 .checked_div(embed_ms)
1369 .unwrap_or(0) as u64;
1370 slog_info!(
1371 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1372 total_chunks,
1373 batch_count,
1374 embed_ms,
1375 rate
1376 );
1377
1378 let dimension = entries
1379 .first()
1380 .map(|e| e.vector.len())
1381 .unwrap_or(DEFAULT_DIMENSION);
1382
1383 Ok(Self {
1384 entries,
1385 file_mtimes: file_metadata
1386 .iter()
1387 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1388 .collect(),
1389 file_sizes: file_metadata
1390 .iter()
1391 .map(|(path, metadata)| (path.clone(), metadata.size))
1392 .collect(),
1393 file_hashes: file_metadata
1394 .into_iter()
1395 .map(|(path, metadata)| (path, metadata.content_hash))
1396 .collect(),
1397 dimension,
1398 fingerprint: None,
1399 project_root: project_root.to_path_buf(),
1400 deferred_files: HashSet::new(),
1401 })
1402 }
1403
1404 pub fn build<F>(
1407 project_root: &Path,
1408 files: &[PathBuf],
1409 embed_fn: &mut F,
1410 max_batch_size: usize,
1411 ) -> Result<Self, String>
1412 where
1413 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1414 {
1415 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1416 Self::build_from_chunks(
1417 project_root,
1418 chunks,
1419 file_mtimes,
1420 embed_fn,
1421 max_batch_size,
1422 Option::<&mut fn(usize, usize)>::None,
1423 )
1424 }
1425
1426 pub fn build_with_progress<F, P>(
1428 project_root: &Path,
1429 files: &[PathBuf],
1430 embed_fn: &mut F,
1431 max_batch_size: usize,
1432 progress: &mut P,
1433 ) -> Result<Self, String>
1434 where
1435 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1436 P: FnMut(usize, usize),
1437 {
1438 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1439 let total_chunks = chunks.len();
1440 progress(0, total_chunks);
1441 Self::build_from_chunks(
1442 project_root,
1443 chunks,
1444 file_mtimes,
1445 embed_fn,
1446 max_batch_size,
1447 Some(progress),
1448 )
1449 }
1450
1451 pub fn refresh_stale_files<F, P>(
1462 &mut self,
1463 project_root: &Path,
1464 current_files: &[PathBuf],
1465 embed_fn: &mut F,
1466 max_batch_size: usize,
1467 progress: &mut P,
1468 ) -> Result<RefreshSummary, String>
1469 where
1470 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1471 P: FnMut(usize, usize),
1472 {
1473 self.backfill_missing_file_sizes();
1474
1475 let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1477 self.deferred_files
1478 .retain(|path| current_set.contains(path.as_path()));
1479 let total_processed = current_set.len() + self.file_mtimes.len()
1480 - self
1481 .file_mtimes
1482 .keys()
1483 .filter(|path| current_set.contains(path.as_path()))
1484 .count();
1485
1486 let mut deleted: Vec<PathBuf> = Vec::new();
1489 let mut changed: Vec<PathBuf> = Vec::new();
1490 let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1491 for indexed_path in &indexed_paths {
1492 if !current_set.contains(indexed_path.as_path()) {
1493 deleted.push(indexed_path.clone());
1494 continue;
1495 }
1496 let cached = match (
1497 self.file_mtimes.get(indexed_path),
1498 self.file_sizes.get(indexed_path),
1499 self.file_hashes.get(indexed_path),
1500 ) {
1501 (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1502 mtime: *mtime,
1503 size: *size,
1504 content_hash: *hash,
1505 }),
1506 _ => None,
1507 };
1508 match cached
1509 .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
1510 {
1511 Some(FreshnessVerdict::HotFresh) => {}
1512 Some(FreshnessVerdict::ContentFresh {
1513 new_mtime,
1514 new_size,
1515 }) => {
1516 self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1517 self.file_sizes.insert(indexed_path.clone(), new_size);
1518 }
1519 Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1520 changed.push(indexed_path.clone());
1521 }
1522 }
1523 }
1524
1525 let mut added: Vec<PathBuf> = Vec::new();
1527 for path in current_files {
1528 if !self.file_mtimes.contains_key(path) {
1529 added.push(path.clone());
1530 }
1531 }
1532
1533 if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1535 progress(0, 0);
1536 return Ok(RefreshSummary {
1537 total_processed,
1538 ..RefreshSummary::default()
1539 });
1540 }
1541
1542 if !deleted.is_empty() {
1546 self.remove_indexed_files(&deleted);
1547 }
1548
1549 let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1551 to_embed.extend(changed.iter().cloned());
1552 to_embed.extend(added.iter().cloned());
1553
1554 if to_embed.is_empty() {
1555 progress(0, 0);
1557 return Ok(RefreshSummary {
1558 changed: 0,
1559 added: 0,
1560 deleted: deleted.len(),
1561 total_processed,
1562 });
1563 }
1564
1565 let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1566 let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1567 let vanished = to_embed
1568 .iter()
1569 .filter(|path| {
1570 changed_set.contains(path.as_path())
1571 && !fresh_metadata.contains_key(*path)
1572 && !path.exists()
1573 })
1574 .cloned()
1575 .collect::<Vec<_>>();
1576 if !vanished.is_empty() {
1577 self.remove_indexed_files(&vanished);
1578 deleted.extend(vanished);
1579 }
1580
1581 if chunks.is_empty() {
1582 progress(0, 0);
1583 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1584 for file in &successful_files {
1585 self.deferred_files.remove(file);
1586 }
1587 if !successful_files.is_empty() {
1588 self.entries
1589 .retain(|entry| !successful_files.contains(&entry.chunk.file));
1590 }
1591 let changed_count = changed
1592 .iter()
1593 .filter(|path| successful_files.contains(*path))
1594 .count();
1595 let added_count = added
1596 .iter()
1597 .filter(|path| successful_files.contains(*path))
1598 .count();
1599 for (file, metadata) in fresh_metadata {
1600 self.file_mtimes.insert(file.clone(), metadata.mtime);
1601 self.file_sizes.insert(file.clone(), metadata.size);
1602 self.file_hashes.insert(file.clone(), metadata.content_hash);
1603 }
1604 return Ok(RefreshSummary {
1605 changed: changed_count,
1606 added: added_count,
1607 deleted: deleted.len(),
1608 total_processed,
1609 });
1610 }
1611
1612 let total_chunks = chunks.len();
1614 progress(0, total_chunks);
1615 let batch_size = max_batch_size.max(1);
1616 let existing_dimension = if self.entries.is_empty() {
1617 None
1618 } else {
1619 Some(self.dimension)
1620 };
1621 let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1622 let mut observed_dimension: Option<usize> = existing_dimension;
1623
1624 for batch_start in (0..chunks.len()).step_by(batch_size) {
1625 let batch_end = (batch_start + batch_size).min(chunks.len());
1626 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1627 .iter()
1628 .map(|c| c.embed_text.clone())
1629 .collect();
1630
1631 let vectors = embed_fn(batch_texts)?;
1632 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1633
1634 if let Some(dim) = vectors.first().map(|v| v.len()) {
1635 match observed_dimension {
1636 None => observed_dimension = Some(dim),
1637 Some(expected) if dim != expected => {
1638 return Err(format!(
1641 "embedding dimension changed during incremental refresh: \
1642 cached index uses {expected}, new vectors use {dim}"
1643 ));
1644 }
1645 _ => {}
1646 }
1647 }
1648
1649 for (i, vector) in vectors.into_iter().enumerate() {
1650 let chunk_idx = batch_start + i;
1651 new_entries.push(EmbeddingEntry {
1652 chunk: chunks[chunk_idx].clone(),
1653 vector,
1654 });
1655 }
1656
1657 progress(new_entries.len(), total_chunks);
1658 }
1659
1660 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1661 for file in &successful_files {
1662 self.deferred_files.remove(file);
1663 }
1664 if !successful_files.is_empty() {
1665 self.entries
1666 .retain(|entry| !successful_files.contains(&entry.chunk.file));
1667 }
1668
1669 self.entries.extend(new_entries);
1670 for (file, metadata) in fresh_metadata {
1671 self.file_mtimes.insert(file.clone(), metadata.mtime);
1672 self.file_sizes.insert(file.clone(), metadata.size);
1673 self.file_hashes.insert(file, metadata.content_hash);
1674 }
1675 if let Some(dim) = observed_dimension {
1676 self.dimension = dim;
1677 }
1678
1679 Ok(RefreshSummary {
1680 changed: changed
1681 .iter()
1682 .filter(|path| successful_files.contains(*path))
1683 .count(),
1684 added: added
1685 .iter()
1686 .filter(|path| successful_files.contains(*path))
1687 .count(),
1688 deleted: deleted.len(),
1689 total_processed,
1690 })
1691 }
1692
1693 pub fn refresh_invalidated_files<F, P>(
1700 &mut self,
1701 project_root: &Path,
1702 paths: &[PathBuf],
1703 embed_fn: &mut F,
1704 max_batch_size: usize,
1705 max_files: usize,
1706 progress: &mut P,
1707 ) -> Result<InvalidatedFilesRefresh, String>
1708 where
1709 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1710 P: FnMut(usize, usize),
1711 {
1712 self.backfill_missing_file_sizes();
1713
1714 self.deferred_files.retain(|path| path.exists());
1715 let mut requested_paths = paths.to_vec();
1716 requested_paths.extend(self.deferred_files.iter().cloned());
1717 requested_paths.sort();
1718 requested_paths.dedup();
1719 let total_processed = requested_paths.len();
1720
1721 if requested_paths.is_empty() {
1722 progress(0, 0);
1723 return Ok(InvalidatedFilesRefresh {
1724 summary: RefreshSummary {
1725 total_processed,
1726 ..RefreshSummary::default()
1727 },
1728 ..InvalidatedFilesRefresh::default()
1729 });
1730 }
1731
1732 let previously_indexed: HashSet<PathBuf> = requested_paths
1733 .iter()
1734 .filter(|path| self.file_mtimes.contains_key(*path))
1735 .cloned()
1736 .collect();
1737
1738 self.remove_indexed_files(&requested_paths);
1742
1743 let existing_paths = requested_paths
1744 .iter()
1745 .filter(|path| path.exists())
1746 .cloned()
1747 .collect::<Vec<_>>();
1748 let deleted = requested_paths
1749 .iter()
1750 .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1751 .count();
1752
1753 if existing_paths.is_empty() {
1754 for path in &requested_paths {
1755 if !path.exists() {
1756 self.deferred_files.remove(path);
1757 }
1758 }
1759 progress(0, 0);
1760 return Ok(InvalidatedFilesRefresh {
1761 completed_paths: requested_paths,
1762 summary: RefreshSummary {
1763 deleted,
1764 total_processed,
1765 ..RefreshSummary::default()
1766 },
1767 ..InvalidatedFilesRefresh::default()
1768 });
1769 }
1770
1771 let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1772
1773 let retained_file_count = self.file_mtimes.len();
1774 let changed_successful_count = existing_paths
1775 .iter()
1776 .filter(|path| {
1777 previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1778 })
1779 .count();
1780 let available_new_files =
1781 max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1782 let new_successful_files = existing_paths
1783 .iter()
1784 .filter(|path| {
1785 !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1786 })
1787 .cloned()
1788 .collect::<Vec<_>>();
1789 if new_successful_files.len() > available_new_files {
1790 let allowed_new_files = new_successful_files
1791 .iter()
1792 .take(available_new_files)
1793 .cloned()
1794 .collect::<HashSet<_>>();
1795 let deferred_new_files = new_successful_files
1796 .into_iter()
1797 .filter(|path| !allowed_new_files.contains(path))
1798 .collect::<HashSet<_>>();
1799
1800 fresh_metadata.retain(|file, _| {
1801 previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1802 });
1803 chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
1804
1805 if !deferred_new_files.is_empty() {
1806 for path in &deferred_new_files {
1807 self.deferred_files.insert(path.clone());
1808 }
1809 slog_warn!(
1810 "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
1811 deferred_new_files.len(),
1812 max_files
1813 );
1814 }
1815 }
1816
1817 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1818 for file in &successful_files {
1819 self.deferred_files.remove(file);
1820 }
1821 let changed = successful_files
1822 .iter()
1823 .filter(|path| previously_indexed.contains(path.as_path()))
1824 .count();
1825 let added = successful_files.len().saturating_sub(changed);
1826 let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
1827
1828 if chunks.is_empty() {
1829 progress(0, 0);
1830 for (file, metadata) in fresh_metadata {
1831 let freshness = FileFreshness {
1832 mtime: metadata.mtime,
1833 size: metadata.size,
1834 content_hash: metadata.content_hash,
1835 };
1836 self.file_mtimes.insert(file.clone(), freshness.mtime);
1837 self.file_sizes.insert(file.clone(), freshness.size);
1838 self.file_hashes
1839 .insert(file.clone(), freshness.content_hash);
1840 updated_metadata.push((file, freshness));
1841 }
1842
1843 return Ok(InvalidatedFilesRefresh {
1844 updated_metadata,
1845 completed_paths: requested_paths,
1846 summary: RefreshSummary {
1847 changed,
1848 added,
1849 deleted,
1850 total_processed,
1851 },
1852 ..InvalidatedFilesRefresh::default()
1853 });
1854 }
1855
1856 let total_chunks = chunks.len();
1857 progress(0, total_chunks);
1858 let batch_size = max_batch_size.max(1);
1859 let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
1860 None
1861 } else {
1862 Some(self.dimension)
1863 };
1864 let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1865
1866 for batch_start in (0..chunks.len()).step_by(batch_size) {
1867 let batch_end = (batch_start + batch_size).min(chunks.len());
1868 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1869 .iter()
1870 .map(|chunk| chunk.embed_text.clone())
1871 .collect();
1872
1873 let vectors = embed_fn(batch_texts)?;
1874 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1875
1876 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1877 match observed_dimension {
1878 None => observed_dimension = Some(dim),
1879 Some(expected) if dim != expected => {
1880 return Err(format!(
1881 "embedding dimension changed during invalidated-file refresh: \
1882 cached index uses {expected}, new vectors use {dim}"
1883 ));
1884 }
1885 _ => {}
1886 }
1887 }
1888
1889 for (i, vector) in vectors.into_iter().enumerate() {
1890 let chunk_idx = batch_start + i;
1891 new_entries.push(EmbeddingEntry {
1892 chunk: chunks[chunk_idx].clone(),
1893 vector,
1894 });
1895 }
1896
1897 progress(new_entries.len(), total_chunks);
1898 }
1899
1900 let added_entries = new_entries.clone();
1901 self.entries.extend(new_entries);
1902 for (file, metadata) in fresh_metadata {
1903 let freshness = FileFreshness {
1904 mtime: metadata.mtime,
1905 size: metadata.size,
1906 content_hash: metadata.content_hash,
1907 };
1908 self.file_mtimes.insert(file.clone(), freshness.mtime);
1909 self.file_sizes.insert(file.clone(), freshness.size);
1910 self.file_hashes
1911 .insert(file.clone(), freshness.content_hash);
1912 updated_metadata.push((file, freshness));
1913 }
1914 if let Some(dim) = observed_dimension {
1915 self.dimension = dim;
1916 }
1917
1918 Ok(InvalidatedFilesRefresh {
1919 added_entries,
1920 updated_metadata,
1921 completed_paths: requested_paths,
1922 summary: RefreshSummary {
1923 changed,
1924 added,
1925 deleted,
1926 total_processed,
1927 },
1928 })
1929 }
1930
1931 pub fn apply_refresh_update(
1932 &mut self,
1933 added_entries: Vec<EmbeddingEntry>,
1934 updated_metadata: Vec<(PathBuf, FileFreshness)>,
1935 completed_paths: &[PathBuf],
1936 ) {
1937 self.remove_indexed_files(completed_paths);
1938
1939 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1940 self.entries.extend(added_entries);
1941 for (file, freshness) in updated_metadata {
1942 self.file_mtimes.insert(file.clone(), freshness.mtime);
1943 self.file_sizes.insert(file.clone(), freshness.size);
1944 self.file_hashes.insert(file, freshness.content_hash);
1945 }
1946 if let Some(dim) = observed_dimension {
1947 self.dimension = dim;
1948 }
1949 }
1950
1951 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1952 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1953 self.entries
1954 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1955 for path in files {
1956 self.file_mtimes.remove(path);
1957 self.file_sizes.remove(path);
1958 self.file_hashes.remove(path);
1959 }
1960 }
1961
1962 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1964 if self.entries.is_empty() || query_vector.len() != self.dimension {
1965 return Vec::new();
1966 }
1967
1968 let mut scored: Vec<(f32, usize)> = self
1969 .entries
1970 .iter()
1971 .enumerate()
1972 .map(|(i, entry)| {
1973 let mut score = cosine_similarity(query_vector, &entry.vector);
1974 if entry.chunk.exported {
1975 score *= 1.1;
1976 }
1977 (score, i)
1978 })
1979 .collect();
1980
1981 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1983
1984 scored
1985 .into_iter()
1986 .take(top_k)
1987 .map(|(score, idx)| {
1991 let entry = &self.entries[idx];
1992 SemanticResult {
1993 file: entry.chunk.file.clone(),
1994 name: entry.chunk.name.clone(),
1995 kind: entry.chunk.kind.clone(),
1996 start_line: entry.chunk.start_line,
1997 end_line: entry.chunk.end_line,
1998 exported: entry.chunk.exported,
1999 snippet: entry.chunk.snippet.clone(),
2000 score,
2001 source: "semantic",
2002 }
2003 })
2004 .collect()
2005 }
2006
2007 pub fn len(&self) -> usize {
2009 self.entries.len()
2010 }
2011
2012 pub fn is_file_stale(&self, file: &Path) -> bool {
2014 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2015 return true;
2016 };
2017 let Some(stored_size) = self.file_sizes.get(file) else {
2018 return true;
2019 };
2020 let Some(stored_hash) = self.file_hashes.get(file) else {
2021 return true;
2022 };
2023 let cached = FileFreshness {
2024 mtime: *stored_mtime,
2025 size: *stored_size,
2026 content_hash: *stored_hash,
2027 };
2028 match cache_freshness::verify_file_strict(file, &cached) {
2029 FreshnessVerdict::HotFresh => false,
2030 FreshnessVerdict::ContentFresh { .. } => false,
2031 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2032 }
2033 }
2034
2035 fn backfill_missing_file_sizes(&mut self) {
2036 for path in self.file_mtimes.keys() {
2037 if self.file_sizes.contains_key(path) {
2038 continue;
2039 }
2040 if let Ok(metadata) = fs::metadata(path) {
2041 self.file_sizes.insert(path.clone(), metadata.len());
2042 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2043 self.file_hashes.insert(path.clone(), hash);
2044 }
2045 }
2046 }
2047 }
2048
2049 pub fn remove_file(&mut self, file: &Path) {
2051 self.invalidate_file(file);
2052 }
2053
2054 pub fn invalidate_file(&mut self, file: &Path) {
2055 let canonical_file = canonicalize_existing_or_deleted_path(file);
2056 self.entries
2057 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2058 self.file_mtimes.remove(file);
2059 self.file_sizes.remove(file);
2060 self.file_hashes.remove(file);
2061 if canonical_file.as_path() != file {
2062 self.file_mtimes.remove(&canonical_file);
2063 self.file_sizes.remove(&canonical_file);
2064 self.file_hashes.remove(&canonical_file);
2065 }
2066 }
2067
2068 pub fn dimension(&self) -> usize {
2070 self.dimension
2071 }
2072
2073 pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2074 self.fingerprint.as_ref()
2075 }
2076
2077 pub fn backend_label(&self) -> Option<&str> {
2078 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2079 }
2080
2081 pub fn model_label(&self) -> Option<&str> {
2082 self.fingerprint.as_ref().map(|f| f.model.as_str())
2083 }
2084
2085 pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2086 self.fingerprint = Some(fingerprint);
2087 }
2088
2089 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2091 if self.entries.is_empty() {
2094 slog_info!("skipping semantic index persistence (0 entries)");
2095 return;
2096 }
2097 let dir = storage_dir.join("semantic").join(project_key);
2098 if let Err(e) = fs::create_dir_all(&dir) {
2099 slog_warn!("failed to create semantic cache dir: {}", e);
2100 return;
2101 }
2102 let data_path = dir.join("semantic.bin");
2103 let tmp_path = dir.join(format!(
2104 "semantic.bin.tmp.{}.{}",
2105 std::process::id(),
2106 SystemTime::now()
2107 .duration_since(SystemTime::UNIX_EPOCH)
2108 .unwrap_or(Duration::ZERO)
2109 .as_nanos()
2110 ));
2111 let bytes = self.to_bytes();
2112 let write_result = (|| -> std::io::Result<()> {
2113 use std::io::Write;
2114 let mut file = fs::File::create(&tmp_path)?;
2115 file.write_all(&bytes)?;
2116 file.sync_all()?;
2117 Ok(())
2118 })();
2119 if let Err(e) = write_result {
2120 slog_warn!("failed to write semantic index: {}", e);
2121 let _ = fs::remove_file(&tmp_path);
2122 return;
2123 }
2124 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2125 slog_warn!("failed to rename semantic index: {}", e);
2126 let _ = fs::remove_file(&tmp_path);
2127 return;
2128 }
2129 slog_info!(
2130 "semantic index persisted: {} entries, {:.1} KB",
2131 self.entries.len(),
2132 bytes.len() as f64 / 1024.0
2133 );
2134 }
2135
2136 pub fn read_from_disk(
2138 storage_dir: &Path,
2139 project_key: &str,
2140 current_canonical_root: &Path,
2141 is_worktree_bridge: bool,
2142 expected_fingerprint: Option<&str>,
2143 ) -> Option<Self> {
2144 debug_assert!(current_canonical_root.is_absolute());
2145 let data_path = storage_dir
2146 .join("semantic")
2147 .join(project_key)
2148 .join("semantic.bin");
2149 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2150 if file_len < HEADER_BYTES_V1 {
2151 slog_warn!(
2152 "corrupt semantic index (too small: {} bytes), removing",
2153 file_len
2154 );
2155 if !is_worktree_bridge {
2156 let _ = fs::remove_file(&data_path);
2157 }
2158 return None;
2159 }
2160
2161 let bytes = fs::read(&data_path).ok()?;
2162 let version = bytes[0];
2163 if version != SEMANTIC_INDEX_VERSION_V6 {
2164 slog_info!(
2165 "cached semantic index version {} is older than {}, rebuilding",
2166 version,
2167 SEMANTIC_INDEX_VERSION_V6
2168 );
2169 if !is_worktree_bridge {
2170 let _ = fs::remove_file(&data_path);
2171 }
2172 return None;
2173 }
2174 match Self::from_bytes(&bytes, current_canonical_root) {
2175 Ok(index) => {
2176 if index.entries.is_empty() {
2177 slog_info!("cached semantic index is empty, will rebuild");
2178 if !is_worktree_bridge {
2179 let _ = fs::remove_file(&data_path);
2180 }
2181 return None;
2182 }
2183 if let Some(expected) = expected_fingerprint {
2184 let matches = index
2185 .fingerprint()
2186 .map(|fingerprint| fingerprint.matches_expected(expected))
2187 .unwrap_or(false);
2188 if !matches {
2189 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2190 if !is_worktree_bridge {
2191 let _ = fs::remove_file(&data_path);
2192 }
2193 return None;
2194 }
2195 }
2196 slog_info!(
2197 "loaded semantic index from disk: {} entries",
2198 index.entries.len()
2199 );
2200 Some(index)
2201 }
2202 Err(e) => {
2203 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2204 if !is_worktree_bridge {
2205 let _ = fs::remove_file(&data_path);
2206 }
2207 None
2208 }
2209 }
2210 }
2211
2212 pub fn to_bytes(&self) -> Vec<u8> {
2214 let mut buf = Vec::new();
2215 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2216 let encoded = fingerprint.as_string();
2217 if encoded.is_empty() {
2218 None
2219 } else {
2220 Some(encoded.into_bytes())
2221 }
2222 });
2223 let file_mtimes: Vec<_> = self
2224 .file_mtimes
2225 .iter()
2226 .filter_map(|(path, mtime)| {
2227 cache_relative_path(&self.project_root, path)
2228 .map(|relative| (relative, path, mtime))
2229 })
2230 .collect();
2231 let entries: Vec<_> = self
2232 .entries
2233 .iter()
2234 .filter_map(|entry| {
2235 cache_relative_path(&self.project_root, &entry.chunk.file)
2236 .map(|relative| (relative, entry))
2237 })
2238 .collect();
2239
2240 let version = SEMANTIC_INDEX_VERSION_V6;
2253 buf.push(version);
2254 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2255 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2256 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2257 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2258 buf.extend_from_slice(fp_bytes_ref);
2259
2260 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2263 for (relative, path, mtime) in &file_mtimes {
2264 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2265 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2266 buf.extend_from_slice(&path_bytes);
2267 let duration = mtime
2268 .duration_since(SystemTime::UNIX_EPOCH)
2269 .unwrap_or_default();
2270 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2271 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2272 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2273 buf.extend_from_slice(&size.to_le_bytes());
2274 let hash = self
2275 .file_hashes
2276 .get(*path)
2277 .copied()
2278 .unwrap_or_else(cache_freshness::zero_hash);
2279 buf.extend_from_slice(hash.as_bytes());
2280 }
2281
2282 for (relative, entry) in &entries {
2284 let c = &entry.chunk;
2285
2286 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2288 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2289 buf.extend_from_slice(&file_bytes);
2290
2291 let name_bytes = c.name.as_bytes();
2293 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2294 buf.extend_from_slice(name_bytes);
2295
2296 buf.push(symbol_kind_to_u8(&c.kind));
2298
2299 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2301 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2302 buf.push(c.exported as u8);
2303
2304 let snippet_bytes = c.snippet.as_bytes();
2306 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2307 buf.extend_from_slice(snippet_bytes);
2308
2309 let embed_bytes = c.embed_text.as_bytes();
2311 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2312 buf.extend_from_slice(embed_bytes);
2313
2314 for &val in &entry.vector {
2316 buf.extend_from_slice(&val.to_le_bytes());
2317 }
2318 }
2319
2320 buf
2321 }
2322
2323 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2325 debug_assert!(current_canonical_root.is_absolute());
2326 let mut pos = 0;
2327
2328 if data.len() < HEADER_BYTES_V1 {
2329 return Err("data too short".to_string());
2330 }
2331
2332 let version = data[pos];
2333 pos += 1;
2334 if version != SEMANTIC_INDEX_VERSION_V1
2335 && version != SEMANTIC_INDEX_VERSION_V2
2336 && version != SEMANTIC_INDEX_VERSION_V3
2337 && version != SEMANTIC_INDEX_VERSION_V4
2338 && version != SEMANTIC_INDEX_VERSION_V5
2339 && version != SEMANTIC_INDEX_VERSION_V6
2340 {
2341 return Err(format!("unsupported version: {}", version));
2342 }
2343 if (version == SEMANTIC_INDEX_VERSION_V2
2347 || version == SEMANTIC_INDEX_VERSION_V3
2348 || version == SEMANTIC_INDEX_VERSION_V4
2349 || version == SEMANTIC_INDEX_VERSION_V5
2350 || version == SEMANTIC_INDEX_VERSION_V6)
2351 && data.len() < HEADER_BYTES_V2
2352 {
2353 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2354 }
2355
2356 let dimension = read_u32(data, &mut pos)? as usize;
2357 let entry_count = read_u32(data, &mut pos)? as usize;
2358 validate_embedding_dimension(dimension)?;
2359 if entry_count > MAX_ENTRIES {
2360 return Err(format!("too many semantic index entries: {}", entry_count));
2361 }
2362
2363 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2369 || version == SEMANTIC_INDEX_VERSION_V3
2370 || version == SEMANTIC_INDEX_VERSION_V4
2371 || version == SEMANTIC_INDEX_VERSION_V5
2372 || version == SEMANTIC_INDEX_VERSION_V6;
2373 let fingerprint = if has_fingerprint_field {
2374 let fingerprint_len = read_u32(data, &mut pos)? as usize;
2375 if pos + fingerprint_len > data.len() {
2376 return Err("unexpected end of data reading fingerprint".to_string());
2377 }
2378 if fingerprint_len == 0 {
2379 None
2380 } else {
2381 let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2382 pos += fingerprint_len;
2383 Some(
2384 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2385 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2386 )
2387 }
2388 } else {
2389 None
2390 };
2391
2392 let mtime_count = read_u32(data, &mut pos)? as usize;
2394 if mtime_count > MAX_ENTRIES {
2395 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2396 }
2397
2398 let vector_bytes = entry_count
2399 .checked_mul(dimension)
2400 .and_then(|count| count.checked_mul(F32_BYTES))
2401 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2402 if vector_bytes > data.len().saturating_sub(pos) {
2403 return Err("semantic index vectors exceed available data".to_string());
2404 }
2405
2406 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2407 let mut file_sizes = HashMap::with_capacity(mtime_count);
2408 let mut file_hashes = HashMap::with_capacity(mtime_count);
2409 for _ in 0..mtime_count {
2410 let path = read_string(data, &mut pos)?;
2411 let secs = read_u64(data, &mut pos)?;
2412 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2418 || version == SEMANTIC_INDEX_VERSION_V4
2419 || version == SEMANTIC_INDEX_VERSION_V5
2420 || version == SEMANTIC_INDEX_VERSION_V6
2421 {
2422 read_u32(data, &mut pos)?
2423 } else {
2424 0
2425 };
2426 let size =
2427 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2428 read_u64(data, &mut pos)?
2429 } else {
2430 0
2431 };
2432 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2433 if pos + 32 > data.len() {
2434 return Err("unexpected end of data reading content hash".to_string());
2435 }
2436 let mut hash_bytes = [0u8; 32];
2437 hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2438 pos += 32;
2439 blake3::Hash::from_bytes(hash_bytes)
2440 } else {
2441 cache_freshness::zero_hash()
2442 };
2443 if nanos >= 1_000_000_000 {
2450 return Err(format!(
2451 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2452 nanos
2453 ));
2454 }
2455 let duration = std::time::Duration::new(secs, nanos);
2456 let mtime = SystemTime::UNIX_EPOCH
2457 .checked_add(duration)
2458 .ok_or_else(|| {
2459 format!(
2460 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2461 secs, nanos
2462 )
2463 })?;
2464 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2465 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2466 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2467 } else {
2468 PathBuf::from(path)
2469 };
2470 file_mtimes.insert(path.clone(), mtime);
2471 file_sizes.insert(path.clone(), size);
2472 file_hashes.insert(path, content_hash);
2473 }
2474
2475 let mut entries = Vec::with_capacity(entry_count);
2477 for _ in 0..entry_count {
2478 let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2479 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2480 cached_path_under_root(current_canonical_root, &raw_file)
2481 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2482 } else {
2483 raw_file
2484 };
2485 let name = read_string(data, &mut pos)?;
2486
2487 if pos >= data.len() {
2488 return Err("unexpected end of data".to_string());
2489 }
2490 let kind = u8_to_symbol_kind(data[pos]);
2491 pos += 1;
2492
2493 let start_line = read_u32(data, &mut pos)?;
2494 let end_line = read_u32(data, &mut pos)?;
2495
2496 if pos >= data.len() {
2497 return Err("unexpected end of data".to_string());
2498 }
2499 let exported = data[pos] != 0;
2500 pos += 1;
2501
2502 let snippet = read_string(data, &mut pos)?;
2503 let embed_text = read_string(data, &mut pos)?;
2504
2505 let vec_bytes = dimension
2507 .checked_mul(F32_BYTES)
2508 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2509 if pos + vec_bytes > data.len() {
2510 return Err("unexpected end of data reading vector".to_string());
2511 }
2512 let mut vector = Vec::with_capacity(dimension);
2513 for _ in 0..dimension {
2514 let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2515 vector.push(f32::from_le_bytes(bytes));
2516 pos += 4;
2517 }
2518
2519 entries.push(EmbeddingEntry {
2520 chunk: SemanticChunk {
2521 file,
2522 name,
2523 kind,
2524 start_line,
2525 end_line,
2526 exported,
2527 embed_text,
2528 snippet,
2529 },
2530 vector,
2531 });
2532 }
2533
2534 if entries.len() != entry_count {
2535 return Err(format!(
2536 "semantic cache entry count drift: header={} decoded={}",
2537 entry_count,
2538 entries.len()
2539 ));
2540 }
2541 for entry in &entries {
2542 if !file_mtimes.contains_key(&entry.chunk.file) {
2543 return Err(format!(
2544 "semantic cache metadata missing for entry file {}",
2545 entry.chunk.file.display()
2546 ));
2547 }
2548 }
2549
2550 Ok(Self {
2551 entries,
2552 file_mtimes,
2553 file_sizes,
2554 file_hashes,
2555 dimension,
2556 fingerprint,
2557 project_root: current_canonical_root.to_path_buf(),
2558 deferred_files: HashSet::new(),
2559 })
2560 }
2561}
2562
2563fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2565 let relative = file
2566 .strip_prefix(project_root)
2567 .unwrap_or(file)
2568 .to_string_lossy();
2569
2570 let kind_label = match &symbol.kind {
2571 SymbolKind::Function => "function",
2572 SymbolKind::Class => "class",
2573 SymbolKind::Method => "method",
2574 SymbolKind::Struct => "struct",
2575 SymbolKind::Interface => "interface",
2576 SymbolKind::Enum => "enum",
2577 SymbolKind::TypeAlias => "type",
2578 SymbolKind::Variable => "variable",
2579 SymbolKind::Heading => "heading",
2580 SymbolKind::FileSummary => "file-summary",
2581 };
2582
2583 let name = &symbol.name;
2585 let mut text = format!(
2586 "name:{name} file:{} kind:{} name:{name}",
2587 relative, kind_label
2588 );
2589
2590 if let Some(sig) = &symbol.signature {
2591 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2599 }
2600
2601 let lines: Vec<&str> = source.lines().collect();
2603 let start = (symbol.range.start_line as usize).min(lines.len());
2604 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2606 if start < end {
2607 let body: String = lines[start..end]
2608 .iter()
2609 .take(15) .copied()
2611 .collect::<Vec<&str>>()
2612 .join("\n");
2613 let snippet = if body.len() > 300 {
2614 format!("{}...", &body[..body.floor_char_boundary(300)])
2615 } else {
2616 body
2617 };
2618 text.push_str(&format!(" body:{}", snippet));
2619 }
2620
2621 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2626}
2627
/// Character cap applied to the embed text built for symbol chunks and
/// file-summary chunks.
const MAX_EMBED_TEXT_CHARS: usize = 1600;

/// Returns the first `max_chars` characters of `value` as an owned string.
///
/// Counting is per `char`, so the cut never lands inside a multi-byte
/// sequence. Strings with `max_chars` characters or fewer are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        // Byte offset of the first character that must be dropped.
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2636
2637fn first_leading_doc_comment(source: &str) -> String {
2638 let lines: Vec<&str> = source.lines().collect();
2639 let Some((start, first)) = lines
2640 .iter()
2641 .enumerate()
2642 .find(|(_, line)| !line.trim().is_empty())
2643 else {
2644 return String::new();
2645 };
2646
2647 let trimmed = first.trim_start();
2648 if trimmed.starts_with("/**") {
2649 let mut comment = Vec::new();
2650 for line in lines.iter().skip(start) {
2651 comment.push(*line);
2652 if line.contains("*/") {
2653 break;
2654 }
2655 }
2656 return truncate_chars(&comment.join("\n"), 200);
2657 }
2658
2659 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2660 let comment = lines
2661 .iter()
2662 .skip(start)
2663 .take_while(|line| {
2664 let trimmed = line.trim_start();
2665 trimmed.starts_with("///") || trimmed.starts_with("//!")
2666 })
2667 .copied()
2668 .collect::<Vec<_>>()
2669 .join("\n");
2670 return truncate_chars(&comment, 200);
2671 }
2672
2673 String::new()
2674}
2675
2676pub fn build_file_summary_chunk(
2677 file: &Path,
2678 project_root: &Path,
2679 source: &str,
2680 top_exports: &[&str],
2681 top_export_signatures: &[Option<&str>],
2682) -> SemanticChunk {
2683 let relative = file.strip_prefix(project_root).unwrap_or(file);
2684 let rel_path = relative.to_string_lossy();
2685 let parent_dir = relative
2686 .parent()
2687 .map(|parent| parent.to_string_lossy().to_string())
2688 .unwrap_or_default();
2689 let name = file
2690 .file_stem()
2691 .map(|stem| stem.to_string_lossy().to_string())
2692 .unwrap_or_default();
2693 let doc = first_leading_doc_comment(source);
2694 let exports = top_exports
2695 .iter()
2696 .take(5)
2697 .copied()
2698 .collect::<Vec<_>>()
2699 .join(",");
2700 let snippet = if doc.is_empty() {
2701 top_export_signatures
2702 .first()
2703 .and_then(|signature| signature.as_deref())
2704 .map(|signature| truncate_chars(signature, 200))
2705 .unwrap_or_default()
2706 } else {
2707 doc.clone()
2708 };
2709
2710 SemanticChunk {
2711 file: file.to_path_buf(),
2712 name,
2713 kind: SymbolKind::FileSummary,
2714 start_line: 0,
2715 end_line: 0,
2716 exported: false,
2717 embed_text: truncate_chars(
2718 &format!(
2719 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2720 file.file_stem()
2721 .map(|stem| stem.to_string_lossy().to_string())
2722 .unwrap_or_default()
2723 ),
2724 MAX_EMBED_TEXT_CHARS,
2725 ),
2726 snippet,
2727 }
2728}
2729
2730fn parser_for(
2731 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2732 lang: crate::parser::LangId,
2733) -> Result<&mut Parser, String> {
2734 use std::collections::hash_map::Entry;
2735
2736 match parsers.entry(lang) {
2737 Entry::Occupied(entry) => Ok(entry.into_mut()),
2738 Entry::Vacant(entry) => {
2739 let grammar = grammar_for(lang);
2740 let mut parser = Parser::new();
2741 parser
2742 .set_language(&grammar)
2743 .map_err(|error| error.to_string())?;
2744 Ok(entry.insert(parser))
2745 }
2746 }
2747}
2748
/// Reports whether `path` has one of the file extensions accepted for
/// semantic indexing.
///
/// The comparison is exact (case-sensitive). Paths without an extension, or
/// whose extension is not valid UTF-8, are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2781
2782fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2783 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2784 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2785 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2786 .map_err(|error| error.to_string())?
2787 .unwrap_or_else(cache_freshness::zero_hash);
2788 Ok(IndexedFileMetadata {
2789 mtime,
2790 size: metadata.len(),
2791 content_hash,
2792 })
2793}
2794
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// Resolution order:
/// 1. `fs::canonicalize(path)` when it succeeds.
/// 2. Otherwise the canonicalized parent directory joined with the original
///    file name.
/// 3. Otherwise (no parent, no file name, or the parent cannot be
///    canonicalized) `path` unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }

    match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
            Ok(canonical_parent) => canonical_parent.join(file_name),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    }
}
2811
2812fn collect_file_chunks(
2813 project_root: &Path,
2814 file: &Path,
2815 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2816) -> Result<Vec<SemanticChunk>, String> {
2817 if !is_semantic_indexed_extension(file) {
2818 return Err("unsupported file extension".to_string());
2819 }
2820 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2821 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2822 let tree = parser_for(parsers, lang)?
2823 .parse(&source, None)
2824 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2825 let symbols =
2826 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2827
2828 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2829}
2830
2831fn build_snippet(symbol: &Symbol, source: &str) -> String {
2833 let lines: Vec<&str> = source.lines().collect();
2834 let start = (symbol.range.start_line as usize).min(lines.len());
2835 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2837 if start < end {
2838 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2839 let mut snippet = snippet_lines.join("\n");
2840 if end - start > 5 {
2841 snippet.push_str("\n ...");
2842 }
2843 if snippet.len() > 300 {
2844 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2845 }
2846 snippet
2847 } else {
2848 String::new()
2849 }
2850}
2851
2852fn symbols_to_chunks(
2854 file: &Path,
2855 symbols: &[Symbol],
2856 source: &str,
2857 project_root: &Path,
2858) -> Vec<SemanticChunk> {
2859 let mut chunks = Vec::new();
2860 let top_exports_with_signatures = symbols
2861 .iter()
2862 .filter(|symbol| {
2863 symbol.exported
2864 && symbol.parent.is_none()
2865 && !matches!(symbol.kind, SymbolKind::Heading)
2866 })
2867 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2868 .collect::<Vec<_>>();
2869
2870 let has_only_headings = !symbols.is_empty()
2871 && symbols
2872 .iter()
2873 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2874 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2875 let top_exports = top_exports_with_signatures
2876 .iter()
2877 .map(|(name, _)| *name)
2878 .collect::<Vec<_>>();
2879 let top_export_signatures = top_exports_with_signatures
2880 .iter()
2881 .map(|(_, signature)| *signature)
2882 .collect::<Vec<_>>();
2883 chunks.push(build_file_summary_chunk(
2884 file,
2885 project_root,
2886 source,
2887 &top_exports,
2888 &top_export_signatures,
2889 ));
2890 }
2891
2892 for symbol in symbols {
2893 if matches!(symbol.kind, SymbolKind::Heading) {
2898 continue;
2899 }
2900
2901 let line_count = symbol
2903 .range
2904 .end_line
2905 .saturating_sub(symbol.range.start_line)
2906 + 1;
2907 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2908 continue;
2909 }
2910
2911 let embed_text = build_embed_text(symbol, source, file, project_root);
2912 let snippet = build_snippet(symbol, source);
2913
2914 chunks.push(SemanticChunk {
2915 file: file.to_path_buf(),
2916 name: symbol.name.clone(),
2917 kind: symbol.kind.clone(),
2918 start_line: symbol.range.start_line,
2919 end_line: symbol.range.end_line,
2920 exported: symbol.exported,
2921 embed_text,
2922 snippet,
2923 });
2924
2925 }
2928
2929 chunks
2930}
2931
/// Cosine similarity of two equally sized vectors.
///
/// Returns `0.0` when the slices differ in length, when either vector has
/// zero magnitude (including two empty slices), or when the magnitudes are
/// not finite because an input holds NaN or an infinity. Callers therefore
/// never receive a NaN score.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 || !denom.is_finite() {
        0.0
    } else {
        dot / denom
    }
}
2955
2956fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2958 match kind {
2959 SymbolKind::Function => 0,
2960 SymbolKind::Class => 1,
2961 SymbolKind::Method => 2,
2962 SymbolKind::Struct => 3,
2963 SymbolKind::Interface => 4,
2964 SymbolKind::Enum => 5,
2965 SymbolKind::TypeAlias => 6,
2966 SymbolKind::Variable => 7,
2967 SymbolKind::Heading => 8,
2968 SymbolKind::FileSummary => 9,
2969 }
2970}
2971
2972fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2973 match v {
2974 0 => SymbolKind::Function,
2975 1 => SymbolKind::Class,
2976 2 => SymbolKind::Method,
2977 3 => SymbolKind::Struct,
2978 4 => SymbolKind::Interface,
2979 5 => SymbolKind::Enum,
2980 6 => SymbolKind::TypeAlias,
2981 7 => SymbolKind::Variable,
2982 8 => SymbolKind::Heading,
2983 9 => SymbolKind::FileSummary,
2984 _ => SymbolKind::Heading,
2985 }
2986}
2987
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by 4.
///
/// # Errors
/// Returns `"unexpected end of data reading u32"` when fewer than four bytes
/// remain. `*pos` is left untouched in that case. The bounds check uses
/// checked arithmetic, so an out-of-range cursor is reported as an error
/// instead of overflowing.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let end = (*pos)
        .checked_add(4)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut raw = [0u8; 4];
    raw.copy_from_slice(&data[*pos..end]);
    *pos = end;
    Ok(u32::from_le_bytes(raw))
}
2996
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by 8.
///
/// # Errors
/// Returns `"unexpected end of data reading u64"` when fewer than eight bytes
/// remain. `*pos` is left untouched in that case. The bounds check uses
/// checked arithmetic, so an out-of-range cursor is reported as an error
/// instead of overflowing.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let end = (*pos)
        .checked_add(8)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut raw = [0u8; 8];
    raw.copy_from_slice(&data[*pos..end]);
    *pos = end;
    Ok(u64::from_le_bytes(raw))
}
3005
3006fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
3007 let len = read_u32(data, pos)? as usize;
3008 if *pos + len > data.len() {
3009 return Err("unexpected end of data reading string".to_string());
3010 }
3011 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
3012 *pos += len;
3013 Ok(s)
3014}
3015
3016#[cfg(test)]
3017mod tests {
3018 use super::*;
3019 use crate::config::{SemanticBackend, SemanticBackendConfig};
3020 use crate::parser::FileParser;
3021 use std::io::{Read, Write};
3022 use std::net::TcpListener;
3023 use std::thread;
3024
3025 #[test]
3026 fn semantic_index_includes_php_inc_and_scss_extensions() {
3027 for file in ["partial.inc", "index.php", "styles.scss"] {
3028 assert!(
3029 is_semantic_indexed_extension(Path::new(file)),
3030 "{file} should be semantic-index eligible"
3031 );
3032 }
3033 }
3034
3035 #[test]
3036 fn transient_marker_round_trips_and_classifies() {
3037 let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3040 assert!(embedding_failure_is_transient(&marked));
3041 let clean = strip_transient_embedding_marker(&marked);
3042 assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3043 assert!(clean.starts_with("openai compatible request failed:"));
3044
3045 for permanent in [
3048 "openai compatible request failed (HTTP 401): Unauthorized",
3049 "embedding dimension mismatch: index has 384, model returned 768",
3050 "too many files (>20000) for semantic indexing (max 20000)",
3051 ] {
3052 assert!(
3053 !embedding_failure_is_transient(permanent),
3054 "{permanent:?} must not be transient"
3055 );
3056 assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3058 }
3059 }
3060
3061 #[test]
3062 fn send_error_transience_separates_connect_timeout_from_4xx() {
3063 assert!(is_retryable_embedding_status(
3065 reqwest::StatusCode::INTERNAL_SERVER_ERROR
3066 ));
3067 assert!(is_retryable_embedding_status(
3068 reqwest::StatusCode::TOO_MANY_REQUESTS
3069 ));
3070 assert!(!is_retryable_embedding_status(
3071 reqwest::StatusCode::UNAUTHORIZED
3072 ));
3073 assert!(!is_retryable_embedding_status(
3074 reqwest::StatusCode::BAD_REQUEST
3075 ));
3076 }
3077
3078 #[test]
3079 fn local_backend_model_loading_body_is_transient() {
3080 for body in [
3083 r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3084 r#"{"error":"model is loading, please wait"}"#,
3085 r#"{"error":"Model not loaded"}"#,
3086 "Loading model into memory",
3087 ] {
3088 assert!(
3089 embedding_response_body_is_transient(body),
3090 "{body:?} should be body-transient"
3091 );
3092 }
3093
3094 for body in [
3096 r#"{"error":"invalid api key"}"#,
3097 r#"{"error":"model 'foo' not found"}"#,
3098 "Bad Request: unknown field",
3099 ] {
3100 assert!(
3101 !embedding_response_body_is_transient(body),
3102 "{body:?} must not be body-transient"
3103 );
3104 }
3105 }
3106
3107 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3108 where
3109 F: Fn(String, String, String) -> String + Send + 'static,
3110 {
3111 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3112 let addr = listener.local_addr().expect("local addr");
3113 let handle = thread::spawn(move || {
3114 let (mut stream, _) = listener.accept().expect("accept request");
3115 let mut buf = Vec::new();
3116 let mut chunk = [0u8; 4096];
3117 let mut header_end = None;
3118 let mut content_length = 0usize;
3119 loop {
3120 let n = stream.read(&mut chunk).expect("read request");
3121 if n == 0 {
3122 break;
3123 }
3124 buf.extend_from_slice(&chunk[..n]);
3125 if header_end.is_none() {
3126 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3127 header_end = Some(pos + 4);
3128 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3129 for line in headers.lines() {
3130 if let Some(value) = line.strip_prefix("Content-Length:") {
3131 content_length = value.trim().parse::<usize>().unwrap_or(0);
3132 }
3133 }
3134 }
3135 }
3136 if let Some(end) = header_end {
3137 if buf.len() >= end + content_length {
3138 break;
3139 }
3140 }
3141 }
3142
3143 let end = header_end.expect("header terminator");
3144 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3145 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3146 let mut lines = request.lines();
3147 let request_line = lines.next().expect("request line").to_string();
3148 let path = request_line
3149 .split_whitespace()
3150 .nth(1)
3151 .expect("request path")
3152 .to_string();
3153 let response_body = handler(request_line, path, body);
3154 let response = format!(
3155 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3156 response_body.len(),
3157 response_body
3158 );
3159 stream
3160 .write_all(response.as_bytes())
3161 .expect("write response");
3162 });
3163
3164 (format!("http://{}", addr), handle)
3165 }
3166
3167 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3168 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3169 }
3170
3171 fn write_rust_file(path: &Path, function_name: &str) {
3172 fs::write(
3173 path,
3174 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3175 )
3176 .unwrap();
3177 }
3178
3179 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3180 let mut embed = test_vector_for_texts;
3181 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3182 }
3183
3184 fn test_project_root() -> PathBuf {
3185 std::env::current_dir().unwrap()
3186 }
3187
3188 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3189 index.file_mtimes.insert(file.to_path_buf(), mtime);
3190 index.file_sizes.insert(file.to_path_buf(), size);
3191 index
3192 .file_hashes
3193 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3194 }
3195
3196 #[test]
3197 fn semantic_cache_serialization_skips_paths_outside_project_root() {
3198 let dir = tempfile::tempdir().expect("create temp dir");
3199 let project = fs::canonicalize(dir.path()).expect("canonical project");
3200 let outside = project.join("..").join("outside.rs");
3201 let mut index = SemanticIndex::new(project.clone(), 3);
3202 index
3203 .file_mtimes
3204 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
3205 index.file_sizes.insert(outside.clone(), 1);
3206 index
3207 .file_hashes
3208 .insert(outside.clone(), cache_freshness::zero_hash());
3209 index.entries.push(EmbeddingEntry {
3210 chunk: SemanticChunk {
3211 file: outside,
3212 name: "outside".to_string(),
3213 kind: SymbolKind::Function,
3214 start_line: 0,
3215 end_line: 0,
3216 exported: false,
3217 embed_text: "outside".to_string(),
3218 snippet: "outside".to_string(),
3219 },
3220 vector: vec![1.0, 0.0, 0.0],
3221 });
3222
3223 let bytes = index.to_bytes();
3224 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
3225 assert_eq!(loaded.entries.len(), 0);
3226 assert!(loaded.file_mtimes.is_empty());
3227 }
3228
3229 #[test]
3230 fn test_cosine_similarity_identical() {
3231 let a = vec![1.0, 0.0, 0.0];
3232 let b = vec![1.0, 0.0, 0.0];
3233 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
3234 }
3235
3236 #[test]
3237 fn test_cosine_similarity_orthogonal() {
3238 let a = vec![1.0, 0.0, 0.0];
3239 let b = vec![0.0, 1.0, 0.0];
3240 assert!(cosine_similarity(&a, &b).abs() < 0.001);
3241 }
3242
3243 #[test]
3244 fn test_cosine_similarity_opposite() {
3245 let a = vec![1.0, 0.0, 0.0];
3246 let b = vec![-1.0, 0.0, 0.0];
3247 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
3248 }
3249
3250 #[test]
3251 fn test_serialization_roundtrip() {
3252 let project_root = test_project_root();
3253 let file = project_root.join("src/main.rs");
3254 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3255 index.entries.push(EmbeddingEntry {
3256 chunk: SemanticChunk {
3257 file: file.clone(),
3258 name: "handle_request".to_string(),
3259 kind: SymbolKind::Function,
3260 start_line: 10,
3261 end_line: 25,
3262 exported: true,
3263 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3264 snippet: "fn handle_request() {\n // ...\n}".to_string(),
3265 },
3266 vector: vec![0.1, 0.2, 0.3, 0.4],
3267 });
3268 index.dimension = 4;
3269 index
3270 .file_mtimes
3271 .insert(file.clone(), SystemTime::UNIX_EPOCH);
3272 index.file_sizes.insert(file, 0);
3273 index.set_fingerprint(SemanticIndexFingerprint {
3274 backend: "fastembed".to_string(),
3275 model: "all-MiniLM-L6-v2".to_string(),
3276 base_url: FALLBACK_BACKEND.to_string(),
3277 dimension: 4,
3278 chunking_version: default_chunking_version(),
3279 });
3280
3281 let bytes = index.to_bytes();
3282 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
3283
3284 assert_eq!(restored.entries.len(), 1);
3285 assert_eq!(restored.entries[0].chunk.name, "handle_request");
3286 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
3287 assert_eq!(restored.dimension, 4);
3288 assert_eq!(restored.backend_label(), Some("fastembed"));
3289 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
3290 }
3291
3292 #[test]
3293 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
3294 let cases = [
3295 (SymbolKind::Function, 0),
3296 (SymbolKind::Class, 1),
3297 (SymbolKind::Method, 2),
3298 (SymbolKind::Struct, 3),
3299 (SymbolKind::Interface, 4),
3300 (SymbolKind::Enum, 5),
3301 (SymbolKind::TypeAlias, 6),
3302 (SymbolKind::Variable, 7),
3303 (SymbolKind::Heading, 8),
3304 (SymbolKind::FileSummary, 9),
3305 ];
3306
3307 for (kind, encoded) in cases {
3308 assert_eq!(symbol_kind_to_u8(&kind), encoded);
3309 assert_eq!(u8_to_symbol_kind(encoded), kind);
3310 }
3311 }
3312
3313 #[test]
3314 fn test_search_top_k() {
3315 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3316 index.dimension = 3;
3317
3318 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
3320 let mut vec = vec![0.0f32; 3];
3321 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
3323 chunk: SemanticChunk {
3324 file: PathBuf::from("/src/lib.rs"),
3325 name: name.to_string(),
3326 kind: SymbolKind::Function,
3327 start_line: (i * 10 + 1) as u32,
3328 end_line: (i * 10 + 5) as u32,
3329 exported: true,
3330 embed_text: format!("kind:function name:{}", name),
3331 snippet: format!("fn {}() {{}}", name),
3332 },
3333 vector: vec,
3334 });
3335 }
3336
3337 let query = vec![0.9, 0.1, 0.0];
3339 let results = index.search(&query, 2);
3340
3341 assert_eq!(results.len(), 2);
3342 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
3344 }
3345
3346 #[test]
3347 fn test_empty_index_search() {
3348 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3349 let results = index.search(&[0.1, 0.2, 0.3], 10);
3350 assert!(results.is_empty());
3351 }
3352
3353 #[test]
3354 fn single_line_symbol_builds_non_empty_snippet() {
3355 let symbol = Symbol {
3356 name: "answer".to_string(),
3357 kind: SymbolKind::Variable,
3358 range: crate::symbols::Range {
3359 start_line: 0,
3360 start_col: 0,
3361 end_line: 0,
3362 end_col: 24,
3363 },
3364 signature: Some("const answer = 42".to_string()),
3365 scope_chain: Vec::new(),
3366 exported: true,
3367 parent: None,
3368 };
3369 let source = "export const answer = 42;\n";
3370
3371 let snippet = build_snippet(&symbol, source);
3372
3373 assert_eq!(snippet, "export const answer = 42;");
3374 }
3375
3376 #[test]
3377 fn optimized_file_chunk_collection_matches_file_parser_path() {
3378 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
3379 let file = project_root.join("src/semantic_index.rs");
3380 let source = std::fs::read_to_string(&file).unwrap();
3381
3382 let mut legacy_parser = FileParser::new();
3383 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
3384 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
3385
3386 let mut parsers = HashMap::new();
3387 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
3388
3389 assert_eq!(
3390 chunk_fingerprint(&optimized_chunks),
3391 chunk_fingerprint(&legacy_chunks)
3392 );
3393 }
3394
3395 fn chunk_fingerprint(
3396 chunks: &[SemanticChunk],
3397 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3398 chunks
3399 .iter()
3400 .map(|chunk| {
3401 (
3402 chunk.name.clone(),
3403 chunk.kind.clone(),
3404 chunk.start_line,
3405 chunk.end_line,
3406 chunk.exported,
3407 chunk.embed_text.clone(),
3408 chunk.snippet.clone(),
3409 )
3410 })
3411 .collect()
3412 }
3413
3414 #[test]
3415 fn rejects_oversized_dimension_during_deserialization() {
3416 let mut bytes = Vec::new();
3417 bytes.push(1u8);
3418 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
3419 bytes.extend_from_slice(&0u32.to_le_bytes());
3420 bytes.extend_from_slice(&0u32.to_le_bytes());
3421
3422 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3423 }
3424
3425 #[test]
3426 fn rejects_oversized_entry_count_during_deserialization() {
3427 let mut bytes = Vec::new();
3428 bytes.push(1u8);
3429 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
3430 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
3431 bytes.extend_from_slice(&0u32.to_le_bytes());
3432
3433 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3434 }
3435
3436 #[test]
3437 fn invalidate_file_removes_entries_and_mtime() {
3438 let target = PathBuf::from("/src/main.rs");
3439 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3440 index.entries.push(EmbeddingEntry {
3441 chunk: SemanticChunk {
3442 file: target.clone(),
3443 name: "main".to_string(),
3444 kind: SymbolKind::Function,
3445 start_line: 0,
3446 end_line: 1,
3447 exported: false,
3448 embed_text: "main".to_string(),
3449 snippet: "fn main() {}".to_string(),
3450 },
3451 vector: vec![1.0; DEFAULT_DIMENSION],
3452 });
3453 index
3454 .file_mtimes
3455 .insert(target.clone(), SystemTime::UNIX_EPOCH);
3456 index.file_sizes.insert(target.clone(), 0);
3457
3458 index.invalidate_file(&target);
3459
3460 assert!(index.entries.is_empty());
3461 assert!(!index.file_mtimes.contains_key(&target));
3462 assert!(!index.file_sizes.contains_key(&target));
3463 }
3464
3465 #[test]
3466 fn refresh_missing_changed_file_is_purged_after_collect() {
3467 let temp = tempfile::tempdir().unwrap();
3468 let project_root = temp.path();
3469 let file = project_root.join("src/lib.rs");
3470 fs::create_dir_all(file.parent().unwrap()).unwrap();
3471 write_rust_file(&file, "vanished_symbol");
3472
3473 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3474 let original_size = *index.file_sizes.get(&file).unwrap();
3475 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
3476 fs::remove_file(&file).unwrap();
3477
3478 let mut embed = test_vector_for_texts;
3479 let mut progress = |_done: usize, _total: usize| {};
3480 let summary = index
3481 .refresh_stale_files(
3482 project_root,
3483 std::slice::from_ref(&file),
3484 &mut embed,
3485 8,
3486 &mut progress,
3487 )
3488 .unwrap();
3489
3490 assert_eq!(summary.changed, 0);
3491 assert_eq!(summary.added, 0);
3492 assert_eq!(summary.deleted, 1);
3493 assert!(index.entries.is_empty());
3494 assert!(!index.file_mtimes.contains_key(&file));
3495 assert!(!index.file_sizes.contains_key(&file));
3496 assert!(!index.file_hashes.contains_key(&file));
3497 }
3498
/// Swaps a stale tracked file for a directory of the same name, so the path still
/// exists but is no longer a regular file, and checks that the refresh reports no
/// changes and leaves both the cached entries and the stale metadata in place.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "kept_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let entries_before = index.entries.len();
    let fresh_mtime = index.file_mtimes.get(&source_path).copied().unwrap();
    let fresh_size = index.file_sizes.get(&source_path).copied().unwrap();

    // Force the cached metadata out of date, then replace the file with a directory.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = fresh_size + 1;
    set_file_metadata(&mut index, &source_path, stale_mtime, stale_size);
    fs::remove_file(&source_path).unwrap();
    fs::create_dir(&source_path).unwrap();

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed_fn,
            8,
            &mut on_progress,
        )
        .unwrap();

    // Counters, in order: changed, added, deleted.
    assert_eq!(
        (summary.changed, summary.added, summary.deleted),
        (0, 0, 0)
    );

    // The previously indexed symbol is still there.
    assert_eq!(index.entries.len(), entries_before);
    let kept = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept);

    // The metadata is exactly what the test planted, not the original values.
    let recorded_mtime = index.file_mtimes.get(&source_path);
    assert_eq!(recorded_mtime, Some(&stale_mtime));
    assert_ne!(recorded_mtime, Some(&fresh_mtime));
    assert_eq!(index.file_sizes.get(&source_path), Some(&stale_size));
}
3541
/// Refreshing with a path that was never indexed and does not exist on disk must
/// leave all counters at zero and must not record any metadata or entries for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();

    // Only the parent directory is created; the file itself is never written.
    let absent_path = project_root.join("src/missing.rs");
    fs::create_dir_all(absent_path.parent().unwrap()).unwrap();

    // The index starts empty and is rooted at `test_project_root()`; only the
    // refresh call receives the temporary directory.
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&absent_path),
            &mut embed_fn,
            8,
            &mut on_progress,
        )
        .unwrap();

    // Counters, in order: added, changed, deleted.
    assert_eq!(
        (summary.added, summary.changed, summary.deleted),
        (0, 0, 0)
    );
    assert!(!index.file_mtimes.contains_key(&absent_path));
    assert!(!index.file_sizes.contains_key(&absent_path));
    assert!(index.entries.is_empty());
}
3569
/// A file handed to `refresh_stale_files` that was not part of the initial index
/// is counted under `added` and ends up with recorded metadata and entries.
#[test]
fn refresh_reports_added_for_new_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let known_file = project_root.join("src/lib.rs");
    let new_file = project_root.join("src/new.rs");
    fs::create_dir_all(known_file.parent().unwrap()).unwrap();
    write_rust_file(&known_file, "existing_symbol");
    write_rust_file(&new_file, "added_symbol");

    // Only the first file is part of the initial index.
    let mut index = build_test_index(project_root, std::slice::from_ref(&known_file));
    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            &[known_file.clone(), new_file.clone()],
            &mut embed_fn,
            8,
            &mut on_progress,
        )
        .unwrap();

    // Counters, in order: added, changed, deleted, total_processed.
    assert_eq!(
        (
            summary.added,
            summary.changed,
            summary.deleted,
            summary.total_processed
        ),
        (1, 0, 0, 2)
    );
    assert!(index.file_mtimes.contains_key(&new_file));

    let has_new_entry = index
        .entries
        .iter()
        .any(|entry| entry.chunk.file == new_file);
    assert!(has_new_entry);
}
3600
/// An indexed file that is removed from disk and omitted from the refresh file
/// list is counted under `deleted`, and its metadata and entries are dropped.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let removed_file = project_root.join("src/deleted.rs");
    fs::create_dir_all(removed_file.parent().unwrap()).unwrap();
    write_rust_file(&removed_file, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&removed_file));
    fs::remove_file(&removed_file).unwrap();

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};

    // The current file list is empty: the project no longer contains the file.
    let summary = index
        .refresh_stale_files(project_root, &[], &mut embed_fn, 8, &mut on_progress)
        .unwrap();

    // Counters, in order: deleted, changed, added, total_processed.
    assert_eq!(
        (
            summary.deleted,
            summary.changed,
            summary.added,
            summary.total_processed
        ),
        (1, 0, 0, 1)
    );
    assert!(!index.file_mtimes.contains_key(&removed_file));
    assert!(index.entries.is_empty());
}
3625
/// A tracked file whose contents were rewritten is counted under `changed`, and
/// the index ends up holding the new symbol in place of the old one.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));

    // Plant metadata that cannot match the file, then rewrite the file's contents.
    set_file_metadata(&mut index, &source_path, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_path, "new_symbol");

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed_fn,
            8,
            &mut on_progress,
        )
        .unwrap();

    // Counters, in order: changed, added, deleted, total_processed.
    assert_eq!(
        (
            summary.changed,
            summary.added,
            summary.deleted,
            summary.total_processed
        ),
        (1, 0, 0, 1)
    );

    let entry_names: Vec<&str> = index
        .entries
        .iter()
        .map(|entry| entry.chunk.name.as_str())
        .collect();
    assert!(entry_names.contains(&"new_symbol"));
    assert!(!entry_names.contains(&"old_symbol"));
}
3663
/// Refreshing an index whose only file is untouched must be a no-op: the summary
/// reports `is_noop()`, the embedding callback is never invoked, and the entry
/// count is unchanged.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let entries_before = index.entries.len();

    // Wrap the test embedder so any call to it is observable.
    let mut embed_invoked = false;
    let mut embed_fn = |texts: Vec<String>| {
        embed_invoked = true;
        test_vector_for_texts(texts)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed_fn,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embed_invoked);
    assert_eq!(index.entries.len(), entries_before);
}
3695
/// A dlopen failure message naming the ONNX Runtime shared library must be
/// classified as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);

    assert!(unavailable);
}
3702
/// The formatted init error for a missing ONNX Runtime must lead with the install
/// hint and still carry the original error text.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let raw_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(raw_error);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
3712
/// End-to-end check of the OpenAI-compatible backend against a mock HTTP server:
/// the request must be a POST to `/v1/embeddings`, and the two embeddings in the
/// canned response must come back in order.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|start_line, route, _payload| {
        assert!(start_line.starts_with("POST "));
        assert_eq!(route, "/v1/embeddings");
        r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#
            .to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);

    // Joining the server thread surfaces any assertion failure raised inside the handler.
    server.join().unwrap();
}
3739
/// Captures the raw bytes of one embedding request on a bare TCP listener and
/// checks two things about what was sent on the wire: exactly one `Content-Type`
/// header line is present, and the request contains the configured model name.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP header names are case-insensitive, and a client may
                        // send `content-length` in lowercase. Matching only the
                        // title-case spelling would leave the length at 0 and stop
                        // reading before the body has arrived.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            // Stop once the header block and the announced body are fully buffered.
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;

        // Reply with a minimal, valid single-embedding response so the client call succeeds.
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3832
/// End-to-end check of the Ollama backend against a mock HTTP server: the request
/// must be a POST to `/api/embed`, and the two embeddings in the canned response
/// must come back in order.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|start_line, route, _payload| {
        assert!(start_line.starts_with("POST "));
        assert_eq!(route, "/api/embed");
        r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#.to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: "embeddinggemma".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);

    // Joining the server thread surfaces any assertion failure raised inside the handler.
    server.join().unwrap();
}
3859
/// Writes an index with one fingerprint to disk, then checks that
/// `read_from_disk` returns it when given the same fingerprint string and returns
/// `None` when given a different one.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";

    let project_root = test_project_root();
    let source_path = project_root.join("src/main.rs");
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);

    // A single three-dimensional entry is enough to give the cache some content.
    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);

    let written_fingerprint = SemanticIndexFingerprint {
        backend: "openai_compatible".to_string(),
        model: "test-embedding".to_string(),
        base_url: "http://127.0.0.1:1234/v1".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(written_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint as the one written: the cache loads.
    let same_key = index.fingerprint().unwrap().as_string();
    let accepted = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&same_key),
    );
    assert!(accepted.is_some());

    // Different backend, model and base URL: the cache is refused.
    let other_key = SemanticIndexFingerprint {
        backend: "ollama".to_string(),
        model: "embeddinggemma".to_string(),
        base_url: "http://127.0.0.1:11434".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    }
    .as_string();
    let rejected = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&other_key),
    );
    assert!(rejected.is_none());
}
3922
/// Serialises an index, overwrites its leading byte with the V3 version marker,
/// and checks that `read_from_disk` both refuses the file and removes it from the
/// cache directory.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let indexed_path = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: indexed_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(indexed_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(indexed_path, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "test".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Stamp the serialised index with the V3 version marker before writing it out.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, serialized).unwrap();

    let reloaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    );
    assert!(reloaded.is_none());

    // The rejected file must no longer exist on disk.
    assert!(!cache_file.exists());
}
3972
3973 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3974 crate::symbols::Symbol {
3975 name: name.to_string(),
3976 kind,
3977 range: crate::symbols::Range {
3978 start_line: start,
3979 start_col: 0,
3980 end_line: end,
3981 end_col: 0,
3982 },
3983 signature: None,
3984 scope_chain: Vec::new(),
3985 exported: false,
3986 parent: None,
3987 }
3988 }
3989
/// A file whose only symbols are `Heading`s must produce no chunks at all.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let readme = project_root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let headings = vec![
        make_symbol(SymbolKind::Heading, "Title", 0, 2),
        make_symbol(SymbolKind::Heading, "Section", 4, 6),
    ];

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &project_root);
    let chunk_count = chunks.len();
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunk_count
    );
}
4012
/// A symbol carrying a very long signature (2000 repetitions of `"kubectl "`)
/// must still yield embed text of at most `MAX_EMBED_TEXT_CHARS` characters.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let manifest = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some("kubectl ".repeat(2000));

    let text = build_embed_text(&symbol, source, &manifest, &project_root);

    // The limit is measured in characters, not bytes.
    let char_count = text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
4037
/// With one `Heading` mixed in among a `Function` and a `Struct`, the result must
/// contain exactly three chunks: a file summary plus the two code symbols, and
/// never the heading.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let source_file = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n true\n}\n";

    let symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&source_file, &symbols, source, &project_root);
    let chunk_count = chunks.len();
    assert_eq!(
        chunk_count,
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunk_count
    );

    let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);

    for expected in ["handle_request", "AuthService"] {
        assert!(names.contains(&expected));
    }
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
4072
/// Base URLs using `localhost`-style hostnames must pass the SSRF validation.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    let loopback_hosts = [
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ];

    for host in loopback_hosts.iter() {
        let accepted = validate_base_url_no_ssrf(host).is_ok();
        assert!(
            accepted,
            "Expected {host} to be allowed (loopback), got: {:?}",
            validate_base_url_no_ssrf(host)
        );
    }
}
4091
/// Base URLs pointing at addresses in `127.0.0.0/8`, with or without a port, must
/// pass the SSRF validation.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
4110
/// Base URLs pointing at private, link-local, or shared-address-space IPs that
/// are not loopback must fail the SSRF validation.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let blocked_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in blocked_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    }
}
4132
/// Base URLs whose hostname ends in `.local` must fail the SSRF validation.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_hosts.iter() {
        let outcome = validate_base_url_no_ssrf(host);
        assert!(
            outcome.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            outcome
        );
    }
}
4150
/// `normalize_base_url` must accept both a loopback IP and a `localhost` URL.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    for url in ["http://127.0.0.1:9999", "http://localhost:8080"] {
        assert!(normalize_base_url(url).is_ok());
    }
}
4158
/// The ONNX Runtime version-mismatch message must name the detected version, the
/// library path, and the required version, and must list the auto-fix option
/// before the manual "remove the old library" option.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    // The facts of the mismatch are all reported.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(msg.contains(system_lib), "should report system path: {msg}");
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering: the automatic fix is offered ahead of the manual removal.
    let auto_fix_at = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_remove_at = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_at < manual_remove_at,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    // The exact command is spelled out.
    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
4199
/// For a macOS `.dylib` path, the mismatch message must report the version and
/// the path, and must include the path wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let system_lib = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains(system_lib));

    let quoted_path = format!("'{system_lib}'");
    assert!(
        msg.contains(quoted_path.as_str()),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
4216}