1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded on an index that is built from zero chunks.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): not referenced in this part of the file; presumably an upper
// bound on the number of entries in a persisted index — confirm at the use site.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): presumably the on-disk header sizes of the V1 and V2+ index
// formats — confirm against the (de)serialization code.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint that `format_embedding_init_error` prepends when the
/// failure looks like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
 "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
 apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
 AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags of the semantic index format.
// NOTE(review): what changed between versions is not visible here — see the
// reader/writer code before relying on any of them.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended after the `/v1` segment of an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is 0.
/// Despite the name it is applied to the shared client for every backend.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept by `embed_query_cached`.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored in a fingerprint's `base_url` when the config has no
/// base URL or it fails normalization.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before the retry that follows attempt `i` (0-based).
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it waits for
/// the file lock, so callers in this process take the file lock one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
/// Handle returned by `SemanticIndexLock::acquire`; it owns the
/// `fs_lock::LockGuard` obtained for a project's `cache.lock` file.
pub struct SemanticIndexLock {
    // Never read; kept only so the guard lives as long as this value.
    // Presumably dropping it releases the file lock — confirm in `fs_lock`.
    _guard: fs_lock::LockGuard,
}
66
67impl SemanticIndexLock {
68 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69 let dir = storage_dir.join("semantic").join(project_key);
70 fs::create_dir_all(&dir)?;
71 let path = dir.join("cache.lock");
72 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73 .lock()
74 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75 fs_lock::try_acquire(&path, Duration::from_secs(2))
76 .map(|guard| Self { _guard: guard })
77 .map_err(|error| match error {
78 fs_lock::AcquireError::Timeout => {
79 std::io::Error::other("timed out acquiring semantic cache lock")
80 }
81 fs_lock::AcquireError::Io(error) => error,
82 })
83 }
84}
85
/// Describes the embedding setup an index was built with. `as_string`
/// serializes it to JSON and `matches_expected` compares that JSON text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name from the config.
    pub model: String,
    /// Normalized base URL, or `FALLBACK_BACKEND` when the config has none or
    /// it fails normalization. Deserializes to an empty string when absent.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; `default_chunking_version()` is used when the
    /// field is absent from the serialized form.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
96
/// Chunking version stamped on new fingerprints and used as the serde default
/// for `SemanticIndexFingerprint::chunking_version`.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103 let base_url = config
106 .base_url
107 .as_ref()
108 .and_then(|u| normalize_base_url(u).ok())
109 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110 Self {
111 backend: config.backend.as_str().to_string(),
112 model: config.model.clone(),
113 base_url,
114 dimension,
115 chunking_version: default_chunking_version(),
116 }
117 }
118
119 pub fn as_string(&self) -> String {
120 serde_json::to_string(self).unwrap_or_else(|_| String::new())
121 }
122
123 fn matches_expected(&self, expected: &str) -> bool {
124 let encoded = self.as_string();
125 !encoded.is_empty() && encoded == expected
126 }
127}
128
/// Backend-specific state used by `SemanticEmbeddingModel` to produce
/// embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedder; built for the `Fastembed` backend.
    Local(LocalEmbedder),
    /// HTTP backend that POSTs to `<base_url>/v1/embeddings` (the `/v1`
    /// segment is added only when the base URL does not already end with it).
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL.
        base_url: String,
        /// Sent as an `Authorization: Bearer …` header when present.
        api_key: Option<String>,
    },
    /// HTTP backend that POSTs to the Ollama embed endpoint under `base_url`.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL.
        base_url: String,
    },
}
145
/// Embedding model built from a `SemanticBackendConfig`, with a small cache of
/// query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend selected in the config.
    backend: SemanticBackend,
    /// Model name as given in the config.
    model: String,
    /// Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (default applied when the config says 0).
    timeout_ms: u64,
    /// Effective maximum batch size (default applied when the config says 0).
    max_batch_size: usize,
    /// Embedding dimension once known; filled by `dimension()` and by the
    /// remote branches of `embed_texts`.
    dimension: Option<usize>,
    /// Backend-specific engine that performs the embedding.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, bounded by `QUERY_EMBEDDING_CACHE_CAP`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front entry is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to embed.
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163 vectors: &[Vec<f32>],
164 expected_count: usize,
165 context: &str,
166) -> Result<(), String> {
167 if expected_count > 0 && vectors.is_empty() {
168 return Err(format!(
169 "{context} returned no vectors for {expected_count} inputs"
170 ));
171 }
172
173 if vectors.len() != expected_count {
174 return Err(format!(
175 "{context} returned {} vectors for {} inputs",
176 vectors.len(),
177 expected_count
178 ));
179 }
180
181 let Some(first_vector) = vectors.first() else {
182 return Ok(());
183 };
184 let expected_dimension = first_vector.len();
185 validate_embedding_dimension(expected_dimension)
186 .map_err(|error| format!("{context} returned {error}"))?;
187 for (index, vector) in vectors.iter().enumerate() {
188 if vector.len() != expected_dimension {
189 return Err(format!(
190 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191 vector.len()
192 ));
193 }
194 }
195
196 Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200 if dimension == 0 || dimension > MAX_DIMENSION {
201 return Err(format!(
202 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203 ));
204 }
205
206 Ok(())
207}
208
209fn normalize_base_url(raw: &str) -> Result<String, String> {
213 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214 let scheme = parsed.scheme();
215 if scheme != "http" && scheme != "https" {
216 return Err(format!(
217 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218 scheme
219 ));
220 }
221 Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239 use std::net::{IpAddr, ToSocketAddrs};
240
241 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243 let host = parsed.host_str().unwrap_or("");
244
245 let is_loopback_host =
250 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251 if is_loopback_host {
252 return Ok(());
253 }
254
255 if host.ends_with(".local") {
258 return Err(format!(
259 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260 ));
261 }
262
263 let port = parsed.port_or_known_default().unwrap_or(443);
266 let addr_str = format!("{host}:{port}");
267 let addrs: Vec<IpAddr> = addr_str
268 .to_socket_addrs()
269 .map(|iter| iter.map(|sa| sa.ip()).collect())
270 .unwrap_or_default();
271 for ip in &addrs {
272 if is_private_non_loopback_ip(ip) {
273 return Err(format!(
274 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275 ));
276 }
277 }
278
279 Ok(())
280}
281
/// Reports whether `ip` lies in one of the blocked non-loopback ranges.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `100.64/10`, `0/8`.
/// IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses (`::ffff:a.b.c.d`)
/// whose embedded IPv4 address is itself blocked.
/// Loopback addresses (`127.0.0.0/8`, `::1`) are not blocked.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => match v4.octets() {
            [10, ..] | [0, ..] => true,
            [192, 168, ..] | [169, 254, ..] => true,
            [172, second, ..] => (16..=31).contains(&second),
            [100, second, ..] => (64..=127).contains(&second),
            _ => false,
        },
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            if (leading & 0xffc0) == 0xfe80 || (leading & 0xfe00) == 0xfc00 {
                return true;
            }
            // `::ffff:a.b.c.d` is judged by the IPv4 address it carries.
            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325 if base_url.ends_with("/v1") {
326 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327 } else {
328 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329 }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333 if base_url.ends_with("/api") {
334 format!("{base_url}/embed")
335 } else {
336 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337 }
338}
339
/// Trims surrounding whitespace from an optional value and maps blank or
/// missing input to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    let trimmed = raw.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
/// A transport error is retried only when reqwest classifies it as a
/// connection error; every other kind is reported to the caller immediately.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
358
359fn sleep_before_embedding_retry(attempt_index: usize) {
360 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
361 std::thread::sleep(Duration::from_millis(*delay_ms));
362 }
363}
364
365fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
366where
367 F: FnMut() -> reqwest::blocking::RequestBuilder,
368{
369 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
370 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
371
372 let response = match make_request().send() {
373 Ok(response) => response,
374 Err(error) => {
375 if !last_attempt && is_retryable_embedding_error(&error) {
376 sleep_before_embedding_retry(attempt_index);
377 continue;
378 }
379 return Err(format!("{backend_label} request failed: {error}"));
380 }
381 };
382
383 let status = response.status();
384 let raw = match response.text() {
385 Ok(raw) => raw,
386 Err(error) => {
387 if !last_attempt && is_retryable_embedding_error(&error) {
388 sleep_before_embedding_retry(attempt_index);
389 continue;
390 }
391 return Err(format!("{backend_label} response read failed: {error}"));
392 }
393 };
394
395 if status.is_success() {
396 return Ok(raw);
397 }
398
399 if !last_attempt && is_retryable_embedding_status(status) {
400 sleep_before_embedding_retry(attempt_index);
401 continue;
402 }
403
404 return Err(format!(
405 "{backend_label} request failed (HTTP {}): {}",
406 status, raw
407 ));
408 }
409
410 unreachable!("embedding request retries exhausted without returning")
411}
412
413impl SemanticEmbeddingModel {
414 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
415 let timeout_ms = if config.timeout_ms == 0 {
416 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
417 } else {
418 config.timeout_ms
419 };
420
421 let max_batch_size = if config.max_batch_size == 0 {
422 DEFAULT_MAX_BATCH_SIZE
423 } else {
424 config.max_batch_size
425 };
426
427 let api_key_env = normalize_api_key(config.api_key_env.clone());
428 let model = config.model.clone();
429
430 let client = Client::builder()
431 .timeout(Duration::from_millis(timeout_ms))
432 .redirect(reqwest::redirect::Policy::none())
433 .build()
434 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
435
436 let engine = match config.backend {
437 SemanticBackend::Fastembed => {
438 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
439 }
440 SemanticBackend::OpenAiCompatible => {
441 let raw = config.base_url.as_ref().ok_or_else(|| {
442 "base_url is required for openai_compatible backend".to_string()
443 })?;
444 let base_url = normalize_base_url(raw)?;
445
446 let api_key = match api_key_env {
447 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
448 format!("missing api_key_env '{var_name}' for openai_compatible backend")
449 })?),
450 None => None,
451 };
452
453 SemanticEmbeddingEngine::OpenAiCompatible {
454 client,
455 model,
456 base_url,
457 api_key,
458 }
459 }
460 SemanticBackend::Ollama => {
461 let raw = config
462 .base_url
463 .as_ref()
464 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
465 let base_url = normalize_base_url(raw)?;
466
467 SemanticEmbeddingEngine::Ollama {
468 client,
469 model,
470 base_url,
471 }
472 }
473 };
474
475 Ok(Self {
476 backend: config.backend,
477 model: config.model.clone(),
478 base_url: config.base_url.clone(),
479 timeout_ms,
480 max_batch_size,
481 dimension: None,
482 engine,
483 query_embedding_cache: HashMap::new(),
484 query_embedding_cache_order: VecDeque::new(),
485 query_embedding_cache_hits: 0,
486 query_embedding_cache_misses: 0,
487 })
488 }
489
    /// Backend this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
493
    /// Model name taken from the config.
    pub fn model(&self) -> &str {
        &self.model
    }
497
    /// Base URL exactly as given in the config (not normalized), if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
501
    /// Effective maximum number of texts per embedding batch.
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
505
    /// Effective HTTP timeout in milliseconds.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
509
510 pub fn fingerprint(
511 &mut self,
512 config: &SemanticBackendConfig,
513 ) -> Result<SemanticIndexFingerprint, String> {
514 let dimension = self.dimension()?;
515 Ok(SemanticIndexFingerprint::from_config(config, dimension))
516 }
517
518 pub fn dimension(&mut self) -> Result<usize, String> {
519 if let Some(dimension) = self.dimension {
520 return Ok(dimension);
521 }
522
523 let dimension = match &mut self.engine {
524 SemanticEmbeddingEngine::Local(model) => {
525 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
526 vectors
527 .first()
528 .map(|v| v.len())
529 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530 }
531 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
532 let vectors =
533 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
534 vectors
535 .first()
536 .map(|v| v.len())
537 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
538 }
539 SemanticEmbeddingEngine::Ollama { .. } => {
540 let vectors =
541 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
542 vectors
543 .first()
544 .map(|v| v.len())
545 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
546 }
547 };
548
549 self.dimension = Some(dimension);
550 Ok(dimension)
551 }
552
    /// Embeds `texts` with the configured backend, returning one vector per
    /// input. Public entry point; delegates to `embed_texts`.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
556
557 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
558 if let Some(vector) = self.query_embedding_cache.get(query) {
559 self.query_embedding_cache_hits += 1;
560 return Ok(vector.clone());
561 }
562
563 self.query_embedding_cache_misses += 1;
564 let embeddings = self.embed_texts(vec![query.to_string()])?;
565 let vector = embeddings
566 .first()
567 .cloned()
568 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
569
570 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
571 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
572 self.query_embedding_cache.remove(&oldest);
573 }
574 }
575 self.query_embedding_cache
576 .insert(query.to_string(), vector.clone());
577 self.query_embedding_cache_order
578 .push_back(query.to_string());
579
580 Ok(vector)
581 }
582
    /// Query-cache statistics as `(hits, misses, entries currently cached)`.
    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
        (
            self.query_embedding_cache_hits,
            self.query_embedding_cache_misses,
            self.query_embedding_cache.len(),
        )
    }
590
591 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
592 match &mut self.engine {
593 SemanticEmbeddingEngine::Local(model) => model
594 .embed(&texts)
595 .map_err(|error| format!("failed to embed batch: {error}")),
596 SemanticEmbeddingEngine::OpenAiCompatible {
597 client,
598 model,
599 base_url,
600 api_key,
601 } => {
602 let expected_text_count = texts.len();
603 let endpoint = build_openai_embeddings_endpoint(base_url);
604 let body = serde_json::json!({
605 "input": texts,
606 "model": model,
607 });
608
609 let raw = send_embedding_request(
610 || {
611 let mut request = client.post(&endpoint).json(&body);
621
622 if let Some(api_key) = api_key {
623 request = request.header("Authorization", format!("Bearer {api_key}"));
624 }
625
626 request
627 },
628 "openai compatible",
629 )?;
630
631 #[derive(Deserialize)]
632 struct OpenAiResponse {
633 data: Vec<OpenAiEmbeddingResult>,
634 }
635
636 #[derive(Deserialize)]
637 struct OpenAiEmbeddingResult {
638 embedding: Vec<f32>,
639 index: Option<u32>,
640 }
641
642 let parsed: OpenAiResponse = serde_json::from_str(&raw)
643 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
644 if parsed.data.len() != expected_text_count {
645 return Err(format!(
646 "openai compatible response returned {} embeddings for {} inputs",
647 parsed.data.len(),
648 expected_text_count
649 ));
650 }
651
652 let mut vectors = vec![Vec::new(); parsed.data.len()];
653 for (i, item) in parsed.data.into_iter().enumerate() {
654 let index = item.index.unwrap_or(i as u32) as usize;
655 if index >= vectors.len() {
656 return Err(
657 "openai compatible response contains invalid vector index".to_string()
658 );
659 }
660 vectors[index] = item.embedding;
661 }
662
663 for vector in &vectors {
664 if vector.is_empty() {
665 return Err(
666 "openai compatible response contained missing vectors".to_string()
667 );
668 }
669 }
670
671 self.dimension = vectors.first().map(Vec::len);
672 Ok(vectors)
673 }
674 SemanticEmbeddingEngine::Ollama {
675 client,
676 model,
677 base_url,
678 } => {
679 let expected_text_count = texts.len();
680 let endpoint = build_ollama_embeddings_endpoint(base_url);
681
682 #[derive(Serialize)]
683 struct OllamaPayload<'a> {
684 model: &'a str,
685 input: Vec<String>,
686 }
687
688 let payload = OllamaPayload {
689 model,
690 input: texts,
691 };
692
693 let raw = send_embedding_request(
694 || {
695 client.post(&endpoint).json(&payload)
700 },
701 "ollama",
702 )?;
703
704 #[derive(Deserialize)]
705 struct OllamaResponse {
706 embeddings: Vec<Vec<f32>>,
707 }
708
709 let parsed: OllamaResponse = serde_json::from_str(&raw)
710 .map_err(|error| format!("invalid ollama response: {error}"))?;
711 if parsed.embeddings.is_empty() {
712 return Err("ollama response returned no embeddings".to_string());
713 }
714 if parsed.embeddings.len() != expected_text_count {
715 return Err(format!(
716 "ollama response returned {} embeddings for {} inputs",
717 parsed.embeddings.len(),
718 expected_text_count
719 ));
720 }
721
722 let vectors = parsed.embeddings;
723 for vector in &vectors {
724 if vector.is_empty() {
725 return Err("ollama response contained empty embeddings".to_string());
726 }
727 }
728
729 self.dimension = vectors.first().map(Vec::len);
730 Ok(vectors)
731 }
732 }
733 }
734}
735
/// Checks that an ONNX Runtime shared library can be loaded and, where its
/// version can be determined, that the version is acceptable.
///
/// The library path comes from the `ORT_DYLIB_PATH` environment variable,
/// falling back to the platform's default library name. A detected version is
/// accepted only when its major component is 1 and its minor component is at
/// least 20. When no version can be determined the library is accepted.
///
/// On targets other than Linux, macOS and Windows no check is performed and
/// `Ok(())` is returned.
///
/// # Errors
/// Returns a user-facing message when the library cannot be loaded or its
/// detected version is rejected.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated C string that stays alive
        // across the `dlopen` call. `handle` is null-checked before use and
        // closed exactly once. The `dlerror` pointer is null-checked before
        // being read as a C string.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
 Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version is inferred from file names, not from the loaded
            // handle, so the handle can be closed right after.
            let detected_version = detect_ort_version_from_path(lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // Only major and minor are checked; a version whose first two
                // components do not parse as numbers is not rejected.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Hand-written bindings for the Win32 functions used below.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Only `dw_file_version_ms` is read below.
        // NOTE(review): field order is assumed to match the Win32
        // `VS_FIXEDFILEINFO` layout — confirm against the Windows SDK.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: `wide` is a NUL-terminated UTF-16 buffer that outlives the
        // load call. `handle` is null-checked and freed exactly once. The
        // buffers passed to the version APIs are sized from the values those
        // APIs report, and `vs_info` is null-checked before it is read.
        // NOTE(review): `vs_len` is not compared with
        // `size_of::<VS_FIXEDFILEINFO>()` before the read — confirm that is
        // acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
 Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 means "version unknown" and skips the check.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Receives the path of the loaded module, which is then used to
            // query the file's version resource.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" as a NUL-terminated UTF-16 string: the root
                        // block of the version resource.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High 16 bits hold the major version, low 16 bits the minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
923
924#[cfg(any(target_os = "linux", target_os = "macos"))]
927fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
928 let path = std::path::Path::new(lib_path);
929
930 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
932 .into_iter()
933 .flatten()
934 {
935 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
936 if let Some(version) = extract_version_from_filename(name) {
937 return Some(version);
938 }
939 }
940 }
941
942 if let Some(parent) = path.parent() {
944 if let Ok(entries) = std::fs::read_dir(parent) {
945 for entry in entries.flatten() {
946 if let Some(name) = entry.file_name().to_str() {
947 if name.starts_with("libonnxruntime") {
948 if let Some(version) = extract_version_from_filename(name) {
949 return Some(version);
950 }
951 }
952 }
953 }
954 }
955 }
956
957 None
958}
959
960#[cfg(any(target_os = "linux", target_os = "macos"))]
962fn extract_version_from_filename(name: &str) -> Option<String> {
963 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
965 re.find(name).map(|m| m.as_str().to_string())
966}
967
/// Suggests a shell command for removing the library at `lib_path`.
///
/// On Linux and macOS, a path under `/usr/local/lib` or a bare default
/// library name yields a `sudo rm` over `/usr/local/lib/libonnxruntime*`
/// (plus `ldconfig` on Linux). Everything else yields a plain `rm` of the
/// given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }

    format!(" rm '{lib_path}'")
}
980
981pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
987 format!(
988 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
989 Solutions:\n\
990 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
991 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
992 configures the bridge to load it instead of the system library — no \
993 changes to '{}'.\n\
994 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
995 {}\n\
996 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
997 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
998 version,
999 lib_name,
1000 lib_name,
1001 suggest_removal_command(lib_name),
1002 )
1003}
1004
/// Heuristically decides whether an error message means the ONNX Runtime
/// shared library could not be found or loaded.
///
/// A message matches when, after leading whitespace, it starts with
/// `"ONNX Runtime not found."`, or when, case-insensitively, it both names the
/// runtime and contains a phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
1030
1031pub fn format_embedding_init_error(error: impl Display) -> String {
1032 let message = error.to_string();
1033
1034 if is_onnx_runtime_unavailable(&message) {
1035 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1036 }
1037
1038 format!("failed to initialize semantic embedding model: {message}")
1039}
1040
/// One embeddable unit collected from a source file.
// NOTE(review): field meanings other than `embed_text` are inferred from
// names and types only — confirm against the chunk-collection code.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk was collected from.
    pub file: PathBuf,
    /// Presumably the symbol's name.
    pub name: String,
    /// Kind of the symbol.
    pub kind: SymbolKind,
    /// First line of the chunk (whether 0- or 1-based is not shown here).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Presumably whether the symbol is exported from its module.
    pub exported: bool,
    /// Text handed to the embedding function when the index is built.
    pub embed_text: String,
    /// Presumably a short excerpt shown with search results.
    pub snippet: String,
}
1060
/// A chunk paired with the embedding vector computed for it.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk that was embedded.
    chunk: SemanticChunk,
    /// Embedding returned for the chunk's `embed_text`.
    vector: Vec<f32>,
}
1067
/// In-memory semantic index: embedded chunks plus per-file bookkeeping.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// Embedded chunks.
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded for each indexed file.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size recorded for each indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash (BLAKE3) recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension of the index.
    dimension: usize,
    /// Fingerprint of the embedding setup; `None` on a freshly built index.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Root of the indexed project (debug-asserted to be absolute on construction).
    project_root: PathBuf,
    // NOTE(review): not used in this part of the file; presumably files whose
    // (re)indexing was postponed — confirm at the use site.
    deferred_files: HashSet<PathBuf>,
}
1083
/// Per-file metadata gathered alongside a file's chunks; it feeds the
/// `file_mtimes`, `file_sizes` and `file_hashes` maps of `SemanticIndex`.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// Size of the file.
    size: u64,
    /// BLAKE3 hash of the file's content.
    content_hash: blake3::Hash,
}
1090
/// Counters describing the outcome of an index refresh.
// NOTE(review): the exact meaning of each counter is set by the refresh code,
// which is not visible here; the field docs follow the names.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Files counted as changed.
    pub changed: usize,
    /// Files counted as added.
    pub added: usize,
    /// Files counted as deleted.
    pub deleted: usize,
    /// Total files processed; not consulted by `is_noop`.
    pub total_processed: usize,
}
1100
1101impl RefreshSummary {
1102 pub fn is_noop(&self) -> bool {
1104 self.changed == 0 && self.added == 0 && self.deleted == 0
1105 }
1106}
1107
/// Result of refreshing a set of invalidated files.
// NOTE(review): the producer and consumer of this type are not visible here;
// the field docs are inferred from names and types — confirm at the use site.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Newly embedded entries, presumably to be merged into the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness metadata per refreshed path.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose refresh completed.
    pub completed_paths: Vec<PathBuf>,
    /// Counters for the refresh.
    pub summary: RefreshSummary,
}
1115
/// One search hit. Apart from `score` and `source`, it carries the same
/// descriptive fields as `SemanticChunk`.
// NOTE(review): the search code is not visible here; the range and ordering
// of `score` and the possible values of `source` should be confirmed there.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the hit.
    pub file: PathBuf,
    /// Presumably the symbol's name.
    pub name: String,
    /// Kind of the symbol.
    pub kind: SymbolKind,
    /// First line of the hit.
    pub start_line: u32,
    /// Last line of the hit.
    pub end_line: u32,
    /// Presumably whether the symbol is exported.
    pub exported: bool,
    /// Presumably a short excerpt for display.
    pub snippet: String,
    /// Relevance score assigned by the search.
    pub score: f32,
    /// Static label naming where the result came from.
    pub source: &'static str,
}
1129
1130impl SemanticIndex {
    /// Creates an empty index for `project_root` with the given embedding
    /// `dimension`, no fingerprint and no file bookkeeping.
    ///
    /// `project_root` is expected to be absolute (checked in debug builds only).
    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
        debug_assert!(project_root.is_absolute());
        Self {
            entries: Vec::new(),
            file_mtimes: HashMap::new(),
            file_sizes: HashMap::new(),
            file_hashes: HashMap::new(),
            dimension,
            fingerprint: None,
            project_root,
            deferred_files: HashSet::new(),
        }
    }
1144
    /// Number of embedded entries in the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1149
    /// Number of files with recorded metadata, counted from `file_mtimes`.
    /// Files that produced no entries are included.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1154
1155 pub fn status_label(&self) -> &'static str {
1157 if self.entries.is_empty() {
1158 "empty"
1159 } else {
1160 "ready"
1161 }
1162 }
1163
1164 fn collect_chunks(
1165 project_root: &Path,
1166 files: &[PathBuf],
1167 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1168 let collect_started = std::time::Instant::now();
1169 let per_file: Vec<(
1170 PathBuf,
1171 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1172 )> = files
1173 .par_iter()
1174 .map_init(HashMap::new, |parsers, file| {
1175 let result = collect_file_metadata(file).and_then(|metadata| {
1176 collect_file_chunks(project_root, file, parsers)
1177 .map(|chunks| (metadata, chunks))
1178 });
1179 (file.clone(), result)
1180 })
1181 .collect();
1182
1183 let mut chunks: Vec<SemanticChunk> = Vec::new();
1184 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1185
1186 for (file, result) in per_file {
1187 match result {
1188 Ok((metadata, file_chunks)) => {
1189 file_metadata.insert(file, metadata);
1190 chunks.extend(file_chunks);
1191 }
1192 Err(error) => {
1193 if error == "unsupported file extension" {
1199 continue;
1200 }
1201 slog_warn!(
1202 "failed to collect semantic chunks for {}: {}",
1203 file.display(),
1204 error
1205 );
1206 }
1207 }
1208 }
1209
1210 slog_info!(
1211 "semantic collect: {} chunks from {} files in {} ms",
1212 chunks.len(),
1213 file_metadata.len(),
1214 collect_started.elapsed().as_millis()
1215 );
1216
1217 (chunks, file_metadata)
1218 }
1219
1220 fn build_from_chunks<F, P>(
1221 project_root: &Path,
1222 chunks: Vec<SemanticChunk>,
1223 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1224 embed_fn: &mut F,
1225 max_batch_size: usize,
1226 mut progress: Option<&mut P>,
1227 ) -> Result<Self, String>
1228 where
1229 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1230 P: FnMut(usize, usize),
1231 {
1232 debug_assert!(project_root.is_absolute());
1233 let total_chunks = chunks.len();
1234
1235 if chunks.is_empty() {
1236 return Ok(Self {
1237 entries: Vec::new(),
1238 file_mtimes: file_metadata
1239 .iter()
1240 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1241 .collect(),
1242 file_sizes: file_metadata
1243 .iter()
1244 .map(|(path, metadata)| (path.clone(), metadata.size))
1245 .collect(),
1246 file_hashes: file_metadata
1247 .into_iter()
1248 .map(|(path, metadata)| (path, metadata.content_hash))
1249 .collect(),
1250 dimension: DEFAULT_DIMENSION,
1251 fingerprint: None,
1252 project_root: project_root.to_path_buf(),
1253 deferred_files: HashSet::new(),
1254 });
1255 }
1256
1257 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1259 let mut expected_dimension: Option<usize> = None;
1260 let batch_size = max_batch_size.max(1);
1261 let embed_started = std::time::Instant::now();
1262 let batch_count = total_chunks.div_ceil(batch_size);
1263 for batch_start in (0..chunks.len()).step_by(batch_size) {
1264 let batch_end = (batch_start + batch_size).min(chunks.len());
1265 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1266 .iter()
1267 .map(|c| c.embed_text.clone())
1268 .collect();
1269
1270 let vectors = embed_fn(batch_texts)?;
1271 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1272
1273 if let Some(dim) = vectors.first().map(|v| v.len()) {
1275 match expected_dimension {
1276 None => expected_dimension = Some(dim),
1277 Some(expected) if dim != expected => {
1278 return Err(format!(
1279 "embedding dimension changed across batches: expected {expected}, got {dim}"
1280 ));
1281 }
1282 _ => {}
1283 }
1284 }
1285
1286 for (i, vector) in vectors.into_iter().enumerate() {
1287 let chunk_idx = batch_start + i;
1288 entries.push(EmbeddingEntry {
1289 chunk: chunks[chunk_idx].clone(),
1290 vector,
1291 });
1292 }
1293
1294 if let Some(callback) = progress.as_mut() {
1295 callback(entries.len(), total_chunks);
1296 }
1297 }
1298
1299 let embed_ms = embed_started.elapsed().as_millis();
1300 let rate = (total_chunks as u128 * 1000)
1301 .checked_div(embed_ms)
1302 .unwrap_or(0) as u64;
1303 slog_info!(
1304 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1305 total_chunks,
1306 batch_count,
1307 embed_ms,
1308 rate
1309 );
1310
1311 let dimension = entries
1312 .first()
1313 .map(|e| e.vector.len())
1314 .unwrap_or(DEFAULT_DIMENSION);
1315
1316 Ok(Self {
1317 entries,
1318 file_mtimes: file_metadata
1319 .iter()
1320 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1321 .collect(),
1322 file_sizes: file_metadata
1323 .iter()
1324 .map(|(path, metadata)| (path.clone(), metadata.size))
1325 .collect(),
1326 file_hashes: file_metadata
1327 .into_iter()
1328 .map(|(path, metadata)| (path, metadata.content_hash))
1329 .collect(),
1330 dimension,
1331 fingerprint: None,
1332 project_root: project_root.to_path_buf(),
1333 deferred_files: HashSet::new(),
1334 })
1335 }
1336
1337 pub fn build<F>(
1340 project_root: &Path,
1341 files: &[PathBuf],
1342 embed_fn: &mut F,
1343 max_batch_size: usize,
1344 ) -> Result<Self, String>
1345 where
1346 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1347 {
1348 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1349 Self::build_from_chunks(
1350 project_root,
1351 chunks,
1352 file_mtimes,
1353 embed_fn,
1354 max_batch_size,
1355 Option::<&mut fn(usize, usize)>::None,
1356 )
1357 }
1358
1359 pub fn build_with_progress<F, P>(
1361 project_root: &Path,
1362 files: &[PathBuf],
1363 embed_fn: &mut F,
1364 max_batch_size: usize,
1365 progress: &mut P,
1366 ) -> Result<Self, String>
1367 where
1368 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1369 P: FnMut(usize, usize),
1370 {
1371 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1372 let total_chunks = chunks.len();
1373 progress(0, total_chunks);
1374 Self::build_from_chunks(
1375 project_root,
1376 chunks,
1377 file_mtimes,
1378 embed_fn,
1379 max_batch_size,
1380 Some(progress),
1381 )
1382 }
1383
/// Incrementally brings the index in line with `current_files`.
///
/// Indexed files missing from `current_files` are removed, indexed files whose cached
/// freshness no longer verifies are re-embedded, and files in `current_files` that are not
/// indexed yet are embedded for the first time. `progress` is called with `(0, 0)` when
/// nothing needs embedding, otherwise with `(0, total_chunks)` followed by
/// `(embedded_so_far, total_chunks)` after each batch.
///
/// The returned summary counts only files for which fresh metadata was collected as
/// `changed`/`added`; `total_processed` is the number of distinct paths across
/// `current_files` and the previously indexed files.
///
/// # Errors
///
/// Returns the error from `embed_fn` or `validate_embedding_batch`, or an error when a new
/// vector's length differs from the dimension already in use. Removals of deleted and
/// vanished files performed before the failure are not rolled back.
pub fn refresh_stale_files<F, P>(
    &mut self,
    project_root: &Path,
    current_files: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    progress: &mut P,
) -> Result<RefreshSummary, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
    // Deferred files that are no longer part of the current file set are forgotten.
    self.deferred_files
        .retain(|path| current_set.contains(path.as_path()));
    // Size of the union of current and indexed paths: |current| + |indexed| - |both|.
    let total_processed = current_set.len() + self.file_mtimes.len()
        - self
            .file_mtimes
            .keys()
            .filter(|path| current_set.contains(path.as_path()))
            .count();

    // Classify every indexed path as deleted, changed, or still fresh.
    let mut deleted: Vec<PathBuf> = Vec::new();
    let mut changed: Vec<PathBuf> = Vec::new();
    let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
    for indexed_path in &indexed_paths {
        if !current_set.contains(indexed_path.as_path()) {
            deleted.push(indexed_path.clone());
            continue;
        }
        // A cached freshness record exists only when mtime, size and hash are all known.
        let cached = match (
            self.file_mtimes.get(indexed_path),
            self.file_sizes.get(indexed_path),
            self.file_hashes.get(indexed_path),
        ) {
            (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                mtime: *mtime,
                size: *size,
                content_hash: *hash,
            }),
            _ => None,
        };
        match cached
            .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
        {
            Some(FreshnessVerdict::HotFresh) => {}
            // Content is unchanged: keep the entries and only record the new mtime/size.
            Some(FreshnessVerdict::ContentFresh {
                new_mtime,
                new_size,
            }) => {
                self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                self.file_sizes.insert(indexed_path.clone(), new_size);
            }
            // Stale, reported deleted, or missing a complete cached record: re-embed.
            Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                changed.push(indexed_path.clone());
            }
        }
    }

    // Current files without an mtime record have never been indexed.
    let mut added: Vec<PathBuf> = Vec::new();
    for path in current_files {
        if !self.file_mtimes.contains_key(path) {
            added.push(path.clone());
        }
    }

    if deleted.is_empty() && changed.is_empty() && added.is_empty() {
        progress(0, 0);
        return Ok(RefreshSummary {
            total_processed,
            ..RefreshSummary::default()
        });
    }

    if !deleted.is_empty() {
        self.remove_indexed_files(&deleted);
    }

    let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
    to_embed.extend(changed.iter().cloned());
    to_embed.extend(added.iter().cloned());

    // Only deletions happened.
    if to_embed.is_empty() {
        progress(0, 0);
        return Ok(RefreshSummary {
            changed: 0,
            added: 0,
            deleted: deleted.len(),
            total_processed,
        });
    }

    let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
    let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
    // Previously indexed files that produced no metadata and no longer exist on disk are
    // treated as deleted.
    let vanished = to_embed
        .iter()
        .filter(|path| {
            changed_set.contains(path.as_path())
                && !fresh_metadata.contains_key(*path)
                && !path.exists()
        })
        .cloned()
        .collect::<Vec<_>>();
    if !vanished.is_empty() {
        self.remove_indexed_files(&vanished);
        deleted.extend(vanished);
    }

    // Nothing to embed: files that yielded metadata but no chunks still get their old
    // entries dropped and their metadata recorded.
    if chunks.is_empty() {
        progress(0, 0);
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }
        let changed_count = changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        let added_count = added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file.clone(), metadata.content_hash);
        }
        return Ok(RefreshSummary {
            changed: changed_count,
            added: added_count,
            deleted: deleted.len(),
            total_processed,
        });
    }

    let total_chunks = chunks.len();
    progress(0, total_chunks);
    let batch_size = max_batch_size.max(1);
    // While the index still holds entries, new vectors must match its dimension.
    let existing_dimension = if self.entries.is_empty() {
        None
    } else {
        Some(self.dimension)
    };
    let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
    let mut observed_dimension: Option<usize> = existing_dimension;

    for batch_start in (0..chunks.len()).step_by(batch_size) {
        let batch_end = (batch_start + batch_size).min(chunks.len());
        let batch_texts: Vec<String> = chunks[batch_start..batch_end]
            .iter()
            .map(|c| c.embed_text.clone())
            .collect();

        let vectors = embed_fn(batch_texts)?;
        validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

        if let Some(dim) = vectors.first().map(|v| v.len()) {
            match observed_dimension {
                None => observed_dimension = Some(dim),
                Some(expected) if dim != expected => {
                    return Err(format!(
                        "embedding dimension changed during incremental refresh: \
                         cached index uses {expected}, new vectors use {dim}"
                    ));
                }
                _ => {}
            }
        }

        for (i, vector) in vectors.into_iter().enumerate() {
            let chunk_idx = batch_start + i;
            new_entries.push(EmbeddingEntry {
                chunk: chunks[chunk_idx].clone(),
                vector,
            });
        }

        progress(new_entries.len(), total_chunks);
    }

    // Commit: old entries of re-collected files are replaced only after every batch has
    // been embedded successfully.
    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    if !successful_files.is_empty() {
        self.entries
            .retain(|entry| !successful_files.contains(&entry.chunk.file));
    }

    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        self.file_mtimes.insert(file.clone(), metadata.mtime);
        self.file_sizes.insert(file.clone(), metadata.size);
        self.file_hashes.insert(file, metadata.content_hash);
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    Ok(RefreshSummary {
        changed: changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        added: added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        deleted: deleted.len(),
        total_processed,
    })
}
1625
/// Re-indexes an explicit list of invalidated `paths`, plus any files deferred earlier.
///
/// All requested paths are first removed from the index; those that still exist on disk
/// are re-collected and re-embedded. Files that were not indexed before count against
/// `max_files`: once the cap is reached the remaining new files are recorded in
/// `deferred_files` instead of being embedded. `progress` is called with `(0, 0)` when
/// nothing needs embedding, otherwise with `(0, total_chunks)` followed by
/// `(embedded_so_far, total_chunks)` after each batch.
///
/// The result carries the newly embedded entries, the metadata recorded for each
/// re-collected file, the full list of requested paths as `completed_paths`, and a summary.
///
/// # Errors
///
/// Returns the error from `embed_fn` or `validate_embedding_batch`, or an error when a new
/// vector's length differs from the dimension in use. The requested paths have already
/// been removed from the index at that point and are not restored.
pub fn refresh_invalidated_files<F, P>(
    &mut self,
    project_root: &Path,
    paths: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    max_files: usize,
    progress: &mut P,
) -> Result<InvalidatedFilesRefresh, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    // Deferred files that no longer exist on disk are dropped; the rest are retried now.
    self.deferred_files.retain(|path| path.exists());
    let mut requested_paths = paths.to_vec();
    requested_paths.extend(self.deferred_files.iter().cloned());
    requested_paths.sort();
    requested_paths.dedup();
    let total_processed = requested_paths.len();

    if requested_paths.is_empty() {
        progress(0, 0);
        return Ok(InvalidatedFilesRefresh {
            summary: RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    // Remember which requested paths were indexed before they are removed below, so the
    // summary can distinguish changed from added files.
    let previously_indexed: HashSet<PathBuf> = requested_paths
        .iter()
        .filter(|path| self.file_mtimes.contains_key(*path))
        .cloned()
        .collect();

    self.remove_indexed_files(&requested_paths);

    let existing_paths = requested_paths
        .iter()
        .filter(|path| path.exists())
        .cloned()
        .collect::<Vec<_>>();
    // Only files that were indexed before and are now gone count as deleted.
    let deleted = requested_paths
        .iter()
        .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
        .count();

    if existing_paths.is_empty() {
        for path in &requested_paths {
            if !path.exists() {
                self.deferred_files.remove(path);
            }
        }
        progress(0, 0);
        return Ok(InvalidatedFilesRefresh {
            completed_paths: requested_paths,
            summary: RefreshSummary {
                deleted,
                total_processed,
                ..RefreshSummary::default()
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

    // Enforce the indexed-file cap: files still in the index plus re-collected, previously
    // indexed files use up the budget first; whatever remains is available to new files.
    let retained_file_count = self.file_mtimes.len();
    let changed_successful_count = existing_paths
        .iter()
        .filter(|path| {
            previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
        })
        .count();
    let available_new_files =
        max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
    let new_successful_files = existing_paths
        .iter()
        .filter(|path| {
            !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
        })
        .cloned()
        .collect::<Vec<_>>();
    if new_successful_files.len() > available_new_files {
        // New files are admitted in `existing_paths` order (sorted above) until the budget
        // runs out.
        let allowed_new_files = new_successful_files
            .iter()
            .take(available_new_files)
            .cloned()
            .collect::<HashSet<_>>();
        let deferred_new_files = new_successful_files
            .into_iter()
            .filter(|path| !allowed_new_files.contains(path))
            .collect::<HashSet<_>>();

        // Drop metadata and chunks of the deferred files so they are neither embedded nor
        // recorded as indexed.
        fresh_metadata.retain(|file, _| {
            previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
        });
        chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

        if !deferred_new_files.is_empty() {
            for path in &deferred_new_files {
                self.deferred_files.insert(path.clone());
            }
            slog_warn!(
                "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                deferred_new_files.len(),
                max_files
            );
        }
    }

    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    let changed = successful_files
        .iter()
        .filter(|path| previously_indexed.contains(path.as_path()))
        .count();
    let added = successful_files.len().saturating_sub(changed);
    let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

    // Nothing to embed: record the metadata of the re-collected files and return.
    if chunks.is_empty() {
        progress(0, 0);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }

        return Ok(InvalidatedFilesRefresh {
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    let total_chunks = chunks.len();
    progress(0, total_chunks);
    let batch_size = max_batch_size.max(1);
    // The stored dimension is enforced unless the index is empty and none of the requested
    // paths had been indexed before.
    let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
        None
    } else {
        Some(self.dimension)
    };
    let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());

    for batch_start in (0..chunks.len()).step_by(batch_size) {
        let batch_end = (batch_start + batch_size).min(chunks.len());
        let batch_texts: Vec<String> = chunks[batch_start..batch_end]
            .iter()
            .map(|chunk| chunk.embed_text.clone())
            .collect();

        let vectors = embed_fn(batch_texts)?;
        validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

        if let Some(dim) = vectors.first().map(|vector| vector.len()) {
            match observed_dimension {
                None => observed_dimension = Some(dim),
                Some(expected) if dim != expected => {
                    return Err(format!(
                        "embedding dimension changed during invalidated-file refresh: \
                         cached index uses {expected}, new vectors use {dim}"
                    ));
                }
                _ => {}
            }
        }

        for (i, vector) in vectors.into_iter().enumerate() {
            let chunk_idx = batch_start + i;
            new_entries.push(EmbeddingEntry {
                chunk: chunks[chunk_idx].clone(),
                vector,
            });
        }

        progress(new_entries.len(), total_chunks);
    }

    // The new entries are both stored in this index and returned to the caller, hence the
    // clone.
    let added_entries = new_entries.clone();
    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        let freshness = FileFreshness {
            mtime: metadata.mtime,
            size: metadata.size,
            content_hash: metadata.content_hash,
        };
        self.file_mtimes.insert(file.clone(), freshness.mtime);
        self.file_sizes.insert(file.clone(), freshness.size);
        self.file_hashes
            .insert(file.clone(), freshness.content_hash);
        updated_metadata.push((file, freshness));
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    Ok(InvalidatedFilesRefresh {
        added_entries,
        updated_metadata,
        completed_paths: requested_paths,
        summary: RefreshSummary {
            changed,
            added,
            deleted,
            total_processed,
        },
    })
}
1863
1864 pub fn apply_refresh_update(
1865 &mut self,
1866 added_entries: Vec<EmbeddingEntry>,
1867 updated_metadata: Vec<(PathBuf, FileFreshness)>,
1868 completed_paths: &[PathBuf],
1869 ) {
1870 self.remove_indexed_files(completed_paths);
1871
1872 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1873 self.entries.extend(added_entries);
1874 for (file, freshness) in updated_metadata {
1875 self.file_mtimes.insert(file.clone(), freshness.mtime);
1876 self.file_sizes.insert(file.clone(), freshness.size);
1877 self.file_hashes.insert(file, freshness.content_hash);
1878 }
1879 if let Some(dim) = observed_dimension {
1880 self.dimension = dim;
1881 }
1882 }
1883
1884 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1885 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1886 self.entries
1887 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1888 for path in files {
1889 self.file_mtimes.remove(path);
1890 self.file_sizes.remove(path);
1891 self.file_hashes.remove(path);
1892 }
1893 }
1894
1895 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1897 if self.entries.is_empty() || query_vector.len() != self.dimension {
1898 return Vec::new();
1899 }
1900
1901 let mut scored: Vec<(f32, usize)> = self
1902 .entries
1903 .iter()
1904 .enumerate()
1905 .map(|(i, entry)| {
1906 let mut score = cosine_similarity(query_vector, &entry.vector);
1907 if entry.chunk.exported {
1908 score *= 1.1;
1909 }
1910 (score, i)
1911 })
1912 .collect();
1913
1914 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1916
1917 scored
1918 .into_iter()
1919 .take(top_k)
1920 .map(|(score, idx)| {
1924 let entry = &self.entries[idx];
1925 SemanticResult {
1926 file: entry.chunk.file.clone(),
1927 name: entry.chunk.name.clone(),
1928 kind: entry.chunk.kind.clone(),
1929 start_line: entry.chunk.start_line,
1930 end_line: entry.chunk.end_line,
1931 exported: entry.chunk.exported,
1932 snippet: entry.chunk.snippet.clone(),
1933 score,
1934 source: "semantic",
1935 }
1936 })
1937 .collect()
1938 }
1939
/// Returns the number of embedding entries held by the index.
pub fn len(&self) -> usize {
    self.entries.len()
}
1944
1945 pub fn is_file_stale(&self, file: &Path) -> bool {
1947 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1948 return true;
1949 };
1950 let Some(stored_size) = self.file_sizes.get(file) else {
1951 return true;
1952 };
1953 let Some(stored_hash) = self.file_hashes.get(file) else {
1954 return true;
1955 };
1956 let cached = FileFreshness {
1957 mtime: *stored_mtime,
1958 size: *stored_size,
1959 content_hash: *stored_hash,
1960 };
1961 match cache_freshness::verify_file_strict(file, &cached) {
1962 FreshnessVerdict::HotFresh => false,
1963 FreshnessVerdict::ContentFresh { .. } => false,
1964 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1965 }
1966 }
1967
1968 fn backfill_missing_file_sizes(&mut self) {
1969 for path in self.file_mtimes.keys() {
1970 if self.file_sizes.contains_key(path) {
1971 continue;
1972 }
1973 if let Ok(metadata) = fs::metadata(path) {
1974 self.file_sizes.insert(path.clone(), metadata.len());
1975 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1976 self.file_hashes.insert(path.clone(), hash);
1977 }
1978 }
1979 }
1980 }
1981
/// Removes `file` from the index by delegating to `invalidate_file`.
pub fn remove_file(&mut self, file: &Path) {
    self.invalidate_file(file);
}
1986
1987 pub fn invalidate_file(&mut self, file: &Path) {
1988 let canonical_file = canonicalize_existing_or_deleted_path(file);
1989 self.entries
1990 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
1991 self.file_mtimes.remove(file);
1992 self.file_sizes.remove(file);
1993 self.file_hashes.remove(file);
1994 if canonical_file.as_path() != file {
1995 self.file_mtimes.remove(&canonical_file);
1996 self.file_sizes.remove(&canonical_file);
1997 self.file_hashes.remove(&canonical_file);
1998 }
1999 }
2000
/// Returns the vector dimension recorded for this index.
pub fn dimension(&self) -> usize {
    self.dimension
}
2005
/// Returns the fingerprint attached to this index, if one has been set.
pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
    self.fingerprint.as_ref()
}
2009
2010 pub fn backend_label(&self) -> Option<&str> {
2011 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2012 }
2013
2014 pub fn model_label(&self) -> Option<&str> {
2015 self.fingerprint.as_ref().map(|f| f.model.as_str())
2016 }
2017
/// Attaches `fingerprint` to the index, replacing any fingerprint set earlier.
pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
    self.fingerprint = Some(fingerprint);
}
2021
2022 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2024 if self.entries.is_empty() {
2027 slog_info!("skipping semantic index persistence (0 entries)");
2028 return;
2029 }
2030 let dir = storage_dir.join("semantic").join(project_key);
2031 if let Err(e) = fs::create_dir_all(&dir) {
2032 slog_warn!("failed to create semantic cache dir: {}", e);
2033 return;
2034 }
2035 let data_path = dir.join("semantic.bin");
2036 let tmp_path = dir.join(format!(
2037 "semantic.bin.tmp.{}.{}",
2038 std::process::id(),
2039 SystemTime::now()
2040 .duration_since(SystemTime::UNIX_EPOCH)
2041 .unwrap_or(Duration::ZERO)
2042 .as_nanos()
2043 ));
2044 let bytes = self.to_bytes();
2045 let write_result = (|| -> std::io::Result<()> {
2046 use std::io::Write;
2047 let mut file = fs::File::create(&tmp_path)?;
2048 file.write_all(&bytes)?;
2049 file.sync_all()?;
2050 Ok(())
2051 })();
2052 if let Err(e) = write_result {
2053 slog_warn!("failed to write semantic index: {}", e);
2054 let _ = fs::remove_file(&tmp_path);
2055 return;
2056 }
2057 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2058 slog_warn!("failed to rename semantic index: {}", e);
2059 let _ = fs::remove_file(&tmp_path);
2060 return;
2061 }
2062 slog_info!(
2063 "semantic index persisted: {} entries, {:.1} KB",
2064 self.entries.len(),
2065 bytes.len() as f64 / 1024.0
2066 );
2067 }
2068
2069 pub fn read_from_disk(
2071 storage_dir: &Path,
2072 project_key: &str,
2073 current_canonical_root: &Path,
2074 is_worktree_bridge: bool,
2075 expected_fingerprint: Option<&str>,
2076 ) -> Option<Self> {
2077 debug_assert!(current_canonical_root.is_absolute());
2078 let data_path = storage_dir
2079 .join("semantic")
2080 .join(project_key)
2081 .join("semantic.bin");
2082 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2083 if file_len < HEADER_BYTES_V1 {
2084 slog_warn!(
2085 "corrupt semantic index (too small: {} bytes), removing",
2086 file_len
2087 );
2088 if !is_worktree_bridge {
2089 let _ = fs::remove_file(&data_path);
2090 }
2091 return None;
2092 }
2093
2094 let bytes = fs::read(&data_path).ok()?;
2095 let version = bytes[0];
2096 if version != SEMANTIC_INDEX_VERSION_V6 {
2097 slog_info!(
2098 "cached semantic index version {} is older than {}, rebuilding",
2099 version,
2100 SEMANTIC_INDEX_VERSION_V6
2101 );
2102 if !is_worktree_bridge {
2103 let _ = fs::remove_file(&data_path);
2104 }
2105 return None;
2106 }
2107 match Self::from_bytes(&bytes, current_canonical_root) {
2108 Ok(index) => {
2109 if index.entries.is_empty() {
2110 slog_info!("cached semantic index is empty, will rebuild");
2111 if !is_worktree_bridge {
2112 let _ = fs::remove_file(&data_path);
2113 }
2114 return None;
2115 }
2116 if let Some(expected) = expected_fingerprint {
2117 let matches = index
2118 .fingerprint()
2119 .map(|fingerprint| fingerprint.matches_expected(expected))
2120 .unwrap_or(false);
2121 if !matches {
2122 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2123 if !is_worktree_bridge {
2124 let _ = fs::remove_file(&data_path);
2125 }
2126 return None;
2127 }
2128 }
2129 slog_info!(
2130 "loaded semantic index from disk: {} entries",
2131 index.entries.len()
2132 );
2133 Some(index)
2134 }
2135 Err(e) => {
2136 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2137 if !is_worktree_bridge {
2138 let _ = fs::remove_file(&data_path);
2139 }
2140 None
2141 }
2142 }
2143 }
2144
2145 pub fn to_bytes(&self) -> Vec<u8> {
2147 let mut buf = Vec::new();
2148 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2149 let encoded = fingerprint.as_string();
2150 if encoded.is_empty() {
2151 None
2152 } else {
2153 Some(encoded.into_bytes())
2154 }
2155 });
2156 let file_mtimes: Vec<_> = self
2157 .file_mtimes
2158 .iter()
2159 .filter_map(|(path, mtime)| {
2160 cache_relative_path(&self.project_root, path)
2161 .map(|relative| (relative, path, mtime))
2162 })
2163 .collect();
2164 let entries: Vec<_> = self
2165 .entries
2166 .iter()
2167 .filter_map(|entry| {
2168 cache_relative_path(&self.project_root, &entry.chunk.file)
2169 .map(|relative| (relative, entry))
2170 })
2171 .collect();
2172
2173 let version = SEMANTIC_INDEX_VERSION_V6;
2186 buf.push(version);
2187 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2188 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2189 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2190 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2191 buf.extend_from_slice(fp_bytes_ref);
2192
2193 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2196 for (relative, path, mtime) in &file_mtimes {
2197 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2198 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2199 buf.extend_from_slice(&path_bytes);
2200 let duration = mtime
2201 .duration_since(SystemTime::UNIX_EPOCH)
2202 .unwrap_or_default();
2203 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2204 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2205 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2206 buf.extend_from_slice(&size.to_le_bytes());
2207 let hash = self
2208 .file_hashes
2209 .get(*path)
2210 .copied()
2211 .unwrap_or_else(cache_freshness::zero_hash);
2212 buf.extend_from_slice(hash.as_bytes());
2213 }
2214
2215 for (relative, entry) in &entries {
2217 let c = &entry.chunk;
2218
2219 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2221 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2222 buf.extend_from_slice(&file_bytes);
2223
2224 let name_bytes = c.name.as_bytes();
2226 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2227 buf.extend_from_slice(name_bytes);
2228
2229 buf.push(symbol_kind_to_u8(&c.kind));
2231
2232 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2234 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2235 buf.push(c.exported as u8);
2236
2237 let snippet_bytes = c.snippet.as_bytes();
2239 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2240 buf.extend_from_slice(snippet_bytes);
2241
2242 let embed_bytes = c.embed_text.as_bytes();
2244 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2245 buf.extend_from_slice(embed_bytes);
2246
2247 for &val in &entry.vector {
2249 buf.extend_from_slice(&val.to_le_bytes());
2250 }
2251 }
2252
2253 buf
2254 }
2255
2256 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2258 debug_assert!(current_canonical_root.is_absolute());
2259 let mut pos = 0;
2260
2261 if data.len() < HEADER_BYTES_V1 {
2262 return Err("data too short".to_string());
2263 }
2264
2265 let version = data[pos];
2266 pos += 1;
2267 if version != SEMANTIC_INDEX_VERSION_V1
2268 && version != SEMANTIC_INDEX_VERSION_V2
2269 && version != SEMANTIC_INDEX_VERSION_V3
2270 && version != SEMANTIC_INDEX_VERSION_V4
2271 && version != SEMANTIC_INDEX_VERSION_V5
2272 && version != SEMANTIC_INDEX_VERSION_V6
2273 {
2274 return Err(format!("unsupported version: {}", version));
2275 }
2276 if (version == SEMANTIC_INDEX_VERSION_V2
2280 || version == SEMANTIC_INDEX_VERSION_V3
2281 || version == SEMANTIC_INDEX_VERSION_V4
2282 || version == SEMANTIC_INDEX_VERSION_V5
2283 || version == SEMANTIC_INDEX_VERSION_V6)
2284 && data.len() < HEADER_BYTES_V2
2285 {
2286 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2287 }
2288
2289 let dimension = read_u32(data, &mut pos)? as usize;
2290 let entry_count = read_u32(data, &mut pos)? as usize;
2291 validate_embedding_dimension(dimension)?;
2292 if entry_count > MAX_ENTRIES {
2293 return Err(format!("too many semantic index entries: {}", entry_count));
2294 }
2295
2296 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2302 || version == SEMANTIC_INDEX_VERSION_V3
2303 || version == SEMANTIC_INDEX_VERSION_V4
2304 || version == SEMANTIC_INDEX_VERSION_V5
2305 || version == SEMANTIC_INDEX_VERSION_V6;
2306 let fingerprint = if has_fingerprint_field {
2307 let fingerprint_len = read_u32(data, &mut pos)? as usize;
2308 if pos + fingerprint_len > data.len() {
2309 return Err("unexpected end of data reading fingerprint".to_string());
2310 }
2311 if fingerprint_len == 0 {
2312 None
2313 } else {
2314 let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2315 pos += fingerprint_len;
2316 Some(
2317 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2318 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2319 )
2320 }
2321 } else {
2322 None
2323 };
2324
2325 let mtime_count = read_u32(data, &mut pos)? as usize;
2327 if mtime_count > MAX_ENTRIES {
2328 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2329 }
2330
2331 let vector_bytes = entry_count
2332 .checked_mul(dimension)
2333 .and_then(|count| count.checked_mul(F32_BYTES))
2334 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2335 if vector_bytes > data.len().saturating_sub(pos) {
2336 return Err("semantic index vectors exceed available data".to_string());
2337 }
2338
2339 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2340 let mut file_sizes = HashMap::with_capacity(mtime_count);
2341 let mut file_hashes = HashMap::with_capacity(mtime_count);
2342 for _ in 0..mtime_count {
2343 let path = read_string(data, &mut pos)?;
2344 let secs = read_u64(data, &mut pos)?;
2345 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2351 || version == SEMANTIC_INDEX_VERSION_V4
2352 || version == SEMANTIC_INDEX_VERSION_V5
2353 || version == SEMANTIC_INDEX_VERSION_V6
2354 {
2355 read_u32(data, &mut pos)?
2356 } else {
2357 0
2358 };
2359 let size =
2360 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2361 read_u64(data, &mut pos)?
2362 } else {
2363 0
2364 };
2365 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2366 if pos + 32 > data.len() {
2367 return Err("unexpected end of data reading content hash".to_string());
2368 }
2369 let mut hash_bytes = [0u8; 32];
2370 hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2371 pos += 32;
2372 blake3::Hash::from_bytes(hash_bytes)
2373 } else {
2374 cache_freshness::zero_hash()
2375 };
2376 if nanos >= 1_000_000_000 {
2383 return Err(format!(
2384 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2385 nanos
2386 ));
2387 }
2388 let duration = std::time::Duration::new(secs, nanos);
2389 let mtime = SystemTime::UNIX_EPOCH
2390 .checked_add(duration)
2391 .ok_or_else(|| {
2392 format!(
2393 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2394 secs, nanos
2395 )
2396 })?;
2397 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2398 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2399 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2400 } else {
2401 PathBuf::from(path)
2402 };
2403 file_mtimes.insert(path.clone(), mtime);
2404 file_sizes.insert(path.clone(), size);
2405 file_hashes.insert(path, content_hash);
2406 }
2407
2408 let mut entries = Vec::with_capacity(entry_count);
2410 for _ in 0..entry_count {
2411 let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2412 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2413 cached_path_under_root(current_canonical_root, &raw_file)
2414 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2415 } else {
2416 raw_file
2417 };
2418 let name = read_string(data, &mut pos)?;
2419
2420 if pos >= data.len() {
2421 return Err("unexpected end of data".to_string());
2422 }
2423 let kind = u8_to_symbol_kind(data[pos]);
2424 pos += 1;
2425
2426 let start_line = read_u32(data, &mut pos)?;
2427 let end_line = read_u32(data, &mut pos)?;
2428
2429 if pos >= data.len() {
2430 return Err("unexpected end of data".to_string());
2431 }
2432 let exported = data[pos] != 0;
2433 pos += 1;
2434
2435 let snippet = read_string(data, &mut pos)?;
2436 let embed_text = read_string(data, &mut pos)?;
2437
2438 let vec_bytes = dimension
2440 .checked_mul(F32_BYTES)
2441 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2442 if pos + vec_bytes > data.len() {
2443 return Err("unexpected end of data reading vector".to_string());
2444 }
2445 let mut vector = Vec::with_capacity(dimension);
2446 for _ in 0..dimension {
2447 let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2448 vector.push(f32::from_le_bytes(bytes));
2449 pos += 4;
2450 }
2451
2452 entries.push(EmbeddingEntry {
2453 chunk: SemanticChunk {
2454 file,
2455 name,
2456 kind,
2457 start_line,
2458 end_line,
2459 exported,
2460 embed_text,
2461 snippet,
2462 },
2463 vector,
2464 });
2465 }
2466
2467 if entries.len() != entry_count {
2468 return Err(format!(
2469 "semantic cache entry count drift: header={} decoded={}",
2470 entry_count,
2471 entries.len()
2472 ));
2473 }
2474 for entry in &entries {
2475 if !file_mtimes.contains_key(&entry.chunk.file) {
2476 return Err(format!(
2477 "semantic cache metadata missing for entry file {}",
2478 entry.chunk.file.display()
2479 ));
2480 }
2481 }
2482
2483 Ok(Self {
2484 entries,
2485 file_mtimes,
2486 file_sizes,
2487 file_hashes,
2488 dimension,
2489 fingerprint,
2490 project_root: current_canonical_root.to_path_buf(),
2491 deferred_files: HashSet::new(),
2492 })
2493 }
2494}
2495
2496fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2498 let relative = file
2499 .strip_prefix(project_root)
2500 .unwrap_or(file)
2501 .to_string_lossy();
2502
2503 let kind_label = match &symbol.kind {
2504 SymbolKind::Function => "function",
2505 SymbolKind::Class => "class",
2506 SymbolKind::Method => "method",
2507 SymbolKind::Struct => "struct",
2508 SymbolKind::Interface => "interface",
2509 SymbolKind::Enum => "enum",
2510 SymbolKind::TypeAlias => "type",
2511 SymbolKind::Variable => "variable",
2512 SymbolKind::Heading => "heading",
2513 SymbolKind::FileSummary => "file-summary",
2514 };
2515
2516 let name = &symbol.name;
2518 let mut text = format!(
2519 "name:{name} file:{} kind:{} name:{name}",
2520 relative, kind_label
2521 );
2522
2523 if let Some(sig) = &symbol.signature {
2524 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2532 }
2533
2534 let lines: Vec<&str> = source.lines().collect();
2536 let start = (symbol.range.start_line as usize).min(lines.len());
2537 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2539 if start < end {
2540 let body: String = lines[start..end]
2541 .iter()
2542 .take(15) .copied()
2544 .collect::<Vec<&str>>()
2545 .join("\n");
2546 let snippet = if body.len() > 300 {
2547 format!("{}...", &body[..body.floor_char_boundary(300)])
2548 } else {
2549 body
2550 };
2551 text.push_str(&format!(" body:{}", snippet));
2552 }
2553
2554 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2559}
2560
/// Character cap applied (via `truncate_chars`) to the embed text of both
/// symbol chunks and file-summary chunks.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2565
/// Returns the first `max_chars` characters of `value` as an owned string.
///
/// Counts `char`s rather than bytes, so multi-byte text is never split; a
/// value with `max_chars` characters or fewer is returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        // Byte offset of the first character that no longer fits.
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2569
2570fn first_leading_doc_comment(source: &str) -> String {
2571 let lines: Vec<&str> = source.lines().collect();
2572 let Some((start, first)) = lines
2573 .iter()
2574 .enumerate()
2575 .find(|(_, line)| !line.trim().is_empty())
2576 else {
2577 return String::new();
2578 };
2579
2580 let trimmed = first.trim_start();
2581 if trimmed.starts_with("/**") {
2582 let mut comment = Vec::new();
2583 for line in lines.iter().skip(start) {
2584 comment.push(*line);
2585 if line.contains("*/") {
2586 break;
2587 }
2588 }
2589 return truncate_chars(&comment.join("\n"), 200);
2590 }
2591
2592 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2593 let comment = lines
2594 .iter()
2595 .skip(start)
2596 .take_while(|line| {
2597 let trimmed = line.trim_start();
2598 trimmed.starts_with("///") || trimmed.starts_with("//!")
2599 })
2600 .copied()
2601 .collect::<Vec<_>>()
2602 .join("\n");
2603 return truncate_chars(&comment, 200);
2604 }
2605
2606 String::new()
2607}
2608
2609pub fn build_file_summary_chunk(
2610 file: &Path,
2611 project_root: &Path,
2612 source: &str,
2613 top_exports: &[&str],
2614 top_export_signatures: &[Option<&str>],
2615) -> SemanticChunk {
2616 let relative = file.strip_prefix(project_root).unwrap_or(file);
2617 let rel_path = relative.to_string_lossy();
2618 let parent_dir = relative
2619 .parent()
2620 .map(|parent| parent.to_string_lossy().to_string())
2621 .unwrap_or_default();
2622 let name = file
2623 .file_stem()
2624 .map(|stem| stem.to_string_lossy().to_string())
2625 .unwrap_or_default();
2626 let doc = first_leading_doc_comment(source);
2627 let exports = top_exports
2628 .iter()
2629 .take(5)
2630 .copied()
2631 .collect::<Vec<_>>()
2632 .join(",");
2633 let snippet = if doc.is_empty() {
2634 top_export_signatures
2635 .first()
2636 .and_then(|signature| signature.as_deref())
2637 .map(|signature| truncate_chars(signature, 200))
2638 .unwrap_or_default()
2639 } else {
2640 doc.clone()
2641 };
2642
2643 SemanticChunk {
2644 file: file.to_path_buf(),
2645 name,
2646 kind: SymbolKind::FileSummary,
2647 start_line: 0,
2648 end_line: 0,
2649 exported: false,
2650 embed_text: truncate_chars(
2651 &format!(
2652 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2653 file.file_stem()
2654 .map(|stem| stem.to_string_lossy().to_string())
2655 .unwrap_or_default()
2656 ),
2657 MAX_EMBED_TEXT_CHARS,
2658 ),
2659 snippet,
2660 }
2661}
2662
2663fn parser_for(
2664 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2665 lang: crate::parser::LangId,
2666) -> Result<&mut Parser, String> {
2667 use std::collections::hash_map::Entry;
2668
2669 match parsers.entry(lang) {
2670 Entry::Occupied(entry) => Ok(entry.into_mut()),
2671 Entry::Vacant(entry) => {
2672 let grammar = grammar_for(lang);
2673 let mut parser = Parser::new();
2674 parser
2675 .set_language(&grammar)
2676 .map_err(|error| error.to_string())?;
2677 Ok(entry.insert(parser))
2678 }
2679 }
2680}
2681
/// Reports whether `path` has one of the file extensions accepted for
/// semantic indexing.
///
/// The comparison is exact (case-sensitive); paths without an extension or
/// with a non-UTF-8 extension are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: [&str; 26] = [
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2714
2715fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2716 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2717 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2718 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2719 .map_err(|error| error.to_string())?
2720 .unwrap_or_else(cache_freshness::zero_hash);
2721 Ok(IndexedFileMetadata {
2722 mtime,
2723 size: metadata.len(),
2724 content_hash,
2725 })
2726}
2727
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// When the path itself cannot be canonicalized, its parent directory is
/// canonicalized instead and the file name is re-attached. If that also
/// fails, or the path has no parent or no file name, the path is returned
/// unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    match fs::canonicalize(path) {
        Ok(resolved) => resolved,
        Err(_) => match (path.parent(), path.file_name()) {
            (Some(dir), Some(leaf)) => match fs::canonicalize(dir) {
                Ok(resolved_dir) => resolved_dir.join(leaf),
                Err(_) => path.to_path_buf(),
            },
            _ => path.to_path_buf(),
        },
    }
}
2744
2745fn collect_file_chunks(
2746 project_root: &Path,
2747 file: &Path,
2748 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2749) -> Result<Vec<SemanticChunk>, String> {
2750 if !is_semantic_indexed_extension(file) {
2751 return Err("unsupported file extension".to_string());
2752 }
2753 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2754 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2755 let tree = parser_for(parsers, lang)?
2756 .parse(&source, None)
2757 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2758 let symbols =
2759 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2760
2761 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2762}
2763
2764fn build_snippet(symbol: &Symbol, source: &str) -> String {
2766 let lines: Vec<&str> = source.lines().collect();
2767 let start = (symbol.range.start_line as usize).min(lines.len());
2768 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2770 if start < end {
2771 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2772 let mut snippet = snippet_lines.join("\n");
2773 if end - start > 5 {
2774 snippet.push_str("\n ...");
2775 }
2776 if snippet.len() > 300 {
2777 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2778 }
2779 snippet
2780 } else {
2781 String::new()
2782 }
2783}
2784
2785fn symbols_to_chunks(
2787 file: &Path,
2788 symbols: &[Symbol],
2789 source: &str,
2790 project_root: &Path,
2791) -> Vec<SemanticChunk> {
2792 let mut chunks = Vec::new();
2793 let top_exports_with_signatures = symbols
2794 .iter()
2795 .filter(|symbol| {
2796 symbol.exported
2797 && symbol.parent.is_none()
2798 && !matches!(symbol.kind, SymbolKind::Heading)
2799 })
2800 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2801 .collect::<Vec<_>>();
2802
2803 let has_only_headings = !symbols.is_empty()
2804 && symbols
2805 .iter()
2806 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2807 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2808 let top_exports = top_exports_with_signatures
2809 .iter()
2810 .map(|(name, _)| *name)
2811 .collect::<Vec<_>>();
2812 let top_export_signatures = top_exports_with_signatures
2813 .iter()
2814 .map(|(_, signature)| *signature)
2815 .collect::<Vec<_>>();
2816 chunks.push(build_file_summary_chunk(
2817 file,
2818 project_root,
2819 source,
2820 &top_exports,
2821 &top_export_signatures,
2822 ));
2823 }
2824
2825 for symbol in symbols {
2826 if matches!(symbol.kind, SymbolKind::Heading) {
2831 continue;
2832 }
2833
2834 let line_count = symbol
2836 .range
2837 .end_line
2838 .saturating_sub(symbol.range.start_line)
2839 + 1;
2840 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2841 continue;
2842 }
2843
2844 let embed_text = build_embed_text(symbol, source, file, project_root);
2845 let snippet = build_snippet(symbol, source);
2846
2847 chunks.push(SemanticChunk {
2848 file: file.to_path_buf(),
2849 name: symbol.name.clone(),
2850 kind: symbol.kind.clone(),
2851 start_line: symbol.range.start_line,
2852 end_line: symbol.range.end_line,
2853 exported: symbol.exported,
2854 embed_text,
2855 snippet,
2856 });
2857
2858 }
2861
2862 chunks
2863}
2864
/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when the lengths differ or when either vector has zero
/// magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms, in
    // element order.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}
2888
2889fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2891 match kind {
2892 SymbolKind::Function => 0,
2893 SymbolKind::Class => 1,
2894 SymbolKind::Method => 2,
2895 SymbolKind::Struct => 3,
2896 SymbolKind::Interface => 4,
2897 SymbolKind::Enum => 5,
2898 SymbolKind::TypeAlias => 6,
2899 SymbolKind::Variable => 7,
2900 SymbolKind::Heading => 8,
2901 SymbolKind::FileSummary => 9,
2902 }
2903}
2904
2905fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2906 match v {
2907 0 => SymbolKind::Function,
2908 1 => SymbolKind::Class,
2909 2 => SymbolKind::Method,
2910 3 => SymbolKind::Struct,
2911 4 => SymbolKind::Interface,
2912 5 => SymbolKind::Enum,
2913 6 => SymbolKind::TypeAlias,
2914 7 => SymbolKind::Variable,
2915 8 => SymbolKind::Heading,
2916 9 => SymbolKind::FileSummary,
2917 _ => SymbolKind::Heading,
2918 }
2919}
2920
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by
/// four bytes.
///
/// # Errors
/// Returns an error and leaves `*pos` untouched when fewer than four bytes
/// are available at `*pos`. This includes a cursor so large that `*pos + 4`
/// would overflow `usize`, which previously overflowed instead of failing
/// cleanly.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    const WIDTH: usize = std::mem::size_of::<u32>();

    // `checked_add` guards the cursor arithmetic and `get` guards the slice,
    // so malformed input can only produce an `Err`, never a panic.
    let bytes: [u8; WIDTH] = pos
        .checked_add(WIDTH)
        .and_then(|end| data.get(*pos..end))
        .and_then(|window| window.try_into().ok())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    *pos += WIDTH;
    Ok(u32::from_le_bytes(bytes))
}
2929
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by
/// eight bytes.
///
/// # Errors
/// Returns an error and leaves `*pos` untouched when fewer than eight bytes
/// are available at `*pos`. This includes a cursor so large that `*pos + 8`
/// would overflow `usize`, which previously overflowed instead of failing
/// cleanly.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    const WIDTH: usize = std::mem::size_of::<u64>();

    // `checked_add` guards the cursor arithmetic and `get` guards the slice,
    // so malformed input can only produce an `Err`, never a panic.
    let bytes: [u8; WIDTH] = pos
        .checked_add(WIDTH)
        .and_then(|end| data.get(*pos..end))
        .and_then(|window| window.try_into().ok())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    *pos += WIDTH;
    Ok(u64::from_le_bytes(bytes))
}
2938
2939fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2940 let len = read_u32(data, pos)? as usize;
2941 if *pos + len > data.len() {
2942 return Err("unexpected end of data reading string".to_string());
2943 }
2944 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2945 *pos += len;
2946 Ok(s)
2947}
2948
2949#[cfg(test)]
2950mod tests {
2951 use super::*;
2952 use crate::config::{SemanticBackend, SemanticBackendConfig};
2953 use crate::parser::FileParser;
2954 use std::io::{Read, Write};
2955 use std::net::TcpListener;
2956 use std::thread;
2957
2958 #[test]
2959 fn semantic_index_includes_php_inc_and_scss_extensions() {
2960 for file in ["partial.inc", "index.php", "styles.scss"] {
2961 assert!(
2962 is_semantic_indexed_extension(Path::new(file)),
2963 "{file} should be semantic-index eligible"
2964 );
2965 }
2966 }
2967
2968 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2969 where
2970 F: Fn(String, String, String) -> String + Send + 'static,
2971 {
2972 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2973 let addr = listener.local_addr().expect("local addr");
2974 let handle = thread::spawn(move || {
2975 let (mut stream, _) = listener.accept().expect("accept request");
2976 let mut buf = Vec::new();
2977 let mut chunk = [0u8; 4096];
2978 let mut header_end = None;
2979 let mut content_length = 0usize;
2980 loop {
2981 let n = stream.read(&mut chunk).expect("read request");
2982 if n == 0 {
2983 break;
2984 }
2985 buf.extend_from_slice(&chunk[..n]);
2986 if header_end.is_none() {
2987 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2988 header_end = Some(pos + 4);
2989 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2990 for line in headers.lines() {
2991 if let Some(value) = line.strip_prefix("Content-Length:") {
2992 content_length = value.trim().parse::<usize>().unwrap_or(0);
2993 }
2994 }
2995 }
2996 }
2997 if let Some(end) = header_end {
2998 if buf.len() >= end + content_length {
2999 break;
3000 }
3001 }
3002 }
3003
3004 let end = header_end.expect("header terminator");
3005 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3006 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3007 let mut lines = request.lines();
3008 let request_line = lines.next().expect("request line").to_string();
3009 let path = request_line
3010 .split_whitespace()
3011 .nth(1)
3012 .expect("request path")
3013 .to_string();
3014 let response_body = handler(request_line, path, body);
3015 let response = format!(
3016 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3017 response_body.len(),
3018 response_body
3019 );
3020 stream
3021 .write_all(response.as_bytes())
3022 .expect("write response");
3023 });
3024
3025 (format!("http://{}", addr), handle)
3026 }
3027
3028 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3029 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3030 }
3031
3032 fn write_rust_file(path: &Path, function_name: &str) {
3033 fs::write(
3034 path,
3035 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3036 )
3037 .unwrap();
3038 }
3039
3040 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3041 let mut embed = test_vector_for_texts;
3042 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3043 }
3044
3045 fn test_project_root() -> PathBuf {
3046 std::env::current_dir().unwrap()
3047 }
3048
3049 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3050 index.file_mtimes.insert(file.to_path_buf(), mtime);
3051 index.file_sizes.insert(file.to_path_buf(), size);
3052 index
3053 .file_hashes
3054 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3055 }
3056
    /// An entry and its per-file metadata whose path lies outside the project
    /// root must not survive a `to_bytes` -> `from_bytes` round trip.
    #[test]
    fn semantic_cache_serialization_skips_paths_outside_project_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = fs::canonicalize(dir.path()).expect("canonical project");
        // `<project>/../outside.rs` is a sibling of the project directory.
        let outside = project.join("..").join("outside.rs");
        let mut index = SemanticIndex::new(project.clone(), 3);
        index
            .file_mtimes
            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(outside.clone(), 1);
        index
            .file_hashes
            .insert(outside.clone(), cache_freshness::zero_hash());
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: outside,
                name: "outside".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 0,
                exported: false,
                embed_text: "outside".to_string(),
                snippet: "outside".to_string(),
            },
            vector: vec![1.0, 0.0, 0.0],
        });

        let bytes = index.to_bytes();
        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
        // Loading succeeds, but nothing about the outside file is restored.
        assert_eq!(loaded.entries.len(), 0);
        assert!(loaded.file_mtimes.is_empty());
    }
3089
3090 #[test]
3091 fn test_cosine_similarity_identical() {
3092 let a = vec![1.0, 0.0, 0.0];
3093 let b = vec![1.0, 0.0, 0.0];
3094 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
3095 }
3096
3097 #[test]
3098 fn test_cosine_similarity_orthogonal() {
3099 let a = vec![1.0, 0.0, 0.0];
3100 let b = vec![0.0, 1.0, 0.0];
3101 assert!(cosine_similarity(&a, &b).abs() < 0.001);
3102 }
3103
3104 #[test]
3105 fn test_cosine_similarity_opposite() {
3106 let a = vec![1.0, 0.0, 0.0];
3107 let b = vec![-1.0, 0.0, 0.0];
3108 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
3109 }
3110
    /// A one-entry index with a fingerprint survives `to_bytes` ->
    /// `from_bytes` with its entry, vector, dimension and backend/model
    /// labels intact.
    #[test]
    fn test_serialization_roundtrip() {
        let project_root = test_project_root();
        let file = project_root.join("src/main.rs");
        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: file.clone(),
                name: "handle_request".to_string(),
                kind: SymbolKind::Function,
                start_line: 10,
                end_line: 25,
                exported: true,
                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
                snippet: "fn handle_request() {\n // ...\n}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3, 0.4],
        });
        // Align the index dimension with the four-element test vector.
        index.dimension = 4;
        index
            .file_mtimes
            .insert(file.clone(), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(file, 0);
        index.set_fingerprint(SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "all-MiniLM-L6-v2".to_string(),
            base_url: FALLBACK_BACKEND.to_string(),
            dimension: 4,
            chunking_version: default_chunking_version(),
        });

        let bytes = index.to_bytes();
        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();

        assert_eq!(restored.entries.len(), 1);
        assert_eq!(restored.entries[0].chunk.name, "handle_request");
        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
        assert_eq!(restored.dimension, 4);
        assert_eq!(restored.backend_label(), Some("fastembed"));
        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
    }
3152
3153 #[test]
3154 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
3155 let cases = [
3156 (SymbolKind::Function, 0),
3157 (SymbolKind::Class, 1),
3158 (SymbolKind::Method, 2),
3159 (SymbolKind::Struct, 3),
3160 (SymbolKind::Interface, 4),
3161 (SymbolKind::Enum, 5),
3162 (SymbolKind::TypeAlias, 6),
3163 (SymbolKind::Variable, 7),
3164 (SymbolKind::Heading, 8),
3165 (SymbolKind::FileSummary, 9),
3166 ];
3167
3168 for (kind, encoded) in cases {
3169 assert_eq!(symbol_kind_to_u8(&kind), encoded);
3170 assert_eq!(u8_to_symbol_kind(encoded), kind);
3171 }
3172 }
3173
    /// Searching three one-hot entries with a query that leans towards the
    /// first axis returns exactly the requested two results, best match
    /// first.
    #[test]
    fn test_search_top_k() {
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.dimension = 3;

        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
            // Entry `i` gets the unit vector along axis `i`.
            let mut vec = vec![0.0f32; 3];
            vec[i] = 1.0;
            index.entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file: PathBuf::from("/src/lib.rs"),
                    name: name.to_string(),
                    kind: SymbolKind::Function,
                    start_line: (i * 10 + 1) as u32,
                    end_line: (i * 10 + 5) as u32,
                    exported: true,
                    embed_text: format!("kind:function name:{}", name),
                    snippet: format!("fn {}() {{}}", name),
                },
                vector: vec,
            });
        }

        // The query is dominated by axis 0, the axis of the "auth" entry.
        let query = vec![0.9, 0.1, 0.0];
        let results = index.search(&query, 2);

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].name, "auth");
        assert!(results[0].score > results[1].score);
    }
3206
3207 #[test]
3208 fn test_empty_index_search() {
3209 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3210 let results = index.search(&[0.1, 0.2, 0.3], 10);
3211 assert!(results.is_empty());
3212 }
3213
3214 #[test]
3215 fn single_line_symbol_builds_non_empty_snippet() {
3216 let symbol = Symbol {
3217 name: "answer".to_string(),
3218 kind: SymbolKind::Variable,
3219 range: crate::symbols::Range {
3220 start_line: 0,
3221 start_col: 0,
3222 end_line: 0,
3223 end_col: 24,
3224 },
3225 signature: Some("const answer = 42".to_string()),
3226 scope_chain: Vec::new(),
3227 exported: true,
3228 parent: None,
3229 };
3230 let source = "export const answer = 42;\n";
3231
3232 let snippet = build_snippet(&symbol, source);
3233
3234 assert_eq!(snippet, "export const answer = 42;");
3235 }
3236
    /// Chunks produced by `collect_file_chunks` must equal those derived from
    /// `FileParser::extract_symbols` for the same file, compared field by
    /// field via `chunk_fingerprint`.
    #[test]
    fn optimized_file_chunk_collection_matches_file_parser_path() {
        // The crate's own `src/semantic_index.rs` serves as the input file.
        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        let file = project_root.join("src/semantic_index.rs");
        let source = std::fs::read_to_string(&file).unwrap();

        // Path one: symbols from `FileParser`, converted with `symbols_to_chunks`.
        let mut legacy_parser = FileParser::new();
        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);

        // Path two: `collect_file_chunks` with a fresh parser cache.
        let mut parsers = HashMap::new();
        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();

        assert_eq!(
            chunk_fingerprint(&optimized_chunks),
            chunk_fingerprint(&legacy_chunks)
        );
    }
3255
3256 fn chunk_fingerprint(
3257 chunks: &[SemanticChunk],
3258 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3259 chunks
3260 .iter()
3261 .map(|chunk| {
3262 (
3263 chunk.name.clone(),
3264 chunk.kind.clone(),
3265 chunk.start_line,
3266 chunk.end_line,
3267 chunk.exported,
3268 chunk.embed_text.clone(),
3269 chunk.snippet.clone(),
3270 )
3271 })
3272 .collect()
3273 }
3274
3275 #[test]
3276 fn rejects_oversized_dimension_during_deserialization() {
3277 let mut bytes = Vec::new();
3278 bytes.push(1u8);
3279 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
3280 bytes.extend_from_slice(&0u32.to_le_bytes());
3281 bytes.extend_from_slice(&0u32.to_le_bytes());
3282
3283 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3284 }
3285
3286 #[test]
3287 fn rejects_oversized_entry_count_during_deserialization() {
3288 let mut bytes = Vec::new();
3289 bytes.push(1u8);
3290 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
3291 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
3292 bytes.extend_from_slice(&0u32.to_le_bytes());
3293
3294 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3295 }
3296
    /// `invalidate_file` must drop the file's entries together with its
    /// cached mtime and size.
    #[test]
    fn invalidate_file_removes_entries_and_mtime() {
        let target = PathBuf::from("/src/main.rs");
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: target.clone(),
                name: "main".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 1,
                exported: false,
                embed_text: "main".to_string(),
                snippet: "fn main() {}".to_string(),
            },
            vector: vec![1.0; DEFAULT_DIMENSION],
        });
        index
            .file_mtimes
            .insert(target.clone(), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(target.clone(), 0);

        index.invalidate_file(&target);

        assert!(index.entries.is_empty());
        assert!(!index.file_mtimes.contains_key(&target));
        assert!(!index.file_sizes.contains_key(&target));
    }
3325
    /// A file that is still listed for refresh, has stale cached metadata and
    /// has vanished from disk must be reported as deleted and purged from the
    /// entries and every per-file map.
    #[test]
    fn refresh_missing_changed_file_is_purged_after_collect() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        fs::create_dir_all(file.parent().unwrap()).unwrap();
        write_rust_file(&file, "vanished_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
        let original_size = *index.file_sizes.get(&file).unwrap();
        // Make the cached metadata stale (epoch mtime, size off by one), then
        // delete the file while it is still passed to the refresh below.
        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
        fs::remove_file(&file).unwrap();

        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.changed, 0);
        assert_eq!(summary.added, 0);
        assert_eq!(summary.deleted, 1);
        assert!(index.entries.is_empty());
        assert!(!index.file_mtimes.contains_key(&file));
        assert!(!index.file_sizes.contains_key(&file));
        assert!(!index.file_hashes.contains_key(&file));
    }
3359
    /// When an indexed path still exists but is now a directory rather than a
    /// file, the refresh must keep the cached entries and the cached
    /// metadata untouched and report no added, changed or deleted files.
    #[test]
    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        fs::create_dir_all(file.parent().unwrap()).unwrap();
        write_rust_file(&file, "kept_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
        let original_entry_count = index.entries.len();
        let original_mtime = *index.file_mtimes.get(&file).unwrap();
        let original_size = *index.file_sizes.get(&file).unwrap();

        // Mark the cached metadata stale, then replace the file with a
        // directory of the same name.
        let stale_mtime = SystemTime::UNIX_EPOCH;
        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
        fs::remove_file(&file).unwrap();
        fs::create_dir(&file).unwrap();

        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.changed, 0);
        assert_eq!(summary.added, 0);
        assert_eq!(summary.deleted, 0);
        assert_eq!(index.entries.len(), original_entry_count);
        assert!(index
            .entries
            .iter()
            .any(|entry| entry.chunk.name == "kept_symbol"));
        // The stale values written above are still in place: nothing was refreshed.
        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
    }
3402
    /// Refreshing with a path that was never indexed and does not exist on
    /// disk must record no metadata or entries for it and leave all summary
    /// counters at zero.
    #[test]
    fn refresh_never_indexed_file_error_does_not_record_mtime() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        // Only the parent directory is created; `missing.rs` itself never exists.
        let missing = project_root.join("src/missing.rs");
        fs::create_dir_all(missing.parent().unwrap()).unwrap();

        // NOTE(review): the index is created with `test_project_root()` while
        // the refresh below is given the temp dir as root; confirm this
        // mismatch is intentional.
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&missing),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.added, 0);
        assert_eq!(summary.changed, 0);
        assert_eq!(summary.deleted, 0);
        assert!(!index.file_mtimes.contains_key(&missing));
        assert!(!index.file_sizes.contains_key(&missing));
        assert!(index.entries.is_empty());
    }
3430
    /// A file that exists on disk but was not part of the original build is
    /// reported as added and indexed, while the already indexed file is
    /// reported as neither changed nor deleted.
    #[test]
    fn refresh_reports_added_for_new_files() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let existing = project_root.join("src/lib.rs");
        let added = project_root.join("src/new.rs");
        fs::create_dir_all(existing.parent().unwrap()).unwrap();
        write_rust_file(&existing, "existing_symbol");
        write_rust_file(&added, "added_symbol");

        // Only `existing` goes into the initial index; both files are passed
        // to the refresh.
        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                &[existing.clone(), added.clone()],
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.added, 1);
        assert_eq!(summary.changed, 0);
        assert_eq!(summary.deleted, 0);
        assert_eq!(summary.total_processed, 2);
        assert!(index.file_mtimes.contains_key(&added));
        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
    }
3461
    /// An indexed file that has been removed from disk and is no longer in
    /// the refresh file list is reported as deleted and dropped from the
    /// index.
    #[test]
    fn refresh_reports_deleted_for_removed_files() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let deleted = project_root.join("src/deleted.rs");
        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
        write_rust_file(&deleted, "deleted_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
        fs::remove_file(&deleted).unwrap();

        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        // The refresh is given an empty file list.
        let summary = index
            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
            .unwrap();

        assert_eq!(summary.deleted, 1);
        assert_eq!(summary.changed, 0);
        assert_eq!(summary.added, 0);
        assert_eq!(summary.total_processed, 1);
        assert!(!index.file_mtimes.contains_key(&deleted));
        assert!(index.entries.is_empty());
    }
3486
/// A file whose recorded metadata no longer matches what is on disk is
/// reported as changed, and its old entries are replaced by the new ones.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let target = project_root.join("src/lib.rs");
    fs::create_dir_all(target.parent().unwrap()).unwrap();
    write_rust_file(&target, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&target));
    // Overwrite the recorded metadata with an epoch mtime and size 0
    // (presumably so the refresh sees the file as stale), then rewrite the
    // file with a different symbol.
    set_file_metadata(&mut index, &target, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&target, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut progress = |_completed: usize, _overall: usize| {};
    let candidates = std::slice::from_ref(&target);
    let summary = index
        .refresh_stale_files(project_root, candidates, &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(
        (summary.changed, summary.added, summary.deleted),
        (1, 0, 0)
    );
    assert_eq!(summary.total_processed, 1);

    let has_chunk_named =
        |wanted: &str| index.entries.iter().any(|entry| entry.chunk.name == wanted);
    assert!(has_chunk_named("new_symbol"));
    assert!(!has_chunk_named("old_symbol"));
}
3524
/// Refreshing an index right after building it from the same file is a
/// no-op: the embedding callback is never invoked and the entry count is
/// unchanged.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let target = project_root.join("src/lib.rs");
    fs::create_dir_all(target.parent().unwrap()).unwrap();
    write_rust_file(&target, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&target));
    let entries_before = index.entries.len();

    // Wrap the test embedder so the test can observe whether it was called.
    let mut embedder_invoked = false;
    let mut embed = |texts: Vec<String>| {
        embedder_invoked = true;
        test_vector_for_texts(texts)
    };
    let mut progress = |_completed: usize, _overall: usize| {};
    let candidates = std::slice::from_ref(&target);
    let summary = index
        .refresh_stale_files(project_root, candidates, &mut embed, 8, &mut progress)
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embedder_invoked);
    assert_eq!(index.entries.len(), entries_before);
}
3556
/// A dlopen failure mentioning the ONNX Runtime shared library is classified
/// as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let classified_unavailable = is_onnx_runtime_unavailable(dlopen_failure);

    assert!(classified_unavailable);
}
3563
/// The formatted init error for a missing ONNX Runtime leads with the install
/// hint and still includes the original error text.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let raw_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let rendered = format_embedding_init_error(raw_error);

    let leads_with_hint = rendered.starts_with("ONNX Runtime not found. Install via:");
    let keeps_original = rendered.contains("Original error:");
    assert!(leads_with_hint);
    assert!(keeps_original);
}
3573
/// The OpenAI-compatible backend POSTs to `/v1/embeddings` on the mock server
/// and returns the vectors from the canned response in order.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|start_line, route, _payload| {
        assert!(start_line.starts_with("POST "));
        assert_eq!(route, "/v1/embeddings");
        "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    // Wait for the mock server thread so its own assertions are surfaced.
    server.join().unwrap();
}
3600
/// Captures the raw HTTP request sent by the OpenAI-compatible backend to a
/// one-shot local server and asserts that it carries exactly one
/// `Content-Type` header and that the body contains the configured model.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    // Bind to port 0 so the OS picks a free port for this test.
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        // Total request size (headers + body), known once the header block
        // terminator has been seen.
        let mut expected_len: Option<usize> = None;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if expected_len.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    let header_end = pos + 4;
                    // HTTP field names are case-insensitive, so match
                    // `Content-Length` regardless of the casing the client
                    // used. A case-sensitive match would leave the length at
                    // 0 and could stop reading before the body arrives.
                    let content_length = String::from_utf8_lossy(&buf[..header_end])
                        .lines()
                        .filter_map(|line| line.split_once(':'))
                        .filter(|(name, _)| name.trim().eq_ignore_ascii_case("content-length"))
                        .filter_map(|(_, value)| value.trim().parse::<usize>().ok())
                        .last()
                        .unwrap_or(0);
                    expected_len = Some(header_end + content_length);
                }
            }
            // Stop once the full header block and declared body are buffered.
            if matches!(expected_len, Some(total) if buf.len() >= total) {
                break;
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best-effort write: the request has already been captured.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively; a duplicate would show up as 2.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3693
/// The Ollama backend POSTs to `/api/embed` on the mock server and returns
/// the vectors from the canned `embeddings` response in order.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|start_line, route, _payload| {
        assert!(start_line.starts_with("POST "));
        assert_eq!(route, "/api/embed");
        "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: "embeddinggemma".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
    // Wait for the mock server thread so its own assertions are surfaced.
    server.join().unwrap();
}
3720
/// An index written with one fingerprint loads when the same fingerprint
/// string is expected, and `read_from_disk` returns `None` when a different
/// fingerprint string is expected.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";

    let project_root = test_project_root();
    let source_path = project_root.join("src/main.rs");

    // Build a one-entry, three-dimensional index by hand.
    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);

    let stored_fingerprint = SemanticIndexFingerprint {
        backend: "openai_compatible".to_string(),
        model: "test-embedding".to_string(),
        base_url: "http://127.0.0.1:1234/v1".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(stored_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint string: the cache loads.
    let matching = index.fingerprint().unwrap().as_string();
    let accepted = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(accepted.is_some());

    // Different backend/model/base_url: the cache is rejected.
    let other_fingerprint = SemanticIndexFingerprint {
        backend: "ollama".to_string(),
        model: "embeddinggemma".to_string(),
        base_url: "http://127.0.0.1:11434".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let mismatched = other_fingerprint.as_string();
    let rejected = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(rejected.is_none());
}
3783
/// A cache file whose first byte is overwritten with the V3 version marker
/// is not loaded by `read_from_disk`, and the file no longer exists
/// afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let source_path = PathBuf::from("/src/main.rs");
    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "test".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Serialize the index, then overwrite the first byte with the V3 marker.
    let mut bytes = index.to_bytes();
    bytes[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, bytes).unwrap();

    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
3833
3834 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3835 crate::symbols::Symbol {
3836 name: name.to_string(),
3837 kind,
3838 range: crate::symbols::Range {
3839 start_line: start,
3840 start_col: 0,
3841 end_line: end,
3842 end_col: 0,
3843 },
3844 signature: None,
3845 scope_chain: Vec::new(),
3846 exported: false,
3847 parent: None,
3848 }
3849 }
3850
/// A file whose only symbols are headings produces no chunks at all.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let readme = project_root.join("README.md");
    let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let title = make_symbol(SymbolKind::Heading, "Title", 0, 2);
    let section = make_symbol(SymbolKind::Heading, "Section", 4, 6);
    let symbols = vec![title, section];

    let chunks = symbols_to_chunks(&readme, &symbols, source, &project_root);
    let produced = chunks.len();
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        produced
    );
}
3873
/// A symbol with a very long signature still yields embed text no longer
/// than `MAX_EMBED_TEXT_CHARS` characters.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let manifest = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    // 2000 repetitions of an 8-character token: a 16 000-character signature.
    let oversized_signature = "kubectl ".repeat(2000);
    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some(oversized_signature);

    let text = build_embed_text(&symbol, source, &manifest, &project_root);
    let char_count = text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
3898
/// With one heading and two code symbols, the result holds three chunks: a
/// file summary plus the function and the struct. The heading is absent.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let lib_file = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n    true\n}\n";

    let heading = make_symbol(SymbolKind::Heading, "doc heading", 0, 1);
    let function = make_symbol(SymbolKind::Function, "handle_request", 0, 2);
    let structure = make_symbol(SymbolKind::Struct, "AuthService", 4, 6);
    let symbols = vec![heading, function, structure];

    let chunks = symbols_to_chunks(&lib_file, &symbols, source, &project_root);
    let produced = chunks.len();
    assert_eq!(
        produced,
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        produced
    );

    let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
3933
/// Loopback hostnames (`localhost`, `localhost.localdomain`, and a
/// `.localhost` subdomain, with and without ports) pass the SSRF check.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once and reuse the result for both the check and the
        // failure message, matching the sibling SSRF tests.
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
3952
/// Addresses in 127.0.0.0/8, with and without ports, pass the SSRF check.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            verdict
        );
    }
}
3971
/// Non-loopback addresses in 192.168.x, 10.x, 172.16.x, 169.254.x and
/// 100.64.x are rejected by the SSRF check.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in private_urls.iter() {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            verdict
        );
    }
}
3993
/// Hostnames ending in `.local` are rejected by the SSRF check.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_urls = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_urls.iter() {
        let verdict = validate_base_url_no_ssrf(host);
        assert!(
            verdict.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            verdict
        );
    }
}
4011
/// `normalize_base_url` accepts both a loopback IP and `localhost`, each with
/// an explicit port.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let loopback_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(loopback_ip.is_ok());

    let loopback_name = normalize_base_url("http://localhost:8080");
    assert!(loopback_name.is_ok());
}
4019
/// The ORT version-mismatch message reports the detected version, the system
/// library path and the required version, and lists the auto-fix option
/// before the manual removal option.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_library = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_library);

    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Both remedies must be present, with the auto-fix listed first.
    let auto_fix_offset = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_removal_offset = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_offset < manual_removal_offset,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
4060
/// For a macOS `.dylib` path, the mismatch message contains the detected
/// version and the path, and the path also appears wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");

    let reports_version = msg.contains("v1.9.0");
    let reports_path = msg.contains("/opt/homebrew/lib/libonnxruntime.dylib");
    assert!(reports_version);
    assert!(reports_path);
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
4077}