1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded on an index that was built from zero chunks.
const DEFAULT_DIMENSION: usize = 384;
/// Entry-count ceiling. NOTE(review): the consumer is outside this chunk — confirm usage.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding width accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// NOTE(review): presumably the on-disk header size for format v1 — confirm against the reader.
const HEADER_BYTES_V1: usize = 9;
/// NOTE(review): presumably the on-disk header size for format v2 — confirm against the reader.
const HEADER_BYTES_V2: usize = 13;
/// Guidance prepended to errors that `is_onnx_runtime_unavailable` recognises.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Index format version tags. NOTE(review): what changed between versions is
// not visible in this chunk — see the (de)serialisation code.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;

/// Path appended (after `/v1`) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the config specifies `timeout_ms == 0`.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the config specifies `max_batch_size == 0`.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query vectors kept by `embed_query_cached`.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when none is usable.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before retry `i + 1`, indexed by the zero-based attempt that failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held by `SemanticIndexLock::acquire` while it tries to take the file lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
63pub struct SemanticIndexLock {
64 _guard: fs_lock::LockGuard,
65}
66
67impl SemanticIndexLock {
68 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69 let dir = storage_dir.join("semantic").join(project_key);
70 fs::create_dir_all(&dir)?;
71 let path = dir.join("cache.lock");
72 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73 .lock()
74 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75 fs_lock::try_acquire(&path, Duration::from_secs(2))
76 .map(|guard| Self { _guard: guard })
77 .map_err(|error| match error {
78 fs_lock::AcquireError::Timeout => {
79 std::io::Error::other("timed out acquiring semantic cache lock")
80 }
81 fs_lock::AcquireError::Io(error) => error,
82 })
83 }
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct SemanticIndexFingerprint {
88 pub backend: String,
89 pub model: String,
90 #[serde(default)]
91 pub base_url: String,
92 pub dimension: usize,
93 #[serde(default = "default_chunking_version")]
94 pub chunking_version: u32,
95}
96
/// Chunking version stamped on new fingerprints and assumed by serde when the
/// field is missing.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103 let base_url = config
106 .base_url
107 .as_ref()
108 .and_then(|u| normalize_base_url(u).ok())
109 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110 Self {
111 backend: config.backend.as_str().to_string(),
112 model: config.model.clone(),
113 base_url,
114 dimension,
115 chunking_version: default_chunking_version(),
116 }
117 }
118
119 pub fn as_string(&self) -> String {
120 serde_json::to_string(self).unwrap_or_else(|_| String::new())
121 }
122
123 fn matches_expected(&self, expected: &str) -> bool {
124 let encoded = self.as_string();
125 !encoded.is_empty() && encoded == expected
126 }
127}
128
129enum SemanticEmbeddingEngine {
130 Fastembed(TextEmbedding),
131 OpenAiCompatible {
132 client: Client,
133 model: String,
134 base_url: String,
135 api_key: Option<String>,
136 },
137 Ollama {
138 client: Client,
139 model: String,
140 base_url: String,
141 },
142}
143
144pub struct SemanticEmbeddingModel {
145 backend: SemanticBackend,
146 model: String,
147 base_url: Option<String>,
148 timeout_ms: u64,
149 max_batch_size: usize,
150 dimension: Option<usize>,
151 engine: SemanticEmbeddingEngine,
152 query_embedding_cache: HashMap<String, Vec<f32>>,
153 query_embedding_cache_order: VecDeque<String>,
154 query_embedding_cache_hits: u64,
155 query_embedding_cache_misses: u64,
156}
157
158pub type EmbeddingModel = SemanticEmbeddingModel;
159
160fn validate_embedding_batch(
161 vectors: &[Vec<f32>],
162 expected_count: usize,
163 context: &str,
164) -> Result<(), String> {
165 if expected_count > 0 && vectors.is_empty() {
166 return Err(format!(
167 "{context} returned no vectors for {expected_count} inputs"
168 ));
169 }
170
171 if vectors.len() != expected_count {
172 return Err(format!(
173 "{context} returned {} vectors for {} inputs",
174 vectors.len(),
175 expected_count
176 ));
177 }
178
179 let Some(first_vector) = vectors.first() else {
180 return Ok(());
181 };
182 let expected_dimension = first_vector.len();
183 validate_embedding_dimension(expected_dimension)
184 .map_err(|error| format!("{context} returned {error}"))?;
185 for (index, vector) in vectors.iter().enumerate() {
186 if vector.len() != expected_dimension {
187 return Err(format!(
188 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
189 vector.len()
190 ));
191 }
192 }
193
194 Ok(())
195}
196
197fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
198 if dimension == 0 || dimension > MAX_DIMENSION {
199 return Err(format!(
200 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
201 ));
202 }
203
204 Ok(())
205}
206
207fn normalize_base_url(raw: &str) -> Result<String, String> {
211 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
212 let scheme = parsed.scheme();
213 if scheme != "http" && scheme != "https" {
214 return Err(format!(
215 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
216 scheme
217 ));
218 }
219 Ok(parsed.to_string().trim_end_matches('/').to_string())
220}
221
222pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
237 use std::net::{IpAddr, ToSocketAddrs};
238
239 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
240
241 let host = parsed.host_str().unwrap_or("");
242
243 let is_loopback_host =
248 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
249 if is_loopback_host {
250 return Ok(());
251 }
252
253 if host.ends_with(".local") {
256 return Err(format!(
257 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
258 ));
259 }
260
261 let port = parsed.port_or_known_default().unwrap_or(443);
264 let addr_str = format!("{host}:{port}");
265 let addrs: Vec<IpAddr> = addr_str
266 .to_socket_addrs()
267 .map(|iter| iter.map(|sa| sa.ip()).collect())
268 .unwrap_or_default();
269 for ip in &addrs {
270 if is_private_non_loopback_ip(ip) {
271 return Err(format!(
272 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
273 ));
274 }
275 }
276
277 Ok(())
278}
279
/// Reports whether `ip` lies in a private or reserved range other than
/// loopback.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `100.64/10`, `0/8`.
/// IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses
/// (`::ffff:a.b.c.d`) whose embedded IPv4 address matches the list above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => match v4.octets() {
            [10, ..] | [0, ..] | [192, 168, ..] | [169, 254, ..] => true,
            [172, second, ..] => (16..=31).contains(&second),
            [100, second, ..] => (64..=127).contains(&second),
            _ => false,
        },
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            let link_local = (leading & 0xffc0) == 0xfe80;
            let unique_local = (leading & 0xfe00) == 0xfc00;
            if link_local || unique_local {
                return true;
            }

            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
321
322fn build_openai_embeddings_endpoint(base_url: &str) -> String {
323 if base_url.ends_with("/v1") {
324 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
325 } else {
326 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
327 }
328}
329
330fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
331 if base_url.ends_with("/api") {
332 format!("{base_url}/embed")
333 } else {
334 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
335 }
336}
337
/// Trims surrounding whitespace from an optional token and maps a blank
/// result to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|raw| raw.trim().to_string())
        .filter(|token| !token.is_empty())
}
348
349fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
350 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
351}
352
353fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
354 error.is_connect()
355}
356
357fn sleep_before_embedding_retry(attempt_index: usize) {
358 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
359 std::thread::sleep(Duration::from_millis(*delay_ms));
360 }
361}
362
363fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
364where
365 F: FnMut() -> reqwest::blocking::RequestBuilder,
366{
367 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
368 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
369
370 let response = match make_request().send() {
371 Ok(response) => response,
372 Err(error) => {
373 if !last_attempt && is_retryable_embedding_error(&error) {
374 sleep_before_embedding_retry(attempt_index);
375 continue;
376 }
377 return Err(format!("{backend_label} request failed: {error}"));
378 }
379 };
380
381 let status = response.status();
382 let raw = match response.text() {
383 Ok(raw) => raw,
384 Err(error) => {
385 if !last_attempt && is_retryable_embedding_error(&error) {
386 sleep_before_embedding_retry(attempt_index);
387 continue;
388 }
389 return Err(format!("{backend_label} response read failed: {error}"));
390 }
391 };
392
393 if status.is_success() {
394 return Ok(raw);
395 }
396
397 if !last_attempt && is_retryable_embedding_status(status) {
398 sleep_before_embedding_retry(attempt_index);
399 continue;
400 }
401
402 return Err(format!(
403 "{backend_label} request failed (HTTP {}): {}",
404 status, raw
405 ));
406 }
407
408 unreachable!("embedding request retries exhausted without returning")
409}
410
411impl SemanticEmbeddingModel {
412 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
413 let timeout_ms = if config.timeout_ms == 0 {
414 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
415 } else {
416 config.timeout_ms
417 };
418
419 let max_batch_size = if config.max_batch_size == 0 {
420 DEFAULT_MAX_BATCH_SIZE
421 } else {
422 config.max_batch_size
423 };
424
425 let api_key_env = normalize_api_key(config.api_key_env.clone());
426 let model = config.model.clone();
427
428 let client = Client::builder()
429 .timeout(Duration::from_millis(timeout_ms))
430 .redirect(reqwest::redirect::Policy::none())
431 .build()
432 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
433
434 let engine = match config.backend {
435 SemanticBackend::Fastembed => {
436 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
437 }
438 SemanticBackend::OpenAiCompatible => {
439 let raw = config.base_url.as_ref().ok_or_else(|| {
440 "base_url is required for openai_compatible backend".to_string()
441 })?;
442 let base_url = normalize_base_url(raw)?;
443
444 let api_key = match api_key_env {
445 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
446 format!("missing api_key_env '{var_name}' for openai_compatible backend")
447 })?),
448 None => None,
449 };
450
451 SemanticEmbeddingEngine::OpenAiCompatible {
452 client,
453 model,
454 base_url,
455 api_key,
456 }
457 }
458 SemanticBackend::Ollama => {
459 let raw = config
460 .base_url
461 .as_ref()
462 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
463 let base_url = normalize_base_url(raw)?;
464
465 SemanticEmbeddingEngine::Ollama {
466 client,
467 model,
468 base_url,
469 }
470 }
471 };
472
473 Ok(Self {
474 backend: config.backend,
475 model: config.model.clone(),
476 base_url: config.base_url.clone(),
477 timeout_ms,
478 max_batch_size,
479 dimension: None,
480 engine,
481 query_embedding_cache: HashMap::new(),
482 query_embedding_cache_order: VecDeque::new(),
483 query_embedding_cache_hits: 0,
484 query_embedding_cache_misses: 0,
485 })
486 }
487
488 pub fn backend(&self) -> SemanticBackend {
489 self.backend
490 }
491
492 pub fn model(&self) -> &str {
493 &self.model
494 }
495
496 pub fn base_url(&self) -> Option<&str> {
497 self.base_url.as_deref()
498 }
499
500 pub fn max_batch_size(&self) -> usize {
501 self.max_batch_size
502 }
503
504 pub fn timeout_ms(&self) -> u64 {
505 self.timeout_ms
506 }
507
508 pub fn fingerprint(
509 &mut self,
510 config: &SemanticBackendConfig,
511 ) -> Result<SemanticIndexFingerprint, String> {
512 let dimension = self.dimension()?;
513 Ok(SemanticIndexFingerprint::from_config(config, dimension))
514 }
515
516 pub fn dimension(&mut self) -> Result<usize, String> {
517 if let Some(dimension) = self.dimension {
518 return Ok(dimension);
519 }
520
521 let dimension = match &mut self.engine {
522 SemanticEmbeddingEngine::Fastembed(model) => {
523 let vectors = model
524 .embed(vec!["semantic index fingerprint probe".to_string()], None)
525 .map_err(|error| format_embedding_init_error(error.to_string()))?;
526 vectors
527 .first()
528 .map(|v| v.len())
529 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530 }
531 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
532 let vectors =
533 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
534 vectors
535 .first()
536 .map(|v| v.len())
537 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
538 }
539 SemanticEmbeddingEngine::Ollama { .. } => {
540 let vectors =
541 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
542 vectors
543 .first()
544 .map(|v| v.len())
545 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
546 }
547 };
548
549 self.dimension = Some(dimension);
550 Ok(dimension)
551 }
552
553 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
554 self.embed_texts(texts)
555 }
556
557 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
558 if let Some(vector) = self.query_embedding_cache.get(query) {
559 self.query_embedding_cache_hits += 1;
560 return Ok(vector.clone());
561 }
562
563 self.query_embedding_cache_misses += 1;
564 let embeddings = self.embed_texts(vec![query.to_string()])?;
565 let vector = embeddings
566 .first()
567 .cloned()
568 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
569
570 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
571 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
572 self.query_embedding_cache.remove(&oldest);
573 }
574 }
575 self.query_embedding_cache
576 .insert(query.to_string(), vector.clone());
577 self.query_embedding_cache_order
578 .push_back(query.to_string());
579
580 Ok(vector)
581 }
582
583 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
584 (
585 self.query_embedding_cache_hits,
586 self.query_embedding_cache_misses,
587 self.query_embedding_cache.len(),
588 )
589 }
590
591 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
592 match &mut self.engine {
593 SemanticEmbeddingEngine::Fastembed(model) => model
594 .embed(texts, None::<usize>)
595 .map_err(|error| format_embedding_init_error(error.to_string()))
596 .map_err(|error| format!("failed to embed batch: {error}")),
597 SemanticEmbeddingEngine::OpenAiCompatible {
598 client,
599 model,
600 base_url,
601 api_key,
602 } => {
603 let expected_text_count = texts.len();
604 let endpoint = build_openai_embeddings_endpoint(base_url);
605 let body = serde_json::json!({
606 "input": texts,
607 "model": model,
608 });
609
610 let raw = send_embedding_request(
611 || {
612 let mut request = client.post(&endpoint).json(&body);
622
623 if let Some(api_key) = api_key {
624 request = request.header("Authorization", format!("Bearer {api_key}"));
625 }
626
627 request
628 },
629 "openai compatible",
630 )?;
631
632 #[derive(Deserialize)]
633 struct OpenAiResponse {
634 data: Vec<OpenAiEmbeddingResult>,
635 }
636
637 #[derive(Deserialize)]
638 struct OpenAiEmbeddingResult {
639 embedding: Vec<f32>,
640 index: Option<u32>,
641 }
642
643 let parsed: OpenAiResponse = serde_json::from_str(&raw)
644 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
645 if parsed.data.len() != expected_text_count {
646 return Err(format!(
647 "openai compatible response returned {} embeddings for {} inputs",
648 parsed.data.len(),
649 expected_text_count
650 ));
651 }
652
653 let mut vectors = vec![Vec::new(); parsed.data.len()];
654 for (i, item) in parsed.data.into_iter().enumerate() {
655 let index = item.index.unwrap_or(i as u32) as usize;
656 if index >= vectors.len() {
657 return Err(
658 "openai compatible response contains invalid vector index".to_string()
659 );
660 }
661 vectors[index] = item.embedding;
662 }
663
664 for vector in &vectors {
665 if vector.is_empty() {
666 return Err(
667 "openai compatible response contained missing vectors".to_string()
668 );
669 }
670 }
671
672 self.dimension = vectors.first().map(Vec::len);
673 Ok(vectors)
674 }
675 SemanticEmbeddingEngine::Ollama {
676 client,
677 model,
678 base_url,
679 } => {
680 let expected_text_count = texts.len();
681 let endpoint = build_ollama_embeddings_endpoint(base_url);
682
683 #[derive(Serialize)]
684 struct OllamaPayload<'a> {
685 model: &'a str,
686 input: Vec<String>,
687 }
688
689 let payload = OllamaPayload {
690 model,
691 input: texts,
692 };
693
694 let raw = send_embedding_request(
695 || {
696 client.post(&endpoint).json(&payload)
701 },
702 "ollama",
703 )?;
704
705 #[derive(Deserialize)]
706 struct OllamaResponse {
707 embeddings: Vec<Vec<f32>>,
708 }
709
710 let parsed: OllamaResponse = serde_json::from_str(&raw)
711 .map_err(|error| format!("invalid ollama response: {error}"))?;
712 if parsed.embeddings.is_empty() {
713 return Err("ollama response returned no embeddings".to_string());
714 }
715 if parsed.embeddings.len() != expected_text_count {
716 return Err(format!(
717 "ollama response returned {} embeddings for {} inputs",
718 parsed.embeddings.len(),
719 expected_text_count
720 ));
721 }
722
723 let vectors = parsed.embeddings;
724 for vector in &vectors {
725 if vector.is_empty() {
726 return Err("ollama response contained empty embeddings".to_string());
727 }
728 }
729
730 self.dimension = vectors.first().map(Vec::len);
731 Ok(vectors)
732 }
733 }
734 }
735}
736
737pub fn pre_validate_onnx_runtime() -> Result<(), String> {
741 let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
742
743 #[cfg(any(target_os = "linux", target_os = "macos"))]
744 {
745 #[cfg(target_os = "linux")]
746 let default_name = "libonnxruntime.so";
747 #[cfg(target_os = "macos")]
748 let default_name = "libonnxruntime.dylib";
749
750 let lib_name = dylib_path.as_deref().unwrap_or(default_name);
751
752 unsafe {
753 let c_name = std::ffi::CString::new(lib_name)
754 .map_err(|e| format!("invalid library path: {}", e))?;
755 let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
756 if handle.is_null() {
757 let err = libc::dlerror();
758 let msg = if err.is_null() {
759 "unknown dlopen error".to_string()
760 } else {
761 std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
762 };
763 return Err(format!(
764 "ONNX Runtime not found. dlopen('{}') failed: {}. \
765 Run `npx @cortexkit/aft doctor` to diagnose.",
766 lib_name, msg
767 ));
768 }
769
770 let detected_version = detect_ort_version_from_path(lib_name);
773
774 libc::dlclose(handle);
775
776 if let Some(ref version) = detected_version {
778 let parts: Vec<&str> = version.split('.').collect();
779 if let (Some(major), Some(minor)) = (
780 parts.first().and_then(|s| s.parse::<u32>().ok()),
781 parts.get(1).and_then(|s| s.parse::<u32>().ok()),
782 ) {
783 if major != 1 || minor < 20 {
784 return Err(format_ort_version_mismatch(version, lib_name));
785 }
786 }
787 }
788 }
789 }
790
791 #[cfg(target_os = "windows")]
792 {
793 let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");
798
799 #[link(name = "kernel32")]
803 extern "system" {
804 fn LoadLibraryExW(
805 lpLibFileName: *const u16,
806 hFile: *mut std::ffi::c_void,
807 dwFlags: u32,
808 ) -> *mut std::ffi::c_void;
809 fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
810 fn GetModuleFileNameW(
811 hModule: *mut std::ffi::c_void,
812 lpFilename: *mut u16,
813 nSize: u32,
814 ) -> u32;
815 }
816
817 #[link(name = "version")]
818 extern "system" {
819 fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
820 fn GetFileVersionInfoW(
821 lptstrFilename: *const u16,
822 dwHandle: u32,
823 dwLen: u32,
824 lpData: *mut std::ffi::c_void,
825 ) -> i32;
826 fn VerQueryValueW(
827 pBlock: *mut std::ffi::c_void,
828 lpSubBlock: *const u16,
829 lplpBuffer: *mut *mut std::ffi::c_void,
830 puLen: *mut u32,
831 ) -> i32;
832 }
833
834 #[repr(C)]
835 struct VS_FIXEDFILEINFO {
836 dw_signature: u32,
837 dw_struc_version: u32,
838 dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
841 dw_product_version_ls: u32,
842 dw_file_flags_mask: u32,
843 dw_file_flags: u32,
844 dw_file_os: u32,
845 dw_file_type: u32,
846 dw_file_subtype: u32,
847 dw_file_date_ms: u32,
848 dw_file_date_ls: u32,
849 }
850
851 unsafe {
852 use std::os::windows::ffi::OsStrExt;
853 let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
854 .encode_wide()
855 .chain(std::iter::once(0))
856 .collect();
857
858 let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
859 if handle.is_null() {
860 let err = std::io::Error::last_os_error();
861 return Err(format!(
862 "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
863 Run `npx @cortexkit/aft doctor` to diagnose.",
864 lib_name, err
865 ));
866 }
867
868 let mut detected_major: u32 = 0;
871 let mut detected_minor: u32 = 0;
872 let mut path_buf = [0u16; 32767];
878 let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
879 if path_len > 0 {
880 let mut dummy_handle: u32 = 0;
881 let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
882 if info_size > 0 {
883 let mut info = vec![0u8; info_size as usize];
884 if GetFileVersionInfoW(
885 path_buf.as_ptr(),
886 0,
887 info_size,
888 info.as_mut_ptr() as *mut std::ffi::c_void,
889 ) != 0
890 {
891 let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
892 let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
893 let mut vs_len: u32 = 0;
894 if VerQueryValueW(
895 info.as_mut_ptr() as *mut std::ffi::c_void,
896 sub_block.as_ptr(),
897 &mut vs_info,
898 &mut vs_len,
899 ) != 0
900 && !vs_info.is_null()
901 {
902 let fixed = vs_info as *const VS_FIXEDFILEINFO;
903 detected_major = (*fixed).dw_file_version_ms >> 16;
904 detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
905 }
906 }
907 }
908 }
909
910 FreeLibrary(handle);
911
912 if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
916 let ver = format!("{}.{}", detected_major, detected_minor);
917 return Err(format_ort_version_mismatch(&ver, lib_name));
918 }
919 }
920 }
921
922 Ok(())
923}
924
925#[cfg(any(test, target_os = "linux", target_os = "macos"))]
928fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
929 let path = std::path::Path::new(lib_path);
930
931 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
933 .into_iter()
934 .flatten()
935 {
936 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
937 if let Some(version) = extract_version_from_filename(name) {
938 return Some(version);
939 }
940 }
941 }
942
943 if let Some(parent) = path.parent() {
945 if let Ok(entries) = std::fs::read_dir(parent) {
946 for entry in entries.flatten() {
947 if let Some(name) = entry.file_name().to_str() {
948 if name.starts_with("libonnxruntime") {
949 if let Some(version) = extract_version_from_filename(name) {
950 return Some(version);
951 }
952 }
953 }
954 }
955 }
956 }
957
958 None
959}
960
961#[cfg(any(test, target_os = "linux", target_os = "macos"))]
963fn extract_version_from_filename(name: &str) -> Option<String> {
964 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
966 re.find(name).map(|m| m.as_str().to_string())
967}
968
/// Produces the shell command suggested for removing an outdated library.
///
/// On Linux and macOS, a path under `/usr/local/lib` or a bare default
/// library name yields a `sudo rm` over every `libonnxruntime*` file there
/// (plus `ldconfig` on Linux). Everything else yields a plain `rm` of
/// `lib_path`.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
981
982pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
988 format!(
989 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
990 Solutions:\n\
991 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
992 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
993 configures the bridge to load it instead of the system library — no \
994 changes to '{}'.\n\
995 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
996 {}\n\
997 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
998 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
999 version,
1000 lib_name,
1001 lib_name,
1002 suggest_removal_command(lib_name),
1003 )
1004}
1005
1006pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
1007 pre_validate_onnx_runtime()?;
1009
1010 let selected_model = match model {
1011 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
1012 _ => {
1013 return Err(format!(
1014 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
1015 model
1016 ))
1017 }
1018 };
1019
1020 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
1021}
1022
/// Heuristically decides whether `message` reports a missing or unloadable
/// ONNX Runtime library.
///
/// A message is recognised when it starts (ignoring leading whitespace) with
/// `ONNX Runtime not found.`, or when its ASCII-lowercased text contains
/// both an ONNX Runtime marker and a library-loading failure marker.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
1048
1049fn format_embedding_init_error(error: impl Display) -> String {
1050 let message = error.to_string();
1051
1052 if is_onnx_runtime_unavailable(&message) {
1053 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1054 }
1055
1056 format!("failed to initialize semantic embedding model: {message}")
1057}
1058
1059#[derive(Debug, Clone)]
1061pub struct SemanticChunk {
1062 pub file: PathBuf,
1064 pub name: String,
1066 pub kind: SymbolKind,
1068 pub start_line: u32,
1070 pub end_line: u32,
1071 pub exported: bool,
1073 pub embed_text: String,
1075 pub snippet: String,
1077}
1078
1079#[derive(Debug, Clone)]
1081pub struct EmbeddingEntry {
1082 chunk: SemanticChunk,
1083 vector: Vec<f32>,
1084}
1085
1086#[derive(Debug, Clone)]
1088pub struct SemanticIndex {
1089 entries: Vec<EmbeddingEntry>,
1090 file_mtimes: HashMap<PathBuf, SystemTime>,
1092 file_sizes: HashMap<PathBuf, u64>,
1094 file_hashes: HashMap<PathBuf, blake3::Hash>,
1095 dimension: usize,
1097 fingerprint: Option<SemanticIndexFingerprint>,
1098 project_root: PathBuf,
1099}
1100
1101#[derive(Debug, Clone, Copy)]
1102struct IndexedFileMetadata {
1103 mtime: SystemTime,
1104 size: u64,
1105 content_hash: blake3::Hash,
1106}
1107
/// Counters describing the outcome of a refresh.
///
/// NOTE(review): the code that fills these counters is outside this chunk;
/// the field notes below follow the field names.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Presumably the number of files whose content changed.
    pub changed: usize,
    /// Presumably the number of newly seen files.
    pub added: usize,
    /// Presumably the number of files that disappeared.
    pub deleted: usize,
    /// Presumably the number of files examined; ignored by `is_noop`.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when nothing changed, was added, or was deleted.
    /// `total_processed` does not influence the result.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1124
1125#[derive(Debug, Default)]
1126pub struct InvalidatedFilesRefresh {
1127 pub added_entries: Vec<EmbeddingEntry>,
1128 pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
1129 pub completed_paths: Vec<PathBuf>,
1130 pub summary: RefreshSummary,
1131}
1132
1133#[derive(Debug, Clone)]
1135pub struct SemanticResult {
1136 pub file: PathBuf,
1137 pub name: String,
1138 pub kind: SymbolKind,
1139 pub start_line: u32,
1140 pub end_line: u32,
1141 pub exported: bool,
1142 pub snippet: String,
1143 pub score: f32,
1144 pub source: &'static str,
1145}
1146
1147impl SemanticIndex {
1148 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1149 debug_assert!(project_root.is_absolute());
1150 Self {
1151 entries: Vec::new(),
1152 file_mtimes: HashMap::new(),
1153 file_sizes: HashMap::new(),
1154 file_hashes: HashMap::new(),
1155 dimension,
1156 fingerprint: None,
1157 project_root,
1158 }
1159 }
1160
1161 pub fn entry_count(&self) -> usize {
1163 self.entries.len()
1164 }
1165
1166 pub fn status_label(&self) -> &'static str {
1168 if self.entries.is_empty() {
1169 "empty"
1170 } else {
1171 "ready"
1172 }
1173 }
1174
1175 fn collect_chunks(
1176 project_root: &Path,
1177 files: &[PathBuf],
1178 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1179 let per_file: Vec<(
1180 PathBuf,
1181 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1182 )> = files
1183 .par_iter()
1184 .map_init(HashMap::new, |parsers, file| {
1185 let result = collect_file_metadata(file).and_then(|metadata| {
1186 collect_file_chunks(project_root, file, parsers)
1187 .map(|chunks| (metadata, chunks))
1188 });
1189 (file.clone(), result)
1190 })
1191 .collect();
1192
1193 let mut chunks: Vec<SemanticChunk> = Vec::new();
1194 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1195
1196 for (file, result) in per_file {
1197 match result {
1198 Ok((metadata, file_chunks)) => {
1199 file_metadata.insert(file, metadata);
1200 chunks.extend(file_chunks);
1201 }
1202 Err(error) => {
1203 if error == "unsupported file extension" {
1209 continue;
1210 }
1211 slog_warn!(
1212 "failed to collect semantic chunks for {}: {}",
1213 file.display(),
1214 error
1215 );
1216 }
1217 }
1218 }
1219
1220 (chunks, file_metadata)
1221 }
1222
1223 fn build_from_chunks<F, P>(
1224 project_root: &Path,
1225 chunks: Vec<SemanticChunk>,
1226 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1227 embed_fn: &mut F,
1228 max_batch_size: usize,
1229 mut progress: Option<&mut P>,
1230 ) -> Result<Self, String>
1231 where
1232 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1233 P: FnMut(usize, usize),
1234 {
1235 debug_assert!(project_root.is_absolute());
1236 let total_chunks = chunks.len();
1237
1238 if chunks.is_empty() {
1239 return Ok(Self {
1240 entries: Vec::new(),
1241 file_mtimes: file_metadata
1242 .iter()
1243 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1244 .collect(),
1245 file_sizes: file_metadata
1246 .iter()
1247 .map(|(path, metadata)| (path.clone(), metadata.size))
1248 .collect(),
1249 file_hashes: file_metadata
1250 .into_iter()
1251 .map(|(path, metadata)| (path, metadata.content_hash))
1252 .collect(),
1253 dimension: DEFAULT_DIMENSION,
1254 fingerprint: None,
1255 project_root: project_root.to_path_buf(),
1256 });
1257 }
1258
1259 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1261 let mut expected_dimension: Option<usize> = None;
1262 let batch_size = max_batch_size.max(1);
1263 for batch_start in (0..chunks.len()).step_by(batch_size) {
1264 let batch_end = (batch_start + batch_size).min(chunks.len());
1265 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1266 .iter()
1267 .map(|c| c.embed_text.clone())
1268 .collect();
1269
1270 let vectors = embed_fn(batch_texts)?;
1271 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1272
1273 if let Some(dim) = vectors.first().map(|v| v.len()) {
1275 match expected_dimension {
1276 None => expected_dimension = Some(dim),
1277 Some(expected) if dim != expected => {
1278 return Err(format!(
1279 "embedding dimension changed across batches: expected {expected}, got {dim}"
1280 ));
1281 }
1282 _ => {}
1283 }
1284 }
1285
1286 for (i, vector) in vectors.into_iter().enumerate() {
1287 let chunk_idx = batch_start + i;
1288 entries.push(EmbeddingEntry {
1289 chunk: chunks[chunk_idx].clone(),
1290 vector,
1291 });
1292 }
1293
1294 if let Some(callback) = progress.as_mut() {
1295 callback(entries.len(), total_chunks);
1296 }
1297 }
1298
1299 let dimension = entries
1300 .first()
1301 .map(|e| e.vector.len())
1302 .unwrap_or(DEFAULT_DIMENSION);
1303
1304 Ok(Self {
1305 entries,
1306 file_mtimes: file_metadata
1307 .iter()
1308 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1309 .collect(),
1310 file_sizes: file_metadata
1311 .iter()
1312 .map(|(path, metadata)| (path.clone(), metadata.size))
1313 .collect(),
1314 file_hashes: file_metadata
1315 .into_iter()
1316 .map(|(path, metadata)| (path, metadata.content_hash))
1317 .collect(),
1318 dimension,
1319 fingerprint: None,
1320 project_root: project_root.to_path_buf(),
1321 })
1322 }
1323
1324 pub fn build<F>(
1327 project_root: &Path,
1328 files: &[PathBuf],
1329 embed_fn: &mut F,
1330 max_batch_size: usize,
1331 ) -> Result<Self, String>
1332 where
1333 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1334 {
1335 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1336 Self::build_from_chunks(
1337 project_root,
1338 chunks,
1339 file_mtimes,
1340 embed_fn,
1341 max_batch_size,
1342 Option::<&mut fn(usize, usize)>::None,
1343 )
1344 }
1345
1346 pub fn build_with_progress<F, P>(
1348 project_root: &Path,
1349 files: &[PathBuf],
1350 embed_fn: &mut F,
1351 max_batch_size: usize,
1352 progress: &mut P,
1353 ) -> Result<Self, String>
1354 where
1355 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1356 P: FnMut(usize, usize),
1357 {
1358 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1359 let total_chunks = chunks.len();
1360 progress(0, total_chunks);
1361 Self::build_from_chunks(
1362 project_root,
1363 chunks,
1364 file_mtimes,
1365 embed_fn,
1366 max_batch_size,
1367 Some(progress),
1368 )
1369 }
1370
    /// Incrementally brings the index in line with `current_files`.
    ///
    /// Tracked files absent from `current_files` are dropped, tracked files whose cached
    /// mtime/size/hash no longer verify are re-chunked and re-embedded, and files in
    /// `current_files` that are not tracked yet are embedded as additions.
    ///
    /// `progress` receives `(done, total)` chunk counts; it is called with `(0, 0)` when
    /// nothing needs embedding. `max_batch_size` bounds the texts per `embed_fn` call and a
    /// value of `0` is treated as `1`.
    ///
    /// The returned summary counts only changed/added files whose chunk collection
    /// succeeded. `total_processed` is the size of the union of `current_files` and the
    /// previously tracked files.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and fails when new
    /// vectors do not match the dimension of the existing entries. Removals of deleted and
    /// vanished files happen before embedding and are not undone on error.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        // Make sure every tracked file has a size (and, where possible, a hash) before the
        // freshness comparison below.
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // |current ∪ tracked| = |current| + |tracked| - |current ∩ tracked|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every tracked file as deleted, changed, or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // Freshness can only be verified when mtime, size and hash are all cached.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached
                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
            {
                Some(FreshnessVerdict::HotFresh) => {}
                // Content is unchanged: only record the new mtime/size, no re-embedding.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                // Incomplete cached metadata is treated like a stale file.
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files present now but not tracked yet.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Nothing to do at all.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        // Only deletions happened.
        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        // A changed file that produced no metadata and no longer exists on disk vanished
        // during the refresh; treat it as deleted.
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        // Files were processed but yielded no chunks: drop their old entries and record
        // their fresh metadata without calling the embedder.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // With existing entries the new vectors must match the current dimension; an empty
        // index adopts the dimension of the first new vector.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                            cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Old entries are replaced only after every batch embedded successfully, and only
        // for files whose chunk collection succeeded.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1604
    /// Re-indexes exactly the given `paths`, regardless of their cached freshness.
    ///
    /// The paths are sorted and de-duplicated, then all of them are removed from the index.
    /// Those that still exist on disk are re-chunked and re-embedded. The result carries
    /// the new entries, the fresh per-file metadata and the full list of requested paths
    /// (`completed_paths`), alongside a summary.
    ///
    /// In the summary, `deleted` counts requested paths that no longer exist and were
    /// tracked before the call, `changed` counts successfully processed files that were
    /// tracked before, and `added` counts the remaining successfully processed files.
    ///
    /// `progress` receives `(done, total)` chunk counts; `(0, 0)` means nothing needs
    /// embedding. A `max_batch_size` of `0` is treated as `1`.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and fails on an
    /// embedding dimension mismatch. The requested paths have already been removed from the
    /// index when such an error is returned.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Normalize the request: each path is handled once.
        let mut requested_paths = paths.to_vec();
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Remember which paths were tracked before they are removed below; this decides
        // whether a path counts as changed, added or deleted.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        // Every requested path is gone from disk: nothing to embed.
        if existing_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        // Files were processed but yielded no chunks: record their metadata only.
        if chunks.is_empty() {
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // The current dimension is enforced unless the index is empty and none of the
        // requested paths had been tracked; in that case the first new vector sets it.
        let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|chunk| chunk.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during invalidated-file refresh: \
                            cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Keep a copy of the new entries for the returned value; the originals go into
        // this index.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
1786
1787 pub fn apply_refresh_update(
1788 &mut self,
1789 added_entries: Vec<EmbeddingEntry>,
1790 updated_metadata: Vec<(PathBuf, FileFreshness)>,
1791 completed_paths: &[PathBuf],
1792 ) {
1793 self.remove_indexed_files(completed_paths);
1794
1795 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1796 self.entries.extend(added_entries);
1797 for (file, freshness) in updated_metadata {
1798 self.file_mtimes.insert(file.clone(), freshness.mtime);
1799 self.file_sizes.insert(file.clone(), freshness.size);
1800 self.file_hashes.insert(file, freshness.content_hash);
1801 }
1802 if let Some(dim) = observed_dimension {
1803 self.dimension = dim;
1804 }
1805 }
1806
1807 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1808 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1809 self.entries
1810 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1811 for path in files {
1812 self.file_mtimes.remove(path);
1813 self.file_sizes.remove(path);
1814 self.file_hashes.remove(path);
1815 }
1816 }
1817
1818 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1820 if self.entries.is_empty() || query_vector.len() != self.dimension {
1821 return Vec::new();
1822 }
1823
1824 let mut scored: Vec<(f32, usize)> = self
1825 .entries
1826 .iter()
1827 .enumerate()
1828 .map(|(i, entry)| {
1829 let mut score = cosine_similarity(query_vector, &entry.vector);
1830 if entry.chunk.exported {
1831 score *= 1.1;
1832 }
1833 (score, i)
1834 })
1835 .collect();
1836
1837 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1839
1840 scored
1841 .into_iter()
1842 .take(top_k)
1843 .map(|(score, idx)| {
1847 let entry = &self.entries[idx];
1848 SemanticResult {
1849 file: entry.chunk.file.clone(),
1850 name: entry.chunk.name.clone(),
1851 kind: entry.chunk.kind.clone(),
1852 start_line: entry.chunk.start_line,
1853 end_line: entry.chunk.end_line,
1854 exported: entry.chunk.exported,
1855 snippet: entry.chunk.snippet.clone(),
1856 score,
1857 source: "semantic",
1858 }
1859 })
1860 .collect()
1861 }
1862
1863 pub fn len(&self) -> usize {
1865 self.entries.len()
1866 }
1867
1868 pub fn is_file_stale(&self, file: &Path) -> bool {
1870 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1871 return true;
1872 };
1873 let Some(stored_size) = self.file_sizes.get(file) else {
1874 return true;
1875 };
1876 let Some(stored_hash) = self.file_hashes.get(file) else {
1877 return true;
1878 };
1879 let cached = FileFreshness {
1880 mtime: *stored_mtime,
1881 size: *stored_size,
1882 content_hash: *stored_hash,
1883 };
1884 match cache_freshness::verify_file_strict(file, &cached) {
1885 FreshnessVerdict::HotFresh => false,
1886 FreshnessVerdict::ContentFresh { .. } => false,
1887 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1888 }
1889 }
1890
1891 fn backfill_missing_file_sizes(&mut self) {
1892 for path in self.file_mtimes.keys() {
1893 if self.file_sizes.contains_key(path) {
1894 continue;
1895 }
1896 if let Ok(metadata) = fs::metadata(path) {
1897 self.file_sizes.insert(path.clone(), metadata.len());
1898 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1899 self.file_hashes.insert(path.clone(), hash);
1900 }
1901 }
1902 }
1903 }
1904
1905 pub fn remove_file(&mut self, file: &Path) {
1907 self.invalidate_file(file);
1908 }
1909
1910 pub fn invalidate_file(&mut self, file: &Path) {
1911 let canonical_file = canonicalize_existing_or_deleted_path(file);
1912 self.entries
1913 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
1914 self.file_mtimes.remove(file);
1915 self.file_sizes.remove(file);
1916 self.file_hashes.remove(file);
1917 if canonical_file.as_path() != file {
1918 self.file_mtimes.remove(&canonical_file);
1919 self.file_sizes.remove(&canonical_file);
1920 self.file_hashes.remove(&canonical_file);
1921 }
1922 }
1923
1924 pub fn dimension(&self) -> usize {
1926 self.dimension
1927 }
1928
1929 pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
1930 self.fingerprint.as_ref()
1931 }
1932
1933 pub fn backend_label(&self) -> Option<&str> {
1934 self.fingerprint.as_ref().map(|f| f.backend.as_str())
1935 }
1936
1937 pub fn model_label(&self) -> Option<&str> {
1938 self.fingerprint.as_ref().map(|f| f.model.as_str())
1939 }
1940
1941 pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
1942 self.fingerprint = Some(fingerprint);
1943 }
1944
1945 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1947 if self.entries.is_empty() {
1950 slog_info!("skipping semantic index persistence (0 entries)");
1951 return;
1952 }
1953 let dir = storage_dir.join("semantic").join(project_key);
1954 if let Err(e) = fs::create_dir_all(&dir) {
1955 slog_warn!("failed to create semantic cache dir: {}", e);
1956 return;
1957 }
1958 let data_path = dir.join("semantic.bin");
1959 let tmp_path = dir.join(format!(
1960 "semantic.bin.tmp.{}.{}",
1961 std::process::id(),
1962 SystemTime::now()
1963 .duration_since(SystemTime::UNIX_EPOCH)
1964 .unwrap_or(Duration::ZERO)
1965 .as_nanos()
1966 ));
1967 let bytes = self.to_bytes();
1968 let write_result = (|| -> std::io::Result<()> {
1969 use std::io::Write;
1970 let mut file = fs::File::create(&tmp_path)?;
1971 file.write_all(&bytes)?;
1972 file.sync_all()?;
1973 Ok(())
1974 })();
1975 if let Err(e) = write_result {
1976 slog_warn!("failed to write semantic index: {}", e);
1977 let _ = fs::remove_file(&tmp_path);
1978 return;
1979 }
1980 if let Err(e) = fs::rename(&tmp_path, &data_path) {
1981 slog_warn!("failed to rename semantic index: {}", e);
1982 let _ = fs::remove_file(&tmp_path);
1983 return;
1984 }
1985 slog_info!(
1986 "semantic index persisted: {} entries, {:.1} KB",
1987 self.entries.len(),
1988 bytes.len() as f64 / 1024.0
1989 );
1990 }
1991
1992 pub fn read_from_disk(
1994 storage_dir: &Path,
1995 project_key: &str,
1996 current_canonical_root: &Path,
1997 is_worktree_bridge: bool,
1998 expected_fingerprint: Option<&str>,
1999 ) -> Option<Self> {
2000 debug_assert!(current_canonical_root.is_absolute());
2001 let data_path = storage_dir
2002 .join("semantic")
2003 .join(project_key)
2004 .join("semantic.bin");
2005 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2006 if file_len < HEADER_BYTES_V1 {
2007 slog_warn!(
2008 "corrupt semantic index (too small: {} bytes), removing",
2009 file_len
2010 );
2011 if !is_worktree_bridge {
2012 let _ = fs::remove_file(&data_path);
2013 }
2014 return None;
2015 }
2016
2017 let bytes = fs::read(&data_path).ok()?;
2018 let version = bytes[0];
2019 if version != SEMANTIC_INDEX_VERSION_V6 {
2020 slog_info!(
2021 "cached semantic index version {} is older than {}, rebuilding",
2022 version,
2023 SEMANTIC_INDEX_VERSION_V6
2024 );
2025 if !is_worktree_bridge {
2026 let _ = fs::remove_file(&data_path);
2027 }
2028 return None;
2029 }
2030 match Self::from_bytes(&bytes, current_canonical_root) {
2031 Ok(index) => {
2032 if index.entries.is_empty() {
2033 slog_info!("cached semantic index is empty, will rebuild");
2034 if !is_worktree_bridge {
2035 let _ = fs::remove_file(&data_path);
2036 }
2037 return None;
2038 }
2039 if let Some(expected) = expected_fingerprint {
2040 let matches = index
2041 .fingerprint()
2042 .map(|fingerprint| fingerprint.matches_expected(expected))
2043 .unwrap_or(false);
2044 if !matches {
2045 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2046 if !is_worktree_bridge {
2047 let _ = fs::remove_file(&data_path);
2048 }
2049 return None;
2050 }
2051 }
2052 slog_info!(
2053 "loaded semantic index from disk: {} entries",
2054 index.entries.len()
2055 );
2056 Some(index)
2057 }
2058 Err(e) => {
2059 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2060 if !is_worktree_bridge {
2061 let _ = fs::remove_file(&data_path);
2062 }
2063 None
2064 }
2065 }
2066 }
2067
    /// Serializes the index into the version-6 binary cache format.
    ///
    /// All integers are little-endian. Layout, in order:
    ///
    /// 1. version byte, dimension (`u32`), entry count (`u32`);
    /// 2. fingerprint length (`u32`) followed by the fingerprint string bytes (length `0`
    ///    when there is no fingerprint or it encodes to an empty string);
    /// 3. file-record count (`u32`), then per file: path length (`u32`) and path bytes,
    ///    mtime seconds (`u64`) and sub-second nanos (`u32`), size (`u64`), hash bytes;
    /// 4. per entry: file path, name, kind (`u8`), start line (`u32`), end line (`u32`),
    ///    exported flag (`u8`), snippet, embed text (strings are `u32` length + bytes),
    ///    then each vector component as `f32`.
    ///
    /// Paths are written as returned by `cache_relative_path`; files and entries for which
    /// it returns `None` are omitted, and the written counts reflect that.
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut buf = Vec::new();
        // An empty fingerprint string is stored the same way as a missing fingerprint.
        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });
        // Keep the original path next to the relative one: it is the key into
        // `file_sizes` / `file_hashes` below.
        let file_mtimes: Vec<_> = self
            .file_mtimes
            .iter()
            .filter_map(|(path, mtime)| {
                cache_relative_path(&self.project_root, path)
                    .map(|relative| (relative, path, mtime))
            })
            .collect();
        let entries: Vec<_> = self
            .entries
            .iter()
            .filter_map(|entry| {
                cache_relative_path(&self.project_root, &entry.chunk.file)
                    .map(|relative| (relative, entry))
            })
            .collect();

        // Header. NOTE(review): the `as u32` casts here and below truncate values above
        // `u32::MAX` — confirm callers keep counts and lengths within range.
        let version = SEMANTIC_INDEX_VERSION_V6;
        buf.push(version);
        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // Per-file freshness records.
        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
        for (relative, path, mtime) in &file_mtimes {
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            // An mtime before the Unix epoch is written as a zero duration.
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            // A missing size is written as 0, a missing hash as `zero_hash()`.
            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = self
                .file_hashes
                .get(*path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // Entries: chunk fields followed by the raw vector components.
        for (relative, entry) in &entries {
            let c = &entry.chunk;

            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));

            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            // No per-vector length is written; every component of the vector is emitted.
            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }
2178
    /// Decodes an index from its binary cache representation.
    ///
    /// Format versions 1 through 6 are accepted. Relative to version 1, version 2 adds the
    /// fingerprint field, version 3 adds sub-second mtime nanos, version 5 adds file sizes,
    /// and version 6 adds content hashes and resolves stored paths through
    /// `cached_path_under_root` against `current_canonical_root`. Fields absent from older
    /// versions default to `0` (nanos, size) or `cache_freshness::zero_hash()`.
    ///
    /// `current_canonical_root` must be absolute (asserted in debug builds) and becomes the
    /// `project_root` of the returned index.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message when the data is too short or truncated, the version
    /// is unsupported, the dimension fails `validate_embedding_dimension`, a count exceeds
    /// `MAX_ENTRIES`, the fingerprint is not valid JSON for `SemanticIndexFingerprint`, an
    /// mtime is invalid, a version-6 path is rejected by `cached_path_under_root`, or an
    /// entry refers to a file without a metadata record.
    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        // Versions 2 and later require the larger `HEADER_BYTES_V2` minimum.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        validate_embedding_dimension(dimension)?;
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            // A zero-length fingerprint means none was stored.
            if fingerprint_len == 0 {
                None
            } else {
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // Reject headers whose declared vector payload cannot fit in the remaining data
        // before anything is allocated based on the declared counts.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        // Per-file freshness records.
        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            // Sub-second nanos are stored from version 3 onwards.
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            // File sizes are stored from version 5 onwards.
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64(data, &mut pos)?
                } else {
                    0
                };
            // The 32-byte content hash is stored in version 6 only.
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                if pos + 32 > data.len() {
                    return Err("unexpected end of data reading content hash".to_string());
                }
                let mut hash_bytes = [0u8; 32];
                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
                pos += 32;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            // Validate the nanos before building a `Duration` from them.
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // Version-6 paths go through `cached_path_under_root`; older versions use the
            // stored path as-is.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        // Entries: chunk fields followed by `dimension` little-endian `f32` components.
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &raw_file)
                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
            } else {
                raw_file
            };
            let name = read_string(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            // Any non-zero byte counts as exported.
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        // Consistency checks: the decoded entry count must match the header, and every
        // entry's file must have a freshness record.
        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
        })
    }
2416}
2417
2418fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2420 let relative = file
2421 .strip_prefix(project_root)
2422 .unwrap_or(file)
2423 .to_string_lossy();
2424
2425 let kind_label = match &symbol.kind {
2426 SymbolKind::Function => "function",
2427 SymbolKind::Class => "class",
2428 SymbolKind::Method => "method",
2429 SymbolKind::Struct => "struct",
2430 SymbolKind::Interface => "interface",
2431 SymbolKind::Enum => "enum",
2432 SymbolKind::TypeAlias => "type",
2433 SymbolKind::Variable => "variable",
2434 SymbolKind::Heading => "heading",
2435 SymbolKind::FileSummary => "file-summary",
2436 };
2437
2438 let name = &symbol.name;
2440 let mut text = format!(
2441 "name:{name} file:{} kind:{} name:{name}",
2442 relative, kind_label
2443 );
2444
2445 if let Some(sig) = &symbol.signature {
2446 text.push_str(&format!(" signature:{}", sig));
2447 }
2448
2449 let lines: Vec<&str> = source.lines().collect();
2451 let start = (symbol.range.start_line as usize).min(lines.len());
2452 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2454 if start < end {
2455 let body: String = lines[start..end]
2456 .iter()
2457 .take(15) .copied()
2459 .collect::<Vec<&str>>()
2460 .join("\n");
2461 let snippet = if body.len() > 300 {
2462 format!("{}...", &body[..body.floor_char_boundary(300)])
2463 } else {
2464 body
2465 };
2466 text.push_str(&format!(" body:{}", snippet));
2467 }
2468
2469 text
2470}
2471
/// Returns the first `max_chars` characters (Unicode scalar values) of
/// `value` as an owned string, or all of `value` when it is shorter.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the first character past the limit is where to cut.
    let cut = value
        .char_indices()
        .nth(max_chars)
        .map_or(value.len(), |(offset, _)| offset);
    value[..cut].to_owned()
}
2475
2476fn first_leading_doc_comment(source: &str) -> String {
2477 let lines: Vec<&str> = source.lines().collect();
2478 let Some((start, first)) = lines
2479 .iter()
2480 .enumerate()
2481 .find(|(_, line)| !line.trim().is_empty())
2482 else {
2483 return String::new();
2484 };
2485
2486 let trimmed = first.trim_start();
2487 if trimmed.starts_with("/**") {
2488 let mut comment = Vec::new();
2489 for line in lines.iter().skip(start) {
2490 comment.push(*line);
2491 if line.contains("*/") {
2492 break;
2493 }
2494 }
2495 return truncate_chars(&comment.join("\n"), 200);
2496 }
2497
2498 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2499 let comment = lines
2500 .iter()
2501 .skip(start)
2502 .take_while(|line| {
2503 let trimmed = line.trim_start();
2504 trimmed.starts_with("///") || trimmed.starts_with("//!")
2505 })
2506 .copied()
2507 .collect::<Vec<_>>()
2508 .join("\n");
2509 return truncate_chars(&comment, 200);
2510 }
2511
2512 String::new()
2513}
2514
2515pub fn build_file_summary_chunk(
2516 file: &Path,
2517 project_root: &Path,
2518 source: &str,
2519 top_exports: &[&str],
2520 top_export_signatures: &[Option<&str>],
2521) -> SemanticChunk {
2522 let relative = file.strip_prefix(project_root).unwrap_or(file);
2523 let rel_path = relative.to_string_lossy();
2524 let parent_dir = relative
2525 .parent()
2526 .map(|parent| parent.to_string_lossy().to_string())
2527 .unwrap_or_default();
2528 let name = file
2529 .file_stem()
2530 .map(|stem| stem.to_string_lossy().to_string())
2531 .unwrap_or_default();
2532 let doc = first_leading_doc_comment(source);
2533 let exports = top_exports
2534 .iter()
2535 .take(5)
2536 .copied()
2537 .collect::<Vec<_>>()
2538 .join(",");
2539 let snippet = if doc.is_empty() {
2540 top_export_signatures
2541 .first()
2542 .and_then(|signature| signature.as_deref())
2543 .map(|signature| truncate_chars(signature, 200))
2544 .unwrap_or_default()
2545 } else {
2546 doc.clone()
2547 };
2548
2549 SemanticChunk {
2550 file: file.to_path_buf(),
2551 name,
2552 kind: SymbolKind::FileSummary,
2553 start_line: 0,
2554 end_line: 0,
2555 exported: false,
2556 embed_text: format!(
2557 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2558 file.file_stem()
2559 .map(|stem| stem.to_string_lossy().to_string())
2560 .unwrap_or_default()
2561 ),
2562 snippet,
2563 }
2564}
2565
2566fn parser_for(
2567 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2568 lang: crate::parser::LangId,
2569) -> Result<&mut Parser, String> {
2570 use std::collections::hash_map::Entry;
2571
2572 match parsers.entry(lang) {
2573 Entry::Occupied(entry) => Ok(entry.into_mut()),
2574 Entry::Vacant(entry) => {
2575 let grammar = grammar_for(lang);
2576 let mut parser = Parser::new();
2577 parser
2578 .set_language(&grammar)
2579 .map_err(|error| error.to_string())?;
2580 Ok(entry.insert(parser))
2581 }
2582 }
2583}
2584
/// Reports whether `path` has one of the file extensions listed below.
///
/// The comparison is exact and case-sensitive; a path with no extension, or
/// with an extension that is not valid UTF-8, is rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2612
2613fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2614 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2615 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2616 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2617 .map_err(|error| error.to_string())?
2618 .unwrap_or_else(cache_freshness::zero_hash);
2619 Ok(IndexedFileMetadata {
2620 mtime,
2621 size: metadata.len(),
2622 content_hash,
2623 })
2624}
2625
/// Canonicalizes `path`, falling back gracefully when the file itself no
/// longer exists.
///
/// Resolution order:
/// 1. If `path` can be canonicalized, that result is returned.
/// 2. Otherwise the parent directory is canonicalized and the final path
///    component is appended to it, which covers a file that was deleted from
///    a directory that still exists.
/// 3. If `path` has no parent or no final component, or the parent cannot be
///    canonicalized either, `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }

    let (Some(parent), Some(file_name)) = (path.parent(), path.file_name()) else {
        return path.to_path_buf();
    };

    // A bare relative name such as `gone.rs` reports the empty path as its
    // parent, and canonicalizing the empty path always fails. It denotes the
    // current directory, so resolve that instead; otherwise such a deleted
    // file would never be canonicalized.
    let parent = if parent.as_os_str().is_empty() {
        Path::new(".")
    } else {
        parent
    };

    fs::canonicalize(parent)
        .map(|canonical_parent| canonical_parent.join(file_name))
        .unwrap_or_else(|_| path.to_path_buf())
}
2642
2643fn collect_file_chunks(
2644 project_root: &Path,
2645 file: &Path,
2646 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2647) -> Result<Vec<SemanticChunk>, String> {
2648 if !is_semantic_indexed_extension(file) {
2649 return Err("unsupported file extension".to_string());
2650 }
2651 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2652 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2653 let tree = parser_for(parsers, lang)?
2654 .parse(&source, None)
2655 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2656 let symbols =
2657 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2658
2659 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2660}
2661
2662fn build_snippet(symbol: &Symbol, source: &str) -> String {
2664 let lines: Vec<&str> = source.lines().collect();
2665 let start = (symbol.range.start_line as usize).min(lines.len());
2666 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2668 if start < end {
2669 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2670 let mut snippet = snippet_lines.join("\n");
2671 if end - start > 5 {
2672 snippet.push_str("\n ...");
2673 }
2674 if snippet.len() > 300 {
2675 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2676 }
2677 snippet
2678 } else {
2679 String::new()
2680 }
2681}
2682
2683fn symbols_to_chunks(
2685 file: &Path,
2686 symbols: &[Symbol],
2687 source: &str,
2688 project_root: &Path,
2689) -> Vec<SemanticChunk> {
2690 let mut chunks = Vec::new();
2691 let top_exports_with_signatures = symbols
2692 .iter()
2693 .filter(|symbol| {
2694 symbol.exported
2695 && symbol.parent.is_none()
2696 && !matches!(symbol.kind, SymbolKind::Heading)
2697 })
2698 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2699 .collect::<Vec<_>>();
2700
2701 let has_only_headings = !symbols.is_empty()
2702 && symbols
2703 .iter()
2704 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2705 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2706 let top_exports = top_exports_with_signatures
2707 .iter()
2708 .map(|(name, _)| *name)
2709 .collect::<Vec<_>>();
2710 let top_export_signatures = top_exports_with_signatures
2711 .iter()
2712 .map(|(_, signature)| *signature)
2713 .collect::<Vec<_>>();
2714 chunks.push(build_file_summary_chunk(
2715 file,
2716 project_root,
2717 source,
2718 &top_exports,
2719 &top_export_signatures,
2720 ));
2721 }
2722
2723 for symbol in symbols {
2724 if matches!(symbol.kind, SymbolKind::Heading) {
2729 continue;
2730 }
2731
2732 let line_count = symbol
2734 .range
2735 .end_line
2736 .saturating_sub(symbol.range.start_line)
2737 + 1;
2738 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2739 continue;
2740 }
2741
2742 let embed_text = build_embed_text(symbol, source, file, project_root);
2743 let snippet = build_snippet(symbol, source);
2744
2745 chunks.push(SemanticChunk {
2746 file: file.to_path_buf(),
2747 name: symbol.name.clone(),
2748 kind: symbol.kind.clone(),
2749 start_line: symbol.range.start_line,
2750 end_line: symbol.range.end_line,
2751 exported: symbol.exported,
2752 embed_text,
2753 snippet,
2754 });
2755
2756 }
2759
2760 chunks
2761}
2762
/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero magnitude (which includes the empty case).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulates the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}
2786
/// Maps a [`SymbolKind`] to its one-byte tag (0 through 9).
///
/// `u8_to_symbol_kind` maps each of these tags back to the same variant.
// NOTE(review): presumably this is the tag written for each entry when the
// index is serialized — confirm against the serializer, which is not in view.
fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
    // There is deliberately no wildcard arm: every variant is listed, so a
    // newly added variant must be given a tag before this compiles again.
    match kind {
        SymbolKind::Function => 0,
        SymbolKind::Class => 1,
        SymbolKind::Method => 2,
        SymbolKind::Struct => 3,
        SymbolKind::Interface => 4,
        SymbolKind::Enum => 5,
        SymbolKind::TypeAlias => 6,
        SymbolKind::Variable => 7,
        SymbolKind::Heading => 8,
        SymbolKind::FileSummary => 9,
    }
}
2802
/// Decodes a one-byte kind tag into a [`SymbolKind`].
///
/// Tags 0 through 9 map to distinct variants. Every other value falls back
/// to `SymbolKind::Heading` rather than producing an error.
// NOTE(review): because of the fallback, an unknown or corrupt tag cannot be
// told apart from a genuine heading (tag 8) — confirm that this is intended.
fn u8_to_symbol_kind(v: u8) -> SymbolKind {
    match v {
        0 => SymbolKind::Function,
        1 => SymbolKind::Class,
        2 => SymbolKind::Method,
        3 => SymbolKind::Struct,
        4 => SymbolKind::Interface,
        5 => SymbolKind::Enum,
        6 => SymbolKind::TypeAlias,
        7 => SymbolKind::Variable,
        8 => SymbolKind::Heading,
        9 => SymbolKind::FileSummary,
        // Unknown tags share the heading variant with tag 8.
        _ => SymbolKind::Heading,
    }
}
2818
/// Returns the next `len` bytes of `data` starting at `*pos` and advances
/// `*pos` past them.
///
/// The end offset is computed with `checked_add`, so an offset or length
/// that would overflow `usize` is reported as truncated input instead of
/// panicking (debug builds) or wrapping around (release builds). On error
/// `*pos` is left untouched. `what` names the value being read and only
/// appears in the error message.
fn take_cache_bytes<'a>(
    data: &'a [u8],
    pos: &mut usize,
    len: usize,
    what: &str,
) -> Result<&'a [u8], String> {
    let end = (*pos)
        .checked_add(len)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| format!("unexpected end of data reading {what}"))?;
    let bytes = &data[*pos..end];
    *pos = end;
    Ok(bytes)
}

/// Reads a little-endian `u32` at `*pos` and advances `*pos` by four bytes.
///
/// # Errors
/// Returns `"unexpected end of data reading u32"` when fewer than four bytes
/// remain; `*pos` is not moved in that case.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let mut raw = [0u8; 4];
    raw.copy_from_slice(take_cache_bytes(data, pos, 4, "u32")?);
    Ok(u32::from_le_bytes(raw))
}

/// Reads a little-endian `u64` at `*pos` and advances `*pos` by eight bytes.
///
/// # Errors
/// Returns `"unexpected end of data reading u64"` when fewer than eight
/// bytes remain; `*pos` is not moved in that case.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(take_cache_bytes(data, pos, 8, "u64")?);
    Ok(u64::from_le_bytes(raw))
}

/// Reads a length-prefixed string: a little-endian `u32` byte count followed
/// by that many bytes, decoded as UTF-8 with invalid sequences replaced by
/// U+FFFD.
///
/// # Errors
/// Fails when the length prefix or the announced payload is truncated. If
/// only the payload is truncated, `*pos` has already moved past the prefix.
fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
    let len = read_u32(data, pos)? as usize;
    let bytes = take_cache_bytes(data, pos, len, "string")?;
    Ok(String::from_utf8_lossy(bytes).into_owned())
}
2846
2847#[cfg(test)]
2848mod tests {
2849 use super::*;
2850 use crate::config::{SemanticBackend, SemanticBackendConfig};
2851 use crate::parser::FileParser;
2852 use std::io::{Read, Write};
2853 use std::net::TcpListener;
2854 use std::thread;
2855
/// Starts a one-shot HTTP server on an ephemeral `127.0.0.1` port.
///
/// The spawned thread accepts a single connection, reads the request head
/// plus as many body bytes as the `Content-Length` header announces, and
/// calls `handler(request_line, path, body)`. The string the handler returns
/// is sent back as the body of a `200 OK` JSON response.
///
/// Returns the base URL (`http://<addr>`) and the thread's join handle;
/// joining the handle surfaces any panic raised by the server or the handler.
fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
where
    F: Fn(String, String, String) -> String + Send + 'static,
{
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept request");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read request");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    let headers = String::from_utf8_lossy(&buf[..pos + 4]);
                    for line in headers.lines() {
                        // HTTP field names are case-insensitive, and clients
                        // may send `content-length` in lowercase. Matching
                        // only `Content-Length` left the length at 0 and
                        // handed the handler an empty body.
                        if let Some((field, value)) = line.split_once(':') {
                            if field.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            // Stop once the head and the announced body have fully arrived.
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }

        let end = header_end.expect("header terminator");
        let request = String::from_utf8_lossy(&buf[..end]).to_string();
        let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
        let mut lines = request.lines();
        let request_line = lines.next().expect("request line").to_string();
        // The request target is the second whitespace-separated token.
        let path = request_line
            .split_whitespace()
            .nth(1)
            .expect("request path")
            .to_string();
        let response_body = handler(request_line, path, body);
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            response_body.len(),
            response_body
        );
        stream
            .write_all(response.as_bytes())
            .expect("write response");
    });

    (format!("http://{}", addr), handle)
}
2915
/// Deterministic embedding stub: returns the vector `[1.0, 0.0, 0.0]` once
/// for every input text and never fails.
fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    Ok(vec![vec![1.0, 0.0, 0.0]; texts.len()])
}
2919
/// Writes a tiny Rust source file to `path` that defines one public function
/// named `function_name` returning `true`. Panics if the write fails.
fn write_rust_file(path: &Path, function_name: &str) {
    let source = format!("pub fn {function_name}() -> bool {{\n true\n}}\n");
    fs::write(path, source).unwrap();
}
2927
/// Builds a `SemanticIndex` over `files` using the constant-vector embedding
/// stub `test_vector_for_texts` and `8` as the last argument of
/// `SemanticIndex::build`. Panics if the build fails.
// NOTE(review): the `8` is presumably the embedding batch size — confirm
// against the signature of `SemanticIndex::build`.
fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
    let mut embed = test_vector_for_texts;
    SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
}
2932
/// Returns the current working directory, used as the project root by tests
/// that do not create a temporary project. Panics if it cannot be read.
fn test_project_root() -> PathBuf {
    std::env::current_dir().unwrap()
}
2936
2937 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2938 index.file_mtimes.insert(file.to_path_buf(), mtime);
2939 index.file_sizes.insert(file.to_path_buf(), size);
2940 index
2941 .file_hashes
2942 .insert(file.to_path_buf(), cache_freshness::zero_hash());
2943 }
2944
/// An index whose only file lies outside the project root must come back
/// empty after a `to_bytes` / `from_bytes` round trip: neither the entry nor
/// its mtime record may be loaded.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let project = fs::canonicalize(dir.path()).expect("canonical project");
    // `<project>/../outside.rs` names a sibling of the project directory.
    let outside = project.join("..").join("outside.rs");
    let mut index = SemanticIndex::new(project.clone(), 3);
    // Register metadata and one entry for the outside path.
    index
        .file_mtimes
        .insert(outside.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(outside.clone(), 1);
    index
        .file_hashes
        .insert(outside.clone(), cache_freshness::zero_hash());
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: outside,
            name: "outside".to_string(),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 0,
            exported: false,
            embed_text: "outside".to_string(),
            snippet: "outside".to_string(),
        },
        vector: vec![1.0, 0.0, 0.0],
    });

    let bytes = index.to_bytes();
    let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
    assert_eq!(loaded.entries.len(), 0);
    assert!(loaded.file_mtimes.is_empty());
}
2977
/// A vector compared with an identical vector has similarity 1.
#[test]
fn test_cosine_similarity_identical() {
    let unit_x = [1.0_f32, 0.0, 0.0];
    let same = [1.0_f32, 0.0, 0.0];
    let similarity = cosine_similarity(&unit_x, &same);
    assert!((similarity - 1.0).abs() < 0.001);
}
2984
/// Perpendicular vectors have similarity 0.
#[test]
fn test_cosine_similarity_orthogonal() {
    let unit_x = [1.0_f32, 0.0, 0.0];
    let unit_y = [0.0_f32, 1.0, 0.0];
    let similarity = cosine_similarity(&unit_x, &unit_y);
    assert!(similarity.abs() < 0.001);
}
2991
/// Vectors pointing in opposite directions have similarity -1.
#[test]
fn test_cosine_similarity_opposite() {
    let unit_x = [1.0_f32, 0.0, 0.0];
    let negated = [-1.0_f32, 0.0, 0.0];
    let similarity = cosine_similarity(&unit_x, &negated);
    assert!((similarity + 1.0).abs() < 0.001);
}
2998
/// A one-entry index with a fingerprint survives `to_bytes` / `from_bytes`:
/// the entry name, vector, dimension and the backend/model labels are all
/// restored.
#[test]
fn test_serialization_roundtrip() {
    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: file.clone(),
            name: "handle_request".to_string(),
            kind: SymbolKind::Function,
            start_line: 10,
            end_line: 25,
            exported: true,
            embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
            snippet: "fn handle_request() {\n // ...\n}".to_string(),
        },
        vector: vec![0.1, 0.2, 0.3, 0.4],
    });
    // The index was created with the default dimension; switch it to the
    // length of the 4-element test vector.
    index.dimension = 4;
    // Register file metadata for the path the entry refers to.
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);
    index.set_fingerprint(SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    });

    let bytes = index.to_bytes();
    let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();

    assert_eq!(restored.entries.len(), 1);
    assert_eq!(restored.entries[0].chunk.name, "handle_request");
    assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
    assert_eq!(restored.dimension, 4);
    assert_eq!(restored.backend_label(), Some("fastembed"));
    assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
}
3040
/// Every symbol kind encodes to its expected tag and decodes back to itself,
/// including the file-summary variant (tag 9).
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    // Listed in tag order: each kind's position is its expected encoding.
    let kinds_in_tag_order = [
        SymbolKind::Function,
        SymbolKind::Class,
        SymbolKind::Method,
        SymbolKind::Struct,
        SymbolKind::Interface,
        SymbolKind::Enum,
        SymbolKind::TypeAlias,
        SymbolKind::Variable,
        SymbolKind::Heading,
        SymbolKind::FileSummary,
    ];

    for (encoded, kind) in (0u8..).zip(kinds_in_tag_order) {
        assert_eq!(symbol_kind_to_u8(&kind), encoded);
        assert_eq!(u8_to_symbol_kind(encoded), kind);
    }
}
3061
/// Searching three axis-aligned entries with a query that lies mostly along
/// the first axis returns exactly the requested two results, best match
/// first.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    // Entry number `axis` gets the unit vector along that axis.
    let names = ["auth", "database", "handler"];
    for (axis, name) in names.iter().enumerate() {
        let mut vector = vec![0.0f32; 3];
        vector[axis] = 1.0;
        let first_line = (axis * 10 + 1) as u32;
        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: first_line,
            end_line: first_line + 4,
            exported: true,
            embed_text: format!("kind:function name:{}", name),
            snippet: format!("fn {}() {{}}", name),
        };
        index.entries.push(EmbeddingEntry { chunk, vector });
    }

    let mostly_first_axis = vec![0.9, 0.1, 0.0];
    let hits = index.search(&mostly_first_axis, 2);

    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].name, "auth");
    assert!(hits[0].score > hits[1].score);
}
3094
/// Searching an index that has no entries yields no results.
#[test]
fn test_empty_index_search() {
    let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let results = index.search(&[0.1, 0.2, 0.3], 10);
    assert!(results.is_empty());
}
3101
/// A symbol that starts and ends on line 0 still produces a snippet: the
/// full text of that single source line.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let symbol = Symbol {
        name: "answer".to_string(),
        kind: SymbolKind::Variable,
        range: crate::symbols::Range {
            start_line: 0,
            start_col: 0,
            end_line: 0,
            end_col: 24,
        },
        signature: Some("const answer = 42".to_string()),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };
    let source = "export const answer = 42;\n";

    let snippet = build_snippet(&symbol, source);

    // The trailing newline of the source is not part of the snippet.
    assert_eq!(snippet, "export const answer = 42;");
}
3124
/// `collect_file_chunks` must produce the same chunks as extracting symbols
/// through `FileParser` and passing them to `symbols_to_chunks`. The file
/// compared is `src/semantic_index.rs` under the crate's manifest directory.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let file = project_root.join("src/semantic_index.rs");
    let source = std::fs::read_to_string(&file).unwrap();

    // Reference path: symbols from `FileParser`, then chunking.
    let mut legacy_parser = FileParser::new();
    let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
    let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);

    // Path under test: `collect_file_chunks` with an initially empty parser cache.
    let mut parsers = HashMap::new();
    let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();

    assert_eq!(
        chunk_fingerprint(&optimized_chunks),
        chunk_fingerprint(&legacy_chunks)
    );
}
3143
3144 fn chunk_fingerprint(
3145 chunks: &[SemanticChunk],
3146 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3147 chunks
3148 .iter()
3149 .map(|chunk| {
3150 (
3151 chunk.name.clone(),
3152 chunk.kind.clone(),
3153 chunk.start_line,
3154 chunk.end_line,
3155 chunk.exported,
3156 chunk.embed_text.clone(),
3157 chunk.snippet.clone(),
3158 )
3159 })
3160 .collect()
3161 }
3162
/// A header that announces a dimension one above `MAX_DIMENSION` must be
/// rejected by `from_bytes`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    // One leading byte with value 1, then three little-endian u32 fields,
    // the first of which is the dimension.
    let mut bytes = vec![1u8];
    for field in [oversized_dimension, 0, 0] {
        bytes.extend_from_slice(&field.to_le_bytes());
    }

    assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
}
3173
/// A header that announces one entry more than `MAX_ENTRIES` must be
/// rejected by `from_bytes`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let oversized_entry_count = (MAX_ENTRIES as u32) + 1;

    // One leading byte with value 1, then three little-endian u32 fields:
    // a valid dimension, the oversized entry count, and a zero.
    let mut bytes = vec![1u8];
    for field in [DEFAULT_DIMENSION as u32, oversized_entry_count, 0] {
        bytes.extend_from_slice(&field.to_le_bytes());
    }

    assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
}
3184
/// `invalidate_file` removes a file's entries together with its cached
/// mtime and size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    // Seed one entry plus mtime and size records for the target file.
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: target.clone(),
            name: "main".to_string(),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 1,
            exported: false,
            embed_text: "main".to_string(),
            snippet: "fn main() {}".to_string(),
        },
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&target));
    assert!(!index.file_sizes.contains_key(&target));
}
3213
/// A file that is indexed, given altered cached metadata and then deleted
/// from disk is counted as deleted by `refresh_stale_files`, and all of its
/// entries and metadata are purged.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "vanished_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));
    let original_size = *index.file_sizes.get(&file).unwrap();
    // Replace the cached metadata with an epoch mtime and a size one byte
    // larger than the indexed one, then delete the file.
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
    fs::remove_file(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    // The deleted path is still passed in the list of files to refresh.
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&file),
            &mut embed,
            8,
            &mut progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 1);
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&file));
    assert!(!index.file_sizes.contains_key(&file));
    assert!(!index.file_hashes.contains_key(&file));
}
3247
/// When the indexed path still exists but can no longer be read as a file
/// (here it is replaced by a directory), `refresh_stale_files` reports no
/// changes and keeps both the cached entries and the cached metadata.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "kept_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));
    let original_entry_count = index.entries.len();
    let original_mtime = *index.file_mtimes.get(&file).unwrap();
    let original_size = *index.file_sizes.get(&file).unwrap();

    // Replace the cached metadata with an epoch mtime and a size one byte
    // larger than the indexed one.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
    // Swap the file for a directory of the same name: the path exists, but
    // it is no longer a regular file.
    fs::remove_file(&file).unwrap();
    fs::create_dir(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&file),
            &mut embed,
            8,
            &mut progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(index.entries.len(), original_entry_count);
    assert!(index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol"));
    // The metadata written by `set_file_metadata` above must be untouched.
    assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
    assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
}
3290
/// Refreshing with a path that was never indexed and does not exist on disk
/// changes nothing: all counters stay at zero and no metadata or entries are
/// recorded for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let missing = project_root.join("src/missing.rs");
    // Only the parent directory is created; the file itself never exists.
    fs::create_dir_all(missing.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&missing),
            &mut embed,
            8,
            &mut progress,
        )
        .unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert!(!index.file_mtimes.contains_key(&missing));
    assert!(!index.file_sizes.contains_key(&missing));
    assert!(index.entries.is_empty());
}
3318
/// A file that exists on disk but was not part of the original build is
/// reported as added and gets metadata and entries of its own.
#[test]
fn refresh_reports_added_for_new_files() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let existing = project_root.join("src/lib.rs");
    let added = project_root.join("src/new.rs");
    fs::create_dir_all(existing.parent().unwrap()).unwrap();
    write_rust_file(&existing, "existing_symbol");
    write_rust_file(&added, "added_symbol");

    // Only `existing` is indexed up front; `added` first appears in the refresh.
    let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            &[existing.clone(), added.clone()],
            &mut embed,
            8,
            &mut progress,
        )
        .unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&added));
    assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
}
3349
/// A file that was indexed, then removed from disk and left out of the
/// refresh list is reported as deleted and dropped from the index.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let deleted = project_root.join("src/deleted.rs");
    fs::create_dir_all(deleted.parent().unwrap()).unwrap();
    write_rust_file(&deleted, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
    fs::remove_file(&deleted).unwrap();

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    // The refresh is given an empty file list.
    let summary = index
        .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);
    assert!(!index.file_mtimes.contains_key(&deleted));
    assert!(index.entries.is_empty());
}
3374
/// A file whose cached metadata no longer matches and whose content was
/// rewritten is reported as changed; its old entries are replaced by entries
/// for the new content.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));
    // Reset the cached metadata to an epoch mtime and size 0, then rewrite
    // the file with a differently named function.
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&file, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&file),
            &mut embed,
            8,
            &mut progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);
    assert!(index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "new_symbol"));
    assert!(!index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "old_symbol"));
}
3412
/// Refreshing immediately after a build, with nothing modified, is a no-op:
/// the summary reports no work, the embedding callback is never invoked and
/// the entry count is unchanged.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));
    let original_entries = index.entries.len();
    // The embedding callback records whether it was ever called.
    let mut embed_called = false;
    let mut embed = |texts: Vec<String>| {
        embed_called = true;
        test_vector_for_texts(texts)
    };
    let mut progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&file),
            &mut embed,
            8,
            &mut progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embed_called);
    assert_eq!(index.entries.len(), original_entries);
}
3444
/// A dynamic-loader failure message that names the ONNX Runtime shared library
/// must be classified as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    assert!(is_onnx_runtime_unavailable(
        "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file"
    ));
}
3451
/// When initialization fails because ONNX Runtime cannot be loaded, the
/// formatted error must lead with the install hint and still include the
/// original error text.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let raw_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(raw_error);

    let leads_with_hint = formatted.starts_with("ONNX Runtime not found. Install via:");
    assert!(leads_with_hint);
    let keeps_original = formatted.contains("Original error:");
    assert!(keeps_original);
}
3461
/// End-to-end check of the OpenAI-compatible backend against a local mock
/// server: the request must be a POST to `/v1/embeddings`, and the two
/// embeddings in the mocked response must come back as two vectors.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (server_url, server_thread) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        String::from(
            r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#,
        )
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    // Joining surfaces any assertion failure raised inside the mock handler.
    server_thread.join().unwrap();
}
3487
/// Verifies that a request sent by the OpenAI-compatible backend carries
/// exactly one `Content-Type` header and a JSON body containing the configured
/// model name.
///
/// A one-shot TCP server captures the raw request bytes. It reads until the
/// header terminator, then keeps reading until `Content-Length` body bytes have
/// arrived, so that the body assertion below does not depend on how the client
/// splits its writes.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");

    // The server thread returns the captured request so no shared state is needed.
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive, and clients may
                        // emit `content-length:` in lowercase. A case-sensitive
                        // match would leave the length at 0 and stop reading
                        // before the body has arrived.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }

        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best effort: the client may already have hung up.
        let _ = stream.write_all(response.as_bytes());
        buf
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();

    let bytes = handle.join().unwrap();
    let request = String::from_utf8_lossy(&bytes);

    // Header names are compared case-insensitively, matching the parsing above.
    let content_type_lines = request
        .lines()
        .filter(|line| line.to_ascii_lowercase().starts_with("content-type:"))
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3579
/// End-to-end check of the Ollama backend against a local mock server: the
/// request must be a POST to `/api/embed`, and the `embeddings` array in the
/// mocked response must come back as two vectors.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (server_url, server_thread) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        String::from(r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#)
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: String::from("embeddinggemma"),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
    // Joining surfaces any assertion failure raised inside the mock handler.
    server_thread.join().unwrap();
}
3605
/// An index written with one fingerprint must load when the caller supplies
/// the same fingerprint string and must be rejected when the caller supplies a
/// different one.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";

    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");

    // Build a one-entry index with a three-dimensional vector.
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);

    let written_fingerprint = SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(written_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint as the one stored: the cache must load.
    let matching = index.fingerprint().unwrap().as_string();
    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(loaded.is_some());

    // Different backend / model / base URL: the cache must be refused.
    let mismatched = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    }
    .as_string();
    let rejected = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(rejected.is_none());
}
3668
/// A cache file whose leading version byte is overwritten with
/// `SEMANTIC_INDEX_VERSION_V3` must be refused by `read_from_disk`, and the
/// file must no longer exist afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let indexed_path = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: indexed_path.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(indexed_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(indexed_path, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Serialize the index, then overwrite the first byte with the V3 marker
    // before writing the file out.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, serialized).unwrap();

    let expected_fingerprint = fingerprint.as_string();
    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&expected_fingerprint),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
3718
3719 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3720 crate::symbols::Symbol {
3721 name: name.to_string(),
3722 kind,
3723 range: crate::symbols::Range {
3724 start_line: start,
3725 start_col: 0,
3726 end_line: end,
3727 end_col: 0,
3728 },
3729 signature: None,
3730 scope_chain: Vec::new(),
3731 exported: false,
3732 parent: None,
3733 }
3734 }
3735
/// A file whose only symbols are headings must produce no chunks at all.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let root = PathBuf::from("/proj");
    let readme = root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let title = make_symbol(SymbolKind::Heading, "Title", 0, 2);
    let section = make_symbol(SymbolKind::Heading, "Section", 4, 6);
    let headings = vec![title, section];

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &root);
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
3758
/// When headings and code symbols are mixed, only the heading is dropped: the
/// result must contain a file-summary chunk plus one chunk per code symbol.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let root = PathBuf::from("/proj");
    let source_file = root.join("src/lib.rs");
    let code = "pub fn handle_request() -> bool {\n true\n}\n";

    let mixed_symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&source_file, &mixed_symbols, code, &root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
3793
/// Loopback hostnames (`localhost`, `localhost.localdomain`, and names under
/// `.localhost`), with or without a port, must pass the SSRF validator.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once and reuse the outcome, so the failure message reports
        // the very value that was asserted on.
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
3812
/// Literal IPv4 addresses in `127.0.0.0/8`, with or without a port, must pass
/// the SSRF validator.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
3831
/// Non-loopback addresses in the listed private, link-local, and shared ranges
/// must be rejected by the SSRF validator.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in private_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    }
}
3853
/// Hostnames ending in `.local`, with or without a port, must be rejected by
/// the SSRF validator.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_urls = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_urls.iter() {
        let outcome = validate_base_url_no_ssrf(host);
        assert!(
            outcome.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            outcome
        );
    }
}
3871
/// `normalize_base_url` must accept both a loopback IP and the `localhost`
/// hostname, each with an explicit port.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let loopback_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(loopback_ip.is_ok());

    let loopback_name = normalize_base_url("http://localhost:8080");
    assert!(loopback_name.is_ok());
}
3879
/// The ONNX Runtime version-mismatch message must report the detected version,
/// the library path, and the minimum requirement, and must list the auto-fix
/// command before the manual "remove the old library" step.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_library = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_library);

    // Facts the user needs: what was found, where, and what is required.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering: the automated fix must appear ahead of the manual removal.
    let auto_fix_pos = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let remove_pos = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    let auto_fix_listed_first = auto_fix_pos < remove_pos;
    assert!(
        auto_fix_listed_first,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
3920
/// The version-mismatch message must handle a macOS `.dylib` path: it reports
/// the version and the path, and the path appears wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let homebrew_library = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", homebrew_library);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
3936 }
3937}