1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
/// Default embedding dimension.
/// NOTE(review): the consumers are outside this view — confirm where it applies.
const DEFAULT_DIMENSION: usize = 384;
/// Entry-count ceiling.
/// NOTE(review): presumably bounds the number of index entries read or written — confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// Header sizes in bytes.
/// NOTE(review): presumably the on-disk header of the V1 and V2+ cache formats — confirm.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint that `format_embedding_init_error` puts in front of the original error
/// when the failure looks like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

/// Semantic index format version tags.
/// NOTE(review): what changed between versions is not visible in this part of the file.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
const SEMANTIC_INDEX_VERSION_V7: u8 = 7;
/// Path appended (after a `/v1` segment) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does not end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// Request timeout used by `configured_embedding_timeout_ms` when the config value is `0`.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Upper bound applied to the timeout by `SemanticEmbeddingModel::from_config_for_query`.
const DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS: u64 = 8_000;
/// Batch size used when the configured `max_batch_size` is `0`.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of query vectors `embed_query_cached` keeps before evicting the oldest one.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has none or it fails
/// to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before each retry, indexed by the attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held by `SemanticIndexLock::acquire` while it tries to take the file lock, so that
/// acquisition attempts inside this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
69
70pub struct SemanticIndexLock {
71 _guard: fs_lock::LockGuard,
72}
73
74impl SemanticIndexLock {
75 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
76 let dir = storage_dir.join("semantic").join(project_key);
77 fs::create_dir_all(&dir)?;
78 let path = dir.join("cache.lock");
79 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
80 .lock()
81 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
82 fs_lock::try_acquire(&path, Duration::from_secs(2))
83 .map(|guard| Self { _guard: guard })
84 .map_err(|error| match error {
85 fs_lock::AcquireError::Timeout => {
86 std::io::Error::other("timed out acquiring semantic cache lock")
87 }
88 fs_lock::AcquireError::Io(error) => error,
89 })
90 }
91}
92
/// Describes the embedding setup (backend, model, endpoint, vector width, chunking
/// version) that a semantic index is associated with. It is serialized to JSON by
/// `as_string` and compared textually by `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Model name copied from the backend config.
    pub model: String,
    /// Normalized base URL, or the fallback placeholder when none is usable.
    /// Deserializes to an empty string when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the embedding vectors.
    pub dimension: usize,
    /// Chunking scheme version; filled from `default_chunking_version` when absent.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
103
/// Chunking version stamped on newly built fingerprints and substituted by serde when a
/// stored fingerprint has no `chunking_version` field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
107
108impl SemanticIndexFingerprint {
109 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
110 let base_url = config
113 .base_url
114 .as_ref()
115 .and_then(|u| normalize_base_url(u).ok())
116 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
117 Self {
118 backend: config.backend.as_str().to_string(),
119 model: config.model.clone(),
120 base_url,
121 dimension,
122 chunking_version: default_chunking_version(),
123 }
124 }
125
126 pub fn as_string(&self) -> String {
127 serde_json::to_string(self).unwrap_or_else(|_| String::new())
128 }
129
130 fn matches_expected(&self, expected: &str) -> bool {
131 let encoded = self.as_string();
132 !encoded.is_empty() && encoded == expected
133 }
134}
135
/// Backend-specific state used by `SemanticEmbeddingModel` to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedder; constructed for `SemanticBackend::Fastembed`.
    Local(LocalEmbedder),
    /// HTTP backend addressed through `build_openai_embeddings_endpoint`.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
        /// Sent as a `Bearer` token in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// HTTP backend addressed through `build_ollama_embeddings_endpoint`.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
    },
}
152
/// An embedding backend together with its settings and a small query-vector cache.
pub struct SemanticEmbeddingModel {
    /// Backend kind copied from the config.
    backend: SemanticBackend,
    /// Model name copied from the config.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Timeout, in milliseconds, that the HTTP client was built with.
    timeout_ms: u64,
    /// Configured batch size, or `DEFAULT_MAX_BATCH_SIZE` when the config value is `0`.
    max_batch_size: usize,
    /// Embedding dimension once it has been observed; `None` until then.
    dimension: Option<usize>,
    /// Backend-specific state used to produce embeddings.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding vector, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of the cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls answered from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to compute an embedding.
    query_embedding_cache_misses: u64,
}
166
/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
168
169fn validate_embedding_batch(
170 vectors: &[Vec<f32>],
171 expected_count: usize,
172 context: &str,
173) -> Result<(), String> {
174 if expected_count > 0 && vectors.is_empty() {
175 return Err(format!(
176 "{context} returned no vectors for {expected_count} inputs"
177 ));
178 }
179
180 if vectors.len() != expected_count {
181 return Err(format!(
182 "{context} returned {} vectors for {} inputs",
183 vectors.len(),
184 expected_count
185 ));
186 }
187
188 let Some(first_vector) = vectors.first() else {
189 return Ok(());
190 };
191 let expected_dimension = first_vector.len();
192 validate_embedding_dimension(expected_dimension)
193 .map_err(|error| format!("{context} returned {error}"))?;
194 for (index, vector) in vectors.iter().enumerate() {
195 if vector.len() != expected_dimension {
196 return Err(format!(
197 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
198 vector.len()
199 ));
200 }
201 }
202
203 Ok(())
204}
205
206fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
207 if dimension == 0 || dimension > MAX_DIMENSION {
208 return Err(format!(
209 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
210 ));
211 }
212
213 Ok(())
214}
215
216fn normalize_base_url(raw: &str) -> Result<String, String> {
220 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
221 let scheme = parsed.scheme();
222 if scheme != "http" && scheme != "https" {
223 return Err(format!(
224 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
225 scheme
226 ));
227 }
228 Ok(parsed.to_string().trim_end_matches('/').to_string())
229}
230
231pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
246 use std::net::{IpAddr, ToSocketAddrs};
247
248 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
249
250 let host = parsed.host_str().unwrap_or("");
251
252 let is_loopback_host =
257 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
258 if is_loopback_host {
259 return Ok(());
260 }
261
262 if host.ends_with(".local") {
265 return Err(format!(
266 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
267 ));
268 }
269
270 let port = parsed.port_or_known_default().unwrap_or(443);
273 let addr_str = format!("{host}:{port}");
274 let addrs: Vec<IpAddr> = addr_str
275 .to_socket_addrs()
276 .map(|iter| iter.map(|sa| sa.ip()).collect())
277 .unwrap_or_default();
278 for ip in &addrs {
279 if is_private_non_loopback_ip(ip) {
280 return Err(format!(
281 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
282 ));
283 }
284 }
285
286 Ok(())
287}
288
289fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
300 if ip.to_canonical().is_loopback() {
303 return false;
304 }
305 crate::url_fetch::is_private_or_reserved_ip(*ip)
306}
307
308fn build_openai_embeddings_endpoint(base_url: &str) -> String {
309 if base_url.ends_with("/v1") {
310 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
311 } else {
312 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
313 }
314}
315
316fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
317 if base_url.ends_with("/api") {
318 format!("{base_url}/embed")
319 } else {
320 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
321 }
322}
323
/// Trims surrounding whitespace from an optional string and maps a missing, empty or
/// whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    match value.as_deref().map(str::trim) {
        None | Some("") => None,
        Some(token) => Some(token.to_string()),
    }
}
334
335fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
336 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
337}
338
339fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
345 if !matches!(
346 status,
347 reqwest::StatusCode::BAD_REQUEST
348 | reqwest::StatusCode::CONFLICT
349 | reqwest::StatusCode::REQUEST_TIMEOUT
350 | reqwest::StatusCode::LOCKED
351 | reqwest::StatusCode::TOO_EARLY
352 ) {
353 return false;
354 }
355
356 let lower = raw.to_ascii_lowercase();
357 let normalized = lower.trim();
358
359 normalized.contains("model was unloaded while the request was still in queue")
360 || normalized == "model is loading"
361 || normalized.starts_with("model is loading,")
362 || normalized.contains(r#""error":"model is loading"#)
363 || normalized.contains(r#""message":"model is loading"#)
364 || normalized == "model not loaded"
365 || normalized.contains(r#""error":"model not loaded""#)
366 || normalized.contains(r#""message":"model not loaded""#)
367 || normalized == "loading model into memory"
368 || normalized.contains(r#""error":"loading model into memory""#)
369 || normalized.contains(r#""message":"loading model into memory""#)
370 || normalized == "model is being loaded"
371 || normalized.contains(r#""error":"model is being loaded""#)
372 || normalized.contains(r#""message":"model is being loaded""#)
373 || normalized == "model is currently loading"
374 || normalized.contains(r#""error":"model is currently loading""#)
375 || normalized.contains(r#""message":"model is currently loading""#)
376}
377
378fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
379 error.is_connect()
380}
381
382fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
388 error.is_connect() || error.is_timeout()
389}
390
391fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
392 embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
393}
394
/// Prefix that `send_embedding_request` puts on error messages it classifies as
/// transient. Detect it with `embedding_failure_is_transient` and remove it with
/// `strip_transient_embedding_marker`.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
402
403pub fn embedding_failure_is_transient(error: &str) -> bool {
406 error.contains(TRANSIENT_EMBEDDING_MARKER)
407}
408
409pub fn strip_transient_embedding_marker(error: &str) -> String {
411 error.replace(TRANSIENT_EMBEDDING_MARKER, "")
412}
413
414fn sleep_before_embedding_retry(attempt_index: usize) {
415 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
416 std::thread::sleep(Duration::from_millis(*delay_ms));
417 }
418}
419
420fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
421where
422 F: FnMut() -> reqwest::blocking::RequestBuilder,
423{
424 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
425 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
426
427 let response = match make_request().send() {
428 Ok(response) => response,
429 Err(error) => {
430 if !last_attempt && is_retryable_embedding_error(&error) {
431 sleep_before_embedding_retry(attempt_index);
432 continue;
433 }
434 let marker = if embedding_send_error_is_transient(&error) {
438 TRANSIENT_EMBEDDING_MARKER
439 } else {
440 ""
441 };
442 return Err(format!("{marker}{backend_label} request failed: {error}"));
443 }
444 };
445
446 let status = response.status();
447 let raw = match response.text() {
448 Ok(raw) => raw,
449 Err(error) => {
450 if !last_attempt && embedding_response_read_error_is_transient(&error) {
451 sleep_before_embedding_retry(attempt_index);
452 continue;
453 }
454 let marker = if embedding_response_read_error_is_transient(&error) {
455 TRANSIENT_EMBEDDING_MARKER
456 } else {
457 ""
458 };
459 return Err(format!(
460 "{marker}{backend_label} response read failed: {error}"
461 ));
462 }
463 };
464
465 if status.is_success() {
466 return Ok(raw);
467 }
468
469 let body_transient = embedding_response_body_is_transient(status, &raw);
473 if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
474 sleep_before_embedding_retry(attempt_index);
475 continue;
476 }
477
478 let marker = if is_retryable_embedding_status(status) || body_transient {
484 TRANSIENT_EMBEDDING_MARKER
485 } else {
486 ""
487 };
488 return Err(format!(
489 "{marker}{backend_label} request failed (HTTP {}): {}",
490 status, raw
491 ));
492 }
493
494 unreachable!("embedding request retries exhausted without returning")
495}
496
497fn configured_embedding_timeout_ms(config: &SemanticBackendConfig) -> u64 {
498 if config.timeout_ms == 0 {
499 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
500 } else {
501 config.timeout_ms
502 }
503}
504
505impl SemanticEmbeddingModel {
506 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
507 Self::from_config_with_timeout_ms(config, configured_embedding_timeout_ms(config))
508 }
509
510 pub fn from_config_for_query(config: &SemanticBackendConfig) -> Result<Self, String> {
511 let timeout_ms =
512 configured_embedding_timeout_ms(config).min(DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS);
513 Self::from_config_with_timeout_ms(config, timeout_ms)
514 }
515
516 fn from_config_with_timeout_ms(
517 config: &SemanticBackendConfig,
518 timeout_ms: u64,
519 ) -> Result<Self, String> {
520 let max_batch_size = if config.max_batch_size == 0 {
521 DEFAULT_MAX_BATCH_SIZE
522 } else {
523 config.max_batch_size
524 };
525
526 let api_key_env = normalize_api_key(config.api_key_env.clone());
527 let model = config.model.clone();
528
529 let client = Client::builder()
530 .timeout(Duration::from_millis(timeout_ms))
531 .redirect(reqwest::redirect::Policy::none())
532 .build()
533 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
534
535 let engine = match config.backend {
536 SemanticBackend::Fastembed => {
537 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
538 }
539 SemanticBackend::OpenAiCompatible => {
540 let raw = config.base_url.as_ref().ok_or_else(|| {
541 "base_url is required for openai_compatible backend".to_string()
542 })?;
543 let base_url = normalize_base_url(raw)?;
544
545 let api_key = match api_key_env {
546 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
547 format!("missing api_key_env '{var_name}' for openai_compatible backend")
548 })?),
549 None => None,
550 };
551
552 SemanticEmbeddingEngine::OpenAiCompatible {
553 client,
554 model,
555 base_url,
556 api_key,
557 }
558 }
559 SemanticBackend::Ollama => {
560 let raw = config
561 .base_url
562 .as_ref()
563 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
564 let base_url = normalize_base_url(raw)?;
565
566 SemanticEmbeddingEngine::Ollama {
567 client,
568 model,
569 base_url,
570 }
571 }
572 };
573
574 Ok(Self {
575 backend: config.backend,
576 model: config.model.clone(),
577 base_url: config.base_url.clone(),
578 timeout_ms,
579 max_batch_size,
580 dimension: None,
581 engine,
582 query_embedding_cache: HashMap::new(),
583 query_embedding_cache_order: VecDeque::new(),
584 query_embedding_cache_hits: 0,
585 query_embedding_cache_misses: 0,
586 })
587 }
588
    /// Backend kind this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }

    /// Configured model name.
    pub fn model(&self) -> &str {
        &self.model
    }

    /// Base URL exactly as configured (not normalized), if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }

    /// Effective batch size (the configured value, or the default when it was `0`).
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }

    /// Request timeout, in milliseconds, that the HTTP client was built with.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
608
609 pub fn fingerprint(
610 &mut self,
611 config: &SemanticBackendConfig,
612 ) -> Result<SemanticIndexFingerprint, String> {
613 let dimension = self.dimension()?;
614 Ok(SemanticIndexFingerprint::from_config(config, dimension))
615 }
616
617 pub fn dimension(&mut self) -> Result<usize, String> {
618 if let Some(dimension) = self.dimension {
619 return Ok(dimension);
620 }
621
622 let dimension = match &mut self.engine {
623 SemanticEmbeddingEngine::Local(model) => {
624 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
625 vectors
626 .first()
627 .map(|v| v.len())
628 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
629 }
630 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
631 let vectors =
632 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
633 vectors
634 .first()
635 .map(|v| v.len())
636 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
637 }
638 SemanticEmbeddingEngine::Ollama { .. } => {
639 let vectors =
640 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
641 vectors
642 .first()
643 .map(|v| v.len())
644 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
645 }
646 };
647
648 self.dimension = Some(dimension);
649 Ok(dimension)
650 }
651
    /// Embeds `texts` with the configured backend, returning one vector per input.
    /// Delegates to `embed_texts`; no caching is applied.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
655
656 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
657 if let Some(vector) = self.query_embedding_cache.get(query) {
658 self.query_embedding_cache_hits += 1;
659 return Ok(vector.clone());
660 }
661
662 self.query_embedding_cache_misses += 1;
663 let embeddings = self.embed_texts(vec![query.to_string()])?;
664 let vector = embeddings
665 .first()
666 .cloned()
667 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
668
669 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
670 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
671 self.query_embedding_cache.remove(&oldest);
672 }
673 }
674 self.query_embedding_cache
675 .insert(query.to_string(), vector.clone());
676 self.query_embedding_cache_order
677 .push_back(query.to_string());
678
679 Ok(vector)
680 }
681
682 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
683 (
684 self.query_embedding_cache_hits,
685 self.query_embedding_cache_misses,
686 self.query_embedding_cache.len(),
687 )
688 }
689
690 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
691 match &mut self.engine {
692 SemanticEmbeddingEngine::Local(model) => model
693 .embed(&texts)
694 .map_err(|error| format!("failed to embed batch: {error}")),
695 SemanticEmbeddingEngine::OpenAiCompatible {
696 client,
697 model,
698 base_url,
699 api_key,
700 } => {
701 let expected_text_count = texts.len();
702 let endpoint = build_openai_embeddings_endpoint(base_url);
703 let body = serde_json::json!({
704 "input": texts,
705 "model": model,
706 });
707
708 let raw = send_embedding_request(
709 || {
710 let mut request = client.post(&endpoint).json(&body);
720
721 if let Some(api_key) = api_key {
722 request = request.header("Authorization", format!("Bearer {api_key}"));
723 }
724
725 request
726 },
727 "openai compatible",
728 )?;
729
730 #[derive(Deserialize)]
731 struct OpenAiResponse {
732 data: Vec<OpenAiEmbeddingResult>,
733 }
734
735 #[derive(Deserialize)]
736 struct OpenAiEmbeddingResult {
737 embedding: Vec<f32>,
738 index: Option<u32>,
739 }
740
741 let parsed: OpenAiResponse = serde_json::from_str(&raw)
742 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
743 if parsed.data.len() != expected_text_count {
744 return Err(format!(
745 "openai compatible response returned {} embeddings for {} inputs",
746 parsed.data.len(),
747 expected_text_count
748 ));
749 }
750
751 let mut vectors = vec![Vec::new(); parsed.data.len()];
752 for (i, item) in parsed.data.into_iter().enumerate() {
753 let index = item.index.unwrap_or(i as u32) as usize;
754 if index >= vectors.len() {
755 return Err(
756 "openai compatible response contains invalid vector index".to_string()
757 );
758 }
759 vectors[index] = item.embedding;
760 }
761
762 for vector in &vectors {
763 if vector.is_empty() {
764 return Err(
765 "openai compatible response contained missing vectors".to_string()
766 );
767 }
768 }
769
770 self.dimension = vectors.first().map(Vec::len);
771 Ok(vectors)
772 }
773 SemanticEmbeddingEngine::Ollama {
774 client,
775 model,
776 base_url,
777 } => {
778 let expected_text_count = texts.len();
779 let endpoint = build_ollama_embeddings_endpoint(base_url);
780
781 #[derive(Serialize)]
782 struct OllamaPayload<'a> {
783 model: &'a str,
784 input: Vec<String>,
785 }
786
787 let payload = OllamaPayload {
788 model,
789 input: texts,
790 };
791
792 let raw = send_embedding_request(
793 || {
794 client.post(&endpoint).json(&payload)
799 },
800 "ollama",
801 )?;
802
803 #[derive(Deserialize)]
804 struct OllamaResponse {
805 embeddings: Vec<Vec<f32>>,
806 }
807
808 let parsed: OllamaResponse = serde_json::from_str(&raw)
809 .map_err(|error| format!("invalid ollama response: {error}"))?;
810 if parsed.embeddings.is_empty() {
811 return Err("ollama response returned no embeddings".to_string());
812 }
813 if parsed.embeddings.len() != expected_text_count {
814 return Err(format!(
815 "ollama response returned {} embeddings for {} inputs",
816 parsed.embeddings.len(),
817 expected_text_count
818 ));
819 }
820
821 let vectors = parsed.embeddings;
822 for vector in &vectors {
823 if vector.is_empty() {
824 return Err("ollama response contained empty embeddings".to_string());
825 }
826 }
827
828 self.dimension = vectors.first().map(Vec::len);
829 Ok(vectors)
830 }
831 }
832 }
833}
834
/// Checks up front that an ONNX Runtime shared library can be loaded and, when its
/// version can be determined, that the version is 1.20 or newer in the 1.x series.
///
/// The library is the path in the `ORT_DYLIB_PATH` environment variable when set,
/// otherwise the platform default name (`libonnxruntime.so`, `libonnxruntime.dylib`
/// or `onnxruntime.dll`). The library is loaded only for the check and released again
/// before returning.
///
/// A library whose version cannot be determined is accepted. On platforms other than
/// Linux, macOS and Windows no check is performed and `Ok(())` is returned.
///
/// # Errors
/// Returns a user-facing message when the library cannot be loaded or when the
/// detected version is outside the supported range.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY (review note): `c_name` is a NUL-terminated C string that outlives the
        // `dlopen` call; `handle` is checked for null before use and closed exactly once;
        // the `dlerror` pointer is checked for null before being read.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Detect the version while the library is still loaded; the helper needs the
            // live handle to find out which file was actually loaded.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                // Only major and minor are compared; a version that does not parse is
                // treated as unknown and accepted.
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Minimal hand-written bindings for the Win32 calls used below.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Field layout of the fixed version-info block queried with the root sub-block "\".
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY (review note): `wide` is NUL-terminated and outlives the load call;
        // `handle` is null-checked and freed exactly once; `info` is sized from
        // `GetFileVersionInfoSizeW`; `vs_info` is null-checked before being dereferenced.
        // NOTE(review): `vs_len` is not compared with the size of `VS_FIXEDFILEINFO`
        // before the read — confirm this is acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 means "version unknown"; the check below is then skipped.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High 16 bits hold the major version, low 16 bits the minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1025
/// Finds the file path of the library behind `handle` by looking up the
/// `OrtGetApiBase` symbol in it and asking `dladdr` which object contains that symbol.
///
/// Returns `None` when the symbol is missing, `dladdr` fails, or no file name is reported.
///
/// # Safety
/// `handle` is passed directly to `dlsym`, so it must be a live handle obtained from
/// `dlopen` that has not been closed yet.
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    // `dladdr` returns 0 on failure; `info` is only read after a non-zero return.
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    // Copy the name into an owned `String` (lossily for non-UTF-8 bytes) before returning.
    Some(
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1050
1051#[cfg(any(target_os = "linux", target_os = "macos"))]
1052fn detect_ort_version_from_resolved_or_requested(
1053 resolved_path: Option<String>,
1054 requested_lib_name: &str,
1055) -> (Option<String>, String) {
1056 if let Some(path) = resolved_path {
1057 if let Some(version) = detect_ort_version_from_path(&path) {
1058 return (Some(version), path);
1059 }
1060 return (detect_ort_version_from_path(requested_lib_name), path);
1061 }
1062
1063 (
1064 detect_ort_version_from_path(requested_lib_name),
1065 requested_lib_name.to_string(),
1066 )
1067}
1068
1069#[cfg(any(target_os = "linux", target_os = "macos"))]
1070fn detect_ort_version_from_loaded_library(
1071 handle: *mut std::ffi::c_void,
1072 requested_lib_name: &str,
1073) -> (Option<String>, String) {
1074 detect_ort_version_from_resolved_or_requested(
1075 unsafe { loaded_library_path_from_handle(handle) },
1076 requested_lib_name,
1077 )
1078}
1079
1080#[cfg(any(target_os = "linux", target_os = "macos"))]
1083fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1084 let path = std::path::Path::new(lib_path);
1085
1086 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1088 .into_iter()
1089 .flatten()
1090 {
1091 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1092 if let Some(version) = extract_version_from_filename(name) {
1093 return Some(version);
1094 }
1095 }
1096 }
1097
1098 if let Some(parent) = path.parent() {
1100 if let Ok(entries) = std::fs::read_dir(parent) {
1101 for entry in entries.flatten() {
1102 if let Some(name) = entry.file_name().to_str() {
1103 if name.starts_with("libonnxruntime") {
1104 if let Some(version) = extract_version_from_filename(name) {
1105 return Some(version);
1106 }
1107 }
1108 }
1109 }
1110 }
1111 }
1112
1113 None
1114}
1115
1116#[cfg(any(target_os = "linux", target_os = "macos"))]
1118fn extract_version_from_filename(name: &str) -> Option<String> {
1119 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1121 re.find(name).map(|m| m.as_str().to_string())
1122}
1123
/// Suggests a shell command for removing an outdated ONNX Runtime library.
///
/// For the bare default library names and for anything under `/usr/local/lib`, the
/// suggestion on Linux and macOS removes every `libonnxruntime*` file in
/// `/usr/local/lib` (followed by `ldconfig` on Linux). Everything else gets a plain
/// `rm` of the given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_wide_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
1136
1137pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1143 format!(
1144 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1145 Solutions:\n\
1146 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1147 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1148 configures the bridge to load it instead of the system library — no \
1149 changes to '{}'.\n\
1150 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1151 {}\n\
1152 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1153 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1154 version,
1155 lib_name,
1156 lib_name,
1157 suggest_removal_command(lib_name),
1158 )
1159}
1160
/// Heuristically decides whether an error message means the ONNX Runtime shared library
/// could not be loaded.
///
/// A message that starts (after leading whitespace) with `ONNX Runtime not found.`
/// always qualifies. Otherwise the message must, case-insensitively, both name the
/// runtime and contain a phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1186
1187pub fn format_embedding_init_error(error: impl Display) -> String {
1188 let message = error.to_string();
1189
1190 if is_onnx_runtime_unavailable(&message) {
1191 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1192 }
1193
1194 format!("failed to initialize semantic embedding model: {message}")
1195}
1196
/// One unit of source code prepared for embedding.
/// NOTE(review): field meanings below are inferred from names and types; the code that
/// fills them is outside this view.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk comes from.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Qualified symbol name, when one is available.
    pub qualified_name: Option<String>,
    /// Kind of symbol.
    pub kind: SymbolKind,
    /// First line of the chunk. NOTE(review): 0- or 1-based is not visible here.
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text handed to the embedding model (presumably — confirm against the builder).
    pub embed_text: String,
    /// Snippet kept for display (presumably — confirm against the builder).
    pub snippet: String,
}
1218
/// A chunk paired with the embedding vector computed (or reused) for its `embed_text`.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk the vector was produced for.
    chunk: SemanticChunk,
    /// Embedding vector compared against query vectors in `search`.
    vector: Vec<f32>,
}
1225
/// In-memory semantic index: embedded chunks plus per-file freshness metadata.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// All embedded chunks, across every indexed file.
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded per indexed file; its key set is what
    /// `indexed_file_count` and the refresh routines treat as "indexed".
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// File size recorded per indexed file; may be missing and is then
    /// filled in by `backfill_missing_file_sizes`.
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded per indexed file, used together with mtime and
    /// size to build a `FileFreshness` for verification.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Vector length of the stored embeddings; `search` rejects query vectors
    /// of any other length.
    dimension: usize,
    /// Optional fingerprint; `read_from_disk` compares it against the expected one.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; the constructors debug-assert that it is absolute.
    project_root: PathBuf,
    /// New files postponed by `refresh_invalidated_files` because the
    /// indexed-file cap was reached; they are retried on later refreshes.
    deferred_files: HashSet<PathBuf>,
}
1241
/// Freshness metadata captured for a file while its chunks are collected.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time stored into `SemanticIndex::file_mtimes`.
    mtime: SystemTime,
    /// File size stored into `SemanticIndex::file_sizes`.
    size: u64,
    /// Content hash stored into `SemanticIndex::file_hashes`.
    content_hash: blake3::Hash,
}
1248
/// Counters describing what a refresh pass did to the index.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-collected successfully.
    pub changed: usize,
    /// Files that were not indexed before and were collected successfully.
    pub added: usize,
    /// Files that were removed from the index.
    pub deleted: usize,
    /// Number of files the pass considered.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when nothing was changed, added, or deleted.
    ///
    /// `total_processed` is deliberately not part of the check.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
1265
/// Result of `SemanticIndex::refresh_invalidated_files`.
///
/// The first three fields have the same types as the parameters of
/// `SemanticIndex::apply_refresh_update`.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Entries produced for the refreshed files (a copy of what was added to the index).
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh metadata for every file whose collection succeeded.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh handled, including deleted and deferred ones.
    pub completed_paths: Vec<PathBuf>,
    /// Counters for the pass.
    pub summary: RefreshSummary,
}
1276
/// A previously computed vector kept so an unchanged chunk need not be re-embedded.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was computed from; compared verbatim before reuse.
    embed_text: String,
    /// The cached embedding vector.
    vector: Vec<f32>,
}
1282
/// Reuse lookup: file path → blake3 hash of a chunk's `embed_text` → candidate
/// embeddings whose text hashed to that value.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1284
/// One search hit; `SemanticIndex::search` copies the chunk metadata into it.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched chunk.
    pub file: PathBuf,
    /// Name of the matched chunk.
    pub name: String,
    /// Optional qualified name of the matched chunk.
    pub qualified_name: Option<String>,
    /// Symbol kind of the matched chunk.
    pub kind: SymbolKind,
    /// First line of the matched chunk.
    pub start_line: u32,
    /// Last line of the matched chunk.
    pub end_line: u32,
    /// Whether the matched chunk is exported.
    pub exported: bool,
    /// Snippet of the matched chunk.
    pub snippet: String,
    /// Similarity score; `search` stores the cosine similarity after the export boost.
    pub score: f32,
    /// Ranking score; `search` initialises it to the same value as `score`.
    /// NOTE(review): presumably adjusted by later re-ranking — confirm against callers.
    pub rank_score: f32,
    /// `search` always sets this to `false`.
    /// NOTE(review): its meaning is decided outside this section — confirm against callers.
    pub cap_protected: bool,
    /// Label of the producer of the hit; `search` uses `"semantic"`.
    pub source: &'static str,
}
1301
1302impl SemanticIndex {
1303 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1304 debug_assert!(project_root.is_absolute());
1305 Self {
1306 entries: Vec::new(),
1307 file_mtimes: HashMap::new(),
1308 file_sizes: HashMap::new(),
1309 file_hashes: HashMap::new(),
1310 dimension,
1311 fingerprint: None,
1312 project_root,
1313 deferred_files: HashSet::new(),
1314 }
1315 }
1316
    /// Returns the number of embedded chunks currently held by the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1321
    /// Returns the number of files with recorded metadata (keys of `file_mtimes`).
    ///
    /// This can include files that produced no chunks.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1326
1327 pub fn status_label(&self) -> &'static str {
1329 if self.entries.is_empty() {
1330 "empty"
1331 } else {
1332 "ready"
1333 }
1334 }
1335
1336 fn collect_chunks(
1337 project_root: &Path,
1338 files: &[PathBuf],
1339 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1340 let collect_started = std::time::Instant::now();
1341 let per_file: Vec<(
1342 PathBuf,
1343 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1344 )> = files
1345 .par_iter()
1346 .map_init(HashMap::new, |parsers, file| {
1347 let result = collect_semantic_file(project_root, file, parsers);
1348 (file.clone(), result)
1349 })
1350 .collect();
1351
1352 let mut chunks: Vec<SemanticChunk> = Vec::new();
1353 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1354
1355 for (file, result) in per_file {
1356 match result {
1357 Ok((metadata, file_chunks)) => {
1358 file_metadata.insert(file, metadata);
1359 chunks.extend(file_chunks);
1360 }
1361 Err(error) => {
1362 if error == "unsupported file extension" {
1368 continue;
1369 }
1370 slog_warn!(
1371 "failed to collect semantic chunks for {}: {}",
1372 file.display(),
1373 error
1374 );
1375 }
1376 }
1377 }
1378
1379 slog_info!(
1380 "semantic collect: {} chunks from {} files in {} ms",
1381 chunks.len(),
1382 file_metadata.len(),
1383 collect_started.elapsed().as_millis()
1384 );
1385
1386 (chunks, file_metadata)
1387 }
1388
1389 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1390 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1391 let mut reuse_map: ChunkReuseMap = HashMap::new();
1392
1393 for entry in &self.entries {
1394 if !requested.contains(entry.chunk.file.as_path()) {
1395 continue;
1396 }
1397
1398 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1403 reuse_map
1404 .entry(entry.chunk.file.clone())
1405 .or_default()
1406 .entry(hash)
1407 .or_default()
1408 .push(ReusableEmbedding {
1409 embed_text: entry.chunk.embed_text.clone(),
1410 vector: entry.vector.clone(),
1411 });
1412 }
1413
1414 reuse_map
1415 }
1416
1417 fn reusable_vector_for_chunk(
1418 reuse_map: &ChunkReuseMap,
1419 chunk: &SemanticChunk,
1420 ) -> Option<Vec<f32>> {
1421 let hash = blake3::hash(chunk.embed_text.as_bytes());
1422 reuse_map
1423 .get(&chunk.file)?
1424 .get(&hash)?
1425 .iter()
1426 .find(|candidate| candidate.embed_text == chunk.embed_text)
1427 .map(|candidate| candidate.vector.clone())
1428 }
1429
1430 fn entries_for_chunks_with_reuse<F, P>(
1431 chunks: Vec<SemanticChunk>,
1432 reuse_map: &ChunkReuseMap,
1433 embed_fn: &mut F,
1434 max_batch_size: usize,
1435 initial_observed_dimension: Option<usize>,
1436 refresh_label: &str,
1437 progress: &mut P,
1438 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1439 where
1440 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1441 P: FnMut(usize, usize),
1442 {
1443 let total_chunks = chunks.len();
1444 progress(0, total_chunks);
1445
1446 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1447 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1448
1449 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1450 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1451 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1452 } else {
1453 misses.push((chunk_index, chunk));
1454 }
1455 }
1456
1457 let mut completed = total_chunks.saturating_sub(misses.len());
1458 if completed > 0 {
1459 progress(completed, total_chunks);
1460 }
1461
1462 let batch_size = max_batch_size.max(1);
1463 let mut observed_dimension = initial_observed_dimension;
1464
1465 for batch_start in (0..misses.len()).step_by(batch_size) {
1466 let batch_end = (batch_start + batch_size).min(misses.len());
1467 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1468 .iter()
1469 .map(|(_, chunk)| chunk.embed_text.clone())
1470 .collect();
1471
1472 let vectors = embed_fn(batch_texts)?;
1473 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1474
1475 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1476 match observed_dimension {
1477 None => observed_dimension = Some(dim),
1478 Some(expected) if dim != expected => {
1479 return Err(format!(
1480 "embedding dimension changed during {refresh_label}: \
1481 cached index uses {expected}, new vectors use {dim}"
1482 ));
1483 }
1484 _ => {}
1485 }
1486 }
1487
1488 for (i, vector) in vectors.into_iter().enumerate() {
1489 let (chunk_index, chunk) = misses[batch_start + i].clone();
1490 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1491 }
1492
1493 completed += batch_end - batch_start;
1494 progress(completed, total_chunks);
1495 }
1496
1497 let entries = entries_by_chunk
1498 .into_iter()
1499 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1500 .collect();
1501
1502 Ok((entries, observed_dimension))
1503 }
1504
1505 fn build_from_chunks<F, P>(
1506 project_root: &Path,
1507 chunks: Vec<SemanticChunk>,
1508 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1509 embed_fn: &mut F,
1510 max_batch_size: usize,
1511 mut progress: Option<&mut P>,
1512 ) -> Result<Self, String>
1513 where
1514 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1515 P: FnMut(usize, usize),
1516 {
1517 debug_assert!(project_root.is_absolute());
1518 let total_chunks = chunks.len();
1519
1520 if chunks.is_empty() {
1521 return Ok(Self {
1522 entries: Vec::new(),
1523 file_mtimes: file_metadata
1524 .iter()
1525 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1526 .collect(),
1527 file_sizes: file_metadata
1528 .iter()
1529 .map(|(path, metadata)| (path.clone(), metadata.size))
1530 .collect(),
1531 file_hashes: file_metadata
1532 .into_iter()
1533 .map(|(path, metadata)| (path, metadata.content_hash))
1534 .collect(),
1535 dimension: DEFAULT_DIMENSION,
1536 fingerprint: None,
1537 project_root: project_root.to_path_buf(),
1538 deferred_files: HashSet::new(),
1539 });
1540 }
1541
1542 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1544 let mut expected_dimension: Option<usize> = None;
1545 let batch_size = max_batch_size.max(1);
1546 let embed_started = std::time::Instant::now();
1547 let batch_count = total_chunks.div_ceil(batch_size);
1548 for batch_start in (0..chunks.len()).step_by(batch_size) {
1549 let batch_end = (batch_start + batch_size).min(chunks.len());
1550 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1551 .iter()
1552 .map(|c| c.embed_text.clone())
1553 .collect();
1554
1555 let vectors = embed_fn(batch_texts)?;
1556 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1557
1558 if let Some(dim) = vectors.first().map(|v| v.len()) {
1560 match expected_dimension {
1561 None => expected_dimension = Some(dim),
1562 Some(expected) if dim != expected => {
1563 return Err(format!(
1564 "embedding dimension changed across batches: expected {expected}, got {dim}"
1565 ));
1566 }
1567 _ => {}
1568 }
1569 }
1570
1571 for (i, vector) in vectors.into_iter().enumerate() {
1572 let chunk_idx = batch_start + i;
1573 entries.push(EmbeddingEntry {
1574 chunk: chunks[chunk_idx].clone(),
1575 vector,
1576 });
1577 }
1578
1579 if let Some(callback) = progress.as_mut() {
1580 callback(entries.len(), total_chunks);
1581 }
1582 }
1583
1584 let embed_ms = embed_started.elapsed().as_millis();
1585 let rate = (total_chunks as u128 * 1000)
1586 .checked_div(embed_ms)
1587 .unwrap_or(0) as u64;
1588 slog_info!(
1589 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1590 total_chunks,
1591 batch_count,
1592 embed_ms,
1593 rate
1594 );
1595
1596 let dimension = entries
1597 .first()
1598 .map(|e| e.vector.len())
1599 .unwrap_or(DEFAULT_DIMENSION);
1600
1601 Ok(Self {
1602 entries,
1603 file_mtimes: file_metadata
1604 .iter()
1605 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1606 .collect(),
1607 file_sizes: file_metadata
1608 .iter()
1609 .map(|(path, metadata)| (path.clone(), metadata.size))
1610 .collect(),
1611 file_hashes: file_metadata
1612 .into_iter()
1613 .map(|(path, metadata)| (path, metadata.content_hash))
1614 .collect(),
1615 dimension,
1616 fingerprint: None,
1617 project_root: project_root.to_path_buf(),
1618 deferred_files: HashSet::new(),
1619 })
1620 }
1621
1622 pub fn build<F>(
1625 project_root: &Path,
1626 files: &[PathBuf],
1627 embed_fn: &mut F,
1628 max_batch_size: usize,
1629 ) -> Result<Self, String>
1630 where
1631 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1632 {
1633 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634 Self::build_from_chunks(
1635 project_root,
1636 chunks,
1637 file_mtimes,
1638 embed_fn,
1639 max_batch_size,
1640 Option::<&mut fn(usize, usize)>::None,
1641 )
1642 }
1643
1644 pub fn build_with_progress<F, P>(
1646 project_root: &Path,
1647 files: &[PathBuf],
1648 embed_fn: &mut F,
1649 max_batch_size: usize,
1650 progress: &mut P,
1651 ) -> Result<Self, String>
1652 where
1653 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1654 P: FnMut(usize, usize),
1655 {
1656 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1657 let total_chunks = chunks.len();
1658 progress(0, total_chunks);
1659 Self::build_from_chunks(
1660 project_root,
1661 chunks,
1662 file_mtimes,
1663 embed_fn,
1664 max_batch_size,
1665 Some(progress),
1666 )
1667 }
1668
    /// Brings the index in line with `current_files`, re-embedding only what changed.
    ///
    /// Indexed paths are classified as deleted (no longer in `current_files`),
    /// changed (metadata incomplete, or the strict freshness check reports
    /// `Stale`/`Deleted`), or untouched. Paths in `current_files` that are not
    /// indexed yet count as added. Deleted files are removed; changed and added
    /// files are re-collected and embedded, with vectors reused for chunks of
    /// changed files whose embed text is identical.
    ///
    /// `progress(0, 0)` is reported when there is nothing to embed; otherwise
    /// progress comes from `entries_for_chunks_with_reuse`.
    ///
    /// The returned `changed`/`added` counts include only files whose chunk
    /// collection succeeded; `total_processed` is the size of the union of
    /// `current_files` and the previously indexed files.
    ///
    /// # Errors
    /// Propagates embedding errors. At that point deleted and vanished files
    /// have already been removed, but entries of the files being re-embedded
    /// are still the old ones.
    ///
    /// # Panics
    /// Panics if `verify_files_strict_bounded` does not return a verdict for
    /// every input it was given.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Deferred files that are no longer part of the project are forgotten.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // |current ∪ indexed| = |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Outcome of inspecting one previously indexed path.
        enum IndexedFileCheck {
            Deleted(PathBuf),
            MissingMetadata(PathBuf),
            Verified(PathBuf, FreshnessVerdict),
        }

        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        // One slot per indexed path; `None` slots are filled in by the batched
        // strict verification below.
        let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
        let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();

        for indexed_path in indexed_paths {
            let check_index = checks.len();
            if !current_set.contains(indexed_path.as_path()) {
                checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
                continue;
            }
            // Strict verification needs mtime, size, and hash; anything less
            // forces a re-embed.
            let cached = match (
                self.file_mtimes.get(&indexed_path),
                self.file_sizes.get(&indexed_path),
                self.file_hashes.get(&indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            if let Some(freshness) = cached {
                strict_verify_inputs.push((check_index, indexed_path, freshness));
                checks.push(None);
            } else {
                checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
            }
        }

        for (check_index, path, verdict) in
            cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
        {
            checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
        }

        for check in checks {
            match check.expect("strict freshness check should be populated") {
                IndexedFileCheck::Deleted(path) => deleted.push(path),
                IndexedFileCheck::MissingMetadata(path) => changed.push(path),
                IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
                // Same content under new mtime/size: refresh the stored
                // metadata, keep the embeddings.
                IndexedFileCheck::Verified(
                    path,
                    FreshnessVerdict::ContentFresh {
                        new_mtime,
                        new_size,
                    },
                ) => {
                    self.file_mtimes.insert(path.clone(), new_mtime);
                    self.file_sizes.insert(path, new_size);
                }
                IndexedFileCheck::Verified(
                    path,
                    FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
                ) => {
                    changed.push(path);
                }
            }
        }

        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Nothing deleted, changed, or added: report a no-op.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        // Only deletions happened.
        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // Snapshot reusable vectors before any entries of changed files are replaced.
        let reuse_map = self.build_chunk_reuse_map(&changed);
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        // Changed files that failed collection and no longer exist on disk are
        // treated as deleted.
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        // Files were collected but produced no chunks: drop their old entries
        // and record the fresh metadata without calling the embedder.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // An index without entries places no constraint on the vector dimension.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            existing_dimension,
            "incremental refresh",
            progress,
        )?;

        // Embedding succeeded: replace the old entries of every successfully
        // collected file with the new ones.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1911
    /// Re-indexes an explicit list of invalidated `paths`.
    ///
    /// Previously deferred files that still exist on disk are added to the
    /// request. Every requested path is first removed from the index; paths that
    /// still exist are then re-collected and embedded, reusing vectors for
    /// chunks whose embed text is unchanged. Files that were not indexed before
    /// are admitted only while the indexed-file count stays within `max_files`;
    /// the rest are remembered in `deferred_files` for a later call.
    ///
    /// The result carries the new entries, the fresh per-file metadata, and all
    /// requested paths as `completed_paths`. In the summary, `deleted` counts
    /// previously indexed paths that no longer exist and `total_processed` is
    /// the number of de-duplicated requested paths.
    ///
    /// # Errors
    /// Propagates embedding errors. The requested paths have already been
    /// removed from the index at that point.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Deferred files that disappeared from disk are no longer retried.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        // Sort so that `dedup` removes every duplicate, not just adjacent ones.
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Capture what was indexed, and the reusable vectors, before the
        // removal below erases both.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();
        let reuse_map = self.build_chunk_reuse_map(&requested_paths);

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        // Every requested path is gone from disk: nothing to collect.
        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the indexed-file cap. Files that stay in the index and
        // re-collected, previously indexed files always count; only brand-new
        // files compete for the remaining room.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // New files are admitted in the sorted order of the request.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        // Files were collected but produced no chunks: record the metadata
        // without calling the embedder.
        if chunks.is_empty() {
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // The stored dimension stays binding unless the index has no entries
        // and none of the requested paths had been indexed before.
        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
        {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            initial_observed_dimension,
            "invalidated-file refresh",
            progress,
        )?;

        // One copy goes into this index, the other is handed back to the caller.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
2122
2123 pub fn apply_refresh_update(
2124 &mut self,
2125 added_entries: Vec<EmbeddingEntry>,
2126 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2127 completed_paths: &[PathBuf],
2128 ) {
2129 self.remove_indexed_files(completed_paths);
2133
2134 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2135 self.entries.extend(added_entries);
2136 for (file, freshness) in updated_metadata {
2137 self.file_mtimes.insert(file.clone(), freshness.mtime);
2138 self.file_sizes.insert(file.clone(), freshness.size);
2139 self.file_hashes.insert(file, freshness.content_hash);
2140 }
2141 if let Some(dim) = observed_dimension {
2142 self.dimension = dim;
2143 }
2144 }
2145
2146 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2147 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2148 self.entries
2149 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2150 for path in files {
2151 self.file_mtimes.remove(path);
2152 self.file_sizes.remove(path);
2153 self.file_hashes.remove(path);
2154 }
2155 }
2156
2157 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2159 if self.entries.is_empty() || query_vector.len() != self.dimension {
2160 return Vec::new();
2161 }
2162
2163 let mut scored: Vec<(f32, usize)> = self
2164 .entries
2165 .iter()
2166 .enumerate()
2167 .map(|(i, entry)| {
2168 let mut score = cosine_similarity(query_vector, &entry.vector);
2169 if entry.chunk.exported {
2170 score *= 1.1;
2171 }
2172 (score, i)
2173 })
2174 .collect();
2175
2176 let keep = top_k.min(scored.len());
2177 if keep == 0 {
2178 return Vec::new();
2179 }
2180
2181 if keep < scored.len() {
2182 scored.select_nth_unstable_by(keep, semantic_score_order);
2183 scored.truncate(keep);
2184 }
2185 scored.sort_by(semantic_score_order);
2186
2187 scored
2188 .into_iter()
2189 .map(|(score, idx)| {
2193 let entry = &self.entries[idx];
2194 SemanticResult {
2195 file: entry.chunk.file.clone(),
2196 name: entry.chunk.name.clone(),
2197 qualified_name: entry.chunk.qualified_name.clone(),
2198 kind: entry.chunk.kind.clone(),
2199 start_line: entry.chunk.start_line,
2200 end_line: entry.chunk.end_line,
2201 exported: entry.chunk.exported,
2202 snippet: entry.chunk.snippet.clone(),
2203 score,
2204 rank_score: score,
2205 cap_protected: false,
2206 source: "semantic",
2207 }
2208 })
2209 .collect()
2210 }
2211
    /// Returns the number of embedding entries (the same value as `entry_count`).
    pub fn len(&self) -> usize {
        self.entries.len()
    }
2216
2217 pub fn is_file_stale(&self, file: &Path) -> bool {
2219 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2220 return true;
2221 };
2222 let Some(stored_size) = self.file_sizes.get(file) else {
2223 return true;
2224 };
2225 let Some(stored_hash) = self.file_hashes.get(file) else {
2226 return true;
2227 };
2228 let cached = FileFreshness {
2229 mtime: *stored_mtime,
2230 size: *stored_size,
2231 content_hash: *stored_hash,
2232 };
2233 match cache_freshness::verify_file_strict(file, &cached) {
2234 FreshnessVerdict::HotFresh => false,
2235 FreshnessVerdict::ContentFresh { .. } => false,
2236 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2237 }
2238 }
2239
2240 fn backfill_missing_file_sizes(&mut self) {
2241 for path in self.file_mtimes.keys() {
2242 if self.file_sizes.contains_key(path) {
2243 continue;
2244 }
2245 if let Ok(metadata) = fs::metadata(path) {
2246 self.file_sizes.insert(path.clone(), metadata.len());
2247 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2248 self.file_hashes.insert(path.clone(), hash);
2249 }
2250 }
2251 }
2252 }
2253
    /// Removes `file` from the index; this simply delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
2258
2259 pub fn invalidate_file(&mut self, file: &Path) {
2260 let canonical_file = canonicalize_existing_or_deleted_path(file);
2261 self.entries
2262 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2263 self.file_mtimes.remove(file);
2264 self.file_sizes.remove(file);
2265 self.file_hashes.remove(file);
2266 if canonical_file.as_path() != file {
2267 self.file_mtimes.remove(&canonical_file);
2268 self.file_sizes.remove(&canonical_file);
2269 self.file_hashes.remove(&canonical_file);
2270 }
2271 }
2272
    /// Returns the vector length this index expects from query vectors.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2277
    /// Returns the stored fingerprint, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2281
    /// Returns the fingerprint's `backend` string, or `None` without a fingerprint.
    pub fn backend_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.backend.as_str())
    }
2285
    /// Returns the fingerprint's `model` string, or `None` without a fingerprint.
    pub fn model_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.model.as_str())
    }
2289
    /// Stores `fingerprint`, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2293
2294 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2296 if self.entries.is_empty() {
2299 slog_info!("skipping semantic index persistence (0 entries)");
2300 return;
2301 }
2302 let dir = storage_dir.join("semantic").join(project_key);
2303 if let Err(e) = fs::create_dir_all(&dir) {
2304 slog_warn!("failed to create semantic cache dir: {}", e);
2305 return;
2306 }
2307 let data_path = dir.join("semantic.bin");
2308 let tmp_path = dir.join(format!(
2309 "semantic.bin.tmp.{}.{}",
2310 std::process::id(),
2311 SystemTime::now()
2312 .duration_since(SystemTime::UNIX_EPOCH)
2313 .unwrap_or(Duration::ZERO)
2314 .as_nanos()
2315 ));
2316 let write_result = (|| -> io::Result<usize> {
2317 let file = fs::File::create(&tmp_path)?;
2318 let mut writer = BufWriter::new(file);
2319 let bytes_written = self.write_to_writer(&mut writer)?;
2320 writer.flush()?;
2321 writer.get_ref().sync_all()?;
2322 Ok(bytes_written)
2323 })();
2324 let bytes_written = match write_result {
2325 Ok(bytes_written) => bytes_written,
2326 Err(e) => {
2327 slog_warn!("failed to write semantic index: {}", e);
2328 let _ = fs::remove_file(&tmp_path);
2329 return;
2330 }
2331 };
2332 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2333 slog_warn!("failed to rename semantic index: {}", e);
2334 let _ = fs::remove_file(&tmp_path);
2335 return;
2336 }
2337 slog_info!(
2338 "semantic index persisted: {} entries, {:.1} KB",
2339 self.entries.len(),
2340 bytes_written as f64 / 1024.0
2341 );
2342 }
2343
2344 pub fn read_from_disk(
2346 storage_dir: &Path,
2347 project_key: &str,
2348 current_canonical_root: &Path,
2349 is_worktree_bridge: bool,
2350 expected_fingerprint: Option<&str>,
2351 ) -> Option<Self> {
2352 debug_assert!(current_canonical_root.is_absolute());
2353 let data_path = storage_dir
2354 .join("semantic")
2355 .join(project_key)
2356 .join("semantic.bin");
2357 let file = fs::File::open(&data_path).ok()?;
2358 let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2359 if file_len < HEADER_BYTES_V1 {
2360 slog_warn!(
2361 "corrupt semantic index (too small: {} bytes), removing",
2362 file_len
2363 );
2364 if !is_worktree_bridge {
2365 let _ = fs::remove_file(&data_path);
2366 }
2367 return None;
2368 }
2369
2370 let mut reader = BufReader::new(file);
2371 let mut version_buf = [0u8; 1];
2372 reader.read_exact(&mut version_buf).ok()?;
2373 let version = version_buf[0];
2374 if version != SEMANTIC_INDEX_VERSION_V6 && version != SEMANTIC_INDEX_VERSION_V7 {
2375 slog_info!(
2376 "cached semantic index version {} is not compatible with {}, rebuilding",
2377 version,
2378 SEMANTIC_INDEX_VERSION_V7
2379 );
2380 if !is_worktree_bridge {
2381 let _ = fs::remove_file(&data_path);
2382 }
2383 return None;
2384 }
2385 match Self::from_reader_after_version(
2386 reader,
2387 version,
2388 current_canonical_root,
2389 Some(file_len),
2390 1,
2391 ) {
2392 Ok(index) => {
2393 if index.entries.is_empty() {
2394 slog_info!("cached semantic index is empty, will rebuild");
2395 if !is_worktree_bridge {
2396 let _ = fs::remove_file(&data_path);
2397 }
2398 return None;
2399 }
2400 if let Some(expected) = expected_fingerprint {
2401 let matches = index
2402 .fingerprint()
2403 .map(|fingerprint| fingerprint.matches_expected(expected))
2404 .unwrap_or(false);
2405 if !matches {
2406 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2407 if !is_worktree_bridge {
2408 let _ = fs::remove_file(&data_path);
2409 }
2410 return None;
2411 }
2412 }
2413 slog_info!(
2414 "loaded semantic index from disk: {} entries",
2415 index.entries.len()
2416 );
2417 Some(index)
2418 }
2419 Err(e) => {
2420 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2421 if !is_worktree_bridge {
2422 let _ = fs::remove_file(&data_path);
2423 }
2424 None
2425 }
2426 }
2427 }
2428
2429 pub fn to_bytes(&self) -> Vec<u8> {
2431 let mut buf = Vec::new();
2432 self.write_to_writer(&mut buf)
2433 .expect("writing semantic index to Vec cannot fail");
2434 buf
2435 }
2436
2437 fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2438 let mut bytes_written = 0usize;
2439 let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2440 let encoded = fingerprint.as_string();
2441 if encoded.is_empty() {
2442 None
2443 } else {
2444 Some(encoded)
2445 }
2446 });
2447 let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2448 let file_mtime_count = self
2449 .file_mtimes
2450 .iter()
2451 .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2452 .count();
2453 let entry_count = self
2454 .entries
2455 .iter()
2456 .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2457 .count();
2458
2459 let version = SEMANTIC_INDEX_VERSION_V7;
2474 write_counted(writer, &[version], &mut bytes_written)?;
2475 write_counted(
2476 writer,
2477 &(self.dimension as u32).to_le_bytes(),
2478 &mut bytes_written,
2479 )?;
2480 write_counted(
2481 writer,
2482 &(entry_count as u32).to_le_bytes(),
2483 &mut bytes_written,
2484 )?;
2485 write_counted(
2486 writer,
2487 &(fp_bytes_ref.len() as u32).to_le_bytes(),
2488 &mut bytes_written,
2489 )?;
2490 write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2491
2492 write_counted(
2495 writer,
2496 &(file_mtime_count as u32).to_le_bytes(),
2497 &mut bytes_written,
2498 )?;
2499 for (path, mtime) in &self.file_mtimes {
2500 let Some(relative) = cache_relative_path(&self.project_root, path) else {
2501 continue;
2502 };
2503 let relative = relative.to_string_lossy();
2504 let path_bytes = relative.as_bytes();
2505 write_counted(
2506 writer,
2507 &(path_bytes.len() as u32).to_le_bytes(),
2508 &mut bytes_written,
2509 )?;
2510 write_counted(writer, path_bytes, &mut bytes_written)?;
2511 let duration = mtime
2512 .duration_since(SystemTime::UNIX_EPOCH)
2513 .unwrap_or_default();
2514 write_counted(
2515 writer,
2516 &duration.as_secs().to_le_bytes(),
2517 &mut bytes_written,
2518 )?;
2519 write_counted(
2520 writer,
2521 &duration.subsec_nanos().to_le_bytes(),
2522 &mut bytes_written,
2523 )?;
2524 let size = self.file_sizes.get(path).copied().unwrap_or_default();
2525 write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2526 let hash = self
2527 .file_hashes
2528 .get(path)
2529 .copied()
2530 .unwrap_or_else(cache_freshness::zero_hash);
2531 write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2532 }
2533
2534 for entry in &self.entries {
2536 let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2537 continue;
2538 };
2539 let c = &entry.chunk;
2540
2541 let relative = relative.to_string_lossy();
2543 let file_bytes = relative.as_bytes();
2544 write_counted(
2545 writer,
2546 &(file_bytes.len() as u32).to_le_bytes(),
2547 &mut bytes_written,
2548 )?;
2549 write_counted(writer, file_bytes, &mut bytes_written)?;
2550
2551 let name_bytes = c.name.as_bytes();
2553 write_counted(
2554 writer,
2555 &(name_bytes.len() as u32).to_le_bytes(),
2556 &mut bytes_written,
2557 )?;
2558 write_counted(writer, name_bytes, &mut bytes_written)?;
2559
2560 let qualified_name_bytes = c.qualified_name.as_deref().unwrap_or_default().as_bytes();
2562 write_counted(
2563 writer,
2564 &(qualified_name_bytes.len() as u32).to_le_bytes(),
2565 &mut bytes_written,
2566 )?;
2567 write_counted(writer, qualified_name_bytes, &mut bytes_written)?;
2568
2569 write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2571
2572 write_counted(
2574 writer,
2575 &(c.start_line as u32).to_le_bytes(),
2576 &mut bytes_written,
2577 )?;
2578 write_counted(
2579 writer,
2580 &(c.end_line as u32).to_le_bytes(),
2581 &mut bytes_written,
2582 )?;
2583 write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2584
2585 let snippet_bytes = c.snippet.as_bytes();
2587 write_counted(
2588 writer,
2589 &(snippet_bytes.len() as u32).to_le_bytes(),
2590 &mut bytes_written,
2591 )?;
2592 write_counted(writer, snippet_bytes, &mut bytes_written)?;
2593
2594 let embed_bytes = c.embed_text.as_bytes();
2596 write_counted(
2597 writer,
2598 &(embed_bytes.len() as u32).to_le_bytes(),
2599 &mut bytes_written,
2600 )?;
2601 write_counted(writer, embed_bytes, &mut bytes_written)?;
2602
2603 for &val in &entry.vector {
2605 write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2606 }
2607 }
2608
2609 Ok(bytes_written)
2610 }
2611
2612 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2614 debug_assert!(current_canonical_root.is_absolute());
2615 if data.len() < HEADER_BYTES_V1 {
2616 return Err("data too short".to_string());
2617 }
2618
2619 Self::from_reader_after_version(
2620 Cursor::new(&data[1..]),
2621 data[0],
2622 current_canonical_root,
2623 Some(data.len()),
2624 1,
2625 )
2626 }
2627
/// Decodes a semantic index from `reader`, positioned just after the version byte.
///
/// * `version` - format version already read by the caller; must be one of v1..=v7.
/// * `current_canonical_root` - becomes the decoded index's `project_root`; for v6/v7 every
///   stored path is passed through `cached_path_under_root` with this root.
/// * `total_len` - total size of the encoded data when known; used to reject length fields
///   that claim more bytes than can remain before buffers are allocated.
/// * `bytes_read` - number of bytes the caller has already consumed (seeds the counter).
///
/// Version-gated fields, as read below: fingerprint (v2+), mtime nanoseconds (v3+),
/// file size (v5+), content hash (v6+), qualified name (v7 only).
///
/// # Errors
///
/// Returns a descriptive `String` for an unsupported version, truncated data, an invalid
/// dimension, counts above `MAX_ENTRIES`, an unparsable fingerprint, an invalid mtime,
/// a v6/v7 path rejected by `cached_path_under_root`, or an entry whose file has no
/// metadata record.
fn from_reader_after_version<R: Read>(
    reader: R,
    version: u8,
    current_canonical_root: &Path,
    total_len: Option<usize>,
    bytes_read: usize,
) -> Result<Self, String> {
    debug_assert!(current_canonical_root.is_absolute());
    let mut reader = CountingReader::with_bytes_read(reader, bytes_read);

    if version != SEMANTIC_INDEX_VERSION_V1
        && version != SEMANTIC_INDEX_VERSION_V2
        && version != SEMANTIC_INDEX_VERSION_V3
        && version != SEMANTIC_INDEX_VERSION_V4
        && version != SEMANTIC_INDEX_VERSION_V5
        && version != SEMANTIC_INDEX_VERSION_V6
        && version != SEMANTIC_INDEX_VERSION_V7
    {
        return Err(format!("unsupported version: {}", version));
    }
    // v2+ headers carry an extra u32 (the fingerprint length), so they need more bytes.
    if (version == SEMANTIC_INDEX_VERSION_V2
        || version == SEMANTIC_INDEX_VERSION_V3
        || version == SEMANTIC_INDEX_VERSION_V4
        || version == SEMANTIC_INDEX_VERSION_V5
        || version == SEMANTIC_INDEX_VERSION_V6
        || version == SEMANTIC_INDEX_VERSION_V7)
        && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
    {
        return Err("data too short for semantic index v2/v3/v4/v5/v6/v7 header".to_string());
    }

    let dimension = read_u32_stream(&mut reader)? as usize;
    let entry_count = read_u32_stream(&mut reader)? as usize;
    validate_embedding_dimension(dimension)?;
    if entry_count > MAX_ENTRIES {
        return Err(format!("too many semantic index entries: {}", entry_count));
    }

    let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
        || version == SEMANTIC_INDEX_VERSION_V3
        || version == SEMANTIC_INDEX_VERSION_V4
        || version == SEMANTIC_INDEX_VERSION_V5
        || version == SEMANTIC_INDEX_VERSION_V6
        || version == SEMANTIC_INDEX_VERSION_V7;
    let fingerprint = if has_fingerprint_field {
        let fingerprint_len = read_u32_stream(&mut reader)? as usize;
        // Reject a length that runs past the known end before allocating for it.
        if total_len
            .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
        {
            return Err("unexpected end of data reading fingerprint".to_string());
        }
        if fingerprint_len == 0 {
            None
        } else {
            let mut raw = vec![0u8; fingerprint_len];
            read_exact_stream(
                &mut reader,
                &mut raw,
                "unexpected end of data reading fingerprint",
            )?;
            // The fingerprint is stored as JSON text; invalid UTF-8 is replaced lossily
            // before parsing.
            let raw = String::from_utf8_lossy(&raw).to_string();
            Some(
                serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                    .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
            )
        }
    } else {
        None
    };

    let mtime_count = read_u32_stream(&mut reader)? as usize;
    if mtime_count > MAX_ENTRIES {
        return Err(format!("too many semantic file mtimes: {}", mtime_count));
    }

    // The vectors alone need entry_count * dimension * 4 bytes; if that cannot fit in the
    // remaining data the header is bogus, so bail out before reserving any capacity.
    let vector_bytes = entry_count
        .checked_mul(dimension)
        .and_then(|count| count.checked_mul(F32_BYTES))
        .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
    if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
        return Err("semantic index vectors exceed available data".to_string());
    }

    let mut file_mtimes = HashMap::with_capacity(mtime_count);
    let mut file_sizes = HashMap::with_capacity(mtime_count);
    let mut file_hashes = HashMap::with_capacity(mtime_count);
    for _ in 0..mtime_count {
        let path = read_string_stream(&mut reader, total_len)?;
        let secs = read_u64_stream(&mut reader)?;
        // Sub-second precision exists from v3 on; older records decode as 0 nanoseconds.
        let nanos = if version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6
            || version == SEMANTIC_INDEX_VERSION_V7
        {
            read_u32_stream(&mut reader)?
        } else {
            0
        };
        // File size exists from v5 on; older records decode as 0.
        let size = if version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6
            || version == SEMANTIC_INDEX_VERSION_V7
        {
            read_u64_stream(&mut reader)?
        } else {
            0
        };
        // A 32-byte content hash exists from v6 on; older records get the zero hash.
        let content_hash =
            if version == SEMANTIC_INDEX_VERSION_V6 || version == SEMANTIC_INDEX_VERSION_V7 {
                let mut hash_bytes = [0u8; 32];
                read_exact_stream(
                    &mut reader,
                    &mut hash_bytes,
                    "unexpected end of data reading content hash",
                )?;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
        // Checked here so an out-of-range value surfaces as a decode error.
        if nanos >= 1_000_000_000 {
            return Err(format!(
                "invalid semantic mtime: nanos {} >= 1_000_000_000",
                nanos
            ));
        }
        let duration = std::time::Duration::new(secs, nanos);
        let mtime = SystemTime::UNIX_EPOCH
            .checked_add(duration)
            .ok_or_else(|| {
                format!(
                    "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                    secs, nanos
                )
            })?;
        // v6/v7 paths go through `cached_path_under_root`; a `None` from it is reported
        // as the path escaping the project root. Older versions use the path as stored.
        let path = if version == SEMANTIC_INDEX_VERSION_V6
            || version == SEMANTIC_INDEX_VERSION_V7
        {
            cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
        } else {
            PathBuf::from(path)
        };
        file_mtimes.insert(path.clone(), mtime);
        file_sizes.insert(path.clone(), size);
        file_hashes.insert(path, content_hash);
    }

    let mut entries = Vec::with_capacity(entry_count);
    for _ in 0..entry_count {
        let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
        // Same path handling as for the metadata records above.
        let file = if version == SEMANTIC_INDEX_VERSION_V6
            || version == SEMANTIC_INDEX_VERSION_V7
        {
            cached_path_under_root(current_canonical_root, &raw_file)
                .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
        } else {
            raw_file
        };
        let name = read_string_stream(&mut reader, total_len)?;
        // Only v7 stores a qualified name; an empty string decodes as `None`.
        let qualified_name = if version == SEMANTIC_INDEX_VERSION_V7 {
            let qualified_name = read_string_stream(&mut reader, total_len)?;
            if qualified_name.is_empty() {
                None
            } else {
                Some(qualified_name)
            }
        } else {
            None
        };

        let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);

        let start_line = read_u32_stream(&mut reader)?;
        let end_line = read_u32_stream(&mut reader)?;

        let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;

        let snippet = read_string_stream(&mut reader, total_len)?;
        let embed_text = read_string_stream(&mut reader, total_len)?;

        // Every entry stores exactly `dimension` f32 components.
        let vec_bytes = dimension
            .checked_mul(F32_BYTES)
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
            return Err("unexpected end of data reading vector".to_string());
        }
        let mut vector = Vec::with_capacity(dimension);
        for _ in 0..dimension {
            let mut bytes = [0u8; F32_BYTES];
            read_exact_stream(
                &mut reader,
                &mut bytes,
                "unexpected end of data reading vector",
            )?;
            vector.push(f32::from_le_bytes(bytes));
        }

        entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file,
                name,
                qualified_name,
                kind,
                start_line,
                end_line,
                exported,
                embed_text,
                snippet,
            },
            vector,
        });
    }

    // Sanity check on the decoded count against the header.
    if entries.len() != entry_count {
        return Err(format!(
            "semantic cache entry count drift: header={} decoded={}",
            entry_count,
            entries.len()
        ));
    }
    // Every entry must refer to a file that has a metadata record.
    for entry in &entries {
        if !file_mtimes.contains_key(&entry.chunk.file) {
            return Err(format!(
                "semantic cache metadata missing for entry file {}",
                entry.chunk.file.display()
            ));
        }
    }

    Ok(Self {
        entries,
        file_mtimes,
        file_sizes,
        file_hashes,
        dimension,
        fingerprint,
        project_root: current_canonical_root.to_path_buf(),
        deferred_files: HashSet::new(),
    })
}
2890}
2891
/// Writes all of `bytes` to `writer` and, on success, adds their length to
/// `bytes_written` (saturating at `usize::MAX`).
///
/// # Errors
///
/// Propagates the error from `write_all`; the counter is left untouched in that case.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2901
/// `Read` adapter that keeps a running total of the bytes pulled through it.
struct CountingReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Bytes consumed so far, including the initial offset given at construction.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the counter at `bytes_read`.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Returns the current byte total.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Delegates to the inner reader and adds the number of bytes it returned to the total
    /// (saturating). Errors pass through without touching the counter.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(count) => {
                self.bytes_read = self.bytes_read.saturating_add(count);
                Ok(count)
            }
            Err(error) => Err(error),
        }
    }
}
2924
2925fn read_exact_stream<R: Read>(
2926 reader: &mut CountingReader<R>,
2927 buf: &mut [u8],
2928 eof_message: &'static str,
2929) -> Result<(), String> {
2930 reader.read_exact(buf).map_err(|error| {
2931 if error.kind() == io::ErrorKind::UnexpectedEof {
2932 eof_message.to_string()
2933 } else {
2934 format!("{eof_message}: {error}")
2935 }
2936 })
2937}
2938
2939fn read_u8_stream<R: Read>(
2940 reader: &mut CountingReader<R>,
2941 eof_message: &'static str,
2942) -> Result<u8, String> {
2943 let mut bytes = [0u8; 1];
2944 read_exact_stream(reader, &mut bytes, eof_message)?;
2945 Ok(bytes[0])
2946}
2947
2948fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2949 let mut bytes = [0u8; 4];
2950 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2951 Ok(u32::from_le_bytes(bytes))
2952}
2953
2954fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2955 let mut bytes = [0u8; 8];
2956 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2957 Ok(u64::from_le_bytes(bytes))
2958}
2959
2960fn read_string_stream<R: Read>(
2961 reader: &mut CountingReader<R>,
2962 total_len: Option<usize>,
2963) -> Result<String, String> {
2964 let len = read_u32_stream(reader)? as usize;
2965 if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2966 return Err("unexpected end of data reading string".to_string());
2967 }
2968 let mut bytes = vec![0u8; len];
2969 read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2970 Ok(String::from_utf8_lossy(&bytes).to_string())
2971}
2972
/// Borrowed view of a source text split into lines, together with the byte offset at which
/// each line begins.
struct SourceLineCache<'a> {
    /// Lines as yielded by `str::lines` (line terminators excluded).
    lines: Vec<&'a str>,
    /// Byte offset into the source where the line at the same index starts.
    line_starts: Vec<usize>,
}

impl<'a> SourceLineCache<'a> {
    /// Splits `source` into lines and records where each one starts.
    fn new(source: &'a str) -> Self {
        let lines: Vec<&'a str> = source.lines().collect();
        let raw = source.as_bytes();
        let mut line_starts = Vec::with_capacity(lines.len());
        let mut cursor = 0usize;
        for line in lines.iter() {
            line_starts.push(cursor);
            cursor += line.len();
            // Step over the terminator that `lines` stripped: "\r\n", "\n", or nothing
            // for a final line without a trailing newline.
            let rest = raw.get(cursor..).unwrap_or(&[]);
            cursor += if rest.starts_with(b"\r\n") {
                2
            } else if rest.starts_with(b"\n") {
                1
            } else {
                0
            };
        }
        SourceLineCache { lines, line_starts }
    }

    /// Number of lines in the cache.
    fn len(&self) -> usize {
        debug_assert_eq!(self.lines.len(), self.line_starts.len());
        self.line_starts.len()
    }
}
3001
3002fn build_embed_text_with_lines(
3004 symbol: &Symbol,
3005 line_cache: &SourceLineCache<'_>,
3006 file: &Path,
3007 project_root: &Path,
3008) -> String {
3009 let relative = file
3010 .strip_prefix(project_root)
3011 .unwrap_or(file)
3012 .to_string_lossy();
3013
3014 let kind_label = match &symbol.kind {
3015 SymbolKind::Function => "function",
3016 SymbolKind::Class => "class",
3017 SymbolKind::Method => "method",
3018 SymbolKind::Struct => "struct",
3019 SymbolKind::Interface => "interface",
3020 SymbolKind::Enum => "enum",
3021 SymbolKind::TypeAlias => "type",
3022 SymbolKind::Variable => "variable",
3023 SymbolKind::Heading => "heading",
3024 SymbolKind::FileSummary => "file-summary",
3025 };
3026
3027 let name = &symbol.name;
3029 let mut text = format!(
3030 "name:{name} file:{} kind:{} name:{name}",
3031 relative, kind_label
3032 );
3033
3034 if let Some(sig) = &symbol.signature {
3035 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
3043 }
3044
3045 let start = (symbol.range.start_line as usize).min(line_cache.len());
3047 let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3049 if start < end {
3050 let body: String = line_cache.lines[start..end]
3051 .iter()
3052 .take(15) .copied()
3054 .collect::<Vec<&str>>()
3055 .join("\n");
3056 let snippet = if body.len() > 300 {
3057 format!("{}...", &body[..body.floor_char_boundary(300)])
3058 } else {
3059 body
3060 };
3061 text.push_str(&format!(" body:{}", snippet));
3062 }
3063
3064 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
3069}
3070
/// Test-only convenience: builds the embed text for `symbol` directly from `source`.
#[cfg(test)]
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    build_embed_text_with_lines(symbol, &SourceLineCache::new(source), file, project_root)
}
3076
/// Maximum number of `char`s kept in a chunk's embed text; longer text is cut off at this
/// length by `truncate_chars`.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
3081
/// Returns the first `max_chars` chars of `value` (the whole string when it is shorter).
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte index of the char at position `max_chars` is exactly where to cut.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
3085
3086fn first_leading_doc_comment(line_cache: &SourceLineCache<'_>) -> String {
3087 let Some((start, first)) = line_cache
3088 .lines
3089 .iter()
3090 .enumerate()
3091 .find(|(_, line)| !line.trim().is_empty())
3092 else {
3093 return String::new();
3094 };
3095
3096 let trimmed = first.trim_start();
3097 if trimmed.starts_with("/**") {
3098 let mut comment = Vec::new();
3099 for line in line_cache.lines.iter().skip(start) {
3100 comment.push(*line);
3101 if line.contains("*/") {
3102 break;
3103 }
3104 }
3105 return truncate_chars(&comment.join("\n"), 200);
3106 }
3107
3108 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3109 let comment = line_cache
3110 .lines
3111 .iter()
3112 .skip(start)
3113 .take_while(|line| {
3114 let trimmed = line.trim_start();
3115 trimmed.starts_with("///") || trimmed.starts_with("//!")
3116 })
3117 .copied()
3118 .collect::<Vec<_>>()
3119 .join("\n");
3120 return truncate_chars(&comment, 200);
3121 }
3122
3123 String::new()
3124}
3125
3126pub fn build_file_summary_chunk(
3127 file: &Path,
3128 project_root: &Path,
3129 source: &str,
3130 top_exports: &[&str],
3131 top_export_signatures: &[Option<&str>],
3132) -> SemanticChunk {
3133 let line_cache = SourceLineCache::new(source);
3134 build_file_summary_chunk_with_lines(
3135 file,
3136 project_root,
3137 &line_cache,
3138 top_exports,
3139 top_export_signatures,
3140 )
3141}
3142
3143fn build_file_summary_chunk_with_lines(
3144 file: &Path,
3145 project_root: &Path,
3146 line_cache: &SourceLineCache<'_>,
3147 top_exports: &[&str],
3148 top_export_signatures: &[Option<&str>],
3149) -> SemanticChunk {
3150 let relative = file.strip_prefix(project_root).unwrap_or(file);
3151 let rel_path = relative.to_string_lossy();
3152 let parent_dir = relative
3153 .parent()
3154 .map(|parent| parent.to_string_lossy().to_string())
3155 .unwrap_or_default();
3156 let name = file
3157 .file_stem()
3158 .map(|stem| stem.to_string_lossy().to_string())
3159 .unwrap_or_default();
3160 let doc = first_leading_doc_comment(line_cache);
3161 let exports = top_exports
3162 .iter()
3163 .take(5)
3164 .copied()
3165 .collect::<Vec<_>>()
3166 .join(",");
3167 let snippet = if doc.is_empty() {
3168 top_export_signatures
3169 .first()
3170 .and_then(|signature| signature.as_deref())
3171 .map(|signature| truncate_chars(signature, 200))
3172 .unwrap_or_default()
3173 } else {
3174 doc.clone()
3175 };
3176
3177 SemanticChunk {
3178 file: file.to_path_buf(),
3179 name,
3180 qualified_name: None,
3181 kind: SymbolKind::FileSummary,
3182 start_line: 0,
3183 end_line: 0,
3184 exported: false,
3185 embed_text: truncate_chars(
3186 &format!(
3187 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3188 file.file_stem()
3189 .map(|stem| stem.to_string_lossy().to_string())
3190 .unwrap_or_default()
3191 ),
3192 MAX_EMBED_TEXT_CHARS,
3193 ),
3194 snippet,
3195 }
3196}
3197
3198fn parser_for(
3199 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3200 lang: crate::parser::LangId,
3201) -> Result<&mut Parser, String> {
3202 use std::collections::hash_map::Entry;
3203
3204 match parsers.entry(lang) {
3205 Entry::Occupied(entry) => Ok(entry.into_mut()),
3206 Entry::Vacant(entry) => {
3207 let grammar = grammar_for(lang);
3208 let mut parser = Parser::new();
3209 parser
3210 .set_language(&grammar)
3211 .map_err(|error| error.to_string())?;
3212 Ok(entry.insert(parser))
3213 }
3214 }
3215}
3216
/// Reports whether `path` has a file extension that is eligible for semantic indexing.
///
/// The comparison is case-sensitive (`r` and `R` are both listed explicitly). Paths
/// without an extension, or whose extension is not valid UTF-8, are not eligible.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr", "java", "kt", "kts", "rb", "swift", "scala", "sc",
        "lua", "pl", "pm", "t", "r", "R",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
3267
/// Canonicalizes `path`, tolerating a final component that cannot be resolved.
///
/// If `path` itself canonicalizes, that result is returned. Otherwise the parent directory
/// is canonicalized and the original file name is appended. If that is not possible either
/// (no parent, no file name, or the parent does not canonicalize), `path` is returned as is.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => fs::canonicalize(parent)
            .map(|canonical_parent| canonical_parent.join(file_name))
            .unwrap_or_else(|_| path.to_path_buf()),
        _ => path.to_path_buf(),
    })
}
3284
/// Size limit (4 MiB) above which a file's contents are not read for semantic indexing;
/// such files produce no chunks.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3295
3296fn collect_semantic_file(
3297 project_root: &Path,
3298 file: &Path,
3299 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3300) -> Result<(IndexedFileMetadata, Vec<SemanticChunk>), String> {
3301 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3302 if !metadata.is_file() {
3303 return Err("not a regular file".to_string());
3304 }
3305 let mtime = metadata.modified().map_err(|error| error.to_string())?;
3306 let size = metadata.len();
3307
3308 if !is_semantic_indexed_extension(file) {
3309 return Err("unsupported file extension".to_string());
3310 }
3311 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3312
3313 let mut indexed_metadata = IndexedFileMetadata {
3314 mtime,
3315 size,
3316 content_hash: cache_freshness::zero_hash(),
3317 };
3318
3319 if size > MAX_SEMANTIC_FILE_BYTES {
3322 return Ok((indexed_metadata, Vec::new()));
3323 }
3324
3325 let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
3326 indexed_metadata.content_hash = if size <= cache_freshness::CONTENT_HASH_SIZE_CAP {
3327 cache_freshness::hash_bytes(source.as_bytes())
3328 } else {
3329 cache_freshness::zero_hash()
3330 };
3331
3332 let chunks = collect_file_chunks_from_source(project_root, file, lang, parsers, &source)?;
3333 Ok((indexed_metadata, chunks))
3334}
3335
/// Test-only helper: reads `file` and returns its semantic chunks without metadata.
///
/// A file whose metadata reports more than `MAX_SEMANTIC_FILE_BYTES` yields no chunks;
/// if the metadata cannot be read, the function proceeds to the read step.
#[cfg(test)]
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    if !is_semantic_indexed_extension(file) {
        return Err("unsupported file extension".to_string());
    }
    let Some(lang) = detect_language(file) else {
        return Err("unsupported file extension".to_string());
    };

    match fs::metadata(file) {
        Ok(metadata) if metadata.len() > MAX_SEMANTIC_FILE_BYTES => return Ok(Vec::new()),
        _ => {}
    }

    let source = match fs::read_to_string(file) {
        Ok(source) => source,
        Err(error) => return Err(error.to_string()),
    };
    collect_file_chunks_from_source(project_root, file, lang, parsers, &source)
}
3354
3355fn collect_file_chunks_from_source(
3356 project_root: &Path,
3357 file: &Path,
3358 lang: crate::parser::LangId,
3359 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3360 source: &str,
3361) -> Result<Vec<SemanticChunk>, String> {
3362 let tree = parser_for(parsers, lang)?
3363 .parse(source, None)
3364 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3365 let symbols =
3366 extract_symbols_from_tree(source, &tree, lang).map_err(|error| error.to_string())?;
3367
3368 Ok(symbols_to_chunks(file, &symbols, source, project_root))
3369}
3370
3371fn build_snippet_with_lines(symbol: &Symbol, line_cache: &SourceLineCache<'_>) -> String {
3373 let start = (symbol.range.start_line as usize).min(line_cache.len());
3374 let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3376 if start < end {
3377 let snippet_lines: Vec<&str> = line_cache.lines[start..end]
3378 .iter()
3379 .take(5)
3380 .copied()
3381 .collect();
3382 let mut snippet = snippet_lines.join("\n");
3383 if end - start > 5 {
3384 snippet.push_str("\n ...");
3385 }
3386 if snippet.len() > 300 {
3387 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3388 }
3389 snippet
3390 } else {
3391 String::new()
3392 }
3393}
3394
/// Test-only convenience: builds the snippet for `symbol` directly from `source`.
#[cfg(test)]
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    build_snippet_with_lines(symbol, &SourceLineCache::new(source))
}
3400
3401fn qualified_name_for_symbol(symbol: &Symbol) -> Option<String> {
3402 let mut parts = symbol
3403 .scope_chain
3404 .iter()
3405 .filter(|part| !part.is_empty())
3406 .cloned()
3407 .collect::<Vec<_>>();
3408 if !symbol.name.is_empty() {
3409 parts.push(symbol.name.clone());
3410 }
3411 (!parts.is_empty()).then(|| parts.join("."))
3412}
3413
3414fn symbols_to_chunks(
3416 file: &Path,
3417 symbols: &[Symbol],
3418 source: &str,
3419 project_root: &Path,
3420) -> Vec<SemanticChunk> {
3421 let line_cache = SourceLineCache::new(source);
3422 let mut chunks = Vec::new();
3423 let top_exports_with_signatures = symbols
3424 .iter()
3425 .filter(|symbol| {
3426 symbol.exported
3427 && symbol.parent.is_none()
3428 && !matches!(symbol.kind, SymbolKind::Heading)
3429 })
3430 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3431 .collect::<Vec<_>>();
3432
3433 let has_only_headings = !symbols.is_empty()
3434 && symbols
3435 .iter()
3436 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3437 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3438 let top_exports = top_exports_with_signatures
3439 .iter()
3440 .map(|(name, _)| *name)
3441 .collect::<Vec<_>>();
3442 let top_export_signatures = top_exports_with_signatures
3443 .iter()
3444 .map(|(_, signature)| *signature)
3445 .collect::<Vec<_>>();
3446 chunks.push(build_file_summary_chunk_with_lines(
3447 file,
3448 project_root,
3449 &line_cache,
3450 &top_exports,
3451 &top_export_signatures,
3452 ));
3453 }
3454
3455 for symbol in symbols {
3456 if matches!(symbol.kind, SymbolKind::Heading) {
3461 continue;
3462 }
3463
3464 let line_count = symbol
3466 .range
3467 .end_line
3468 .saturating_sub(symbol.range.start_line)
3469 + 1;
3470 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3471 continue;
3472 }
3473
3474 let embed_text = build_embed_text_with_lines(symbol, &line_cache, file, project_root);
3475 let snippet = build_snippet_with_lines(symbol, &line_cache);
3476
3477 chunks.push(SemanticChunk {
3478 file: file.to_path_buf(),
3479 name: symbol.name.clone(),
3480 qualified_name: qualified_name_for_symbol(symbol),
3481 kind: symbol.kind.clone(),
3482 start_line: symbol.range.start_line,
3483 end_line: symbol.range.end_line,
3484 exported: symbol.exported,
3485 embed_text,
3486 snippet,
3487 });
3488
3489 }
3492
3493 chunks
3494}
3495
/// Orders `(score, index)` pairs by descending score, breaking ties by ascending index.
///
/// NaN scores sort after every real score, and two NaN scores compare by index alone, so
/// the comparator is a total order and is safe to hand to `sort_by`. Treating NaN as equal
/// to everything is not a total order and leaves the sorted result unspecified.
/// Scores that compare equal under `partial_cmp` (including `0.0` and `-0.0`) are tied.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match b.0.partial_cmp(&a.0) {
        Some(ordering) => ordering,
        // `partial_cmp` only fails when at least one side is NaN.
        None => match (a.0.is_nan(), b.0.is_nan()) {
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            _ => Ordering::Equal,
        },
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3501
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when the product of their norms is
/// zero (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let (mut dot, mut norm_a, mut norm_b) = (0.0f32, 0.0f32, 0.0f32);
    for (&x, &y) in a.iter().zip(b) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3525
3526fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3528 match kind {
3529 SymbolKind::Function => 0,
3530 SymbolKind::Class => 1,
3531 SymbolKind::Method => 2,
3532 SymbolKind::Struct => 3,
3533 SymbolKind::Interface => 4,
3534 SymbolKind::Enum => 5,
3535 SymbolKind::TypeAlias => 6,
3536 SymbolKind::Variable => 7,
3537 SymbolKind::Heading => 8,
3538 SymbolKind::FileSummary => 9,
3539 }
3540}
3541
3542fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3543 match v {
3544 0 => SymbolKind::Function,
3545 1 => SymbolKind::Class,
3546 2 => SymbolKind::Method,
3547 3 => SymbolKind::Struct,
3548 4 => SymbolKind::Interface,
3549 5 => SymbolKind::Enum,
3550 6 => SymbolKind::TypeAlias,
3551 7 => SymbolKind::Variable,
3552 8 => SymbolKind::Heading,
3553 9 => SymbolKind::FileSummary,
3554 _ => SymbolKind::Heading,
3555 }
3556}
3557
3558#[cfg(test)]
3559mod tests {
3560 use super::*;
3561 use crate::config::{SemanticBackend, SemanticBackendConfig};
3562 use crate::parser::FileParser;
3563 use std::io::{Read, Write};
3564 use std::net::TcpListener;
3565 use std::thread;
3566
3567 #[test]
3568 fn semantic_index_includes_php_inc_and_scss_extensions() {
3569 for file in ["partial.inc", "index.php", "styles.scss"] {
3570 assert!(
3571 is_semantic_indexed_extension(Path::new(file)),
3572 "{file} should be semantic-index eligible"
3573 );
3574 }
3575 }
3576
3577 #[test]
3578 fn transient_marker_round_trips_and_classifies() {
3579 let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3582 assert!(embedding_failure_is_transient(&marked));
3583 let clean = strip_transient_embedding_marker(&marked);
3584 assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3585 assert!(clean.starts_with("openai compatible request failed:"));
3586
3587 for permanent in [
3590 "openai compatible request failed (HTTP 401): Unauthorized",
3591 "embedding dimension mismatch: index has 384, model returned 768",
3592 "too many files (>20000) for semantic indexing (max 20000)",
3593 ] {
3594 assert!(
3595 !embedding_failure_is_transient(permanent),
3596 "{permanent:?} must not be transient"
3597 );
3598 assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3600 }
3601 }
3602
3603 #[test]
3604 fn send_error_transience_separates_connect_timeout_from_4xx() {
3605 assert!(is_retryable_embedding_status(
3607 reqwest::StatusCode::INTERNAL_SERVER_ERROR
3608 ));
3609 assert!(is_retryable_embedding_status(
3610 reqwest::StatusCode::TOO_MANY_REQUESTS
3611 ));
3612 assert!(!is_retryable_embedding_status(
3613 reqwest::StatusCode::UNAUTHORIZED
3614 ));
3615 assert!(!is_retryable_embedding_status(
3616 reqwest::StatusCode::BAD_REQUEST
3617 ));
3618 }
3619
3620 #[test]
3621 fn local_backend_model_loading_body_is_transient() {
3622 for body in [
3625 r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3626 r#"{"error":"model is loading, please wait"}"#,
3627 r#"{"error":"Model not loaded"}"#,
3628 "Loading model into memory",
3629 ] {
3630 assert!(
3631 embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3632 "{body:?} should be body-transient"
3633 );
3634 }
3635
3636 for body in [
3640 r#"{"error":"invalid api key"}"#,
3641 r#"{"error":"model 'foo' not found"}"#,
3642 "Bad Request: unknown field",
3643 "Bad Request: invalid loading model option",
3644 r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3645 ] {
3646 assert!(
3647 !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3648 "{body:?} must not be body-transient"
3649 );
3650 }
3651
3652 assert!(
3653 !embedding_response_body_is_transient(
3654 reqwest::StatusCode::UNAUTHORIZED,
3655 r#"{"error":"model is loading, please wait"}"#
3656 ),
3657 "permanent auth failures must not become transient because of body text"
3658 );
3659 }
3660
3661 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3662 where
3663 F: Fn(String, String, String) -> String + Send + 'static,
3664 {
3665 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3666 let addr = listener.local_addr().expect("local addr");
3667 let handle = thread::spawn(move || {
3668 let (mut stream, _) = listener.accept().expect("accept request");
3669 let mut buf = Vec::new();
3670 let mut chunk = [0u8; 4096];
3671 let mut header_end = None;
3672 let mut content_length = 0usize;
3673 loop {
3674 let n = stream.read(&mut chunk).expect("read request");
3675 if n == 0 {
3676 break;
3677 }
3678 buf.extend_from_slice(&chunk[..n]);
3679 if header_end.is_none() {
3680 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3681 header_end = Some(pos + 4);
3682 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3683 for line in headers.lines() {
3684 if let Some(value) = line.strip_prefix("Content-Length:") {
3685 content_length = value.trim().parse::<usize>().unwrap_or(0);
3686 }
3687 }
3688 }
3689 }
3690 if let Some(end) = header_end {
3691 if buf.len() >= end + content_length {
3692 break;
3693 }
3694 }
3695 }
3696
3697 let end = header_end.expect("header terminator");
3698 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3699 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3700 let mut lines = request.lines();
3701 let request_line = lines.next().expect("request line").to_string();
3702 let path = request_line
3703 .split_whitespace()
3704 .nth(1)
3705 .expect("request path")
3706 .to_string();
3707 let response_body = handler(request_line, path, body);
3708 let response = format!(
3709 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3710 response_body.len(),
3711 response_body
3712 );
3713 stream
3714 .write_all(response.as_bytes())
3715 .expect("write response");
3716 });
3717
3718 (format!("http://{}", addr), handle)
3719 }
3720
3721 fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3722 let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3723 listener
3724 .set_nonblocking(true)
3725 .expect("nonblocking listener");
3726 let addr = listener.local_addr().expect("local addr");
3727 let handle = thread::spawn(move || {
3728 let deadline = std::time::Instant::now() + Duration::from_secs(2);
3729 let mut accepted = 0usize;
3730 while accepted < attempts && std::time::Instant::now() < deadline {
3731 match listener.accept() {
3732 Ok((mut stream, _)) => {
3733 accepted += 1;
3734 let mut buf = [0u8; 4096];
3735 let _ = stream.read(&mut buf);
3743 let response = "HTTP/1.1 200 OK
3744Content-Type: application/json
3745Content-Length: 128
3746Connection: close
3747
3748{";
3749 let _ = stream.write_all(response.as_bytes());
3750 }
3751 Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3752 thread::sleep(Duration::from_millis(10));
3753 }
3754 Err(error) => panic!("accept request: {error}"),
3755 }
3756 }
3757 });
3758
3759 (format!("http://{}", addr), handle)
3760 }
3761
3762 #[test]
3763 fn response_body_read_failures_are_marked_transient() {
3764 let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3765 let client = Client::builder()
3766 .timeout(Duration::from_millis(250))
3767 .build()
3768 .expect("client");
3769
3770 let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3771 .expect_err("truncated body should fail");
3772
3773 handle.join().unwrap();
3774 assert!(
3775 embedding_failure_is_transient(&error),
3776 "body read failures should be transient-marked: {error}"
3777 );
3778 assert!(error.contains("response read failed"));
3779 }
3780
3781 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3782 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3783 }
3784
3785 fn write_rust_file(path: &Path, function_name: &str) {
3786 fs::write(
3787 path,
3788 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3789 )
3790 .unwrap();
3791 }
3792
3793 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3794 let mut embed = test_vector_for_texts;
3795 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3796 }
3797
3798 fn test_project_root() -> PathBuf {
3799 std::env::current_dir().unwrap()
3800 }
3801
3802 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3803 index.file_mtimes.insert(file.to_path_buf(), mtime);
3804 index.file_sizes.insert(file.to_path_buf(), size);
3805 index
3806 .file_hashes
3807 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3808 }
3809
    /// Hand-encodes `index` as a byte buffer tagged with
    /// `SEMANTIC_INDEX_VERSION_V6`, so tests can feed an older-format cache to
    /// the loader.
    ///
    /// Layout, all integers little-endian:
    /// 1. version byte, `dimension` (u32), entry count (u32)
    /// 2. fingerprint length (u32) followed by the fingerprint bytes
    /// 3. file-metadata count (u32), then per file: path length (u32), path
    ///    bytes, mtime seconds, mtime sub-second nanos, size, hash bytes
    /// 4. per entry: file path, name, kind tag, start line (u32), end line
    ///    (u32), exported flag, snippet, embed text, then the raw vector floats
    ///
    /// `qualified_name` is never written. Paths for which
    /// `cache_relative_path` returns `None` are left out of both tables.
    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
        let mut buf = Vec::new();
        // An absent or empty fingerprint is encoded as a zero-length field.
        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });
        // Keep the original path next to the relative one: the relative path
        // is what gets written, the original is the key into the size/hash maps.
        let file_mtimes: Vec<_> = index
            .file_mtimes
            .iter()
            .filter_map(|(path, mtime)| {
                cache_relative_path(&index.project_root, path)
                    .map(|relative| (relative, path, mtime))
            })
            .collect();
        let entries: Vec<_> = index
            .entries
            .iter()
            .filter_map(|entry| {
                cache_relative_path(&index.project_root, &entry.chunk.file)
                    .map(|relative| (relative, entry))
            })
            .collect();

        // Header: version, dimension, entry count, fingerprint.
        buf.push(SEMANTIC_INDEX_VERSION_V6);
        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // File metadata table.
        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
        for (relative, path, mtime) in &file_mtimes {
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            // A pre-epoch mtime is written as a zero duration.
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            // A missing size is written as 0 and a missing hash as the zero hash.
            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = index
                .file_hashes
                .get(*path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // Entry records; every string is length-prefixed with a u32.
        for (relative, entry) in &entries {
            let c = &entry.chunk;
            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));
            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            // The vector carries no length prefix of its own.
            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }
3894
3895 #[derive(Default)]
3896 struct RecordingEmbedder {
3897 calls: Vec<Vec<String>>,
3898 }
3899
3900 impl RecordingEmbedder {
3901 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3902 let vectors = texts
3903 .iter()
3904 .map(|text| deterministic_test_vector(text))
3905 .collect();
3906 self.calls.push(texts);
3907 Ok(vectors)
3908 }
3909
3910 fn total_embedded_texts(&self) -> usize {
3911 self.calls.iter().map(Vec::len).sum()
3912 }
3913
3914 fn embedded_texts(&self) -> Vec<&str> {
3915 self.calls
3916 .iter()
3917 .flat_map(|batch| batch.iter().map(String::as_str))
3918 .collect()
3919 }
3920 }
3921
3922 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3923 let hash = blake3::hash(text.as_bytes());
3924 let bytes = hash.as_bytes();
3925 vec![
3926 1.0,
3927 bytes[0] as f32 / 255.0,
3928 bytes[1] as f32 / 255.0,
3929 bytes[2] as f32 / 255.0,
3930 ]
3931 }
3932
3933 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3934 let mut embedder = RecordingEmbedder::default();
3935 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3936 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3937 }
3938
3939 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3940 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3941 }
3942
3943 fn write_source(path: &Path, source: &str) {
3944 if let Some(parent) = path.parent() {
3945 fs::create_dir_all(parent).unwrap();
3946 }
3947 fs::write(path, source).unwrap();
3948 }
3949
3950 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3951 index
3952 .entries
3953 .iter()
3954 .filter(|entry| entry.chunk.file == file)
3955 .collect()
3956 }
3957
3958 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3959 index
3960 .entries
3961 .iter()
3962 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3963 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3964 }
3965
3966 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3967 index
3968 .entries
3969 .iter()
3970 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3971 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3972 }
3973
3974 #[test]
3975 fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3976 let temp = tempfile::tempdir().unwrap();
3977 let project_root = temp.path();
3978 let file = project_root.join("src/lib.rs");
3979 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3980 write_source(&file, original);
3981
3982 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3983 let original_entry_count = index.entries.len();
3984 let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3985
3986 write_source(&file, &format!("\n{original}"));
3987 force_stale(&mut index, &file);
3988
3989 let mut embedder = RecordingEmbedder::default();
3990 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3991 let mut progress = |_done: usize, _total: usize| {};
3992 let summary = index
3993 .refresh_stale_files(
3994 project_root,
3995 std::slice::from_ref(&file),
3996 &mut embed,
3997 16,
3998 &mut progress,
3999 )
4000 .unwrap();
4001
4002 assert_eq!(summary.changed, 1);
4003 assert_eq!(embedder.total_embedded_texts(), 0);
4004 assert_eq!(index.entries.len(), original_entry_count);
4005 let shifted_alpha = entry_by_name(&index, &file, "alpha");
4006 assert_eq!(shifted_alpha.chunk.start_line, 1);
4007 assert_eq!(shifted_alpha.vector, original_alpha_vector);
4008 }
4009
4010 #[test]
4011 fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
4012 let temp = tempfile::tempdir().unwrap();
4013 let project_root = temp.path();
4014 let file = project_root.join("src/lib.rs");
4015 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
4016 write_source(&file, original);
4017
4018 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4019 let mut serving_index = worker_index.clone();
4020 let original_entry_count = worker_index.entries.len();
4021
4022 write_source(&file, &format!("\n{original}"));
4023
4024 let mut embedder = RecordingEmbedder::default();
4025 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4026 let mut progress = |_done: usize, _total: usize| {};
4027 let update = worker_index
4028 .refresh_invalidated_files(
4029 project_root,
4030 std::slice::from_ref(&file),
4031 &mut embed,
4032 16,
4033 100,
4034 &mut progress,
4035 )
4036 .unwrap();
4037
4038 assert_eq!(embedder.total_embedded_texts(), 0);
4039 assert_eq!(update.added_entries.len(), original_entry_count);
4040 assert_eq!(worker_index.entries.len(), original_entry_count);
4041
4042 serving_index.apply_refresh_update(
4043 update.added_entries,
4044 update.updated_metadata,
4045 &update.completed_paths,
4046 );
4047
4048 assert_eq!(serving_index.entries.len(), original_entry_count);
4049 assert_eq!(
4050 entries_for_file(&serving_index, &file).len(),
4051 original_entry_count
4052 );
4053 assert_eq!(
4054 entry_by_name(&serving_index, &file, "alpha")
4055 .chunk
4056 .start_line,
4057 1
4058 );
4059 }
4060
4061 #[test]
4062 fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
4063 let temp = tempfile::tempdir().unwrap();
4064 let project_root = temp.path();
4065 let file = project_root.join("src/lib.rs");
4066 write_source(
4067 &file,
4068 "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
4069 );
4070
4071 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4072 let original_entry_count = index.entries.len();
4073 let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
4074
4075 write_source(
4076 &file,
4077 "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
4078 );
4079
4080 let mut embedder = RecordingEmbedder::default();
4081 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4082 let mut progress = |_done: usize, _total: usize| {};
4083 let update = index
4084 .refresh_invalidated_files(
4085 project_root,
4086 std::slice::from_ref(&file),
4087 &mut embed,
4088 16,
4089 100,
4090 &mut progress,
4091 )
4092 .unwrap();
4093
4094 assert_eq!(embedder.total_embedded_texts(), 1);
4095 assert!(embedder.embedded_texts()[0].contains("name:alpha"));
4096 assert_eq!(update.added_entries.len(), original_entry_count);
4097 assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
4098 }
4099
4100 #[test]
4101 fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
4102 let temp = tempfile::tempdir().unwrap();
4103 let project_root = temp.path();
4104 let file = project_root.join("src/dupe.js");
4105 let one_duplicate = "function duplicate() {\n return 1;\n}\n";
4106 write_source(&file, one_duplicate);
4107
4108 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4109 let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
4110
4111 write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
4112
4113 let mut embedder = RecordingEmbedder::default();
4114 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4115 let mut progress = |_done: usize, _total: usize| {};
4116 index
4117 .refresh_invalidated_files(
4118 project_root,
4119 std::slice::from_ref(&file),
4120 &mut embed,
4121 16,
4122 100,
4123 &mut progress,
4124 )
4125 .unwrap();
4126
4127 let duplicate_entries = index
4128 .entries
4129 .iter()
4130 .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
4131 .collect::<Vec<_>>();
4132 assert_eq!(duplicate_entries.len(), 2);
4133 assert_eq!(embedder.total_embedded_texts(), 0);
4134 assert_eq!(duplicate_entries[0].vector, original_vector);
4135 assert_eq!(duplicate_entries[1].vector, original_vector);
4136 }
4137
    /// Two consecutive refreshes of the same file:
    ///
    /// 1. Only the function body changes: exactly one text is embedded (the
    ///    one for `alpha`) and the file-summary vector is unchanged.
    /// 2. Only the leading `//!` doc line changes: exactly one text is
    ///    embedded (the file summary) and its vector now differs from the
    ///    original.
    #[test]
    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        write_source(
            &file,
            "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n",
        );

        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let summary_before = file_summary_entry(&index, &file).vector.clone();

        // Phase 1: change the body only (`1` -> `2`); the doc line stays at v1.
        write_source(
            &file,
            "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n",
        );
        let mut body_embedder = RecordingEmbedder::default();
        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
        // The same no-op progress callback is reused by both refreshes.
        let mut progress = |_done: usize, _total: usize| {};
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut body_embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();
        assert_eq!(body_embedder.total_embedded_texts(), 1);
        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);

        // Phase 2: change the doc line only (v1 -> v2); the body stays as is.
        write_source(
            &file,
            "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n",
        );
        // A fresh recorder so the counts below cover this refresh alone.
        let mut doc_embedder = RecordingEmbedder::default();
        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut doc_embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();

        assert_eq!(doc_embedder.total_embedded_texts(), 1);
        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
    }
4193
4194 #[test]
4195 fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4196 let temp = tempfile::tempdir().unwrap();
4197 let project_root = temp.path();
4198 let file = project_root.join("src/lib.rs");
4199 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4200
4201 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4202 let mut serving_index = worker_index.clone();
4203 fs::remove_file(&file).unwrap();
4204
4205 let mut embedder = RecordingEmbedder::default();
4206 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4207 let mut progress = |_done: usize, _total: usize| {};
4208 let update = worker_index
4209 .refresh_invalidated_files(
4210 project_root,
4211 std::slice::from_ref(&file),
4212 &mut embed,
4213 16,
4214 100,
4215 &mut progress,
4216 )
4217 .unwrap();
4218
4219 assert_eq!(update.summary.deleted, 1);
4220 assert_eq!(embedder.total_embedded_texts(), 0);
4221 assert!(worker_index.entries.is_empty());
4222
4223 serving_index.apply_refresh_update(
4224 update.added_entries,
4225 update.updated_metadata,
4226 &update.completed_paths,
4227 );
4228 assert!(serving_index.entries.is_empty());
4229 }
4230
4231 #[test]
4232 fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4233 let temp = tempfile::tempdir().unwrap();
4234 let project_root = temp.path();
4235 let file = project_root.join("src/lib.rs");
4236 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4237
4238 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4239 let mut serving_index = worker_index.clone();
4240 fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4241
4242 let mut embedder = RecordingEmbedder::default();
4243 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4244 let mut progress = |_done: usize, _total: usize| {};
4245 let update = worker_index
4246 .refresh_invalidated_files(
4247 project_root,
4248 std::slice::from_ref(&file),
4249 &mut embed,
4250 16,
4251 100,
4252 &mut progress,
4253 )
4254 .unwrap();
4255
4256 assert_eq!(embedder.total_embedded_texts(), 0);
4257 assert!(update.added_entries.is_empty());
4258 assert!(worker_index.entries.is_empty());
4259 assert!(!worker_index.file_mtimes.contains_key(&file));
4260
4261 serving_index.apply_refresh_update(
4262 update.added_entries,
4263 update.updated_metadata,
4264 &update.completed_paths,
4265 );
4266 assert!(serving_index.entries.is_empty());
4267 assert!(!serving_index.file_mtimes.contains_key(&file));
4268 }
4269
4270 #[test]
4271 fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4272 let temp = tempfile::tempdir().unwrap();
4273 let project_root = temp.path();
4274 let indexed = project_root.join("src/a.rs");
4275 let deferred = project_root.join("src/b.rs");
4276 write_source(&indexed, "pub fn alpha() -> i32 {\n 1\n}\n");
4277 write_source(&deferred, "pub fn beta() -> i32 {\n 2\n}\n");
4278
4279 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4280 let mut embedder = RecordingEmbedder::default();
4281 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4282 let mut progress = |_done: usize, _total: usize| {};
4283 let update = index
4284 .refresh_invalidated_files(
4285 project_root,
4286 std::slice::from_ref(&deferred),
4287 &mut embed,
4288 16,
4289 1,
4290 &mut progress,
4291 )
4292 .unwrap();
4293
4294 assert_eq!(update.summary.total_processed, 1);
4295 assert_eq!(update.summary.added, 0);
4296 assert_eq!(embedder.total_embedded_texts(), 0);
4297 assert_eq!(index.indexed_file_count(), 1);
4298 assert!(index.deferred_files.contains(&deferred));
4299 assert!(entries_for_file(&index, &deferred).is_empty());
4300 }
4301
4302 #[test]
4303 fn semantic_cache_serialization_skips_paths_outside_project_root() {
4304 let dir = tempfile::tempdir().expect("create temp dir");
4305 let project = fs::canonicalize(dir.path()).expect("canonical project");
4306 let outside = project.join("..").join("outside.rs");
4307 let mut index = SemanticIndex::new(project.clone(), 3);
4308 index
4309 .file_mtimes
4310 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4311 index.file_sizes.insert(outside.clone(), 1);
4312 index
4313 .file_hashes
4314 .insert(outside.clone(), cache_freshness::zero_hash());
4315 index.entries.push(EmbeddingEntry {
4316 chunk: SemanticChunk {
4317 file: outside,
4318 name: "outside".to_string(),
4319 qualified_name: None,
4320 kind: SymbolKind::Function,
4321 start_line: 0,
4322 end_line: 0,
4323 exported: false,
4324 embed_text: "outside".to_string(),
4325 snippet: "outside".to_string(),
4326 },
4327 vector: vec![1.0, 0.0, 0.0],
4328 });
4329
4330 let bytes = index.to_bytes();
4331 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4332 assert_eq!(loaded.entries.len(), 0);
4333 assert!(loaded.file_mtimes.is_empty());
4334 }
4335
    /// Checks `SemanticIndex::search` against a brute-force reference: score
    /// every entry, sort the whole list, take the first `top_k`.
    ///
    /// Also pins two details: entries with equal scores come back in
    /// insertion order, and `top_k == 0` yields no results.
    #[test]
    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
        let project_root = test_project_root();
        let file = project_root.join("src/lib.rs");
        let mut index = SemanticIndex::new(project_root, 2);
        // (name, vector, exported). `alpha` and `gamma` share a vector so they
        // tie on score; `delta` is the only exported entry.
        let entries = [
            ("alpha", vec![1.0, 0.0], false),
            ("beta", vec![0.0, 1.0], false),
            ("gamma", vec![1.0, 0.0], false),
            ("delta", vec![0.5, 0.5], true),
            ("epsilon", vec![-1.0, 0.0], false),
        ];
        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
            index.entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file: file.clone(),
                    name: name.to_string(),
                    qualified_name: None,
                    kind: SymbolKind::Function,
                    start_line: line as u32 + 1,
                    end_line: line as u32 + 1,
                    exported,
                    embed_text: name.to_string(),
                    snippet: format!("fn {name}() {{}}"),
                },
                vector,
            });
        }

        let query = vec![1.0, 0.0];
        let top_k = 4;
        // Reference scoring: cosine similarity, multiplied by 1.1 for exported
        // entries.
        let mut reference: Vec<(f32, usize)> = index
            .entries
            .iter()
            .enumerate()
            .map(|(idx, entry)| {
                let mut score = cosine_similarity(&query, &entry.vector);
                if entry.chunk.exported {
                    score *= 1.1;
                }
                (score, idx)
            })
            .collect();
        // `sort_by` is stable, so tied scores stay in insertion order here.
        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
        let expected: Vec<(String, f32)> = reference
            .into_iter()
            .take(top_k)
            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
            .collect();

        let actual: Vec<(String, f32)> = index
            .search(&query, top_k)
            .into_iter()
            .map(|result| (result.name, result.score))
            .collect();

        // Names must match exactly; scores only within a small tolerance.
        assert_eq!(
            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
        );
        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
            assert!((actual_score - expected_score).abs() < 1e-6);
        }
        assert_eq!(actual[0].0, "alpha");
        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
        assert!(index.search(&query, 0).is_empty());
    }
4403
4404 #[test]
4405 fn test_cosine_similarity_identical() {
4406 let a = vec![1.0, 0.0, 0.0];
4407 let b = vec![1.0, 0.0, 0.0];
4408 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4409 }
4410
4411 #[test]
4412 fn test_cosine_similarity_orthogonal() {
4413 let a = vec![1.0, 0.0, 0.0];
4414 let b = vec![0.0, 1.0, 0.0];
4415 assert!(cosine_similarity(&a, &b).abs() < 0.001);
4416 }
4417
4418 #[test]
4419 fn test_cosine_similarity_opposite() {
4420 let a = vec![1.0, 0.0, 0.0];
4421 let b = vec![-1.0, 0.0, 0.0];
4422 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4423 }
4424
4425 #[test]
4426 fn test_serialization_roundtrip() {
4427 let project_root = test_project_root();
4428 let file = project_root.join("src/main.rs");
4429 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4430 index.entries.push(EmbeddingEntry {
4431 chunk: SemanticChunk {
4432 file: file.clone(),
4433 name: "handle_request".to_string(),
4434 qualified_name: None,
4435 kind: SymbolKind::Function,
4436 start_line: 10,
4437 end_line: 25,
4438 exported: true,
4439 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4440 snippet: "fn handle_request() {\n // ...\n}".to_string(),
4441 },
4442 vector: vec![0.1, 0.2, 0.3, 0.4],
4443 });
4444 index.dimension = 4;
4445 index
4446 .file_mtimes
4447 .insert(file.clone(), SystemTime::UNIX_EPOCH);
4448 index.file_sizes.insert(file, 0);
4449 index.set_fingerprint(SemanticIndexFingerprint {
4450 backend: "fastembed".to_string(),
4451 model: "all-MiniLM-L6-v2".to_string(),
4452 base_url: FALLBACK_BACKEND.to_string(),
4453 dimension: 4,
4454 chunking_version: default_chunking_version(),
4455 });
4456
4457 let bytes = index.to_bytes();
4458 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4459
4460 assert_eq!(restored.entries.len(), 1);
4461 assert_eq!(restored.entries[0].chunk.name, "handle_request");
4462 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4463 assert_eq!(restored.dimension, 4);
4464 assert_eq!(restored.backend_label(), Some("fastembed"));
4465 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4466 }
4467
4468 #[test]
4469 fn semantic_cache_v6_loads_and_v7_round_trips_qualified_names() {
4470 let storage = tempfile::tempdir().expect("create storage dir");
4471 let project = storage.path().join("project");
4472 fs::create_dir_all(project.join("src")).expect("create project src");
4473 let file = project.join("src/lib.rs");
4474 fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4475 let project_root = fs::canonicalize(&project).expect("canonical project");
4476 let file = fs::canonicalize(&file).expect("canonical file");
4477
4478 let mut index = SemanticIndex::new(project_root.clone(), 3);
4479 let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4480 index.file_mtimes.insert(file.clone(), mtime);
4481 index.file_sizes.insert(file.clone(), 42);
4482 index
4483 .file_hashes
4484 .insert(file.clone(), cache_freshness::zero_hash());
4485 index.entries.push(EmbeddingEntry {
4486 chunk: SemanticChunk {
4487 file: file.clone(),
4488 name: "alpha".to_string(),
4489 qualified_name: Some("Service.alpha".to_string()),
4490 kind: SymbolKind::Function,
4491 start_line: 0,
4492 end_line: 0,
4493 exported: true,
4494 embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4495 snippet: "pub fn alpha() {}".to_string(),
4496 },
4497 vector: vec![0.1, 0.2, 0.3],
4498 });
4499 index.entries.push(EmbeddingEntry {
4500 chunk: SemanticChunk {
4501 file: file.clone(),
4502 name: "beta".to_string(),
4503 qualified_name: Some("Service.beta".to_string()),
4504 kind: SymbolKind::Function,
4505 start_line: 1,
4506 end_line: 1,
4507 exported: true,
4508 embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4509 snippet: "pub fn beta() {}".to_string(),
4510 },
4511 vector: vec![0.4, 0.5, 0.6],
4512 });
4513 let fingerprint = SemanticIndexFingerprint {
4514 backend: "fastembed".to_string(),
4515 model: "all-MiniLM-L6-v2".to_string(),
4516 base_url: FALLBACK_BACKEND.to_string(),
4517 dimension: 3,
4518 chunking_version: default_chunking_version(),
4519 };
4520 let fingerprint_before = fingerprint.as_string();
4521 index.set_fingerprint(fingerprint.clone());
4522
4523 let legacy_bytes = legacy_semantic_index_bytes(&index);
4524 assert_eq!(legacy_bytes[0], SEMANTIC_INDEX_VERSION_V6);
4525 let legacy_dir = storage.path().join("semantic/legacy-proj");
4526 fs::create_dir_all(&legacy_dir).expect("create legacy semantic dir");
4527 let legacy_path = legacy_dir.join("semantic.bin");
4528 fs::write(&legacy_path, &legacy_bytes).expect("write legacy semantic.bin");
4529 let legacy_loaded = SemanticIndex::read_from_disk(
4530 storage.path(),
4531 "legacy-proj",
4532 &project_root,
4533 false,
4534 Some(&fingerprint_before),
4535 )
4536 .expect("load v6 semantic index");
4537 assert!(
4538 legacy_path.exists(),
4539 "compatible V6 cache must not be deleted"
4540 );
4541 assert!(legacy_loaded
4542 .entries
4543 .iter()
4544 .all(|entry| entry.chunk.qualified_name.is_none()));
4545 assert_eq!(
4546 legacy_loaded.fingerprint().unwrap().as_string(),
4547 fingerprint_before
4548 );
4549
4550 let v7_bytes = index.to_bytes();
4551 assert_eq!(v7_bytes[0], SEMANTIC_INDEX_VERSION_V7);
4552 assert_ne!(v7_bytes, legacy_bytes);
4553 let restored = SemanticIndex::from_bytes(&v7_bytes, &project_root).unwrap();
4554 assert_eq!(
4555 restored.entries[0].chunk.qualified_name.as_deref(),
4556 Some("Service.alpha")
4557 );
4558 assert_eq!(
4559 restored.entries[1].chunk.qualified_name.as_deref(),
4560 Some("Service.beta")
4561 );
4562 assert_eq!(
4563 restored.fingerprint().unwrap().as_string(),
4564 fingerprint_before
4565 );
4566
4567 index.write_to_disk(storage.path(), "proj");
4568 let data_path = storage.path().join("semantic/proj/semantic.bin");
4569 let persisted = fs::read(&data_path).expect("read semantic.bin");
4570 assert_eq!(persisted[0], SEMANTIC_INDEX_VERSION_V7);
4571
4572 let loaded = SemanticIndex::read_from_disk(
4573 storage.path(),
4574 "proj",
4575 &project_root,
4576 false,
4577 Some(&fingerprint_before),
4578 )
4579 .expect("load semantic index");
4580 assert_eq!(loaded.entries.len(), index.entries.len());
4581 assert_eq!(loaded.dimension, index.dimension);
4582 assert_eq!(
4583 loaded.fingerprint().unwrap().as_string(),
4584 fingerprint_before
4585 );
4586 assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4587 assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4588 assert_eq!(
4589 loaded.file_hashes.get(&file),
4590 Some(&cache_freshness::zero_hash())
4591 );
4592 for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4593 assert_eq!(actual.chunk.file, expected.chunk.file);
4594 assert_eq!(actual.chunk.name, expected.chunk.name);
4595 assert_eq!(actual.chunk.qualified_name, expected.chunk.qualified_name);
4596 assert_eq!(actual.chunk.kind, expected.chunk.kind);
4597 assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4598 assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4599 assert_eq!(actual.chunk.exported, expected.chunk.exported);
4600 assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4601 assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4602 assert_eq!(actual.vector, expected.vector);
4603 }
4604 assert_eq!(loaded.to_bytes(), persisted);
4605 assert_eq!(fingerprint.as_string(), fingerprint_before);
4606 }
4607
4608 #[test]
4609 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4610 let cases = [
4611 (SymbolKind::Function, 0),
4612 (SymbolKind::Class, 1),
4613 (SymbolKind::Method, 2),
4614 (SymbolKind::Struct, 3),
4615 (SymbolKind::Interface, 4),
4616 (SymbolKind::Enum, 5),
4617 (SymbolKind::TypeAlias, 6),
4618 (SymbolKind::Variable, 7),
4619 (SymbolKind::Heading, 8),
4620 (SymbolKind::FileSummary, 9),
4621 ];
4622
4623 for (kind, encoded) in cases {
4624 assert_eq!(symbol_kind_to_u8(&kind), encoded);
4625 assert_eq!(u8_to_symbol_kind(encoded), kind);
4626 }
4627 }
4628
4629 #[test]
4630 fn test_search_top_k() {
4631 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4632 index.dimension = 3;
4633
4634 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4636 let mut vec = vec![0.0f32; 3];
4637 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
4639 chunk: SemanticChunk {
4640 file: PathBuf::from("/src/lib.rs"),
4641 name: name.to_string(),
4642 qualified_name: None,
4643 kind: SymbolKind::Function,
4644 start_line: (i * 10 + 1) as u32,
4645 end_line: (i * 10 + 5) as u32,
4646 exported: true,
4647 embed_text: format!("kind:function name:{}", name),
4648 snippet: format!("fn {}() {{}}", name),
4649 },
4650 vector: vec,
4651 });
4652 }
4653
4654 let query = vec![0.9, 0.1, 0.0];
4656 let results = index.search(&query, 2);
4657
4658 assert_eq!(results.len(), 2);
4659 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
4661 }
4662
4663 #[test]
4664 fn test_empty_index_search() {
4665 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4666 let results = index.search(&[0.1, 0.2, 0.3], 10);
4667 assert!(results.is_empty());
4668 }
4669
4670 #[test]
4671 fn single_line_symbol_builds_non_empty_snippet() {
4672 let symbol = Symbol {
4673 name: "answer".to_string(),
4674 kind: SymbolKind::Variable,
4675 range: crate::symbols::Range {
4676 start_line: 0,
4677 start_col: 0,
4678 end_line: 0,
4679 end_col: 24,
4680 },
4681 signature: Some("const answer = 42".to_string()),
4682 scope_chain: Vec::new(),
4683 exported: true,
4684 parent: None,
4685 };
4686 let source = "export const answer = 42;\n";
4687
4688 let snippet = build_snippet(&symbol, source);
4689
4690 assert_eq!(snippet, "export const answer = 42;");
4691 }
4692
4693 #[test]
4694 fn optimized_file_chunk_collection_matches_file_parser_path() {
4695 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4696 let file = project_root.join("src/semantic_index.rs");
4697 let source = std::fs::read_to_string(&file).unwrap();
4698
4699 let mut legacy_parser = FileParser::new();
4700 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4701 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4702
4703 let mut parsers = HashMap::new();
4704 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4705
4706 assert_eq!(
4707 chunk_fingerprint(&optimized_chunks),
4708 chunk_fingerprint(&legacy_chunks)
4709 );
4710 }
4711
4712 #[test]
4713 fn collect_file_chunks_indexes_java_symbols() {
4714 let dir = tempfile::tempdir().unwrap();
4715 let file = dir.path().join("Greeter.java");
4716 std::fs::write(
4717 &file,
4718 r#"package example;
4719
4720public class Greeter {
4721 public String greet(String name) {
4722 return "Hello, " + name;
4723 }
4724}
4725"#,
4726 )
4727 .unwrap();
4728
4729 let mut parsers = HashMap::new();
4730 let chunks = collect_file_chunks(dir.path(), &file, &mut parsers).unwrap();
4731
4732 assert!(
4733 !chunks.is_empty(),
4734 "Java file should produce semantic chunks"
4735 );
4736 assert!(
4737 chunks
4738 .iter()
4739 .any(|chunk| chunk.name == "Greeter" && chunk.kind == SymbolKind::Class),
4740 "Java class symbol should be chunked: {chunks:?}"
4741 );
4742 assert!(
4743 chunks
4744 .iter()
4745 .any(|chunk| chunk.name == "greet" && chunk.kind == SymbolKind::Method),
4746 "Java method symbol should be chunked: {chunks:?}"
4747 );
4748 }
4749
4750 fn chunk_fingerprint(
4751 chunks: &[SemanticChunk],
4752 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4753 chunks
4754 .iter()
4755 .map(|chunk| {
4756 (
4757 chunk.name.clone(),
4758 chunk.kind.clone(),
4759 chunk.start_line,
4760 chunk.end_line,
4761 chunk.exported,
4762 chunk.embed_text.clone(),
4763 chunk.snippet.clone(),
4764 )
4765 })
4766 .collect()
4767 }
4768
4769 #[test]
4770 fn collect_file_chunks_skips_oversized_file() {
4771 let dir = tempfile::tempdir().unwrap();
4772 let big = dir.path().join("huge.ts");
4773 let filler = "export const x = 1;\n"
4775 .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4776 std::fs::write(&big, &filler).unwrap();
4777 assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4778
4779 let mut parsers = HashMap::new();
4780 let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4783 assert!(chunks.is_empty(), "oversized file must yield no chunks");
4784
4785 let small = dir.path().join("small.ts");
4787 std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4788 let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4789 assert!(!small_chunks.is_empty(), "small file should still chunk");
4790 }
4791
4792 #[test]
4793 fn rejects_oversized_dimension_during_deserialization() {
4794 let mut bytes = Vec::new();
4795 bytes.push(1u8);
4796 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4797 bytes.extend_from_slice(&0u32.to_le_bytes());
4798 bytes.extend_from_slice(&0u32.to_le_bytes());
4799
4800 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4801 }
4802
4803 #[test]
4804 fn rejects_oversized_entry_count_during_deserialization() {
4805 let mut bytes = Vec::new();
4806 bytes.push(1u8);
4807 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4808 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4809 bytes.extend_from_slice(&0u32.to_le_bytes());
4810
4811 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4812 }
4813
4814 #[test]
4815 fn invalidate_file_removes_entries_and_mtime() {
4816 let target = PathBuf::from("/src/main.rs");
4817 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4818 index.entries.push(EmbeddingEntry {
4819 chunk: SemanticChunk {
4820 file: target.clone(),
4821 name: "main".to_string(),
4822 qualified_name: None,
4823 kind: SymbolKind::Function,
4824 start_line: 0,
4825 end_line: 1,
4826 exported: false,
4827 embed_text: "main".to_string(),
4828 snippet: "fn main() {}".to_string(),
4829 },
4830 vector: vec![1.0; DEFAULT_DIMENSION],
4831 });
4832 index
4833 .file_mtimes
4834 .insert(target.clone(), SystemTime::UNIX_EPOCH);
4835 index.file_sizes.insert(target.clone(), 0);
4836
4837 index.invalidate_file(&target);
4838
4839 assert!(index.entries.is_empty());
4840 assert!(!index.file_mtimes.contains_key(&target));
4841 assert!(!index.file_sizes.contains_key(&target));
4842 }
4843
4844 #[test]
4845 fn refresh_missing_changed_file_is_purged_after_collect() {
4846 let temp = tempfile::tempdir().unwrap();
4847 let project_root = temp.path();
4848 let file = project_root.join("src/lib.rs");
4849 fs::create_dir_all(file.parent().unwrap()).unwrap();
4850 write_rust_file(&file, "vanished_symbol");
4851
4852 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4853 let original_size = *index.file_sizes.get(&file).unwrap();
4854 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4855 fs::remove_file(&file).unwrap();
4856
4857 let mut embed = test_vector_for_texts;
4858 let mut progress = |_done: usize, _total: usize| {};
4859 let summary = index
4860 .refresh_stale_files(
4861 project_root,
4862 std::slice::from_ref(&file),
4863 &mut embed,
4864 8,
4865 &mut progress,
4866 )
4867 .unwrap();
4868
4869 assert_eq!(summary.changed, 0);
4870 assert_eq!(summary.added, 0);
4871 assert_eq!(summary.deleted, 1);
4872 assert!(index.entries.is_empty());
4873 assert!(!index.file_mtimes.contains_key(&file));
4874 assert!(!index.file_sizes.contains_key(&file));
4875 assert!(!index.file_hashes.contains_key(&file));
4876 }
4877
4878 #[test]
4879 fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4880 let temp = tempfile::tempdir().unwrap();
4881 let project_root = temp.path();
4882 let file = project_root.join("src/lib.rs");
4883 fs::create_dir_all(file.parent().unwrap()).unwrap();
4884 write_rust_file(&file, "kept_symbol");
4885
4886 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4887 let original_entry_count = index.entries.len();
4888 let original_mtime = *index.file_mtimes.get(&file).unwrap();
4889 let original_size = *index.file_sizes.get(&file).unwrap();
4890
4891 let stale_mtime = SystemTime::UNIX_EPOCH;
4892 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4893 fs::remove_file(&file).unwrap();
4894 fs::create_dir(&file).unwrap();
4895
4896 let mut embed = test_vector_for_texts;
4897 let mut progress = |_done: usize, _total: usize| {};
4898 let summary = index
4899 .refresh_stale_files(
4900 project_root,
4901 std::slice::from_ref(&file),
4902 &mut embed,
4903 8,
4904 &mut progress,
4905 )
4906 .unwrap();
4907
4908 assert_eq!(summary.changed, 0);
4909 assert_eq!(summary.added, 0);
4910 assert_eq!(summary.deleted, 0);
4911 assert_eq!(index.entries.len(), original_entry_count);
4912 assert!(index
4913 .entries
4914 .iter()
4915 .any(|entry| entry.chunk.name == "kept_symbol"));
4916 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4917 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4918 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4919 }
4920
4921 #[test]
4922 fn refresh_never_indexed_file_error_does_not_record_mtime() {
4923 let temp = tempfile::tempdir().unwrap();
4924 let project_root = temp.path();
4925 let missing = project_root.join("src/missing.rs");
4926 fs::create_dir_all(missing.parent().unwrap()).unwrap();
4927
4928 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4929 let mut embed = test_vector_for_texts;
4930 let mut progress = |_done: usize, _total: usize| {};
4931 let summary = index
4932 .refresh_stale_files(
4933 project_root,
4934 std::slice::from_ref(&missing),
4935 &mut embed,
4936 8,
4937 &mut progress,
4938 )
4939 .unwrap();
4940
4941 assert_eq!(summary.added, 0);
4942 assert_eq!(summary.changed, 0);
4943 assert_eq!(summary.deleted, 0);
4944 assert!(!index.file_mtimes.contains_key(&missing));
4945 assert!(!index.file_sizes.contains_key(&missing));
4946 assert!(index.entries.is_empty());
4947 }
4948
4949 #[test]
4950 fn refresh_reports_added_for_new_files() {
4951 let temp = tempfile::tempdir().unwrap();
4952 let project_root = temp.path();
4953 let existing = project_root.join("src/lib.rs");
4954 let added = project_root.join("src/new.rs");
4955 fs::create_dir_all(existing.parent().unwrap()).unwrap();
4956 write_rust_file(&existing, "existing_symbol");
4957 write_rust_file(&added, "added_symbol");
4958
4959 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4960 let mut embed = test_vector_for_texts;
4961 let mut progress = |_done: usize, _total: usize| {};
4962 let summary = index
4963 .refresh_stale_files(
4964 project_root,
4965 &[existing.clone(), added.clone()],
4966 &mut embed,
4967 8,
4968 &mut progress,
4969 )
4970 .unwrap();
4971
4972 assert_eq!(summary.added, 1);
4973 assert_eq!(summary.changed, 0);
4974 assert_eq!(summary.deleted, 0);
4975 assert_eq!(summary.total_processed, 2);
4976 assert!(index.file_mtimes.contains_key(&added));
4977 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4978 }
4979
4980 #[test]
4981 fn refresh_reports_deleted_for_removed_files() {
4982 let temp = tempfile::tempdir().unwrap();
4983 let project_root = temp.path();
4984 let deleted = project_root.join("src/deleted.rs");
4985 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4986 write_rust_file(&deleted, "deleted_symbol");
4987
4988 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4989 fs::remove_file(&deleted).unwrap();
4990
4991 let mut embed = test_vector_for_texts;
4992 let mut progress = |_done: usize, _total: usize| {};
4993 let summary = index
4994 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4995 .unwrap();
4996
4997 assert_eq!(summary.deleted, 1);
4998 assert_eq!(summary.changed, 0);
4999 assert_eq!(summary.added, 0);
5000 assert_eq!(summary.total_processed, 1);
5001 assert!(!index.file_mtimes.contains_key(&deleted));
5002 assert!(index.entries.is_empty());
5003 }
5004
5005 #[test]
5006 fn refresh_reports_changed_for_modified_files() {
5007 let temp = tempfile::tempdir().unwrap();
5008 let project_root = temp.path();
5009 let file = project_root.join("src/lib.rs");
5010 fs::create_dir_all(file.parent().unwrap()).unwrap();
5011 write_rust_file(&file, "old_symbol");
5012
5013 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
5014 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
5015 write_rust_file(&file, "new_symbol");
5016
5017 let mut embed = test_vector_for_texts;
5018 let mut progress = |_done: usize, _total: usize| {};
5019 let summary = index
5020 .refresh_stale_files(
5021 project_root,
5022 std::slice::from_ref(&file),
5023 &mut embed,
5024 8,
5025 &mut progress,
5026 )
5027 .unwrap();
5028
5029 assert_eq!(summary.changed, 1);
5030 assert_eq!(summary.added, 0);
5031 assert_eq!(summary.deleted, 0);
5032 assert_eq!(summary.total_processed, 1);
5033 assert!(index
5034 .entries
5035 .iter()
5036 .any(|entry| entry.chunk.name == "new_symbol"));
5037 assert!(!index
5038 .entries
5039 .iter()
5040 .any(|entry| entry.chunk.name == "old_symbol"));
5041 }
5042
5043 #[test]
5044 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
5045 let temp = tempfile::tempdir().unwrap();
5046 let project_root = temp.path();
5047 let file = project_root.join("src/lib.rs");
5048 fs::create_dir_all(file.parent().unwrap()).unwrap();
5049 write_rust_file(&file, "clean_symbol");
5050
5051 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
5052 let original_entries = index.entries.len();
5053 let mut embed_called = false;
5054 let mut embed = |texts: Vec<String>| {
5055 embed_called = true;
5056 test_vector_for_texts(texts)
5057 };
5058 let mut progress = |_done: usize, _total: usize| {};
5059 let summary = index
5060 .refresh_stale_files(
5061 project_root,
5062 std::slice::from_ref(&file),
5063 &mut embed,
5064 8,
5065 &mut progress,
5066 )
5067 .unwrap();
5068
5069 assert!(summary.is_noop());
5070 assert_eq!(summary.total_processed, 1);
5071 assert!(!embed_called);
5072 assert_eq!(index.entries.len(), original_entries);
5073 }
5074
5075 #[test]
5076 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
5077 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
5078
5079 assert!(is_onnx_runtime_unavailable(message));
5080 }
5081
5082 #[test]
5083 fn formats_missing_onnx_runtime_with_install_hint() {
5084 let message = format_embedding_init_error(
5085 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
5086 );
5087
5088 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
5089 assert!(message.contains("Original error:"));
5090 }
5091
5092 #[test]
5093 fn interactive_query_embedding_model_caps_remote_timeout() {
5094 let mut config = SemanticBackendConfig {
5095 backend: SemanticBackend::OpenAiCompatible,
5096 model: "test-embedding".to_string(),
5097 base_url: Some("http://127.0.0.1:9".to_string()),
5098 api_key_env: None,
5099 timeout_ms: 0,
5100 max_batch_size: 64,
5101 max_files: 20_000,
5102 };
5103
5104 let build_model = SemanticEmbeddingModel::from_config(&config).unwrap();
5105 let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5106 assert_eq!(
5107 build_model.timeout_ms(),
5108 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS,
5109 "background build keeps the longer default embedding timeout"
5110 );
5111 assert_eq!(
5112 query_model.timeout_ms(),
5113 DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
5114 "interactive query embedding is capped below the dispatch transport timeout"
5115 );
5116
5117 config.timeout_ms = 60_000;
5118 let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5119 assert_eq!(
5120 query_model.timeout_ms(),
5121 DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
5122 "explicitly long backend timeouts are capped for interactive queries"
5123 );
5124
5125 config.timeout_ms = 3_000;
5126 let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5127 assert_eq!(
5128 query_model.timeout_ms(),
5129 3_000,
5130 "shorter explicit timeouts are respected for interactive queries"
5131 );
5132 }
5133
5134 #[test]
5135 fn openai_compatible_backend_embeds_with_mock_server() {
5136 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5137 assert!(request_line.starts_with("POST "));
5138 assert_eq!(path, "/v1/embeddings");
5139 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
5140 });
5141
5142 let config = SemanticBackendConfig {
5143 backend: SemanticBackend::OpenAiCompatible,
5144 model: "test-embedding".to_string(),
5145 base_url: Some(base_url),
5146 api_key_env: None,
5147 timeout_ms: 5_000,
5148 max_batch_size: 64,
5149 max_files: 20_000,
5150 };
5151
5152 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5153 let vectors = model
5154 .embed(vec!["hello".to_string(), "world".to_string()])
5155 .unwrap();
5156
5157 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
5158 handle.join().unwrap();
5159 }
5160
5161 #[test]
5171 fn openai_compatible_request_has_single_content_type_header() {
5172 use std::sync::{Arc, Mutex};
5173 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
5174 let captured_for_thread = Arc::clone(&captured);
5175
5176 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
5177 let addr = listener.local_addr().expect("local addr");
5178 let handle = thread::spawn(move || {
5179 let (mut stream, _) = listener.accept().expect("accept");
5180 let mut buf = Vec::new();
5181 let mut chunk = [0u8; 4096];
5182 let mut header_end = None;
5183 let mut content_length = 0usize;
5184 loop {
5185 let n = stream.read(&mut chunk).expect("read");
5186 if n == 0 {
5187 break;
5188 }
5189 buf.extend_from_slice(&chunk[..n]);
5190 if header_end.is_none() {
5191 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
5192 header_end = Some(pos + 4);
5193 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
5194 if let Some(value) = line.strip_prefix("Content-Length:") {
5195 content_length = value.trim().parse::<usize>().unwrap_or(0);
5196 }
5197 }
5198 }
5199 }
5200 if let Some(end) = header_end {
5201 if buf.len() >= end + content_length {
5202 break;
5203 }
5204 }
5205 }
5206 *captured_for_thread.lock().unwrap() = buf;
5207 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
5208 let response = format!(
5209 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
5210 body.len(),
5211 body
5212 );
5213 let _ = stream.write_all(response.as_bytes());
5214 });
5215
5216 let config = SemanticBackendConfig {
5217 backend: SemanticBackend::OpenAiCompatible,
5218 model: "text-embedding-3-small".to_string(),
5219 base_url: Some(format!("http://{}", addr)),
5220 api_key_env: None,
5221 timeout_ms: 5_000,
5222 max_batch_size: 64,
5223 max_files: 20_000,
5224 };
5225 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5226 let _ = model.embed(vec!["probe".to_string()]).unwrap();
5227 handle.join().unwrap();
5228
5229 let bytes = captured.lock().unwrap().clone();
5230 let request = String::from_utf8_lossy(&bytes);
5231
5232 let content_type_lines = request
5235 .lines()
5236 .filter(|line| {
5237 let lower = line.to_ascii_lowercase();
5238 lower.starts_with("content-type:")
5239 })
5240 .count();
5241 assert_eq!(
5242 content_type_lines, 1,
5243 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
5244 );
5245
5246 assert!(
5249 request.contains(r#""model":"text-embedding-3-small""#),
5250 "request body should contain model field; full request:\n{request}",
5251 );
5252 }
5253
5254 #[test]
5255 fn ollama_backend_embeds_with_mock_server() {
5256 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5257 assert!(request_line.starts_with("POST "));
5258 assert_eq!(path, "/api/embed");
5259 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
5260 });
5261
5262 let config = SemanticBackendConfig {
5263 backend: SemanticBackend::Ollama,
5264 model: "embeddinggemma".to_string(),
5265 base_url: Some(base_url),
5266 api_key_env: None,
5267 timeout_ms: 5_000,
5268 max_batch_size: 64,
5269 max_files: 20_000,
5270 };
5271
5272 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5273 let vectors = model
5274 .embed(vec!["hello".to_string(), "world".to_string()])
5275 .unwrap();
5276
5277 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
5278 handle.join().unwrap();
5279 }
5280
5281 #[test]
5282 fn read_from_disk_rejects_fingerprint_mismatch() {
5283 let storage = tempfile::tempdir().unwrap();
5284 let project_key = "proj";
5285
5286 let project_root = test_project_root();
5287 let file = project_root.join("src/main.rs");
5288 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
5289 index.entries.push(EmbeddingEntry {
5290 chunk: SemanticChunk {
5291 file: file.clone(),
5292 name: "handle_request".to_string(),
5293 qualified_name: None,
5294 kind: SymbolKind::Function,
5295 start_line: 10,
5296 end_line: 25,
5297 exported: true,
5298 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5299 snippet: "fn handle_request() {}".to_string(),
5300 },
5301 vector: vec![0.1, 0.2, 0.3],
5302 });
5303 index.dimension = 3;
5304 index
5305 .file_mtimes
5306 .insert(file.clone(), SystemTime::UNIX_EPOCH);
5307 index.file_sizes.insert(file, 0);
5308 index.set_fingerprint(SemanticIndexFingerprint {
5309 backend: "openai_compatible".to_string(),
5310 model: "test-embedding".to_string(),
5311 base_url: "http://127.0.0.1:1234/v1".to_string(),
5312 dimension: 3,
5313 chunking_version: default_chunking_version(),
5314 });
5315 index.write_to_disk(storage.path(), project_key);
5316
5317 let matching = index.fingerprint().unwrap().as_string();
5318 assert!(SemanticIndex::read_from_disk(
5319 storage.path(),
5320 project_key,
5321 &project_root,
5322 false,
5323 Some(&matching),
5324 )
5325 .is_some());
5326
5327 let mismatched = SemanticIndexFingerprint {
5328 backend: "ollama".to_string(),
5329 model: "embeddinggemma".to_string(),
5330 base_url: "http://127.0.0.1:11434".to_string(),
5331 dimension: 3,
5332 chunking_version: default_chunking_version(),
5333 }
5334 .as_string();
5335 assert!(SemanticIndex::read_from_disk(
5336 storage.path(),
5337 project_key,
5338 &project_root,
5339 false,
5340 Some(&mismatched),
5341 )
5342 .is_none());
5343 }
5344
5345 #[test]
5346 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
5347 let storage = tempfile::tempdir().unwrap();
5348 let project_key = "proj-v3";
5349 let dir = storage.path().join("semantic").join(project_key);
5350 fs::create_dir_all(&dir).unwrap();
5351
5352 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
5353 index.entries.push(EmbeddingEntry {
5354 chunk: SemanticChunk {
5355 file: PathBuf::from("/src/main.rs"),
5356 name: "handle_request".to_string(),
5357 qualified_name: None,
5358 kind: SymbolKind::Function,
5359 start_line: 0,
5360 end_line: 0,
5361 exported: true,
5362 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5363 snippet: "fn handle_request() {}".to_string(),
5364 },
5365 vector: vec![0.1, 0.2, 0.3],
5366 });
5367 index.dimension = 3;
5368 index
5369 .file_mtimes
5370 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5371 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5372 let fingerprint = SemanticIndexFingerprint {
5373 backend: "fastembed".to_string(),
5374 model: "test".to_string(),
5375 base_url: FALLBACK_BACKEND.to_string(),
5376 dimension: 3,
5377 chunking_version: default_chunking_version(),
5378 };
5379 index.set_fingerprint(fingerprint.clone());
5380
5381 let mut bytes = index.to_bytes();
5382 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5383 fs::write(dir.join("semantic.bin"), bytes).unwrap();
5384
5385 assert!(SemanticIndex::read_from_disk(
5386 storage.path(),
5387 project_key,
5388 &test_project_root(),
5389 false,
5390 Some(&fingerprint.as_string())
5391 )
5392 .is_none());
5393 assert!(!dir.join("semantic.bin").exists());
5394 }
5395
5396 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5397 crate::symbols::Symbol {
5398 name: name.to_string(),
5399 kind,
5400 range: crate::symbols::Range {
5401 start_line: start,
5402 start_col: 0,
5403 end_line: end,
5404 end_col: 0,
5405 },
5406 signature: None,
5407 scope_chain: Vec::new(),
5408 exported: false,
5409 parent: None,
5410 }
5411 }
5412
5413 #[test]
5414 fn symbols_to_chunks_sets_qualified_name_without_changing_embed_text() {
5415 let project_root = PathBuf::from("/proj");
5416 let file = project_root.join("src/engine.ts");
5417 let source = "class Index {\n}\n";
5418 let mut symbol = make_symbol(SymbolKind::Class, "Index", 0, 1);
5419 symbol.scope_chain = vec!["Engine".to_string()];
5420 symbol.signature = Some("class Index".to_string());
5421 let embed_text = build_embed_text(&symbol, source, &file, &project_root);
5422
5423 let chunks = symbols_to_chunks(&file, &[symbol], source, &project_root);
5424 let chunk = chunks
5425 .iter()
5426 .find(|chunk| chunk.name == "Index")
5427 .expect("class chunk");
5428
5429 assert_eq!(chunk.name, "Index");
5430 assert_eq!(chunk.qualified_name.as_deref(), Some("Engine.Index"));
5431 assert_eq!(chunk.embed_text, embed_text);
5432 assert!(!chunk.embed_text.contains("Engine.Index"));
5433 }
5434
5435 #[test]
5440 fn symbols_to_chunks_skips_heading_symbols() {
5441 let project_root = PathBuf::from("/proj");
5442 let file = project_root.join("README.md");
5443 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
5444
5445 let symbols = vec![
5446 make_symbol(SymbolKind::Heading, "Title", 0, 2),
5447 make_symbol(SymbolKind::Heading, "Section", 4, 6),
5448 ];
5449
5450 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5451 assert!(
5452 chunks.is_empty(),
5453 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
5454 chunks.len()
5455 );
5456 }
5457
5458 #[test]
5465 fn build_embed_text_clamps_oversized_signature() {
5466 let project_root = PathBuf::from("/proj");
5467 let file = project_root.join("cronjob.yaml");
5468 let huge_sig = "kubectl ".repeat(2000); let source = "apiVersion: batch/v1\nkind: CronJob\n";
5470
5471 let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
5472 symbol.signature = Some(huge_sig);
5473
5474 let text = build_embed_text(&symbol, source, &file, &project_root);
5475 assert!(
5476 text.chars().count() <= MAX_EMBED_TEXT_CHARS,
5477 "embed_text must be clamped to {} chars, got {}",
5478 MAX_EMBED_TEXT_CHARS,
5479 text.chars().count()
5480 );
5481 }
5482
5483 #[test]
5487 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
5488 let project_root = PathBuf::from("/proj");
5489 let file = project_root.join("src/lib.rs");
5490 let source = "pub fn handle_request() -> bool {\n true\n}\n";
5491
5492 let symbols = vec![
5493 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
5495 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
5496 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
5497 ];
5498
5499 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5500 assert_eq!(
5501 chunks.len(),
5502 3,
5503 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
5504 chunks.len()
5505 );
5506 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
5507 assert!(chunks
5508 .iter()
5509 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
5510 assert!(names.contains(&"handle_request"));
5511 assert!(names.contains(&"AuthService"));
5512 assert!(
5513 !names.contains(&"doc heading"),
5514 "Heading symbol leaked into chunks: {names:?}"
5515 );
5516 }
5517
/// Loopback hostnames — `localhost` with and without a port, `localhost.localdomain`, and a
/// `.localhost` subdomain — must be accepted by `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    let loopback_hosts = [
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ];

    for host in &loopback_hosts {
        // Validate once; the same outcome feeds both the check and the failure message.
        let outcome = validate_base_url_no_ssrf(host);
        assert!(
            outcome.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
5536
/// Literal addresses inside 127.0.0.0/8, with or without a port, must be accepted by
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        // Not just 127.0.0.1: any address in the 127/8 block is loopback.
        "http://127.1.2.3",
    ];

    loopback_urls.iter().for_each(|url| {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            verdict
        );
    });
}
5555
/// Private, link-local, and shared-address-space IPs that are not loopback must be rejected
/// by `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",     // 192.168.0.0/16
        "http://10.0.0.1",        // 10.0.0.0/8
        "http://172.16.0.1",      // 172.16.0.0/12
        "http://169.254.169.254", // 169.254.0.0/16 (link-local)
        "http://100.64.0.1",      // 100.64.0.0/10 (shared address space)
    ];

    for url in &private_urls {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            verdict
        );
    }
}
5577
/// Hostnames ending in `.local` (the mDNS suffix) must be rejected by
/// `validate_base_url_no_ssrf`, with or without an explicit port.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    mdns_hosts.iter().for_each(|host| {
        let verdict = validate_base_url_no_ssrf(host);
        assert!(
            verdict.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            verdict
        );
    });
}
5595
/// `normalize_base_url` must succeed for loopback endpoints given either as an IP literal
/// or as the `localhost` hostname.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let ip_literal = normalize_base_url("http://127.0.0.1:9999");
    assert!(ip_literal.is_ok());

    let hostname = normalize_base_url("http://localhost:8080");
    assert!(hostname.is_ok());
}
5603
/// Exercises `is_private_non_loopback_ip` directly on parsed addresses: reserved and private
/// ranges must report `true`, while loopback (IPv4, IPv6, and IPv4-mapped) and an ordinary
/// public address must report `false`.
#[test]
fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
    use std::net::IpAddr;

    // Parse the textual address, then ask the guard; a malformed literal panics the test.
    let is_blocked = |text: &str| -> bool {
        let addr: IpAddr = text.parse().unwrap();
        is_private_non_loopback_ip(&addr)
    };

    // Addresses the guard must refuse.
    assert!(is_blocked("10.0.0.1"));
    assert!(is_blocked("192.168.1.1"));
    assert!(is_blocked("169.254.0.1"));
    assert!(is_blocked("100.64.0.1"));
    assert!(
        is_blocked("198.18.0.1"),
        "RFC2544 benchmark range must be blocked"
    );
    assert!(is_blocked("224.0.0.1"), "multicast must be blocked");
    assert!(is_blocked("fc00::1"), "IPv6 ULA must be blocked");
    assert!(is_blocked("fe80::1"), "IPv6 link-local must be blocked");

    // Loopback in every spelling must pass through.
    assert!(!is_blocked("127.0.0.1"), "loopback must stay allowed");
    assert!(!is_blocked("::1"), "IPv6 loopback must stay allowed");
    assert!(
        !is_blocked("::ffff:127.0.0.1"),
        "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
    );

    // A public address is not the guard's concern.
    assert!(!is_blocked("8.8.8.8"));
}
5634
/// The ONNX Runtime version-mismatch message must report the detected version, the library
/// path, and the version requirement, and must list the "Auto-fix" remedy before the manual
/// "Remove the old library" remedy.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    // Diagnostic facts the message has to carry.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains(system_lib),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering: byte offset of the auto-fix remedy must precede the manual one.
    let offset_of = |needle: &str| msg.find(needle);
    let auto_fix_at =
        offset_of("Auto-fix").expect("Auto-fix solution missing — users won't discover --fix");
    let manual_rm_at = offset_of("Remove the old library").expect("system-rm solution missing");
    assert!(
        auto_fix_at < manual_rm_at,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
5675
/// A bare library name yields no version from `detect_ort_version_from_path`, but when a
/// resolved path with a version suffix is supplied, detection must return that version and
/// report the resolved path as its source. Compiled only on Linux and macOS.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested_name = "libonnxruntime.so";
    let resolved_path = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The requested name alone carries no version information.
    assert_eq!(detect_ort_version_from_path(requested_name), None);

    let (detected, origin) = detect_ort_version_from_resolved_or_requested(
        Some(String::from(resolved_path)),
        requested_name,
    );
    assert_eq!(detected, Some(String::from("1.19.0")));
    assert_eq!(origin, resolved_path);

    // The mismatch message built from the detection result must echo both values.
    let rendered = format_ort_version_mismatch(&detected.unwrap(), &origin);
    assert!(rendered.contains("v1.19.0"));
    assert!(rendered.contains(resolved_path));
}
5693
/// The version-mismatch message must also work for a macOS `.dylib` path: it reports the
/// version and the path, and the path appears wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let homebrew_dylib = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", homebrew_dylib);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains(homebrew_dylib));

    let path_is_quoted = msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'");
    assert!(
        path_is_quoted,
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
5710}