1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
/// Default embedding dimension.
/// NOTE(review): not referenced in this part of the file — presumably the
/// output size of the default local model; confirm against callers.
const DEFAULT_DIMENSION: usize = 384;
/// Entry-count ceiling.
/// NOTE(review): not referenced in this part of the file — presumably a
/// sanity bound used when reading a persisted index; confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// NOTE(review): presumably the on-disk header length of the v1 index format — confirm.
const HEADER_BYTES_V1: usize = 9;
/// NOTE(review): presumably the on-disk header length of the v2 index format — confirm.
const HEADER_BYTES_V2: usize = 13;
/// Install hint prepended by `format_embedding_init_error` when the failure
/// looks like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags of the persisted semantic index format.
// NOTE(review): the per-version differences are not visible in this part of
// the file — see the (de)serialization code for what each version adds.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used by `SemanticEmbeddingModel::from_config` when the
/// configured `timeout_ms` is `0` (applied regardless of backend).
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used by `SemanticEmbeddingModel::from_config` when the
/// configured `max_batch_size` is `0`.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which `embed_query_cached` starts
/// evicting the oldest entry before inserting a new one.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint's `base_url` when the config has no
/// base URL or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Sleep (in ms) before retry `n + 1`, indexed by the zero-based index of the
/// attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it tries to
/// take the file lock, so in-process acquisitions are serialized.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
63
/// Handle for the semantic cache lock of one project.
///
/// Wraps the `fs_lock::LockGuard` obtained in [`SemanticIndexLock::acquire`].
/// NOTE(review): the lock is presumably released when the guard is dropped —
/// confirm in `fs_lock`.
pub struct SemanticIndexLock {
    // Held only for its lifetime; never read.
    _guard: fs_lock::LockGuard,
}
67
68impl SemanticIndexLock {
69 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
70 let dir = storage_dir.join("semantic").join(project_key);
71 fs::create_dir_all(&dir)?;
72 let path = dir.join("cache.lock");
73 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
74 .lock()
75 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
76 fs_lock::try_acquire(&path, Duration::from_secs(2))
77 .map(|guard| Self { _guard: guard })
78 .map_err(|error| match error {
79 fs_lock::AcquireError::Timeout => {
80 std::io::Error::other("timed out acquiring semantic cache lock")
81 }
82 fs_lock::AcquireError::Io(error) => error,
83 })
84 }
85}
86
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by [`SemanticIndexFingerprint::as_string`] and compared
/// as a string by `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier (`SemanticBackend::as_str()` of the config).
    pub backend: String,
    /// Embedding model name from the config.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when built
    /// via `from_config` without a usable URL. Deserializes to an empty
    /// string when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the embedding vectors.
    pub dimension: usize,
    /// Chunking scheme version; deserializes to `default_chunking_version()`
    /// when the field is absent.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
97
/// Chunking version stamped on new fingerprints and assumed for serialized
/// fingerprints that lack the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
101
102impl SemanticIndexFingerprint {
103 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
104 let base_url = config
107 .base_url
108 .as_ref()
109 .and_then(|u| normalize_base_url(u).ok())
110 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
111 Self {
112 backend: config.backend.as_str().to_string(),
113 model: config.model.clone(),
114 base_url,
115 dimension,
116 chunking_version: default_chunking_version(),
117 }
118 }
119
120 pub fn as_string(&self) -> String {
121 serde_json::to_string(self).unwrap_or_else(|_| String::new())
122 }
123
124 fn matches_expected(&self, expected: &str) -> bool {
125 let encoded = self.as_string();
126 !encoded.is_empty() && encoded == expected
127 }
128}
129
/// Backend-specific state used by `SemanticEmbeddingModel` to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedding through `LocalEmbedder`.
    Local(LocalEmbedder),
    /// HTTP backend posting to `<base_url>[/v1]/embeddings`
    /// (see `build_openai_embeddings_endpoint`).
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (output of `normalize_base_url`).
        base_url: String,
        /// Sent as a `Bearer` token in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// HTTP backend posting to the Ollama embed endpoint
    /// (see `build_ollama_embeddings_endpoint`).
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (output of `normalize_base_url`).
        base_url: String,
    },
}
146
/// An embedding backend plus its effective settings and a small cache of
/// query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (config value or the default).
    timeout_ms: u64,
    /// Effective maximum batch size (config value or the default).
    max_batch_size: usize,
    /// Cached embedding dimension; `None` until it has been observed.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of the cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
160
/// Alias for [`SemanticEmbeddingModel`].
/// NOTE(review): presumably kept for callers using the older name — confirm.
pub type EmbeddingModel = SemanticEmbeddingModel;
162
163fn validate_embedding_batch(
164 vectors: &[Vec<f32>],
165 expected_count: usize,
166 context: &str,
167) -> Result<(), String> {
168 if expected_count > 0 && vectors.is_empty() {
169 return Err(format!(
170 "{context} returned no vectors for {expected_count} inputs"
171 ));
172 }
173
174 if vectors.len() != expected_count {
175 return Err(format!(
176 "{context} returned {} vectors for {} inputs",
177 vectors.len(),
178 expected_count
179 ));
180 }
181
182 let Some(first_vector) = vectors.first() else {
183 return Ok(());
184 };
185 let expected_dimension = first_vector.len();
186 validate_embedding_dimension(expected_dimension)
187 .map_err(|error| format!("{context} returned {error}"))?;
188 for (index, vector) in vectors.iter().enumerate() {
189 if vector.len() != expected_dimension {
190 return Err(format!(
191 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
192 vector.len()
193 ));
194 }
195 }
196
197 Ok(())
198}
199
200fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
201 if dimension == 0 || dimension > MAX_DIMENSION {
202 return Err(format!(
203 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
204 ));
205 }
206
207 Ok(())
208}
209
210fn normalize_base_url(raw: &str) -> Result<String, String> {
214 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
215 let scheme = parsed.scheme();
216 if scheme != "http" && scheme != "https" {
217 return Err(format!(
218 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
219 scheme
220 ));
221 }
222 Ok(parsed.to_string().trim_end_matches('/').to_string())
223}
224
225pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
240 use std::net::{IpAddr, ToSocketAddrs};
241
242 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
243
244 let host = parsed.host_str().unwrap_or("");
245
246 let is_loopback_host =
251 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
252 if is_loopback_host {
253 return Ok(());
254 }
255
256 if host.ends_with(".local") {
259 return Err(format!(
260 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
261 ));
262 }
263
264 let port = parsed.port_or_known_default().unwrap_or(443);
267 let addr_str = format!("{host}:{port}");
268 let addrs: Vec<IpAddr> = addr_str
269 .to_socket_addrs()
270 .map(|iter| iter.map(|sa| sa.ip()).collect())
271 .unwrap_or_default();
272 for ip in &addrs {
273 if is_private_non_loopback_ip(ip) {
274 return Err(format!(
275 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
276 ));
277 }
278 }
279
280 Ok(())
281}
282
283fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
294 if ip.to_canonical().is_loopback() {
297 return false;
298 }
299 crate::url_fetch::is_private_or_reserved_ip(*ip)
300}
301
302fn build_openai_embeddings_endpoint(base_url: &str) -> String {
303 if base_url.ends_with("/v1") {
304 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
305 } else {
306 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
307 }
308}
309
310fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
311 if base_url.ends_with("/api") {
312 format!("{base_url}/embed")
313 } else {
314 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
315 }
316}
317
/// Trims surrounding whitespace from `value` and maps an absent or blank
/// value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|token| !token.is_empty())
}
328
329fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
330 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
331}
332
333fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
339 if !matches!(
340 status,
341 reqwest::StatusCode::BAD_REQUEST
342 | reqwest::StatusCode::CONFLICT
343 | reqwest::StatusCode::REQUEST_TIMEOUT
344 | reqwest::StatusCode::LOCKED
345 | reqwest::StatusCode::TOO_EARLY
346 ) {
347 return false;
348 }
349
350 let lower = raw.to_ascii_lowercase();
351 let normalized = lower.trim();
352
353 normalized.contains("model was unloaded while the request was still in queue")
354 || normalized == "model is loading"
355 || normalized.starts_with("model is loading,")
356 || normalized.contains(r#""error":"model is loading"#)
357 || normalized.contains(r#""message":"model is loading"#)
358 || normalized == "model not loaded"
359 || normalized.contains(r#""error":"model not loaded""#)
360 || normalized.contains(r#""message":"model not loaded""#)
361 || normalized == "loading model into memory"
362 || normalized.contains(r#""error":"loading model into memory""#)
363 || normalized.contains(r#""message":"loading model into memory""#)
364 || normalized == "model is being loaded"
365 || normalized.contains(r#""error":"model is being loaded""#)
366 || normalized.contains(r#""message":"model is being loaded""#)
367 || normalized == "model is currently loading"
368 || normalized.contains(r#""error":"model is currently loading""#)
369 || normalized.contains(r#""message":"model is currently loading""#)
370}
371
/// True when sending failed at connection time. `send_embedding_request`
/// uses this to decide whether a failed send is retried; note that timeouts
/// are not included here.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
375
/// True for connection failures and timeouts. `send_embedding_request` uses
/// this to decide whether a final send failure is tagged with
/// `TRANSIENT_EMBEDDING_MARKER`.
fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
    error.is_connect() || error.is_timeout()
}
384
/// True when reading the response failed for a reason treated as transient:
/// anything `embedding_send_error_is_transient` accepts, plus body and decode
/// errors.
fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
}
388
/// Prefix that `send_embedding_request` puts on error messages for failures
/// it classifies as transient. Detect it with
/// `embedding_failure_is_transient` and remove it with
/// `strip_transient_embedding_marker`. Note the trailing space.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
396
397pub fn embedding_failure_is_transient(error: &str) -> bool {
400 error.contains(TRANSIENT_EMBEDDING_MARKER)
401}
402
403pub fn strip_transient_embedding_marker(error: &str) -> String {
405 error.replace(TRANSIENT_EMBEDDING_MARKER, "")
406}
407
408fn sleep_before_embedding_retry(attempt_index: usize) {
409 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
410 std::thread::sleep(Duration::from_millis(*delay_ms));
411 }
412}
413
414fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
415where
416 F: FnMut() -> reqwest::blocking::RequestBuilder,
417{
418 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
419 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
420
421 let response = match make_request().send() {
422 Ok(response) => response,
423 Err(error) => {
424 if !last_attempt && is_retryable_embedding_error(&error) {
425 sleep_before_embedding_retry(attempt_index);
426 continue;
427 }
428 let marker = if embedding_send_error_is_transient(&error) {
432 TRANSIENT_EMBEDDING_MARKER
433 } else {
434 ""
435 };
436 return Err(format!("{marker}{backend_label} request failed: {error}"));
437 }
438 };
439
440 let status = response.status();
441 let raw = match response.text() {
442 Ok(raw) => raw,
443 Err(error) => {
444 if !last_attempt && embedding_response_read_error_is_transient(&error) {
445 sleep_before_embedding_retry(attempt_index);
446 continue;
447 }
448 let marker = if embedding_response_read_error_is_transient(&error) {
449 TRANSIENT_EMBEDDING_MARKER
450 } else {
451 ""
452 };
453 return Err(format!(
454 "{marker}{backend_label} response read failed: {error}"
455 ));
456 }
457 };
458
459 if status.is_success() {
460 return Ok(raw);
461 }
462
463 let body_transient = embedding_response_body_is_transient(status, &raw);
467 if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
468 sleep_before_embedding_retry(attempt_index);
469 continue;
470 }
471
472 let marker = if is_retryable_embedding_status(status) || body_transient {
478 TRANSIENT_EMBEDDING_MARKER
479 } else {
480 ""
481 };
482 return Err(format!(
483 "{marker}{backend_label} request failed (HTTP {}): {}",
484 status, raw
485 ));
486 }
487
488 unreachable!("embedding request retries exhausted without returning")
489}
490
491impl SemanticEmbeddingModel {
492 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
493 let timeout_ms = if config.timeout_ms == 0 {
494 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
495 } else {
496 config.timeout_ms
497 };
498
499 let max_batch_size = if config.max_batch_size == 0 {
500 DEFAULT_MAX_BATCH_SIZE
501 } else {
502 config.max_batch_size
503 };
504
505 let api_key_env = normalize_api_key(config.api_key_env.clone());
506 let model = config.model.clone();
507
508 let client = Client::builder()
509 .timeout(Duration::from_millis(timeout_ms))
510 .redirect(reqwest::redirect::Policy::none())
511 .build()
512 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
513
514 let engine = match config.backend {
515 SemanticBackend::Fastembed => {
516 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
517 }
518 SemanticBackend::OpenAiCompatible => {
519 let raw = config.base_url.as_ref().ok_or_else(|| {
520 "base_url is required for openai_compatible backend".to_string()
521 })?;
522 let base_url = normalize_base_url(raw)?;
523
524 let api_key = match api_key_env {
525 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
526 format!("missing api_key_env '{var_name}' for openai_compatible backend")
527 })?),
528 None => None,
529 };
530
531 SemanticEmbeddingEngine::OpenAiCompatible {
532 client,
533 model,
534 base_url,
535 api_key,
536 }
537 }
538 SemanticBackend::Ollama => {
539 let raw = config
540 .base_url
541 .as_ref()
542 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
543 let base_url = normalize_base_url(raw)?;
544
545 SemanticEmbeddingEngine::Ollama {
546 client,
547 model,
548 base_url,
549 }
550 }
551 };
552
553 Ok(Self {
554 backend: config.backend,
555 model: config.model.clone(),
556 base_url: config.base_url.clone(),
557 timeout_ms,
558 max_batch_size,
559 dimension: None,
560 engine,
561 query_embedding_cache: HashMap::new(),
562 query_embedding_cache_order: VecDeque::new(),
563 query_embedding_cache_hits: 0,
564 query_embedding_cache_misses: 0,
565 })
566 }
567
568 pub fn backend(&self) -> SemanticBackend {
569 self.backend
570 }
571
572 pub fn model(&self) -> &str {
573 &self.model
574 }
575
576 pub fn base_url(&self) -> Option<&str> {
577 self.base_url.as_deref()
578 }
579
580 pub fn max_batch_size(&self) -> usize {
581 self.max_batch_size
582 }
583
584 pub fn timeout_ms(&self) -> u64 {
585 self.timeout_ms
586 }
587
588 pub fn fingerprint(
589 &mut self,
590 config: &SemanticBackendConfig,
591 ) -> Result<SemanticIndexFingerprint, String> {
592 let dimension = self.dimension()?;
593 Ok(SemanticIndexFingerprint::from_config(config, dimension))
594 }
595
596 pub fn dimension(&mut self) -> Result<usize, String> {
597 if let Some(dimension) = self.dimension {
598 return Ok(dimension);
599 }
600
601 let dimension = match &mut self.engine {
602 SemanticEmbeddingEngine::Local(model) => {
603 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
604 vectors
605 .first()
606 .map(|v| v.len())
607 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
608 }
609 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
610 let vectors =
611 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
612 vectors
613 .first()
614 .map(|v| v.len())
615 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
616 }
617 SemanticEmbeddingEngine::Ollama { .. } => {
618 let vectors =
619 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
620 vectors
621 .first()
622 .map(|v| v.len())
623 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
624 }
625 };
626
627 self.dimension = Some(dimension);
628 Ok(dimension)
629 }
630
631 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
632 self.embed_texts(texts)
633 }
634
635 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
636 if let Some(vector) = self.query_embedding_cache.get(query) {
637 self.query_embedding_cache_hits += 1;
638 return Ok(vector.clone());
639 }
640
641 self.query_embedding_cache_misses += 1;
642 let embeddings = self.embed_texts(vec![query.to_string()])?;
643 let vector = embeddings
644 .first()
645 .cloned()
646 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
647
648 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
649 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
650 self.query_embedding_cache.remove(&oldest);
651 }
652 }
653 self.query_embedding_cache
654 .insert(query.to_string(), vector.clone());
655 self.query_embedding_cache_order
656 .push_back(query.to_string());
657
658 Ok(vector)
659 }
660
661 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
662 (
663 self.query_embedding_cache_hits,
664 self.query_embedding_cache_misses,
665 self.query_embedding_cache.len(),
666 )
667 }
668
669 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
670 match &mut self.engine {
671 SemanticEmbeddingEngine::Local(model) => model
672 .embed(&texts)
673 .map_err(|error| format!("failed to embed batch: {error}")),
674 SemanticEmbeddingEngine::OpenAiCompatible {
675 client,
676 model,
677 base_url,
678 api_key,
679 } => {
680 let expected_text_count = texts.len();
681 let endpoint = build_openai_embeddings_endpoint(base_url);
682 let body = serde_json::json!({
683 "input": texts,
684 "model": model,
685 });
686
687 let raw = send_embedding_request(
688 || {
689 let mut request = client.post(&endpoint).json(&body);
699
700 if let Some(api_key) = api_key {
701 request = request.header("Authorization", format!("Bearer {api_key}"));
702 }
703
704 request
705 },
706 "openai compatible",
707 )?;
708
709 #[derive(Deserialize)]
710 struct OpenAiResponse {
711 data: Vec<OpenAiEmbeddingResult>,
712 }
713
714 #[derive(Deserialize)]
715 struct OpenAiEmbeddingResult {
716 embedding: Vec<f32>,
717 index: Option<u32>,
718 }
719
720 let parsed: OpenAiResponse = serde_json::from_str(&raw)
721 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
722 if parsed.data.len() != expected_text_count {
723 return Err(format!(
724 "openai compatible response returned {} embeddings for {} inputs",
725 parsed.data.len(),
726 expected_text_count
727 ));
728 }
729
730 let mut vectors = vec![Vec::new(); parsed.data.len()];
731 for (i, item) in parsed.data.into_iter().enumerate() {
732 let index = item.index.unwrap_or(i as u32) as usize;
733 if index >= vectors.len() {
734 return Err(
735 "openai compatible response contains invalid vector index".to_string()
736 );
737 }
738 vectors[index] = item.embedding;
739 }
740
741 for vector in &vectors {
742 if vector.is_empty() {
743 return Err(
744 "openai compatible response contained missing vectors".to_string()
745 );
746 }
747 }
748
749 self.dimension = vectors.first().map(Vec::len);
750 Ok(vectors)
751 }
752 SemanticEmbeddingEngine::Ollama {
753 client,
754 model,
755 base_url,
756 } => {
757 let expected_text_count = texts.len();
758 let endpoint = build_ollama_embeddings_endpoint(base_url);
759
760 #[derive(Serialize)]
761 struct OllamaPayload<'a> {
762 model: &'a str,
763 input: Vec<String>,
764 }
765
766 let payload = OllamaPayload {
767 model,
768 input: texts,
769 };
770
771 let raw = send_embedding_request(
772 || {
773 client.post(&endpoint).json(&payload)
778 },
779 "ollama",
780 )?;
781
782 #[derive(Deserialize)]
783 struct OllamaResponse {
784 embeddings: Vec<Vec<f32>>,
785 }
786
787 let parsed: OllamaResponse = serde_json::from_str(&raw)
788 .map_err(|error| format!("invalid ollama response: {error}"))?;
789 if parsed.embeddings.is_empty() {
790 return Err("ollama response returned no embeddings".to_string());
791 }
792 if parsed.embeddings.len() != expected_text_count {
793 return Err(format!(
794 "ollama response returned {} embeddings for {} inputs",
795 parsed.embeddings.len(),
796 expected_text_count
797 ));
798 }
799
800 let vectors = parsed.embeddings;
801 for vector in &vectors {
802 if vector.is_empty() {
803 return Err("ollama response contained empty embeddings".to_string());
804 }
805 }
806
807 self.dimension = vectors.first().map(Vec::len);
808 Ok(vectors)
809 }
810 }
811 }
812}
813
/// Checks that an ONNX Runtime shared library can be loaded and, when its
/// version can be determined, that it is version 1.x with minor >= 20.
///
/// The library to load is taken from the `ORT_DYLIB_PATH` environment
/// variable, falling back to the platform default name
/// (`libonnxruntime.so`, `libonnxruntime.dylib`, or `onnxruntime.dll`).
/// The library is loaded only for the duration of the check and is unloaded
/// again before returning.
///
/// On targets other than Linux, macOS and Windows no check is compiled in
/// and the function returns `Ok(())`.
///
/// # Errors
/// Returns a diagnostic message when the library cannot be loaded, or when a
/// version was detected and it is not 1.20 or newer within the 1.x line. An
/// undetectable or unparsable version is not an error.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated C string that lives across the
        // `dlopen` call; `handle` is checked for null before any use and is
        // closed exactly once, after the last call that receives it.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                // `dlerror` may itself return null when no message is available.
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Must run while the library is still loaded: the helper resolves
            // the on-disk path through the live handle.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // Only enforce the requirement when both major and minor parse.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Minimal hand-written bindings for the Win32 calls used below.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-version record; only
        // `dw_file_version_ms` is read below (high word = major, low word = minor).
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: every pointer handed to the Win32 calls refers to a live
        // local buffer (`wide`, `path_buf`, `info`, `sub_block`) of the size
        // passed alongside it; `handle` is checked for null and freed once;
        // `vs_info` is dereferenced only after `VerQueryValueW` reported
        // success with a non-null pointer, while `info` is still alive.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            // NUL-terminated UTF-16 copy of the library name.
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 doubles as "version unknown" below.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            // Resolve the path of the module that was actually loaded, then
            // read the version resource from that file.
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" selects the root block (the fixed file info).
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            // Skip the check entirely when no version was detected.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1004
/// Returns the file path the dynamic loader reports for the library behind
/// `handle`, found by looking up the `OrtGetApiBase` symbol and asking
/// `dladdr` which object contains it.
///
/// Returns `None` when the symbol is missing, `dladdr` fails, or no file
/// name is reported. Non-UTF-8 path bytes are converted lossily.
///
/// # Safety
/// `handle` must be a handle accepted by `dlsym` — in this file it comes
/// from a successful `dlopen` that has not been closed yet.
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    // `dladdr` returns 0 on failure, in which case `info` stays uninitialized.
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    // Only reached after `dladdr` reported success and filled `info`.
    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    Some(
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1029
1030#[cfg(any(target_os = "linux", target_os = "macos"))]
1031fn detect_ort_version_from_resolved_or_requested(
1032 resolved_path: Option<String>,
1033 requested_lib_name: &str,
1034) -> (Option<String>, String) {
1035 if let Some(path) = resolved_path {
1036 if let Some(version) = detect_ort_version_from_path(&path) {
1037 return (Some(version), path);
1038 }
1039 return (detect_ort_version_from_path(requested_lib_name), path);
1040 }
1041
1042 (
1043 detect_ort_version_from_path(requested_lib_name),
1044 requested_lib_name.to_string(),
1045 )
1046}
1047
1048#[cfg(any(target_os = "linux", target_os = "macos"))]
1049fn detect_ort_version_from_loaded_library(
1050 handle: *mut std::ffi::c_void,
1051 requested_lib_name: &str,
1052) -> (Option<String>, String) {
1053 detect_ort_version_from_resolved_or_requested(
1054 unsafe { loaded_library_path_from_handle(handle) },
1055 requested_lib_name,
1056 )
1057}
1058
1059#[cfg(any(target_os = "linux", target_os = "macos"))]
1062fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1063 let path = std::path::Path::new(lib_path);
1064
1065 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1067 .into_iter()
1068 .flatten()
1069 {
1070 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1071 if let Some(version) = extract_version_from_filename(name) {
1072 return Some(version);
1073 }
1074 }
1075 }
1076
1077 if let Some(parent) = path.parent() {
1079 if let Ok(entries) = std::fs::read_dir(parent) {
1080 for entry in entries.flatten() {
1081 if let Some(name) = entry.file_name().to_str() {
1082 if name.starts_with("libonnxruntime") {
1083 if let Some(version) = extract_version_from_filename(name) {
1084 return Some(version);
1085 }
1086 }
1087 }
1088 }
1089 }
1090 }
1091
1092 None
1093}
1094
1095#[cfg(any(target_os = "linux", target_os = "macos"))]
1097fn extract_version_from_filename(name: &str) -> Option<String> {
1098 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1100 re.find(name).map(|m| m.as_str().to_string())
1101}
1102
/// Builds the shell command suggested for removing an outdated ONNX Runtime.
///
/// For libraries under `/usr/local/lib`, or requested by the bare default
/// name, Linux and macOS get a `sudo rm` of the whole `libonnxruntime*`
/// family (plus `ldconfig` on Linux). In every other case — including other
/// operating systems — the command removes exactly `lib_path`.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_wide_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }
    format!(" rm '{}'", lib_path)
}
1115
1116pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1122 format!(
1123 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1124 Solutions:\n\
1125 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1126 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1127 configures the bridge to load it instead of the system library — no \
1128 changes to '{}'.\n\
1129 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1130 {}\n\
1131 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1132 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1133 version,
1134 lib_name,
1135 lib_name,
1136 suggest_removal_command(lib_name),
1137 )
1138}
1139
/// Heuristically decides whether `message` describes a missing or unloadable
/// ONNX Runtime library.
///
/// True when the message (ignoring leading whitespace) starts with
/// `"ONNX Runtime not found."`, or when — case-insensitively — it both names
/// ONNX Runtime and contains a phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Fast path for the messages this module produces itself.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1165
1166pub fn format_embedding_init_error(error: impl Display) -> String {
1167 let message = error.to_string();
1168
1169 if is_onnx_runtime_unavailable(&message) {
1170 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1171 }
1172
1173 format!("failed to initialize semantic embedding model: {message}")
1174}
1175
/// One indexable unit of source code together with the text that gets embedded.
///
/// NOTE(review): field meanings below are inferred from names and types; the
/// code that populates them is outside this part of the file.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to.
    pub file: PathBuf,
    /// Name of the chunk (presumably the symbol name — confirm).
    pub name: String,
    /// Kind of symbol the chunk represents.
    pub kind: SymbolKind,
    /// First line of the chunk (presumably 1-based — TODO confirm).
    pub start_line: u32,
    /// Last line of the chunk (presumably inclusive — TODO confirm).
    pub end_line: u32,
    /// Whether the symbol is exported (presumably as reported by the parser).
    pub exported: bool,
    /// Text handed to the embedding model for this chunk.
    pub embed_text: String,
    /// Source excerpt carried along for display in results.
    pub snippet: String,
}
1195
/// A chunk paired with its embedding vector.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
1202
/// In-memory semantic index: embedded chunks plus per-file bookkeeping.
///
/// NOTE(review): how the per-file maps are used for freshness checks is not
/// visible in this part of the file; the field notes below are inferred from
/// names and types.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// All embedded chunks.
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded per indexed file; its length is what
    /// `indexed_file_count` reports.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// File size recorded per indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// BLAKE3 content hash recorded per indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Length of the embedding vectors in this index.
    dimension: usize,
    /// Embedding setup the index was built with, when known.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; `new` debug-asserts that it is absolute.
    project_root: PathBuf,
    /// Files whose processing has been postponed (presumably to a later
    /// refresh — confirm against the refresh code).
    deferred_files: HashSet<PathBuf>,
}
1218
/// Per-file metadata captured at indexing time: modification time, size in
/// bytes (presumably — confirm), and BLAKE3 content hash.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}
1225
/// Counters describing what a refresh pass did to the index.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were successfully re-collected.
    pub changed: usize,
    /// Files that were not indexed before and were successfully collected.
    pub added: usize,
    /// Files dropped from the index.
    pub deleted: usize,
    /// Number of files the pass looked at.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when the refresh changed, added and deleted nothing.
    ///
    /// `total_processed` is deliberately ignored: a pass can inspect many
    /// files and still leave the index untouched.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1242
/// Result of `SemanticIndex::refresh_invalidated_files`, shaped so the same
/// changes can be replayed on another index via `apply_refresh_update`.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Entries that were added to the index by the refresh.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh freshness metadata for every file that was successfully collected.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh handled (requested plus previously deferred).
    pub completed_paths: Vec<PathBuf>,
    /// Counters for the refresh.
    pub summary: RefreshSummary,
}
1253
/// An already-computed embedding that may be reused when a chunk's text is unchanged.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was computed from; compared in full before reuse.
    embed_text: String,
    /// The cached embedding vector.
    vector: Vec<f32>,
}

/// Reuse candidates keyed by file, then by the blake3 hash of the chunk's embed text.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1261
/// One hit returned by a semantic search.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched chunk.
    pub file: PathBuf,
    /// Name of the matched chunk.
    pub name: String,
    /// Symbol kind of the matched chunk.
    pub kind: SymbolKind,
    /// First line of the matched chunk.
    pub start_line: u32,
    /// Last line of the matched chunk.
    pub end_line: u32,
    /// Whether the matched symbol is exported.
    pub exported: bool,
    /// Snippet text of the matched chunk.
    pub snippet: String,
    /// Ranking score; `SemanticIndex::search` uses cosine similarity with a
    /// 1.1x boost for exported chunks.
    pub score: f32,
    /// Label of the producer of this result (`"semantic"` for `SemanticIndex::search`).
    pub source: &'static str,
}
1275
1276impl SemanticIndex {
1277 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1278 debug_assert!(project_root.is_absolute());
1279 Self {
1280 entries: Vec::new(),
1281 file_mtimes: HashMap::new(),
1282 file_sizes: HashMap::new(),
1283 file_hashes: HashMap::new(),
1284 dimension,
1285 fingerprint: None,
1286 project_root,
1287 deferred_files: HashSet::new(),
1288 }
1289 }
1290
/// Number of embedded chunks currently held by the index.
pub fn entry_count(&self) -> usize {
    self.entries.len()
}
1295
/// Number of files with recorded metadata (counted via the mtime map), which
/// can include files that produced no chunks.
pub fn indexed_file_count(&self) -> usize {
    self.file_mtimes.len()
}
1300
1301 pub fn status_label(&self) -> &'static str {
1303 if self.entries.is_empty() {
1304 "empty"
1305 } else {
1306 "ready"
1307 }
1308 }
1309
1310 fn collect_chunks(
1311 project_root: &Path,
1312 files: &[PathBuf],
1313 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1314 let collect_started = std::time::Instant::now();
1315 let per_file: Vec<(
1316 PathBuf,
1317 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1318 )> = files
1319 .par_iter()
1320 .map_init(HashMap::new, |parsers, file| {
1321 let result = collect_file_metadata(file).and_then(|metadata| {
1322 collect_file_chunks(project_root, file, parsers)
1323 .map(|chunks| (metadata, chunks))
1324 });
1325 (file.clone(), result)
1326 })
1327 .collect();
1328
1329 let mut chunks: Vec<SemanticChunk> = Vec::new();
1330 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1331
1332 for (file, result) in per_file {
1333 match result {
1334 Ok((metadata, file_chunks)) => {
1335 file_metadata.insert(file, metadata);
1336 chunks.extend(file_chunks);
1337 }
1338 Err(error) => {
1339 if error == "unsupported file extension" {
1345 continue;
1346 }
1347 slog_warn!(
1348 "failed to collect semantic chunks for {}: {}",
1349 file.display(),
1350 error
1351 );
1352 }
1353 }
1354 }
1355
1356 slog_info!(
1357 "semantic collect: {} chunks from {} files in {} ms",
1358 chunks.len(),
1359 file_metadata.len(),
1360 collect_started.elapsed().as_millis()
1361 );
1362
1363 (chunks, file_metadata)
1364 }
1365
1366 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1367 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1368 let mut reuse_map: ChunkReuseMap = HashMap::new();
1369
1370 for entry in &self.entries {
1371 if !requested.contains(entry.chunk.file.as_path()) {
1372 continue;
1373 }
1374
1375 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1380 reuse_map
1381 .entry(entry.chunk.file.clone())
1382 .or_default()
1383 .entry(hash)
1384 .or_default()
1385 .push(ReusableEmbedding {
1386 embed_text: entry.chunk.embed_text.clone(),
1387 vector: entry.vector.clone(),
1388 });
1389 }
1390
1391 reuse_map
1392 }
1393
1394 fn reusable_vector_for_chunk(
1395 reuse_map: &ChunkReuseMap,
1396 chunk: &SemanticChunk,
1397 ) -> Option<Vec<f32>> {
1398 let hash = blake3::hash(chunk.embed_text.as_bytes());
1399 reuse_map
1400 .get(&chunk.file)?
1401 .get(&hash)?
1402 .iter()
1403 .find(|candidate| candidate.embed_text == chunk.embed_text)
1404 .map(|candidate| candidate.vector.clone())
1405 }
1406
1407 fn entries_for_chunks_with_reuse<F, P>(
1408 chunks: Vec<SemanticChunk>,
1409 reuse_map: &ChunkReuseMap,
1410 embed_fn: &mut F,
1411 max_batch_size: usize,
1412 initial_observed_dimension: Option<usize>,
1413 refresh_label: &str,
1414 progress: &mut P,
1415 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1416 where
1417 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1418 P: FnMut(usize, usize),
1419 {
1420 let total_chunks = chunks.len();
1421 progress(0, total_chunks);
1422
1423 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1424 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1425
1426 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1427 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1428 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1429 } else {
1430 misses.push((chunk_index, chunk));
1431 }
1432 }
1433
1434 let mut completed = total_chunks.saturating_sub(misses.len());
1435 if completed > 0 {
1436 progress(completed, total_chunks);
1437 }
1438
1439 let batch_size = max_batch_size.max(1);
1440 let mut observed_dimension = initial_observed_dimension;
1441
1442 for batch_start in (0..misses.len()).step_by(batch_size) {
1443 let batch_end = (batch_start + batch_size).min(misses.len());
1444 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1445 .iter()
1446 .map(|(_, chunk)| chunk.embed_text.clone())
1447 .collect();
1448
1449 let vectors = embed_fn(batch_texts)?;
1450 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1451
1452 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1453 match observed_dimension {
1454 None => observed_dimension = Some(dim),
1455 Some(expected) if dim != expected => {
1456 return Err(format!(
1457 "embedding dimension changed during {refresh_label}: \
1458 cached index uses {expected}, new vectors use {dim}"
1459 ));
1460 }
1461 _ => {}
1462 }
1463 }
1464
1465 for (i, vector) in vectors.into_iter().enumerate() {
1466 let (chunk_index, chunk) = misses[batch_start + i].clone();
1467 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1468 }
1469
1470 completed += batch_end - batch_start;
1471 progress(completed, total_chunks);
1472 }
1473
1474 let entries = entries_by_chunk
1475 .into_iter()
1476 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1477 .collect();
1478
1479 Ok((entries, observed_dimension))
1480 }
1481
1482 fn build_from_chunks<F, P>(
1483 project_root: &Path,
1484 chunks: Vec<SemanticChunk>,
1485 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1486 embed_fn: &mut F,
1487 max_batch_size: usize,
1488 mut progress: Option<&mut P>,
1489 ) -> Result<Self, String>
1490 where
1491 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1492 P: FnMut(usize, usize),
1493 {
1494 debug_assert!(project_root.is_absolute());
1495 let total_chunks = chunks.len();
1496
1497 if chunks.is_empty() {
1498 return Ok(Self {
1499 entries: Vec::new(),
1500 file_mtimes: file_metadata
1501 .iter()
1502 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1503 .collect(),
1504 file_sizes: file_metadata
1505 .iter()
1506 .map(|(path, metadata)| (path.clone(), metadata.size))
1507 .collect(),
1508 file_hashes: file_metadata
1509 .into_iter()
1510 .map(|(path, metadata)| (path, metadata.content_hash))
1511 .collect(),
1512 dimension: DEFAULT_DIMENSION,
1513 fingerprint: None,
1514 project_root: project_root.to_path_buf(),
1515 deferred_files: HashSet::new(),
1516 });
1517 }
1518
1519 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1521 let mut expected_dimension: Option<usize> = None;
1522 let batch_size = max_batch_size.max(1);
1523 let embed_started = std::time::Instant::now();
1524 let batch_count = total_chunks.div_ceil(batch_size);
1525 for batch_start in (0..chunks.len()).step_by(batch_size) {
1526 let batch_end = (batch_start + batch_size).min(chunks.len());
1527 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1528 .iter()
1529 .map(|c| c.embed_text.clone())
1530 .collect();
1531
1532 let vectors = embed_fn(batch_texts)?;
1533 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1534
1535 if let Some(dim) = vectors.first().map(|v| v.len()) {
1537 match expected_dimension {
1538 None => expected_dimension = Some(dim),
1539 Some(expected) if dim != expected => {
1540 return Err(format!(
1541 "embedding dimension changed across batches: expected {expected}, got {dim}"
1542 ));
1543 }
1544 _ => {}
1545 }
1546 }
1547
1548 for (i, vector) in vectors.into_iter().enumerate() {
1549 let chunk_idx = batch_start + i;
1550 entries.push(EmbeddingEntry {
1551 chunk: chunks[chunk_idx].clone(),
1552 vector,
1553 });
1554 }
1555
1556 if let Some(callback) = progress.as_mut() {
1557 callback(entries.len(), total_chunks);
1558 }
1559 }
1560
1561 let embed_ms = embed_started.elapsed().as_millis();
1562 let rate = (total_chunks as u128 * 1000)
1563 .checked_div(embed_ms)
1564 .unwrap_or(0) as u64;
1565 slog_info!(
1566 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1567 total_chunks,
1568 batch_count,
1569 embed_ms,
1570 rate
1571 );
1572
1573 let dimension = entries
1574 .first()
1575 .map(|e| e.vector.len())
1576 .unwrap_or(DEFAULT_DIMENSION);
1577
1578 Ok(Self {
1579 entries,
1580 file_mtimes: file_metadata
1581 .iter()
1582 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1583 .collect(),
1584 file_sizes: file_metadata
1585 .iter()
1586 .map(|(path, metadata)| (path.clone(), metadata.size))
1587 .collect(),
1588 file_hashes: file_metadata
1589 .into_iter()
1590 .map(|(path, metadata)| (path, metadata.content_hash))
1591 .collect(),
1592 dimension,
1593 fingerprint: None,
1594 project_root: project_root.to_path_buf(),
1595 deferred_files: HashSet::new(),
1596 })
1597 }
1598
1599 pub fn build<F>(
1602 project_root: &Path,
1603 files: &[PathBuf],
1604 embed_fn: &mut F,
1605 max_batch_size: usize,
1606 ) -> Result<Self, String>
1607 where
1608 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1609 {
1610 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1611 Self::build_from_chunks(
1612 project_root,
1613 chunks,
1614 file_mtimes,
1615 embed_fn,
1616 max_batch_size,
1617 Option::<&mut fn(usize, usize)>::None,
1618 )
1619 }
1620
1621 pub fn build_with_progress<F, P>(
1623 project_root: &Path,
1624 files: &[PathBuf],
1625 embed_fn: &mut F,
1626 max_batch_size: usize,
1627 progress: &mut P,
1628 ) -> Result<Self, String>
1629 where
1630 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631 P: FnMut(usize, usize),
1632 {
1633 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634 let total_chunks = chunks.len();
1635 progress(0, total_chunks);
1636 Self::build_from_chunks(
1637 project_root,
1638 chunks,
1639 file_mtimes,
1640 embed_fn,
1641 max_batch_size,
1642 Some(progress),
1643 )
1644 }
1645
1646 pub fn refresh_stale_files<F, P>(
1657 &mut self,
1658 project_root: &Path,
1659 current_files: &[PathBuf],
1660 embed_fn: &mut F,
1661 max_batch_size: usize,
1662 progress: &mut P,
1663 ) -> Result<RefreshSummary, String>
1664 where
1665 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1666 P: FnMut(usize, usize),
1667 {
1668 self.backfill_missing_file_sizes();
1669
1670 let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1672 self.deferred_files
1673 .retain(|path| current_set.contains(path.as_path()));
1674 let total_processed = current_set.len() + self.file_mtimes.len()
1675 - self
1676 .file_mtimes
1677 .keys()
1678 .filter(|path| current_set.contains(path.as_path()))
1679 .count();
1680
1681 enum IndexedFileCheck {
1684 Deleted(PathBuf),
1685 MissingMetadata(PathBuf),
1686 Verified(PathBuf, FreshnessVerdict),
1687 }
1688
1689 let mut deleted: Vec<PathBuf> = Vec::new();
1690 let mut changed: Vec<PathBuf> = Vec::new();
1691 let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1692 let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
1693 let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();
1694
1695 for indexed_path in indexed_paths {
1696 let check_index = checks.len();
1697 if !current_set.contains(indexed_path.as_path()) {
1698 checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
1699 continue;
1700 }
1701 let cached = match (
1702 self.file_mtimes.get(&indexed_path),
1703 self.file_sizes.get(&indexed_path),
1704 self.file_hashes.get(&indexed_path),
1705 ) {
1706 (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1707 mtime: *mtime,
1708 size: *size,
1709 content_hash: *hash,
1710 }),
1711 _ => None,
1712 };
1713 if let Some(freshness) = cached {
1714 strict_verify_inputs.push((check_index, indexed_path, freshness));
1715 checks.push(None);
1716 } else {
1717 checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
1718 }
1719 }
1720
1721 for (check_index, path, verdict) in
1722 cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
1723 {
1724 checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
1725 }
1726
1727 for check in checks {
1728 match check.expect("strict freshness check should be populated") {
1729 IndexedFileCheck::Deleted(path) => deleted.push(path),
1730 IndexedFileCheck::MissingMetadata(path) => changed.push(path),
1731 IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
1732 IndexedFileCheck::Verified(
1733 path,
1734 FreshnessVerdict::ContentFresh {
1735 new_mtime,
1736 new_size,
1737 },
1738 ) => {
1739 self.file_mtimes.insert(path.clone(), new_mtime);
1740 self.file_sizes.insert(path, new_size);
1741 }
1742 IndexedFileCheck::Verified(
1743 path,
1744 FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
1745 ) => {
1746 changed.push(path);
1747 }
1748 }
1749 }
1750
1751 let mut added: Vec<PathBuf> = Vec::new();
1753 for path in current_files {
1754 if !self.file_mtimes.contains_key(path) {
1755 added.push(path.clone());
1756 }
1757 }
1758
1759 if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1761 progress(0, 0);
1762 return Ok(RefreshSummary {
1763 total_processed,
1764 ..RefreshSummary::default()
1765 });
1766 }
1767
1768 if !deleted.is_empty() {
1772 self.remove_indexed_files(&deleted);
1773 }
1774
1775 let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1777 to_embed.extend(changed.iter().cloned());
1778 to_embed.extend(added.iter().cloned());
1779
1780 if to_embed.is_empty() {
1781 progress(0, 0);
1783 return Ok(RefreshSummary {
1784 changed: 0,
1785 added: 0,
1786 deleted: deleted.len(),
1787 total_processed,
1788 });
1789 }
1790
1791 let reuse_map = self.build_chunk_reuse_map(&changed);
1792 let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1793 let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1794 let vanished = to_embed
1795 .iter()
1796 .filter(|path| {
1797 changed_set.contains(path.as_path())
1798 && !fresh_metadata.contains_key(*path)
1799 && !path.exists()
1800 })
1801 .cloned()
1802 .collect::<Vec<_>>();
1803 if !vanished.is_empty() {
1804 self.remove_indexed_files(&vanished);
1805 deleted.extend(vanished);
1806 }
1807
1808 if chunks.is_empty() {
1809 progress(0, 0);
1810 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1811 for file in &successful_files {
1812 self.deferred_files.remove(file);
1813 }
1814 if !successful_files.is_empty() {
1815 self.entries
1816 .retain(|entry| !successful_files.contains(&entry.chunk.file));
1817 }
1818 let changed_count = changed
1819 .iter()
1820 .filter(|path| successful_files.contains(*path))
1821 .count();
1822 let added_count = added
1823 .iter()
1824 .filter(|path| successful_files.contains(*path))
1825 .count();
1826 for (file, metadata) in fresh_metadata {
1827 self.file_mtimes.insert(file.clone(), metadata.mtime);
1828 self.file_sizes.insert(file.clone(), metadata.size);
1829 self.file_hashes.insert(file.clone(), metadata.content_hash);
1830 }
1831 return Ok(RefreshSummary {
1832 changed: changed_count,
1833 added: added_count,
1834 deleted: deleted.len(),
1835 total_processed,
1836 });
1837 }
1838
1839 let existing_dimension = if self.entries.is_empty() {
1842 None
1843 } else {
1844 Some(self.dimension)
1845 };
1846 let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
1847 chunks,
1848 &reuse_map,
1849 embed_fn,
1850 max_batch_size,
1851 existing_dimension,
1852 "incremental refresh",
1853 progress,
1854 )?;
1855
1856 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1857 for file in &successful_files {
1858 self.deferred_files.remove(file);
1859 }
1860 if !successful_files.is_empty() {
1861 self.entries
1862 .retain(|entry| !successful_files.contains(&entry.chunk.file));
1863 }
1864
1865 self.entries.extend(new_entries);
1866 for (file, metadata) in fresh_metadata {
1867 self.file_mtimes.insert(file.clone(), metadata.mtime);
1868 self.file_sizes.insert(file.clone(), metadata.size);
1869 self.file_hashes.insert(file, metadata.content_hash);
1870 }
1871 if let Some(dim) = observed_dimension {
1872 self.dimension = dim;
1873 }
1874
1875 Ok(RefreshSummary {
1876 changed: changed
1877 .iter()
1878 .filter(|path| successful_files.contains(*path))
1879 .count(),
1880 added: added
1881 .iter()
1882 .filter(|path| successful_files.contains(*path))
1883 .count(),
1884 deleted: deleted.len(),
1885 total_processed,
1886 })
1887 }
1888
1889 pub fn refresh_invalidated_files<F, P>(
1896 &mut self,
1897 project_root: &Path,
1898 paths: &[PathBuf],
1899 embed_fn: &mut F,
1900 max_batch_size: usize,
1901 max_files: usize,
1902 progress: &mut P,
1903 ) -> Result<InvalidatedFilesRefresh, String>
1904 where
1905 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1906 P: FnMut(usize, usize),
1907 {
1908 self.backfill_missing_file_sizes();
1909
1910 self.deferred_files.retain(|path| path.exists());
1911 let mut requested_paths = paths.to_vec();
1912 requested_paths.extend(self.deferred_files.iter().cloned());
1913 requested_paths.sort();
1914 requested_paths.dedup();
1915 let total_processed = requested_paths.len();
1916
1917 if requested_paths.is_empty() {
1918 progress(0, 0);
1919 return Ok(InvalidatedFilesRefresh {
1920 summary: RefreshSummary {
1921 total_processed,
1922 ..RefreshSummary::default()
1923 },
1924 ..InvalidatedFilesRefresh::default()
1925 });
1926 }
1927
1928 let previously_indexed: HashSet<PathBuf> = requested_paths
1929 .iter()
1930 .filter(|path| self.file_mtimes.contains_key(*path))
1931 .cloned()
1932 .collect();
1933 let reuse_map = self.build_chunk_reuse_map(&requested_paths);
1934
1935 self.remove_indexed_files(&requested_paths);
1939
1940 let existing_paths = requested_paths
1941 .iter()
1942 .filter(|path| path.exists())
1943 .cloned()
1944 .collect::<Vec<_>>();
1945 let deleted = requested_paths
1946 .iter()
1947 .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1948 .count();
1949
1950 if existing_paths.is_empty() {
1951 for path in &requested_paths {
1952 if !path.exists() {
1953 self.deferred_files.remove(path);
1954 }
1955 }
1956 progress(0, 0);
1957 return Ok(InvalidatedFilesRefresh {
1958 completed_paths: requested_paths,
1959 summary: RefreshSummary {
1960 deleted,
1961 total_processed,
1962 ..RefreshSummary::default()
1963 },
1964 ..InvalidatedFilesRefresh::default()
1965 });
1966 }
1967
1968 let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1969
1970 let retained_file_count = self.file_mtimes.len();
1971 let changed_successful_count = existing_paths
1972 .iter()
1973 .filter(|path| {
1974 previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1975 })
1976 .count();
1977 let available_new_files =
1978 max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1979 let new_successful_files = existing_paths
1980 .iter()
1981 .filter(|path| {
1982 !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1983 })
1984 .cloned()
1985 .collect::<Vec<_>>();
1986 if new_successful_files.len() > available_new_files {
1987 let allowed_new_files = new_successful_files
1988 .iter()
1989 .take(available_new_files)
1990 .cloned()
1991 .collect::<HashSet<_>>();
1992 let deferred_new_files = new_successful_files
1993 .into_iter()
1994 .filter(|path| !allowed_new_files.contains(path))
1995 .collect::<HashSet<_>>();
1996
1997 fresh_metadata.retain(|file, _| {
1998 previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1999 });
2000 chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
2001
2002 if !deferred_new_files.is_empty() {
2003 for path in &deferred_new_files {
2004 self.deferred_files.insert(path.clone());
2005 }
2006 slog_warn!(
2007 "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
2008 deferred_new_files.len(),
2009 max_files
2010 );
2011 }
2012 }
2013
2014 let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
2015 for file in &successful_files {
2016 self.deferred_files.remove(file);
2017 }
2018 let changed = successful_files
2019 .iter()
2020 .filter(|path| previously_indexed.contains(path.as_path()))
2021 .count();
2022 let added = successful_files.len().saturating_sub(changed);
2023 let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
2024
2025 if chunks.is_empty() {
2026 progress(0, 0);
2027 for (file, metadata) in fresh_metadata {
2028 let freshness = FileFreshness {
2029 mtime: metadata.mtime,
2030 size: metadata.size,
2031 content_hash: metadata.content_hash,
2032 };
2033 self.file_mtimes.insert(file.clone(), freshness.mtime);
2034 self.file_sizes.insert(file.clone(), freshness.size);
2035 self.file_hashes
2036 .insert(file.clone(), freshness.content_hash);
2037 updated_metadata.push((file, freshness));
2038 }
2039
2040 return Ok(InvalidatedFilesRefresh {
2041 updated_metadata,
2042 completed_paths: requested_paths,
2043 summary: RefreshSummary {
2044 changed,
2045 added,
2046 deleted,
2047 total_processed,
2048 },
2049 ..InvalidatedFilesRefresh::default()
2050 });
2051 }
2052
2053 let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
2054 {
2055 None
2056 } else {
2057 Some(self.dimension)
2058 };
2059 let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
2060 chunks,
2061 &reuse_map,
2062 embed_fn,
2063 max_batch_size,
2064 initial_observed_dimension,
2065 "invalidated-file refresh",
2066 progress,
2067 )?;
2068
2069 let added_entries = new_entries.clone();
2070 self.entries.extend(new_entries);
2071 for (file, metadata) in fresh_metadata {
2072 let freshness = FileFreshness {
2073 mtime: metadata.mtime,
2074 size: metadata.size,
2075 content_hash: metadata.content_hash,
2076 };
2077 self.file_mtimes.insert(file.clone(), freshness.mtime);
2078 self.file_sizes.insert(file.clone(), freshness.size);
2079 self.file_hashes
2080 .insert(file.clone(), freshness.content_hash);
2081 updated_metadata.push((file, freshness));
2082 }
2083 if let Some(dim) = observed_dimension {
2084 self.dimension = dim;
2085 }
2086
2087 Ok(InvalidatedFilesRefresh {
2088 added_entries,
2089 updated_metadata,
2090 completed_paths: requested_paths,
2091 summary: RefreshSummary {
2092 changed,
2093 added,
2094 deleted,
2095 total_processed,
2096 },
2097 })
2098 }
2099
2100 pub fn apply_refresh_update(
2101 &mut self,
2102 added_entries: Vec<EmbeddingEntry>,
2103 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2104 completed_paths: &[PathBuf],
2105 ) {
2106 self.remove_indexed_files(completed_paths);
2110
2111 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2112 self.entries.extend(added_entries);
2113 for (file, freshness) in updated_metadata {
2114 self.file_mtimes.insert(file.clone(), freshness.mtime);
2115 self.file_sizes.insert(file.clone(), freshness.size);
2116 self.file_hashes.insert(file, freshness.content_hash);
2117 }
2118 if let Some(dim) = observed_dimension {
2119 self.dimension = dim;
2120 }
2121 }
2122
2123 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2124 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2125 self.entries
2126 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2127 for path in files {
2128 self.file_mtimes.remove(path);
2129 self.file_sizes.remove(path);
2130 self.file_hashes.remove(path);
2131 }
2132 }
2133
2134 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2136 if self.entries.is_empty() || query_vector.len() != self.dimension {
2137 return Vec::new();
2138 }
2139
2140 let mut scored: Vec<(f32, usize)> = self
2141 .entries
2142 .iter()
2143 .enumerate()
2144 .map(|(i, entry)| {
2145 let mut score = cosine_similarity(query_vector, &entry.vector);
2146 if entry.chunk.exported {
2147 score *= 1.1;
2148 }
2149 (score, i)
2150 })
2151 .collect();
2152
2153 let keep = top_k.min(scored.len());
2154 if keep == 0 {
2155 return Vec::new();
2156 }
2157
2158 if keep < scored.len() {
2159 scored.select_nth_unstable_by(keep, semantic_score_order);
2160 scored.truncate(keep);
2161 }
2162 scored.sort_by(semantic_score_order);
2163
2164 scored
2165 .into_iter()
2166 .map(|(score, idx)| {
2170 let entry = &self.entries[idx];
2171 SemanticResult {
2172 file: entry.chunk.file.clone(),
2173 name: entry.chunk.name.clone(),
2174 kind: entry.chunk.kind.clone(),
2175 start_line: entry.chunk.start_line,
2176 end_line: entry.chunk.end_line,
2177 exported: entry.chunk.exported,
2178 snippet: entry.chunk.snippet.clone(),
2179 score,
2180 source: "semantic",
2181 }
2182 })
2183 .collect()
2184 }
2185
/// Number of embedded chunks in the index (same value as `entry_count`).
pub fn len(&self) -> usize {
    self.entries.len()
}
2190
2191 pub fn is_file_stale(&self, file: &Path) -> bool {
2193 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2194 return true;
2195 };
2196 let Some(stored_size) = self.file_sizes.get(file) else {
2197 return true;
2198 };
2199 let Some(stored_hash) = self.file_hashes.get(file) else {
2200 return true;
2201 };
2202 let cached = FileFreshness {
2203 mtime: *stored_mtime,
2204 size: *stored_size,
2205 content_hash: *stored_hash,
2206 };
2207 match cache_freshness::verify_file_strict(file, &cached) {
2208 FreshnessVerdict::HotFresh => false,
2209 FreshnessVerdict::ContentFresh { .. } => false,
2210 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2211 }
2212 }
2213
2214 fn backfill_missing_file_sizes(&mut self) {
2215 for path in self.file_mtimes.keys() {
2216 if self.file_sizes.contains_key(path) {
2217 continue;
2218 }
2219 if let Ok(metadata) = fs::metadata(path) {
2220 self.file_sizes.insert(path.clone(), metadata.len());
2221 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2222 self.file_hashes.insert(path.clone(), hash);
2223 }
2224 }
2225 }
2226 }
2227
/// Removes `file` from the index; delegates to `invalidate_file`.
pub fn remove_file(&mut self, file: &Path) {
    self.invalidate_file(file);
}
2232
2233 pub fn invalidate_file(&mut self, file: &Path) {
2234 let canonical_file = canonicalize_existing_or_deleted_path(file);
2235 self.entries
2236 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2237 self.file_mtimes.remove(file);
2238 self.file_sizes.remove(file);
2239 self.file_hashes.remove(file);
2240 if canonical_file.as_path() != file {
2241 self.file_mtimes.remove(&canonical_file);
2242 self.file_sizes.remove(&canonical_file);
2243 self.file_hashes.remove(&canonical_file);
2244 }
2245 }
2246
/// Vector length this index expects from query vectors.
pub fn dimension(&self) -> usize {
    self.dimension
}
2251
/// The fingerprint attached to this index, if one has been set.
pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
    self.fingerprint.as_ref()
}
2255
2256 pub fn backend_label(&self) -> Option<&str> {
2257 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2258 }
2259
2260 pub fn model_label(&self) -> Option<&str> {
2261 self.fingerprint.as_ref().map(|f| f.model.as_str())
2262 }
2263
/// Attaches `fingerprint` to the index, replacing any previous one.
pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
    self.fingerprint = Some(fingerprint);
}
2267
2268 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2270 if self.entries.is_empty() {
2273 slog_info!("skipping semantic index persistence (0 entries)");
2274 return;
2275 }
2276 let dir = storage_dir.join("semantic").join(project_key);
2277 if let Err(e) = fs::create_dir_all(&dir) {
2278 slog_warn!("failed to create semantic cache dir: {}", e);
2279 return;
2280 }
2281 let data_path = dir.join("semantic.bin");
2282 let tmp_path = dir.join(format!(
2283 "semantic.bin.tmp.{}.{}",
2284 std::process::id(),
2285 SystemTime::now()
2286 .duration_since(SystemTime::UNIX_EPOCH)
2287 .unwrap_or(Duration::ZERO)
2288 .as_nanos()
2289 ));
2290 let write_result = (|| -> io::Result<usize> {
2291 let file = fs::File::create(&tmp_path)?;
2292 let mut writer = BufWriter::new(file);
2293 let bytes_written = self.write_to_writer(&mut writer)?;
2294 writer.flush()?;
2295 writer.get_ref().sync_all()?;
2296 Ok(bytes_written)
2297 })();
2298 let bytes_written = match write_result {
2299 Ok(bytes_written) => bytes_written,
2300 Err(e) => {
2301 slog_warn!("failed to write semantic index: {}", e);
2302 let _ = fs::remove_file(&tmp_path);
2303 return;
2304 }
2305 };
2306 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2307 slog_warn!("failed to rename semantic index: {}", e);
2308 let _ = fs::remove_file(&tmp_path);
2309 return;
2310 }
2311 slog_info!(
2312 "semantic index persisted: {} entries, {:.1} KB",
2313 self.entries.len(),
2314 bytes_written as f64 / 1024.0
2315 );
2316 }
2317
2318 pub fn read_from_disk(
2320 storage_dir: &Path,
2321 project_key: &str,
2322 current_canonical_root: &Path,
2323 is_worktree_bridge: bool,
2324 expected_fingerprint: Option<&str>,
2325 ) -> Option<Self> {
2326 debug_assert!(current_canonical_root.is_absolute());
2327 let data_path = storage_dir
2328 .join("semantic")
2329 .join(project_key)
2330 .join("semantic.bin");
2331 let file = fs::File::open(&data_path).ok()?;
2332 let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2333 if file_len < HEADER_BYTES_V1 {
2334 slog_warn!(
2335 "corrupt semantic index (too small: {} bytes), removing",
2336 file_len
2337 );
2338 if !is_worktree_bridge {
2339 let _ = fs::remove_file(&data_path);
2340 }
2341 return None;
2342 }
2343
2344 let mut reader = BufReader::new(file);
2345 let mut version_buf = [0u8; 1];
2346 reader.read_exact(&mut version_buf).ok()?;
2347 let version = version_buf[0];
2348 if version != SEMANTIC_INDEX_VERSION_V6 {
2349 slog_info!(
2350 "cached semantic index version {} is older than {}, rebuilding",
2351 version,
2352 SEMANTIC_INDEX_VERSION_V6
2353 );
2354 if !is_worktree_bridge {
2355 let _ = fs::remove_file(&data_path);
2356 }
2357 return None;
2358 }
2359 match Self::from_reader_after_version(
2360 reader,
2361 version,
2362 current_canonical_root,
2363 Some(file_len),
2364 1,
2365 ) {
2366 Ok(index) => {
2367 if index.entries.is_empty() {
2368 slog_info!("cached semantic index is empty, will rebuild");
2369 if !is_worktree_bridge {
2370 let _ = fs::remove_file(&data_path);
2371 }
2372 return None;
2373 }
2374 if let Some(expected) = expected_fingerprint {
2375 let matches = index
2376 .fingerprint()
2377 .map(|fingerprint| fingerprint.matches_expected(expected))
2378 .unwrap_or(false);
2379 if !matches {
2380 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2381 if !is_worktree_bridge {
2382 let _ = fs::remove_file(&data_path);
2383 }
2384 return None;
2385 }
2386 }
2387 slog_info!(
2388 "loaded semantic index from disk: {} entries",
2389 index.entries.len()
2390 );
2391 Some(index)
2392 }
2393 Err(e) => {
2394 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2395 if !is_worktree_bridge {
2396 let _ = fs::remove_file(&data_path);
2397 }
2398 None
2399 }
2400 }
2401 }
2402
2403 pub fn to_bytes(&self) -> Vec<u8> {
2405 let mut buf = Vec::new();
2406 self.write_to_writer(&mut buf)
2407 .expect("writing semantic index to Vec cannot fail");
2408 buf
2409 }
2410
2411 fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2412 let mut bytes_written = 0usize;
2413 let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2414 let encoded = fingerprint.as_string();
2415 if encoded.is_empty() {
2416 None
2417 } else {
2418 Some(encoded)
2419 }
2420 });
2421 let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2422 let file_mtime_count = self
2423 .file_mtimes
2424 .iter()
2425 .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2426 .count();
2427 let entry_count = self
2428 .entries
2429 .iter()
2430 .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2431 .count();
2432
2433 let version = SEMANTIC_INDEX_VERSION_V6;
2446 write_counted(writer, &[version], &mut bytes_written)?;
2447 write_counted(
2448 writer,
2449 &(self.dimension as u32).to_le_bytes(),
2450 &mut bytes_written,
2451 )?;
2452 write_counted(
2453 writer,
2454 &(entry_count as u32).to_le_bytes(),
2455 &mut bytes_written,
2456 )?;
2457 write_counted(
2458 writer,
2459 &(fp_bytes_ref.len() as u32).to_le_bytes(),
2460 &mut bytes_written,
2461 )?;
2462 write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2463
2464 write_counted(
2467 writer,
2468 &(file_mtime_count as u32).to_le_bytes(),
2469 &mut bytes_written,
2470 )?;
2471 for (path, mtime) in &self.file_mtimes {
2472 let Some(relative) = cache_relative_path(&self.project_root, path) else {
2473 continue;
2474 };
2475 let relative = relative.to_string_lossy();
2476 let path_bytes = relative.as_bytes();
2477 write_counted(
2478 writer,
2479 &(path_bytes.len() as u32).to_le_bytes(),
2480 &mut bytes_written,
2481 )?;
2482 write_counted(writer, path_bytes, &mut bytes_written)?;
2483 let duration = mtime
2484 .duration_since(SystemTime::UNIX_EPOCH)
2485 .unwrap_or_default();
2486 write_counted(
2487 writer,
2488 &duration.as_secs().to_le_bytes(),
2489 &mut bytes_written,
2490 )?;
2491 write_counted(
2492 writer,
2493 &duration.subsec_nanos().to_le_bytes(),
2494 &mut bytes_written,
2495 )?;
2496 let size = self.file_sizes.get(path).copied().unwrap_or_default();
2497 write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2498 let hash = self
2499 .file_hashes
2500 .get(path)
2501 .copied()
2502 .unwrap_or_else(cache_freshness::zero_hash);
2503 write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2504 }
2505
2506 for entry in &self.entries {
2508 let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2509 continue;
2510 };
2511 let c = &entry.chunk;
2512
2513 let relative = relative.to_string_lossy();
2515 let file_bytes = relative.as_bytes();
2516 write_counted(
2517 writer,
2518 &(file_bytes.len() as u32).to_le_bytes(),
2519 &mut bytes_written,
2520 )?;
2521 write_counted(writer, file_bytes, &mut bytes_written)?;
2522
2523 let name_bytes = c.name.as_bytes();
2525 write_counted(
2526 writer,
2527 &(name_bytes.len() as u32).to_le_bytes(),
2528 &mut bytes_written,
2529 )?;
2530 write_counted(writer, name_bytes, &mut bytes_written)?;
2531
2532 write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2534
2535 write_counted(
2537 writer,
2538 &(c.start_line as u32).to_le_bytes(),
2539 &mut bytes_written,
2540 )?;
2541 write_counted(
2542 writer,
2543 &(c.end_line as u32).to_le_bytes(),
2544 &mut bytes_written,
2545 )?;
2546 write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2547
2548 let snippet_bytes = c.snippet.as_bytes();
2550 write_counted(
2551 writer,
2552 &(snippet_bytes.len() as u32).to_le_bytes(),
2553 &mut bytes_written,
2554 )?;
2555 write_counted(writer, snippet_bytes, &mut bytes_written)?;
2556
2557 let embed_bytes = c.embed_text.as_bytes();
2559 write_counted(
2560 writer,
2561 &(embed_bytes.len() as u32).to_le_bytes(),
2562 &mut bytes_written,
2563 )?;
2564 write_counted(writer, embed_bytes, &mut bytes_written)?;
2565
2566 for &val in &entry.vector {
2568 write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2569 }
2570 }
2571
2572 Ok(bytes_written)
2573 }
2574
2575 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2577 debug_assert!(current_canonical_root.is_absolute());
2578 if data.len() < HEADER_BYTES_V1 {
2579 return Err("data too short".to_string());
2580 }
2581
2582 Self::from_reader_after_version(
2583 Cursor::new(&data[1..]),
2584 data[0],
2585 current_canonical_root,
2586 Some(data.len()),
2587 1,
2588 )
2589 }
2590
2591 fn from_reader_after_version<R: Read>(
2592 reader: R,
2593 version: u8,
2594 current_canonical_root: &Path,
2595 total_len: Option<usize>,
2596 bytes_read: usize,
2597 ) -> Result<Self, String> {
2598 debug_assert!(current_canonical_root.is_absolute());
2599 let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2600
2601 if version != SEMANTIC_INDEX_VERSION_V1
2602 && version != SEMANTIC_INDEX_VERSION_V2
2603 && version != SEMANTIC_INDEX_VERSION_V3
2604 && version != SEMANTIC_INDEX_VERSION_V4
2605 && version != SEMANTIC_INDEX_VERSION_V5
2606 && version != SEMANTIC_INDEX_VERSION_V6
2607 {
2608 return Err(format!("unsupported version: {}", version));
2609 }
2610 if (version == SEMANTIC_INDEX_VERSION_V2
2614 || version == SEMANTIC_INDEX_VERSION_V3
2615 || version == SEMANTIC_INDEX_VERSION_V4
2616 || version == SEMANTIC_INDEX_VERSION_V5
2617 || version == SEMANTIC_INDEX_VERSION_V6)
2618 && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2619 {
2620 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2621 }
2622
2623 let dimension = read_u32_stream(&mut reader)? as usize;
2624 let entry_count = read_u32_stream(&mut reader)? as usize;
2625 validate_embedding_dimension(dimension)?;
2626 if entry_count > MAX_ENTRIES {
2627 return Err(format!("too many semantic index entries: {}", entry_count));
2628 }
2629
2630 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2636 || version == SEMANTIC_INDEX_VERSION_V3
2637 || version == SEMANTIC_INDEX_VERSION_V4
2638 || version == SEMANTIC_INDEX_VERSION_V5
2639 || version == SEMANTIC_INDEX_VERSION_V6;
2640 let fingerprint = if has_fingerprint_field {
2641 let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2642 if total_len
2643 .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2644 {
2645 return Err("unexpected end of data reading fingerprint".to_string());
2646 }
2647 if fingerprint_len == 0 {
2648 None
2649 } else {
2650 let mut raw = vec![0u8; fingerprint_len];
2651 read_exact_stream(
2652 &mut reader,
2653 &mut raw,
2654 "unexpected end of data reading fingerprint",
2655 )?;
2656 let raw = String::from_utf8_lossy(&raw).to_string();
2657 Some(
2658 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2659 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2660 )
2661 }
2662 } else {
2663 None
2664 };
2665
2666 let mtime_count = read_u32_stream(&mut reader)? as usize;
2668 if mtime_count > MAX_ENTRIES {
2669 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2670 }
2671
2672 let vector_bytes = entry_count
2673 .checked_mul(dimension)
2674 .and_then(|count| count.checked_mul(F32_BYTES))
2675 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2676 if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2677 return Err("semantic index vectors exceed available data".to_string());
2678 }
2679
2680 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2681 let mut file_sizes = HashMap::with_capacity(mtime_count);
2682 let mut file_hashes = HashMap::with_capacity(mtime_count);
2683 for _ in 0..mtime_count {
2684 let path = read_string_stream(&mut reader, total_len)?;
2685 let secs = read_u64_stream(&mut reader)?;
2686 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2692 || version == SEMANTIC_INDEX_VERSION_V4
2693 || version == SEMANTIC_INDEX_VERSION_V5
2694 || version == SEMANTIC_INDEX_VERSION_V6
2695 {
2696 read_u32_stream(&mut reader)?
2697 } else {
2698 0
2699 };
2700 let size =
2701 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2702 read_u64_stream(&mut reader)?
2703 } else {
2704 0
2705 };
2706 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2707 let mut hash_bytes = [0u8; 32];
2708 read_exact_stream(
2709 &mut reader,
2710 &mut hash_bytes,
2711 "unexpected end of data reading content hash",
2712 )?;
2713 blake3::Hash::from_bytes(hash_bytes)
2714 } else {
2715 cache_freshness::zero_hash()
2716 };
2717 if nanos >= 1_000_000_000 {
2724 return Err(format!(
2725 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2726 nanos
2727 ));
2728 }
2729 let duration = std::time::Duration::new(secs, nanos);
2730 let mtime = SystemTime::UNIX_EPOCH
2731 .checked_add(duration)
2732 .ok_or_else(|| {
2733 format!(
2734 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2735 secs, nanos
2736 )
2737 })?;
2738 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2739 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2740 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2741 } else {
2742 PathBuf::from(path)
2743 };
2744 file_mtimes.insert(path.clone(), mtime);
2745 file_sizes.insert(path.clone(), size);
2746 file_hashes.insert(path, content_hash);
2747 }
2748
2749 let mut entries = Vec::with_capacity(entry_count);
2751 for _ in 0..entry_count {
2752 let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2753 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2754 cached_path_under_root(current_canonical_root, &raw_file)
2755 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2756 } else {
2757 raw_file
2758 };
2759 let name = read_string_stream(&mut reader, total_len)?;
2760
2761 let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2762
2763 let start_line = read_u32_stream(&mut reader)?;
2764 let end_line = read_u32_stream(&mut reader)?;
2765
2766 let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2767
2768 let snippet = read_string_stream(&mut reader, total_len)?;
2769 let embed_text = read_string_stream(&mut reader, total_len)?;
2770
2771 let vec_bytes = dimension
2773 .checked_mul(F32_BYTES)
2774 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2775 if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2776 return Err("unexpected end of data reading vector".to_string());
2777 }
2778 let mut vector = Vec::with_capacity(dimension);
2779 for _ in 0..dimension {
2780 let mut bytes = [0u8; F32_BYTES];
2781 read_exact_stream(
2782 &mut reader,
2783 &mut bytes,
2784 "unexpected end of data reading vector",
2785 )?;
2786 vector.push(f32::from_le_bytes(bytes));
2787 }
2788
2789 entries.push(EmbeddingEntry {
2790 chunk: SemanticChunk {
2791 file,
2792 name,
2793 kind,
2794 start_line,
2795 end_line,
2796 exported,
2797 embed_text,
2798 snippet,
2799 },
2800 vector,
2801 });
2802 }
2803
2804 if entries.len() != entry_count {
2805 return Err(format!(
2806 "semantic cache entry count drift: header={} decoded={}",
2807 entry_count,
2808 entries.len()
2809 ));
2810 }
2811 for entry in &entries {
2812 if !file_mtimes.contains_key(&entry.chunk.file) {
2813 return Err(format!(
2814 "semantic cache metadata missing for entry file {}",
2815 entry.chunk.file.display()
2816 ));
2817 }
2818 }
2819
2820 Ok(Self {
2821 entries,
2822 file_mtimes,
2823 file_sizes,
2824 file_hashes,
2825 dimension,
2826 fingerprint,
2827 project_root: current_canonical_root.to_path_buf(),
2828 deferred_files: HashSet::new(),
2829 })
2830 }
2831}
2832
/// Writes all of `bytes` to `writer` and, on success, adds their length to
/// the running `bytes_written` total (saturating instead of overflowing).
///
/// The total is left untouched when the write fails.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2842
/// `Read` adapter that keeps a running total of the bytes its inner reader
/// has produced.
struct CountingReader<R> {
    /// Wrapped reader that supplies the data.
    inner: R,
    /// Bytes accounted for so far, including the caller-provided seed.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the counter at `bytes_read` so bytes that
    /// were consumed before wrapping are included in the total.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Current value of the byte counter.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forwards to the inner reader and adds the number of bytes it
    /// returned to the counter (saturating); errors leave the counter as is.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(consumed) => {
                self.bytes_read = self.bytes_read.saturating_add(consumed);
                Ok(consumed)
            }
            Err(error) => Err(error),
        }
    }
}
2865
2866fn read_exact_stream<R: Read>(
2867 reader: &mut CountingReader<R>,
2868 buf: &mut [u8],
2869 eof_message: &'static str,
2870) -> Result<(), String> {
2871 reader.read_exact(buf).map_err(|error| {
2872 if error.kind() == io::ErrorKind::UnexpectedEof {
2873 eof_message.to_string()
2874 } else {
2875 format!("{eof_message}: {error}")
2876 }
2877 })
2878}
2879
2880fn read_u8_stream<R: Read>(
2881 reader: &mut CountingReader<R>,
2882 eof_message: &'static str,
2883) -> Result<u8, String> {
2884 let mut bytes = [0u8; 1];
2885 read_exact_stream(reader, &mut bytes, eof_message)?;
2886 Ok(bytes[0])
2887}
2888
2889fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2890 let mut bytes = [0u8; 4];
2891 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2892 Ok(u32::from_le_bytes(bytes))
2893}
2894
2895fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2896 let mut bytes = [0u8; 8];
2897 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2898 Ok(u64::from_le_bytes(bytes))
2899}
2900
2901fn read_string_stream<R: Read>(
2902 reader: &mut CountingReader<R>,
2903 total_len: Option<usize>,
2904) -> Result<String, String> {
2905 let len = read_u32_stream(reader)? as usize;
2906 if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2907 return Err("unexpected end of data reading string".to_string());
2908 }
2909 let mut bytes = vec![0u8; len];
2910 read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2911 Ok(String::from_utf8_lossy(&bytes).to_string())
2912}
2913
2914fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2916 let relative = file
2917 .strip_prefix(project_root)
2918 .unwrap_or(file)
2919 .to_string_lossy();
2920
2921 let kind_label = match &symbol.kind {
2922 SymbolKind::Function => "function",
2923 SymbolKind::Class => "class",
2924 SymbolKind::Method => "method",
2925 SymbolKind::Struct => "struct",
2926 SymbolKind::Interface => "interface",
2927 SymbolKind::Enum => "enum",
2928 SymbolKind::TypeAlias => "type",
2929 SymbolKind::Variable => "variable",
2930 SymbolKind::Heading => "heading",
2931 SymbolKind::FileSummary => "file-summary",
2932 };
2933
2934 let name = &symbol.name;
2936 let mut text = format!(
2937 "name:{name} file:{} kind:{} name:{name}",
2938 relative, kind_label
2939 );
2940
2941 if let Some(sig) = &symbol.signature {
2942 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2950 }
2951
2952 let lines: Vec<&str> = source.lines().collect();
2954 let start = (symbol.range.start_line as usize).min(lines.len());
2955 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2957 if start < end {
2958 let body: String = lines[start..end]
2959 .iter()
2960 .take(15) .copied()
2962 .collect::<Vec<&str>>()
2963 .join("\n");
2964 let snippet = if body.len() > 300 {
2965 format!("{}...", &body[..body.floor_char_boundary(300)])
2966 } else {
2967 body
2968 };
2969 text.push_str(&format!(" body:{}", snippet));
2970 }
2971
2972 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2977}
2978
/// Upper bound, in characters, applied via `truncate_chars` to the embed
/// text produced by `build_embed_text` and `build_file_summary_chunk`.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2983
/// Returns at most the first `max_chars` characters (Unicode scalar values,
/// not bytes) of `value` as an owned string.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character just past the limit is the cut point.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2987
2988fn first_leading_doc_comment(source: &str) -> String {
2989 let lines: Vec<&str> = source.lines().collect();
2990 let Some((start, first)) = lines
2991 .iter()
2992 .enumerate()
2993 .find(|(_, line)| !line.trim().is_empty())
2994 else {
2995 return String::new();
2996 };
2997
2998 let trimmed = first.trim_start();
2999 if trimmed.starts_with("/**") {
3000 let mut comment = Vec::new();
3001 for line in lines.iter().skip(start) {
3002 comment.push(*line);
3003 if line.contains("*/") {
3004 break;
3005 }
3006 }
3007 return truncate_chars(&comment.join("\n"), 200);
3008 }
3009
3010 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3011 let comment = lines
3012 .iter()
3013 .skip(start)
3014 .take_while(|line| {
3015 let trimmed = line.trim_start();
3016 trimmed.starts_with("///") || trimmed.starts_with("//!")
3017 })
3018 .copied()
3019 .collect::<Vec<_>>()
3020 .join("\n");
3021 return truncate_chars(&comment, 200);
3022 }
3023
3024 String::new()
3025}
3026
3027pub fn build_file_summary_chunk(
3028 file: &Path,
3029 project_root: &Path,
3030 source: &str,
3031 top_exports: &[&str],
3032 top_export_signatures: &[Option<&str>],
3033) -> SemanticChunk {
3034 let relative = file.strip_prefix(project_root).unwrap_or(file);
3035 let rel_path = relative.to_string_lossy();
3036 let parent_dir = relative
3037 .parent()
3038 .map(|parent| parent.to_string_lossy().to_string())
3039 .unwrap_or_default();
3040 let name = file
3041 .file_stem()
3042 .map(|stem| stem.to_string_lossy().to_string())
3043 .unwrap_or_default();
3044 let doc = first_leading_doc_comment(source);
3045 let exports = top_exports
3046 .iter()
3047 .take(5)
3048 .copied()
3049 .collect::<Vec<_>>()
3050 .join(",");
3051 let snippet = if doc.is_empty() {
3052 top_export_signatures
3053 .first()
3054 .and_then(|signature| signature.as_deref())
3055 .map(|signature| truncate_chars(signature, 200))
3056 .unwrap_or_default()
3057 } else {
3058 doc.clone()
3059 };
3060
3061 SemanticChunk {
3062 file: file.to_path_buf(),
3063 name,
3064 kind: SymbolKind::FileSummary,
3065 start_line: 0,
3066 end_line: 0,
3067 exported: false,
3068 embed_text: truncate_chars(
3069 &format!(
3070 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3071 file.file_stem()
3072 .map(|stem| stem.to_string_lossy().to_string())
3073 .unwrap_or_default()
3074 ),
3075 MAX_EMBED_TEXT_CHARS,
3076 ),
3077 snippet,
3078 }
3079}
3080
3081fn parser_for(
3082 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3083 lang: crate::parser::LangId,
3084) -> Result<&mut Parser, String> {
3085 use std::collections::hash_map::Entry;
3086
3087 match parsers.entry(lang) {
3088 Entry::Occupied(entry) => Ok(entry.into_mut()),
3089 Entry::Vacant(entry) => {
3090 let grammar = grammar_for(lang);
3091 let mut parser = Parser::new();
3092 parser
3093 .set_language(&grammar)
3094 .map_err(|error| error.to_string())?;
3095 Ok(entry.insert(parser))
3096 }
3097 }
3098}
3099
/// Reports whether `path` has one of the file extensions that are eligible
/// for semantic indexing.
///
/// The comparison is case-sensitive; paths without an extension, or with an
/// extension that is not valid UTF-8, are not eligible.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
3132
3133fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
3134 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3135 let mtime = metadata.modified().map_err(|error| error.to_string())?;
3136 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
3137 .map_err(|error| error.to_string())?
3138 .unwrap_or_else(cache_freshness::zero_hash);
3139 Ok(IndexedFileMetadata {
3140 mtime,
3141 size: metadata.len(),
3142 content_hash,
3143 })
3144}
3145
/// Canonicalizes `path`, falling back gracefully when that is not possible.
///
/// If `path` itself cannot be canonicalized, its parent directory is
/// canonicalized instead and the final component is re-appended. When that
/// fails too, or the path has no parent or no file name, `path` is returned
/// unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(directory), Some(leaf)) => match fs::canonicalize(directory) {
            Ok(resolved_directory) => resolved_directory.join(leaf),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
3162
/// Size limit (4 MiB) above which `collect_file_chunks` returns no chunks
/// for a file instead of reading and parsing it.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3173
3174fn collect_file_chunks(
3175 project_root: &Path,
3176 file: &Path,
3177 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3178) -> Result<Vec<SemanticChunk>, String> {
3179 if !is_semantic_indexed_extension(file) {
3180 return Err("unsupported file extension".to_string());
3181 }
3182 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3183 if std::fs::metadata(file).is_ok_and(|m| m.len() > MAX_SEMANTIC_FILE_BYTES) {
3186 return Ok(Vec::new());
3187 }
3188 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
3189 let tree = parser_for(parsers, lang)?
3190 .parse(&source, None)
3191 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3192 let symbols =
3193 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
3194
3195 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
3196}
3197
3198fn build_snippet(symbol: &Symbol, source: &str) -> String {
3200 let lines: Vec<&str> = source.lines().collect();
3201 let start = (symbol.range.start_line as usize).min(lines.len());
3202 let end = (symbol.range.end_line as usize + 1).min(lines.len());
3204 if start < end {
3205 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3206 let mut snippet = snippet_lines.join("\n");
3207 if end - start > 5 {
3208 snippet.push_str("\n ...");
3209 }
3210 if snippet.len() > 300 {
3211 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3212 }
3213 snippet
3214 } else {
3215 String::new()
3216 }
3217}
3218
3219fn symbols_to_chunks(
3221 file: &Path,
3222 symbols: &[Symbol],
3223 source: &str,
3224 project_root: &Path,
3225) -> Vec<SemanticChunk> {
3226 let mut chunks = Vec::new();
3227 let top_exports_with_signatures = symbols
3228 .iter()
3229 .filter(|symbol| {
3230 symbol.exported
3231 && symbol.parent.is_none()
3232 && !matches!(symbol.kind, SymbolKind::Heading)
3233 })
3234 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3235 .collect::<Vec<_>>();
3236
3237 let has_only_headings = !symbols.is_empty()
3238 && symbols
3239 .iter()
3240 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3241 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3242 let top_exports = top_exports_with_signatures
3243 .iter()
3244 .map(|(name, _)| *name)
3245 .collect::<Vec<_>>();
3246 let top_export_signatures = top_exports_with_signatures
3247 .iter()
3248 .map(|(_, signature)| *signature)
3249 .collect::<Vec<_>>();
3250 chunks.push(build_file_summary_chunk(
3251 file,
3252 project_root,
3253 source,
3254 &top_exports,
3255 &top_export_signatures,
3256 ));
3257 }
3258
3259 for symbol in symbols {
3260 if matches!(symbol.kind, SymbolKind::Heading) {
3265 continue;
3266 }
3267
3268 let line_count = symbol
3270 .range
3271 .end_line
3272 .saturating_sub(symbol.range.start_line)
3273 + 1;
3274 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3275 continue;
3276 }
3277
3278 let embed_text = build_embed_text(symbol, source, file, project_root);
3279 let snippet = build_snippet(symbol, source);
3280
3281 chunks.push(SemanticChunk {
3282 file: file.to_path_buf(),
3283 name: symbol.name.clone(),
3284 kind: symbol.kind.clone(),
3285 start_line: symbol.range.start_line,
3286 end_line: symbol.range.end_line,
3287 exported: symbol.exported,
3288 embed_text,
3289 snippet,
3290 });
3291
3292 }
3295
3296 chunks
3297}
3298
/// Orders `(score, index)` pairs for ranking: higher scores first, ties
/// broken by ascending index.
///
/// NaN scores sort after every real score (and among themselves by index).
/// Treating NaN as "equal to everything" would not be a total order, and
/// `sort_by` with such a comparator may produce an unspecified order or
/// panic. Ordering of non-NaN scores is unchanged.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        // A NaN score ranks below any real score.
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields an ordering.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3304
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when either vector
/// has zero magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let (mut dot, mut norm_a, mut norm_b) = (0.0f32, 0.0f32, 0.0f32);
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3328
3329fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3331 match kind {
3332 SymbolKind::Function => 0,
3333 SymbolKind::Class => 1,
3334 SymbolKind::Method => 2,
3335 SymbolKind::Struct => 3,
3336 SymbolKind::Interface => 4,
3337 SymbolKind::Enum => 5,
3338 SymbolKind::TypeAlias => 6,
3339 SymbolKind::Variable => 7,
3340 SymbolKind::Heading => 8,
3341 SymbolKind::FileSummary => 9,
3342 }
3343}
3344
3345fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3346 match v {
3347 0 => SymbolKind::Function,
3348 1 => SymbolKind::Class,
3349 2 => SymbolKind::Method,
3350 3 => SymbolKind::Struct,
3351 4 => SymbolKind::Interface,
3352 5 => SymbolKind::Enum,
3353 6 => SymbolKind::TypeAlias,
3354 7 => SymbolKind::Variable,
3355 8 => SymbolKind::Heading,
3356 9 => SymbolKind::FileSummary,
3357 _ => SymbolKind::Heading,
3358 }
3359}
3360
3361#[cfg(test)]
3362mod tests {
3363 use super::*;
3364 use crate::config::{SemanticBackend, SemanticBackendConfig};
3365 use crate::parser::FileParser;
3366 use std::io::{Read, Write};
3367 use std::net::TcpListener;
3368 use std::thread;
3369
/// `.inc`, `.php` and `.scss` files must be eligible for semantic indexing.
#[test]
fn semantic_index_includes_php_inc_and_scss_extensions() {
    let candidates = ["partial.inc", "index.php", "styles.scss"];
    for file in candidates {
        let eligible = is_semantic_indexed_extension(Path::new(file));
        assert!(eligible, "{file} should be semantic-index eligible");
    }
}
3379
/// A marker-prefixed failure is classified as transient and the marker can
/// be stripped again; unmarked failures are neither transient nor altered
/// by stripping.
#[test]
fn transient_marker_round_trips_and_classifies() {
    let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
    assert!(embedding_failure_is_transient(&marked));

    let clean = strip_transient_embedding_marker(&marked);
    assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
    assert!(clean.starts_with("openai compatible request failed:"));

    let permanent_failures = [
        "openai compatible request failed (HTTP 401): Unauthorized",
        "embedding dimension mismatch: index has 384, model returned 768",
        "too many files (>20000) for semantic indexing (max 20000)",
    ];
    for permanent in permanent_failures {
        let classified_transient = embedding_failure_is_transient(permanent);
        assert!(
            !classified_transient,
            "{permanent:?} must not be transient"
        );
        assert_eq!(strip_transient_embedding_marker(permanent), permanent);
    }
}
3405
/// 500 and 429 responses are retryable; 401 and 400 responses are not.
#[test]
fn send_error_transience_separates_connect_timeout_from_4xx() {
    let retryable = [
        reqwest::StatusCode::INTERNAL_SERVER_ERROR,
        reqwest::StatusCode::TOO_MANY_REQUESTS,
    ];
    for status in retryable {
        assert!(is_retryable_embedding_status(status));
    }

    let permanent = [
        reqwest::StatusCode::UNAUTHORIZED,
        reqwest::StatusCode::BAD_REQUEST,
    ];
    for status in permanent {
        assert!(!is_retryable_embedding_status(status));
    }
}
3422
/// With a 400 status, response bodies that report a model being loaded or
/// unloaded count as transient, while other bodies do not; with a 401
/// status, even a "model is loading" body stays permanent.
#[test]
fn local_backend_model_loading_body_is_transient() {
    let bad_request = reqwest::StatusCode::BAD_REQUEST;

    let transient_bodies = [
        r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
        r#"{"error":"model is loading, please wait"}"#,
        r#"{"error":"Model not loaded"}"#,
        "Loading model into memory",
    ];
    for body in transient_bodies {
        assert!(
            embedding_response_body_is_transient(bad_request, body),
            "{body:?} should be body-transient"
        );
    }

    let permanent_bodies = [
        r#"{"error":"invalid api key"}"#,
        r#"{"error":"model 'foo' not found"}"#,
        "Bad Request: unknown field",
        "Bad Request: invalid loading model option",
        r#"{"error":"unauthorized while model is being loaded by another account"}"#,
    ];
    for body in permanent_bodies {
        assert!(
            !embedding_response_body_is_transient(bad_request, body),
            "{body:?} must not be body-transient"
        );
    }

    let unauthorized_while_loading = embedding_response_body_is_transient(
        reqwest::StatusCode::UNAUTHORIZED,
        r#"{"error":"model is loading, please wait"}"#,
    );
    assert!(
        !unauthorized_while_loading,
        "permanent auth failures must not become transient because of body text"
    );
}
3463
/// Starts a single-request HTTP server on an ephemeral loopback port.
///
/// The server thread accepts one connection, reads the request headers
/// plus `Content-Length` bytes of body, calls
/// `handler(request_line, path, body)` and replies `200 OK` with the
/// returned string as an `application/json` body.
///
/// Returns the base URL (`http://<addr>`) and the thread's join handle.
///
/// The `Content-Length` header name is matched case-insensitively, since
/// HTTP field names are case-insensitive and clients may send them in
/// lowercase. If the peer closes before the announced body arrived, the
/// handler receives whatever part of the body was read.
///
/// # Panics
/// Binding, accepting, reading or writing failures — and a request without
/// a header terminator — panic with an `expect` message (inside the server
/// thread for everything after binding).
fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
where
    F: Fn(String, String, String) -> String + Send + 'static,
{
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept request");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read request");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    let headers = String::from_utf8_lossy(&buf[..pos + 4]);
                    for line in headers.lines() {
                        let Some((field, value)) = line.split_once(':') else {
                            continue;
                        };
                        if field.trim().eq_ignore_ascii_case("content-length") {
                            content_length = value.trim().parse::<usize>().unwrap_or(0);
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                if buf.len() >= end.saturating_add(content_length) {
                    break;
                }
            }
        }

        let end = header_end.expect("header terminator");
        // Clamp so a connection closed mid-body cannot cause an out-of-range slice.
        let body_end = end.saturating_add(content_length).min(buf.len());
        let request = String::from_utf8_lossy(&buf[..end]).to_string();
        let body = String::from_utf8_lossy(&buf[end..body_end]).to_string();
        let mut lines = request.lines();
        let request_line = lines.next().expect("request line").to_string();
        let path = request_line
            .split_whitespace()
            .nth(1)
            .expect("request path")
            .to_string();
        let response_body = handler(request_line, path, body);
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            response_body.len(),
            response_body
        );
        stream
            .write_all(response.as_bytes())
            .expect("write response");
    });

    (format!("http://{}", addr), handle)
}
3523
/// Starts a loopback server that answers up to `attempts` connections with
/// a response whose headers announce a 128-byte body but which sends only
/// a single `{` before the stream is dropped.
///
/// The listener is non-blocking and polled every 10 ms; the server thread
/// stops after `attempts` connections or after two seconds, whichever comes
/// first. Returns the base URL and the thread's join handle.
fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
    listener
        .set_nonblocking(true)
        .expect("nonblocking listener");
    let addr = listener.local_addr().expect("local addr");

    let handle = thread::spawn(move || {
        let deadline = std::time::Instant::now() + Duration::from_secs(2);
        let mut served = 0usize;
        while served < attempts && std::time::Instant::now() < deadline {
            let mut stream = match listener.accept() {
                Ok((stream, _peer)) => stream,
                Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                    thread::sleep(Duration::from_millis(10));
                    continue;
                }
                Err(error) => panic!("accept request: {error}"),
            };
            served += 1;

            // Read (and ignore) whatever part of the request has arrived.
            let mut request = [0u8; 4096];
            let _ = stream.read(&mut request);
            let response = "HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 128
Connection: close

{";
            let _ = stream.write_all(response.as_bytes());
        }
    });

    (format!("http://{}", addr), handle)
}
3564
/// A response whose body ends early must surface as a transient
/// "response read failed" error from `send_embedding_request`.
#[test]
fn response_body_read_failures_are_marked_transient() {
    let (base_url, server) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
    let http = Client::builder()
        .timeout(Duration::from_millis(250))
        .build()
        .expect("client");

    let outcome = send_embedding_request(|| http.post(&base_url).body("{}"), "test backend");
    let error = outcome.expect_err("truncated body should fail");

    // Join first so a panic inside the server thread is reported.
    server.join().unwrap();
    assert!(
        embedding_failure_is_transient(&error),
        "body read failures should be transient-marked: {error}"
    );
    assert!(error.contains("response read failed"));
}
3583
/// Embedding stub: returns the same unit vector `[1.0, 0.0, 0.0]` for every input text.
fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    let unit_vector = vec![1.0, 0.0, 0.0];
    Ok(vec![unit_vector; texts.len()])
}
3587
/// Writes a tiny Rust source file at `path` containing one public function
/// named `function_name` that returns `true`. Panics if the write fails.
fn write_rust_file(path: &Path, function_name: &str) {
    let source = format!("pub fn {function_name}() -> bool {{\n true\n}}\n");
    fs::write(path, source).unwrap();
}
3595
3596 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3597 let mut embed = test_vector_for_texts;
3598 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3599 }
3600
/// Returns the process's current working directory, used as a stand-in
/// project root by tests. Panics if the directory cannot be determined.
fn test_project_root() -> PathBuf {
    let cwd = std::env::current_dir();
    cwd.unwrap()
}
3604
3605 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3606 index.file_mtimes.insert(file.to_path_buf(), mtime);
3607 index.file_sizes.insert(file.to_path_buf(), size);
3608 index
3609 .file_hashes
3610 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3611 }
3612
/// Serializes `index` into a single in-memory buffer, field by field.
///
/// The streaming-persistence test in this module compares the result with
/// `SemanticIndex::to_bytes()` and with the bytes written to `semantic.bin`,
/// so the write order below must not change.
///
/// Layout, all integers little-endian:
/// 1. version byte (`SEMANTIC_INDEX_VERSION_V6`), dimension (`u32`), entry count (`u32`)
/// 2. fingerprint length (`u32`) followed by the fingerprint bytes (length 0 when absent)
/// 3. file-metadata count (`u32`), then per file: path length + path bytes,
///    mtime seconds, mtime sub-second nanos, size, content hash bytes
/// 4. per entry: path, name, kind byte, start line, end line, exported flag,
///    snippet, embed text (each string length-prefixed), then the vector's `f32` values
///
/// Files and entries for which `cache_relative_path` returns `None` are omitted.
fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
    let mut buf = Vec::new();
    // An empty encoded fingerprint is treated the same as no fingerprint.
    let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
        let encoded = fingerprint.as_string();
        if encoded.is_empty() {
            None
        } else {
            Some(encoded.into_bytes())
        }
    });
    // Keep the original path next to the relative one: the relative path is
    // what gets written, the original is the lookup key for sizes and hashes.
    let file_mtimes: Vec<_> = index
        .file_mtimes
        .iter()
        .filter_map(|(path, mtime)| {
            cache_relative_path(&index.project_root, path)
                .map(|relative| (relative, path, mtime))
        })
        .collect();
    let entries: Vec<_> = index
        .entries
        .iter()
        .filter_map(|entry| {
            cache_relative_path(&index.project_root, &entry.chunk.file)
                .map(|relative| (relative, entry))
        })
        .collect();

    // Header: version, dimension, entry count (after filtering), fingerprint.
    buf.push(SEMANTIC_INDEX_VERSION_V6);
    buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
    buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
    let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
    buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
    buf.extend_from_slice(fp_bytes_ref);

    // File metadata table.
    buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
    for (relative, path, mtime) in &file_mtimes {
        let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
        buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
        buf.extend_from_slice(&path_bytes);
        // An mtime before the Unix epoch falls back to a zero duration.
        let duration = mtime
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap_or_default();
        buf.extend_from_slice(&duration.as_secs().to_le_bytes());
        buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
        // Missing size or hash records fall back to the default size and the zero hash.
        let size = index.file_sizes.get(*path).copied().unwrap_or_default();
        buf.extend_from_slice(&size.to_le_bytes());
        let hash = index
            .file_hashes
            .get(*path)
            .copied()
            .unwrap_or_else(cache_freshness::zero_hash);
        buf.extend_from_slice(hash.as_bytes());
    }

    // Entries: chunk metadata followed by the raw vector values.
    for (relative, entry) in &entries {
        let c = &entry.chunk;
        let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
        buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
        buf.extend_from_slice(&file_bytes);

        let name_bytes = c.name.as_bytes();
        buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
        buf.extend_from_slice(name_bytes);

        buf.push(symbol_kind_to_u8(&c.kind));
        buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
        buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
        buf.push(c.exported as u8);

        let snippet_bytes = c.snippet.as_bytes();
        buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
        buf.extend_from_slice(snippet_bytes);

        let embed_bytes = c.embed_text.as_bytes();
        buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
        buf.extend_from_slice(embed_bytes);

        // The vector is written without a length prefix.
        for &val in &entry.vector {
            buf.extend_from_slice(&val.to_le_bytes());
        }
    }

    buf
}
3697
3698 #[derive(Default)]
3699 struct RecordingEmbedder {
3700 calls: Vec<Vec<String>>,
3701 }
3702
3703 impl RecordingEmbedder {
3704 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3705 let vectors = texts
3706 .iter()
3707 .map(|text| deterministic_test_vector(text))
3708 .collect();
3709 self.calls.push(texts);
3710 Ok(vectors)
3711 }
3712
3713 fn total_embedded_texts(&self) -> usize {
3714 self.calls.iter().map(Vec::len).sum()
3715 }
3716
3717 fn embedded_texts(&self) -> Vec<&str> {
3718 self.calls
3719 .iter()
3720 .flat_map(|batch| batch.iter().map(String::as_str))
3721 .collect()
3722 }
3723 }
3724
3725 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3726 let hash = blake3::hash(text.as_bytes());
3727 let bytes = hash.as_bytes();
3728 vec![
3729 1.0,
3730 bytes[0] as f32 / 255.0,
3731 bytes[1] as f32 / 255.0,
3732 bytes[2] as f32 / 255.0,
3733 ]
3734 }
3735
3736 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3737 let mut embedder = RecordingEmbedder::default();
3738 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3739 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3740 }
3741
3742 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3743 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3744 }
3745
/// Writes `source` to `path`, creating any missing parent directories first.
/// Panics if a directory or the file cannot be written.
fn write_source(path: &Path, source: &str) {
    let parent_ready = path.parent().map_or(Ok(()), fs::create_dir_all);
    parent_ready.unwrap();
    fs::write(path, source).unwrap();
}
3752
3753 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3754 index
3755 .entries
3756 .iter()
3757 .filter(|entry| entry.chunk.file == file)
3758 .collect()
3759 }
3760
3761 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3762 index
3763 .entries
3764 .iter()
3765 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3766 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3767 }
3768
3769 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3770 index
3771 .entries
3772 .iter()
3773 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3774 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3775 }
3776
/// Prepending a blank line moves every symbol down by one line; a stale
/// refresh must keep all entries and reuse their vectors without embedding.
#[test]
fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/lib.rs");
    let baseline = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, baseline);

    let mut index = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let entry_count_before = index.entries.len();
    let alpha_vector_before = entry_by_name(&index, &source_path, "alpha").vector.clone();

    // Same code, one line lower, and metadata that no longer matches the file.
    let shifted = format!("\n{baseline}");
    write_source(&source_path, &shifted);
    force_stale(&mut index, &source_path);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = index.refresh_stale_files(
        root,
        std::slice::from_ref(&source_path),
        &mut embed,
        16,
        &mut progress,
    );
    let summary = refreshed.unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(index.entries.len(), entry_count_before);
    let alpha_after = entry_by_name(&index, &source_path, "alpha");
    assert_eq!(alpha_after.chunk.start_line, 1);
    assert_eq!(alpha_after.vector, alpha_vector_before);
}
3812
/// A pure line shift must produce an update carrying every entry of the file,
/// so that applying it to a second (serving) index replaces the file's entries.
#[test]
fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/lib.rs");
    let baseline = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, baseline);

    let mut worker = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let mut serving = worker.clone();
    let entry_count_before = worker.entries.len();

    let shifted = format!("\n{baseline}");
    write_source(&source_path, &shifted);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = worker.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut embed,
        16,
        100,
        &mut progress,
    );
    let update = refreshed.unwrap();

    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(update.added_entries.len(), entry_count_before);
    assert_eq!(worker.entries.len(), entry_count_before);

    // Replay the worker's delta onto the untouched serving copy.
    serving.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );

    assert_eq!(serving.entries.len(), entry_count_before);
    let serving_file_entries = entries_for_file(&serving, &source_path);
    assert_eq!(serving_file_entries.len(), entry_count_before);
    let alpha_start = entry_by_name(&serving, &source_path, "alpha")
        .chunk
        .start_line;
    assert_eq!(alpha_start, 1);
}
3863
/// Editing the body of one function must embed only that function; the
/// untouched sibling keeps its previous vector.
#[test]
fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/lib.rs");
    let before_edit = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, before_edit);

    let mut index = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let entry_count_before = index.entries.len();
    let beta_vector_before = entry_by_name(&index, &source_path, "beta").vector.clone();

    // Only `alpha` changes.
    let after_edit = "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, after_edit);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = index.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut embed,
        16,
        100,
        &mut progress,
    );
    let update = refreshed.unwrap();

    assert_eq!(recorder.total_embedded_texts(), 1);
    assert!(recorder.embedded_texts()[0].contains("name:alpha"));
    assert_eq!(update.added_entries.len(), entry_count_before);
    let beta_after = entry_by_name(&index, &source_path, "beta");
    assert_eq!(beta_after.vector, beta_vector_before);
}
3902
/// Duplicating a symbol byte-for-byte must yield two entries that both reuse
/// the single previously stored vector, with nothing sent to the embedder.
#[test]
fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/dupe.js");
    let single_copy = "function duplicate() {\n return 1;\n}\n";
    write_source(&source_path, single_copy);

    let mut index = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let vector_before = entry_by_name(&index, &source_path, "duplicate")
        .vector
        .clone();

    let two_copies = format!("{single_copy}\n{single_copy}");
    write_source(&source_path, &two_copies);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = index.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut embed,
        16,
        100,
        &mut progress,
    );
    refreshed.unwrap();

    let duplicates: Vec<_> = index
        .entries
        .iter()
        .filter(|entry| entry.chunk.file == source_path && entry.chunk.name == "duplicate")
        .collect();
    assert_eq!(duplicates.len(), 2);
    assert_eq!(recorder.total_embedded_texts(), 0);
    for duplicate in &duplicates {
        assert_eq!(duplicate.vector, vector_before);
    }
}
3940
/// The file-summary vector survives a function-body edit but is re-embedded
/// once the leading module doc comment changes.
#[test]
fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/lib.rs");
    let initial = "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n";
    write_source(&source_path, initial);

    let mut index = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let summary_vector_before = file_summary_entry(&index, &source_path).vector.clone();
    let mut progress = |_done: usize, _total: usize| {};

    // Phase 1: change only the function body.
    let body_changed = "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n";
    write_source(&source_path, body_changed);
    let mut body_recorder = RecordingEmbedder::default();
    let mut body_embed = |texts: Vec<String>| body_recorder.embed(texts);
    let body_refresh = index.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut body_embed,
        16,
        100,
        &mut progress,
    );
    body_refresh.unwrap();
    assert_eq!(body_recorder.total_embedded_texts(), 1);
    assert!(body_recorder.embedded_texts()[0].contains("name:alpha"));
    assert_eq!(
        file_summary_entry(&index, &source_path).vector,
        summary_vector_before
    );

    // Phase 2: change only the leading doc comment.
    let docs_changed = "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n";
    write_source(&source_path, docs_changed);
    let mut doc_recorder = RecordingEmbedder::default();
    let mut doc_embed = |texts: Vec<String>| doc_recorder.embed(texts);
    let doc_refresh = index.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut doc_embed,
        16,
        100,
        &mut progress,
    );
    doc_refresh.unwrap();

    assert_eq!(doc_recorder.total_embedded_texts(), 1);
    assert!(doc_recorder.embedded_texts()[0].contains("kind:file-summary"));
    assert_ne!(
        file_summary_entry(&index, &source_path).vector,
        summary_vector_before
    );
}
3996
/// Refreshing a file that was deleted from disk must drop its entries from
/// both the worker index and a serving index, without calling the embedder.
#[test]
fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/lib.rs");
    write_source(&source_path, "pub fn alpha() -> i32 {\n 1\n}\n");

    let mut worker = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let mut serving = worker.clone();
    fs::remove_file(&source_path).unwrap();

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = worker.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut embed,
        16,
        100,
        &mut progress,
    );
    let update = refreshed.unwrap();

    assert_eq!(update.summary.deleted, 1);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert!(worker.entries.is_empty());

    // The same delta must also empty the serving copy.
    serving.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );
    assert!(serving.entries.is_empty());
}
4033
/// When a file is overwritten with non-UTF-8 bytes, the refresh must remove
/// its old entries and mtime record rather than keep serving stale data.
#[test]
fn watcher_collect_failure_does_not_resurrect_stale_entries() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_path = root.join("src/lib.rs");
    write_source(&source_path, "pub fn alpha() -> i32 {\n 1\n}\n");

    let mut worker = build_recorded_test_index(root, std::slice::from_ref(&source_path));
    let mut serving = worker.clone();
    // Bytes that are not valid UTF-8.
    let invalid_utf8: [u8; 3] = [0xff, 0xfe, 0xfd];
    fs::write(&source_path, invalid_utf8).unwrap();

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = worker.refresh_invalidated_files(
        root,
        std::slice::from_ref(&source_path),
        &mut embed,
        16,
        100,
        &mut progress,
    );
    let update = refreshed.unwrap();

    assert_eq!(recorder.total_embedded_texts(), 0);
    assert!(update.added_entries.is_empty());
    assert!(worker.entries.is_empty());
    assert!(!worker.file_mtimes.contains_key(&source_path));

    serving.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );
    assert!(serving.entries.is_empty());
    assert!(!serving.file_mtimes.contains_key(&source_path));
}
4072
/// With a file cap of 1 and one file already indexed, a second file must be
/// deferred instead of embedded.
#[test]
fn refresh_invalidated_cap_deferral_remains_file_count_based() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let indexed_path = root.join("src/a.rs");
    let deferred_path = root.join("src/b.rs");
    write_source(&indexed_path, "pub fn alpha() -> i32 {\n 1\n}\n");
    write_source(&deferred_path, "pub fn beta() -> i32 {\n 2\n}\n");

    // Only the first file is part of the initial index.
    let mut index = build_recorded_test_index(root, std::slice::from_ref(&indexed_path));
    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refreshed = index.refresh_invalidated_files(
        root,
        std::slice::from_ref(&deferred_path),
        &mut embed,
        16,
        1,
        &mut progress,
    );
    let update = refreshed.unwrap();

    assert_eq!(update.summary.total_processed, 1);
    assert_eq!(update.summary.added, 0);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(index.indexed_file_count(), 1);
    assert!(index.deferred_files.contains(&deferred_path));
    assert!(entries_for_file(&index, &deferred_path).is_empty());
}
4104
/// Metadata and entries that point outside the project root must not survive
/// a `to_bytes` / `from_bytes` round trip.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let project = fs::canonicalize(workspace.path()).expect("canonical project");
    // A sibling of the project directory, reached through `..`.
    let outside_path = project.join("..").join("outside.rs");

    let mut index = SemanticIndex::new(project.clone(), 3);
    index
        .file_mtimes
        .insert(outside_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(outside_path.clone(), 1);
    index
        .file_hashes
        .insert(outside_path.clone(), cache_freshness::zero_hash());

    let outside_chunk = SemanticChunk {
        file: outside_path,
        name: "outside".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text: "outside".to_string(),
        snippet: "outside".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk: outside_chunk,
        vector: vec![1.0, 0.0, 0.0],
    });

    let serialized = index.to_bytes();
    let loaded = SemanticIndex::from_bytes(&serialized, &project).expect("load serialized index");
    assert_eq!(loaded.entries.len(), 0);
    assert!(loaded.file_mtimes.is_empty());
}
4137
/// `SemanticIndex::search` must return the same names and scores as a
/// reference implementation that scores every entry and stable-sorts them.
#[test]
fn semantic_search_bounded_top_k_matches_reference_full_sort() {
    let project_root = test_project_root();
    let source_path = project_root.join("src/lib.rs");
    let mut index = SemanticIndex::new(project_root, 2);
    let fixtures = [
        ("alpha", vec![1.0, 0.0], false),
        ("beta", vec![0.0, 1.0], false),
        ("gamma", vec![1.0, 0.0], false),
        ("delta", vec![0.5, 0.5], true),
        ("epsilon", vec![-1.0, 0.0], false),
    ];
    for (position, (name, vector, exported)) in fixtures.into_iter().enumerate() {
        let chunk = SemanticChunk {
            file: source_path.clone(),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: position as u32 + 1,
            end_line: position as u32 + 1,
            exported,
            embed_text: name.to_string(),
            snippet: format!("fn {name}() {{}}"),
        };
        index.entries.push(EmbeddingEntry { chunk, vector });
    }

    let query = vec![1.0, 0.0];
    let top_k = 4;

    // Reference: score everything (exported entries get a 1.1x boost), then
    // sort descending with a stable sort so ties keep insertion order.
    let mut scored: Vec<(f32, usize)> = Vec::new();
    for (idx, entry) in index.entries.iter().enumerate() {
        let mut score = cosine_similarity(&query, &entry.vector);
        if entry.chunk.exported {
            score *= 1.1;
        }
        scored.push((score, idx));
    }
    scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
    let expected: Vec<(String, f32)> = scored
        .into_iter()
        .take(top_k)
        .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
        .collect();

    let actual: Vec<(String, f32)> = index
        .search(&query, top_k)
        .into_iter()
        .map(|result| (result.name, result.score))
        .collect();

    let actual_names: Vec<_> = actual.iter().map(|(name, _)| name).collect();
    let expected_names: Vec<_> = expected.iter().map(|(name, _)| name).collect();
    assert_eq!(actual_names, expected_names);
    for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
        assert!((actual_score - expected_score).abs() < 1e-6);
    }
    assert_eq!(actual[0].0, "alpha");
    assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
    assert!(index.search(&query, 0).is_empty());
}
4204
/// Identical vectors have a cosine similarity of 1.
#[test]
fn test_cosine_similarity_identical() {
    let left = vec![1.0, 0.0, 0.0];
    let right = vec![1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&left, &right);
    assert!((similarity - 1.0).abs() < 0.001);
}
4211
/// Orthogonal vectors have a cosine similarity of 0.
#[test]
fn test_cosine_similarity_orthogonal() {
    let x_axis = vec![1.0, 0.0, 0.0];
    let y_axis = vec![0.0, 1.0, 0.0];
    let similarity = cosine_similarity(&x_axis, &y_axis);
    assert!(similarity.abs() < 0.001);
}
4218
/// Vectors pointing in opposite directions have a cosine similarity of -1.
#[test]
fn test_cosine_similarity_opposite() {
    let forward = vec![1.0, 0.0, 0.0];
    let backward = vec![-1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&forward, &backward);
    assert!((similarity + 1.0).abs() < 0.001);
}
4225
/// A single-entry index with a fingerprint must survive `to_bytes` followed
/// by `from_bytes` with its entry, dimension and labels intact.
#[test]
fn test_serialization_roundtrip() {
    let project_root = test_project_root();
    let source_path = project_root.join("src/main.rs");
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);

    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {\n // ...\n}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3, 0.4],
    });
    // Match the index dimension to the 4-element vector above.
    index.dimension = 4;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint);

    let serialized = index.to_bytes();
    let restored = SemanticIndex::from_bytes(&serialized, &project_root).unwrap();

    assert_eq!(restored.entries.len(), 1);
    let restored_entry = &restored.entries[0];
    assert_eq!(restored_entry.chunk.name, "handle_request");
    assert_eq!(restored_entry.vector, vec![0.1, 0.2, 0.3, 0.4]);
    assert_eq!(restored.dimension, 4);
    assert_eq!(restored.backend_label(), Some("fastembed"));
    assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
}
4267
/// The bytes from `to_bytes`, the bytes written by `write_to_disk`, and the
/// hand-rolled `legacy_semantic_index_bytes` encoding must all be identical,
/// and loading the file back must reproduce every field of the index.
#[test]
fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
    let storage = tempfile::tempdir().expect("create storage dir");
    let project_dir = storage.path().join("project");
    fs::create_dir_all(project_dir.join("src")).expect("create project src");
    let raw_file = project_dir.join("src/lib.rs");
    fs::write(&raw_file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
    let project_root = fs::canonicalize(&project_dir).expect("canonical project");
    let file = fs::canonicalize(&raw_file).expect("canonical file");

    let mut index = SemanticIndex::new(project_root.clone(), 3);
    let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
    index.file_mtimes.insert(file.clone(), mtime);
    index.file_sizes.insert(file.clone(), 42);
    index
        .file_hashes
        .insert(file.clone(), cache_freshness::zero_hash());

    // Two exported functions on consecutive lines of the same file.
    let symbols = [
        ("alpha", vec![0.1, 0.2, 0.3]),
        ("beta", vec![0.4, 0.5, 0.6]),
    ];
    for (line, (name, vector)) in symbols.into_iter().enumerate() {
        let chunk = SemanticChunk {
            file: file.clone(),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: line as u32,
            end_line: line as u32,
            exported: true,
            embed_text: format!("file:src/lib.rs kind:function name:{name}"),
            snippet: format!("pub fn {name}() {{}}"),
        };
        index.entries.push(EmbeddingEntry { chunk, vector });
    }

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // In-memory encoding agrees with the reference encoding.
    let legacy_bytes = legacy_semantic_index_bytes(&index);
    assert_eq!(index.to_bytes(), legacy_bytes);

    // On-disk encoding agrees as well.
    index.write_to_disk(storage.path(), "proj");
    let data_path = storage.path().join("semantic/proj/semantic.bin");
    let on_disk = fs::read(&data_path).expect("read semantic.bin");
    assert_eq!(on_disk, legacy_bytes);

    let expected_fingerprint = fingerprint.as_string();
    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        "proj",
        &project_root,
        false,
        Some(&expected_fingerprint),
    )
    .expect("load semantic index");

    assert_eq!(loaded.entries.len(), index.entries.len());
    assert_eq!(loaded.dimension, index.dimension);
    assert_eq!(
        loaded.fingerprint().unwrap().as_string(),
        fingerprint.as_string()
    );
    assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
    assert_eq!(loaded.file_sizes.get(&file), Some(&42));
    assert_eq!(
        loaded.file_hashes.get(&file),
        Some(&cache_freshness::zero_hash())
    );
    for (got, want) in loaded.entries.iter().zip(index.entries.iter()) {
        let (got_chunk, want_chunk) = (&got.chunk, &want.chunk);
        assert_eq!(got_chunk.file, want_chunk.file);
        assert_eq!(got_chunk.name, want_chunk.name);
        assert_eq!(got_chunk.kind, want_chunk.kind);
        assert_eq!(got_chunk.start_line, want_chunk.start_line);
        assert_eq!(got_chunk.end_line, want_chunk.end_line);
        assert_eq!(got_chunk.exported, want_chunk.exported);
        assert_eq!(got_chunk.embed_text, want_chunk.embed_text);
        assert_eq!(got_chunk.snippet, want_chunk.snippet);
        assert_eq!(got.vector, want.vector);
    }
    assert_eq!(loaded.to_bytes(), legacy_bytes);
}
4363
/// Every listed `SymbolKind` maps to its fixed byte code and back again,
/// including `FileSummary` at code 9.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    let kind_codes = [
        (SymbolKind::Function, 0),
        (SymbolKind::Class, 1),
        (SymbolKind::Method, 2),
        (SymbolKind::Struct, 3),
        (SymbolKind::Interface, 4),
        (SymbolKind::Enum, 5),
        (SymbolKind::TypeAlias, 6),
        (SymbolKind::Variable, 7),
        (SymbolKind::Heading, 8),
        (SymbolKind::FileSummary, 9),
    ];

    for (kind, code) in kind_codes {
        let encoded = symbol_kind_to_u8(&kind);
        assert_eq!(encoded, code);
        let decoded = u8_to_symbol_kind(code);
        assert_eq!(decoded, kind);
    }
}
4384
/// With three one-hot entries, a query leaning towards the first axis must
/// return exactly two results with "auth" ranked strictly first.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
        // One-hot vector: entry `i` points along axis `i`.
        let mut one_hot = vec![0.0f32; 3];
        one_hot[i] = 1.0;
        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: (i * 10 + 1) as u32,
            end_line: (i * 10 + 5) as u32,
            exported: true,
            embed_text: format!("kind:function name:{}", name),
            snippet: format!("fn {}() {{}}", name),
        };
        index.entries.push(EmbeddingEntry {
            chunk,
            vector: one_hot,
        });
    }

    let query = vec![0.9, 0.1, 0.0];
    let results = index.search(&query, 2);

    assert_eq!(results.len(), 2);
    let (best, runner_up) = (&results[0], &results[1]);
    assert_eq!(best.name, "auth");
    assert!(best.score > runner_up.score);
}
4417
/// Searching an index without entries yields no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let hits = empty_index.search(&[0.1, 0.2, 0.3], 10);
    assert!(hits.is_empty());
}
4424
/// A symbol that starts and ends on the same line must still produce a
/// snippet containing that line.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";
    // Start and end on line 0.
    let one_line_range = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: "answer".to_string(),
        kind: SymbolKind::Variable,
        range: one_line_range,
        signature: Some("const answer = 42".to_string()),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    let snippet = build_snippet(&symbol, source);

    assert_eq!(snippet, "export const answer = 42;");
}
4447
/// `collect_file_chunks` must produce the same chunks as extracting symbols
/// with `FileParser` and converting them through `symbols_to_chunks`.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let target = project_root.join("src/semantic_index.rs");
    let source = std::fs::read_to_string(&target).unwrap();

    // Reference path: FileParser + symbols_to_chunks.
    let mut reference_parser = FileParser::new();
    let reference_symbols = reference_parser.extract_symbols(&target).unwrap();
    let reference_chunks = symbols_to_chunks(&target, &reference_symbols, &source, &project_root);

    // Path under test.
    let mut parsers = HashMap::new();
    let collected_chunks = collect_file_chunks(&project_root, &target, &mut parsers).unwrap();

    let collected_fingerprint = chunk_fingerprint(&collected_chunks);
    let reference_fingerprint = chunk_fingerprint(&reference_chunks);
    assert_eq!(collected_fingerprint, reference_fingerprint);
}
4466
4467 fn chunk_fingerprint(
4468 chunks: &[SemanticChunk],
4469 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4470 chunks
4471 .iter()
4472 .map(|chunk| {
4473 (
4474 chunk.name.clone(),
4475 chunk.kind.clone(),
4476 chunk.start_line,
4477 chunk.end_line,
4478 chunk.exported,
4479 chunk.embed_text.clone(),
4480 chunk.snippet.clone(),
4481 )
4482 })
4483 .collect()
4484 }
4485
/// A file larger than `MAX_SEMANTIC_FILE_BYTES` yields no chunks, while a
/// small file processed with the same parser cache still does.
#[test]
fn collect_file_chunks_skips_oversized_file() {
    let dir = tempfile::tempdir().unwrap();
    let oversized = dir.path().join("huge.ts");
    // Repeat one statement until the file is just past the size limit.
    let statement = "export const x = 1;\n";
    let repetitions = ((MAX_SEMANTIC_FILE_BYTES as usize) / statement.len()) + 16;
    let filler = statement.repeat(repetitions);
    std::fs::write(&oversized, &filler).unwrap();
    assert!(oversized.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);

    let mut parsers = HashMap::new();
    let oversized_chunks = collect_file_chunks(dir.path(), &oversized, &mut parsers).unwrap();
    assert!(
        oversized_chunks.is_empty(),
        "oversized file must yield no chunks"
    );

    let small = dir.path().join("small.ts");
    std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
    let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
    assert!(!small_chunks.is_empty(), "small file should still chunk");
}
4508
// `SemanticIndex::from_bytes` must fail when the first u32 after the leading
// version byte (presumably the dimension field, per the test name) is
// `MAX_DIMENSION + 1`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    // Leading byte 1, then three little-endian u32 words.
    let mut payload = vec![1u8];
    payload.extend_from_slice(&oversized_dimension.to_le_bytes());
    for _ in 0..2 {
        payload.extend_from_slice(&0u32.to_le_bytes());
    }

    let root = test_project_root();
    assert!(SemanticIndex::from_bytes(&payload, &root).is_err());
}
4519
// `SemanticIndex::from_bytes` must fail when the second u32 after the leading
// version byte (presumably the entry count, per the test name) is
// `MAX_ENTRIES + 1`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let header_words: [u32; 3] = [DEFAULT_DIMENSION as u32, (MAX_ENTRIES as u32) + 1, 0];

    // Leading byte 1, then the three words in little-endian order.
    let mut payload = vec![1u8];
    for word in header_words {
        payload.extend_from_slice(&word.to_le_bytes());
    }

    let root = test_project_root();
    assert!(SemanticIndex::from_bytes(&payload, &root).is_err());
}
4530
// After `invalidate_file`, the index must hold no entry, mtime, or size
// record for the invalidated path.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let path = PathBuf::from("/src/main.rs");

    // Seed the index with one entry plus mtime/size bookkeeping for `path`.
    let chunk = SemanticChunk {
        file: path.clone(),
        name: String::from("main"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: String::from("main"),
        snippet: String::from("fn main() {}"),
    };
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index.file_mtimes.insert(path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(path.clone(), 0);

    index.invalidate_file(&path);

    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&path));
    assert!(!index.file_sizes.contains_key(&path));
}
4559
// A file whose cached metadata no longer matches and which has been deleted
// from disk must be counted as deleted and fully removed from the index.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "vanished_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));

    // Overwrite the cached mtime/size with different values, then remove the file.
    let altered_size = *index.file_sizes.get(&source_path).unwrap() + 1;
    set_file_metadata(
        &mut index,
        &source_path,
        SystemTime::UNIX_EPOCH,
        altered_size,
    );
    fs::remove_file(&source_path).unwrap();

    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 1);
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&source_path));
    assert!(!index.file_sizes.contains_key(&source_path));
    assert!(!index.file_hashes.contains_key(&source_path));
}
4593
// When the indexed path still exists but is no longer a regular file (here it
// is replaced by a directory), the refresh must leave the cached entries and
// the cached mtime/size untouched and report no changes.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "kept_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let entries_before = index.entries.len();
    let mtime_before = *index.file_mtimes.get(&source_path).unwrap();
    let size_before = *index.file_sizes.get(&source_path).unwrap();

    // Record different metadata in the cache, then swap the file for a
    // directory at the same path.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = size_before + 1;
    set_file_metadata(&mut index, &source_path, stale_mtime, stale_size);
    fs::remove_file(&source_path).unwrap();
    fs::create_dir(&source_path).unwrap();

    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(index.entries.len(), entries_before);
    let kept_symbol_present = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept_symbol_present);
    assert_eq!(index.file_mtimes.get(&source_path), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&source_path), Some(&mtime_before));
    assert_eq!(index.file_sizes.get(&source_path), Some(&stale_size));
}
4636
// Refreshing with a path that was never indexed and does not exist on disk
// must not add any mtime, size, or entry for it, and must report zero counts.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    // Only the parent directory is created; the file itself never exists.
    let absent_path = project_root.join("src/missing.rs");
    fs::create_dir_all(absent_path.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&absent_path),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert!(!index.file_mtimes.contains_key(&absent_path));
    assert!(!index.file_sizes.contains_key(&absent_path));
    assert!(index.entries.is_empty());
}
4664
// A file passed to the refresh that was not part of the original index must
// be reported as `added` and end up with entries and an mtime record.
#[test]
fn refresh_reports_added_for_new_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let known_path = project_root.join("src/lib.rs");
    let new_path = project_root.join("src/new.rs");
    fs::create_dir_all(known_path.parent().unwrap()).unwrap();
    write_rust_file(&known_path, "existing_symbol");
    write_rust_file(&new_path, "added_symbol");

    // Only `known_path` is indexed up front.
    let mut index = build_test_index(project_root, std::slice::from_ref(&known_path));
    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh_inputs = [known_path.clone(), new_path.clone()];
    let summary = index
        .refresh_stale_files(
            project_root,
            &refresh_inputs,
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&new_path));
    let new_file_indexed = index
        .entries
        .iter()
        .any(|entry| entry.chunk.file == new_path);
    assert!(new_file_indexed);
}
4695
// An indexed file that is removed from disk and omitted from the refresh
// input must be reported as `deleted` and dropped from the index.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let removed_path = project_root.join("src/deleted.rs");
    fs::create_dir_all(removed_path.parent().unwrap()).unwrap();
    write_rust_file(&removed_path, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&removed_path));
    fs::remove_file(&removed_path).unwrap();

    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    // The refresh is given an empty list of current files.
    let summary = index
        .refresh_stale_files(project_root, &[], &mut embed, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);
    assert!(!index.file_mtimes.contains_key(&removed_path));
    assert!(index.entries.is_empty());
}
4720
// A file whose cached metadata differs from disk and whose content was
// rewritten must be reported as `changed`; the old symbol is replaced by the
// new one in the index.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    // Reset the cached mtime/size to epoch/0, then rewrite the file.
    set_file_metadata(&mut index, &source_path, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_path, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    let has_symbol = |wanted: &str| {
        index
            .entries
            .iter()
            .any(|entry| entry.chunk.name == wanted)
    };
    assert!(has_symbol("new_symbol"));
    assert!(!has_symbol("old_symbol"));
}
4758
// Refreshing an index whose only file is unchanged must be a no-op: the
// embedding callback is never invoked and the entry count stays the same.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let entries_before = index.entries.len();

    // Wrap the test embedder so that any invocation is recorded.
    let mut embed_called = false;
    let mut embed = |texts: Vec<String>| {
        embed_called = true;
        test_vector_for_texts(texts)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_path),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embed_called);
    assert_eq!(index.entries.len(), entries_before);
}
4790
// A dlopen-style load failure message for `libonnxruntime.dylib` must be
// classified as "ONNX Runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let load_error = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(load_error);

    assert!(unavailable);
}
4797
// The formatted init error for a missing ONNX Runtime must lead with the
// install hint and still include the original error text.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let load_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(load_error);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
4807
// The OpenAI-compatible backend must POST to `/v1/embeddings` on the mock
// server and return the two embedding vectors from the response.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (server_url, server_thread) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        String::from(
            r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#,
        )
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    server_thread.join().unwrap();
}
4834
// Captures the raw HTTP request sent by the OpenAI-compatible backend and
// checks that it carries exactly one `Content-Type` header line and that the
// JSON body contains the configured model name.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};

    // Returns the `Content-Length` declared in a raw header section, or 0 when
    // the header is absent or unparseable. HTTP field names are
    // case-insensitive, so the name is compared ignoring ASCII case; an exact
    // `Content-Length:` match would miss a client that writes
    // `content-length:` and the server would stop reading before the body.
    fn declared_content_length(head: &str) -> usize {
        head.lines()
            .filter_map(|line| line.split_once(':'))
            .find(|(name, _)| name.trim().eq_ignore_ascii_case("content-length"))
            .and_then(|(_, value)| value.trim().parse::<usize>().ok())
            .unwrap_or(0)
    }

    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end: Option<usize> = None;
        let mut content_length = 0usize;
        // Read until the full header section plus the declared body length
        // has arrived (or the peer closes the connection).
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    content_length =
                        declared_content_length(&String::from_utf8_lossy(&buf[..pos + 4]));
                }
            }
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best-effort reply; the assertions below only inspect the request.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively so both `Content-Type:` and
    // `content-type:` spellings are detected.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
4927
// The Ollama backend must POST to `/api/embed` on the mock server and return
// the two embedding vectors from the response.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (server_url, server_thread) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        String::from(r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#)
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: String::from("embeddinggemma"),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
    server_thread.join().unwrap();
}
4954
// An index written with one fingerprint must load when the same fingerprint
// string is expected and must not load when a different one is expected.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let cache_dir = tempfile::tempdir().unwrap();
    let project_key = "proj";

    let project_root = test_project_root();
    let source_file = project_root.join("src/main.rs");

    let entry = EmbeddingEntry {
        chunk: SemanticChunk {
            file: source_file.clone(),
            name: String::from("handle_request"),
            kind: SymbolKind::Function,
            start_line: 10,
            end_line: 25,
            exported: true,
            embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
            snippet: String::from("fn handle_request() {}"),
        },
        vector: vec![0.1, 0.2, 0.3],
    };

    // Build a three-dimensional index with one entry and persist it.
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(entry);
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);
    index.set_fingerprint(SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    });
    index.write_to_disk(cache_dir.path(), project_key);

    // Same fingerprint: the cache is accepted.
    let matching = index.fingerprint().unwrap().as_string();
    assert!(SemanticIndex::read_from_disk(
        cache_dir.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    )
    .is_some());

    // Different backend/model/base_url: the cache is rejected.
    let other_fingerprint = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let mismatched = other_fingerprint.as_string();
    assert!(SemanticIndex::read_from_disk(
        cache_dir.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    )
    .is_none());
}
5017
// A cache file whose leading version byte is `SEMANTIC_INDEX_VERSION_V3` must
// not load, even with a matching fingerprint, and the file must no longer
// exist afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let source_file = PathBuf::from("/src/main.rs");
    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: source_file.clone(),
            name: String::from("handle_request"),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 0,
            exported: true,
            embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
            snippet: String::from("fn handle_request() {}"),
        },
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);
    index.set_fingerprint(fingerprint.clone());

    // Serialize the index, then overwrite the first byte with the V3 marker.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, serialized).unwrap();

    let expected_fingerprint = fingerprint.as_string();
    assert!(SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&expected_fingerprint)
    )
    .is_none());
    assert!(!cache_file.exists());
}
5067
5068 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5069 crate::symbols::Symbol {
5070 name: name.to_string(),
5071 kind,
5072 range: crate::symbols::Range {
5073 start_line: start,
5074 start_col: 0,
5075 end_line: end,
5076 end_col: 0,
5077 },
5078 signature: None,
5079 scope_chain: Vec::new(),
5080 exported: false,
5081 parent: None,
5082 }
5083 }
5084
// A symbol list consisting only of `Heading` symbols must produce no chunks.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let root = PathBuf::from("/proj");
    let readme = root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let headings = vec![
        make_symbol(SymbolKind::Heading, "Title", 0, 2),
        make_symbol(SymbolKind::Heading, "Section", 4, 6),
    ];

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &root);
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
5107
// A symbol carrying a very long signature (2000 repetitions of "kubectl ")
// must still yield embed text of at most `MAX_EMBED_TEXT_CHARS` characters.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let root = PathBuf::from("/proj");
    let manifest = root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some("kubectl ".repeat(2000));

    let text = build_embed_text(&symbol, source, &manifest, &root);
    let char_count = text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
5132
// With one heading and two code symbols, exactly three chunks are expected:
// a file summary plus the function and the struct. The heading must not
// appear among the chunk names.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let root = PathBuf::from("/proj");
    let lib_file = root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n true\n}\n";

    let mixed_symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&lib_file, &mixed_symbols, source, &root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
5167
// `localhost`-style hostnames, with and without ports, must pass the SSRF
// validation.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    let allowed = [
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ];

    for host in allowed.iter() {
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
5186
// Literal 127.x.x.x addresses, with and without ports, must pass the SSRF
// validation.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let allowed = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in allowed.iter() {
        let result = validate_base_url_no_ssrf(url);
        assert!(
            result.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
5205
// Private, link-local, and shared-address-space IPv4 literals that are not
// loopback must fail the SSRF validation.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let rejected = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in rejected.iter() {
        let result = validate_base_url_no_ssrf(url);
        assert!(
            result.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            result
        );
    }
}
5227
// Hostnames ending in `.local` must fail the SSRF validation.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let rejected = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in rejected.iter() {
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            result
        );
    }
}
5245
// `normalize_base_url` must accept both a 127.0.0.1 URL and a localhost URL.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let by_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(by_ip.is_ok());

    let by_name = normalize_base_url("http://localhost:8080");
    assert!(by_name.is_ok());
}
5253
// Exercises `is_private_non_loopback_ip` directly: reserved and private
// ranges are blocked, loopback forms and a public address are not.
#[test]
fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
    use std::net::IpAddr;

    let is_blocked = |literal: &str| {
        let ip: IpAddr = literal.parse().unwrap();
        is_private_non_loopback_ip(&ip)
    };

    // Private, link-local, and shared-address-space IPv4.
    for literal in ["10.0.0.1", "192.168.1.1", "169.254.0.1", "100.64.0.1"].iter() {
        assert!(is_blocked(literal));
    }
    assert!(
        is_blocked("198.18.0.1"),
        "RFC2544 benchmark range must be blocked"
    );
    assert!(is_blocked("224.0.0.1"), "multicast must be blocked");
    assert!(is_blocked("fc00::1"), "IPv6 ULA must be blocked");
    assert!(is_blocked("fe80::1"), "IPv6 link-local must be blocked");

    // Loopback in its IPv4, IPv6, and IPv4-mapped forms.
    assert!(!is_blocked("127.0.0.1"), "loopback must stay allowed");
    assert!(!is_blocked("::1"), "IPv6 loopback must stay allowed");
    assert!(
        !is_blocked("::ffff:127.0.0.1"),
        "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
    );

    // A public address.
    assert!(!is_blocked("8.8.8.8"));
}
5284
// The version-mismatch message must report the detected version, the library
// path, and the requirement, and must list the auto-fix option before the
// manual removal option.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering check: "Auto-fix" must appear earlier in the text than the
    // manual removal instruction.
    let auto_fix_offset = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_offset = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_offset < manual_offset,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
5325
// When the requested library name carries no version but a resolved path
// does, detection must return the version and path of the resolved library.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested_name = "libonnxruntime.so";
    let resolved_path = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The bare requested name alone yields no version.
    assert_eq!(detect_ort_version_from_path(requested_name), None);

    let (version, source) = detect_ort_version_from_resolved_or_requested(
        Some(resolved_path.to_string()),
        requested_name,
    );
    assert_eq!(version.as_deref(), Some("1.19.0"));
    assert_eq!(source, resolved_path);

    let detected = version.unwrap();
    let msg = format_ort_version_mismatch(&detected, &source);
    assert!(msg.contains("v1.19.0"));
    assert!(msg.contains(resolved_path));
}
5343
// The mismatch message for a Homebrew `.dylib` path must include the version
// and the path, with the path wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib_path = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib_path);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
5360}