1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
/// Default embedding width.
/// NOTE(review): not referenced in the visible part of this file — presumably the
/// dimension of the default local model; confirm.
const DEFAULT_DIMENSION: usize = 384;
/// NOTE(review): presumably an upper bound on entries accepted from a persisted index — confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding width accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// NOTE(review): presumably the byte length of the V1 on-disk header — confirm against the reader.
const HEADER_BYTES_V1: usize = 9;
/// NOTE(review): presumably the byte length of the V2 on-disk header — confirm against the reader.
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint that `format_embedding_init_error` prepends when ONNX Runtime cannot be loaded.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags of the semantic index format.
// NOTE(review): what changed between versions is not visible here — see the (de)serializer.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does not end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is 0 (applies to every backend).
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which `embed_query_cached` starts evicting the oldest one.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when none is configured or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes, including the first one.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Sleep (in milliseconds) before the retry that follows attempt index 0 and 1 respectively.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held by `SemanticIndexLock::acquire` around the file-lock attempt, so lock acquisitions made
/// from within this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
63
64pub struct SemanticIndexLock {
65 _guard: fs_lock::LockGuard,
66}
67
68impl SemanticIndexLock {
69 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
70 let dir = storage_dir.join("semantic").join(project_key);
71 fs::create_dir_all(&dir)?;
72 let path = dir.join("cache.lock");
73 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
74 .lock()
75 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
76 fs_lock::try_acquire(&path, Duration::from_secs(2))
77 .map(|guard| Self { _guard: guard })
78 .map_err(|error| match error {
79 fs_lock::AcquireError::Timeout => {
80 std::io::Error::other("timed out acquiring semantic cache lock")
81 }
82 fs_lock::AcquireError::Io(error) => error,
83 })
84 }
85}
86
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared against a stored string by `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name from the configuration.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when none is usable.
    /// Defaults to an empty string when absent from the serialized form.
    #[serde(default)]
    pub base_url: String,
    /// Width of the embedding vectors.
    pub dimension: usize,
    /// Chunking scheme version; defaults to `default_chunking_version()` when absent
    /// from the serialized form.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
97
/// Chunking version stamped on new fingerprints and assumed for serialized
/// fingerprints that lack the `chunking_version` field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
101
102impl SemanticIndexFingerprint {
103 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
104 let base_url = config
107 .base_url
108 .as_ref()
109 .and_then(|u| normalize_base_url(u).ok())
110 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
111 Self {
112 backend: config.backend.as_str().to_string(),
113 model: config.model.clone(),
114 base_url,
115 dimension,
116 chunking_version: default_chunking_version(),
117 }
118 }
119
120 pub fn as_string(&self) -> String {
121 serde_json::to_string(self).unwrap_or_else(|_| String::new())
122 }
123
124 fn matches_expected(&self, expected: &str) -> bool {
125 let encoded = self.as_string();
126 !encoded.is_empty() && encoded == expected
127 }
128}
129
/// Concrete embedding implementation behind a `SemanticEmbeddingModel`.
enum SemanticEmbeddingEngine {
    /// In-process embedder (selected for the `Fastembed` backend).
    Local(LocalEmbedder),
    /// Remote endpoint reached via `POST <base_url>[/v1]/embeddings`.
    OpenAiCompatible {
        /// Blocking HTTP client configured with the request timeout.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote Ollama endpoint reached via `POST <base_url>/api/embed`.
    Ollama {
        /// Blocking HTTP client configured with the request timeout.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
    },
}
146
/// An embedding backend plus its configuration and a small cache of query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend kind copied from the configuration.
    backend: SemanticBackend,
    /// Model name copied from the configuration.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (the default is substituted for 0).
    timeout_ms: u64,
    /// Effective maximum batch size (the default is substituted for 0).
    max_batch_size: usize,
    /// Embedding width once known; `None` until a probe or a remote embed call measured it.
    dimension: Option<usize>,
    /// The engine that actually produces vectors.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls answered from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to compute an embedding.
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
162
163fn validate_embedding_batch(
164 vectors: &[Vec<f32>],
165 expected_count: usize,
166 context: &str,
167) -> Result<(), String> {
168 if expected_count > 0 && vectors.is_empty() {
169 return Err(format!(
170 "{context} returned no vectors for {expected_count} inputs"
171 ));
172 }
173
174 if vectors.len() != expected_count {
175 return Err(format!(
176 "{context} returned {} vectors for {} inputs",
177 vectors.len(),
178 expected_count
179 ));
180 }
181
182 let Some(first_vector) = vectors.first() else {
183 return Ok(());
184 };
185 let expected_dimension = first_vector.len();
186 validate_embedding_dimension(expected_dimension)
187 .map_err(|error| format!("{context} returned {error}"))?;
188 for (index, vector) in vectors.iter().enumerate() {
189 if vector.len() != expected_dimension {
190 return Err(format!(
191 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
192 vector.len()
193 ));
194 }
195 }
196
197 Ok(())
198}
199
200fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
201 if dimension == 0 || dimension > MAX_DIMENSION {
202 return Err(format!(
203 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
204 ));
205 }
206
207 Ok(())
208}
209
210fn normalize_base_url(raw: &str) -> Result<String, String> {
214 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
215 let scheme = parsed.scheme();
216 if scheme != "http" && scheme != "https" {
217 return Err(format!(
218 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
219 scheme
220 ));
221 }
222 Ok(parsed.to_string().trim_end_matches('/').to_string())
223}
224
225pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
240 use std::net::{IpAddr, ToSocketAddrs};
241
242 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
243
244 let host = parsed.host_str().unwrap_or("");
245
246 let is_loopback_host =
251 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
252 if is_loopback_host {
253 return Ok(());
254 }
255
256 if host.ends_with(".local") {
259 return Err(format!(
260 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
261 ));
262 }
263
264 let port = parsed.port_or_known_default().unwrap_or(443);
267 let addr_str = format!("{host}:{port}");
268 let addrs: Vec<IpAddr> = addr_str
269 .to_socket_addrs()
270 .map(|iter| iter.map(|sa| sa.ip()).collect())
271 .unwrap_or_default();
272 for ip in &addrs {
273 if is_private_non_loopback_ip(ip) {
274 return Err(format!(
275 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
276 ));
277 }
278 }
279
280 Ok(())
281}
282
283fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
294 if ip.to_canonical().is_loopback() {
297 return false;
298 }
299 crate::url_fetch::is_private_or_reserved_ip(*ip)
300}
301
302fn build_openai_embeddings_endpoint(base_url: &str) -> String {
303 if base_url.ends_with("/v1") {
304 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
305 } else {
306 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
307 }
308}
309
310fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
311 if base_url.ends_with("/api") {
312 format!("{base_url}/embed")
313 } else {
314 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
315 }
316}
317
/// Trims surrounding whitespace from `value`; a missing or blank value becomes `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    match value.as_deref().map(str::trim) {
        Some(token) if !token.is_empty() => Some(token.to_owned()),
        _ => None,
    }
}
328
329fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
330 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
331}
332
333fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
339 if !matches!(
340 status,
341 reqwest::StatusCode::BAD_REQUEST
342 | reqwest::StatusCode::CONFLICT
343 | reqwest::StatusCode::REQUEST_TIMEOUT
344 | reqwest::StatusCode::LOCKED
345 | reqwest::StatusCode::TOO_EARLY
346 ) {
347 return false;
348 }
349
350 let lower = raw.to_ascii_lowercase();
351 let normalized = lower.trim();
352
353 normalized.contains("model was unloaded while the request was still in queue")
354 || normalized == "model is loading"
355 || normalized.starts_with("model is loading,")
356 || normalized.contains(r#""error":"model is loading"#)
357 || normalized.contains(r#""message":"model is loading"#)
358 || normalized == "model not loaded"
359 || normalized.contains(r#""error":"model not loaded""#)
360 || normalized.contains(r#""message":"model not loaded""#)
361 || normalized == "loading model into memory"
362 || normalized.contains(r#""error":"loading model into memory""#)
363 || normalized.contains(r#""message":"loading model into memory""#)
364 || normalized == "model is being loaded"
365 || normalized.contains(r#""error":"model is being loaded""#)
366 || normalized.contains(r#""message":"model is being loaded""#)
367 || normalized == "model is currently loading"
368 || normalized.contains(r#""error":"model is currently loading""#)
369 || normalized.contains(r#""message":"model is currently loading""#)
370}
371
372fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
373 error.is_connect()
374}
375
376fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
382 error.is_connect() || error.is_timeout()
383}
384
385fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
386 embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
387}
388
/// Tag placed in front of embedding error messages that describe a transient failure.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// True when `error` contains [`TRANSIENT_EMBEDDING_MARKER`] anywhere in its text.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    let marker = TRANSIENT_EMBEDDING_MARKER;
    error.contains(marker)
}

/// Returns `error` with every occurrence of [`TRANSIENT_EMBEDDING_MARKER`] removed.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    // Splitting on the marker and re-joining the pieces drops each occurrence.
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
407
408fn sleep_before_embedding_retry(attempt_index: usize) {
409 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
410 std::thread::sleep(Duration::from_millis(*delay_ms));
411 }
412}
413
414fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
415where
416 F: FnMut() -> reqwest::blocking::RequestBuilder,
417{
418 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
419 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
420
421 let response = match make_request().send() {
422 Ok(response) => response,
423 Err(error) => {
424 if !last_attempt && is_retryable_embedding_error(&error) {
425 sleep_before_embedding_retry(attempt_index);
426 continue;
427 }
428 let marker = if embedding_send_error_is_transient(&error) {
432 TRANSIENT_EMBEDDING_MARKER
433 } else {
434 ""
435 };
436 return Err(format!("{marker}{backend_label} request failed: {error}"));
437 }
438 };
439
440 let status = response.status();
441 let raw = match response.text() {
442 Ok(raw) => raw,
443 Err(error) => {
444 if !last_attempt && embedding_response_read_error_is_transient(&error) {
445 sleep_before_embedding_retry(attempt_index);
446 continue;
447 }
448 let marker = if embedding_response_read_error_is_transient(&error) {
449 TRANSIENT_EMBEDDING_MARKER
450 } else {
451 ""
452 };
453 return Err(format!(
454 "{marker}{backend_label} response read failed: {error}"
455 ));
456 }
457 };
458
459 if status.is_success() {
460 return Ok(raw);
461 }
462
463 let body_transient = embedding_response_body_is_transient(status, &raw);
467 if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
468 sleep_before_embedding_retry(attempt_index);
469 continue;
470 }
471
472 let marker = if is_retryable_embedding_status(status) || body_transient {
478 TRANSIENT_EMBEDDING_MARKER
479 } else {
480 ""
481 };
482 return Err(format!(
483 "{marker}{backend_label} request failed (HTTP {}): {}",
484 status, raw
485 ));
486 }
487
488 unreachable!("embedding request retries exhausted without returning")
489}
490
491impl SemanticEmbeddingModel {
492 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
493 let timeout_ms = if config.timeout_ms == 0 {
494 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
495 } else {
496 config.timeout_ms
497 };
498
499 let max_batch_size = if config.max_batch_size == 0 {
500 DEFAULT_MAX_BATCH_SIZE
501 } else {
502 config.max_batch_size
503 };
504
505 let api_key_env = normalize_api_key(config.api_key_env.clone());
506 let model = config.model.clone();
507
508 let client = Client::builder()
509 .timeout(Duration::from_millis(timeout_ms))
510 .redirect(reqwest::redirect::Policy::none())
511 .build()
512 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
513
514 let engine = match config.backend {
515 SemanticBackend::Fastembed => {
516 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
517 }
518 SemanticBackend::OpenAiCompatible => {
519 let raw = config.base_url.as_ref().ok_or_else(|| {
520 "base_url is required for openai_compatible backend".to_string()
521 })?;
522 let base_url = normalize_base_url(raw)?;
523
524 let api_key = match api_key_env {
525 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
526 format!("missing api_key_env '{var_name}' for openai_compatible backend")
527 })?),
528 None => None,
529 };
530
531 SemanticEmbeddingEngine::OpenAiCompatible {
532 client,
533 model,
534 base_url,
535 api_key,
536 }
537 }
538 SemanticBackend::Ollama => {
539 let raw = config
540 .base_url
541 .as_ref()
542 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
543 let base_url = normalize_base_url(raw)?;
544
545 SemanticEmbeddingEngine::Ollama {
546 client,
547 model,
548 base_url,
549 }
550 }
551 };
552
553 Ok(Self {
554 backend: config.backend,
555 model: config.model.clone(),
556 base_url: config.base_url.clone(),
557 timeout_ms,
558 max_batch_size,
559 dimension: None,
560 engine,
561 query_embedding_cache: HashMap::new(),
562 query_embedding_cache_order: VecDeque::new(),
563 query_embedding_cache_hits: 0,
564 query_embedding_cache_misses: 0,
565 })
566 }
567
568 pub fn backend(&self) -> SemanticBackend {
569 self.backend
570 }
571
572 pub fn model(&self) -> &str {
573 &self.model
574 }
575
576 pub fn base_url(&self) -> Option<&str> {
577 self.base_url.as_deref()
578 }
579
580 pub fn max_batch_size(&self) -> usize {
581 self.max_batch_size
582 }
583
584 pub fn timeout_ms(&self) -> u64 {
585 self.timeout_ms
586 }
587
588 pub fn fingerprint(
589 &mut self,
590 config: &SemanticBackendConfig,
591 ) -> Result<SemanticIndexFingerprint, String> {
592 let dimension = self.dimension()?;
593 Ok(SemanticIndexFingerprint::from_config(config, dimension))
594 }
595
596 pub fn dimension(&mut self) -> Result<usize, String> {
597 if let Some(dimension) = self.dimension {
598 return Ok(dimension);
599 }
600
601 let dimension = match &mut self.engine {
602 SemanticEmbeddingEngine::Local(model) => {
603 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
604 vectors
605 .first()
606 .map(|v| v.len())
607 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
608 }
609 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
610 let vectors =
611 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
612 vectors
613 .first()
614 .map(|v| v.len())
615 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
616 }
617 SemanticEmbeddingEngine::Ollama { .. } => {
618 let vectors =
619 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
620 vectors
621 .first()
622 .map(|v| v.len())
623 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
624 }
625 };
626
627 self.dimension = Some(dimension);
628 Ok(dimension)
629 }
630
631 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
632 self.embed_texts(texts)
633 }
634
635 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
636 if let Some(vector) = self.query_embedding_cache.get(query) {
637 self.query_embedding_cache_hits += 1;
638 return Ok(vector.clone());
639 }
640
641 self.query_embedding_cache_misses += 1;
642 let embeddings = self.embed_texts(vec![query.to_string()])?;
643 let vector = embeddings
644 .first()
645 .cloned()
646 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
647
648 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
649 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
650 self.query_embedding_cache.remove(&oldest);
651 }
652 }
653 self.query_embedding_cache
654 .insert(query.to_string(), vector.clone());
655 self.query_embedding_cache_order
656 .push_back(query.to_string());
657
658 Ok(vector)
659 }
660
661 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
662 (
663 self.query_embedding_cache_hits,
664 self.query_embedding_cache_misses,
665 self.query_embedding_cache.len(),
666 )
667 }
668
669 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
670 match &mut self.engine {
671 SemanticEmbeddingEngine::Local(model) => model
672 .embed(&texts)
673 .map_err(|error| format!("failed to embed batch: {error}")),
674 SemanticEmbeddingEngine::OpenAiCompatible {
675 client,
676 model,
677 base_url,
678 api_key,
679 } => {
680 let expected_text_count = texts.len();
681 let endpoint = build_openai_embeddings_endpoint(base_url);
682 let body = serde_json::json!({
683 "input": texts,
684 "model": model,
685 });
686
687 let raw = send_embedding_request(
688 || {
689 let mut request = client.post(&endpoint).json(&body);
699
700 if let Some(api_key) = api_key {
701 request = request.header("Authorization", format!("Bearer {api_key}"));
702 }
703
704 request
705 },
706 "openai compatible",
707 )?;
708
709 #[derive(Deserialize)]
710 struct OpenAiResponse {
711 data: Vec<OpenAiEmbeddingResult>,
712 }
713
714 #[derive(Deserialize)]
715 struct OpenAiEmbeddingResult {
716 embedding: Vec<f32>,
717 index: Option<u32>,
718 }
719
720 let parsed: OpenAiResponse = serde_json::from_str(&raw)
721 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
722 if parsed.data.len() != expected_text_count {
723 return Err(format!(
724 "openai compatible response returned {} embeddings for {} inputs",
725 parsed.data.len(),
726 expected_text_count
727 ));
728 }
729
730 let mut vectors = vec![Vec::new(); parsed.data.len()];
731 for (i, item) in parsed.data.into_iter().enumerate() {
732 let index = item.index.unwrap_or(i as u32) as usize;
733 if index >= vectors.len() {
734 return Err(
735 "openai compatible response contains invalid vector index".to_string()
736 );
737 }
738 vectors[index] = item.embedding;
739 }
740
741 for vector in &vectors {
742 if vector.is_empty() {
743 return Err(
744 "openai compatible response contained missing vectors".to_string()
745 );
746 }
747 }
748
749 self.dimension = vectors.first().map(Vec::len);
750 Ok(vectors)
751 }
752 SemanticEmbeddingEngine::Ollama {
753 client,
754 model,
755 base_url,
756 } => {
757 let expected_text_count = texts.len();
758 let endpoint = build_ollama_embeddings_endpoint(base_url);
759
760 #[derive(Serialize)]
761 struct OllamaPayload<'a> {
762 model: &'a str,
763 input: Vec<String>,
764 }
765
766 let payload = OllamaPayload {
767 model,
768 input: texts,
769 };
770
771 let raw = send_embedding_request(
772 || {
773 client.post(&endpoint).json(&payload)
778 },
779 "ollama",
780 )?;
781
782 #[derive(Deserialize)]
783 struct OllamaResponse {
784 embeddings: Vec<Vec<f32>>,
785 }
786
787 let parsed: OllamaResponse = serde_json::from_str(&raw)
788 .map_err(|error| format!("invalid ollama response: {error}"))?;
789 if parsed.embeddings.is_empty() {
790 return Err("ollama response returned no embeddings".to_string());
791 }
792 if parsed.embeddings.len() != expected_text_count {
793 return Err(format!(
794 "ollama response returned {} embeddings for {} inputs",
795 parsed.embeddings.len(),
796 expected_text_count
797 ));
798 }
799
800 let vectors = parsed.embeddings;
801 for vector in &vectors {
802 if vector.is_empty() {
803 return Err("ollama response contained empty embeddings".to_string());
804 }
805 }
806
807 self.dimension = vectors.first().map(Vec::len);
808 Ok(vectors)
809 }
810 }
811 }
812}
813
/// Checks up front that an ONNX Runtime shared library can be loaded and, when its version can
/// be determined, that the version is 1.x with a minor of at least 20.
///
/// The library is taken from `ORT_DYLIB_PATH` when set, otherwise the platform's default
/// library name is used. The library is loaded only for the check and released again before
/// returning. On targets other than Linux, macOS and Windows no check runs and `Ok(())` is
/// returned.
///
/// # Errors
///
/// Returns a user-facing message when the library cannot be loaded or when the detected
/// version is outside the supported range. An undetectable version is not an error.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY (review): `c_name` outlives the `dlopen` call, `dlerror`'s result is
        // null-checked before being read, and `handle` is used only between the successful
        // `dlopen` and the `dlclose` below.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version is derived from file names (of the resolved library path when
            // available, otherwise of the requested name); `version_source` is the path
            // reported in the mismatch message.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // Only major and minor are checked; unparsable components skip the check.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Minimal hand-written bindings for the Win32 calls used below.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-info record; only `dw_file_version_ms` is read.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32,
            dw_file_version_ls: u32,
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY (review): every pointer handed to the Win32 calls refers to a live local
        // buffer (`wide`, `path_buf`, `info`), `handle` is null-checked before use, and
        // `vs_info` is dereferenced only after `VerQueryValueW` succeeded and returned a
        // non-null pointer.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            // NUL-terminated UTF-16 copy of the library name for the wide-char API.
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major of 0 doubles as "version not detected" and skips the check below.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // Sub-block "\" (NUL-terminated) is queried for the fixed file info.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High 16 bits are taken as the major, low 16 bits as the minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1004
1005#[cfg(any(target_os = "linux", target_os = "macos"))]
1006unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1007 let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1008 let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1009 if symbol.is_null() {
1010 return None;
1011 }
1012
1013 let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1014 if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1015 return None;
1016 }
1017
1018 let info = unsafe { info.assume_init() };
1019 if info.dli_fname.is_null() {
1020 return None;
1021 }
1022
1023 Some(
1024 unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1025 .to_string_lossy()
1026 .into_owned(),
1027 )
1028}
1029
1030#[cfg(any(target_os = "linux", target_os = "macos"))]
1031fn detect_ort_version_from_resolved_or_requested(
1032 resolved_path: Option<String>,
1033 requested_lib_name: &str,
1034) -> (Option<String>, String) {
1035 if let Some(path) = resolved_path {
1036 if let Some(version) = detect_ort_version_from_path(&path) {
1037 return (Some(version), path);
1038 }
1039 return (detect_ort_version_from_path(requested_lib_name), path);
1040 }
1041
1042 (
1043 detect_ort_version_from_path(requested_lib_name),
1044 requested_lib_name.to_string(),
1045 )
1046}
1047
1048#[cfg(any(target_os = "linux", target_os = "macos"))]
1049fn detect_ort_version_from_loaded_library(
1050 handle: *mut std::ffi::c_void,
1051 requested_lib_name: &str,
1052) -> (Option<String>, String) {
1053 detect_ort_version_from_resolved_or_requested(
1054 unsafe { loaded_library_path_from_handle(handle) },
1055 requested_lib_name,
1056 )
1057}
1058
1059#[cfg(any(target_os = "linux", target_os = "macos"))]
1062fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1063 let path = std::path::Path::new(lib_path);
1064
1065 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1067 .into_iter()
1068 .flatten()
1069 {
1070 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1071 if let Some(version) = extract_version_from_filename(name) {
1072 return Some(version);
1073 }
1074 }
1075 }
1076
1077 if let Some(parent) = path.parent() {
1079 if let Ok(entries) = std::fs::read_dir(parent) {
1080 for entry in entries.flatten() {
1081 if let Some(name) = entry.file_name().to_str() {
1082 if name.starts_with("libonnxruntime") {
1083 if let Some(version) = extract_version_from_filename(name) {
1084 return Some(version);
1085 }
1086 }
1087 }
1088 }
1089 }
1090 }
1091
1092 None
1093}
1094
1095#[cfg(any(target_os = "linux", target_os = "macos"))]
1097fn extract_version_from_filename(name: &str) -> Option<String> {
1098 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1100 re.find(name).map(|m| m.as_str().to_string())
1101}
1102
/// Suggests a shell command for removing the library at `lib_path`.
///
/// On Linux and macOS, libraries under `/usr/local/lib` and the bare default library names get
/// a `sudo rm` over `/usr/local/lib/libonnxruntime*`; every other case gets a plain `rm` of
/// the given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_wide_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }

    format!(" rm '{}'", lib_path)
}
1115
1116pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1122 format!(
1123 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1124 Solutions:\n\
1125 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1126 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1127 configures the bridge to load it instead of the system library — no \
1128 changes to '{}'.\n\
1129 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1130 {}\n\
1131 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1132 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1133 version,
1134 lib_name,
1135 lib_name,
1136 suggest_removal_command(lib_name),
1137 )
1138}
1139
/// Heuristically decides whether `message` reports that ONNX Runtime could not be loaded.
///
/// A message starting (after leading whitespace) with `ONNX Runtime not found.` always
/// qualifies. Otherwise the lower-cased message must mention both the runtime and a
/// dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_HINTS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any =
        |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_HINTS)
}
1165
1166pub fn format_embedding_init_error(error: impl Display) -> String {
1167 let message = error.to_string();
1168
1169 if is_onnx_runtime_unavailable(&message) {
1170 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1171 }
1172
1173 format!("failed to initialize semantic embedding model: {message}")
1174}
1175
/// One unit of source code prepared for embedding.
/// NOTE(review): field meanings below are inferred from names and types — confirm against
/// `collect_chunks`.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to.
    pub file: PathBuf,
    /// Name of the chunk (presumably the symbol name).
    pub name: String,
    /// Kind of symbol the chunk represents.
    pub kind: SymbolKind,
    /// First line of the chunk (indexing base not visible here).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text handed to the embedding model.
    pub embed_text: String,
    /// Source excerpt kept for display in results.
    pub snippet: String,
}
1195
/// A chunk paired with its embedding vector.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The embedded chunk.
    chunk: SemanticChunk,
    /// Embedding vector for `chunk`.
    vector: Vec<f32>,
}
1202
/// In-memory semantic index: embedded chunks plus per-file bookkeeping.
/// NOTE(review): how the per-file maps drive freshness checks is not visible in this part of
/// the file — confirm against the refresh code.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// All embedded chunks.
    entries: Vec<EmbeddingEntry>,
    /// Recorded modification time per file; its size is what `indexed_file_count` reports.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Recorded size in bytes per file.
    file_sizes: HashMap<PathBuf, u64>,
    /// Recorded BLAKE3 content hash per file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Width of the stored embedding vectors.
    dimension: usize,
    /// Fingerprint of the embedding setup, when known (`None` for a freshly created index).
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; `new` debug-asserts that it is absolute.
    project_root: PathBuf,
    /// Files whose processing has been deferred (presumably to a later refresh — confirm).
    deferred_files: HashSet<PathBuf>,
}
1218
/// Metadata captured for a file when it is indexed.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// Size of the file in bytes.
    size: u64,
    /// BLAKE3 hash of the file content.
    content_hash: blake3::Hash,
}
1225
/// Counters describing what a refresh pass did to the index.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were successfully re-processed.
    pub changed: usize,
    /// Files that were not indexed before and were successfully processed.
    pub added: usize,
    /// Indexed files that were removed because they no longer exist.
    pub deleted: usize,
    /// Number of files the refresh looked at.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when the refresh changed, added, and deleted nothing.
    ///
    /// `total_processed` is deliberately not consulted: a pass that only
    /// inspected files still counts as a no-op.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1242
/// Outcome of `SemanticIndex::refresh_invalidated_files`.
///
/// The first three fields have the same types as the parameters of
/// `SemanticIndex::apply_refresh_update`.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Entries produced (embedded or reused) by the refresh.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh metadata for every file that was successfully processed.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// All paths the refresh handled, including deferred ones that were retried.
    pub completed_paths: Vec<PathBuf>,
    /// Counters for the refresh.
    pub summary: RefreshSummary,
}
1253
/// A previously computed vector kept together with the exact text it was computed from.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Full embed text, compared verbatim after the hash lookup to confirm a match.
    embed_text: String,
    /// Vector that was computed for `embed_text`.
    vector: Vec<f32>,
}
1259
/// Reuse lookup: file path -> hash of a chunk's embed text -> candidate embeddings.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1261
/// A single search hit: the matched chunk's descriptive fields plus its score.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched chunk.
    pub file: PathBuf,
    /// Name of the matched chunk.
    pub name: String,
    /// Symbol kind of the matched chunk.
    pub kind: SymbolKind,
    /// Start line copied from the chunk.
    pub start_line: u32,
    /// End line copied from the chunk.
    pub end_line: u32,
    /// Whether the matched chunk is marked exported.
    pub exported: bool,
    /// Display snippet copied from the chunk.
    pub snippet: String,
    /// Similarity score; `SemanticIndex::search` multiplies it by 1.1 for exported chunks.
    pub score: f32,
    /// Label for the producer of the hit; `SemanticIndex::search` sets `"semantic"`.
    pub source: &'static str,
}
1275
1276impl SemanticIndex {
1277 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1278 debug_assert!(project_root.is_absolute());
1279 Self {
1280 entries: Vec::new(),
1281 file_mtimes: HashMap::new(),
1282 file_sizes: HashMap::new(),
1283 file_hashes: HashMap::new(),
1284 dimension,
1285 fingerprint: None,
1286 project_root,
1287 deferred_files: HashSet::new(),
1288 }
1289 }
1290
    /// Returns the number of embedded chunks in the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1295
    /// Returns the number of files with recorded metadata, including files
    /// that produced no chunks.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1300
1301 pub fn status_label(&self) -> &'static str {
1303 if self.entries.is_empty() {
1304 "empty"
1305 } else {
1306 "ready"
1307 }
1308 }
1309
1310 fn collect_chunks(
1311 project_root: &Path,
1312 files: &[PathBuf],
1313 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1314 let collect_started = std::time::Instant::now();
1315 let per_file: Vec<(
1316 PathBuf,
1317 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1318 )> = files
1319 .par_iter()
1320 .map_init(HashMap::new, |parsers, file| {
1321 let result = collect_file_metadata(file).and_then(|metadata| {
1322 collect_file_chunks(project_root, file, parsers)
1323 .map(|chunks| (metadata, chunks))
1324 });
1325 (file.clone(), result)
1326 })
1327 .collect();
1328
1329 let mut chunks: Vec<SemanticChunk> = Vec::new();
1330 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1331
1332 for (file, result) in per_file {
1333 match result {
1334 Ok((metadata, file_chunks)) => {
1335 file_metadata.insert(file, metadata);
1336 chunks.extend(file_chunks);
1337 }
1338 Err(error) => {
1339 if error == "unsupported file extension" {
1345 continue;
1346 }
1347 slog_warn!(
1348 "failed to collect semantic chunks for {}: {}",
1349 file.display(),
1350 error
1351 );
1352 }
1353 }
1354 }
1355
1356 slog_info!(
1357 "semantic collect: {} chunks from {} files in {} ms",
1358 chunks.len(),
1359 file_metadata.len(),
1360 collect_started.elapsed().as_millis()
1361 );
1362
1363 (chunks, file_metadata)
1364 }
1365
1366 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1367 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1368 let mut reuse_map: ChunkReuseMap = HashMap::new();
1369
1370 for entry in &self.entries {
1371 if !requested.contains(entry.chunk.file.as_path()) {
1372 continue;
1373 }
1374
1375 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1380 reuse_map
1381 .entry(entry.chunk.file.clone())
1382 .or_default()
1383 .entry(hash)
1384 .or_default()
1385 .push(ReusableEmbedding {
1386 embed_text: entry.chunk.embed_text.clone(),
1387 vector: entry.vector.clone(),
1388 });
1389 }
1390
1391 reuse_map
1392 }
1393
1394 fn reusable_vector_for_chunk(
1395 reuse_map: &ChunkReuseMap,
1396 chunk: &SemanticChunk,
1397 ) -> Option<Vec<f32>> {
1398 let hash = blake3::hash(chunk.embed_text.as_bytes());
1399 reuse_map
1400 .get(&chunk.file)?
1401 .get(&hash)?
1402 .iter()
1403 .find(|candidate| candidate.embed_text == chunk.embed_text)
1404 .map(|candidate| candidate.vector.clone())
1405 }
1406
1407 fn entries_for_chunks_with_reuse<F, P>(
1408 chunks: Vec<SemanticChunk>,
1409 reuse_map: &ChunkReuseMap,
1410 embed_fn: &mut F,
1411 max_batch_size: usize,
1412 initial_observed_dimension: Option<usize>,
1413 refresh_label: &str,
1414 progress: &mut P,
1415 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1416 where
1417 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1418 P: FnMut(usize, usize),
1419 {
1420 let total_chunks = chunks.len();
1421 progress(0, total_chunks);
1422
1423 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1424 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1425
1426 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1427 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1428 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1429 } else {
1430 misses.push((chunk_index, chunk));
1431 }
1432 }
1433
1434 let mut completed = total_chunks.saturating_sub(misses.len());
1435 if completed > 0 {
1436 progress(completed, total_chunks);
1437 }
1438
1439 let batch_size = max_batch_size.max(1);
1440 let mut observed_dimension = initial_observed_dimension;
1441
1442 for batch_start in (0..misses.len()).step_by(batch_size) {
1443 let batch_end = (batch_start + batch_size).min(misses.len());
1444 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1445 .iter()
1446 .map(|(_, chunk)| chunk.embed_text.clone())
1447 .collect();
1448
1449 let vectors = embed_fn(batch_texts)?;
1450 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1451
1452 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1453 match observed_dimension {
1454 None => observed_dimension = Some(dim),
1455 Some(expected) if dim != expected => {
1456 return Err(format!(
1457 "embedding dimension changed during {refresh_label}: \
1458 cached index uses {expected}, new vectors use {dim}"
1459 ));
1460 }
1461 _ => {}
1462 }
1463 }
1464
1465 for (i, vector) in vectors.into_iter().enumerate() {
1466 let (chunk_index, chunk) = misses[batch_start + i].clone();
1467 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1468 }
1469
1470 completed += batch_end - batch_start;
1471 progress(completed, total_chunks);
1472 }
1473
1474 let entries = entries_by_chunk
1475 .into_iter()
1476 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1477 .collect();
1478
1479 Ok((entries, observed_dimension))
1480 }
1481
1482 fn build_from_chunks<F, P>(
1483 project_root: &Path,
1484 chunks: Vec<SemanticChunk>,
1485 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1486 embed_fn: &mut F,
1487 max_batch_size: usize,
1488 mut progress: Option<&mut P>,
1489 ) -> Result<Self, String>
1490 where
1491 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1492 P: FnMut(usize, usize),
1493 {
1494 debug_assert!(project_root.is_absolute());
1495 let total_chunks = chunks.len();
1496
1497 if chunks.is_empty() {
1498 return Ok(Self {
1499 entries: Vec::new(),
1500 file_mtimes: file_metadata
1501 .iter()
1502 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1503 .collect(),
1504 file_sizes: file_metadata
1505 .iter()
1506 .map(|(path, metadata)| (path.clone(), metadata.size))
1507 .collect(),
1508 file_hashes: file_metadata
1509 .into_iter()
1510 .map(|(path, metadata)| (path, metadata.content_hash))
1511 .collect(),
1512 dimension: DEFAULT_DIMENSION,
1513 fingerprint: None,
1514 project_root: project_root.to_path_buf(),
1515 deferred_files: HashSet::new(),
1516 });
1517 }
1518
1519 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1521 let mut expected_dimension: Option<usize> = None;
1522 let batch_size = max_batch_size.max(1);
1523 let embed_started = std::time::Instant::now();
1524 let batch_count = total_chunks.div_ceil(batch_size);
1525 for batch_start in (0..chunks.len()).step_by(batch_size) {
1526 let batch_end = (batch_start + batch_size).min(chunks.len());
1527 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1528 .iter()
1529 .map(|c| c.embed_text.clone())
1530 .collect();
1531
1532 let vectors = embed_fn(batch_texts)?;
1533 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1534
1535 if let Some(dim) = vectors.first().map(|v| v.len()) {
1537 match expected_dimension {
1538 None => expected_dimension = Some(dim),
1539 Some(expected) if dim != expected => {
1540 return Err(format!(
1541 "embedding dimension changed across batches: expected {expected}, got {dim}"
1542 ));
1543 }
1544 _ => {}
1545 }
1546 }
1547
1548 for (i, vector) in vectors.into_iter().enumerate() {
1549 let chunk_idx = batch_start + i;
1550 entries.push(EmbeddingEntry {
1551 chunk: chunks[chunk_idx].clone(),
1552 vector,
1553 });
1554 }
1555
1556 if let Some(callback) = progress.as_mut() {
1557 callback(entries.len(), total_chunks);
1558 }
1559 }
1560
1561 let embed_ms = embed_started.elapsed().as_millis();
1562 let rate = (total_chunks as u128 * 1000)
1563 .checked_div(embed_ms)
1564 .unwrap_or(0) as u64;
1565 slog_info!(
1566 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1567 total_chunks,
1568 batch_count,
1569 embed_ms,
1570 rate
1571 );
1572
1573 let dimension = entries
1574 .first()
1575 .map(|e| e.vector.len())
1576 .unwrap_or(DEFAULT_DIMENSION);
1577
1578 Ok(Self {
1579 entries,
1580 file_mtimes: file_metadata
1581 .iter()
1582 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1583 .collect(),
1584 file_sizes: file_metadata
1585 .iter()
1586 .map(|(path, metadata)| (path.clone(), metadata.size))
1587 .collect(),
1588 file_hashes: file_metadata
1589 .into_iter()
1590 .map(|(path, metadata)| (path, metadata.content_hash))
1591 .collect(),
1592 dimension,
1593 fingerprint: None,
1594 project_root: project_root.to_path_buf(),
1595 deferred_files: HashSet::new(),
1596 })
1597 }
1598
1599 pub fn build<F>(
1602 project_root: &Path,
1603 files: &[PathBuf],
1604 embed_fn: &mut F,
1605 max_batch_size: usize,
1606 ) -> Result<Self, String>
1607 where
1608 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1609 {
1610 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1611 Self::build_from_chunks(
1612 project_root,
1613 chunks,
1614 file_mtimes,
1615 embed_fn,
1616 max_batch_size,
1617 Option::<&mut fn(usize, usize)>::None,
1618 )
1619 }
1620
1621 pub fn build_with_progress<F, P>(
1623 project_root: &Path,
1624 files: &[PathBuf],
1625 embed_fn: &mut F,
1626 max_batch_size: usize,
1627 progress: &mut P,
1628 ) -> Result<Self, String>
1629 where
1630 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631 P: FnMut(usize, usize),
1632 {
1633 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634 let total_chunks = chunks.len();
1635 progress(0, total_chunks);
1636 Self::build_from_chunks(
1637 project_root,
1638 chunks,
1639 file_mtimes,
1640 embed_fn,
1641 max_batch_size,
1642 Some(progress),
1643 )
1644 }
1645
    /// Brings the index in line with `current_files` by re-embedding only what changed.
    ///
    /// Indexed files missing from `current_files` are removed, indexed files
    /// whose freshness check reports `Stale`/`Deleted` (or whose metadata is
    /// incomplete) are re-collected, and files not yet indexed are added.
    /// Vectors of changed files are reused when a chunk's embed text is
    /// unchanged. `progress` is called with `(completed, total)` chunk counts,
    /// or `(0, 0)` when nothing needs embedding.
    ///
    /// Returns counters for the pass; `total_processed` is the size of the
    /// union of `current_files` and the previously indexed files.
    ///
    /// # Errors
    ///
    /// Propagates errors from `entries_for_chunks_with_reuse` (embedding
    /// failure or dimension mismatch). Deletions performed before the
    /// embedding step are not rolled back in that case.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Forget deferred paths that are no longer part of the project listing.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // |current ∪ indexed| = |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Per-file outcome of the freshness pass over already indexed files.
        enum IndexedFileCheck {
            Deleted(PathBuf),
            MissingMetadata(PathBuf),
            Verified(PathBuf, FreshnessVerdict),
        }

        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
        let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();

        for indexed_path in indexed_paths {
            let check_index = checks.len();
            if !current_set.contains(indexed_path.as_path()) {
                checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
                continue;
            }
            let cached = match (
                self.file_mtimes.get(&indexed_path),
                self.file_sizes.get(&indexed_path),
                self.file_hashes.get(&indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            if let Some(freshness) = cached {
                // Leave a placeholder; the batched verification below fills it in.
                strict_verify_inputs.push((check_index, indexed_path, freshness));
                checks.push(None);
            } else {
                checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
            }
        }

        for (check_index, path, verdict) in
            cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
        {
            checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
        }

        for check in checks {
            match check.expect("strict freshness check should be populated") {
                IndexedFileCheck::Deleted(path) => deleted.push(path),
                IndexedFileCheck::MissingMetadata(path) => changed.push(path),
                IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
                IndexedFileCheck::Verified(
                    path,
                    FreshnessVerdict::ContentFresh {
                        new_mtime,
                        new_size,
                    },
                ) => {
                    // No re-embedding: only the cached mtime and size are refreshed.
                    self.file_mtimes.insert(path.clone(), new_mtime);
                    self.file_sizes.insert(path, new_size);
                }
                IndexedFileCheck::Verified(
                    path,
                    FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
                ) => {
                    changed.push(path);
                }
            }
        }

        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // Only changed files can have vectors worth reusing; added files have none yet.
        let reuse_map = self.build_chunk_reuse_map(&changed);
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        // Changed files that failed collection and no longer exist on disk
        // are treated as deletions.
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        if chunks.is_empty() {
            // Nothing to embed, but successfully collected files still need their
            // old entries dropped and their metadata refreshed.
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // With no entries left there is no dimension to stay compatible with.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            existing_dimension,
            "incremental refresh",
            progress,
        )?;

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        // Old entries are replaced only for files that were collected successfully;
        // files that failed collection keep their previous entries.
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1888
    /// Re-indexes an explicit set of invalidated `paths`, plus any deferred files.
    ///
    /// The requested files are first removed from the index, then the ones that
    /// still exist are re-collected and embedded (reusing vectors for unchanged
    /// chunk text). Files that were not indexed before are only admitted while
    /// the number of indexed files stays within `max_files`; the rest are
    /// recorded in `deferred_files` and retried on a later call. `progress` is
    /// called with `(completed, total)` chunk counts, or `(0, 0)` when nothing
    /// needs embedding.
    ///
    /// Returns the new entries, the refreshed metadata, the handled paths, and
    /// summary counters.
    ///
    /// # Errors
    ///
    /// Propagates errors from `entries_for_chunks_with_reuse`. The requested
    /// files have already been removed from the index at that point and are
    /// not restored.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Deferred files that disappeared in the meantime are not worth retrying.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Both snapshots must be taken before the removal below wipes the old state.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();
        let reuse_map = self.build_chunk_reuse_map(&requested_paths);

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        // Only files that were indexed before count as deletions.
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the indexed-file cap: files still indexed plus re-indexed files
        // are counted first, and only the remaining budget goes to new files.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // `existing_paths` is sorted, so the first `available_new_files` new
            // files in path order are admitted and the rest are deferred.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        if chunks.is_empty() {
            // Nothing to embed; still record metadata for files that produced no chunks.
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // The current dimension is enforced unless the index is empty and none of
        // the requested files had been indexed before.
        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
        {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            initial_observed_dimension,
            "invalidated-file refresh",
            progress,
        )?;

        // One copy goes into this index, the other is returned to the caller.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
2099
2100 pub fn apply_refresh_update(
2101 &mut self,
2102 added_entries: Vec<EmbeddingEntry>,
2103 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2104 completed_paths: &[PathBuf],
2105 ) {
2106 self.remove_indexed_files(completed_paths);
2110
2111 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2112 self.entries.extend(added_entries);
2113 for (file, freshness) in updated_metadata {
2114 self.file_mtimes.insert(file.clone(), freshness.mtime);
2115 self.file_sizes.insert(file.clone(), freshness.size);
2116 self.file_hashes.insert(file, freshness.content_hash);
2117 }
2118 if let Some(dim) = observed_dimension {
2119 self.dimension = dim;
2120 }
2121 }
2122
2123 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2124 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2125 self.entries
2126 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2127 for path in files {
2128 self.file_mtimes.remove(path);
2129 self.file_sizes.remove(path);
2130 self.file_hashes.remove(path);
2131 }
2132 }
2133
2134 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2136 if self.entries.is_empty() || query_vector.len() != self.dimension {
2137 return Vec::new();
2138 }
2139
2140 let mut scored: Vec<(f32, usize)> = self
2141 .entries
2142 .iter()
2143 .enumerate()
2144 .map(|(i, entry)| {
2145 let mut score = cosine_similarity(query_vector, &entry.vector);
2146 if entry.chunk.exported {
2147 score *= 1.1;
2148 }
2149 (score, i)
2150 })
2151 .collect();
2152
2153 let keep = top_k.min(scored.len());
2154 if keep == 0 {
2155 return Vec::new();
2156 }
2157
2158 if keep < scored.len() {
2159 scored.select_nth_unstable_by(keep, semantic_score_order);
2160 scored.truncate(keep);
2161 }
2162 scored.sort_by(semantic_score_order);
2163
2164 scored
2165 .into_iter()
2166 .map(|(score, idx)| {
2170 let entry = &self.entries[idx];
2171 SemanticResult {
2172 file: entry.chunk.file.clone(),
2173 name: entry.chunk.name.clone(),
2174 kind: entry.chunk.kind.clone(),
2175 start_line: entry.chunk.start_line,
2176 end_line: entry.chunk.end_line,
2177 exported: entry.chunk.exported,
2178 snippet: entry.chunk.snippet.clone(),
2179 score,
2180 source: "semantic",
2181 }
2182 })
2183 .collect()
2184 }
2185
    /// Returns the number of embedded chunks; same value as `entry_count`.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
2190
2191 pub fn is_file_stale(&self, file: &Path) -> bool {
2193 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2194 return true;
2195 };
2196 let Some(stored_size) = self.file_sizes.get(file) else {
2197 return true;
2198 };
2199 let Some(stored_hash) = self.file_hashes.get(file) else {
2200 return true;
2201 };
2202 let cached = FileFreshness {
2203 mtime: *stored_mtime,
2204 size: *stored_size,
2205 content_hash: *stored_hash,
2206 };
2207 match cache_freshness::verify_file_strict(file, &cached) {
2208 FreshnessVerdict::HotFresh => false,
2209 FreshnessVerdict::ContentFresh { .. } => false,
2210 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2211 }
2212 }
2213
2214 fn backfill_missing_file_sizes(&mut self) {
2215 for path in self.file_mtimes.keys() {
2216 if self.file_sizes.contains_key(path) {
2217 continue;
2218 }
2219 if let Ok(metadata) = fs::metadata(path) {
2220 self.file_sizes.insert(path.clone(), metadata.len());
2221 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2222 self.file_hashes.insert(path.clone(), hash);
2223 }
2224 }
2225 }
2226 }
2227
    /// Removes `file` from the index; delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
2232
2233 pub fn invalidate_file(&mut self, file: &Path) {
2234 let canonical_file = canonicalize_existing_or_deleted_path(file);
2235 self.entries
2236 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2237 self.file_mtimes.remove(file);
2238 self.file_sizes.remove(file);
2239 self.file_hashes.remove(file);
2240 if canonical_file.as_path() != file {
2241 self.file_mtimes.remove(&canonical_file);
2242 self.file_sizes.remove(&canonical_file);
2243 self.file_hashes.remove(&canonical_file);
2244 }
2245 }
2246
    /// Returns the vector length this index expects from queries and entries.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2251
    /// Returns the fingerprint attached to this index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2255
2256 pub fn backend_label(&self) -> Option<&str> {
2257 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2258 }
2259
2260 pub fn model_label(&self) -> Option<&str> {
2261 self.fingerprint.as_ref().map(|f| f.model.as_str())
2262 }
2263
    /// Attaches `fingerprint` to this index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2267
2268 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2270 if self.entries.is_empty() {
2273 slog_info!("skipping semantic index persistence (0 entries)");
2274 return;
2275 }
2276 let dir = storage_dir.join("semantic").join(project_key);
2277 if let Err(e) = fs::create_dir_all(&dir) {
2278 slog_warn!("failed to create semantic cache dir: {}", e);
2279 return;
2280 }
2281 let data_path = dir.join("semantic.bin");
2282 let tmp_path = dir.join(format!(
2283 "semantic.bin.tmp.{}.{}",
2284 std::process::id(),
2285 SystemTime::now()
2286 .duration_since(SystemTime::UNIX_EPOCH)
2287 .unwrap_or(Duration::ZERO)
2288 .as_nanos()
2289 ));
2290 let write_result = (|| -> io::Result<usize> {
2291 let file = fs::File::create(&tmp_path)?;
2292 let mut writer = BufWriter::new(file);
2293 let bytes_written = self.write_to_writer(&mut writer)?;
2294 writer.flush()?;
2295 writer.get_ref().sync_all()?;
2296 Ok(bytes_written)
2297 })();
2298 let bytes_written = match write_result {
2299 Ok(bytes_written) => bytes_written,
2300 Err(e) => {
2301 slog_warn!("failed to write semantic index: {}", e);
2302 let _ = fs::remove_file(&tmp_path);
2303 return;
2304 }
2305 };
2306 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2307 slog_warn!("failed to rename semantic index: {}", e);
2308 let _ = fs::remove_file(&tmp_path);
2309 return;
2310 }
2311 slog_info!(
2312 "semantic index persisted: {} entries, {:.1} KB",
2313 self.entries.len(),
2314 bytes_written as f64 / 1024.0
2315 );
2316 }
2317
2318 pub fn read_from_disk(
2320 storage_dir: &Path,
2321 project_key: &str,
2322 current_canonical_root: &Path,
2323 is_worktree_bridge: bool,
2324 expected_fingerprint: Option<&str>,
2325 ) -> Option<Self> {
2326 debug_assert!(current_canonical_root.is_absolute());
2327 let data_path = storage_dir
2328 .join("semantic")
2329 .join(project_key)
2330 .join("semantic.bin");
2331 let file = fs::File::open(&data_path).ok()?;
2332 let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2333 if file_len < HEADER_BYTES_V1 {
2334 slog_warn!(
2335 "corrupt semantic index (too small: {} bytes), removing",
2336 file_len
2337 );
2338 if !is_worktree_bridge {
2339 let _ = fs::remove_file(&data_path);
2340 }
2341 return None;
2342 }
2343
2344 let mut reader = BufReader::new(file);
2345 let mut version_buf = [0u8; 1];
2346 reader.read_exact(&mut version_buf).ok()?;
2347 let version = version_buf[0];
2348 if version != SEMANTIC_INDEX_VERSION_V6 {
2349 slog_info!(
2350 "cached semantic index version {} is older than {}, rebuilding",
2351 version,
2352 SEMANTIC_INDEX_VERSION_V6
2353 );
2354 if !is_worktree_bridge {
2355 let _ = fs::remove_file(&data_path);
2356 }
2357 return None;
2358 }
2359 match Self::from_reader_after_version(
2360 reader,
2361 version,
2362 current_canonical_root,
2363 Some(file_len),
2364 1,
2365 ) {
2366 Ok(index) => {
2367 if index.entries.is_empty() {
2368 slog_info!("cached semantic index is empty, will rebuild");
2369 if !is_worktree_bridge {
2370 let _ = fs::remove_file(&data_path);
2371 }
2372 return None;
2373 }
2374 if let Some(expected) = expected_fingerprint {
2375 let matches = index
2376 .fingerprint()
2377 .map(|fingerprint| fingerprint.matches_expected(expected))
2378 .unwrap_or(false);
2379 if !matches {
2380 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2381 if !is_worktree_bridge {
2382 let _ = fs::remove_file(&data_path);
2383 }
2384 return None;
2385 }
2386 }
2387 slog_info!(
2388 "loaded semantic index from disk: {} entries",
2389 index.entries.len()
2390 );
2391 Some(index)
2392 }
2393 Err(e) => {
2394 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2395 if !is_worktree_bridge {
2396 let _ = fs::remove_file(&data_path);
2397 }
2398 None
2399 }
2400 }
2401 }
2402
2403 pub fn to_bytes(&self) -> Vec<u8> {
2405 let mut buf = Vec::new();
2406 self.write_to_writer(&mut buf)
2407 .expect("writing semantic index to Vec cannot fail");
2408 buf
2409 }
2410
2411 fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2412 let mut bytes_written = 0usize;
2413 let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2414 let encoded = fingerprint.as_string();
2415 if encoded.is_empty() {
2416 None
2417 } else {
2418 Some(encoded)
2419 }
2420 });
2421 let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2422 let file_mtime_count = self
2423 .file_mtimes
2424 .iter()
2425 .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2426 .count();
2427 let entry_count = self
2428 .entries
2429 .iter()
2430 .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2431 .count();
2432
2433 let version = SEMANTIC_INDEX_VERSION_V6;
2446 write_counted(writer, &[version], &mut bytes_written)?;
2447 write_counted(
2448 writer,
2449 &(self.dimension as u32).to_le_bytes(),
2450 &mut bytes_written,
2451 )?;
2452 write_counted(
2453 writer,
2454 &(entry_count as u32).to_le_bytes(),
2455 &mut bytes_written,
2456 )?;
2457 write_counted(
2458 writer,
2459 &(fp_bytes_ref.len() as u32).to_le_bytes(),
2460 &mut bytes_written,
2461 )?;
2462 write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2463
2464 write_counted(
2467 writer,
2468 &(file_mtime_count as u32).to_le_bytes(),
2469 &mut bytes_written,
2470 )?;
2471 for (path, mtime) in &self.file_mtimes {
2472 let Some(relative) = cache_relative_path(&self.project_root, path) else {
2473 continue;
2474 };
2475 let relative = relative.to_string_lossy();
2476 let path_bytes = relative.as_bytes();
2477 write_counted(
2478 writer,
2479 &(path_bytes.len() as u32).to_le_bytes(),
2480 &mut bytes_written,
2481 )?;
2482 write_counted(writer, path_bytes, &mut bytes_written)?;
2483 let duration = mtime
2484 .duration_since(SystemTime::UNIX_EPOCH)
2485 .unwrap_or_default();
2486 write_counted(
2487 writer,
2488 &duration.as_secs().to_le_bytes(),
2489 &mut bytes_written,
2490 )?;
2491 write_counted(
2492 writer,
2493 &duration.subsec_nanos().to_le_bytes(),
2494 &mut bytes_written,
2495 )?;
2496 let size = self.file_sizes.get(path).copied().unwrap_or_default();
2497 write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2498 let hash = self
2499 .file_hashes
2500 .get(path)
2501 .copied()
2502 .unwrap_or_else(cache_freshness::zero_hash);
2503 write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2504 }
2505
2506 for entry in &self.entries {
2508 let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2509 continue;
2510 };
2511 let c = &entry.chunk;
2512
2513 let relative = relative.to_string_lossy();
2515 let file_bytes = relative.as_bytes();
2516 write_counted(
2517 writer,
2518 &(file_bytes.len() as u32).to_le_bytes(),
2519 &mut bytes_written,
2520 )?;
2521 write_counted(writer, file_bytes, &mut bytes_written)?;
2522
2523 let name_bytes = c.name.as_bytes();
2525 write_counted(
2526 writer,
2527 &(name_bytes.len() as u32).to_le_bytes(),
2528 &mut bytes_written,
2529 )?;
2530 write_counted(writer, name_bytes, &mut bytes_written)?;
2531
2532 write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2534
2535 write_counted(
2537 writer,
2538 &(c.start_line as u32).to_le_bytes(),
2539 &mut bytes_written,
2540 )?;
2541 write_counted(
2542 writer,
2543 &(c.end_line as u32).to_le_bytes(),
2544 &mut bytes_written,
2545 )?;
2546 write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2547
2548 let snippet_bytes = c.snippet.as_bytes();
2550 write_counted(
2551 writer,
2552 &(snippet_bytes.len() as u32).to_le_bytes(),
2553 &mut bytes_written,
2554 )?;
2555 write_counted(writer, snippet_bytes, &mut bytes_written)?;
2556
2557 let embed_bytes = c.embed_text.as_bytes();
2559 write_counted(
2560 writer,
2561 &(embed_bytes.len() as u32).to_le_bytes(),
2562 &mut bytes_written,
2563 )?;
2564 write_counted(writer, embed_bytes, &mut bytes_written)?;
2565
2566 for &val in &entry.vector {
2568 write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2569 }
2570 }
2571
2572 Ok(bytes_written)
2573 }
2574
2575 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2577 debug_assert!(current_canonical_root.is_absolute());
2578 if data.len() < HEADER_BYTES_V1 {
2579 return Err("data too short".to_string());
2580 }
2581
2582 Self::from_reader_after_version(
2583 Cursor::new(&data[1..]),
2584 data[0],
2585 current_canonical_root,
2586 Some(data.len()),
2587 1,
2588 )
2589 }
2590
2591 fn from_reader_after_version<R: Read>(
2592 reader: R,
2593 version: u8,
2594 current_canonical_root: &Path,
2595 total_len: Option<usize>,
2596 bytes_read: usize,
2597 ) -> Result<Self, String> {
2598 debug_assert!(current_canonical_root.is_absolute());
2599 let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2600
2601 if version != SEMANTIC_INDEX_VERSION_V1
2602 && version != SEMANTIC_INDEX_VERSION_V2
2603 && version != SEMANTIC_INDEX_VERSION_V3
2604 && version != SEMANTIC_INDEX_VERSION_V4
2605 && version != SEMANTIC_INDEX_VERSION_V5
2606 && version != SEMANTIC_INDEX_VERSION_V6
2607 {
2608 return Err(format!("unsupported version: {}", version));
2609 }
2610 if (version == SEMANTIC_INDEX_VERSION_V2
2614 || version == SEMANTIC_INDEX_VERSION_V3
2615 || version == SEMANTIC_INDEX_VERSION_V4
2616 || version == SEMANTIC_INDEX_VERSION_V5
2617 || version == SEMANTIC_INDEX_VERSION_V6)
2618 && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2619 {
2620 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2621 }
2622
2623 let dimension = read_u32_stream(&mut reader)? as usize;
2624 let entry_count = read_u32_stream(&mut reader)? as usize;
2625 validate_embedding_dimension(dimension)?;
2626 if entry_count > MAX_ENTRIES {
2627 return Err(format!("too many semantic index entries: {}", entry_count));
2628 }
2629
2630 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2636 || version == SEMANTIC_INDEX_VERSION_V3
2637 || version == SEMANTIC_INDEX_VERSION_V4
2638 || version == SEMANTIC_INDEX_VERSION_V5
2639 || version == SEMANTIC_INDEX_VERSION_V6;
2640 let fingerprint = if has_fingerprint_field {
2641 let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2642 if total_len
2643 .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2644 {
2645 return Err("unexpected end of data reading fingerprint".to_string());
2646 }
2647 if fingerprint_len == 0 {
2648 None
2649 } else {
2650 let mut raw = vec![0u8; fingerprint_len];
2651 read_exact_stream(
2652 &mut reader,
2653 &mut raw,
2654 "unexpected end of data reading fingerprint",
2655 )?;
2656 let raw = String::from_utf8_lossy(&raw).to_string();
2657 Some(
2658 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2659 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2660 )
2661 }
2662 } else {
2663 None
2664 };
2665
2666 let mtime_count = read_u32_stream(&mut reader)? as usize;
2668 if mtime_count > MAX_ENTRIES {
2669 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2670 }
2671
2672 let vector_bytes = entry_count
2673 .checked_mul(dimension)
2674 .and_then(|count| count.checked_mul(F32_BYTES))
2675 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2676 if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2677 return Err("semantic index vectors exceed available data".to_string());
2678 }
2679
2680 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2681 let mut file_sizes = HashMap::with_capacity(mtime_count);
2682 let mut file_hashes = HashMap::with_capacity(mtime_count);
2683 for _ in 0..mtime_count {
2684 let path = read_string_stream(&mut reader, total_len)?;
2685 let secs = read_u64_stream(&mut reader)?;
2686 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2692 || version == SEMANTIC_INDEX_VERSION_V4
2693 || version == SEMANTIC_INDEX_VERSION_V5
2694 || version == SEMANTIC_INDEX_VERSION_V6
2695 {
2696 read_u32_stream(&mut reader)?
2697 } else {
2698 0
2699 };
2700 let size =
2701 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2702 read_u64_stream(&mut reader)?
2703 } else {
2704 0
2705 };
2706 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2707 let mut hash_bytes = [0u8; 32];
2708 read_exact_stream(
2709 &mut reader,
2710 &mut hash_bytes,
2711 "unexpected end of data reading content hash",
2712 )?;
2713 blake3::Hash::from_bytes(hash_bytes)
2714 } else {
2715 cache_freshness::zero_hash()
2716 };
2717 if nanos >= 1_000_000_000 {
2724 return Err(format!(
2725 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2726 nanos
2727 ));
2728 }
2729 let duration = std::time::Duration::new(secs, nanos);
2730 let mtime = SystemTime::UNIX_EPOCH
2731 .checked_add(duration)
2732 .ok_or_else(|| {
2733 format!(
2734 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2735 secs, nanos
2736 )
2737 })?;
2738 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2739 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2740 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2741 } else {
2742 PathBuf::from(path)
2743 };
2744 file_mtimes.insert(path.clone(), mtime);
2745 file_sizes.insert(path.clone(), size);
2746 file_hashes.insert(path, content_hash);
2747 }
2748
2749 let mut entries = Vec::with_capacity(entry_count);
2751 for _ in 0..entry_count {
2752 let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2753 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2754 cached_path_under_root(current_canonical_root, &raw_file)
2755 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2756 } else {
2757 raw_file
2758 };
2759 let name = read_string_stream(&mut reader, total_len)?;
2760
2761 let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2762
2763 let start_line = read_u32_stream(&mut reader)?;
2764 let end_line = read_u32_stream(&mut reader)?;
2765
2766 let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2767
2768 let snippet = read_string_stream(&mut reader, total_len)?;
2769 let embed_text = read_string_stream(&mut reader, total_len)?;
2770
2771 let vec_bytes = dimension
2773 .checked_mul(F32_BYTES)
2774 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2775 if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2776 return Err("unexpected end of data reading vector".to_string());
2777 }
2778 let mut vector = Vec::with_capacity(dimension);
2779 for _ in 0..dimension {
2780 let mut bytes = [0u8; F32_BYTES];
2781 read_exact_stream(
2782 &mut reader,
2783 &mut bytes,
2784 "unexpected end of data reading vector",
2785 )?;
2786 vector.push(f32::from_le_bytes(bytes));
2787 }
2788
2789 entries.push(EmbeddingEntry {
2790 chunk: SemanticChunk {
2791 file,
2792 name,
2793 kind,
2794 start_line,
2795 end_line,
2796 exported,
2797 embed_text,
2798 snippet,
2799 },
2800 vector,
2801 });
2802 }
2803
2804 if entries.len() != entry_count {
2805 return Err(format!(
2806 "semantic cache entry count drift: header={} decoded={}",
2807 entry_count,
2808 entries.len()
2809 ));
2810 }
2811 for entry in &entries {
2812 if !file_mtimes.contains_key(&entry.chunk.file) {
2813 return Err(format!(
2814 "semantic cache metadata missing for entry file {}",
2815 entry.chunk.file.display()
2816 ));
2817 }
2818 }
2819
2820 Ok(Self {
2821 entries,
2822 file_mtimes,
2823 file_sizes,
2824 file_hashes,
2825 dimension,
2826 fingerprint,
2827 project_root: current_canonical_root.to_path_buf(),
2828 deferred_files: HashSet::new(),
2829 })
2830 }
2831}
2832
/// Writes all of `bytes` to `writer` and, on success, adds their length to
/// `bytes_written` (saturating). The counter is left untouched on error.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2842
/// `Read` adapter that keeps a running total of the bytes it has returned.
struct CountingReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Bytes consumed so far, including any initial offset given at creation.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the counter at `bytes_read` to account for data
    /// the caller already consumed.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Current value of the byte counter.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forwards to the inner reader and adds the number of bytes it returned
    /// to the counter (saturating). Errors leave the counter unchanged.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let outcome = self.inner.read(buf);
        if let Ok(count) = &outcome {
            self.bytes_read = self.bytes_read.saturating_add(*count);
        }
        outcome
    }
}
2865
2866fn read_exact_stream<R: Read>(
2867 reader: &mut CountingReader<R>,
2868 buf: &mut [u8],
2869 eof_message: &'static str,
2870) -> Result<(), String> {
2871 reader.read_exact(buf).map_err(|error| {
2872 if error.kind() == io::ErrorKind::UnexpectedEof {
2873 eof_message.to_string()
2874 } else {
2875 format!("{eof_message}: {error}")
2876 }
2877 })
2878}
2879
2880fn read_u8_stream<R: Read>(
2881 reader: &mut CountingReader<R>,
2882 eof_message: &'static str,
2883) -> Result<u8, String> {
2884 let mut bytes = [0u8; 1];
2885 read_exact_stream(reader, &mut bytes, eof_message)?;
2886 Ok(bytes[0])
2887}
2888
2889fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2890 let mut bytes = [0u8; 4];
2891 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2892 Ok(u32::from_le_bytes(bytes))
2893}
2894
2895fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2896 let mut bytes = [0u8; 8];
2897 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2898 Ok(u64::from_le_bytes(bytes))
2899}
2900
2901fn read_string_stream<R: Read>(
2902 reader: &mut CountingReader<R>,
2903 total_len: Option<usize>,
2904) -> Result<String, String> {
2905 let len = read_u32_stream(reader)? as usize;
2906 if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2907 return Err("unexpected end of data reading string".to_string());
2908 }
2909 let mut bytes = vec![0u8; len];
2910 read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2911 Ok(String::from_utf8_lossy(&bytes).to_string())
2912}
2913
2914fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2916 let relative = file
2917 .strip_prefix(project_root)
2918 .unwrap_or(file)
2919 .to_string_lossy();
2920
2921 let kind_label = match &symbol.kind {
2922 SymbolKind::Function => "function",
2923 SymbolKind::Class => "class",
2924 SymbolKind::Method => "method",
2925 SymbolKind::Struct => "struct",
2926 SymbolKind::Interface => "interface",
2927 SymbolKind::Enum => "enum",
2928 SymbolKind::TypeAlias => "type",
2929 SymbolKind::Variable => "variable",
2930 SymbolKind::Heading => "heading",
2931 SymbolKind::FileSummary => "file-summary",
2932 };
2933
2934 let name = &symbol.name;
2936 let mut text = format!(
2937 "name:{name} file:{} kind:{} name:{name}",
2938 relative, kind_label
2939 );
2940
2941 if let Some(sig) = &symbol.signature {
2942 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2950 }
2951
2952 let lines: Vec<&str> = source.lines().collect();
2954 let start = (symbol.range.start_line as usize).min(lines.len());
2955 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2957 if start < end {
2958 let body: String = lines[start..end]
2959 .iter()
2960 .take(15) .copied()
2962 .collect::<Vec<&str>>()
2963 .join("\n");
2964 let snippet = if body.len() > 300 {
2965 format!("{}...", &body[..body.floor_char_boundary(300)])
2966 } else {
2967 body
2968 };
2969 text.push_str(&format!(" body:{}", snippet));
2970 }
2971
2972 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2977}
2978
/// Upper bound, in characters, applied to generated embed text (see
/// `build_embed_text` and `build_file_summary_chunk`).
const MAX_EMBED_TEXT_CHARS: usize = 1_600;
2983
/// Returns the first `max_chars` characters of `value` (the whole string when
/// it is shorter). Counts Unicode scalar values, not bytes.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    let cut = value
        .char_indices()
        .nth(max_chars)
        .map_or(value.len(), |(offset, _)| offset);
    value[..cut].to_string()
}
2987
2988fn first_leading_doc_comment(source: &str) -> String {
2989 let lines: Vec<&str> = source.lines().collect();
2990 let Some((start, first)) = lines
2991 .iter()
2992 .enumerate()
2993 .find(|(_, line)| !line.trim().is_empty())
2994 else {
2995 return String::new();
2996 };
2997
2998 let trimmed = first.trim_start();
2999 if trimmed.starts_with("/**") {
3000 let mut comment = Vec::new();
3001 for line in lines.iter().skip(start) {
3002 comment.push(*line);
3003 if line.contains("*/") {
3004 break;
3005 }
3006 }
3007 return truncate_chars(&comment.join("\n"), 200);
3008 }
3009
3010 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3011 let comment = lines
3012 .iter()
3013 .skip(start)
3014 .take_while(|line| {
3015 let trimmed = line.trim_start();
3016 trimmed.starts_with("///") || trimmed.starts_with("//!")
3017 })
3018 .copied()
3019 .collect::<Vec<_>>()
3020 .join("\n");
3021 return truncate_chars(&comment, 200);
3022 }
3023
3024 String::new()
3025}
3026
3027pub fn build_file_summary_chunk(
3028 file: &Path,
3029 project_root: &Path,
3030 source: &str,
3031 top_exports: &[&str],
3032 top_export_signatures: &[Option<&str>],
3033) -> SemanticChunk {
3034 let relative = file.strip_prefix(project_root).unwrap_or(file);
3035 let rel_path = relative.to_string_lossy();
3036 let parent_dir = relative
3037 .parent()
3038 .map(|parent| parent.to_string_lossy().to_string())
3039 .unwrap_or_default();
3040 let name = file
3041 .file_stem()
3042 .map(|stem| stem.to_string_lossy().to_string())
3043 .unwrap_or_default();
3044 let doc = first_leading_doc_comment(source);
3045 let exports = top_exports
3046 .iter()
3047 .take(5)
3048 .copied()
3049 .collect::<Vec<_>>()
3050 .join(",");
3051 let snippet = if doc.is_empty() {
3052 top_export_signatures
3053 .first()
3054 .and_then(|signature| signature.as_deref())
3055 .map(|signature| truncate_chars(signature, 200))
3056 .unwrap_or_default()
3057 } else {
3058 doc.clone()
3059 };
3060
3061 SemanticChunk {
3062 file: file.to_path_buf(),
3063 name,
3064 kind: SymbolKind::FileSummary,
3065 start_line: 0,
3066 end_line: 0,
3067 exported: false,
3068 embed_text: truncate_chars(
3069 &format!(
3070 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3071 file.file_stem()
3072 .map(|stem| stem.to_string_lossy().to_string())
3073 .unwrap_or_default()
3074 ),
3075 MAX_EMBED_TEXT_CHARS,
3076 ),
3077 snippet,
3078 }
3079}
3080
3081fn parser_for(
3082 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3083 lang: crate::parser::LangId,
3084) -> Result<&mut Parser, String> {
3085 use std::collections::hash_map::Entry;
3086
3087 match parsers.entry(lang) {
3088 Entry::Occupied(entry) => Ok(entry.into_mut()),
3089 Entry::Vacant(entry) => {
3090 let grammar = grammar_for(lang);
3091 let mut parser = Parser::new();
3092 parser
3093 .set_language(&grammar)
3094 .map_err(|error| error.to_string())?;
3095 Ok(entry.insert(parser))
3096 }
3097 }
3098}
3099
/// Reports whether `path` has one of the file extensions that are eligible
/// for semantic indexing.
///
/// The comparison is case-sensitive; paths without an extension, or with a
/// non-UTF-8 extension, are not eligible.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3137
3138fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
3139 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3140 let mtime = metadata.modified().map_err(|error| error.to_string())?;
3141 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
3142 .map_err(|error| error.to_string())?
3143 .unwrap_or_else(cache_freshness::zero_hash);
3144 Ok(IndexedFileMetadata {
3145 mtime,
3146 size: metadata.len(),
3147 content_hash,
3148 })
3149}
3150
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// If the path itself cannot be canonicalized, its parent directory is
/// canonicalized instead and the file name re-attached. When that also fails
/// (or the path has no parent or no file name), `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path)
        .ok()
        .or_else(|| {
            let parent = path.parent()?;
            let file_name = path.file_name()?;
            fs::canonicalize(parent)
                .ok()
                .map(|canonical_parent| canonical_parent.join(file_name))
        })
        .unwrap_or_else(|| path.to_path_buf())
}
3167
/// Size limit (4 MiB) above which `collect_file_chunks` produces no chunks
/// for a file.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * (1 << 20);
3178
3179fn collect_file_chunks(
3180 project_root: &Path,
3181 file: &Path,
3182 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3183) -> Result<Vec<SemanticChunk>, String> {
3184 if !is_semantic_indexed_extension(file) {
3185 return Err("unsupported file extension".to_string());
3186 }
3187 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3188 if std::fs::metadata(file).is_ok_and(|m| m.len() > MAX_SEMANTIC_FILE_BYTES) {
3191 return Ok(Vec::new());
3192 }
3193 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
3194 let tree = parser_for(parsers, lang)?
3195 .parse(&source, None)
3196 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3197 let symbols =
3198 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
3199
3200 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
3201}
3202
3203fn build_snippet(symbol: &Symbol, source: &str) -> String {
3205 let lines: Vec<&str> = source.lines().collect();
3206 let start = (symbol.range.start_line as usize).min(lines.len());
3207 let end = (symbol.range.end_line as usize + 1).min(lines.len());
3209 if start < end {
3210 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3211 let mut snippet = snippet_lines.join("\n");
3212 if end - start > 5 {
3213 snippet.push_str("\n ...");
3214 }
3215 if snippet.len() > 300 {
3216 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3217 }
3218 snippet
3219 } else {
3220 String::new()
3221 }
3222}
3223
3224fn symbols_to_chunks(
3226 file: &Path,
3227 symbols: &[Symbol],
3228 source: &str,
3229 project_root: &Path,
3230) -> Vec<SemanticChunk> {
3231 let mut chunks = Vec::new();
3232 let top_exports_with_signatures = symbols
3233 .iter()
3234 .filter(|symbol| {
3235 symbol.exported
3236 && symbol.parent.is_none()
3237 && !matches!(symbol.kind, SymbolKind::Heading)
3238 })
3239 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3240 .collect::<Vec<_>>();
3241
3242 let has_only_headings = !symbols.is_empty()
3243 && symbols
3244 .iter()
3245 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3246 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3247 let top_exports = top_exports_with_signatures
3248 .iter()
3249 .map(|(name, _)| *name)
3250 .collect::<Vec<_>>();
3251 let top_export_signatures = top_exports_with_signatures
3252 .iter()
3253 .map(|(_, signature)| *signature)
3254 .collect::<Vec<_>>();
3255 chunks.push(build_file_summary_chunk(
3256 file,
3257 project_root,
3258 source,
3259 &top_exports,
3260 &top_export_signatures,
3261 ));
3262 }
3263
3264 for symbol in symbols {
3265 if matches!(symbol.kind, SymbolKind::Heading) {
3270 continue;
3271 }
3272
3273 let line_count = symbol
3275 .range
3276 .end_line
3277 .saturating_sub(symbol.range.start_line)
3278 + 1;
3279 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3280 continue;
3281 }
3282
3283 let embed_text = build_embed_text(symbol, source, file, project_root);
3284 let snippet = build_snippet(symbol, source);
3285
3286 chunks.push(SemanticChunk {
3287 file: file.to_path_buf(),
3288 name: symbol.name.clone(),
3289 kind: symbol.kind.clone(),
3290 start_line: symbol.range.start_line,
3291 end_line: symbol.range.end_line,
3292 exported: symbol.exported,
3293 embed_text,
3294 snippet,
3295 });
3296
3297 }
3300
3301 chunks
3302}
3303
/// Orders `(score, index)` pairs by descending score, breaking ties by
/// ascending index.
///
/// NaN scores are ranked after every real score and compare equal to each
/// other (then fall back to the index tie-break). Treating NaN as equal to
/// everything would make the ordering non-transitive, which the standard
/// sort routines do not tolerate; ranking NaN last keeps this a total order
/// while leaving the result for non-NaN scores unchanged.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        // `a` has no usable score, so it sorts after `b`.
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Reversed operands give descending order; neither value is NaN here,
        // so `partial_cmp` always yields `Some`.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3309
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the slices differ in length or when either has zero
/// magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}
3333
3334fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3336 match kind {
3337 SymbolKind::Function => 0,
3338 SymbolKind::Class => 1,
3339 SymbolKind::Method => 2,
3340 SymbolKind::Struct => 3,
3341 SymbolKind::Interface => 4,
3342 SymbolKind::Enum => 5,
3343 SymbolKind::TypeAlias => 6,
3344 SymbolKind::Variable => 7,
3345 SymbolKind::Heading => 8,
3346 SymbolKind::FileSummary => 9,
3347 }
3348}
3349
3350fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3351 match v {
3352 0 => SymbolKind::Function,
3353 1 => SymbolKind::Class,
3354 2 => SymbolKind::Method,
3355 3 => SymbolKind::Struct,
3356 4 => SymbolKind::Interface,
3357 5 => SymbolKind::Enum,
3358 6 => SymbolKind::TypeAlias,
3359 7 => SymbolKind::Variable,
3360 8 => SymbolKind::Heading,
3361 9 => SymbolKind::FileSummary,
3362 _ => SymbolKind::Heading,
3363 }
3364}
3365
3366#[cfg(test)]
3367mod tests {
3368 use super::*;
3369 use crate::config::{SemanticBackend, SemanticBackendConfig};
3370 use crate::parser::FileParser;
3371 use std::io::{Read, Write};
3372 use std::net::TcpListener;
3373 use std::thread;
3374
3375 #[test]
3376 fn semantic_index_includes_php_inc_and_scss_extensions() {
3377 for file in ["partial.inc", "index.php", "styles.scss"] {
3378 assert!(
3379 is_semantic_indexed_extension(Path::new(file)),
3380 "{file} should be semantic-index eligible"
3381 );
3382 }
3383 }
3384
3385 #[test]
3386 fn transient_marker_round_trips_and_classifies() {
3387 let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3390 assert!(embedding_failure_is_transient(&marked));
3391 let clean = strip_transient_embedding_marker(&marked);
3392 assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3393 assert!(clean.starts_with("openai compatible request failed:"));
3394
3395 for permanent in [
3398 "openai compatible request failed (HTTP 401): Unauthorized",
3399 "embedding dimension mismatch: index has 384, model returned 768",
3400 "too many files (>20000) for semantic indexing (max 20000)",
3401 ] {
3402 assert!(
3403 !embedding_failure_is_transient(permanent),
3404 "{permanent:?} must not be transient"
3405 );
3406 assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3408 }
3409 }
3410
3411 #[test]
3412 fn send_error_transience_separates_connect_timeout_from_4xx() {
3413 assert!(is_retryable_embedding_status(
3415 reqwest::StatusCode::INTERNAL_SERVER_ERROR
3416 ));
3417 assert!(is_retryable_embedding_status(
3418 reqwest::StatusCode::TOO_MANY_REQUESTS
3419 ));
3420 assert!(!is_retryable_embedding_status(
3421 reqwest::StatusCode::UNAUTHORIZED
3422 ));
3423 assert!(!is_retryable_embedding_status(
3424 reqwest::StatusCode::BAD_REQUEST
3425 ));
3426 }
3427
3428 #[test]
3429 fn local_backend_model_loading_body_is_transient() {
3430 for body in [
3433 r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3434 r#"{"error":"model is loading, please wait"}"#,
3435 r#"{"error":"Model not loaded"}"#,
3436 "Loading model into memory",
3437 ] {
3438 assert!(
3439 embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3440 "{body:?} should be body-transient"
3441 );
3442 }
3443
3444 for body in [
3448 r#"{"error":"invalid api key"}"#,
3449 r#"{"error":"model 'foo' not found"}"#,
3450 "Bad Request: unknown field",
3451 "Bad Request: invalid loading model option",
3452 r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3453 ] {
3454 assert!(
3455 !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3456 "{body:?} must not be body-transient"
3457 );
3458 }
3459
3460 assert!(
3461 !embedding_response_body_is_transient(
3462 reqwest::StatusCode::UNAUTHORIZED,
3463 r#"{"error":"model is loading, please wait"}"#
3464 ),
3465 "permanent auth failures must not become transient because of body text"
3466 );
3467 }
3468
3469 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3470 where
3471 F: Fn(String, String, String) -> String + Send + 'static,
3472 {
3473 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3474 let addr = listener.local_addr().expect("local addr");
3475 let handle = thread::spawn(move || {
3476 let (mut stream, _) = listener.accept().expect("accept request");
3477 let mut buf = Vec::new();
3478 let mut chunk = [0u8; 4096];
3479 let mut header_end = None;
3480 let mut content_length = 0usize;
3481 loop {
3482 let n = stream.read(&mut chunk).expect("read request");
3483 if n == 0 {
3484 break;
3485 }
3486 buf.extend_from_slice(&chunk[..n]);
3487 if header_end.is_none() {
3488 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3489 header_end = Some(pos + 4);
3490 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3491 for line in headers.lines() {
3492 if let Some(value) = line.strip_prefix("Content-Length:") {
3493 content_length = value.trim().parse::<usize>().unwrap_or(0);
3494 }
3495 }
3496 }
3497 }
3498 if let Some(end) = header_end {
3499 if buf.len() >= end + content_length {
3500 break;
3501 }
3502 }
3503 }
3504
3505 let end = header_end.expect("header terminator");
3506 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3507 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3508 let mut lines = request.lines();
3509 let request_line = lines.next().expect("request line").to_string();
3510 let path = request_line
3511 .split_whitespace()
3512 .nth(1)
3513 .expect("request path")
3514 .to_string();
3515 let response_body = handler(request_line, path, body);
3516 let response = format!(
3517 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3518 response_body.len(),
3519 response_body
3520 );
3521 stream
3522 .write_all(response.as_bytes())
3523 .expect("write response");
3524 });
3525
3526 (format!("http://{}", addr), handle)
3527 }
3528
    /// Spawns a loopback HTTP server that answers each request with a truncated body.
    ///
    /// The server binds an OS-assigned port on 127.0.0.1 and accepts connections until
    /// either `attempts` connections have been served or a two-second deadline passes.
    /// Every accepted connection receives a `200 OK` response whose `Content-Length`
    /// header announces 128 bytes while only a single `{` byte of body is written.
    ///
    /// Returns the server's base URL and the join handle of the serving thread.
    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
        // Non-blocking accept lets the loop below poll and still honor its deadline.
        listener
            .set_nonblocking(true)
            .expect("nonblocking listener");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            let deadline = std::time::Instant::now() + Duration::from_secs(2);
            let mut accepted = 0usize;
            while accepted < attempts && std::time::Instant::now() < deadline {
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        accepted += 1;
                        // Best-effort read of the request; its outcome is deliberately ignored.
                        let mut buf = [0u8; 4096];
                        let _ = stream.read(&mut buf);
                        // The header promises 128 body bytes but only one byte follows, so the
                        // stream ends (when `stream` is dropped) before the body is complete.
                        let response = "HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 128
Connection: close

{";
                        let _ = stream.write_all(response.as_bytes());
                    }
                    // No pending connection yet: back off briefly and poll again.
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("accept request: {error}"),
                }
            }
        });

        (format!("http://{}", addr), handle)
    }
3569
3570 #[test]
3571 fn response_body_read_failures_are_marked_transient() {
3572 let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3573 let client = Client::builder()
3574 .timeout(Duration::from_millis(250))
3575 .build()
3576 .expect("client");
3577
3578 let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3579 .expect_err("truncated body should fail");
3580
3581 handle.join().unwrap();
3582 assert!(
3583 embedding_failure_is_transient(&error),
3584 "body read failures should be transient-marked: {error}"
3585 );
3586 assert!(error.contains("response read failed"));
3587 }
3588
3589 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3590 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3591 }
3592
3593 fn write_rust_file(path: &Path, function_name: &str) {
3594 fs::write(
3595 path,
3596 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3597 )
3598 .unwrap();
3599 }
3600
3601 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3602 let mut embed = test_vector_for_texts;
3603 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3604 }
3605
3606 fn test_project_root() -> PathBuf {
3607 std::env::current_dir().unwrap()
3608 }
3609
3610 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3611 index.file_mtimes.insert(file.to_path_buf(), mtime);
3612 index.file_sizes.insert(file.to_path_buf(), size);
3613 index
3614 .file_hashes
3615 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3616 }
3617
    /// Hand-rolled encoder that serializes `index` into a byte buffer tagged with
    /// `SEMANTIC_INDEX_VERSION_V6`.
    ///
    /// Layout written, with every integer little-endian:
    /// - version byte, dimension (`u32`), entry count (`u32`);
    /// - fingerprint length (`u32`) followed by the fingerprint bytes (zero length when the
    ///   index has no fingerprint or its string form is empty);
    /// - file-metadata count (`u32`), then per file: path length (`u32`) + path bytes,
    ///   mtime seconds and sub-second nanoseconds since the Unix epoch, size, hash bytes;
    /// - per entry: file path, name, kind byte, start line (`u32`), end line (`u32`),
    ///   exported flag byte, snippet, embed text (each string length-prefixed with a
    ///   `u32`), then the vector's `f32` components.
    ///
    /// Files and entries for which `cache_relative_path` yields `None` are left out.
    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
        let mut buf = Vec::new();
        // An empty fingerprint string is treated the same as having no fingerprint.
        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });
        // Keep both the relative path (written out) and the original key (used for the
        // size/hash lookups below).
        let file_mtimes: Vec<_> = index
            .file_mtimes
            .iter()
            .filter_map(|(path, mtime)| {
                cache_relative_path(&index.project_root, path)
                    .map(|relative| (relative, path, mtime))
            })
            .collect();
        let entries: Vec<_> = index
            .entries
            .iter()
            .filter_map(|entry| {
                cache_relative_path(&index.project_root, &entry.chunk.file)
                    .map(|relative| (relative, entry))
            })
            .collect();

        // Header: version, dimension, count of entries that survived the filter above.
        buf.push(SEMANTIC_INDEX_VERSION_V6);
        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // File metadata table.
        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
        for (relative, path, mtime) in &file_mtimes {
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            // A pre-epoch mtime falls back to a zero duration.
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            // Missing size defaults to zero; missing hash defaults to the zero hash.
            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = index
                .file_hashes
                .get(*path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // Entry records, in index order.
        for (relative, entry) in &entries {
            let c = &entry.chunk;
            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));
            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            // The vector carries no length prefix of its own.
            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }
3702
3703 #[derive(Default)]
3704 struct RecordingEmbedder {
3705 calls: Vec<Vec<String>>,
3706 }
3707
3708 impl RecordingEmbedder {
3709 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3710 let vectors = texts
3711 .iter()
3712 .map(|text| deterministic_test_vector(text))
3713 .collect();
3714 self.calls.push(texts);
3715 Ok(vectors)
3716 }
3717
3718 fn total_embedded_texts(&self) -> usize {
3719 self.calls.iter().map(Vec::len).sum()
3720 }
3721
3722 fn embedded_texts(&self) -> Vec<&str> {
3723 self.calls
3724 .iter()
3725 .flat_map(|batch| batch.iter().map(String::as_str))
3726 .collect()
3727 }
3728 }
3729
3730 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3731 let hash = blake3::hash(text.as_bytes());
3732 let bytes = hash.as_bytes();
3733 vec![
3734 1.0,
3735 bytes[0] as f32 / 255.0,
3736 bytes[1] as f32 / 255.0,
3737 bytes[2] as f32 / 255.0,
3738 ]
3739 }
3740
3741 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3742 let mut embedder = RecordingEmbedder::default();
3743 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3744 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3745 }
3746
3747 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3748 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3749 }
3750
3751 fn write_source(path: &Path, source: &str) {
3752 if let Some(parent) = path.parent() {
3753 fs::create_dir_all(parent).unwrap();
3754 }
3755 fs::write(path, source).unwrap();
3756 }
3757
3758 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3759 index
3760 .entries
3761 .iter()
3762 .filter(|entry| entry.chunk.file == file)
3763 .collect()
3764 }
3765
3766 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3767 index
3768 .entries
3769 .iter()
3770 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3771 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3772 }
3773
3774 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3775 index
3776 .entries
3777 .iter()
3778 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3779 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3780 }
3781
3782 #[test]
3783 fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3784 let temp = tempfile::tempdir().unwrap();
3785 let project_root = temp.path();
3786 let file = project_root.join("src/lib.rs");
3787 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3788 write_source(&file, original);
3789
3790 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3791 let original_entry_count = index.entries.len();
3792 let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3793
3794 write_source(&file, &format!("\n{original}"));
3795 force_stale(&mut index, &file);
3796
3797 let mut embedder = RecordingEmbedder::default();
3798 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3799 let mut progress = |_done: usize, _total: usize| {};
3800 let summary = index
3801 .refresh_stale_files(
3802 project_root,
3803 std::slice::from_ref(&file),
3804 &mut embed,
3805 16,
3806 &mut progress,
3807 )
3808 .unwrap();
3809
3810 assert_eq!(summary.changed, 1);
3811 assert_eq!(embedder.total_embedded_texts(), 0);
3812 assert_eq!(index.entries.len(), original_entry_count);
3813 let shifted_alpha = entry_by_name(&index, &file, "alpha");
3814 assert_eq!(shifted_alpha.chunk.start_line, 1);
3815 assert_eq!(shifted_alpha.vector, original_alpha_vector);
3816 }
3817
3818 #[test]
3819 fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3820 let temp = tempfile::tempdir().unwrap();
3821 let project_root = temp.path();
3822 let file = project_root.join("src/lib.rs");
3823 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3824 write_source(&file, original);
3825
3826 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3827 let mut serving_index = worker_index.clone();
3828 let original_entry_count = worker_index.entries.len();
3829
3830 write_source(&file, &format!("\n{original}"));
3831
3832 let mut embedder = RecordingEmbedder::default();
3833 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3834 let mut progress = |_done: usize, _total: usize| {};
3835 let update = worker_index
3836 .refresh_invalidated_files(
3837 project_root,
3838 std::slice::from_ref(&file),
3839 &mut embed,
3840 16,
3841 100,
3842 &mut progress,
3843 )
3844 .unwrap();
3845
3846 assert_eq!(embedder.total_embedded_texts(), 0);
3847 assert_eq!(update.added_entries.len(), original_entry_count);
3848 assert_eq!(worker_index.entries.len(), original_entry_count);
3849
3850 serving_index.apply_refresh_update(
3851 update.added_entries,
3852 update.updated_metadata,
3853 &update.completed_paths,
3854 );
3855
3856 assert_eq!(serving_index.entries.len(), original_entry_count);
3857 assert_eq!(
3858 entries_for_file(&serving_index, &file).len(),
3859 original_entry_count
3860 );
3861 assert_eq!(
3862 entry_by_name(&serving_index, &file, "alpha")
3863 .chunk
3864 .start_line,
3865 1
3866 );
3867 }
3868
3869 #[test]
3870 fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
3871 let temp = tempfile::tempdir().unwrap();
3872 let project_root = temp.path();
3873 let file = project_root.join("src/lib.rs");
3874 write_source(
3875 &file,
3876 "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
3877 );
3878
3879 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3880 let original_entry_count = index.entries.len();
3881 let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
3882
3883 write_source(
3884 &file,
3885 "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
3886 );
3887
3888 let mut embedder = RecordingEmbedder::default();
3889 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3890 let mut progress = |_done: usize, _total: usize| {};
3891 let update = index
3892 .refresh_invalidated_files(
3893 project_root,
3894 std::slice::from_ref(&file),
3895 &mut embed,
3896 16,
3897 100,
3898 &mut progress,
3899 )
3900 .unwrap();
3901
3902 assert_eq!(embedder.total_embedded_texts(), 1);
3903 assert!(embedder.embedded_texts()[0].contains("name:alpha"));
3904 assert_eq!(update.added_entries.len(), original_entry_count);
3905 assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
3906 }
3907
3908 #[test]
3909 fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
3910 let temp = tempfile::tempdir().unwrap();
3911 let project_root = temp.path();
3912 let file = project_root.join("src/dupe.js");
3913 let one_duplicate = "function duplicate() {\n return 1;\n}\n";
3914 write_source(&file, one_duplicate);
3915
3916 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3917 let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
3918
3919 write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
3920
3921 let mut embedder = RecordingEmbedder::default();
3922 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3923 let mut progress = |_done: usize, _total: usize| {};
3924 index
3925 .refresh_invalidated_files(
3926 project_root,
3927 std::slice::from_ref(&file),
3928 &mut embed,
3929 16,
3930 100,
3931 &mut progress,
3932 )
3933 .unwrap();
3934
3935 let duplicate_entries = index
3936 .entries
3937 .iter()
3938 .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
3939 .collect::<Vec<_>>();
3940 assert_eq!(duplicate_entries.len(), 2);
3941 assert_eq!(embedder.total_embedded_texts(), 0);
3942 assert_eq!(duplicate_entries[0].vector, original_vector);
3943 assert_eq!(duplicate_entries[1].vector, original_vector);
3944 }
3945
    /// Two-phase check of when the file-summary entry is re-embedded.
    ///
    /// Phase one edits only `alpha`'s body: exactly one text (for `alpha`) is embedded and
    /// the file-summary vector is unchanged. Phase two edits only the leading `//!` doc
    /// line: exactly one text (the file summary) is embedded and its vector changes.
    #[test]
    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        write_source(
            &file,
            "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n",
        );

        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let summary_before = file_summary_entry(&index, &file).vector.clone();

        // Phase one: change the function body, keep the module docs.
        write_source(
            &file,
            "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n",
        );
        let mut body_embedder = RecordingEmbedder::default();
        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
        // The same no-op progress callback is reused by both refresh calls.
        let mut progress = |_done: usize, _total: usize| {};
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut body_embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();
        assert_eq!(body_embedder.total_embedded_texts(), 1);
        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);

        // Phase two: change the module docs, keep the function body from phase one.
        write_source(
            &file,
            "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n",
        );
        let mut doc_embedder = RecordingEmbedder::default();
        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut doc_embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();

        assert_eq!(doc_embedder.total_embedded_texts(), 1);
        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
    }
4001
4002 #[test]
4003 fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4004 let temp = tempfile::tempdir().unwrap();
4005 let project_root = temp.path();
4006 let file = project_root.join("src/lib.rs");
4007 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4008
4009 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4010 let mut serving_index = worker_index.clone();
4011 fs::remove_file(&file).unwrap();
4012
4013 let mut embedder = RecordingEmbedder::default();
4014 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4015 let mut progress = |_done: usize, _total: usize| {};
4016 let update = worker_index
4017 .refresh_invalidated_files(
4018 project_root,
4019 std::slice::from_ref(&file),
4020 &mut embed,
4021 16,
4022 100,
4023 &mut progress,
4024 )
4025 .unwrap();
4026
4027 assert_eq!(update.summary.deleted, 1);
4028 assert_eq!(embedder.total_embedded_texts(), 0);
4029 assert!(worker_index.entries.is_empty());
4030
4031 serving_index.apply_refresh_update(
4032 update.added_entries,
4033 update.updated_metadata,
4034 &update.completed_paths,
4035 );
4036 assert!(serving_index.entries.is_empty());
4037 }
4038
4039 #[test]
4040 fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4041 let temp = tempfile::tempdir().unwrap();
4042 let project_root = temp.path();
4043 let file = project_root.join("src/lib.rs");
4044 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4045
4046 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4047 let mut serving_index = worker_index.clone();
4048 fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4049
4050 let mut embedder = RecordingEmbedder::default();
4051 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4052 let mut progress = |_done: usize, _total: usize| {};
4053 let update = worker_index
4054 .refresh_invalidated_files(
4055 project_root,
4056 std::slice::from_ref(&file),
4057 &mut embed,
4058 16,
4059 100,
4060 &mut progress,
4061 )
4062 .unwrap();
4063
4064 assert_eq!(embedder.total_embedded_texts(), 0);
4065 assert!(update.added_entries.is_empty());
4066 assert!(worker_index.entries.is_empty());
4067 assert!(!worker_index.file_mtimes.contains_key(&file));
4068
4069 serving_index.apply_refresh_update(
4070 update.added_entries,
4071 update.updated_metadata,
4072 &update.completed_paths,
4073 );
4074 assert!(serving_index.entries.is_empty());
4075 assert!(!serving_index.file_mtimes.contains_key(&file));
4076 }
4077
4078 #[test]
4079 fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4080 let temp = tempfile::tempdir().unwrap();
4081 let project_root = temp.path();
4082 let indexed = project_root.join("src/a.rs");
4083 let deferred = project_root.join("src/b.rs");
4084 write_source(&indexed, "pub fn alpha() -> i32 {\n 1\n}\n");
4085 write_source(&deferred, "pub fn beta() -> i32 {\n 2\n}\n");
4086
4087 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4088 let mut embedder = RecordingEmbedder::default();
4089 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4090 let mut progress = |_done: usize, _total: usize| {};
4091 let update = index
4092 .refresh_invalidated_files(
4093 project_root,
4094 std::slice::from_ref(&deferred),
4095 &mut embed,
4096 16,
4097 1,
4098 &mut progress,
4099 )
4100 .unwrap();
4101
4102 assert_eq!(update.summary.total_processed, 1);
4103 assert_eq!(update.summary.added, 0);
4104 assert_eq!(embedder.total_embedded_texts(), 0);
4105 assert_eq!(index.indexed_file_count(), 1);
4106 assert!(index.deferred_files.contains(&deferred));
4107 assert!(entries_for_file(&index, &deferred).is_empty());
4108 }
4109
4110 #[test]
4111 fn semantic_cache_serialization_skips_paths_outside_project_root() {
4112 let dir = tempfile::tempdir().expect("create temp dir");
4113 let project = fs::canonicalize(dir.path()).expect("canonical project");
4114 let outside = project.join("..").join("outside.rs");
4115 let mut index = SemanticIndex::new(project.clone(), 3);
4116 index
4117 .file_mtimes
4118 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4119 index.file_sizes.insert(outside.clone(), 1);
4120 index
4121 .file_hashes
4122 .insert(outside.clone(), cache_freshness::zero_hash());
4123 index.entries.push(EmbeddingEntry {
4124 chunk: SemanticChunk {
4125 file: outside,
4126 name: "outside".to_string(),
4127 kind: SymbolKind::Function,
4128 start_line: 0,
4129 end_line: 0,
4130 exported: false,
4131 embed_text: "outside".to_string(),
4132 snippet: "outside".to_string(),
4133 },
4134 vector: vec![1.0, 0.0, 0.0],
4135 });
4136
4137 let bytes = index.to_bytes();
4138 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4139 assert_eq!(loaded.entries.len(), 0);
4140 assert!(loaded.file_mtimes.is_empty());
4141 }
4142
    /// Compares `SemanticIndex::search` against a reference ranking computed in the test.
    ///
    /// The reference scores every entry with `cosine_similarity`, multiplies the score of
    /// exported entries by 1.1, sorts descending, and keeps the first `top_k`. The search
    /// result must list the same names in the same order with scores within `1e-6`, rank
    /// `alpha` ahead of the equally scored `gamma`, and be empty when `top_k` is zero.
    #[test]
    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
        let project_root = test_project_root();
        let file = project_root.join("src/lib.rs");
        let mut index = SemanticIndex::new(project_root, 2);
        // (name, vector, exported); `alpha` and `gamma` share a vector so they tie.
        let entries = [
            ("alpha", vec![1.0, 0.0], false),
            ("beta", vec![0.0, 1.0], false),
            ("gamma", vec![1.0, 0.0], false),
            ("delta", vec![0.5, 0.5], true),
            ("epsilon", vec![-1.0, 0.0], false),
        ];
        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
            index.entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file: file.clone(),
                    name: name.to_string(),
                    kind: SymbolKind::Function,
                    start_line: line as u32 + 1,
                    end_line: line as u32 + 1,
                    exported,
                    embed_text: name.to_string(),
                    snippet: format!("fn {name}() {{}}"),
                },
                vector,
            });
        }

        let query = vec![1.0, 0.0];
        let top_k = 4;
        // Reference ranking: score everything, then sort the full list.
        let mut reference: Vec<(f32, usize)> = index
            .entries
            .iter()
            .enumerate()
            .map(|(idx, entry)| {
                let mut score = cosine_similarity(&query, &entry.vector);
                if entry.chunk.exported {
                    score *= 1.1;
                }
                (score, idx)
            })
            .collect();
        // `sort_by` is stable, so equal scores stay in insertion order.
        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
        let expected: Vec<(String, f32)> = reference
            .into_iter()
            .take(top_k)
            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
            .collect();

        let actual: Vec<(String, f32)> = index
            .search(&query, top_k)
            .into_iter()
            .map(|result| (result.name, result.score))
            .collect();

        assert_eq!(
            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
        );
        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
            assert!((actual_score - expected_score).abs() < 1e-6);
        }
        assert_eq!(actual[0].0, "alpha");
        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
        assert!(index.search(&query, 0).is_empty());
    }
4209
4210 #[test]
4211 fn test_cosine_similarity_identical() {
4212 let a = vec![1.0, 0.0, 0.0];
4213 let b = vec![1.0, 0.0, 0.0];
4214 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4215 }
4216
4217 #[test]
4218 fn test_cosine_similarity_orthogonal() {
4219 let a = vec![1.0, 0.0, 0.0];
4220 let b = vec![0.0, 1.0, 0.0];
4221 assert!(cosine_similarity(&a, &b).abs() < 0.001);
4222 }
4223
4224 #[test]
4225 fn test_cosine_similarity_opposite() {
4226 let a = vec![1.0, 0.0, 0.0];
4227 let b = vec![-1.0, 0.0, 0.0];
4228 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4229 }
4230
4231 #[test]
4232 fn test_serialization_roundtrip() {
4233 let project_root = test_project_root();
4234 let file = project_root.join("src/main.rs");
4235 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4236 index.entries.push(EmbeddingEntry {
4237 chunk: SemanticChunk {
4238 file: file.clone(),
4239 name: "handle_request".to_string(),
4240 kind: SymbolKind::Function,
4241 start_line: 10,
4242 end_line: 25,
4243 exported: true,
4244 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4245 snippet: "fn handle_request() {\n // ...\n}".to_string(),
4246 },
4247 vector: vec![0.1, 0.2, 0.3, 0.4],
4248 });
4249 index.dimension = 4;
4250 index
4251 .file_mtimes
4252 .insert(file.clone(), SystemTime::UNIX_EPOCH);
4253 index.file_sizes.insert(file, 0);
4254 index.set_fingerprint(SemanticIndexFingerprint {
4255 backend: "fastembed".to_string(),
4256 model: "all-MiniLM-L6-v2".to_string(),
4257 base_url: FALLBACK_BACKEND.to_string(),
4258 dimension: 4,
4259 chunking_version: default_chunking_version(),
4260 });
4261
4262 let bytes = index.to_bytes();
4263 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4264
4265 assert_eq!(restored.entries.len(), 1);
4266 assert_eq!(restored.entries[0].chunk.name, "handle_request");
4267 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4268 assert_eq!(restored.dimension, 4);
4269 assert_eq!(restored.backend_label(), Some("fastembed"));
4270 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4271 }
4272
    /// Checks that every persistence path produces the bytes of
    /// `legacy_semantic_index_bytes` and that a disk round trip preserves the index.
    ///
    /// Steps: build a two-entry index with metadata and a fingerprint; assert `to_bytes`
    /// equals the legacy bytes; write it with `write_to_disk` and assert the file
    /// `semantic/proj/semantic.bin` equals the legacy bytes; load it with `read_from_disk`
    /// and compare dimension, fingerprint, file metadata and every chunk field; finally
    /// assert that re-serializing the loaded index reproduces the legacy bytes again.
    #[test]
    fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
        let storage = tempfile::tempdir().expect("create storage dir");
        let project = storage.path().join("project");
        fs::create_dir_all(project.join("src")).expect("create project src");
        let file = project.join("src/lib.rs");
        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
        // Use canonical paths for both the root and the file from here on.
        let project_root = fs::canonicalize(&project).expect("canonical project");
        let file = fs::canonicalize(&file).expect("canonical file");

        let mut index = SemanticIndex::new(project_root.clone(), 3);
        // Non-zero seconds and nanoseconds so both mtime components are exercised.
        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
        index.file_mtimes.insert(file.clone(), mtime);
        index.file_sizes.insert(file.clone(), 42);
        index
            .file_hashes
            .insert(file.clone(), cache_freshness::zero_hash());
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: file.clone(),
                name: "alpha".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 0,
                exported: true,
                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
                snippet: "pub fn alpha() {}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3],
        });
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: file.clone(),
                name: "beta".to_string(),
                kind: SymbolKind::Function,
                start_line: 1,
                end_line: 1,
                exported: true,
                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
                snippet: "pub fn beta() {}".to_string(),
            },
            vector: vec![0.4, 0.5, 0.6],
        });
        let fingerprint = SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "all-MiniLM-L6-v2".to_string(),
            base_url: FALLBACK_BACKEND.to_string(),
            dimension: 3,
            chunking_version: default_chunking_version(),
        };
        index.set_fingerprint(fingerprint.clone());

        // In-memory serialization must match the hand-rolled reference encoding.
        let legacy_bytes = legacy_semantic_index_bytes(&index);
        assert_eq!(index.to_bytes(), legacy_bytes);

        // The on-disk file must hold exactly the same bytes.
        index.write_to_disk(storage.path(), "proj");
        let data_path = storage.path().join("semantic/proj/semantic.bin");
        assert_eq!(
            fs::read(&data_path).expect("read semantic.bin"),
            legacy_bytes
        );

        let loaded = SemanticIndex::read_from_disk(
            storage.path(),
            "proj",
            &project_root,
            false,
            Some(&fingerprint.as_string()),
        )
        .expect("load semantic index");
        assert_eq!(loaded.entries.len(), index.entries.len());
        assert_eq!(loaded.dimension, index.dimension);
        assert_eq!(
            loaded.fingerprint().unwrap().as_string(),
            fingerprint.as_string()
        );
        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
        assert_eq!(
            loaded.file_hashes.get(&file),
            Some(&cache_freshness::zero_hash())
        );
        // Field-by-field comparison of each loaded entry against the one written.
        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
            assert_eq!(actual.chunk.file, expected.chunk.file);
            assert_eq!(actual.chunk.name, expected.chunk.name);
            assert_eq!(actual.chunk.kind, expected.chunk.kind);
            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
            assert_eq!(actual.chunk.exported, expected.chunk.exported);
            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
            assert_eq!(actual.vector, expected.vector);
        }
        assert_eq!(loaded.to_bytes(), legacy_bytes);
    }
4368
4369 #[test]
4370 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4371 let cases = [
4372 (SymbolKind::Function, 0),
4373 (SymbolKind::Class, 1),
4374 (SymbolKind::Method, 2),
4375 (SymbolKind::Struct, 3),
4376 (SymbolKind::Interface, 4),
4377 (SymbolKind::Enum, 5),
4378 (SymbolKind::TypeAlias, 6),
4379 (SymbolKind::Variable, 7),
4380 (SymbolKind::Heading, 8),
4381 (SymbolKind::FileSummary, 9),
4382 ];
4383
4384 for (kind, encoded) in cases {
4385 assert_eq!(symbol_kind_to_u8(&kind), encoded);
4386 assert_eq!(u8_to_symbol_kind(encoded), kind);
4387 }
4388 }
4389
4390 #[test]
4391 fn test_search_top_k() {
4392 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4393 index.dimension = 3;
4394
4395 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4397 let mut vec = vec![0.0f32; 3];
4398 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
4400 chunk: SemanticChunk {
4401 file: PathBuf::from("/src/lib.rs"),
4402 name: name.to_string(),
4403 kind: SymbolKind::Function,
4404 start_line: (i * 10 + 1) as u32,
4405 end_line: (i * 10 + 5) as u32,
4406 exported: true,
4407 embed_text: format!("kind:function name:{}", name),
4408 snippet: format!("fn {}() {{}}", name),
4409 },
4410 vector: vec,
4411 });
4412 }
4413
4414 let query = vec![0.9, 0.1, 0.0];
4416 let results = index.search(&query, 2);
4417
4418 assert_eq!(results.len(), 2);
4419 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
4421 }
4422
4423 #[test]
4424 fn test_empty_index_search() {
4425 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4426 let results = index.search(&[0.1, 0.2, 0.3], 10);
4427 assert!(results.is_empty());
4428 }
4429
4430 #[test]
4431 fn single_line_symbol_builds_non_empty_snippet() {
4432 let symbol = Symbol {
4433 name: "answer".to_string(),
4434 kind: SymbolKind::Variable,
4435 range: crate::symbols::Range {
4436 start_line: 0,
4437 start_col: 0,
4438 end_line: 0,
4439 end_col: 24,
4440 },
4441 signature: Some("const answer = 42".to_string()),
4442 scope_chain: Vec::new(),
4443 exported: true,
4444 parent: None,
4445 };
4446 let source = "export const answer = 42;\n";
4447
4448 let snippet = build_snippet(&symbol, source);
4449
4450 assert_eq!(snippet, "export const answer = 42;");
4451 }
4452
4453 #[test]
4454 fn optimized_file_chunk_collection_matches_file_parser_path() {
4455 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4456 let file = project_root.join("src/semantic_index.rs");
4457 let source = std::fs::read_to_string(&file).unwrap();
4458
4459 let mut legacy_parser = FileParser::new();
4460 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4461 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4462
4463 let mut parsers = HashMap::new();
4464 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4465
4466 assert_eq!(
4467 chunk_fingerprint(&optimized_chunks),
4468 chunk_fingerprint(&legacy_chunks)
4469 );
4470 }
4471
4472 fn chunk_fingerprint(
4473 chunks: &[SemanticChunk],
4474 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4475 chunks
4476 .iter()
4477 .map(|chunk| {
4478 (
4479 chunk.name.clone(),
4480 chunk.kind.clone(),
4481 chunk.start_line,
4482 chunk.end_line,
4483 chunk.exported,
4484 chunk.embed_text.clone(),
4485 chunk.snippet.clone(),
4486 )
4487 })
4488 .collect()
4489 }
4490
4491 #[test]
4492 fn collect_file_chunks_skips_oversized_file() {
4493 let dir = tempfile::tempdir().unwrap();
4494 let big = dir.path().join("huge.ts");
4495 let filler = "export const x = 1;\n"
4497 .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4498 std::fs::write(&big, &filler).unwrap();
4499 assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4500
4501 let mut parsers = HashMap::new();
4502 let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4505 assert!(chunks.is_empty(), "oversized file must yield no chunks");
4506
4507 let small = dir.path().join("small.ts");
4509 std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4510 let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4511 assert!(!small_chunks.is_empty(), "small file should still chunk");
4512 }
4513
4514 #[test]
4515 fn rejects_oversized_dimension_during_deserialization() {
4516 let mut bytes = Vec::new();
4517 bytes.push(1u8);
4518 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4519 bytes.extend_from_slice(&0u32.to_le_bytes());
4520 bytes.extend_from_slice(&0u32.to_le_bytes());
4521
4522 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4523 }
4524
4525 #[test]
4526 fn rejects_oversized_entry_count_during_deserialization() {
4527 let mut bytes = Vec::new();
4528 bytes.push(1u8);
4529 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4530 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4531 bytes.extend_from_slice(&0u32.to_le_bytes());
4532
4533 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4534 }
4535
4536 #[test]
4537 fn invalidate_file_removes_entries_and_mtime() {
4538 let target = PathBuf::from("/src/main.rs");
4539 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4540 index.entries.push(EmbeddingEntry {
4541 chunk: SemanticChunk {
4542 file: target.clone(),
4543 name: "main".to_string(),
4544 kind: SymbolKind::Function,
4545 start_line: 0,
4546 end_line: 1,
4547 exported: false,
4548 embed_text: "main".to_string(),
4549 snippet: "fn main() {}".to_string(),
4550 },
4551 vector: vec![1.0; DEFAULT_DIMENSION],
4552 });
4553 index
4554 .file_mtimes
4555 .insert(target.clone(), SystemTime::UNIX_EPOCH);
4556 index.file_sizes.insert(target.clone(), 0);
4557
4558 index.invalidate_file(&target);
4559
4560 assert!(index.entries.is_empty());
4561 assert!(!index.file_mtimes.contains_key(&target));
4562 assert!(!index.file_sizes.contains_key(&target));
4563 }
4564
4565 #[test]
4566 fn refresh_missing_changed_file_is_purged_after_collect() {
4567 let temp = tempfile::tempdir().unwrap();
4568 let project_root = temp.path();
4569 let file = project_root.join("src/lib.rs");
4570 fs::create_dir_all(file.parent().unwrap()).unwrap();
4571 write_rust_file(&file, "vanished_symbol");
4572
4573 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4574 let original_size = *index.file_sizes.get(&file).unwrap();
4575 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4576 fs::remove_file(&file).unwrap();
4577
4578 let mut embed = test_vector_for_texts;
4579 let mut progress = |_done: usize, _total: usize| {};
4580 let summary = index
4581 .refresh_stale_files(
4582 project_root,
4583 std::slice::from_ref(&file),
4584 &mut embed,
4585 8,
4586 &mut progress,
4587 )
4588 .unwrap();
4589
4590 assert_eq!(summary.changed, 0);
4591 assert_eq!(summary.added, 0);
4592 assert_eq!(summary.deleted, 1);
4593 assert!(index.entries.is_empty());
4594 assert!(!index.file_mtimes.contains_key(&file));
4595 assert!(!index.file_sizes.contains_key(&file));
4596 assert!(!index.file_hashes.contains_key(&file));
4597 }
4598
4599 #[test]
4600 fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4601 let temp = tempfile::tempdir().unwrap();
4602 let project_root = temp.path();
4603 let file = project_root.join("src/lib.rs");
4604 fs::create_dir_all(file.parent().unwrap()).unwrap();
4605 write_rust_file(&file, "kept_symbol");
4606
4607 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4608 let original_entry_count = index.entries.len();
4609 let original_mtime = *index.file_mtimes.get(&file).unwrap();
4610 let original_size = *index.file_sizes.get(&file).unwrap();
4611
4612 let stale_mtime = SystemTime::UNIX_EPOCH;
4613 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4614 fs::remove_file(&file).unwrap();
4615 fs::create_dir(&file).unwrap();
4616
4617 let mut embed = test_vector_for_texts;
4618 let mut progress = |_done: usize, _total: usize| {};
4619 let summary = index
4620 .refresh_stale_files(
4621 project_root,
4622 std::slice::from_ref(&file),
4623 &mut embed,
4624 8,
4625 &mut progress,
4626 )
4627 .unwrap();
4628
4629 assert_eq!(summary.changed, 0);
4630 assert_eq!(summary.added, 0);
4631 assert_eq!(summary.deleted, 0);
4632 assert_eq!(index.entries.len(), original_entry_count);
4633 assert!(index
4634 .entries
4635 .iter()
4636 .any(|entry| entry.chunk.name == "kept_symbol"));
4637 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4638 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4639 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4640 }
4641
4642 #[test]
4643 fn refresh_never_indexed_file_error_does_not_record_mtime() {
4644 let temp = tempfile::tempdir().unwrap();
4645 let project_root = temp.path();
4646 let missing = project_root.join("src/missing.rs");
4647 fs::create_dir_all(missing.parent().unwrap()).unwrap();
4648
4649 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4650 let mut embed = test_vector_for_texts;
4651 let mut progress = |_done: usize, _total: usize| {};
4652 let summary = index
4653 .refresh_stale_files(
4654 project_root,
4655 std::slice::from_ref(&missing),
4656 &mut embed,
4657 8,
4658 &mut progress,
4659 )
4660 .unwrap();
4661
4662 assert_eq!(summary.added, 0);
4663 assert_eq!(summary.changed, 0);
4664 assert_eq!(summary.deleted, 0);
4665 assert!(!index.file_mtimes.contains_key(&missing));
4666 assert!(!index.file_sizes.contains_key(&missing));
4667 assert!(index.entries.is_empty());
4668 }
4669
4670 #[test]
4671 fn refresh_reports_added_for_new_files() {
4672 let temp = tempfile::tempdir().unwrap();
4673 let project_root = temp.path();
4674 let existing = project_root.join("src/lib.rs");
4675 let added = project_root.join("src/new.rs");
4676 fs::create_dir_all(existing.parent().unwrap()).unwrap();
4677 write_rust_file(&existing, "existing_symbol");
4678 write_rust_file(&added, "added_symbol");
4679
4680 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4681 let mut embed = test_vector_for_texts;
4682 let mut progress = |_done: usize, _total: usize| {};
4683 let summary = index
4684 .refresh_stale_files(
4685 project_root,
4686 &[existing.clone(), added.clone()],
4687 &mut embed,
4688 8,
4689 &mut progress,
4690 )
4691 .unwrap();
4692
4693 assert_eq!(summary.added, 1);
4694 assert_eq!(summary.changed, 0);
4695 assert_eq!(summary.deleted, 0);
4696 assert_eq!(summary.total_processed, 2);
4697 assert!(index.file_mtimes.contains_key(&added));
4698 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4699 }
4700
4701 #[test]
4702 fn refresh_reports_deleted_for_removed_files() {
4703 let temp = tempfile::tempdir().unwrap();
4704 let project_root = temp.path();
4705 let deleted = project_root.join("src/deleted.rs");
4706 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4707 write_rust_file(&deleted, "deleted_symbol");
4708
4709 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4710 fs::remove_file(&deleted).unwrap();
4711
4712 let mut embed = test_vector_for_texts;
4713 let mut progress = |_done: usize, _total: usize| {};
4714 let summary = index
4715 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4716 .unwrap();
4717
4718 assert_eq!(summary.deleted, 1);
4719 assert_eq!(summary.changed, 0);
4720 assert_eq!(summary.added, 0);
4721 assert_eq!(summary.total_processed, 1);
4722 assert!(!index.file_mtimes.contains_key(&deleted));
4723 assert!(index.entries.is_empty());
4724 }
4725
4726 #[test]
4727 fn refresh_reports_changed_for_modified_files() {
4728 let temp = tempfile::tempdir().unwrap();
4729 let project_root = temp.path();
4730 let file = project_root.join("src/lib.rs");
4731 fs::create_dir_all(file.parent().unwrap()).unwrap();
4732 write_rust_file(&file, "old_symbol");
4733
4734 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4735 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
4736 write_rust_file(&file, "new_symbol");
4737
4738 let mut embed = test_vector_for_texts;
4739 let mut progress = |_done: usize, _total: usize| {};
4740 let summary = index
4741 .refresh_stale_files(
4742 project_root,
4743 std::slice::from_ref(&file),
4744 &mut embed,
4745 8,
4746 &mut progress,
4747 )
4748 .unwrap();
4749
4750 assert_eq!(summary.changed, 1);
4751 assert_eq!(summary.added, 0);
4752 assert_eq!(summary.deleted, 0);
4753 assert_eq!(summary.total_processed, 1);
4754 assert!(index
4755 .entries
4756 .iter()
4757 .any(|entry| entry.chunk.name == "new_symbol"));
4758 assert!(!index
4759 .entries
4760 .iter()
4761 .any(|entry| entry.chunk.name == "old_symbol"));
4762 }
4763
4764 #[test]
4765 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
4766 let temp = tempfile::tempdir().unwrap();
4767 let project_root = temp.path();
4768 let file = project_root.join("src/lib.rs");
4769 fs::create_dir_all(file.parent().unwrap()).unwrap();
4770 write_rust_file(&file, "clean_symbol");
4771
4772 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4773 let original_entries = index.entries.len();
4774 let mut embed_called = false;
4775 let mut embed = |texts: Vec<String>| {
4776 embed_called = true;
4777 test_vector_for_texts(texts)
4778 };
4779 let mut progress = |_done: usize, _total: usize| {};
4780 let summary = index
4781 .refresh_stale_files(
4782 project_root,
4783 std::slice::from_ref(&file),
4784 &mut embed,
4785 8,
4786 &mut progress,
4787 )
4788 .unwrap();
4789
4790 assert!(summary.is_noop());
4791 assert_eq!(summary.total_processed, 1);
4792 assert!(!embed_called);
4793 assert_eq!(index.entries.len(), original_entries);
4794 }
4795
4796 #[test]
4797 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
4798 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
4799
4800 assert!(is_onnx_runtime_unavailable(message));
4801 }
4802
4803 #[test]
4804 fn formats_missing_onnx_runtime_with_install_hint() {
4805 let message = format_embedding_init_error(
4806 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
4807 );
4808
4809 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
4810 assert!(message.contains("Original error:"));
4811 }
4812
4813 #[test]
4814 fn openai_compatible_backend_embeds_with_mock_server() {
4815 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4816 assert!(request_line.starts_with("POST "));
4817 assert_eq!(path, "/v1/embeddings");
4818 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
4819 });
4820
4821 let config = SemanticBackendConfig {
4822 backend: SemanticBackend::OpenAiCompatible,
4823 model: "test-embedding".to_string(),
4824 base_url: Some(base_url),
4825 api_key_env: None,
4826 timeout_ms: 5_000,
4827 max_batch_size: 64,
4828 max_files: 20_000,
4829 };
4830
4831 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4832 let vectors = model
4833 .embed(vec!["hello".to_string(), "world".to_string()])
4834 .unwrap();
4835
4836 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
4837 handle.join().unwrap();
4838 }
4839
4840 #[test]
4850 fn openai_compatible_request_has_single_content_type_header() {
4851 use std::sync::{Arc, Mutex};
4852 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
4853 let captured_for_thread = Arc::clone(&captured);
4854
4855 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
4856 let addr = listener.local_addr().expect("local addr");
4857 let handle = thread::spawn(move || {
4858 let (mut stream, _) = listener.accept().expect("accept");
4859 let mut buf = Vec::new();
4860 let mut chunk = [0u8; 4096];
4861 let mut header_end = None;
4862 let mut content_length = 0usize;
4863 loop {
4864 let n = stream.read(&mut chunk).expect("read");
4865 if n == 0 {
4866 break;
4867 }
4868 buf.extend_from_slice(&chunk[..n]);
4869 if header_end.is_none() {
4870 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
4871 header_end = Some(pos + 4);
4872 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
4873 if let Some(value) = line.strip_prefix("Content-Length:") {
4874 content_length = value.trim().parse::<usize>().unwrap_or(0);
4875 }
4876 }
4877 }
4878 }
4879 if let Some(end) = header_end {
4880 if buf.len() >= end + content_length {
4881 break;
4882 }
4883 }
4884 }
4885 *captured_for_thread.lock().unwrap() = buf;
4886 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
4887 let response = format!(
4888 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
4889 body.len(),
4890 body
4891 );
4892 let _ = stream.write_all(response.as_bytes());
4893 });
4894
4895 let config = SemanticBackendConfig {
4896 backend: SemanticBackend::OpenAiCompatible,
4897 model: "text-embedding-3-small".to_string(),
4898 base_url: Some(format!("http://{}", addr)),
4899 api_key_env: None,
4900 timeout_ms: 5_000,
4901 max_batch_size: 64,
4902 max_files: 20_000,
4903 };
4904 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4905 let _ = model.embed(vec!["probe".to_string()]).unwrap();
4906 handle.join().unwrap();
4907
4908 let bytes = captured.lock().unwrap().clone();
4909 let request = String::from_utf8_lossy(&bytes);
4910
4911 let content_type_lines = request
4914 .lines()
4915 .filter(|line| {
4916 let lower = line.to_ascii_lowercase();
4917 lower.starts_with("content-type:")
4918 })
4919 .count();
4920 assert_eq!(
4921 content_type_lines, 1,
4922 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
4923 );
4924
4925 assert!(
4928 request.contains(r#""model":"text-embedding-3-small""#),
4929 "request body should contain model field; full request:\n{request}",
4930 );
4931 }
4932
4933 #[test]
4934 fn ollama_backend_embeds_with_mock_server() {
4935 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4936 assert!(request_line.starts_with("POST "));
4937 assert_eq!(path, "/api/embed");
4938 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
4939 });
4940
4941 let config = SemanticBackendConfig {
4942 backend: SemanticBackend::Ollama,
4943 model: "embeddinggemma".to_string(),
4944 base_url: Some(base_url),
4945 api_key_env: None,
4946 timeout_ms: 5_000,
4947 max_batch_size: 64,
4948 max_files: 20_000,
4949 };
4950
4951 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4952 let vectors = model
4953 .embed(vec!["hello".to_string(), "world".to_string()])
4954 .unwrap();
4955
4956 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
4957 handle.join().unwrap();
4958 }
4959
4960 #[test]
4961 fn read_from_disk_rejects_fingerprint_mismatch() {
4962 let storage = tempfile::tempdir().unwrap();
4963 let project_key = "proj";
4964
4965 let project_root = test_project_root();
4966 let file = project_root.join("src/main.rs");
4967 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4968 index.entries.push(EmbeddingEntry {
4969 chunk: SemanticChunk {
4970 file: file.clone(),
4971 name: "handle_request".to_string(),
4972 kind: SymbolKind::Function,
4973 start_line: 10,
4974 end_line: 25,
4975 exported: true,
4976 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4977 snippet: "fn handle_request() {}".to_string(),
4978 },
4979 vector: vec![0.1, 0.2, 0.3],
4980 });
4981 index.dimension = 3;
4982 index
4983 .file_mtimes
4984 .insert(file.clone(), SystemTime::UNIX_EPOCH);
4985 index.file_sizes.insert(file, 0);
4986 index.set_fingerprint(SemanticIndexFingerprint {
4987 backend: "openai_compatible".to_string(),
4988 model: "test-embedding".to_string(),
4989 base_url: "http://127.0.0.1:1234/v1".to_string(),
4990 dimension: 3,
4991 chunking_version: default_chunking_version(),
4992 });
4993 index.write_to_disk(storage.path(), project_key);
4994
4995 let matching = index.fingerprint().unwrap().as_string();
4996 assert!(SemanticIndex::read_from_disk(
4997 storage.path(),
4998 project_key,
4999 &project_root,
5000 false,
5001 Some(&matching),
5002 )
5003 .is_some());
5004
5005 let mismatched = SemanticIndexFingerprint {
5006 backend: "ollama".to_string(),
5007 model: "embeddinggemma".to_string(),
5008 base_url: "http://127.0.0.1:11434".to_string(),
5009 dimension: 3,
5010 chunking_version: default_chunking_version(),
5011 }
5012 .as_string();
5013 assert!(SemanticIndex::read_from_disk(
5014 storage.path(),
5015 project_key,
5016 &project_root,
5017 false,
5018 Some(&mismatched),
5019 )
5020 .is_none());
5021 }
5022
5023 #[test]
5024 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
5025 let storage = tempfile::tempdir().unwrap();
5026 let project_key = "proj-v3";
5027 let dir = storage.path().join("semantic").join(project_key);
5028 fs::create_dir_all(&dir).unwrap();
5029
5030 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
5031 index.entries.push(EmbeddingEntry {
5032 chunk: SemanticChunk {
5033 file: PathBuf::from("/src/main.rs"),
5034 name: "handle_request".to_string(),
5035 kind: SymbolKind::Function,
5036 start_line: 0,
5037 end_line: 0,
5038 exported: true,
5039 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5040 snippet: "fn handle_request() {}".to_string(),
5041 },
5042 vector: vec![0.1, 0.2, 0.3],
5043 });
5044 index.dimension = 3;
5045 index
5046 .file_mtimes
5047 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5048 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5049 let fingerprint = SemanticIndexFingerprint {
5050 backend: "fastembed".to_string(),
5051 model: "test".to_string(),
5052 base_url: FALLBACK_BACKEND.to_string(),
5053 dimension: 3,
5054 chunking_version: default_chunking_version(),
5055 };
5056 index.set_fingerprint(fingerprint.clone());
5057
5058 let mut bytes = index.to_bytes();
5059 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5060 fs::write(dir.join("semantic.bin"), bytes).unwrap();
5061
5062 assert!(SemanticIndex::read_from_disk(
5063 storage.path(),
5064 project_key,
5065 &test_project_root(),
5066 false,
5067 Some(&fingerprint.as_string())
5068 )
5069 .is_none());
5070 assert!(!dir.join("semantic.bin").exists());
5071 }
5072
5073 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5074 crate::symbols::Symbol {
5075 name: name.to_string(),
5076 kind,
5077 range: crate::symbols::Range {
5078 start_line: start,
5079 start_col: 0,
5080 end_line: end,
5081 end_col: 0,
5082 },
5083 signature: None,
5084 scope_chain: Vec::new(),
5085 exported: false,
5086 parent: None,
5087 }
5088 }
5089
5090 #[test]
5095 fn symbols_to_chunks_skips_heading_symbols() {
5096 let project_root = PathBuf::from("/proj");
5097 let file = project_root.join("README.md");
5098 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
5099
5100 let symbols = vec![
5101 make_symbol(SymbolKind::Heading, "Title", 0, 2),
5102 make_symbol(SymbolKind::Heading, "Section", 4, 6),
5103 ];
5104
5105 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5106 assert!(
5107 chunks.is_empty(),
5108 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
5109 chunks.len()
5110 );
5111 }
5112
5113 #[test]
5120 fn build_embed_text_clamps_oversized_signature() {
5121 let project_root = PathBuf::from("/proj");
5122 let file = project_root.join("cronjob.yaml");
5123 let huge_sig = "kubectl ".repeat(2000); let source = "apiVersion: batch/v1\nkind: CronJob\n";
5125
5126 let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
5127 symbol.signature = Some(huge_sig);
5128
5129 let text = build_embed_text(&symbol, source, &file, &project_root);
5130 assert!(
5131 text.chars().count() <= MAX_EMBED_TEXT_CHARS,
5132 "embed_text must be clamped to {} chars, got {}",
5133 MAX_EMBED_TEXT_CHARS,
5134 text.chars().count()
5135 );
5136 }
5137
5138 #[test]
5142 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
5143 let project_root = PathBuf::from("/proj");
5144 let file = project_root.join("src/lib.rs");
5145 let source = "pub fn handle_request() -> bool {\n true\n}\n";
5146
5147 let symbols = vec![
5148 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
5150 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
5151 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
5152 ];
5153
5154 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5155 assert_eq!(
5156 chunks.len(),
5157 3,
5158 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
5159 chunks.len()
5160 );
5161 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
5162 assert!(chunks
5163 .iter()
5164 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
5165 assert!(names.contains(&"handle_request"));
5166 assert!(names.contains(&"AuthService"));
5167 assert!(
5168 !names.contains(&"doc heading"),
5169 "Heading symbol leaked into chunks: {names:?}"
5170 );
5171 }
5172
5173 #[test]
5174 fn validate_ssrf_allows_loopback_hostnames() {
5175 for host in &[
5178 "http://localhost",
5179 "http://localhost:8080",
5180 "http://localhost:11434", "http://localhost.localdomain",
5182 "http://foo.localhost",
5183 ] {
5184 assert!(
5185 validate_base_url_no_ssrf(host).is_ok(),
5186 "Expected {host} to be allowed (loopback), got: {:?}",
5187 validate_base_url_no_ssrf(host)
5188 );
5189 }
5190 }
5191
5192 #[test]
5193 fn validate_ssrf_allows_loopback_ips() {
5194 for url in &[
5197 "http://127.0.0.1",
5198 "http://127.0.0.1:11434", "http://127.0.0.1:8080",
5200 "http://127.1.2.3",
5201 ] {
5202 let result = validate_base_url_no_ssrf(url);
5203 assert!(
5204 result.is_ok(),
5205 "Expected {url} to be allowed (loopback), got: {:?}",
5206 result
5207 );
5208 }
5209 }
5210
5211 #[test]
5212 fn validate_ssrf_rejects_private_non_loopback_ips() {
5213 for url in &[
5218 "http://192.168.1.1",
5219 "http://10.0.0.1",
5220 "http://172.16.0.1",
5221 "http://169.254.169.254",
5222 "http://100.64.0.1",
5223 ] {
5224 let result = validate_base_url_no_ssrf(url);
5225 assert!(
5226 result.is_err(),
5227 "Expected {url} to be rejected (non-loopback private), got: {:?}",
5228 result
5229 );
5230 }
5231 }
5232
5233 #[test]
5234 fn validate_ssrf_rejects_mdns_local_hostnames() {
5235 for host in &[
5238 "http://printer.local",
5239 "http://nas.local:8080",
5240 "http://homelab.local",
5241 ] {
5242 let result = validate_base_url_no_ssrf(host);
5243 assert!(
5244 result.is_err(),
5245 "Expected {host} to be rejected (mDNS), got: {:?}",
5246 result
5247 );
5248 }
5249 }
5250
5251 #[test]
5252 fn normalize_base_url_allows_localhost_for_tests() {
5253 assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
5256 assert!(normalize_base_url("http://localhost:8080").is_ok());
5257 }
5258
5259 #[test]
5260 fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
5261 use std::net::IpAddr;
5262 let blocked = |s: &str| is_private_non_loopback_ip(&s.parse::<IpAddr>().unwrap());
5263
5264 assert!(blocked("10.0.0.1"));
5266 assert!(blocked("192.168.1.1"));
5267 assert!(blocked("169.254.0.1"));
5268 assert!(blocked("100.64.0.1"));
5269 assert!(
5271 blocked("198.18.0.1"),
5272 "RFC2544 benchmark range must be blocked"
5273 );
5274 assert!(blocked("224.0.0.1"), "multicast must be blocked");
5275 assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
5276 assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");
5277
5278 assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
5280 assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
5281 assert!(
5282 !blocked("::ffff:127.0.0.1"),
5283 "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
5284 );
5285
5286 assert!(!blocked("8.8.8.8"));
5288 }
5289
5290 #[test]
5297 fn ort_mismatch_message_recommends_auto_fix_first() {
5298 let msg =
5299 format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
5300
5301 assert!(
5303 msg.contains("v1.9.0"),
5304 "should report detected version: {msg}"
5305 );
5306 assert!(
5307 msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
5308 "should report system path: {msg}"
5309 );
5310 assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
5311
5312 let auto_fix_pos = msg
5314 .find("Auto-fix")
5315 .expect("Auto-fix solution missing — users won't discover --fix");
5316 let remove_pos = msg
5317 .find("Remove the old library")
5318 .expect("system-rm solution missing");
5319 assert!(
5320 auto_fix_pos < remove_pos,
5321 "Auto-fix must come before manual rm — see PR comment thread"
5322 );
5323
5324 assert!(
5326 msg.contains("npx @cortexkit/aft doctor --fix"),
5327 "auto-fix command must be present and copy-pasteable: {msg}"
5328 );
5329 }
5330
5331 #[cfg(any(target_os = "linux", target_os = "macos"))]
5332 #[test]
5333 fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
5334 let requested = "libonnxruntime.so";
5335 let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";
5336
5337 assert_eq!(detect_ort_version_from_path(requested), None);
5338 let (version, source) =
5339 detect_ort_version_from_resolved_or_requested(Some(actual.to_string()), requested);
5340
5341 assert_eq!(version, Some("1.19.0".to_string()));
5342 assert_eq!(source, actual);
5343
5344 let msg = format_ort_version_mismatch(&version.unwrap(), &source);
5345 assert!(msg.contains("v1.19.0"));
5346 assert!(msg.contains(actual));
5347 }
5348
5349 #[test]
5353 fn ort_mismatch_message_handles_macos_dylib_path() {
5354 let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
5355 assert!(msg.contains("v1.9.0"));
5356 assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
5357 assert!(
5361 msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
5362 "system path should be quoted in the auto-fix sentence: {msg}"
5363 );
5364 }
5365}