1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
/// Default embedding vector length.
/// NOTE(review): not referenced in the visible part of this file — confirm usage.
const DEFAULT_DIMENSION: usize = 384;
/// NOTE(review): presumably an upper bound on index entries — confirm at the use site.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size of one `f32` in bytes.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// NOTE(review): presumably the byte length of a version-1 header — confirm.
const HEADER_BYTES_V1: usize = 9;
/// NOTE(review): presumably the byte length of a version-2 header — confirm.
const HEADER_BYTES_V2: usize = 13;
/// User-facing install hint that `format_embedding_init_error` prepends when
/// the failure looks like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags for the semantic index format.
// NOTE(review): what changed between versions is not visible here.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is 0. Despite the
/// name, `SemanticEmbeddingModel::from_config` applies it to every backend.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Entry count at which `embed_query_cached` evicts the oldest cached query.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// usable base URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before retry N, indexed by the zero-based index of the failed attempt.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it takes the
/// file lock, so threads in this process attempt acquisition one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
63
/// Guard for the semantic cache lock file of one project.
///
/// Wraps the `fs_lock::LockGuard` returned by `fs_lock::try_acquire`.
/// NOTE(review): release-on-drop is presumably implemented by `LockGuard` —
/// confirm in `fs_lock`.
pub struct SemanticIndexLock {
    // Held only for its lifetime; never read.
    _guard: fs_lock::LockGuard,
}
67
68impl SemanticIndexLock {
69 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
70 let dir = storage_dir.join("semantic").join(project_key);
71 fs::create_dir_all(&dir)?;
72 let path = dir.join("cache.lock");
73 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
74 .lock()
75 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
76 fs_lock::try_acquire(&path, Duration::from_secs(2))
77 .map(|guard| Self { _guard: guard })
78 .map_err(|error| match error {
79 fs_lock::AcquireError::Timeout => {
80 std::io::Error::other("timed out acquiring semantic cache lock")
81 }
82 fs_lock::AcquireError::Io(error) => error,
83 })
84 }
85}
86
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared as text by
/// `matches_expected`, so field order and names are part of the encoding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend name as returned by `SemanticBackend::as_str`.
    pub backend: String,
    /// Model name copied from the backend config.
    pub model: String,
    /// Normalized base URL, or `FALLBACK_BACKEND` when none is usable.
    /// Defaults to an empty string when absent from serialized data.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; defaults to `default_chunking_version()` when
    /// absent from serialized data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}

/// Chunking version stamped on new fingerprints and assumed for serialized
/// fingerprints that lack the field.
fn default_chunking_version() -> u32 {
    2
}
101
102impl SemanticIndexFingerprint {
103 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
104 let base_url = config
107 .base_url
108 .as_ref()
109 .and_then(|u| normalize_base_url(u).ok())
110 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
111 Self {
112 backend: config.backend.as_str().to_string(),
113 model: config.model.clone(),
114 base_url,
115 dimension,
116 chunking_version: default_chunking_version(),
117 }
118 }
119
120 pub fn as_string(&self) -> String {
121 serde_json::to_string(self).unwrap_or_else(|_| String::new())
122 }
123
124 fn matches_expected(&self, expected: &str) -> bool {
125 let encoded = self.as_string();
126 !encoded.is_empty() && encoded == expected
127 }
128}
129
/// Concrete embedding backend held by a `SemanticEmbeddingModel`.
enum SemanticEmbeddingEngine {
    /// In-process embedder (built for `SemanticBackend::Fastembed`).
    Local(LocalEmbedder),
    /// HTTP endpoint that accepts `{"input", "model"}` and answers with a
    /// `data[].embedding` array.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
        /// Bearer token, when an API-key environment variable was configured.
        api_key: Option<String>,
    },
    /// Ollama HTTP endpoint that answers with an `embeddings` array.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
    },
}
146
/// Embedding model plus its resolved settings and a small query cache.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout after applying the default for 0.
    timeout_ms: u64,
    /// Effective batch size after applying the default for 0.
    max_batch_size: usize,
    /// Embedding dimension once known; `None` until detected.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
162
163fn validate_embedding_batch(
164 vectors: &[Vec<f32>],
165 expected_count: usize,
166 context: &str,
167) -> Result<(), String> {
168 if expected_count > 0 && vectors.is_empty() {
169 return Err(format!(
170 "{context} returned no vectors for {expected_count} inputs"
171 ));
172 }
173
174 if vectors.len() != expected_count {
175 return Err(format!(
176 "{context} returned {} vectors for {} inputs",
177 vectors.len(),
178 expected_count
179 ));
180 }
181
182 let Some(first_vector) = vectors.first() else {
183 return Ok(());
184 };
185 let expected_dimension = first_vector.len();
186 validate_embedding_dimension(expected_dimension)
187 .map_err(|error| format!("{context} returned {error}"))?;
188 for (index, vector) in vectors.iter().enumerate() {
189 if vector.len() != expected_dimension {
190 return Err(format!(
191 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
192 vector.len()
193 ));
194 }
195 }
196
197 Ok(())
198}
199
200fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
201 if dimension == 0 || dimension > MAX_DIMENSION {
202 return Err(format!(
203 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
204 ));
205 }
206
207 Ok(())
208}
209
210fn normalize_base_url(raw: &str) -> Result<String, String> {
214 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
215 let scheme = parsed.scheme();
216 if scheme != "http" && scheme != "https" {
217 return Err(format!(
218 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
219 scheme
220 ));
221 }
222 Ok(parsed.to_string().trim_end_matches('/').to_string())
223}
224
225pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
240 use std::net::{IpAddr, ToSocketAddrs};
241
242 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
243
244 let host = parsed.host_str().unwrap_or("");
245
246 let is_loopback_host =
251 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
252 if is_loopback_host {
253 return Ok(());
254 }
255
256 if host.ends_with(".local") {
259 return Err(format!(
260 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
261 ));
262 }
263
264 let port = parsed.port_or_known_default().unwrap_or(443);
267 let addr_str = format!("{host}:{port}");
268 let addrs: Vec<IpAddr> = addr_str
269 .to_socket_addrs()
270 .map(|iter| iter.map(|sa| sa.ip()).collect())
271 .unwrap_or_default();
272 for ip in &addrs {
273 if is_private_non_loopback_ip(ip) {
274 return Err(format!(
275 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
276 ));
277 }
278 }
279
280 Ok(())
281}
282
283fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
294 if ip.to_canonical().is_loopback() {
297 return false;
298 }
299 crate::url_fetch::is_private_or_reserved_ip(*ip)
300}
301
302fn build_openai_embeddings_endpoint(base_url: &str) -> String {
303 if base_url.ends_with("/v1") {
304 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
305 } else {
306 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
307 }
308}
309
310fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
311 if base_url.ends_with("/api") {
312 format!("{base_url}/embed")
313 } else {
314 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
315 }
316}
317
/// Trims surrounding whitespace from an optional string and maps an empty
/// result to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|trimmed| !trimmed.is_empty())
}
328
329fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
330 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
331}
332
333fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
339 if !matches!(
340 status,
341 reqwest::StatusCode::BAD_REQUEST
342 | reqwest::StatusCode::CONFLICT
343 | reqwest::StatusCode::REQUEST_TIMEOUT
344 | reqwest::StatusCode::LOCKED
345 | reqwest::StatusCode::TOO_EARLY
346 ) {
347 return false;
348 }
349
350 let lower = raw.to_ascii_lowercase();
351 let normalized = lower.trim();
352
353 normalized.contains("model was unloaded while the request was still in queue")
354 || normalized == "model is loading"
355 || normalized.starts_with("model is loading,")
356 || normalized.contains(r#""error":"model is loading"#)
357 || normalized.contains(r#""message":"model is loading"#)
358 || normalized == "model not loaded"
359 || normalized.contains(r#""error":"model not loaded""#)
360 || normalized.contains(r#""message":"model not loaded""#)
361 || normalized == "loading model into memory"
362 || normalized.contains(r#""error":"loading model into memory""#)
363 || normalized.contains(r#""message":"loading model into memory""#)
364 || normalized == "model is being loaded"
365 || normalized.contains(r#""error":"model is being loaded""#)
366 || normalized.contains(r#""message":"model is being loaded""#)
367 || normalized == "model is currently loading"
368 || normalized.contains(r#""error":"model is currently loading""#)
369 || normalized.contains(r#""message":"model is currently loading""#)
370}
371
372fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
373 error.is_connect()
374}
375
376fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
382 error.is_connect() || error.is_timeout()
383}
384
385fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
386 embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
387}
388
/// Prefix that `send_embedding_request` puts on error messages for failures it
/// classifies as transient.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// True when `error` contains [`TRANSIENT_EMBEDDING_MARKER`] anywhere.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    error.find(TRANSIENT_EMBEDDING_MARKER).is_some()
}

/// Returns `error` with every occurrence of [`TRANSIENT_EMBEDDING_MARKER`]
/// removed.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
407
408fn sleep_before_embedding_retry(attempt_index: usize) {
409 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
410 std::thread::sleep(Duration::from_millis(*delay_ms));
411 }
412}
413
414fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
415where
416 F: FnMut() -> reqwest::blocking::RequestBuilder,
417{
418 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
419 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
420
421 let response = match make_request().send() {
422 Ok(response) => response,
423 Err(error) => {
424 if !last_attempt && is_retryable_embedding_error(&error) {
425 sleep_before_embedding_retry(attempt_index);
426 continue;
427 }
428 let marker = if embedding_send_error_is_transient(&error) {
432 TRANSIENT_EMBEDDING_MARKER
433 } else {
434 ""
435 };
436 return Err(format!("{marker}{backend_label} request failed: {error}"));
437 }
438 };
439
440 let status = response.status();
441 let raw = match response.text() {
442 Ok(raw) => raw,
443 Err(error) => {
444 if !last_attempt && embedding_response_read_error_is_transient(&error) {
445 sleep_before_embedding_retry(attempt_index);
446 continue;
447 }
448 let marker = if embedding_response_read_error_is_transient(&error) {
449 TRANSIENT_EMBEDDING_MARKER
450 } else {
451 ""
452 };
453 return Err(format!(
454 "{marker}{backend_label} response read failed: {error}"
455 ));
456 }
457 };
458
459 if status.is_success() {
460 return Ok(raw);
461 }
462
463 let body_transient = embedding_response_body_is_transient(status, &raw);
467 if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
468 sleep_before_embedding_retry(attempt_index);
469 continue;
470 }
471
472 let marker = if is_retryable_embedding_status(status) || body_transient {
478 TRANSIENT_EMBEDDING_MARKER
479 } else {
480 ""
481 };
482 return Err(format!(
483 "{marker}{backend_label} request failed (HTTP {}): {}",
484 status, raw
485 ));
486 }
487
488 unreachable!("embedding request retries exhausted without returning")
489}
490
491impl SemanticEmbeddingModel {
492 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
493 let timeout_ms = if config.timeout_ms == 0 {
494 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
495 } else {
496 config.timeout_ms
497 };
498
499 let max_batch_size = if config.max_batch_size == 0 {
500 DEFAULT_MAX_BATCH_SIZE
501 } else {
502 config.max_batch_size
503 };
504
505 let api_key_env = normalize_api_key(config.api_key_env.clone());
506 let model = config.model.clone();
507
508 let client = Client::builder()
509 .timeout(Duration::from_millis(timeout_ms))
510 .redirect(reqwest::redirect::Policy::none())
511 .build()
512 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
513
514 let engine = match config.backend {
515 SemanticBackend::Fastembed => {
516 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
517 }
518 SemanticBackend::OpenAiCompatible => {
519 let raw = config.base_url.as_ref().ok_or_else(|| {
520 "base_url is required for openai_compatible backend".to_string()
521 })?;
522 let base_url = normalize_base_url(raw)?;
523
524 let api_key = match api_key_env {
525 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
526 format!("missing api_key_env '{var_name}' for openai_compatible backend")
527 })?),
528 None => None,
529 };
530
531 SemanticEmbeddingEngine::OpenAiCompatible {
532 client,
533 model,
534 base_url,
535 api_key,
536 }
537 }
538 SemanticBackend::Ollama => {
539 let raw = config
540 .base_url
541 .as_ref()
542 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
543 let base_url = normalize_base_url(raw)?;
544
545 SemanticEmbeddingEngine::Ollama {
546 client,
547 model,
548 base_url,
549 }
550 }
551 };
552
553 Ok(Self {
554 backend: config.backend,
555 model: config.model.clone(),
556 base_url: config.base_url.clone(),
557 timeout_ms,
558 max_batch_size,
559 dimension: None,
560 engine,
561 query_embedding_cache: HashMap::new(),
562 query_embedding_cache_order: VecDeque::new(),
563 query_embedding_cache_hits: 0,
564 query_embedding_cache_misses: 0,
565 })
566 }
567
568 pub fn backend(&self) -> SemanticBackend {
569 self.backend
570 }
571
572 pub fn model(&self) -> &str {
573 &self.model
574 }
575
576 pub fn base_url(&self) -> Option<&str> {
577 self.base_url.as_deref()
578 }
579
580 pub fn max_batch_size(&self) -> usize {
581 self.max_batch_size
582 }
583
584 pub fn timeout_ms(&self) -> u64 {
585 self.timeout_ms
586 }
587
588 pub fn fingerprint(
589 &mut self,
590 config: &SemanticBackendConfig,
591 ) -> Result<SemanticIndexFingerprint, String> {
592 let dimension = self.dimension()?;
593 Ok(SemanticIndexFingerprint::from_config(config, dimension))
594 }
595
596 pub fn dimension(&mut self) -> Result<usize, String> {
597 if let Some(dimension) = self.dimension {
598 return Ok(dimension);
599 }
600
601 let dimension = match &mut self.engine {
602 SemanticEmbeddingEngine::Local(model) => {
603 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
604 vectors
605 .first()
606 .map(|v| v.len())
607 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
608 }
609 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
610 let vectors =
611 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
612 vectors
613 .first()
614 .map(|v| v.len())
615 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
616 }
617 SemanticEmbeddingEngine::Ollama { .. } => {
618 let vectors =
619 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
620 vectors
621 .first()
622 .map(|v| v.len())
623 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
624 }
625 };
626
627 self.dimension = Some(dimension);
628 Ok(dimension)
629 }
630
631 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
632 self.embed_texts(texts)
633 }
634
635 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
636 if let Some(vector) = self.query_embedding_cache.get(query) {
637 self.query_embedding_cache_hits += 1;
638 return Ok(vector.clone());
639 }
640
641 self.query_embedding_cache_misses += 1;
642 let embeddings = self.embed_texts(vec![query.to_string()])?;
643 let vector = embeddings
644 .first()
645 .cloned()
646 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
647
648 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
649 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
650 self.query_embedding_cache.remove(&oldest);
651 }
652 }
653 self.query_embedding_cache
654 .insert(query.to_string(), vector.clone());
655 self.query_embedding_cache_order
656 .push_back(query.to_string());
657
658 Ok(vector)
659 }
660
661 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
662 (
663 self.query_embedding_cache_hits,
664 self.query_embedding_cache_misses,
665 self.query_embedding_cache.len(),
666 )
667 }
668
669 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
670 match &mut self.engine {
671 SemanticEmbeddingEngine::Local(model) => model
672 .embed(&texts)
673 .map_err(|error| format!("failed to embed batch: {error}")),
674 SemanticEmbeddingEngine::OpenAiCompatible {
675 client,
676 model,
677 base_url,
678 api_key,
679 } => {
680 let expected_text_count = texts.len();
681 let endpoint = build_openai_embeddings_endpoint(base_url);
682 let body = serde_json::json!({
683 "input": texts,
684 "model": model,
685 });
686
687 let raw = send_embedding_request(
688 || {
689 let mut request = client.post(&endpoint).json(&body);
699
700 if let Some(api_key) = api_key {
701 request = request.header("Authorization", format!("Bearer {api_key}"));
702 }
703
704 request
705 },
706 "openai compatible",
707 )?;
708
709 #[derive(Deserialize)]
710 struct OpenAiResponse {
711 data: Vec<OpenAiEmbeddingResult>,
712 }
713
714 #[derive(Deserialize)]
715 struct OpenAiEmbeddingResult {
716 embedding: Vec<f32>,
717 index: Option<u32>,
718 }
719
720 let parsed: OpenAiResponse = serde_json::from_str(&raw)
721 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
722 if parsed.data.len() != expected_text_count {
723 return Err(format!(
724 "openai compatible response returned {} embeddings for {} inputs",
725 parsed.data.len(),
726 expected_text_count
727 ));
728 }
729
730 let mut vectors = vec![Vec::new(); parsed.data.len()];
731 for (i, item) in parsed.data.into_iter().enumerate() {
732 let index = item.index.unwrap_or(i as u32) as usize;
733 if index >= vectors.len() {
734 return Err(
735 "openai compatible response contains invalid vector index".to_string()
736 );
737 }
738 vectors[index] = item.embedding;
739 }
740
741 for vector in &vectors {
742 if vector.is_empty() {
743 return Err(
744 "openai compatible response contained missing vectors".to_string()
745 );
746 }
747 }
748
749 self.dimension = vectors.first().map(Vec::len);
750 Ok(vectors)
751 }
752 SemanticEmbeddingEngine::Ollama {
753 client,
754 model,
755 base_url,
756 } => {
757 let expected_text_count = texts.len();
758 let endpoint = build_ollama_embeddings_endpoint(base_url);
759
760 #[derive(Serialize)]
761 struct OllamaPayload<'a> {
762 model: &'a str,
763 input: Vec<String>,
764 }
765
766 let payload = OllamaPayload {
767 model,
768 input: texts,
769 };
770
771 let raw = send_embedding_request(
772 || {
773 client.post(&endpoint).json(&payload)
778 },
779 "ollama",
780 )?;
781
782 #[derive(Deserialize)]
783 struct OllamaResponse {
784 embeddings: Vec<Vec<f32>>,
785 }
786
787 let parsed: OllamaResponse = serde_json::from_str(&raw)
788 .map_err(|error| format!("invalid ollama response: {error}"))?;
789 if parsed.embeddings.is_empty() {
790 return Err("ollama response returned no embeddings".to_string());
791 }
792 if parsed.embeddings.len() != expected_text_count {
793 return Err(format!(
794 "ollama response returned {} embeddings for {} inputs",
795 parsed.embeddings.len(),
796 expected_text_count
797 ));
798 }
799
800 let vectors = parsed.embeddings;
801 for vector in &vectors {
802 if vector.is_empty() {
803 return Err("ollama response contained empty embeddings".to_string());
804 }
805 }
806
807 self.dimension = vectors.first().map(Vec::len);
808 Ok(vectors)
809 }
810 }
811 }
812}
813
/// Checks that an ONNX Runtime shared library can be loaded and, when its
/// version can be determined, that the version is 1.x with minor >= 20.
///
/// The library is taken from `ORT_DYLIB_PATH` when set, otherwise the
/// platform default name is used. The library is loaded only for this check
/// and released again before returning.
///
/// * Linux/macOS: loads with `dlopen` and derives the version via
///   `detect_ort_version_from_loaded_library`.
/// * Windows: loads with `LoadLibraryExW` and reads the file version from the
///   module's version resource.
/// * Other targets: no check is performed.
///
/// An undetectable or unparseable version is not treated as an error.
///
/// # Errors
/// Returns a user-facing message when the library cannot be loaded or when a
/// detected version is outside the supported range.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a valid NUL-terminated string that outlives the
        // `dlopen` call; `dlerror`'s result is checked for null before it is
        // read; `handle` is non-null wherever it is used and closed once.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Must run while the library is still loaded: the helper resolves
            // the on-disk path from the live handle.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                // Only major and minor are compared; a version that does not
                // parse is skipped rather than rejected.
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local `#[repr(C)]` declaration of the fixed file-info record that is
        // read through the pointer returned by `VerQueryValueW`.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32,
            dw_file_version_ls: u32,
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: `wide` and `sub_block` are NUL-terminated UTF-16 buffers
        // that outlive the calls using them; `path_buf` and `info` are sized
        // to the lengths passed to the API; `vs_info` is dereferenced only
        // after `VerQueryValueW` succeeded and the pointer is non-null.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // 0 means "version unknown"; the check below is skipped then.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" selects the root block (the fixed file info).
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High word = major, low word = minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1004
/// Resolves the on-disk path of a loaded ONNX Runtime library.
///
/// Looks up the `OrtGetApiBase` symbol in `handle` and asks `dladdr` which
/// file provides it. Returns `None` when the symbol is missing, `dladdr`
/// fails, or no file name is reported.
///
/// # Safety
/// `handle` must be a live handle obtained from `dlopen` that has not been
/// closed yet.
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    // SAFETY: the caller guarantees `handle` is valid; `symbol_name` is a
    // NUL-terminated string that outlives the call.
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    // SAFETY: `symbol` is non-null and `info` points to writable storage.
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    // SAFETY: `dladdr` returned non-zero, so `info` has been filled in.
    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    Some(
        // SAFETY: `dli_fname` was checked to be non-null just above.
        // NOTE(review): assumes it points to a NUL-terminated string that
        // stays valid while the library is loaded — confirm against dladdr(3).
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1029
1030#[cfg(any(target_os = "linux", target_os = "macos"))]
1031fn detect_ort_version_from_resolved_or_requested(
1032 resolved_path: Option<String>,
1033 requested_lib_name: &str,
1034) -> (Option<String>, String) {
1035 if let Some(path) = resolved_path {
1036 if let Some(version) = detect_ort_version_from_path(&path) {
1037 return (Some(version), path);
1038 }
1039 return (detect_ort_version_from_path(requested_lib_name), path);
1040 }
1041
1042 (
1043 detect_ort_version_from_path(requested_lib_name),
1044 requested_lib_name.to_string(),
1045 )
1046}
1047
1048#[cfg(any(target_os = "linux", target_os = "macos"))]
1049fn detect_ort_version_from_loaded_library(
1050 handle: *mut std::ffi::c_void,
1051 requested_lib_name: &str,
1052) -> (Option<String>, String) {
1053 detect_ort_version_from_resolved_or_requested(
1054 unsafe { loaded_library_path_from_handle(handle) },
1055 requested_lib_name,
1056 )
1057}
1058
1059#[cfg(any(target_os = "linux", target_os = "macos"))]
1062fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1063 let path = std::path::Path::new(lib_path);
1064
1065 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1067 .into_iter()
1068 .flatten()
1069 {
1070 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1071 if let Some(version) = extract_version_from_filename(name) {
1072 return Some(version);
1073 }
1074 }
1075 }
1076
1077 if let Some(parent) = path.parent() {
1079 if let Ok(entries) = std::fs::read_dir(parent) {
1080 for entry in entries.flatten() {
1081 if let Some(name) = entry.file_name().to_str() {
1082 if name.starts_with("libonnxruntime") {
1083 if let Some(version) = extract_version_from_filename(name) {
1084 return Some(version);
1085 }
1086 }
1087 }
1088 }
1089 }
1090 }
1091
1092 None
1093}
1094
1095#[cfg(any(target_os = "linux", target_os = "macos"))]
1097fn extract_version_from_filename(name: &str) -> Option<String> {
1098 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1100 re.find(name).map(|m| m.as_str().to_string())
1101}
1102
/// Shell command suggested to the user for removing an outdated library.
///
/// For a library under `/usr/local/lib`, or one given only by its default
/// name, Linux and macOS get a `sudo rm` over the whole
/// `/usr/local/lib/libonnxruntime*` family (plus `ldconfig` on Linux). Every
/// other case gets a plain `rm` of the quoted path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_wide_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
1115
1116pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1122 format!(
1123 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1124 Solutions:\n\
1125 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1126 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1127 configures the bridge to load it instead of the system library — no \
1128 changes to '{}'.\n\
1129 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1130 {}\n\
1131 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1132 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1133 version,
1134 lib_name,
1135 lib_name,
1136 suggest_removal_command(lib_name),
1137 )
1138}
1139
/// Heuristically decides whether an error message means the ONNX Runtime
/// shared library could not be loaded.
///
/// True when the message (ignoring leading whitespace) starts with
/// `"ONNX Runtime not found."`, or when, case-insensitively, it both names
/// the runtime and contains a dynamic-loading failure phrase.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1165
1166pub fn format_embedding_init_error(error: impl Display) -> String {
1167 let message = error.to_string();
1168
1169 if is_onnx_runtime_unavailable(&message) {
1170 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1171 }
1172
1173 format!("failed to initialize semantic embedding model: {message}")
1174}
1175
/// One indexed unit of source (a symbol) together with its text.
/// NOTE(review): field meanings below are read from names and types — confirm
/// at the construction site, which is outside this view.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the chunk (base not visible here).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Presumably the text passed to the embedding model — confirm.
    pub embed_text: String,
    /// Presumably the excerpt shown in results — confirm.
    pub snippet: String,
}
1195
/// A chunk paired with its embedding vector.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
1202
/// In-memory semantic index: embedded chunks plus per-file bookkeeping.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// All embedded chunks; `entry_count` reports its length.
    entries: Vec<EmbeddingEntry>,
    /// Per-file modification time; `indexed_file_count` reports its length.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Per-file size in bytes.
    file_sizes: HashMap<PathBuf, u64>,
    /// Per-file BLAKE3 hash (presumably of file content — confirm).
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding vector length for this index.
    dimension: usize,
    /// Fingerprint of the embedding setup; `None` for a fresh index.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; `new` debug-asserts that it is absolute.
    project_root: PathBuf,
    /// NOTE(review): presumably files whose indexing was postponed — confirm
    /// where it is populated.
    deferred_files: HashSet<PathBuf>,
}
1218
/// Metadata recorded for one file: modification time, size, and a BLAKE3
/// hash.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}
1225
#[derive(Debug, Default, Clone, Copy)]
/// Counters describing what an incremental refresh did to the index.
pub struct RefreshSummary {
    /// Previously indexed files that were re-collected.
    pub changed: usize,
    /// Files that were not indexed before and are now.
    pub added: usize,
    /// Files dropped from the index.
    pub deleted: usize,
    /// Number of distinct paths the refresh looked at.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when the refresh changed, added and deleted nothing.
    ///
    /// `total_processed` is deliberately not consulted: a refresh that
    /// examined files but found them all fresh is still a no-op.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1242
/// Result of `SemanticIndex::refresh_invalidated_files`.
///
/// The three data fields have the same shapes as the arguments of
/// `SemanticIndex::apply_refresh_update` (presumably so the outcome can be
/// replayed onto another copy of the index — confirm against callers).
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Copies of the entries that were appended to the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh metadata for every file that was successfully re-collected.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh considered (requested plus previously deferred).
    pub completed_paths: Vec<PathBuf>,
    /// Counters describing the refresh.
    pub summary: RefreshSummary,
}
1253
/// A cached vector together with the exact text it was computed from.
///
/// The text is kept so a hash match can be confirmed by full string equality
/// before the vector is reused.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Text that produced `vector`.
    embed_text: String,
    /// Previously computed embedding.
    vector: Vec<f32>,
}
1259
/// Reusable embeddings grouped by file, then by the BLAKE3 hash of their embed text.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1261
#[derive(Debug, Clone)]
/// A single hit returned by `SemanticIndex::search`.
pub struct SemanticResult {
    /// File containing the matched chunk.
    pub file: PathBuf,
    /// Name of the matched symbol.
    pub name: String,
    /// Kind of the matched symbol.
    pub kind: SymbolKind,
    /// First line of the matched chunk.
    pub start_line: u32,
    /// Last line of the matched chunk.
    pub end_line: u32,
    /// Whether the matched symbol is exported.
    pub exported: bool,
    /// Display text of the matched chunk.
    pub snippet: String,
    /// Similarity score; for exported symbols this already includes the 1.1x boost.
    pub score: f32,
    /// Origin of the hit; `search` always sets this to `"semantic"`.
    pub source: &'static str,
}
1275
1276impl SemanticIndex {
1277 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1278 debug_assert!(project_root.is_absolute());
1279 Self {
1280 entries: Vec::new(),
1281 file_mtimes: HashMap::new(),
1282 file_sizes: HashMap::new(),
1283 file_hashes: HashMap::new(),
1284 dimension,
1285 fingerprint: None,
1286 project_root,
1287 deferred_files: HashSet::new(),
1288 }
1289 }
1290
    /// Number of embedded chunks currently held in the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1295
    /// Number of files with recorded freshness metadata (the size of the mtime map).
    ///
    /// A file counts even when it produced no chunks.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1300
1301 pub fn status_label(&self) -> &'static str {
1303 if self.entries.is_empty() {
1304 "empty"
1305 } else {
1306 "ready"
1307 }
1308 }
1309
1310 fn collect_chunks(
1311 project_root: &Path,
1312 files: &[PathBuf],
1313 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1314 let collect_started = std::time::Instant::now();
1315 let per_file: Vec<(
1316 PathBuf,
1317 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1318 )> = files
1319 .par_iter()
1320 .map_init(HashMap::new, |parsers, file| {
1321 let result = collect_file_metadata(file).and_then(|metadata| {
1322 collect_file_chunks(project_root, file, parsers)
1323 .map(|chunks| (metadata, chunks))
1324 });
1325 (file.clone(), result)
1326 })
1327 .collect();
1328
1329 let mut chunks: Vec<SemanticChunk> = Vec::new();
1330 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1331
1332 for (file, result) in per_file {
1333 match result {
1334 Ok((metadata, file_chunks)) => {
1335 file_metadata.insert(file, metadata);
1336 chunks.extend(file_chunks);
1337 }
1338 Err(error) => {
1339 if error == "unsupported file extension" {
1345 continue;
1346 }
1347 slog_warn!(
1348 "failed to collect semantic chunks for {}: {}",
1349 file.display(),
1350 error
1351 );
1352 }
1353 }
1354 }
1355
1356 slog_info!(
1357 "semantic collect: {} chunks from {} files in {} ms",
1358 chunks.len(),
1359 file_metadata.len(),
1360 collect_started.elapsed().as_millis()
1361 );
1362
1363 (chunks, file_metadata)
1364 }
1365
1366 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1367 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1368 let mut reuse_map: ChunkReuseMap = HashMap::new();
1369
1370 for entry in &self.entries {
1371 if !requested.contains(entry.chunk.file.as_path()) {
1372 continue;
1373 }
1374
1375 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1380 reuse_map
1381 .entry(entry.chunk.file.clone())
1382 .or_default()
1383 .entry(hash)
1384 .or_default()
1385 .push(ReusableEmbedding {
1386 embed_text: entry.chunk.embed_text.clone(),
1387 vector: entry.vector.clone(),
1388 });
1389 }
1390
1391 reuse_map
1392 }
1393
1394 fn reusable_vector_for_chunk(
1395 reuse_map: &ChunkReuseMap,
1396 chunk: &SemanticChunk,
1397 ) -> Option<Vec<f32>> {
1398 let hash = blake3::hash(chunk.embed_text.as_bytes());
1399 reuse_map
1400 .get(&chunk.file)?
1401 .get(&hash)?
1402 .iter()
1403 .find(|candidate| candidate.embed_text == chunk.embed_text)
1404 .map(|candidate| candidate.vector.clone())
1405 }
1406
1407 fn entries_for_chunks_with_reuse<F, P>(
1408 chunks: Vec<SemanticChunk>,
1409 reuse_map: &ChunkReuseMap,
1410 embed_fn: &mut F,
1411 max_batch_size: usize,
1412 initial_observed_dimension: Option<usize>,
1413 refresh_label: &str,
1414 progress: &mut P,
1415 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1416 where
1417 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1418 P: FnMut(usize, usize),
1419 {
1420 let total_chunks = chunks.len();
1421 progress(0, total_chunks);
1422
1423 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1424 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1425
1426 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1427 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1428 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1429 } else {
1430 misses.push((chunk_index, chunk));
1431 }
1432 }
1433
1434 let mut completed = total_chunks.saturating_sub(misses.len());
1435 if completed > 0 {
1436 progress(completed, total_chunks);
1437 }
1438
1439 let batch_size = max_batch_size.max(1);
1440 let mut observed_dimension = initial_observed_dimension;
1441
1442 for batch_start in (0..misses.len()).step_by(batch_size) {
1443 let batch_end = (batch_start + batch_size).min(misses.len());
1444 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1445 .iter()
1446 .map(|(_, chunk)| chunk.embed_text.clone())
1447 .collect();
1448
1449 let vectors = embed_fn(batch_texts)?;
1450 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1451
1452 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1453 match observed_dimension {
1454 None => observed_dimension = Some(dim),
1455 Some(expected) if dim != expected => {
1456 return Err(format!(
1457 "embedding dimension changed during {refresh_label}: \
1458 cached index uses {expected}, new vectors use {dim}"
1459 ));
1460 }
1461 _ => {}
1462 }
1463 }
1464
1465 for (i, vector) in vectors.into_iter().enumerate() {
1466 let (chunk_index, chunk) = misses[batch_start + i].clone();
1467 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1468 }
1469
1470 completed += batch_end - batch_start;
1471 progress(completed, total_chunks);
1472 }
1473
1474 let entries = entries_by_chunk
1475 .into_iter()
1476 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1477 .collect();
1478
1479 Ok((entries, observed_dimension))
1480 }
1481
1482 fn build_from_chunks<F, P>(
1483 project_root: &Path,
1484 chunks: Vec<SemanticChunk>,
1485 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1486 embed_fn: &mut F,
1487 max_batch_size: usize,
1488 mut progress: Option<&mut P>,
1489 ) -> Result<Self, String>
1490 where
1491 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1492 P: FnMut(usize, usize),
1493 {
1494 debug_assert!(project_root.is_absolute());
1495 let total_chunks = chunks.len();
1496
1497 if chunks.is_empty() {
1498 return Ok(Self {
1499 entries: Vec::new(),
1500 file_mtimes: file_metadata
1501 .iter()
1502 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1503 .collect(),
1504 file_sizes: file_metadata
1505 .iter()
1506 .map(|(path, metadata)| (path.clone(), metadata.size))
1507 .collect(),
1508 file_hashes: file_metadata
1509 .into_iter()
1510 .map(|(path, metadata)| (path, metadata.content_hash))
1511 .collect(),
1512 dimension: DEFAULT_DIMENSION,
1513 fingerprint: None,
1514 project_root: project_root.to_path_buf(),
1515 deferred_files: HashSet::new(),
1516 });
1517 }
1518
1519 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1521 let mut expected_dimension: Option<usize> = None;
1522 let batch_size = max_batch_size.max(1);
1523 let embed_started = std::time::Instant::now();
1524 let batch_count = total_chunks.div_ceil(batch_size);
1525 for batch_start in (0..chunks.len()).step_by(batch_size) {
1526 let batch_end = (batch_start + batch_size).min(chunks.len());
1527 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1528 .iter()
1529 .map(|c| c.embed_text.clone())
1530 .collect();
1531
1532 let vectors = embed_fn(batch_texts)?;
1533 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1534
1535 if let Some(dim) = vectors.first().map(|v| v.len()) {
1537 match expected_dimension {
1538 None => expected_dimension = Some(dim),
1539 Some(expected) if dim != expected => {
1540 return Err(format!(
1541 "embedding dimension changed across batches: expected {expected}, got {dim}"
1542 ));
1543 }
1544 _ => {}
1545 }
1546 }
1547
1548 for (i, vector) in vectors.into_iter().enumerate() {
1549 let chunk_idx = batch_start + i;
1550 entries.push(EmbeddingEntry {
1551 chunk: chunks[chunk_idx].clone(),
1552 vector,
1553 });
1554 }
1555
1556 if let Some(callback) = progress.as_mut() {
1557 callback(entries.len(), total_chunks);
1558 }
1559 }
1560
1561 let embed_ms = embed_started.elapsed().as_millis();
1562 let rate = (total_chunks as u128 * 1000)
1563 .checked_div(embed_ms)
1564 .unwrap_or(0) as u64;
1565 slog_info!(
1566 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1567 total_chunks,
1568 batch_count,
1569 embed_ms,
1570 rate
1571 );
1572
1573 let dimension = entries
1574 .first()
1575 .map(|e| e.vector.len())
1576 .unwrap_or(DEFAULT_DIMENSION);
1577
1578 Ok(Self {
1579 entries,
1580 file_mtimes: file_metadata
1581 .iter()
1582 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1583 .collect(),
1584 file_sizes: file_metadata
1585 .iter()
1586 .map(|(path, metadata)| (path.clone(), metadata.size))
1587 .collect(),
1588 file_hashes: file_metadata
1589 .into_iter()
1590 .map(|(path, metadata)| (path, metadata.content_hash))
1591 .collect(),
1592 dimension,
1593 fingerprint: None,
1594 project_root: project_root.to_path_buf(),
1595 deferred_files: HashSet::new(),
1596 })
1597 }
1598
1599 pub fn build<F>(
1602 project_root: &Path,
1603 files: &[PathBuf],
1604 embed_fn: &mut F,
1605 max_batch_size: usize,
1606 ) -> Result<Self, String>
1607 where
1608 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1609 {
1610 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1611 Self::build_from_chunks(
1612 project_root,
1613 chunks,
1614 file_mtimes,
1615 embed_fn,
1616 max_batch_size,
1617 Option::<&mut fn(usize, usize)>::None,
1618 )
1619 }
1620
1621 pub fn build_with_progress<F, P>(
1623 project_root: &Path,
1624 files: &[PathBuf],
1625 embed_fn: &mut F,
1626 max_batch_size: usize,
1627 progress: &mut P,
1628 ) -> Result<Self, String>
1629 where
1630 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631 P: FnMut(usize, usize),
1632 {
1633 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634 let total_chunks = chunks.len();
1635 progress(0, total_chunks);
1636 Self::build_from_chunks(
1637 project_root,
1638 chunks,
1639 file_mtimes,
1640 embed_fn,
1641 max_batch_size,
1642 Some(progress),
1643 )
1644 }
1645
    /// Brings the index in line with `current_files` by re-embedding only what changed.
    ///
    /// Indexed files missing from `current_files` are removed, indexed files
    /// whose freshness check fails are re-collected, and files not yet indexed
    /// are added. Unchanged chunks of changed files reuse their cached vectors.
    ///
    /// `progress(completed, total)` is called with `(0, 0)` whenever there is
    /// nothing to embed; otherwise it is driven by the embedding step.
    ///
    /// Returns counts of changed/added/deleted files; `total_processed` is the
    /// size of the union of `current_files` and the previously indexed files.
    ///
    /// # Errors
    ///
    /// Propagates any error from the embedding step. Deletions have already
    /// been applied to the index at that point; re-collected files have not.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Deferred files that are no longer part of the project are forgotten.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // |current ∪ indexed| = |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every indexed file as deleted, changed, or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Snapshot the keys so the maps can be updated inside the loop.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A file with incomplete cached metadata cannot be verified and is
            // treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached
                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
            {
                Some(FreshnessVerdict::HotFresh) => {}
                // Content is unchanged; only refresh the recorded mtime and size.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Anything in the current listing that has no recorded mtime is new.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            // Nothing to do at all.
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            // Only deletions happened.
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // Capture reusable vectors before any entries of changed files are dropped.
        let reuse_map = self.build_chunk_reuse_map(&changed);
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        // A changed file that could not be collected and no longer exists on
        // disk disappeared during the refresh; count it as deleted.
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        if chunks.is_empty() {
            // Files were collected but yielded no chunks: drop their old
            // entries and record the new metadata without embedding anything.
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // With no surviving entries there is no dimension to enforce yet.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            existing_dimension,
            "incremental refresh",
            progress,
        )?;

        // Only files that were collected successfully are replaced; files that
        // failed to collect keep their previous entries and metadata.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1857
    /// Re-indexes an explicit list of invalidated `paths`.
    ///
    /// The requested set is `paths` plus any still-existing files deferred by
    /// an earlier call. All requested paths are first removed from the index;
    /// those that still exist are then re-collected and embedded, reusing
    /// cached vectors for unchanged chunks.
    ///
    /// `max_files` caps the number of indexed files: previously indexed files
    /// are always re-admitted, but new files beyond the remaining capacity are
    /// recorded in `deferred_files` and left out of this refresh.
    ///
    /// `progress(completed, total)` is called with `(0, 0)` whenever there is
    /// nothing to embed; otherwise it is driven by the embedding step.
    ///
    /// The result carries copies of the added entries, the fresh per-file
    /// metadata, the full list of requested paths, and summary counters.
    ///
    /// # Errors
    ///
    /// Propagates any error from the embedding step. The requested paths have
    /// already been removed from the index at that point.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Deferred files that have since been deleted need no further attention.
        self.deferred_files.retain(|path| path.exists());
        // Retry earlier deferrals together with the newly requested paths.
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Remember which paths were indexed, and capture their vectors for
        // reuse, before they are removed from the index.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();
        let reuse_map = self.build_chunk_reuse_map(&requested_paths);

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        // Only files that were indexed before count as deletions.
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the indexed-file cap. Files still in the index plus
        // re-admitted changed files consume capacity first; whatever is left
        // is available to new files.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // `existing_paths` is sorted, so the files admitted are the first
            // ones in path order.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        // Whatever remains in `fresh_metadata` will be (re)indexed by this call.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        if chunks.is_empty() {
            // Files were collected but yielded no chunks: record their
            // metadata without embedding anything.
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // The current dimension is enforced unless the index is empty and none
        // of the requested paths had been indexed before.
        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
        {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            initial_observed_dimension,
            "invalidated-file refresh",
            progress,
        )?;

        // The entries are both stored in this index and handed back to the caller.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
2068
2069 pub fn apply_refresh_update(
2070 &mut self,
2071 added_entries: Vec<EmbeddingEntry>,
2072 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2073 completed_paths: &[PathBuf],
2074 ) {
2075 self.remove_indexed_files(completed_paths);
2079
2080 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2081 self.entries.extend(added_entries);
2082 for (file, freshness) in updated_metadata {
2083 self.file_mtimes.insert(file.clone(), freshness.mtime);
2084 self.file_sizes.insert(file.clone(), freshness.size);
2085 self.file_hashes.insert(file, freshness.content_hash);
2086 }
2087 if let Some(dim) = observed_dimension {
2088 self.dimension = dim;
2089 }
2090 }
2091
2092 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2093 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2094 self.entries
2095 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2096 for path in files {
2097 self.file_mtimes.remove(path);
2098 self.file_sizes.remove(path);
2099 self.file_hashes.remove(path);
2100 }
2101 }
2102
2103 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2105 if self.entries.is_empty() || query_vector.len() != self.dimension {
2106 return Vec::new();
2107 }
2108
2109 let mut scored: Vec<(f32, usize)> = self
2110 .entries
2111 .iter()
2112 .enumerate()
2113 .map(|(i, entry)| {
2114 let mut score = cosine_similarity(query_vector, &entry.vector);
2115 if entry.chunk.exported {
2116 score *= 1.1;
2117 }
2118 (score, i)
2119 })
2120 .collect();
2121
2122 let keep = top_k.min(scored.len());
2123 if keep == 0 {
2124 return Vec::new();
2125 }
2126
2127 if keep < scored.len() {
2128 scored.select_nth_unstable_by(keep, semantic_score_order);
2129 scored.truncate(keep);
2130 }
2131 scored.sort_by(semantic_score_order);
2132
2133 scored
2134 .into_iter()
2135 .map(|(score, idx)| {
2139 let entry = &self.entries[idx];
2140 SemanticResult {
2141 file: entry.chunk.file.clone(),
2142 name: entry.chunk.name.clone(),
2143 kind: entry.chunk.kind.clone(),
2144 start_line: entry.chunk.start_line,
2145 end_line: entry.chunk.end_line,
2146 exported: entry.chunk.exported,
2147 snippet: entry.chunk.snippet.clone(),
2148 score,
2149 source: "semantic",
2150 }
2151 })
2152 .collect()
2153 }
2154
    /// Number of embedded chunks in the index (same value as `entry_count`).
    pub fn len(&self) -> usize {
        self.entries.len()
    }
2159
2160 pub fn is_file_stale(&self, file: &Path) -> bool {
2162 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2163 return true;
2164 };
2165 let Some(stored_size) = self.file_sizes.get(file) else {
2166 return true;
2167 };
2168 let Some(stored_hash) = self.file_hashes.get(file) else {
2169 return true;
2170 };
2171 let cached = FileFreshness {
2172 mtime: *stored_mtime,
2173 size: *stored_size,
2174 content_hash: *stored_hash,
2175 };
2176 match cache_freshness::verify_file_strict(file, &cached) {
2177 FreshnessVerdict::HotFresh => false,
2178 FreshnessVerdict::ContentFresh { .. } => false,
2179 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2180 }
2181 }
2182
2183 fn backfill_missing_file_sizes(&mut self) {
2184 for path in self.file_mtimes.keys() {
2185 if self.file_sizes.contains_key(path) {
2186 continue;
2187 }
2188 if let Ok(metadata) = fs::metadata(path) {
2189 self.file_sizes.insert(path.clone(), metadata.len());
2190 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2191 self.file_hashes.insert(path.clone(), hash);
2192 }
2193 }
2194 }
2195 }
2196
    /// Removes `file` from the index; a thin alias for `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
2201
2202 pub fn invalidate_file(&mut self, file: &Path) {
2203 let canonical_file = canonicalize_existing_or_deleted_path(file);
2204 self.entries
2205 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2206 self.file_mtimes.remove(file);
2207 self.file_sizes.remove(file);
2208 self.file_hashes.remove(file);
2209 if canonical_file.as_path() != file {
2210 self.file_mtimes.remove(&canonical_file);
2211 self.file_sizes.remove(&canonical_file);
2212 self.file_hashes.remove(&canonical_file);
2213 }
2214 }
2215
    /// Length of the embedding vectors this index expects.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2220
    /// The fingerprint attached to this index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2224
    /// The `backend` field of the fingerprint, or `None` when no fingerprint is set.
    pub fn backend_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.backend.as_str())
    }
2228
    /// The `model` field of the fingerprint, or `None` when no fingerprint is set.
    pub fn model_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.model.as_str())
    }
2232
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2236
    /// Persists the index to `<storage_dir>/semantic/<project_key>/semantic.bin`.
    ///
    /// This is best-effort: an empty index is not written at all, and any I/O
    /// failure is logged as a warning instead of being returned. The data is
    /// written to a temporary file in the same directory, flushed and synced,
    /// and then renamed over the final path.
    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
        if self.entries.is_empty() {
            slog_info!("skipping semantic index persistence (0 entries)");
            return;
        }
        let dir = storage_dir.join("semantic").join(project_key);
        if let Err(e) = fs::create_dir_all(&dir) {
            slog_warn!("failed to create semantic cache dir: {}", e);
            return;
        }
        let data_path = dir.join("semantic.bin");
        // Process id plus a nanosecond timestamp keeps concurrent writers from
        // sharing a temporary file.
        let tmp_path = dir.join(format!(
            "semantic.bin.tmp.{}.{}",
            std::process::id(),
            SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or(Duration::ZERO)
                .as_nanos()
        ));
        // Run the fallible steps in a closure so `?` can be used and every
        // failure funnels into the single cleanup path below.
        let write_result = (|| -> io::Result<usize> {
            let file = fs::File::create(&tmp_path)?;
            let mut writer = BufWriter::new(file);
            let bytes_written = self.write_to_writer(&mut writer)?;
            writer.flush()?;
            // Sync the file to disk before it is renamed into place.
            writer.get_ref().sync_all()?;
            Ok(bytes_written)
        })();
        let bytes_written = match write_result {
            Ok(bytes_written) => bytes_written,
            Err(e) => {
                slog_warn!("failed to write semantic index: {}", e);
                let _ = fs::remove_file(&tmp_path);
                return;
            }
        };
        if let Err(e) = fs::rename(&tmp_path, &data_path) {
            slog_warn!("failed to rename semantic index: {}", e);
            let _ = fs::remove_file(&tmp_path);
            return;
        }
        slog_info!(
            "semantic index persisted: {} entries, {:.1} KB",
            self.entries.len(),
            bytes_written as f64 / 1024.0
        );
    }
2286
2287 pub fn read_from_disk(
2289 storage_dir: &Path,
2290 project_key: &str,
2291 current_canonical_root: &Path,
2292 is_worktree_bridge: bool,
2293 expected_fingerprint: Option<&str>,
2294 ) -> Option<Self> {
2295 debug_assert!(current_canonical_root.is_absolute());
2296 let data_path = storage_dir
2297 .join("semantic")
2298 .join(project_key)
2299 .join("semantic.bin");
2300 let file = fs::File::open(&data_path).ok()?;
2301 let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2302 if file_len < HEADER_BYTES_V1 {
2303 slog_warn!(
2304 "corrupt semantic index (too small: {} bytes), removing",
2305 file_len
2306 );
2307 if !is_worktree_bridge {
2308 let _ = fs::remove_file(&data_path);
2309 }
2310 return None;
2311 }
2312
2313 let mut reader = BufReader::new(file);
2314 let mut version_buf = [0u8; 1];
2315 reader.read_exact(&mut version_buf).ok()?;
2316 let version = version_buf[0];
2317 if version != SEMANTIC_INDEX_VERSION_V6 {
2318 slog_info!(
2319 "cached semantic index version {} is older than {}, rebuilding",
2320 version,
2321 SEMANTIC_INDEX_VERSION_V6
2322 );
2323 if !is_worktree_bridge {
2324 let _ = fs::remove_file(&data_path);
2325 }
2326 return None;
2327 }
2328 match Self::from_reader_after_version(
2329 reader,
2330 version,
2331 current_canonical_root,
2332 Some(file_len),
2333 1,
2334 ) {
2335 Ok(index) => {
2336 if index.entries.is_empty() {
2337 slog_info!("cached semantic index is empty, will rebuild");
2338 if !is_worktree_bridge {
2339 let _ = fs::remove_file(&data_path);
2340 }
2341 return None;
2342 }
2343 if let Some(expected) = expected_fingerprint {
2344 let matches = index
2345 .fingerprint()
2346 .map(|fingerprint| fingerprint.matches_expected(expected))
2347 .unwrap_or(false);
2348 if !matches {
2349 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2350 if !is_worktree_bridge {
2351 let _ = fs::remove_file(&data_path);
2352 }
2353 return None;
2354 }
2355 }
2356 slog_info!(
2357 "loaded semantic index from disk: {} entries",
2358 index.entries.len()
2359 );
2360 Some(index)
2361 }
2362 Err(e) => {
2363 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2364 if !is_worktree_bridge {
2365 let _ = fs::remove_file(&data_path);
2366 }
2367 None
2368 }
2369 }
2370 }
2371
2372 pub fn to_bytes(&self) -> Vec<u8> {
2374 let mut buf = Vec::new();
2375 self.write_to_writer(&mut buf)
2376 .expect("writing semantic index to Vec cannot fail");
2377 buf
2378 }
2379
2380 fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2381 let mut bytes_written = 0usize;
2382 let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2383 let encoded = fingerprint.as_string();
2384 if encoded.is_empty() {
2385 None
2386 } else {
2387 Some(encoded)
2388 }
2389 });
2390 let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2391 let file_mtime_count = self
2392 .file_mtimes
2393 .iter()
2394 .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2395 .count();
2396 let entry_count = self
2397 .entries
2398 .iter()
2399 .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2400 .count();
2401
2402 let version = SEMANTIC_INDEX_VERSION_V6;
2415 write_counted(writer, &[version], &mut bytes_written)?;
2416 write_counted(
2417 writer,
2418 &(self.dimension as u32).to_le_bytes(),
2419 &mut bytes_written,
2420 )?;
2421 write_counted(
2422 writer,
2423 &(entry_count as u32).to_le_bytes(),
2424 &mut bytes_written,
2425 )?;
2426 write_counted(
2427 writer,
2428 &(fp_bytes_ref.len() as u32).to_le_bytes(),
2429 &mut bytes_written,
2430 )?;
2431 write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2432
2433 write_counted(
2436 writer,
2437 &(file_mtime_count as u32).to_le_bytes(),
2438 &mut bytes_written,
2439 )?;
2440 for (path, mtime) in &self.file_mtimes {
2441 let Some(relative) = cache_relative_path(&self.project_root, path) else {
2442 continue;
2443 };
2444 let relative = relative.to_string_lossy();
2445 let path_bytes = relative.as_bytes();
2446 write_counted(
2447 writer,
2448 &(path_bytes.len() as u32).to_le_bytes(),
2449 &mut bytes_written,
2450 )?;
2451 write_counted(writer, path_bytes, &mut bytes_written)?;
2452 let duration = mtime
2453 .duration_since(SystemTime::UNIX_EPOCH)
2454 .unwrap_or_default();
2455 write_counted(
2456 writer,
2457 &duration.as_secs().to_le_bytes(),
2458 &mut bytes_written,
2459 )?;
2460 write_counted(
2461 writer,
2462 &duration.subsec_nanos().to_le_bytes(),
2463 &mut bytes_written,
2464 )?;
2465 let size = self.file_sizes.get(path).copied().unwrap_or_default();
2466 write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2467 let hash = self
2468 .file_hashes
2469 .get(path)
2470 .copied()
2471 .unwrap_or_else(cache_freshness::zero_hash);
2472 write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2473 }
2474
2475 for entry in &self.entries {
2477 let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2478 continue;
2479 };
2480 let c = &entry.chunk;
2481
2482 let relative = relative.to_string_lossy();
2484 let file_bytes = relative.as_bytes();
2485 write_counted(
2486 writer,
2487 &(file_bytes.len() as u32).to_le_bytes(),
2488 &mut bytes_written,
2489 )?;
2490 write_counted(writer, file_bytes, &mut bytes_written)?;
2491
2492 let name_bytes = c.name.as_bytes();
2494 write_counted(
2495 writer,
2496 &(name_bytes.len() as u32).to_le_bytes(),
2497 &mut bytes_written,
2498 )?;
2499 write_counted(writer, name_bytes, &mut bytes_written)?;
2500
2501 write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2503
2504 write_counted(
2506 writer,
2507 &(c.start_line as u32).to_le_bytes(),
2508 &mut bytes_written,
2509 )?;
2510 write_counted(
2511 writer,
2512 &(c.end_line as u32).to_le_bytes(),
2513 &mut bytes_written,
2514 )?;
2515 write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2516
2517 let snippet_bytes = c.snippet.as_bytes();
2519 write_counted(
2520 writer,
2521 &(snippet_bytes.len() as u32).to_le_bytes(),
2522 &mut bytes_written,
2523 )?;
2524 write_counted(writer, snippet_bytes, &mut bytes_written)?;
2525
2526 let embed_bytes = c.embed_text.as_bytes();
2528 write_counted(
2529 writer,
2530 &(embed_bytes.len() as u32).to_le_bytes(),
2531 &mut bytes_written,
2532 )?;
2533 write_counted(writer, embed_bytes, &mut bytes_written)?;
2534
2535 for &val in &entry.vector {
2537 write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2538 }
2539 }
2540
2541 Ok(bytes_written)
2542 }
2543
2544 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2546 debug_assert!(current_canonical_root.is_absolute());
2547 if data.len() < HEADER_BYTES_V1 {
2548 return Err("data too short".to_string());
2549 }
2550
2551 Self::from_reader_after_version(
2552 Cursor::new(&data[1..]),
2553 data[0],
2554 current_canonical_root,
2555 Some(data.len()),
2556 1,
2557 )
2558 }
2559
2560 fn from_reader_after_version<R: Read>(
2561 reader: R,
2562 version: u8,
2563 current_canonical_root: &Path,
2564 total_len: Option<usize>,
2565 bytes_read: usize,
2566 ) -> Result<Self, String> {
2567 debug_assert!(current_canonical_root.is_absolute());
2568 let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2569
2570 if version != SEMANTIC_INDEX_VERSION_V1
2571 && version != SEMANTIC_INDEX_VERSION_V2
2572 && version != SEMANTIC_INDEX_VERSION_V3
2573 && version != SEMANTIC_INDEX_VERSION_V4
2574 && version != SEMANTIC_INDEX_VERSION_V5
2575 && version != SEMANTIC_INDEX_VERSION_V6
2576 {
2577 return Err(format!("unsupported version: {}", version));
2578 }
2579 if (version == SEMANTIC_INDEX_VERSION_V2
2583 || version == SEMANTIC_INDEX_VERSION_V3
2584 || version == SEMANTIC_INDEX_VERSION_V4
2585 || version == SEMANTIC_INDEX_VERSION_V5
2586 || version == SEMANTIC_INDEX_VERSION_V6)
2587 && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2588 {
2589 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2590 }
2591
2592 let dimension = read_u32_stream(&mut reader)? as usize;
2593 let entry_count = read_u32_stream(&mut reader)? as usize;
2594 validate_embedding_dimension(dimension)?;
2595 if entry_count > MAX_ENTRIES {
2596 return Err(format!("too many semantic index entries: {}", entry_count));
2597 }
2598
2599 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2605 || version == SEMANTIC_INDEX_VERSION_V3
2606 || version == SEMANTIC_INDEX_VERSION_V4
2607 || version == SEMANTIC_INDEX_VERSION_V5
2608 || version == SEMANTIC_INDEX_VERSION_V6;
2609 let fingerprint = if has_fingerprint_field {
2610 let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2611 if total_len
2612 .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2613 {
2614 return Err("unexpected end of data reading fingerprint".to_string());
2615 }
2616 if fingerprint_len == 0 {
2617 None
2618 } else {
2619 let mut raw = vec![0u8; fingerprint_len];
2620 read_exact_stream(
2621 &mut reader,
2622 &mut raw,
2623 "unexpected end of data reading fingerprint",
2624 )?;
2625 let raw = String::from_utf8_lossy(&raw).to_string();
2626 Some(
2627 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2628 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2629 )
2630 }
2631 } else {
2632 None
2633 };
2634
2635 let mtime_count = read_u32_stream(&mut reader)? as usize;
2637 if mtime_count > MAX_ENTRIES {
2638 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2639 }
2640
2641 let vector_bytes = entry_count
2642 .checked_mul(dimension)
2643 .and_then(|count| count.checked_mul(F32_BYTES))
2644 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2645 if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2646 return Err("semantic index vectors exceed available data".to_string());
2647 }
2648
2649 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2650 let mut file_sizes = HashMap::with_capacity(mtime_count);
2651 let mut file_hashes = HashMap::with_capacity(mtime_count);
2652 for _ in 0..mtime_count {
2653 let path = read_string_stream(&mut reader, total_len)?;
2654 let secs = read_u64_stream(&mut reader)?;
2655 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2661 || version == SEMANTIC_INDEX_VERSION_V4
2662 || version == SEMANTIC_INDEX_VERSION_V5
2663 || version == SEMANTIC_INDEX_VERSION_V6
2664 {
2665 read_u32_stream(&mut reader)?
2666 } else {
2667 0
2668 };
2669 let size =
2670 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2671 read_u64_stream(&mut reader)?
2672 } else {
2673 0
2674 };
2675 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2676 let mut hash_bytes = [0u8; 32];
2677 read_exact_stream(
2678 &mut reader,
2679 &mut hash_bytes,
2680 "unexpected end of data reading content hash",
2681 )?;
2682 blake3::Hash::from_bytes(hash_bytes)
2683 } else {
2684 cache_freshness::zero_hash()
2685 };
2686 if nanos >= 1_000_000_000 {
2693 return Err(format!(
2694 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2695 nanos
2696 ));
2697 }
2698 let duration = std::time::Duration::new(secs, nanos);
2699 let mtime = SystemTime::UNIX_EPOCH
2700 .checked_add(duration)
2701 .ok_or_else(|| {
2702 format!(
2703 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2704 secs, nanos
2705 )
2706 })?;
2707 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2708 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2709 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2710 } else {
2711 PathBuf::from(path)
2712 };
2713 file_mtimes.insert(path.clone(), mtime);
2714 file_sizes.insert(path.clone(), size);
2715 file_hashes.insert(path, content_hash);
2716 }
2717
2718 let mut entries = Vec::with_capacity(entry_count);
2720 for _ in 0..entry_count {
2721 let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2722 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2723 cached_path_under_root(current_canonical_root, &raw_file)
2724 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2725 } else {
2726 raw_file
2727 };
2728 let name = read_string_stream(&mut reader, total_len)?;
2729
2730 let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2731
2732 let start_line = read_u32_stream(&mut reader)?;
2733 let end_line = read_u32_stream(&mut reader)?;
2734
2735 let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2736
2737 let snippet = read_string_stream(&mut reader, total_len)?;
2738 let embed_text = read_string_stream(&mut reader, total_len)?;
2739
2740 let vec_bytes = dimension
2742 .checked_mul(F32_BYTES)
2743 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2744 if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2745 return Err("unexpected end of data reading vector".to_string());
2746 }
2747 let mut vector = Vec::with_capacity(dimension);
2748 for _ in 0..dimension {
2749 let mut bytes = [0u8; F32_BYTES];
2750 read_exact_stream(
2751 &mut reader,
2752 &mut bytes,
2753 "unexpected end of data reading vector",
2754 )?;
2755 vector.push(f32::from_le_bytes(bytes));
2756 }
2757
2758 entries.push(EmbeddingEntry {
2759 chunk: SemanticChunk {
2760 file,
2761 name,
2762 kind,
2763 start_line,
2764 end_line,
2765 exported,
2766 embed_text,
2767 snippet,
2768 },
2769 vector,
2770 });
2771 }
2772
2773 if entries.len() != entry_count {
2774 return Err(format!(
2775 "semantic cache entry count drift: header={} decoded={}",
2776 entry_count,
2777 entries.len()
2778 ));
2779 }
2780 for entry in &entries {
2781 if !file_mtimes.contains_key(&entry.chunk.file) {
2782 return Err(format!(
2783 "semantic cache metadata missing for entry file {}",
2784 entry.chunk.file.display()
2785 ));
2786 }
2787 }
2788
2789 Ok(Self {
2790 entries,
2791 file_mtimes,
2792 file_sizes,
2793 file_hashes,
2794 dimension,
2795 fingerprint,
2796 project_root: current_canonical_root.to_path_buf(),
2797 deferred_files: HashSet::new(),
2798 })
2799 }
2800}
2801
/// Writes all of `bytes` to `writer` and, on success, adds their length to
/// `bytes_written` (saturating at `usize::MAX`).
///
/// # Errors
/// Returns the error from `write_all`; the counter is left untouched in that case.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2811
/// `Read` adapter that tracks how many bytes have been pulled through it.
struct CountingReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Running total of bytes consumed, including any initial offset.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the counter at `bytes_read` instead of zero.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Total bytes consumed so far (initial offset plus everything read through `self`).
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forwards to the inner reader and adds the number of bytes it returned to the
    /// counter (saturating). Errors pass through without touching the counter.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let outcome = self.inner.read(buf);
        if let Ok(count) = &outcome {
            self.bytes_read = self.bytes_read.saturating_add(*count);
        }
        outcome
    }
}
2834
2835fn read_exact_stream<R: Read>(
2836 reader: &mut CountingReader<R>,
2837 buf: &mut [u8],
2838 eof_message: &'static str,
2839) -> Result<(), String> {
2840 reader.read_exact(buf).map_err(|error| {
2841 if error.kind() == io::ErrorKind::UnexpectedEof {
2842 eof_message.to_string()
2843 } else {
2844 format!("{eof_message}: {error}")
2845 }
2846 })
2847}
2848
2849fn read_u8_stream<R: Read>(
2850 reader: &mut CountingReader<R>,
2851 eof_message: &'static str,
2852) -> Result<u8, String> {
2853 let mut bytes = [0u8; 1];
2854 read_exact_stream(reader, &mut bytes, eof_message)?;
2855 Ok(bytes[0])
2856}
2857
2858fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2859 let mut bytes = [0u8; 4];
2860 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2861 Ok(u32::from_le_bytes(bytes))
2862}
2863
2864fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2865 let mut bytes = [0u8; 8];
2866 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2867 Ok(u64::from_le_bytes(bytes))
2868}
2869
2870fn read_string_stream<R: Read>(
2871 reader: &mut CountingReader<R>,
2872 total_len: Option<usize>,
2873) -> Result<String, String> {
2874 let len = read_u32_stream(reader)? as usize;
2875 if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2876 return Err("unexpected end of data reading string".to_string());
2877 }
2878 let mut bytes = vec![0u8; len];
2879 read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2880 Ok(String::from_utf8_lossy(&bytes).to_string())
2881}
2882
2883fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2885 let relative = file
2886 .strip_prefix(project_root)
2887 .unwrap_or(file)
2888 .to_string_lossy();
2889
2890 let kind_label = match &symbol.kind {
2891 SymbolKind::Function => "function",
2892 SymbolKind::Class => "class",
2893 SymbolKind::Method => "method",
2894 SymbolKind::Struct => "struct",
2895 SymbolKind::Interface => "interface",
2896 SymbolKind::Enum => "enum",
2897 SymbolKind::TypeAlias => "type",
2898 SymbolKind::Variable => "variable",
2899 SymbolKind::Heading => "heading",
2900 SymbolKind::FileSummary => "file-summary",
2901 };
2902
2903 let name = &symbol.name;
2905 let mut text = format!(
2906 "name:{name} file:{} kind:{} name:{name}",
2907 relative, kind_label
2908 );
2909
2910 if let Some(sig) = &symbol.signature {
2911 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2919 }
2920
2921 let lines: Vec<&str> = source.lines().collect();
2923 let start = (symbol.range.start_line as usize).min(lines.len());
2924 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2926 if start < end {
2927 let body: String = lines[start..end]
2928 .iter()
2929 .take(15) .copied()
2931 .collect::<Vec<&str>>()
2932 .join("\n");
2933 let snippet = if body.len() > 300 {
2934 format!("{}...", &body[..body.floor_char_boundary(300)])
2935 } else {
2936 body
2937 };
2938 text.push_str(&format!(" body:{}", snippet));
2939 }
2940
2941 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2946}
2947
/// Upper bound, in characters, applied to generated embed text via `truncate_chars`
/// (used by `build_embed_text` and `build_file_summary_chunk`).
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2952
/// Returns at most the first `max_chars` characters (Unicode scalar values) of `value`.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of character number `max_chars` is where the kept prefix ends;
    // if there is no such character the whole string fits.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2956
2957fn first_leading_doc_comment(source: &str) -> String {
2958 let lines: Vec<&str> = source.lines().collect();
2959 let Some((start, first)) = lines
2960 .iter()
2961 .enumerate()
2962 .find(|(_, line)| !line.trim().is_empty())
2963 else {
2964 return String::new();
2965 };
2966
2967 let trimmed = first.trim_start();
2968 if trimmed.starts_with("/**") {
2969 let mut comment = Vec::new();
2970 for line in lines.iter().skip(start) {
2971 comment.push(*line);
2972 if line.contains("*/") {
2973 break;
2974 }
2975 }
2976 return truncate_chars(&comment.join("\n"), 200);
2977 }
2978
2979 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2980 let comment = lines
2981 .iter()
2982 .skip(start)
2983 .take_while(|line| {
2984 let trimmed = line.trim_start();
2985 trimmed.starts_with("///") || trimmed.starts_with("//!")
2986 })
2987 .copied()
2988 .collect::<Vec<_>>()
2989 .join("\n");
2990 return truncate_chars(&comment, 200);
2991 }
2992
2993 String::new()
2994}
2995
2996pub fn build_file_summary_chunk(
2997 file: &Path,
2998 project_root: &Path,
2999 source: &str,
3000 top_exports: &[&str],
3001 top_export_signatures: &[Option<&str>],
3002) -> SemanticChunk {
3003 let relative = file.strip_prefix(project_root).unwrap_or(file);
3004 let rel_path = relative.to_string_lossy();
3005 let parent_dir = relative
3006 .parent()
3007 .map(|parent| parent.to_string_lossy().to_string())
3008 .unwrap_or_default();
3009 let name = file
3010 .file_stem()
3011 .map(|stem| stem.to_string_lossy().to_string())
3012 .unwrap_or_default();
3013 let doc = first_leading_doc_comment(source);
3014 let exports = top_exports
3015 .iter()
3016 .take(5)
3017 .copied()
3018 .collect::<Vec<_>>()
3019 .join(",");
3020 let snippet = if doc.is_empty() {
3021 top_export_signatures
3022 .first()
3023 .and_then(|signature| signature.as_deref())
3024 .map(|signature| truncate_chars(signature, 200))
3025 .unwrap_or_default()
3026 } else {
3027 doc.clone()
3028 };
3029
3030 SemanticChunk {
3031 file: file.to_path_buf(),
3032 name,
3033 kind: SymbolKind::FileSummary,
3034 start_line: 0,
3035 end_line: 0,
3036 exported: false,
3037 embed_text: truncate_chars(
3038 &format!(
3039 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3040 file.file_stem()
3041 .map(|stem| stem.to_string_lossy().to_string())
3042 .unwrap_or_default()
3043 ),
3044 MAX_EMBED_TEXT_CHARS,
3045 ),
3046 snippet,
3047 }
3048}
3049
3050fn parser_for(
3051 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3052 lang: crate::parser::LangId,
3053) -> Result<&mut Parser, String> {
3054 use std::collections::hash_map::Entry;
3055
3056 match parsers.entry(lang) {
3057 Entry::Occupied(entry) => Ok(entry.into_mut()),
3058 Entry::Vacant(entry) => {
3059 let grammar = grammar_for(lang);
3060 let mut parser = Parser::new();
3061 parser
3062 .set_language(&grammar)
3063 .map_err(|error| error.to_string())?;
3064 Ok(entry.insert(parser))
3065 }
3066 }
3067}
3068
/// Reports whether `path` has one of the file extensions eligible for semantic indexing.
///
/// The comparison is exact (case-sensitive); paths without an extension, or with an
/// extension that is not valid UTF-8, are not eligible.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3101
3102fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
3103 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3104 let mtime = metadata.modified().map_err(|error| error.to_string())?;
3105 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
3106 .map_err(|error| error.to_string())?
3107 .unwrap_or_else(cache_freshness::zero_hash);
3108 Ok(IndexedFileMetadata {
3109 mtime,
3110 size: metadata.len(),
3111 content_hash,
3112 })
3113}
3114
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// If `path` itself canonicalizes, that result is returned. Otherwise its parent is
/// canonicalized and the original file name is joined back on. When neither works (or
/// the path has no parent or no file name) `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(dir), Some(leaf)) => match fs::canonicalize(dir) {
            Ok(resolved_dir) => resolved_dir.join(leaf),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
3131
/// Size threshold (4 MiB) for semantic chunking: `collect_file_chunks` returns an empty
/// chunk list for files whose metadata reports more bytes than this.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3142
3143fn collect_file_chunks(
3144 project_root: &Path,
3145 file: &Path,
3146 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3147) -> Result<Vec<SemanticChunk>, String> {
3148 if !is_semantic_indexed_extension(file) {
3149 return Err("unsupported file extension".to_string());
3150 }
3151 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3152 if std::fs::metadata(file).is_ok_and(|m| m.len() > MAX_SEMANTIC_FILE_BYTES) {
3155 return Ok(Vec::new());
3156 }
3157 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
3158 let tree = parser_for(parsers, lang)?
3159 .parse(&source, None)
3160 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3161 let symbols =
3162 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
3163
3164 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
3165}
3166
3167fn build_snippet(symbol: &Symbol, source: &str) -> String {
3169 let lines: Vec<&str> = source.lines().collect();
3170 let start = (symbol.range.start_line as usize).min(lines.len());
3171 let end = (symbol.range.end_line as usize + 1).min(lines.len());
3173 if start < end {
3174 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3175 let mut snippet = snippet_lines.join("\n");
3176 if end - start > 5 {
3177 snippet.push_str("\n ...");
3178 }
3179 if snippet.len() > 300 {
3180 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3181 }
3182 snippet
3183 } else {
3184 String::new()
3185 }
3186}
3187
3188fn symbols_to_chunks(
3190 file: &Path,
3191 symbols: &[Symbol],
3192 source: &str,
3193 project_root: &Path,
3194) -> Vec<SemanticChunk> {
3195 let mut chunks = Vec::new();
3196 let top_exports_with_signatures = symbols
3197 .iter()
3198 .filter(|symbol| {
3199 symbol.exported
3200 && symbol.parent.is_none()
3201 && !matches!(symbol.kind, SymbolKind::Heading)
3202 })
3203 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3204 .collect::<Vec<_>>();
3205
3206 let has_only_headings = !symbols.is_empty()
3207 && symbols
3208 .iter()
3209 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3210 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3211 let top_exports = top_exports_with_signatures
3212 .iter()
3213 .map(|(name, _)| *name)
3214 .collect::<Vec<_>>();
3215 let top_export_signatures = top_exports_with_signatures
3216 .iter()
3217 .map(|(_, signature)| *signature)
3218 .collect::<Vec<_>>();
3219 chunks.push(build_file_summary_chunk(
3220 file,
3221 project_root,
3222 source,
3223 &top_exports,
3224 &top_export_signatures,
3225 ));
3226 }
3227
3228 for symbol in symbols {
3229 if matches!(symbol.kind, SymbolKind::Heading) {
3234 continue;
3235 }
3236
3237 let line_count = symbol
3239 .range
3240 .end_line
3241 .saturating_sub(symbol.range.start_line)
3242 + 1;
3243 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3244 continue;
3245 }
3246
3247 let embed_text = build_embed_text(symbol, source, file, project_root);
3248 let snippet = build_snippet(symbol, source);
3249
3250 chunks.push(SemanticChunk {
3251 file: file.to_path_buf(),
3252 name: symbol.name.clone(),
3253 kind: symbol.kind.clone(),
3254 start_line: symbol.range.start_line,
3255 end_line: symbol.range.end_line,
3256 exported: symbol.exported,
3257 embed_text,
3258 snippet,
3259 });
3260
3261 }
3264
3265 chunks
3266}
3267
/// Comparator for `(score, index)` pairs: higher scores first, ties broken by ascending
/// index.
///
/// NaN scores are ordered after every real score (and are equal among themselves before
/// the index tie-break). Treating NaN as "equal to anything" would make the comparison
/// non-transitive, which sorting routines are allowed to punish with arbitrary order or
/// a panic. For pairs without NaN the ordering is plain descending `partial_cmp`, so
/// `0.0` and `-0.0` still compare equal.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        // `a` has no usable score: it sorts after `b`.
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Reversed operands give descending order; `partial_cmp` cannot fail here.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3273
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the lengths differ or when the product of the two norms is zero
/// (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in a single left-to-right pass.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3297
3298fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3300 match kind {
3301 SymbolKind::Function => 0,
3302 SymbolKind::Class => 1,
3303 SymbolKind::Method => 2,
3304 SymbolKind::Struct => 3,
3305 SymbolKind::Interface => 4,
3306 SymbolKind::Enum => 5,
3307 SymbolKind::TypeAlias => 6,
3308 SymbolKind::Variable => 7,
3309 SymbolKind::Heading => 8,
3310 SymbolKind::FileSummary => 9,
3311 }
3312}
3313
3314fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3315 match v {
3316 0 => SymbolKind::Function,
3317 1 => SymbolKind::Class,
3318 2 => SymbolKind::Method,
3319 3 => SymbolKind::Struct,
3320 4 => SymbolKind::Interface,
3321 5 => SymbolKind::Enum,
3322 6 => SymbolKind::TypeAlias,
3323 7 => SymbolKind::Variable,
3324 8 => SymbolKind::Heading,
3325 9 => SymbolKind::FileSummary,
3326 _ => SymbolKind::Heading,
3327 }
3328}
3329
3330#[cfg(test)]
3331mod tests {
3332 use super::*;
3333 use crate::config::{SemanticBackend, SemanticBackendConfig};
3334 use crate::parser::FileParser;
3335 use std::io::{Read, Write};
3336 use std::net::TcpListener;
3337 use std::thread;
3338
3339 #[test]
3340 fn semantic_index_includes_php_inc_and_scss_extensions() {
3341 for file in ["partial.inc", "index.php", "styles.scss"] {
3342 assert!(
3343 is_semantic_indexed_extension(Path::new(file)),
3344 "{file} should be semantic-index eligible"
3345 );
3346 }
3347 }
3348
3349 #[test]
3350 fn transient_marker_round_trips_and_classifies() {
3351 let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3354 assert!(embedding_failure_is_transient(&marked));
3355 let clean = strip_transient_embedding_marker(&marked);
3356 assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3357 assert!(clean.starts_with("openai compatible request failed:"));
3358
3359 for permanent in [
3362 "openai compatible request failed (HTTP 401): Unauthorized",
3363 "embedding dimension mismatch: index has 384, model returned 768",
3364 "too many files (>20000) for semantic indexing (max 20000)",
3365 ] {
3366 assert!(
3367 !embedding_failure_is_transient(permanent),
3368 "{permanent:?} must not be transient"
3369 );
3370 assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3372 }
3373 }
3374
3375 #[test]
3376 fn send_error_transience_separates_connect_timeout_from_4xx() {
3377 assert!(is_retryable_embedding_status(
3379 reqwest::StatusCode::INTERNAL_SERVER_ERROR
3380 ));
3381 assert!(is_retryable_embedding_status(
3382 reqwest::StatusCode::TOO_MANY_REQUESTS
3383 ));
3384 assert!(!is_retryable_embedding_status(
3385 reqwest::StatusCode::UNAUTHORIZED
3386 ));
3387 assert!(!is_retryable_embedding_status(
3388 reqwest::StatusCode::BAD_REQUEST
3389 ));
3390 }
3391
3392 #[test]
3393 fn local_backend_model_loading_body_is_transient() {
3394 for body in [
3397 r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3398 r#"{"error":"model is loading, please wait"}"#,
3399 r#"{"error":"Model not loaded"}"#,
3400 "Loading model into memory",
3401 ] {
3402 assert!(
3403 embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3404 "{body:?} should be body-transient"
3405 );
3406 }
3407
3408 for body in [
3412 r#"{"error":"invalid api key"}"#,
3413 r#"{"error":"model 'foo' not found"}"#,
3414 "Bad Request: unknown field",
3415 "Bad Request: invalid loading model option",
3416 r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3417 ] {
3418 assert!(
3419 !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3420 "{body:?} must not be body-transient"
3421 );
3422 }
3423
3424 assert!(
3425 !embedding_response_body_is_transient(
3426 reqwest::StatusCode::UNAUTHORIZED,
3427 r#"{"error":"model is loading, please wait"}"#
3428 ),
3429 "permanent auth failures must not become transient because of body text"
3430 );
3431 }
3432
3433 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3434 where
3435 F: Fn(String, String, String) -> String + Send + 'static,
3436 {
3437 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3438 let addr = listener.local_addr().expect("local addr");
3439 let handle = thread::spawn(move || {
3440 let (mut stream, _) = listener.accept().expect("accept request");
3441 let mut buf = Vec::new();
3442 let mut chunk = [0u8; 4096];
3443 let mut header_end = None;
3444 let mut content_length = 0usize;
3445 loop {
3446 let n = stream.read(&mut chunk).expect("read request");
3447 if n == 0 {
3448 break;
3449 }
3450 buf.extend_from_slice(&chunk[..n]);
3451 if header_end.is_none() {
3452 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3453 header_end = Some(pos + 4);
3454 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3455 for line in headers.lines() {
3456 if let Some(value) = line.strip_prefix("Content-Length:") {
3457 content_length = value.trim().parse::<usize>().unwrap_or(0);
3458 }
3459 }
3460 }
3461 }
3462 if let Some(end) = header_end {
3463 if buf.len() >= end + content_length {
3464 break;
3465 }
3466 }
3467 }
3468
3469 let end = header_end.expect("header terminator");
3470 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3471 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3472 let mut lines = request.lines();
3473 let request_line = lines.next().expect("request line").to_string();
3474 let path = request_line
3475 .split_whitespace()
3476 .nth(1)
3477 .expect("request path")
3478 .to_string();
3479 let response_body = handler(request_line, path, body);
3480 let response = format!(
3481 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3482 response_body.len(),
3483 response_body
3484 );
3485 stream
3486 .write_all(response.as_bytes())
3487 .expect("write response");
3488 });
3489
3490 (format!("http://{}", addr), handle)
3491 }
3492
3493 fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3494 let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3495 listener
3496 .set_nonblocking(true)
3497 .expect("nonblocking listener");
3498 let addr = listener.local_addr().expect("local addr");
3499 let handle = thread::spawn(move || {
3500 let deadline = std::time::Instant::now() + Duration::from_secs(2);
3501 let mut accepted = 0usize;
3502 while accepted < attempts && std::time::Instant::now() < deadline {
3503 match listener.accept() {
3504 Ok((mut stream, _)) => {
3505 accepted += 1;
3506 let mut buf = [0u8; 4096];
3507 let _ = stream.read(&mut buf);
3515 let response = "HTTP/1.1 200 OK
3516Content-Type: application/json
3517Content-Length: 128
3518Connection: close
3519
3520{";
3521 let _ = stream.write_all(response.as_bytes());
3522 }
3523 Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3524 thread::sleep(Duration::from_millis(10));
3525 }
3526 Err(error) => panic!("accept request: {error}"),
3527 }
3528 }
3529 });
3530
3531 (format!("http://{}", addr), handle)
3532 }
3533
3534 #[test]
3535 fn response_body_read_failures_are_marked_transient() {
3536 let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3537 let client = Client::builder()
3538 .timeout(Duration::from_millis(250))
3539 .build()
3540 .expect("client");
3541
3542 let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3543 .expect_err("truncated body should fail");
3544
3545 handle.join().unwrap();
3546 assert!(
3547 embedding_failure_is_transient(&error),
3548 "body read failures should be transient-marked: {error}"
3549 );
3550 assert!(error.contains("response read failed"));
3551 }
3552
/// Stub embedder: yields the fixed vector `[1.0, 0.0, 0.0]` once for every input text.
fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    let unit = vec![1.0, 0.0, 0.0];
    Ok(vec![unit; texts.len()])
}
3556
/// Writes a tiny Rust source file to `path` that declares one public function named
/// `function_name` returning `true`. Panics if the write fails.
fn write_rust_file(path: &Path, function_name: &str) {
    let source = format!("pub fn {function_name}() -> bool {{\n true\n}}\n");
    fs::write(path, source).unwrap();
}
3564
3565 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3566 let mut embed = test_vector_for_texts;
3567 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3568 }
3569
/// Returns the current working directory, used by tests as a stand-in project root.
/// Panics if the working directory cannot be determined.
fn test_project_root() -> PathBuf {
    let cwd = std::env::current_dir();
    cwd.unwrap()
}
3573
3574 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3575 index.file_mtimes.insert(file.to_path_buf(), mtime);
3576 index.file_sizes.insert(file.to_path_buf(), size);
3577 index
3578 .file_hashes
3579 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3580 }
3581
3582 fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
3583 let mut buf = Vec::new();
3584 let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
3585 let encoded = fingerprint.as_string();
3586 if encoded.is_empty() {
3587 None
3588 } else {
3589 Some(encoded.into_bytes())
3590 }
3591 });
3592 let file_mtimes: Vec<_> = index
3593 .file_mtimes
3594 .iter()
3595 .filter_map(|(path, mtime)| {
3596 cache_relative_path(&index.project_root, path)
3597 .map(|relative| (relative, path, mtime))
3598 })
3599 .collect();
3600 let entries: Vec<_> = index
3601 .entries
3602 .iter()
3603 .filter_map(|entry| {
3604 cache_relative_path(&index.project_root, &entry.chunk.file)
3605 .map(|relative| (relative, entry))
3606 })
3607 .collect();
3608
3609 buf.push(SEMANTIC_INDEX_VERSION_V6);
3610 buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
3611 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
3612 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
3613 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
3614 buf.extend_from_slice(fp_bytes_ref);
3615
3616 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
3617 for (relative, path, mtime) in &file_mtimes {
3618 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
3619 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
3620 buf.extend_from_slice(&path_bytes);
3621 let duration = mtime
3622 .duration_since(SystemTime::UNIX_EPOCH)
3623 .unwrap_or_default();
3624 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
3625 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
3626 let size = index.file_sizes.get(*path).copied().unwrap_or_default();
3627 buf.extend_from_slice(&size.to_le_bytes());
3628 let hash = index
3629 .file_hashes
3630 .get(*path)
3631 .copied()
3632 .unwrap_or_else(cache_freshness::zero_hash);
3633 buf.extend_from_slice(hash.as_bytes());
3634 }
3635
3636 for (relative, entry) in &entries {
3637 let c = &entry.chunk;
3638 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
3639 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
3640 buf.extend_from_slice(&file_bytes);
3641
3642 let name_bytes = c.name.as_bytes();
3643 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
3644 buf.extend_from_slice(name_bytes);
3645
3646 buf.push(symbol_kind_to_u8(&c.kind));
3647 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
3648 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
3649 buf.push(c.exported as u8);
3650
3651 let snippet_bytes = c.snippet.as_bytes();
3652 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
3653 buf.extend_from_slice(snippet_bytes);
3654
3655 let embed_bytes = c.embed_text.as_bytes();
3656 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
3657 buf.extend_from_slice(embed_bytes);
3658
3659 for &val in &entry.vector {
3660 buf.extend_from_slice(&val.to_le_bytes());
3661 }
3662 }
3663
3664 buf
3665 }
3666
3667 #[derive(Default)]
3668 struct RecordingEmbedder {
3669 calls: Vec<Vec<String>>,
3670 }
3671
3672 impl RecordingEmbedder {
3673 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3674 let vectors = texts
3675 .iter()
3676 .map(|text| deterministic_test_vector(text))
3677 .collect();
3678 self.calls.push(texts);
3679 Ok(vectors)
3680 }
3681
3682 fn total_embedded_texts(&self) -> usize {
3683 self.calls.iter().map(Vec::len).sum()
3684 }
3685
3686 fn embedded_texts(&self) -> Vec<&str> {
3687 self.calls
3688 .iter()
3689 .flat_map(|batch| batch.iter().map(String::as_str))
3690 .collect()
3691 }
3692 }
3693
3694 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3695 let hash = blake3::hash(text.as_bytes());
3696 let bytes = hash.as_bytes();
3697 vec![
3698 1.0,
3699 bytes[0] as f32 / 255.0,
3700 bytes[1] as f32 / 255.0,
3701 bytes[2] as f32 / 255.0,
3702 ]
3703 }
3704
3705 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3706 let mut embedder = RecordingEmbedder::default();
3707 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3708 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3709 }
3710
3711 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3712 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3713 }
3714
/// Writes `source` to `path`, creating any missing parent directories first.
/// Panics if a directory or the file cannot be written.
fn write_source(path: &Path, source: &str) {
    let parent_dir = path.parent();
    if let Some(dir) = parent_dir {
        fs::create_dir_all(dir).unwrap();
    }
    fs::write(path, source).unwrap();
}
3721
3722 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3723 index
3724 .entries
3725 .iter()
3726 .filter(|entry| entry.chunk.file == file)
3727 .collect()
3728 }
3729
3730 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3731 index
3732 .entries
3733 .iter()
3734 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3735 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3736 }
3737
3738 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3739 index
3740 .entries
3741 .iter()
3742 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3743 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3744 }
3745
/// Prepending a blank line to a file marked stale must reuse every cached vector: nothing
/// is embedded again, the entry count is unchanged, and `alpha` reports start line 1.
#[test]
fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_file, original);
    let targets = std::slice::from_ref(&source_file);

    let mut index = build_recorded_test_index(root, targets);
    let entries_before = index.entries.len();
    let alpha_vector_before = entry_by_name(&index, &source_file, "alpha").vector.clone();

    // Shift every symbol down by one line, then make the cached metadata look outdated.
    let shifted_source = format!("\n{original}");
    write_source(&source_file, &shifted_source);
    force_stale(&mut index, &source_file);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(root, targets, &mut embed, 16, &mut progress)
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(index.entries.len(), entries_before);
    let alpha_after = entry_by_name(&index, &source_file, "alpha");
    assert_eq!(alpha_after.chunk.start_line, 1);
    assert_eq!(alpha_after.vector, alpha_vector_before);
}
3781
/// After a pure line shift, the worker's refresh update must carry every entry of the file
/// (without embedding anything), so that applying it to a separate serving copy leaves that
/// copy with the same entry count and the shifted start line.
#[test]
fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_file, original);
    let targets = std::slice::from_ref(&source_file);

    let mut worker = build_recorded_test_index(root, targets);
    let mut serving = worker.clone();
    let entries_before = worker.entries.len();

    let shifted_source = format!("\n{original}");
    write_source(&source_file, &shifted_source);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let update = worker
        .refresh_invalidated_files(root, targets, &mut embed, 16, 100, &mut progress)
        .unwrap();

    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(update.added_entries.len(), entries_before);
    assert_eq!(worker.entries.len(), entries_before);

    // Replay the worker's delta onto the copy taken before the edit.
    serving.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );

    assert_eq!(serving.entries.len(), entries_before);
    let serving_file_entries = entries_for_file(&serving, &source_file);
    assert_eq!(serving_file_entries.len(), entries_before);
    let serving_alpha = entry_by_name(&serving, &source_file, "alpha");
    assert_eq!(serving_alpha.chunk.start_line, 1);
}
3832
/// Editing only `alpha`'s body must embed exactly one text (the one naming `alpha`), keep
/// `beta`'s vector, and still report every entry of the file in the update.
#[test]
fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    let before_edit = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    let after_edit = "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_file, before_edit);
    let targets = std::slice::from_ref(&source_file);

    let mut index = build_recorded_test_index(root, targets);
    let entries_before = index.entries.len();
    let beta_vector_before = entry_by_name(&index, &source_file, "beta").vector.clone();

    write_source(&source_file, after_edit);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let update = index
        .refresh_invalidated_files(root, targets, &mut embed, 16, 100, &mut progress)
        .unwrap();

    assert_eq!(recorder.total_embedded_texts(), 1);
    assert!(recorder.embedded_texts()[0].contains("name:alpha"));
    assert_eq!(update.added_entries.len(), entries_before);
    let beta_after = entry_by_name(&index, &source_file, "beta");
    assert_eq!(beta_after.vector, beta_vector_before);
}
3871
/// When a file grows a second, byte-identical copy of an already indexed function, both
/// resulting entries must carry the original vector and nothing may be embedded.
#[test]
fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/dupe.js");
    let one_duplicate = "function duplicate() {\n return 1;\n}\n";
    write_source(&source_file, one_duplicate);
    let targets = std::slice::from_ref(&source_file);

    let mut index = build_recorded_test_index(root, targets);
    let vector_before = entry_by_name(&index, &source_file, "duplicate")
        .vector
        .clone();

    let doubled_source = format!("{one_duplicate}\n{one_duplicate}");
    write_source(&source_file, &doubled_source);

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    index
        .refresh_invalidated_files(root, targets, &mut embed, 16, 100, &mut progress)
        .unwrap();

    let mut duplicates = Vec::new();
    for entry in index.entries.iter() {
        if entry.chunk.file == source_file && entry.chunk.name == "duplicate" {
            duplicates.push(entry);
        }
    }
    assert_eq!(duplicates.len(), 2);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(duplicates[0].vector, vector_before);
    assert_eq!(duplicates[1].vector, vector_before);
}
3909
/// Two-phase check of file-summary reuse:
/// 1. editing only a function body embeds just that function and keeps the summary vector;
/// 2. editing the leading `//!` doc line embeds just the file summary and changes its vector.
#[test]
fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    let initial = "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n";
    let body_edited = "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n";
    let doc_edited = "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n";
    write_source(&source_file, initial);
    let targets = std::slice::from_ref(&source_file);

    let mut index = build_recorded_test_index(root, targets);
    let summary_vector_before = file_summary_entry(&index, &source_file).vector.clone();
    let mut progress = |_done: usize, _total: usize| {};

    // Phase 1: body-only edit.
    write_source(&source_file, body_edited);
    let mut body_recorder = RecordingEmbedder::default();
    let mut body_embed = |texts: Vec<String>| body_recorder.embed(texts);
    index
        .refresh_invalidated_files(root, targets, &mut body_embed, 16, 100, &mut progress)
        .unwrap();
    assert_eq!(body_recorder.total_embedded_texts(), 1);
    assert!(body_recorder.embedded_texts()[0].contains("name:alpha"));
    assert_eq!(
        file_summary_entry(&index, &source_file).vector,
        summary_vector_before
    );

    // Phase 2: leading doc-comment edit.
    write_source(&source_file, doc_edited);
    let mut doc_recorder = RecordingEmbedder::default();
    let mut doc_embed = |texts: Vec<String>| doc_recorder.embed(texts);
    index
        .refresh_invalidated_files(root, targets, &mut doc_embed, 16, 100, &mut progress)
        .unwrap();

    assert_eq!(doc_recorder.total_embedded_texts(), 1);
    assert!(doc_recorder.embedded_texts()[0].contains("kind:file-summary"));
    assert_ne!(
        file_summary_entry(&index, &source_file).vector,
        summary_vector_before
    );
}
3965
/// Refreshing a file that has been removed from disk must count one deletion, embed
/// nothing, and leave both the worker index and an updated serving copy without entries.
#[test]
fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    write_source(&source_file, "pub fn alpha() -> i32 {\n 1\n}\n");
    let targets = std::slice::from_ref(&source_file);

    let mut worker = build_recorded_test_index(root, targets);
    let mut serving = worker.clone();
    fs::remove_file(&source_file).unwrap();

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let update = worker
        .refresh_invalidated_files(root, targets, &mut embed, 16, 100, &mut progress)
        .unwrap();

    assert_eq!(update.summary.deleted, 1);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert!(worker.entries.is_empty());

    serving.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );
    assert!(serving.entries.is_empty());
}
4002
/// Overwriting an indexed file with bytes that are not valid UTF-8 and refreshing it must
/// embed nothing, add no entries, and drop the file's entries and mtime from both the
/// worker index and an updated serving copy.
#[test]
fn watcher_collect_failure_does_not_resurrect_stale_entries() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    write_source(&source_file, "pub fn alpha() -> i32 {\n 1\n}\n");
    let targets = std::slice::from_ref(&source_file);

    let mut worker = build_recorded_test_index(root, targets);
    let mut serving = worker.clone();
    let not_utf8 = [0xff, 0xfe, 0xfd];
    fs::write(&source_file, not_utf8).unwrap();

    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let update = worker
        .refresh_invalidated_files(root, targets, &mut embed, 16, 100, &mut progress)
        .unwrap();

    assert_eq!(recorder.total_embedded_texts(), 0);
    assert!(update.added_entries.is_empty());
    assert!(worker.entries.is_empty());
    assert!(!worker.file_mtimes.contains_key(&source_file));

    serving.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );
    assert!(serving.entries.is_empty());
    assert!(!serving.file_mtimes.contains_key(&source_file));
}
4041
/// With one file already indexed and the numeric argument after the batch size set to 1
/// (presumably the indexed-file cap), refreshing a second, unindexed file must defer it:
/// it is processed but not added or embedded, and it lands in `deferred_files`.
#[test]
fn refresh_invalidated_cap_deferral_remains_file_count_based() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let indexed_file = root.join("src/a.rs");
    let deferred_file = root.join("src/b.rs");
    write_source(&indexed_file, "pub fn alpha() -> i32 {\n 1\n}\n");
    write_source(&deferred_file, "pub fn beta() -> i32 {\n 2\n}\n");

    let mut index = build_recorded_test_index(root, std::slice::from_ref(&indexed_file));
    let mut recorder = RecordingEmbedder::default();
    let mut embed = |texts: Vec<String>| recorder.embed(texts);
    let mut progress = |_done: usize, _total: usize| {};
    let refresh_targets = std::slice::from_ref(&deferred_file);
    let update = index
        .refresh_invalidated_files(root, refresh_targets, &mut embed, 16, 1, &mut progress)
        .unwrap();

    assert_eq!(update.summary.total_processed, 1);
    assert_eq!(update.summary.added, 0);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(index.indexed_file_count(), 1);
    assert!(index.deferred_files.contains(&deferred_file));
    assert!(entries_for_file(&index, &deferred_file).is_empty());
}
4073
/// An index holding only metadata and one entry for `<project>/../outside.rs` must
/// round-trip through `to_bytes`/`from_bytes` as an index with no entries and no mtimes.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let project = fs::canonicalize(dir.path()).expect("canonical project");
    let outside = project.join("..").join("outside.rs");

    let mut index = SemanticIndex::new(project.clone(), 3);
    index
        .file_mtimes
        .insert(outside.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(outside.clone(), 1);
    index
        .file_hashes
        .insert(outside.clone(), cache_freshness::zero_hash());

    let outside_chunk = SemanticChunk {
        file: outside,
        name: "outside".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text: "outside".to_string(),
        snippet: "outside".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk: outside_chunk,
        vector: vec![1.0, 0.0, 0.0],
    });

    let serialized = index.to_bytes();
    let reloaded =
        SemanticIndex::from_bytes(&serialized, &project).expect("load serialized index");
    assert_eq!(reloaded.entries.len(), 0);
    assert!(reloaded.file_mtimes.is_empty());
}
4106
/// Compares `SemanticIndex::search` against a reference ranking computed here: cosine
/// similarity per entry, multiplied by 1.1 for exported chunks, stably sorted descending
/// and cut to `top_k`. Also pins tie order (`alpha` before `gamma`) and that `top_k == 0`
/// returns nothing.
#[test]
fn semantic_search_bounded_top_k_matches_reference_full_sort() {
    let project_root = test_project_root();
    let file = project_root.join("src/lib.rs");
    let mut index = SemanticIndex::new(project_root, 2);
    let fixtures = [
        ("alpha", vec![1.0, 0.0], false),
        ("beta", vec![0.0, 1.0], false),
        ("gamma", vec![1.0, 0.0], false),
        ("delta", vec![0.5, 0.5], true),
        ("epsilon", vec![-1.0, 0.0], false),
    ];
    for (position, (name, vector, exported)) in fixtures.into_iter().enumerate() {
        let line = position as u32 + 1;
        let chunk = SemanticChunk {
            file: file.clone(),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: line,
            end_line: line,
            exported,
            embed_text: name.to_string(),
            snippet: format!("fn {name}() {{}}"),
        };
        index.entries.push(EmbeddingEntry { chunk, vector });
    }

    let query = vec![1.0, 0.0];
    let top_k = 4;

    // Reference ranking: score everything, stable-sort descending, keep the first `top_k`.
    let mut scored: Vec<(f32, usize)> = Vec::with_capacity(index.entries.len());
    for (idx, entry) in index.entries.iter().enumerate() {
        let mut score = cosine_similarity(&query, &entry.vector);
        if entry.chunk.exported {
            score *= 1.1;
        }
        scored.push((score, idx));
    }
    scored.sort_by(|lhs, rhs| {
        rhs.0
            .partial_cmp(&lhs.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    scored.truncate(top_k);
    let expected: Vec<(String, f32)> = scored
        .into_iter()
        .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
        .collect();

    let mut actual: Vec<(String, f32)> = Vec::new();
    for result in index.search(&query, top_k) {
        actual.push((result.name, result.score));
    }

    let actual_names: Vec<&String> = actual.iter().map(|(name, _)| name).collect();
    let expected_names: Vec<&String> = expected.iter().map(|(name, _)| name).collect();
    assert_eq!(actual_names, expected_names);
    for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
        assert!((actual_score - expected_score).abs() < 1e-6);
    }
    assert_eq!(actual[0].0, "alpha");
    assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
    assert!(index.search(&query, 0).is_empty());
}
4173
/// Identical vectors have cosine similarity 1 (within 0.001).
#[test]
fn test_cosine_similarity_identical() {
    let lhs = vec![1.0, 0.0, 0.0];
    let rhs = lhs.clone();
    let similarity = cosine_similarity(&lhs, &rhs);
    assert!((similarity - 1.0).abs() < 0.001);
}
4180
/// Orthogonal vectors have cosine similarity 0 (within 0.001).
#[test]
fn test_cosine_similarity_orthogonal() {
    let x_axis = vec![1.0, 0.0, 0.0];
    let y_axis = vec![0.0, 1.0, 0.0];
    let similarity = cosine_similarity(&x_axis, &y_axis);
    assert!(similarity.abs() < 0.001);
}
4187
/// Opposite vectors have cosine similarity -1 (within 0.001).
#[test]
fn test_cosine_similarity_opposite() {
    let forward = vec![1.0, 0.0, 0.0];
    let backward = vec![-1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&forward, &backward);
    assert!((similarity + 1.0).abs() < 0.001);
}
4194
/// A one-entry index with a fingerprint must survive `to_bytes`/`from_bytes` with its
/// entry name, vector, dimension, and backend/model labels intact.
#[test]
fn test_serialization_roundtrip() {
    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");

    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: file.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {\n // ...\n}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3, 0.4],
    });
    // The index was created with the default dimension; match it to the 4-wide vector.
    index.dimension = 4;
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);
    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint);

    let serialized = index.to_bytes();
    let restored = SemanticIndex::from_bytes(&serialized, &project_root).unwrap();

    assert_eq!(restored.entries.len(), 1);
    let restored_entry = &restored.entries[0];
    assert_eq!(restored_entry.chunk.name, "handle_request");
    assert_eq!(restored_entry.vector, vec![0.1, 0.2, 0.3, 0.4]);
    assert_eq!(restored.dimension, 4);
    assert_eq!(restored.backend_label(), Some("fastembed"));
    assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
}
4236
/// Checks three things for a two-entry index:
/// - `to_bytes` equals the hand-rolled `legacy_semantic_index_bytes` encoding;
/// - `write_to_disk` produces a `semantic/proj/semantic.bin` with exactly those bytes;
/// - `read_from_disk` restores every field, and re-encoding the result is byte-identical.
#[test]
fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
    let storage = tempfile::tempdir().expect("create storage dir");
    let project = storage.path().join("project");
    fs::create_dir_all(project.join("src")).expect("create project src");
    let file = project.join("src/lib.rs");
    fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
    // Canonicalize both paths before they are stored in the index.
    let project_root = fs::canonicalize(&project).expect("canonical project");
    let file = fs::canonicalize(&file).expect("canonical file");

    let mut index = SemanticIndex::new(project_root.clone(), 3);
    let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
    index.file_mtimes.insert(file.clone(), mtime);
    index.file_sizes.insert(file.clone(), 42);
    index
        .file_hashes
        .insert(file.clone(), cache_freshness::zero_hash());

    let alpha_chunk = SemanticChunk {
        file: file.clone(),
        name: "alpha".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
        snippet: "pub fn alpha() {}".to_string(),
    };
    let beta_chunk = SemanticChunk {
        file: file.clone(),
        name: "beta".to_string(),
        kind: SymbolKind::Function,
        start_line: 1,
        end_line: 1,
        exported: true,
        embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
        snippet: "pub fn beta() {}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk: alpha_chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.entries.push(EmbeddingEntry {
        chunk: beta_chunk,
        vector: vec![0.4, 0.5, 0.6],
    });

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // In-memory encoding vs. the reference encoder.
    let legacy_bytes = legacy_semantic_index_bytes(&index);
    assert_eq!(index.to_bytes(), legacy_bytes);

    // On-disk encoding vs. the reference encoder.
    index.write_to_disk(storage.path(), "proj");
    let data_path = storage.path().join("semantic/proj/semantic.bin");
    let on_disk = fs::read(&data_path).expect("read semantic.bin");
    assert_eq!(on_disk, legacy_bytes);

    let expected_fingerprint = fingerprint.as_string();
    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        "proj",
        &project_root,
        false,
        Some(&expected_fingerprint),
    )
    .expect("load semantic index");

    assert_eq!(loaded.entries.len(), index.entries.len());
    assert_eq!(loaded.dimension, index.dimension);
    assert_eq!(
        loaded.fingerprint().unwrap().as_string(),
        fingerprint.as_string()
    );
    assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
    assert_eq!(loaded.file_sizes.get(&file), Some(&42));
    assert_eq!(
        loaded.file_hashes.get(&file),
        Some(&cache_freshness::zero_hash())
    );
    for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
        let (got, want) = (&actual.chunk, &expected.chunk);
        assert_eq!(got.file, want.file);
        assert_eq!(got.name, want.name);
        assert_eq!(got.kind, want.kind);
        assert_eq!(got.start_line, want.start_line);
        assert_eq!(got.end_line, want.end_line);
        assert_eq!(got.exported, want.exported);
        assert_eq!(got.embed_text, want.embed_text);
        assert_eq!(got.snippet, want.snippet);
        assert_eq!(actual.vector, expected.vector);
    }
    assert_eq!(loaded.to_bytes(), legacy_bytes);
}
4332
/// Pins the byte code of each listed `SymbolKind` (0 through 9, `FileSummary` last) and
/// checks that decoding the byte gives the same kind back.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    let expected_codes = [
        (SymbolKind::Function, 0),
        (SymbolKind::Class, 1),
        (SymbolKind::Method, 2),
        (SymbolKind::Struct, 3),
        (SymbolKind::Interface, 4),
        (SymbolKind::Enum, 5),
        (SymbolKind::TypeAlias, 6),
        (SymbolKind::Variable, 7),
        (SymbolKind::Heading, 8),
        (SymbolKind::FileSummary, 9),
    ];

    for (kind, code) in expected_codes {
        let encoded = symbol_kind_to_u8(&kind);
        assert_eq!(encoded, code);
        let decoded = u8_to_symbol_kind(code);
        assert_eq!(decoded, kind);
    }
}
4353
/// With three one-hot entries and a query closest to the first axis, a top-2 search must
/// return two results, led by `auth`, with strictly decreasing scores.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    for (slot, name) in ["auth", "database", "handler"].iter().enumerate() {
        // Each entry points along its own axis.
        let mut one_hot = vec![0.0f32; 3];
        one_hot[slot] = 1.0;

        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: (slot * 10 + 1) as u32,
            end_line: (slot * 10 + 5) as u32,
            exported: true,
            embed_text: format!("kind:function name:{}", name),
            snippet: format!("fn {}() {{}}", name),
        };
        index.entries.push(EmbeddingEntry {
            chunk,
            vector: one_hot,
        });
    }

    let query = vec![0.9, 0.1, 0.0];
    let results = index.search(&query, 2);

    assert_eq!(results.len(), 2);
    assert_eq!(results[0].name, "auth");
    assert!(results[0].score > results[1].score);
}
4386
/// Searching an index that holds no entries returns no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let hits = empty_index.search(&[0.1, 0.2, 0.3], 10);
    assert!(hits.is_empty());
}
4393
/// A symbol whose range starts and ends on line 0 must produce that whole source line
/// (without the trailing newline) as its snippet.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";
    let single_line = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: "answer".to_string(),
        kind: SymbolKind::Variable,
        range: single_line,
        signature: Some("const answer = 42".to_string()),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    let snippet = build_snippet(&symbol, source);

    assert_eq!(snippet, "export const answer = 42;");
}
4416
/// For this crate's own `src/semantic_index.rs`, `collect_file_chunks` must yield the same
/// chunk fingerprints as `FileParser::extract_symbols` followed by `symbols_to_chunks`.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let file = project_root.join("src/semantic_index.rs");
    let source = std::fs::read_to_string(&file).unwrap();

    // Reference path: parse symbols with `FileParser`, then convert them to chunks.
    let mut reference_parser = FileParser::new();
    let reference_symbols = reference_parser.extract_symbols(&file).unwrap();
    let reference_chunks = symbols_to_chunks(&file, &reference_symbols, &source, &project_root);

    // Path under test.
    let mut parsers = HashMap::new();
    let collected_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();

    let collected_fingerprint = chunk_fingerprint(&collected_chunks);
    let reference_fingerprint = chunk_fingerprint(&reference_chunks);
    assert_eq!(collected_fingerprint, reference_fingerprint);
}
4435
4436 fn chunk_fingerprint(
4437 chunks: &[SemanticChunk],
4438 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4439 chunks
4440 .iter()
4441 .map(|chunk| {
4442 (
4443 chunk.name.clone(),
4444 chunk.kind.clone(),
4445 chunk.start_line,
4446 chunk.end_line,
4447 chunk.exported,
4448 chunk.embed_text.clone(),
4449 chunk.snippet.clone(),
4450 )
4451 })
4452 .collect()
4453 }
4454
/// A file larger than `MAX_SEMANTIC_FILE_BYTES` must yield no chunks, while a small file
/// handled with the same parser map afterwards must still yield some.
#[test]
fn collect_file_chunks_skips_oversized_file() {
    let dir = tempfile::tempdir().unwrap();

    // Enough repetitions to pass the limit, plus 16 extra lines of margin.
    let line = "export const x = 1;\n";
    let repetitions = ((MAX_SEMANTIC_FILE_BYTES as usize) / line.len()) + 16;
    let filler = line.repeat(repetitions);
    let big = dir.path().join("huge.ts");
    std::fs::write(&big, &filler).unwrap();
    assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);

    let mut parsers = HashMap::new();
    let big_chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
    assert!(big_chunks.is_empty(), "oversized file must yield no chunks");

    let small = dir.path().join("small.ts");
    std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
    let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
    assert!(!small_chunks.is_empty(), "small file should still chunk");
}
4477
/// A 13-byte buffer (leading byte 1, then three little-endian `u32` values) whose first
/// `u32` is `MAX_DIMENSION + 1` must be rejected by `from_bytes`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    let mut header = vec![1u8];
    header.extend_from_slice(&oversized_dimension.to_le_bytes());
    header.extend_from_slice(&0u32.to_le_bytes());
    header.extend_from_slice(&0u32.to_le_bytes());

    let parsed = SemanticIndex::from_bytes(&header, &test_project_root());
    assert!(parsed.is_err());
}
4488
/// A 13-byte buffer (leading byte 1, then three little-endian `u32` values) whose second
/// `u32` is `MAX_ENTRIES + 1` must be rejected by `from_bytes`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let oversized_entry_count = (MAX_ENTRIES as u32) + 1;

    let mut header = vec![1u8];
    header.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
    header.extend_from_slice(&oversized_entry_count.to_le_bytes());
    header.extend_from_slice(&0u32.to_le_bytes());

    let parsed = SemanticIndex::from_bytes(&header, &test_project_root());
    assert!(parsed.is_err());
}
4499
/// `invalidate_file` must drop the file's entries together with its recorded mtime and
/// size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);

    let chunk = SemanticChunk {
        file: target.clone(),
        name: "main".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: "main".to_string(),
        snippet: "fn main() {}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    let mtime_still_tracked = index.file_mtimes.contains_key(&target);
    let size_still_tracked = index.file_sizes.contains_key(&target);
    assert!(!mtime_still_tracked);
    assert!(!size_still_tracked);
}
4528
/// Indexes a real file, passes an epoch mtime and a size one byte larger to
/// `set_file_metadata`, then deletes the file from disk. `refresh_stale_files`
/// is still handed the path and must report exactly one deletion and drop the
/// entries plus the mtime, size and hash records for it.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "vanished_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let indexed_size = *index.file_sizes.get(&source_path).unwrap();
    set_file_metadata(
        &mut index,
        &source_path,
        SystemTime::UNIX_EPOCH,
        indexed_size + 1,
    );
    fs::remove_file(&source_path).unwrap();

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&source_path);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 1);
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&source_path));
    assert!(!index.file_sizes.contains_key(&source_path));
    assert!(!index.file_hashes.contains_key(&source_path));
}
4562
/// Indexes a file, passes stale metadata for it to `set_file_metadata`, then
/// replaces the file with a directory of the same name so the path still
/// exists but is no longer a regular file. After `refresh_stale_files` the
/// summary must be all zeros, the cached entry must survive, and the stored
/// mtime and size must still be the stale values.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "kept_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let entries_before = index.entries.len();
    let mtime_before = *index.file_mtimes.get(&source_path).unwrap();
    let size_before = *index.file_sizes.get(&source_path).unwrap();

    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = size_before + 1;
    set_file_metadata(&mut index, &source_path, stale_mtime, stale_size);

    // Swap the regular file for a directory at the same path.
    fs::remove_file(&source_path).unwrap();
    fs::create_dir(&source_path).unwrap();

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&source_path);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(index.entries.len(), entries_before);
    assert!(index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol"));
    assert_eq!(index.file_mtimes.get(&source_path), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&source_path), Some(&mtime_before));
    assert_eq!(index.file_sizes.get(&source_path), Some(&stale_size));
}
4605
/// Hands `refresh_stale_files` a path that was never written to disk (only its
/// parent directory exists) on an empty index, and checks that nothing is
/// counted and no mtime, size or entry is recorded for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let absent_path = project_root.join("src/missing.rs");
    // Create only the parent directory; the file itself never exists.
    fs::create_dir_all(absent_path.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&absent_path);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert!(!index.file_mtimes.contains_key(&absent_path));
    assert!(!index.file_sizes.contains_key(&absent_path));
    assert!(index.entries.is_empty());
}
4633
/// Builds an index over one file, then refreshes with that file plus a second
/// one the index has not seen. The summary must count one addition and two
/// processed files, and the new file must gain an mtime record and an entry.
#[test]
fn refresh_reports_added_for_new_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let known_path = project_root.join("src/lib.rs");
    let new_path = project_root.join("src/new.rs");
    fs::create_dir_all(known_path.parent().unwrap()).unwrap();
    write_rust_file(&known_path, "existing_symbol");
    write_rust_file(&new_path, "added_symbol");

    // Only the first file is part of the initial index.
    let mut index = build_test_index(project_root, std::slice::from_ref(&known_path));

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = [known_path.clone(), new_path.clone()];
    let summary = index
        .refresh_stale_files(project_root, &current_files, &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&new_path));
    assert!(index
        .entries
        .iter()
        .any(|entry| entry.chunk.file == new_path));
}
4664
/// Indexes a file, removes it from disk, and refreshes with an empty file
/// list. The summary must count one deletion and one processed file, and the
/// index must hold no entries and no mtime record for the path.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let removed_path = project_root.join("src/deleted.rs");
    fs::create_dir_all(removed_path.parent().unwrap()).unwrap();
    write_rust_file(&removed_path, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&removed_path));
    fs::remove_file(&removed_path).unwrap();

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    // The current file list is empty: the indexed file is no longer offered.
    let summary = index
        .refresh_stale_files(project_root, &[], &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);
    assert!(!index.file_mtimes.contains_key(&removed_path));
    assert!(index.entries.is_empty());
}
4689
/// Indexes a file, passes an epoch mtime and zero size to `set_file_metadata`,
/// and rewrites the file with a different symbol. The refresh must count one
/// change, and the index must contain the new symbol and no longer the old one.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    set_file_metadata(&mut index, &source_path, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_path, "new_symbol");

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&source_path);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    let has_symbol = |wanted: &str| index.entries.iter().any(|entry| entry.chunk.name == wanted);
    assert!(has_symbol("new_symbol"));
    assert!(!has_symbol("old_symbol"));
}
4727
/// Refreshes an index whose only file is untouched since it was built. The
/// summary must be a no-op with one processed file, the embedding callback
/// must never run, and the entry count must stay the same.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_path));
    let entries_before = index.entries.len();

    // Flag flipped by the embedder so the test can observe whether it ran.
    let mut embedder_ran = false;
    let mut embedder = |texts: Vec<String>| {
        embedder_ran = true;
        test_vector_for_texts(texts)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&source_path);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embedder_ran);
    assert_eq!(index.entries.len(), entries_before);
}
4759
/// A dlopen failure message naming `libonnxruntime.dylib` must be classified
/// by `is_onnx_runtime_unavailable` as a missing runtime.
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);

    assert!(unavailable);
}
4766
/// For a dlopen failure, `format_embedding_init_error` must return a message
/// that starts with the install hint and contains an "Original error:" part.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let dlopen_failure =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(dlopen_failure);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
4776
/// Points an OpenAI-compatible backend at a mock HTTP server. The server
/// asserts it receives a POST to `/v1/embeddings` and replies with two
/// embeddings; the test checks the vectors returned by `embed`.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (server_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#
            .to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&backend_config).unwrap();

    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    // Joining propagates any assertion failure raised inside the handler.
    server.join().unwrap();
}
4803
/// Captures the raw bytes of one embedding request on a throwaway TCP
/// listener and checks that it carries exactly one `Content-Type` header and
/// that the JSON body includes the configured model name.
///
/// The capture thread reads until it has the full header block plus
/// `Content-Length` bytes of body, stores the bytes, and answers with a
/// minimal one-embedding JSON response.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    // Port 0 lets the OS pick a free port; `addr` carries it to the config.
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // Header names are case-insensitive and clients may
                        // send them lowercase, so compare without regard to
                        // case. Otherwise the length stays 0 and the loop
                        // can stop before the body has arrived.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            // Stop once the headers and the announced body are fully buffered.
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best effort: the client may already have closed the connection.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively so duplicates in any casing show up.
    let content_type_lines = request
        .lines()
        .filter(|line| line.to_ascii_lowercase().starts_with("content-type:"))
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
4896
/// Points an Ollama backend at a mock HTTP server. The server asserts it
/// receives a POST to `/api/embed` and replies with two embeddings; the test
/// checks the vectors returned by `embed`.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (server_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#.to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: "embeddinggemma".to_string(),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&backend_config).unwrap();

    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
    // Joining propagates any assertion failure raised inside the handler.
    server.join().unwrap();
}
4923
/// Writes an index with an `openai_compatible` fingerprint to disk, then reads
/// it back twice: once with the matching fingerprint string (must load) and
/// once with an `ollama` fingerprint string (must return `None`).
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";
    let project_root = test_project_root();
    let source_path = project_root.join("src/main.rs");

    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };

    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    // Align the index dimension with the 3-element vector pushed above.
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);
    index.set_fingerprint(SemanticIndexFingerprint {
        backend: "openai_compatible".to_string(),
        model: "test-embedding".to_string(),
        base_url: "http://127.0.0.1:1234/v1".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    });
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint as the one stored: the cache must load.
    let stored_fingerprint = index.fingerprint().unwrap().as_string();
    let accepted = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&stored_fingerprint),
    );
    assert!(accepted.is_some());

    // Different backend, model and base URL: the cache must be refused.
    let other_fingerprint = SemanticIndexFingerprint {
        backend: "ollama".to_string(),
        model: "embeddinggemma".to_string(),
        base_url: "http://127.0.0.1:11434".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    }
    .as_string();
    let rejected = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&other_fingerprint),
    );
    assert!(rejected.is_none());
}
4986
/// Serializes an index, overwrites the first byte with
/// `SEMANTIC_INDEX_VERSION_V3`, and writes the result as `semantic.bin`.
/// `read_from_disk` must return `None` even with a matching fingerprint, and
/// the cache file must no longer exist afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let source_path = PathBuf::from("/src/main.rs");
    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    // Align the index dimension with the 3-element vector pushed above.
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "test".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Overwrite only the leading byte so the payload claims to be version 3.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, serialized).unwrap();

    let expected_fingerprint = fingerprint.as_string();
    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&expected_fingerprint),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
5036
5037 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5038 crate::symbols::Symbol {
5039 name: name.to_string(),
5040 kind,
5041 range: crate::symbols::Range {
5042 start_line: start,
5043 start_col: 0,
5044 end_line: end,
5045 end_col: 0,
5046 },
5047 signature: None,
5048 scope_chain: Vec::new(),
5049 exported: false,
5050 parent: None,
5051 }
5052 }
5053
/// When every symbol passed to `symbols_to_chunks` is a `Heading`, the
/// result must be empty.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let readme = project_root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let headings = vec![
        make_symbol(SymbolKind::Heading, "Title", 0, 2),
        make_symbol(SymbolKind::Heading, "Section", 4, 6),
    ];

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &project_root);
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
5076
/// Gives a symbol a 16,000-character signature and checks that the text from
/// `build_embed_text` is at most `MAX_EMBED_TEXT_CHARS` characters long.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let manifest = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    // "kubectl " is 8 characters, so this signature is 16,000 characters.
    let oversized_signature = "kubectl ".repeat(2000);
    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some(oversized_signature);

    let text = build_embed_text(&symbol, source, &manifest, &project_root);
    assert!(
        text.chars().count() <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        text.chars().count()
    );
}
5101
/// Mixes one `Heading` with a `Function` and a `Struct` symbol and expects
/// `symbols_to_chunks` to return three chunks: one `FileSummary` chunk plus
/// the two code symbols, with no chunk named after the heading.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let source_path = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n true\n}\n";

    let mixed_symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&source_path, &mixed_symbols, source, &project_root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);

    let names: Vec<&str> = chunks.iter().map(|chunk| chunk.name.as_str()).collect();
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
5136
/// Every `localhost`-style URL in the list must be accepted by
/// `validate_base_url_no_ssrf`.
///
/// The validator runs once per URL and that result is reused in the failure
/// message, so the reported value is always the one the assertion saw.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
5155
/// Every URL in the list uses a 127.x.x.x address and must be accepted by
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    const LOOPBACK_URLS: [&str; 4] = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in LOOPBACK_URLS.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
5174
/// Every URL in the list uses a non-loopback address (192.168.x, 10.x,
/// 172.16.x, 169.254.x, 100.64.x) and must be rejected by
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    const PRIVATE_URLS: [&str; 5] = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in PRIVATE_URLS.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    }
}
5196
/// Every URL in the list has a `.local` hostname and must be rejected by
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    const MDNS_URLS: [&str; 3] = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in MDNS_URLS.iter() {
        let outcome = validate_base_url_no_ssrf(host);
        assert!(
            outcome.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            outcome
        );
    }
}
5214
/// `normalize_base_url` must accept both a 127.0.0.1 URL and a `localhost`
/// URL, each with an explicit port.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let loopback_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(loopback_ip.is_ok());

    let loopback_name = normalize_base_url("http://localhost:8080");
    assert!(loopback_name.is_ok());
}
5222
/// Exercises `is_private_non_loopback_ip` on parsed addresses. Private,
/// link-local, 100.64.x, 198.18.x, multicast and IPv6 ULA/link-local
/// addresses must be blocked; IPv4, IPv6 and IPv4-mapped loopback and the
/// public 8.8.8.8 must not be.
#[test]
fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
    use std::net::IpAddr;

    // Parses the literal (panicking on malformed input) and asks the guard.
    let blocked = |text: &str| {
        let ip: IpAddr = text.parse().unwrap();
        is_private_non_loopback_ip(&ip)
    };

    // Addresses that must be blocked.
    assert!(blocked("10.0.0.1"));
    assert!(blocked("192.168.1.1"));
    assert!(blocked("169.254.0.1"));
    assert!(blocked("100.64.0.1"));
    assert!(
        blocked("198.18.0.1"),
        "RFC2544 benchmark range must be blocked"
    );
    assert!(blocked("224.0.0.1"), "multicast must be blocked");
    assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
    assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");

    // Loopback in every spelling must stay allowed.
    assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
    assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
    assert!(
        !blocked("::ffff:127.0.0.1"),
        "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
    );

    // A public address must not be blocked.
    assert!(!blocked("8.8.8.8"));
}
5253
/// Checks the text from `format_ort_version_mismatch`: it must mention the
/// detected version, the library path and the `v1.20+` requirement, place the
/// "Auto-fix" text before the "Remove the old library" text, and include the
/// `npx @cortexkit/aft doctor --fix` command.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_path = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_path);

    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains(system_path),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering check: the automatic remedy must appear before the manual one.
    let auto_fix_at = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_remove_at = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_at < manual_remove_at,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
5294
/// The bare requested name `libonnxruntime.so` yields no version. When a
/// resolved path `…/libonnxruntime.so.1.19.0` is supplied, detection must
/// return version `1.19.0` with that path as the source, and the mismatch
/// message must mention both.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested = "libonnxruntime.so";
    let resolved = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The requested name alone gives no version.
    assert_eq!(detect_ort_version_from_path(requested), None);

    let (version, source) =
        detect_ort_version_from_resolved_or_requested(Some(resolved.to_string()), requested);
    assert_eq!(version, Some("1.19.0".to_string()));
    assert_eq!(source, resolved);

    let detected = version.unwrap();
    let message = format_ort_version_mismatch(&detected, &source);
    assert!(message.contains("v1.19.0"));
    assert!(message.contains(resolved));
}
5312
/// For a Homebrew `.dylib` path, the mismatch message must include the
/// detected version and the path, and the path must also appear wrapped in
/// single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib_path = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib_path);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains(dylib_path));
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
5329}