1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Default embedding dimension.
// NOTE(review): not referenced in the visible part of the file; presumably the
// vector length of the default local model — confirm.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): presumably an upper bound on index entries accepted from a
// persisted cache — usage is outside this view, confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): presumably the on-disk header sizes of index format versions
// 1 and 2 — the reader/writer is outside this view, confirm.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing install hint that `format_embedding_init_error` prepends when
/// the failure looks like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags for the persisted semantic index format.
// NOTE(review): what changed between versions is not visible here.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used by `SemanticEmbeddingModel::from_config` when the
/// configured `timeout_ms` is 0 (applied regardless of backend, despite the name).
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used by `SemanticEmbeddingModel::from_config` when the
/// configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Entry count at which `embed_query_cached` evicts its oldest cached query
/// before inserting a new one.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint's `base_url` when the config has no
/// base URL or it fails normalization.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before each retry, indexed by the zero-based attempt
/// that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it tries to
/// take the on-disk lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
63pub struct SemanticIndexLock {
64 _guard: fs_lock::LockGuard,
65}
66
67impl SemanticIndexLock {
68 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69 let dir = storage_dir.join("semantic").join(project_key);
70 fs::create_dir_all(&dir)?;
71 let path = dir.join("cache.lock");
72 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73 .lock()
74 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75 fs_lock::try_acquire(&path, Duration::from_secs(2))
76 .map(|guard| Self { _guard: guard })
77 .map_err(|error| match error {
78 fs_lock::AcquireError::Timeout => {
79 std::io::Error::other("timed out acquiring semantic cache lock")
80 }
81 fs_lock::AcquireError::Io(error) => error,
82 })
83 }
84}
85
/// Identity of the embedding setup an index was built with. It is serialized
/// to JSON by `as_string` and compared as a string by `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier (`SemanticBackend::as_str()` of the configured backend).
    pub backend: String,
    /// Configured embedding model name.
    pub model: String,
    /// Normalized base URL, or the `"none"` placeholder when the config has no
    /// usable base URL. Defaults to an empty string when absent from the
    /// serialized form.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; defaults to `default_chunking_version()` when
    /// absent from the serialized form.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
96
/// Chunking version written into new fingerprints and assumed for serialized
/// fingerprints that lack the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103 let base_url = config
106 .base_url
107 .as_ref()
108 .and_then(|u| normalize_base_url(u).ok())
109 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110 Self {
111 backend: config.backend.as_str().to_string(),
112 model: config.model.clone(),
113 base_url,
114 dimension,
115 chunking_version: default_chunking_version(),
116 }
117 }
118
119 pub fn as_string(&self) -> String {
120 serde_json::to_string(self).unwrap_or_else(|_| String::new())
121 }
122
123 fn matches_expected(&self, expected: &str) -> bool {
124 let encoded = self.as_string();
125 !encoded.is_empty() && encoded == expected
126 }
127}
128
/// Backend-specific state used by `SemanticEmbeddingModel` to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedder, constructed for `SemanticBackend::Fastembed`.
    Local(LocalEmbedder),
    /// Remote endpoint queried with an OpenAI-style `{input, model}` request.
    OpenAiCompatible {
        /// Blocking HTTP client used for requests.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL the endpoint path is appended to.
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote Ollama endpoint.
    Ollama {
        /// Blocking HTTP client used for requests.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL the endpoint path is appended to.
        base_url: String,
    },
}
145
/// Embedding model facade: wraps the selected backend engine, remembers the
/// effective settings, and keeps a small cache of query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend selected in the config.
    backend: SemanticBackend,
    /// Model name as configured.
    model: String,
    /// Base URL exactly as configured (the normalized form lives in `engine`).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (default substituted for 0).
    timeout_ms: u64,
    /// Effective maximum batch size (default substituted for 0).
    max_batch_size: usize,
    /// Embedding dimension once known; set by `dimension()` and by remote
    /// `embed_texts` calls.
    dimension: Option<usize>,
    /// Backend-specific engine that performs the embedding.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to embed.
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163 vectors: &[Vec<f32>],
164 expected_count: usize,
165 context: &str,
166) -> Result<(), String> {
167 if expected_count > 0 && vectors.is_empty() {
168 return Err(format!(
169 "{context} returned no vectors for {expected_count} inputs"
170 ));
171 }
172
173 if vectors.len() != expected_count {
174 return Err(format!(
175 "{context} returned {} vectors for {} inputs",
176 vectors.len(),
177 expected_count
178 ));
179 }
180
181 let Some(first_vector) = vectors.first() else {
182 return Ok(());
183 };
184 let expected_dimension = first_vector.len();
185 validate_embedding_dimension(expected_dimension)
186 .map_err(|error| format!("{context} returned {error}"))?;
187 for (index, vector) in vectors.iter().enumerate() {
188 if vector.len() != expected_dimension {
189 return Err(format!(
190 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191 vector.len()
192 ));
193 }
194 }
195
196 Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200 if dimension == 0 || dimension > MAX_DIMENSION {
201 return Err(format!(
202 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203 ));
204 }
205
206 Ok(())
207}
208
209fn normalize_base_url(raw: &str) -> Result<String, String> {
213 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214 let scheme = parsed.scheme();
215 if scheme != "http" && scheme != "https" {
216 return Err(format!(
217 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218 scheme
219 ));
220 }
221 Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239 use std::net::{IpAddr, ToSocketAddrs};
240
241 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243 let host = parsed.host_str().unwrap_or("");
244
245 let is_loopback_host =
250 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251 if is_loopback_host {
252 return Ok(());
253 }
254
255 if host.ends_with(".local") {
258 return Err(format!(
259 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260 ));
261 }
262
263 let port = parsed.port_or_known_default().unwrap_or(443);
266 let addr_str = format!("{host}:{port}");
267 let addrs: Vec<IpAddr> = addr_str
268 .to_socket_addrs()
269 .map(|iter| iter.map(|sa| sa.ip()).collect())
270 .unwrap_or_default();
271 for ip in &addrs {
272 if is_private_non_loopback_ip(ip) {
273 return Err(format!(
274 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275 ));
276 }
277 }
278
279 Ok(())
280}
281
/// Returns true when `ip` lies in a private or reserved range that is not
/// loopback.
///
/// IPv4 ranges treated as private: `10.0.0.0/8`, `172.16.0.0/12`,
/// `192.168.0.0/16`, `169.254.0.0/16` (link-local), `100.64.0.0/10`
/// (shared address space) and `0.0.0.0/8`.
/// IPv6 ranges treated as private: `fe80::/10` (link-local), `fc00::/7`
/// (unique local), plus IPv4-mapped addresses (`::ffff:a.b.c.d`) whose
/// embedded IPv4 address is private by the rules above.
/// Loopback addresses (`127.0.0.0/8`, `::1`) are deliberately not flagged.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            first == 10
                || (first == 172 && (16..=31).contains(&second))
                || (first == 192 && second == 168)
                || (first == 169 && second == 254)
                || (first == 100 && (64..=127).contains(&second))
                || first == 0
        }
        IpAddr::V6(v6) => {
            let leading_segment = v6.segments()[0];
            (leading_segment & 0xffc0) == 0xfe80
                || (leading_segment & 0xfe00) == 0xfc00
                // `to_ipv4_mapped` is Some only for `::ffff:a.b.c.d`, which is
                // exactly the shape the previous manual segment check decoded.
                || v6
                    .to_ipv4_mapped()
                    .is_some_and(|mapped| is_private_non_loopback_ip(&IpAddr::V4(mapped)))
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325 if base_url.ends_with("/v1") {
326 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327 } else {
328 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329 }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333 if base_url.ends_with("/api") {
334 format!("{base_url}/embed")
335 } else {
336 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337 }
338}
339
/// Trims surrounding whitespace from an optional value and maps a missing or
/// blank value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|token| !token.is_empty())
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
355fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
361 if !matches!(
362 status,
363 reqwest::StatusCode::BAD_REQUEST
364 | reqwest::StatusCode::CONFLICT
365 | reqwest::StatusCode::REQUEST_TIMEOUT
366 | reqwest::StatusCode::LOCKED
367 | reqwest::StatusCode::TOO_EARLY
368 ) {
369 return false;
370 }
371
372 let lower = raw.to_ascii_lowercase();
373 let normalized = lower.trim();
374
375 normalized.contains("model was unloaded while the request was still in queue")
376 || normalized == "model is loading"
377 || normalized.starts_with("model is loading,")
378 || normalized.contains(r#""error":"model is loading"#)
379 || normalized.contains(r#""message":"model is loading"#)
380 || normalized == "model not loaded"
381 || normalized.contains(r#""error":"model not loaded""#)
382 || normalized.contains(r#""message":"model not loaded""#)
383 || normalized == "loading model into memory"
384 || normalized.contains(r#""error":"loading model into memory""#)
385 || normalized.contains(r#""message":"loading model into memory""#)
386 || normalized == "model is being loaded"
387 || normalized.contains(r#""error":"model is being loaded""#)
388 || normalized.contains(r#""message":"model is being loaded""#)
389 || normalized == "model is currently loading"
390 || normalized.contains(r#""error":"model is currently loading""#)
391 || normalized.contains(r#""message":"model is currently loading""#)
392}
393
/// True when a send failure should be retried inside `send_embedding_request`:
/// only connection errors qualify (timeouts are not retried here).
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
397
/// True when a send failure should be reported with the transient marker:
/// connection errors and timeouts.
fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
    error.is_connect() || error.is_timeout()
}
406
/// True when a failure while reading the response body counts as transient:
/// anything `embedding_send_error_is_transient` accepts, plus body and decode
/// errors.
fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
}
410
/// Prefix that `send_embedding_request` puts on error messages for failures it
/// classifies as transient. Detect it with `embedding_failure_is_transient`
/// and remove it with `strip_transient_embedding_marker`.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
418
/// True when `error` contains the transient marker anywhere in the message
/// (not only as a prefix, so wrapped messages are still recognized).
pub fn embedding_failure_is_transient(error: &str) -> bool {
    error.contains(TRANSIENT_EMBEDDING_MARKER)
}
424
/// Returns `error` with every occurrence of the transient marker removed.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.replace(TRANSIENT_EMBEDDING_MARKER, "")
}
429
430fn sleep_before_embedding_retry(attempt_index: usize) {
431 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
432 std::thread::sleep(Duration::from_millis(*delay_ms));
433 }
434}
435
/// Sends an embedding HTTP request with bounded retries and returns the
/// response body of the first successful (2xx) response.
///
/// `make_request` is called once per attempt to build a fresh request;
/// `backend_label` prefixes error messages. Up to
/// `EMBEDDING_REQUEST_MAX_ATTEMPTS` attempts are made, sleeping via
/// `sleep_before_embedding_retry` between them.
///
/// # Errors
/// Returns a message for a send failure, a body read failure, or a non-2xx
/// status. Messages for failures classified as transient start with
/// `TRANSIENT_EMBEDDING_MARKER`.
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                // Only connection errors are retried here ...
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                // ... but timeouts are still reported as transient.
                let marker = if embedding_send_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!("{marker}{backend_label} request failed: {error}"));
            }
        };

        // Capture the status first: `text()` consumes the response.
        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && embedding_response_read_error_is_transient(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                let marker = if embedding_response_read_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!(
                    "{marker}{backend_label} response read failed: {error}"
                ));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        // Non-2xx: retry on retryable statuses or on "model is loading"-style bodies.
        let body_transient = embedding_response_body_is_transient(status, &raw);
        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        let marker = if is_retryable_embedding_status(status) || body_transient {
            TRANSIENT_EMBEDDING_MARKER
        } else {
            ""
        };
        return Err(format!(
            "{marker}{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    // Every path through the final attempt returns, so the loop cannot fall through.
    unreachable!("embedding request retries exhausted without returning")
}
512
513impl SemanticEmbeddingModel {
514 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
515 let timeout_ms = if config.timeout_ms == 0 {
516 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
517 } else {
518 config.timeout_ms
519 };
520
521 let max_batch_size = if config.max_batch_size == 0 {
522 DEFAULT_MAX_BATCH_SIZE
523 } else {
524 config.max_batch_size
525 };
526
527 let api_key_env = normalize_api_key(config.api_key_env.clone());
528 let model = config.model.clone();
529
530 let client = Client::builder()
531 .timeout(Duration::from_millis(timeout_ms))
532 .redirect(reqwest::redirect::Policy::none())
533 .build()
534 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
535
536 let engine = match config.backend {
537 SemanticBackend::Fastembed => {
538 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
539 }
540 SemanticBackend::OpenAiCompatible => {
541 let raw = config.base_url.as_ref().ok_or_else(|| {
542 "base_url is required for openai_compatible backend".to_string()
543 })?;
544 let base_url = normalize_base_url(raw)?;
545
546 let api_key = match api_key_env {
547 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
548 format!("missing api_key_env '{var_name}' for openai_compatible backend")
549 })?),
550 None => None,
551 };
552
553 SemanticEmbeddingEngine::OpenAiCompatible {
554 client,
555 model,
556 base_url,
557 api_key,
558 }
559 }
560 SemanticBackend::Ollama => {
561 let raw = config
562 .base_url
563 .as_ref()
564 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
565 let base_url = normalize_base_url(raw)?;
566
567 SemanticEmbeddingEngine::Ollama {
568 client,
569 model,
570 base_url,
571 }
572 }
573 };
574
575 Ok(Self {
576 backend: config.backend,
577 model: config.model.clone(),
578 base_url: config.base_url.clone(),
579 timeout_ms,
580 max_batch_size,
581 dimension: None,
582 engine,
583 query_embedding_cache: HashMap::new(),
584 query_embedding_cache_order: VecDeque::new(),
585 query_embedding_cache_hits: 0,
586 query_embedding_cache_misses: 0,
587 })
588 }
589
    /// Backend this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
593
    /// Configured model name.
    pub fn model(&self) -> &str {
        &self.model
    }
597
    /// Base URL exactly as configured (not normalized), if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
601
    /// Effective maximum batch size (the default when the config value was 0).
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
605
    /// Effective HTTP timeout in milliseconds (the default when the config value was 0).
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
609
610 pub fn fingerprint(
611 &mut self,
612 config: &SemanticBackendConfig,
613 ) -> Result<SemanticIndexFingerprint, String> {
614 let dimension = self.dimension()?;
615 Ok(SemanticIndexFingerprint::from_config(config, dimension))
616 }
617
618 pub fn dimension(&mut self) -> Result<usize, String> {
619 if let Some(dimension) = self.dimension {
620 return Ok(dimension);
621 }
622
623 let dimension = match &mut self.engine {
624 SemanticEmbeddingEngine::Local(model) => {
625 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
626 vectors
627 .first()
628 .map(|v| v.len())
629 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
630 }
631 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
632 let vectors =
633 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
634 vectors
635 .first()
636 .map(|v| v.len())
637 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
638 }
639 SemanticEmbeddingEngine::Ollama { .. } => {
640 let vectors =
641 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
642 vectors
643 .first()
644 .map(|v| v.len())
645 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
646 }
647 };
648
649 self.dimension = Some(dimension);
650 Ok(dimension)
651 }
652
    /// Embeds `texts` with the configured backend; public entry point that
    /// forwards to `embed_texts`.
    ///
    /// # Errors
    /// Returns the backend's error message on failure.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
656
657 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
658 if let Some(vector) = self.query_embedding_cache.get(query) {
659 self.query_embedding_cache_hits += 1;
660 return Ok(vector.clone());
661 }
662
663 self.query_embedding_cache_misses += 1;
664 let embeddings = self.embed_texts(vec![query.to_string()])?;
665 let vector = embeddings
666 .first()
667 .cloned()
668 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
669
670 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
671 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
672 self.query_embedding_cache.remove(&oldest);
673 }
674 }
675 self.query_embedding_cache
676 .insert(query.to_string(), vector.clone());
677 self.query_embedding_cache_order
678 .push_back(query.to_string());
679
680 Ok(vector)
681 }
682
683 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
684 (
685 self.query_embedding_cache_hits,
686 self.query_embedding_cache_misses,
687 self.query_embedding_cache.len(),
688 )
689 }
690
    /// Embeds `texts` with the configured engine and returns one vector per
    /// input, in input order.
    ///
    /// For the remote backends the response is checked for the expected number
    /// of non-empty vectors, and the cached dimension is updated from the
    /// first returned vector.
    ///
    /// # Errors
    /// Returns a message when the local embedder fails, the HTTP request fails
    /// (see `send_embedding_request`), the response is not valid JSON of the
    /// expected shape, or the returned vectors are missing, empty, or miscounted.
    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        match &mut self.engine {
            SemanticEmbeddingEngine::Local(model) => model
                .embed(&texts)
                .map_err(|error| format!("failed to embed batch: {error}")),
            SemanticEmbeddingEngine::OpenAiCompatible {
                client,
                model,
                base_url,
                api_key,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_openai_embeddings_endpoint(base_url);
                let body = serde_json::json!({
                    "input": texts,
                    "model": model,
                });

                // The closure rebuilds the request for every retry attempt.
                let raw = send_embedding_request(
                    || {
                        let mut request = client.post(&endpoint).json(&body);

                        if let Some(api_key) = api_key {
                            request = request.header("Authorization", format!("Bearer {api_key}"));
                        }

                        request
                    },
                    "openai compatible",
                )?;

                #[derive(Deserialize)]
                struct OpenAiResponse {
                    data: Vec<OpenAiEmbeddingResult>,
                }

                #[derive(Deserialize)]
                struct OpenAiEmbeddingResult {
                    embedding: Vec<f32>,
                    index: Option<u32>,
                }

                let parsed: OpenAiResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
                if parsed.data.len() != expected_text_count {
                    return Err(format!(
                        "openai compatible response returned {} embeddings for {} inputs",
                        parsed.data.len(),
                        expected_text_count
                    ));
                }

                // Place each embedding at its declared `index` (falling back to
                // its position in the response) so output order matches input order.
                let mut vectors = vec![Vec::new(); parsed.data.len()];
                for (i, item) in parsed.data.into_iter().enumerate() {
                    let index = item.index.unwrap_or(i as u32) as usize;
                    if index >= vectors.len() {
                        return Err(
                            "openai compatible response contains invalid vector index".to_string()
                        );
                    }
                    vectors[index] = item.embedding;
                }

                // A slot left empty means an index was duplicated or the server
                // returned an empty embedding.
                for vector in &vectors {
                    if vector.is_empty() {
                        return Err(
                            "openai compatible response contained missing vectors".to_string()
                        );
                    }
                }

                // NOTE(review): for an empty `texts` with an empty `data` array this
                // resets the cached dimension to `None` — confirm that is intended.
                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
            SemanticEmbeddingEngine::Ollama {
                client,
                model,
                base_url,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_ollama_embeddings_endpoint(base_url);

                #[derive(Serialize)]
                struct OllamaPayload<'a> {
                    model: &'a str,
                    input: Vec<String>,
                }

                let payload = OllamaPayload {
                    model,
                    input: texts,
                };

                // The closure rebuilds the request for every retry attempt.
                let raw = send_embedding_request(
                    || {
                        client.post(&endpoint).json(&payload)
                    },
                    "ollama",
                )?;

                #[derive(Deserialize)]
                struct OllamaResponse {
                    embeddings: Vec<Vec<f32>>,
                }

                let parsed: OllamaResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid ollama response: {error}"))?;
                // An empty response is an error here even when `texts` was empty.
                if parsed.embeddings.is_empty() {
                    return Err("ollama response returned no embeddings".to_string());
                }
                if parsed.embeddings.len() != expected_text_count {
                    return Err(format!(
                        "ollama response returned {} embeddings for {} inputs",
                        parsed.embeddings.len(),
                        expected_text_count
                    ));
                }

                let vectors = parsed.embeddings;
                for vector in &vectors {
                    if vector.is_empty() {
                        return Err("ollama response contained empty embeddings".to_string());
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
        }
    }
834}
835
/// Checks up front that the ONNX Runtime shared library can be loaded and, when
/// its version can be determined, that it is 1.x with a minor version of at
/// least 20.
///
/// The library path is taken from `ORT_DYLIB_PATH` when set, otherwise the
/// platform's default library name is used. On Linux/macOS the version is
/// inferred from library file names; on Windows it is read from the DLL's
/// version resource. If no version can be determined the check passes. On
/// other platforms this function always returns `Ok(())`.
///
/// # Errors
/// Returns a user-facing message when the library cannot be loaded or the
/// detected version is unsupported.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY (as far as visible here): `c_name` is a valid NUL-terminated
        // string that outlives the `dlopen` call; `dlerror`'s result is only
        // read when non-null; `handle` is used only between a successful
        // `dlopen` and the matching `dlclose`.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Resolve the version while the library is still loaded.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // An unparsable major/minor pair is treated as "unknown" and passes.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 `VS_FIXEDFILEINFO` structure; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32,
            dw_file_version_ls: u32,
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY (as far as visible here): `wide` is NUL-terminated and outlives
        // the load call; `handle` is used only between a successful load and
        // `FreeLibrary`; `vs_info` is dereferenced only after `VerQueryValueW`
        // succeeded and returned a non-null pointer into `info`, which is
        // still alive at that point.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 means "could not be determined".
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" selects the root block, i.e. the fixed file info.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High word = major, low word = minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1026
/// Returns the file path of the library behind `handle` by looking up the
/// `OrtGetApiBase` symbol in it and asking `dladdr` which object contains that
/// symbol. Returns `None` if the symbol is missing, `dladdr` fails, or no file
/// name is reported.
///
/// # Safety
/// `handle` is passed straight to `dlsym`, so it must be a handle obtained
/// from `dlopen` that has not been closed yet.
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    // `dladdr` returns 0 on failure, in which case `info` must not be read.
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    // `dladdr` succeeded above, so `info` has been filled in.
    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    Some(
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1051
1052#[cfg(any(target_os = "linux", target_os = "macos"))]
1053fn detect_ort_version_from_resolved_or_requested(
1054 resolved_path: Option<String>,
1055 requested_lib_name: &str,
1056) -> (Option<String>, String) {
1057 if let Some(path) = resolved_path {
1058 if let Some(version) = detect_ort_version_from_path(&path) {
1059 return (Some(version), path);
1060 }
1061 return (detect_ort_version_from_path(requested_lib_name), path);
1062 }
1063
1064 (
1065 detect_ort_version_from_path(requested_lib_name),
1066 requested_lib_name.to_string(),
1067 )
1068}
1069
/// Detects the ONNX Runtime version for an already-loaded library: resolves
/// the on-disk path behind `handle`, then delegates to
/// `detect_ort_version_from_resolved_or_requested`.
// NOTE(review): this is a safe fn, but `handle` is forwarded to an unsafe
// helper that requires a live `dlopen` handle — callers must uphold that.
#[cfg(any(target_os = "linux", target_os = "macos"))]
fn detect_ort_version_from_loaded_library(
    handle: *mut std::ffi::c_void,
    requested_lib_name: &str,
) -> (Option<String>, String) {
    detect_ort_version_from_resolved_or_requested(
        unsafe { loaded_library_path_from_handle(handle) },
        requested_lib_name,
    )
}
1080
1081#[cfg(any(target_os = "linux", target_os = "macos"))]
1084fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1085 let path = std::path::Path::new(lib_path);
1086
1087 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1089 .into_iter()
1090 .flatten()
1091 {
1092 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1093 if let Some(version) = extract_version_from_filename(name) {
1094 return Some(version);
1095 }
1096 }
1097 }
1098
1099 if let Some(parent) = path.parent() {
1101 if let Ok(entries) = std::fs::read_dir(parent) {
1102 for entry in entries.flatten() {
1103 if let Some(name) = entry.file_name().to_str() {
1104 if name.starts_with("libonnxruntime") {
1105 if let Some(version) = extract_version_from_filename(name) {
1106 return Some(version);
1107 }
1108 }
1109 }
1110 }
1111 }
1112 }
1113
1114 None
1115}
1116
1117#[cfg(any(target_os = "linux", target_os = "macos"))]
1119fn extract_version_from_filename(name: &str) -> Option<String> {
1120 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1122 re.find(name).map(|m| m.as_str().to_string())
1123}
1124
/// Suggests a shell command for removing the ONNX Runtime library at
/// `lib_path`.
///
/// For a library under `/usr/local/lib`, or a bare default library name, Linux
/// and macOS get a `sudo rm` of the whole `libonnxruntime*` family (plus
/// `ldconfig` on Linux). Everything else gets a plain `rm` of the given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_wide_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
1137
1138pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1144 format!(
1145 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1146 Solutions:\n\
1147 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1148 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1149 configures the bridge to load it instead of the system library — no \
1150 changes to '{}'.\n\
1151 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1152 {}\n\
1153 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1154 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1155 version,
1156 lib_name,
1157 lib_name,
1158 suggest_removal_command(lib_name),
1159 )
1160}
1161
/// Heuristically decides whether an error message means the ONNX Runtime
/// shared library could not be found or loaded.
///
/// A message starting (after leading whitespace) with
/// `"ONNX Runtime not found."` always qualifies. Otherwise the message must,
/// case-insensitively, mention ONNX Runtime and also contain a phrase typical
/// of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1187
1188pub fn format_embedding_init_error(error: impl Display) -> String {
1189 let message = error.to_string();
1190
1191 if is_onnx_runtime_unavailable(&message) {
1192 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1193 }
1194
1195 format!("failed to initialize semantic embedding model: {message}")
1196}
1197
/// One indexable unit of source code together with the text used to embed it.
// NOTE(review): field meanings below are inferred from names and types; the
// code that builds chunks is outside this view — confirm against it.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to.
    pub file: PathBuf,
    /// Name of the chunk (presumably the symbol name).
    pub name: String,
    /// Kind of symbol the chunk represents.
    pub kind: SymbolKind,
    /// First line of the chunk (line-number base not visible here).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text handed to the embedding model for this chunk.
    pub embed_text: String,
    /// Source excerpt for display in results.
    pub snippet: String,
}
1217
/// A chunk paired with the embedding vector produced for its `embed_text`.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    // The chunk the vector was computed for.
    chunk: SemanticChunk,
    // Embedding returned by the embedding function for this chunk.
    vector: Vec<f32>,
}
1224
/// In-memory semantic index: embedded chunks plus the per-file freshness data
/// used to decide which files need re-embedding.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    // Every embedded chunk currently held by the index.
    entries: Vec<EmbeddingEntry>,
    // Modification time recorded per indexed file; the key set of this map is
    // what the index treats as "indexed files".
    file_mtimes: HashMap<PathBuf, SystemTime>,
    // File size recorded per indexed file (combined with mtime and hash into a
    // `FileFreshness` when checking staleness).
    file_sizes: HashMap<PathBuf, u64>,
    // Content hash recorded per indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    // Length of the stored vectors; `search` rejects queries of any other length.
    dimension: usize,
    // Optional fingerprint; compared with the expected one when loading from disk.
    fingerprint: Option<SemanticIndexFingerprint>,
    // Root used to turn file paths into cache-relative paths when serializing.
    project_root: PathBuf,
    // New files postponed by `refresh_invalidated_files` because the file cap
    // was reached; they are re-requested on the next invalidated-file refresh.
    deferred_files: HashSet<PathBuf>,
}
1240
/// Freshness data captured for one file while its chunks are collected; later
/// split across the index's `file_mtimes`, `file_sizes` and `file_hashes` maps.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    // Modification time stored into `file_mtimes`.
    mtime: SystemTime,
    // Size in bytes stored into `file_sizes`.
    size: u64,
    // Content hash stored into `file_hashes`.
    content_hash: blake3::Hash,
}
1247
/// Counters describing the outcome of one refresh pass over the index.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-collected successfully.
    pub changed: usize,
    /// Files that were not indexed before and were collected successfully.
    pub added: usize,
    /// Indexed files that were dropped because they no longer exist.
    pub deleted: usize,
    /// Number of paths the pass looked at, whether or not they changed.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when the pass changed, added and deleted nothing.
    ///
    /// `total_processed` is deliberately ignored: a pass may inspect many
    /// files and still be a no-op.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1264
/// Result of `SemanticIndex::refresh_invalidated_files`.
///
/// The first three fields have the same shapes as the arguments of
/// `SemanticIndex::apply_refresh_update` (presumably so the outcome can be
/// replayed onto another copy of the index — confirm with callers).
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Copies of the entries that were embedded and appended to the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness data recorded for every file that was collected successfully.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh considered, including previously deferred ones.
    pub completed_paths: Vec<PathBuf>,
    /// Counters describing what the refresh did.
    pub summary: RefreshSummary,
}
1272
/// One ranked hit returned by `SemanticIndex::search`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File of the matching chunk.
    pub file: PathBuf,
    /// Name of the matching chunk.
    pub name: String,
    /// Symbol kind of the matching chunk.
    pub kind: SymbolKind,
    /// First line of the matching chunk.
    pub start_line: u32,
    /// Last line of the matching chunk.
    pub end_line: u32,
    /// Whether the matching chunk is flagged as exported.
    pub exported: bool,
    /// Snippet text stored with the chunk.
    pub snippet: String,
    /// Similarity score used for ranking; already includes the 1.1 multiplier
    /// that `SemanticIndex::search` applies to exported chunks.
    pub score: f32,
    /// Label of the producer; `SemanticIndex::search` sets it to `"semantic"`.
    pub source: &'static str,
}
1286
1287impl SemanticIndex {
1288 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1289 debug_assert!(project_root.is_absolute());
1290 Self {
1291 entries: Vec::new(),
1292 file_mtimes: HashMap::new(),
1293 file_sizes: HashMap::new(),
1294 file_hashes: HashMap::new(),
1295 dimension,
1296 fingerprint: None,
1297 project_root,
1298 deferred_files: HashSet::new(),
1299 }
1300 }
1301
1302 pub fn entry_count(&self) -> usize {
1304 self.entries.len()
1305 }
1306
    /// Number of files the index tracks, i.e. the number of paths that have a
    /// recorded modification time (a file may be tracked with zero entries).
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1311
1312 pub fn status_label(&self) -> &'static str {
1314 if self.entries.is_empty() {
1315 "empty"
1316 } else {
1317 "ready"
1318 }
1319 }
1320
1321 fn collect_chunks(
1322 project_root: &Path,
1323 files: &[PathBuf],
1324 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1325 let collect_started = std::time::Instant::now();
1326 let per_file: Vec<(
1327 PathBuf,
1328 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1329 )> = files
1330 .par_iter()
1331 .map_init(HashMap::new, |parsers, file| {
1332 let result = collect_file_metadata(file).and_then(|metadata| {
1333 collect_file_chunks(project_root, file, parsers)
1334 .map(|chunks| (metadata, chunks))
1335 });
1336 (file.clone(), result)
1337 })
1338 .collect();
1339
1340 let mut chunks: Vec<SemanticChunk> = Vec::new();
1341 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1342
1343 for (file, result) in per_file {
1344 match result {
1345 Ok((metadata, file_chunks)) => {
1346 file_metadata.insert(file, metadata);
1347 chunks.extend(file_chunks);
1348 }
1349 Err(error) => {
1350 if error == "unsupported file extension" {
1356 continue;
1357 }
1358 slog_warn!(
1359 "failed to collect semantic chunks for {}: {}",
1360 file.display(),
1361 error
1362 );
1363 }
1364 }
1365 }
1366
1367 slog_info!(
1368 "semantic collect: {} chunks from {} files in {} ms",
1369 chunks.len(),
1370 file_metadata.len(),
1371 collect_started.elapsed().as_millis()
1372 );
1373
1374 (chunks, file_metadata)
1375 }
1376
1377 fn build_from_chunks<F, P>(
1378 project_root: &Path,
1379 chunks: Vec<SemanticChunk>,
1380 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1381 embed_fn: &mut F,
1382 max_batch_size: usize,
1383 mut progress: Option<&mut P>,
1384 ) -> Result<Self, String>
1385 where
1386 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1387 P: FnMut(usize, usize),
1388 {
1389 debug_assert!(project_root.is_absolute());
1390 let total_chunks = chunks.len();
1391
1392 if chunks.is_empty() {
1393 return Ok(Self {
1394 entries: Vec::new(),
1395 file_mtimes: file_metadata
1396 .iter()
1397 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1398 .collect(),
1399 file_sizes: file_metadata
1400 .iter()
1401 .map(|(path, metadata)| (path.clone(), metadata.size))
1402 .collect(),
1403 file_hashes: file_metadata
1404 .into_iter()
1405 .map(|(path, metadata)| (path, metadata.content_hash))
1406 .collect(),
1407 dimension: DEFAULT_DIMENSION,
1408 fingerprint: None,
1409 project_root: project_root.to_path_buf(),
1410 deferred_files: HashSet::new(),
1411 });
1412 }
1413
1414 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1416 let mut expected_dimension: Option<usize> = None;
1417 let batch_size = max_batch_size.max(1);
1418 let embed_started = std::time::Instant::now();
1419 let batch_count = total_chunks.div_ceil(batch_size);
1420 for batch_start in (0..chunks.len()).step_by(batch_size) {
1421 let batch_end = (batch_start + batch_size).min(chunks.len());
1422 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1423 .iter()
1424 .map(|c| c.embed_text.clone())
1425 .collect();
1426
1427 let vectors = embed_fn(batch_texts)?;
1428 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1429
1430 if let Some(dim) = vectors.first().map(|v| v.len()) {
1432 match expected_dimension {
1433 None => expected_dimension = Some(dim),
1434 Some(expected) if dim != expected => {
1435 return Err(format!(
1436 "embedding dimension changed across batches: expected {expected}, got {dim}"
1437 ));
1438 }
1439 _ => {}
1440 }
1441 }
1442
1443 for (i, vector) in vectors.into_iter().enumerate() {
1444 let chunk_idx = batch_start + i;
1445 entries.push(EmbeddingEntry {
1446 chunk: chunks[chunk_idx].clone(),
1447 vector,
1448 });
1449 }
1450
1451 if let Some(callback) = progress.as_mut() {
1452 callback(entries.len(), total_chunks);
1453 }
1454 }
1455
1456 let embed_ms = embed_started.elapsed().as_millis();
1457 let rate = (total_chunks as u128 * 1000)
1458 .checked_div(embed_ms)
1459 .unwrap_or(0) as u64;
1460 slog_info!(
1461 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1462 total_chunks,
1463 batch_count,
1464 embed_ms,
1465 rate
1466 );
1467
1468 let dimension = entries
1469 .first()
1470 .map(|e| e.vector.len())
1471 .unwrap_or(DEFAULT_DIMENSION);
1472
1473 Ok(Self {
1474 entries,
1475 file_mtimes: file_metadata
1476 .iter()
1477 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1478 .collect(),
1479 file_sizes: file_metadata
1480 .iter()
1481 .map(|(path, metadata)| (path.clone(), metadata.size))
1482 .collect(),
1483 file_hashes: file_metadata
1484 .into_iter()
1485 .map(|(path, metadata)| (path, metadata.content_hash))
1486 .collect(),
1487 dimension,
1488 fingerprint: None,
1489 project_root: project_root.to_path_buf(),
1490 deferred_files: HashSet::new(),
1491 })
1492 }
1493
1494 pub fn build<F>(
1497 project_root: &Path,
1498 files: &[PathBuf],
1499 embed_fn: &mut F,
1500 max_batch_size: usize,
1501 ) -> Result<Self, String>
1502 where
1503 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1504 {
1505 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1506 Self::build_from_chunks(
1507 project_root,
1508 chunks,
1509 file_mtimes,
1510 embed_fn,
1511 max_batch_size,
1512 Option::<&mut fn(usize, usize)>::None,
1513 )
1514 }
1515
1516 pub fn build_with_progress<F, P>(
1518 project_root: &Path,
1519 files: &[PathBuf],
1520 embed_fn: &mut F,
1521 max_batch_size: usize,
1522 progress: &mut P,
1523 ) -> Result<Self, String>
1524 where
1525 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1526 P: FnMut(usize, usize),
1527 {
1528 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1529 let total_chunks = chunks.len();
1530 progress(0, total_chunks);
1531 Self::build_from_chunks(
1532 project_root,
1533 chunks,
1534 file_mtimes,
1535 embed_fn,
1536 max_batch_size,
1537 Some(progress),
1538 )
1539 }
1540
    /// Brings the index in line with `current_files` by re-embedding only what
    /// changed.
    ///
    /// Indexed files missing from `current_files` are dropped, indexed files
    /// whose freshness check fails are re-collected, and files in
    /// `current_files` that are not indexed yet are added. A file whose content
    /// is unchanged but whose mtime/size differ only gets its stored mtime and
    /// size updated.
    ///
    /// `progress` is called with `(0, 0)` when nothing needs embedding, with
    /// `(0, total_chunks)` before embedding starts, and after each batch with
    /// `(entries embedded so far, total_chunks)`.
    ///
    /// The returned summary counts only files whose collection succeeded as
    /// `changed`/`added`; `total_processed` is the size of the union of
    /// `current_files` and the previously indexed files.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when a new vector's length differs from the dimension of the
    /// existing entries. Deleted and vanished files have already been removed
    /// from the index when such an error is returned; no new entries are added.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Deferred files that are no longer part of the project are forgotten.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // |current ∪ indexed| = |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every indexed file as deleted, changed, or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A file with incomplete freshness data cannot be verified and is
            // treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached
                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
            {
                Some(FreshnessVerdict::HotFresh) => {}
                // Same content, different mtime/size: refresh the stored values
                // without re-embedding.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Anything in the current file list that is not indexed yet is new.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        // Only deletions happened: nothing to collect or embed.
        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        // Changed files that could not be collected and no longer exist on disk
        // are treated as deletions.
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        // Files were collected but yielded no chunks: drop their old entries and
        // record the new freshness data without calling the embedder.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // With existing entries the new vectors must match the index dimension;
        // an empty index adopts whatever the first batch returns.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // Only the first vector of each batch is checked here.
            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // The index is only mutated after every batch succeeded: old entries of
        // the re-collected files are replaced by the new ones.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1782
    /// Re-indexes an explicit list of invalidated `paths`, plus any files
    /// deferred by an earlier call.
    ///
    /// All requested paths are first removed from the index. Paths that still
    /// exist are then re-collected and re-embedded; paths that no longer exist
    /// count as `deleted` when they were indexed before. `max_files` caps the
    /// number of indexed files: files that were not indexed before and would
    /// exceed the cap are remembered in `deferred_files` and skipped this time.
    ///
    /// `progress` is called with `(0, 0)` when nothing needs embedding, with
    /// `(0, total_chunks)` before embedding starts, and after each batch with
    /// `(entries embedded so far, total_chunks)`.
    ///
    /// The result carries copies of the new entries, the recorded freshness
    /// data, the full list of considered paths, and the summary counters.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when a new vector's length differs from the index dimension. The
    /// requested paths have already been removed from the index when such an
    /// error is returned.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Retry previously deferred files that still exist alongside the
        // explicitly requested paths; sort + dedup removes duplicates.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Remember which paths were indexed before they are removed, so that
        // "changed" can be told apart from "added" afterwards.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        // Every requested path is gone from disk: only deletions to report.
        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the indexed-file cap: files retained in the index and
        // re-collected files that were indexed before always keep their slot;
        // only the remaining budget is available to files that are new.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            // Drop the deferred files from this pass entirely.
            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        // Nothing to embed: just record the freshness data of the collected files.
        if chunks.is_empty() {
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // The stored dimension is enforced unless the index is empty and none of
        // the requested paths had been indexed before.
        let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|chunk| chunk.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // Only the first vector of each batch is checked here.
            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during invalidated-file refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // The new entries go both into this index and, as a copy, into the result.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
2020
2021 pub fn apply_refresh_update(
2022 &mut self,
2023 added_entries: Vec<EmbeddingEntry>,
2024 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2025 completed_paths: &[PathBuf],
2026 ) {
2027 self.remove_indexed_files(completed_paths);
2028
2029 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2030 self.entries.extend(added_entries);
2031 for (file, freshness) in updated_metadata {
2032 self.file_mtimes.insert(file.clone(), freshness.mtime);
2033 self.file_sizes.insert(file.clone(), freshness.size);
2034 self.file_hashes.insert(file, freshness.content_hash);
2035 }
2036 if let Some(dim) = observed_dimension {
2037 self.dimension = dim;
2038 }
2039 }
2040
2041 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2042 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2043 self.entries
2044 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2045 for path in files {
2046 self.file_mtimes.remove(path);
2047 self.file_sizes.remove(path);
2048 self.file_hashes.remove(path);
2049 }
2050 }
2051
2052 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2054 if self.entries.is_empty() || query_vector.len() != self.dimension {
2055 return Vec::new();
2056 }
2057
2058 let mut scored: Vec<(f32, usize)> = self
2059 .entries
2060 .iter()
2061 .enumerate()
2062 .map(|(i, entry)| {
2063 let mut score = cosine_similarity(query_vector, &entry.vector);
2064 if entry.chunk.exported {
2065 score *= 1.1;
2066 }
2067 (score, i)
2068 })
2069 .collect();
2070
2071 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
2073
2074 scored
2075 .into_iter()
2076 .take(top_k)
2077 .map(|(score, idx)| {
2081 let entry = &self.entries[idx];
2082 SemanticResult {
2083 file: entry.chunk.file.clone(),
2084 name: entry.chunk.name.clone(),
2085 kind: entry.chunk.kind.clone(),
2086 start_line: entry.chunk.start_line,
2087 end_line: entry.chunk.end_line,
2088 exported: entry.chunk.exported,
2089 snippet: entry.chunk.snippet.clone(),
2090 score,
2091 source: "semantic",
2092 }
2093 })
2094 .collect()
2095 }
2096
    /// Number of embedded chunks held by the index.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
2101
2102 pub fn is_file_stale(&self, file: &Path) -> bool {
2104 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2105 return true;
2106 };
2107 let Some(stored_size) = self.file_sizes.get(file) else {
2108 return true;
2109 };
2110 let Some(stored_hash) = self.file_hashes.get(file) else {
2111 return true;
2112 };
2113 let cached = FileFreshness {
2114 mtime: *stored_mtime,
2115 size: *stored_size,
2116 content_hash: *stored_hash,
2117 };
2118 match cache_freshness::verify_file_strict(file, &cached) {
2119 FreshnessVerdict::HotFresh => false,
2120 FreshnessVerdict::ContentFresh { .. } => false,
2121 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2122 }
2123 }
2124
2125 fn backfill_missing_file_sizes(&mut self) {
2126 for path in self.file_mtimes.keys() {
2127 if self.file_sizes.contains_key(path) {
2128 continue;
2129 }
2130 if let Ok(metadata) = fs::metadata(path) {
2131 self.file_sizes.insert(path.clone(), metadata.len());
2132 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2133 self.file_hashes.insert(path.clone(), hash);
2134 }
2135 }
2136 }
2137 }
2138
    /// Removes `file` from the index; this simply forwards to
    /// [`Self::invalidate_file`].
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
2143
2144 pub fn invalidate_file(&mut self, file: &Path) {
2145 let canonical_file = canonicalize_existing_or_deleted_path(file);
2146 self.entries
2147 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2148 self.file_mtimes.remove(file);
2149 self.file_sizes.remove(file);
2150 self.file_hashes.remove(file);
2151 if canonical_file.as_path() != file {
2152 self.file_mtimes.remove(&canonical_file);
2153 self.file_sizes.remove(&canonical_file);
2154 self.file_hashes.remove(&canonical_file);
2155 }
2156 }
2157
    /// Vector length the index currently expects; `search` returns nothing for
    /// query vectors of any other length.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2162
    /// Fingerprint attached to the index, or `None` when none has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2166
2167 pub fn backend_label(&self) -> Option<&str> {
2168 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2169 }
2170
2171 pub fn model_label(&self) -> Option<&str> {
2172 self.fingerprint.as_ref().map(|f| f.model.as_str())
2173 }
2174
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2178
2179 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2181 if self.entries.is_empty() {
2184 slog_info!("skipping semantic index persistence (0 entries)");
2185 return;
2186 }
2187 let dir = storage_dir.join("semantic").join(project_key);
2188 if let Err(e) = fs::create_dir_all(&dir) {
2189 slog_warn!("failed to create semantic cache dir: {}", e);
2190 return;
2191 }
2192 let data_path = dir.join("semantic.bin");
2193 let tmp_path = dir.join(format!(
2194 "semantic.bin.tmp.{}.{}",
2195 std::process::id(),
2196 SystemTime::now()
2197 .duration_since(SystemTime::UNIX_EPOCH)
2198 .unwrap_or(Duration::ZERO)
2199 .as_nanos()
2200 ));
2201 let bytes = self.to_bytes();
2202 let write_result = (|| -> std::io::Result<()> {
2203 use std::io::Write;
2204 let mut file = fs::File::create(&tmp_path)?;
2205 file.write_all(&bytes)?;
2206 file.sync_all()?;
2207 Ok(())
2208 })();
2209 if let Err(e) = write_result {
2210 slog_warn!("failed to write semantic index: {}", e);
2211 let _ = fs::remove_file(&tmp_path);
2212 return;
2213 }
2214 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2215 slog_warn!("failed to rename semantic index: {}", e);
2216 let _ = fs::remove_file(&tmp_path);
2217 return;
2218 }
2219 slog_info!(
2220 "semantic index persisted: {} entries, {:.1} KB",
2221 self.entries.len(),
2222 bytes.len() as f64 / 1024.0
2223 );
2224 }
2225
2226 pub fn read_from_disk(
2228 storage_dir: &Path,
2229 project_key: &str,
2230 current_canonical_root: &Path,
2231 is_worktree_bridge: bool,
2232 expected_fingerprint: Option<&str>,
2233 ) -> Option<Self> {
2234 debug_assert!(current_canonical_root.is_absolute());
2235 let data_path = storage_dir
2236 .join("semantic")
2237 .join(project_key)
2238 .join("semantic.bin");
2239 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2240 if file_len < HEADER_BYTES_V1 {
2241 slog_warn!(
2242 "corrupt semantic index (too small: {} bytes), removing",
2243 file_len
2244 );
2245 if !is_worktree_bridge {
2246 let _ = fs::remove_file(&data_path);
2247 }
2248 return None;
2249 }
2250
2251 let bytes = fs::read(&data_path).ok()?;
2252 let version = bytes[0];
2253 if version != SEMANTIC_INDEX_VERSION_V6 {
2254 slog_info!(
2255 "cached semantic index version {} is older than {}, rebuilding",
2256 version,
2257 SEMANTIC_INDEX_VERSION_V6
2258 );
2259 if !is_worktree_bridge {
2260 let _ = fs::remove_file(&data_path);
2261 }
2262 return None;
2263 }
2264 match Self::from_bytes(&bytes, current_canonical_root) {
2265 Ok(index) => {
2266 if index.entries.is_empty() {
2267 slog_info!("cached semantic index is empty, will rebuild");
2268 if !is_worktree_bridge {
2269 let _ = fs::remove_file(&data_path);
2270 }
2271 return None;
2272 }
2273 if let Some(expected) = expected_fingerprint {
2274 let matches = index
2275 .fingerprint()
2276 .map(|fingerprint| fingerprint.matches_expected(expected))
2277 .unwrap_or(false);
2278 if !matches {
2279 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2280 if !is_worktree_bridge {
2281 let _ = fs::remove_file(&data_path);
2282 }
2283 return None;
2284 }
2285 }
2286 slog_info!(
2287 "loaded semantic index from disk: {} entries",
2288 index.entries.len()
2289 );
2290 Some(index)
2291 }
2292 Err(e) => {
2293 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2294 if !is_worktree_bridge {
2295 let _ = fs::remove_file(&data_path);
2296 }
2297 None
2298 }
2299 }
2300 }
2301
2302 pub fn to_bytes(&self) -> Vec<u8> {
2304 let mut buf = Vec::new();
2305 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2306 let encoded = fingerprint.as_string();
2307 if encoded.is_empty() {
2308 None
2309 } else {
2310 Some(encoded.into_bytes())
2311 }
2312 });
2313 let file_mtimes: Vec<_> = self
2314 .file_mtimes
2315 .iter()
2316 .filter_map(|(path, mtime)| {
2317 cache_relative_path(&self.project_root, path)
2318 .map(|relative| (relative, path, mtime))
2319 })
2320 .collect();
2321 let entries: Vec<_> = self
2322 .entries
2323 .iter()
2324 .filter_map(|entry| {
2325 cache_relative_path(&self.project_root, &entry.chunk.file)
2326 .map(|relative| (relative, entry))
2327 })
2328 .collect();
2329
2330 let version = SEMANTIC_INDEX_VERSION_V6;
2343 buf.push(version);
2344 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2345 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2346 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2347 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2348 buf.extend_from_slice(fp_bytes_ref);
2349
2350 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2353 for (relative, path, mtime) in &file_mtimes {
2354 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2355 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2356 buf.extend_from_slice(&path_bytes);
2357 let duration = mtime
2358 .duration_since(SystemTime::UNIX_EPOCH)
2359 .unwrap_or_default();
2360 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2361 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2362 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2363 buf.extend_from_slice(&size.to_le_bytes());
2364 let hash = self
2365 .file_hashes
2366 .get(*path)
2367 .copied()
2368 .unwrap_or_else(cache_freshness::zero_hash);
2369 buf.extend_from_slice(hash.as_bytes());
2370 }
2371
2372 for (relative, entry) in &entries {
2374 let c = &entry.chunk;
2375
2376 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2378 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2379 buf.extend_from_slice(&file_bytes);
2380
2381 let name_bytes = c.name.as_bytes();
2383 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2384 buf.extend_from_slice(name_bytes);
2385
2386 buf.push(symbol_kind_to_u8(&c.kind));
2388
2389 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2391 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2392 buf.push(c.exported as u8);
2393
2394 let snippet_bytes = c.snippet.as_bytes();
2396 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2397 buf.extend_from_slice(snippet_bytes);
2398
2399 let embed_bytes = c.embed_text.as_bytes();
2401 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2402 buf.extend_from_slice(embed_bytes);
2403
2404 for &val in &entry.vector {
2406 buf.extend_from_slice(&val.to_le_bytes());
2407 }
2408 }
2409
2410 buf
2411 }
2412
2413 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2415 debug_assert!(current_canonical_root.is_absolute());
2416 let mut pos = 0;
2417
2418 if data.len() < HEADER_BYTES_V1 {
2419 return Err("data too short".to_string());
2420 }
2421
2422 let version = data[pos];
2423 pos += 1;
2424 if version != SEMANTIC_INDEX_VERSION_V1
2425 && version != SEMANTIC_INDEX_VERSION_V2
2426 && version != SEMANTIC_INDEX_VERSION_V3
2427 && version != SEMANTIC_INDEX_VERSION_V4
2428 && version != SEMANTIC_INDEX_VERSION_V5
2429 && version != SEMANTIC_INDEX_VERSION_V6
2430 {
2431 return Err(format!("unsupported version: {}", version));
2432 }
2433 if (version == SEMANTIC_INDEX_VERSION_V2
2437 || version == SEMANTIC_INDEX_VERSION_V3
2438 || version == SEMANTIC_INDEX_VERSION_V4
2439 || version == SEMANTIC_INDEX_VERSION_V5
2440 || version == SEMANTIC_INDEX_VERSION_V6)
2441 && data.len() < HEADER_BYTES_V2
2442 {
2443 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2444 }
2445
2446 let dimension = read_u32(data, &mut pos)? as usize;
2447 let entry_count = read_u32(data, &mut pos)? as usize;
2448 validate_embedding_dimension(dimension)?;
2449 if entry_count > MAX_ENTRIES {
2450 return Err(format!("too many semantic index entries: {}", entry_count));
2451 }
2452
2453 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2459 || version == SEMANTIC_INDEX_VERSION_V3
2460 || version == SEMANTIC_INDEX_VERSION_V4
2461 || version == SEMANTIC_INDEX_VERSION_V5
2462 || version == SEMANTIC_INDEX_VERSION_V6;
2463 let fingerprint = if has_fingerprint_field {
2464 let fingerprint_len = read_u32(data, &mut pos)? as usize;
2465 if pos + fingerprint_len > data.len() {
2466 return Err("unexpected end of data reading fingerprint".to_string());
2467 }
2468 if fingerprint_len == 0 {
2469 None
2470 } else {
2471 let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2472 pos += fingerprint_len;
2473 Some(
2474 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2475 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2476 )
2477 }
2478 } else {
2479 None
2480 };
2481
2482 let mtime_count = read_u32(data, &mut pos)? as usize;
2484 if mtime_count > MAX_ENTRIES {
2485 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2486 }
2487
2488 let vector_bytes = entry_count
2489 .checked_mul(dimension)
2490 .and_then(|count| count.checked_mul(F32_BYTES))
2491 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2492 if vector_bytes > data.len().saturating_sub(pos) {
2493 return Err("semantic index vectors exceed available data".to_string());
2494 }
2495
2496 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2497 let mut file_sizes = HashMap::with_capacity(mtime_count);
2498 let mut file_hashes = HashMap::with_capacity(mtime_count);
2499 for _ in 0..mtime_count {
2500 let path = read_string(data, &mut pos)?;
2501 let secs = read_u64(data, &mut pos)?;
2502 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2508 || version == SEMANTIC_INDEX_VERSION_V4
2509 || version == SEMANTIC_INDEX_VERSION_V5
2510 || version == SEMANTIC_INDEX_VERSION_V6
2511 {
2512 read_u32(data, &mut pos)?
2513 } else {
2514 0
2515 };
2516 let size =
2517 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2518 read_u64(data, &mut pos)?
2519 } else {
2520 0
2521 };
2522 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2523 if pos + 32 > data.len() {
2524 return Err("unexpected end of data reading content hash".to_string());
2525 }
2526 let mut hash_bytes = [0u8; 32];
2527 hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2528 pos += 32;
2529 blake3::Hash::from_bytes(hash_bytes)
2530 } else {
2531 cache_freshness::zero_hash()
2532 };
2533 if nanos >= 1_000_000_000 {
2540 return Err(format!(
2541 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2542 nanos
2543 ));
2544 }
2545 let duration = std::time::Duration::new(secs, nanos);
2546 let mtime = SystemTime::UNIX_EPOCH
2547 .checked_add(duration)
2548 .ok_or_else(|| {
2549 format!(
2550 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2551 secs, nanos
2552 )
2553 })?;
2554 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2555 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2556 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2557 } else {
2558 PathBuf::from(path)
2559 };
2560 file_mtimes.insert(path.clone(), mtime);
2561 file_sizes.insert(path.clone(), size);
2562 file_hashes.insert(path, content_hash);
2563 }
2564
2565 let mut entries = Vec::with_capacity(entry_count);
2567 for _ in 0..entry_count {
2568 let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2569 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2570 cached_path_under_root(current_canonical_root, &raw_file)
2571 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2572 } else {
2573 raw_file
2574 };
2575 let name = read_string(data, &mut pos)?;
2576
2577 if pos >= data.len() {
2578 return Err("unexpected end of data".to_string());
2579 }
2580 let kind = u8_to_symbol_kind(data[pos]);
2581 pos += 1;
2582
2583 let start_line = read_u32(data, &mut pos)?;
2584 let end_line = read_u32(data, &mut pos)?;
2585
2586 if pos >= data.len() {
2587 return Err("unexpected end of data".to_string());
2588 }
2589 let exported = data[pos] != 0;
2590 pos += 1;
2591
2592 let snippet = read_string(data, &mut pos)?;
2593 let embed_text = read_string(data, &mut pos)?;
2594
2595 let vec_bytes = dimension
2597 .checked_mul(F32_BYTES)
2598 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2599 if pos + vec_bytes > data.len() {
2600 return Err("unexpected end of data reading vector".to_string());
2601 }
2602 let mut vector = Vec::with_capacity(dimension);
2603 for _ in 0..dimension {
2604 let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2605 vector.push(f32::from_le_bytes(bytes));
2606 pos += 4;
2607 }
2608
2609 entries.push(EmbeddingEntry {
2610 chunk: SemanticChunk {
2611 file,
2612 name,
2613 kind,
2614 start_line,
2615 end_line,
2616 exported,
2617 embed_text,
2618 snippet,
2619 },
2620 vector,
2621 });
2622 }
2623
2624 if entries.len() != entry_count {
2625 return Err(format!(
2626 "semantic cache entry count drift: header={} decoded={}",
2627 entry_count,
2628 entries.len()
2629 ));
2630 }
2631 for entry in &entries {
2632 if !file_mtimes.contains_key(&entry.chunk.file) {
2633 return Err(format!(
2634 "semantic cache metadata missing for entry file {}",
2635 entry.chunk.file.display()
2636 ));
2637 }
2638 }
2639
2640 Ok(Self {
2641 entries,
2642 file_mtimes,
2643 file_sizes,
2644 file_hashes,
2645 dimension,
2646 fingerprint,
2647 project_root: current_canonical_root.to_path_buf(),
2648 deferred_files: HashSet::new(),
2649 })
2650 }
2651}
2652
2653fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2655 let relative = file
2656 .strip_prefix(project_root)
2657 .unwrap_or(file)
2658 .to_string_lossy();
2659
2660 let kind_label = match &symbol.kind {
2661 SymbolKind::Function => "function",
2662 SymbolKind::Class => "class",
2663 SymbolKind::Method => "method",
2664 SymbolKind::Struct => "struct",
2665 SymbolKind::Interface => "interface",
2666 SymbolKind::Enum => "enum",
2667 SymbolKind::TypeAlias => "type",
2668 SymbolKind::Variable => "variable",
2669 SymbolKind::Heading => "heading",
2670 SymbolKind::FileSummary => "file-summary",
2671 };
2672
2673 let name = &symbol.name;
2675 let mut text = format!(
2676 "name:{name} file:{} kind:{} name:{name}",
2677 relative, kind_label
2678 );
2679
2680 if let Some(sig) = &symbol.signature {
2681 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2689 }
2690
2691 let lines: Vec<&str> = source.lines().collect();
2693 let start = (symbol.range.start_line as usize).min(lines.len());
2694 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2696 if start < end {
2697 let body: String = lines[start..end]
2698 .iter()
2699 .take(15) .copied()
2701 .collect::<Vec<&str>>()
2702 .join("\n");
2703 let snippet = if body.len() > 300 {
2704 format!("{}...", &body[..body.floor_char_boundary(300)])
2705 } else {
2706 body
2707 };
2708 text.push_str(&format!(" body:{}", snippet));
2709 }
2710
2711 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2716}
2717
2718const MAX_EMBED_TEXT_CHARS: usize = 1600;
2722
/// Returns the first `max_chars` characters of `value` as an owned string.
///
/// Counting is done in `char`s, not bytes, so the cut never lands inside a
/// multi-byte character. Shorter inputs are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        // Byte offset of the first character that does not fit.
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
2726
2727fn first_leading_doc_comment(source: &str) -> String {
2728 let lines: Vec<&str> = source.lines().collect();
2729 let Some((start, first)) = lines
2730 .iter()
2731 .enumerate()
2732 .find(|(_, line)| !line.trim().is_empty())
2733 else {
2734 return String::new();
2735 };
2736
2737 let trimmed = first.trim_start();
2738 if trimmed.starts_with("/**") {
2739 let mut comment = Vec::new();
2740 for line in lines.iter().skip(start) {
2741 comment.push(*line);
2742 if line.contains("*/") {
2743 break;
2744 }
2745 }
2746 return truncate_chars(&comment.join("\n"), 200);
2747 }
2748
2749 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2750 let comment = lines
2751 .iter()
2752 .skip(start)
2753 .take_while(|line| {
2754 let trimmed = line.trim_start();
2755 trimmed.starts_with("///") || trimmed.starts_with("//!")
2756 })
2757 .copied()
2758 .collect::<Vec<_>>()
2759 .join("\n");
2760 return truncate_chars(&comment, 200);
2761 }
2762
2763 String::new()
2764}
2765
2766pub fn build_file_summary_chunk(
2767 file: &Path,
2768 project_root: &Path,
2769 source: &str,
2770 top_exports: &[&str],
2771 top_export_signatures: &[Option<&str>],
2772) -> SemanticChunk {
2773 let relative = file.strip_prefix(project_root).unwrap_or(file);
2774 let rel_path = relative.to_string_lossy();
2775 let parent_dir = relative
2776 .parent()
2777 .map(|parent| parent.to_string_lossy().to_string())
2778 .unwrap_or_default();
2779 let name = file
2780 .file_stem()
2781 .map(|stem| stem.to_string_lossy().to_string())
2782 .unwrap_or_default();
2783 let doc = first_leading_doc_comment(source);
2784 let exports = top_exports
2785 .iter()
2786 .take(5)
2787 .copied()
2788 .collect::<Vec<_>>()
2789 .join(",");
2790 let snippet = if doc.is_empty() {
2791 top_export_signatures
2792 .first()
2793 .and_then(|signature| signature.as_deref())
2794 .map(|signature| truncate_chars(signature, 200))
2795 .unwrap_or_default()
2796 } else {
2797 doc.clone()
2798 };
2799
2800 SemanticChunk {
2801 file: file.to_path_buf(),
2802 name,
2803 kind: SymbolKind::FileSummary,
2804 start_line: 0,
2805 end_line: 0,
2806 exported: false,
2807 embed_text: truncate_chars(
2808 &format!(
2809 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2810 file.file_stem()
2811 .map(|stem| stem.to_string_lossy().to_string())
2812 .unwrap_or_default()
2813 ),
2814 MAX_EMBED_TEXT_CHARS,
2815 ),
2816 snippet,
2817 }
2818}
2819
2820fn parser_for(
2821 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2822 lang: crate::parser::LangId,
2823) -> Result<&mut Parser, String> {
2824 use std::collections::hash_map::Entry;
2825
2826 match parsers.entry(lang) {
2827 Entry::Occupied(entry) => Ok(entry.into_mut()),
2828 Entry::Vacant(entry) => {
2829 let grammar = grammar_for(lang);
2830 let mut parser = Parser::new();
2831 parser
2832 .set_language(&grammar)
2833 .map_err(|error| error.to_string())?;
2834 Ok(entry.insert(parser))
2835 }
2836 }
2837}
2838
/// Reports whether `path` has one of the file extensions covered by semantic
/// indexing.
///
/// The check is an exact, case-sensitive comparison of the final extension.
/// Paths without an extension, or whose extension is not valid UTF-8, are
/// rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2871
2872fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2873 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2874 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2875 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2876 .map_err(|error| error.to_string())?
2877 .unwrap_or_else(cache_freshness::zero_hash);
2878 Ok(IndexedFileMetadata {
2879 mtime,
2880 size: metadata.len(),
2881 content_hash,
2882 })
2883}
2884
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// Resolution is attempted in this order:
/// 1. the full path;
/// 2. the canonical parent directory joined with the original file name;
/// 3. the input unchanged, when the path has no parent or file name, or the
///    parent cannot be canonicalized either.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| {
        match (path.parent(), path.file_name()) {
            (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
                Ok(canonical_parent) => canonical_parent.join(file_name),
                Err(_) => path.to_path_buf(),
            },
            _ => path.to_path_buf(),
        }
    })
}
2901
2902fn collect_file_chunks(
2903 project_root: &Path,
2904 file: &Path,
2905 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2906) -> Result<Vec<SemanticChunk>, String> {
2907 if !is_semantic_indexed_extension(file) {
2908 return Err("unsupported file extension".to_string());
2909 }
2910 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2911 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2912 let tree = parser_for(parsers, lang)?
2913 .parse(&source, None)
2914 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2915 let symbols =
2916 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2917
2918 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2919}
2920
2921fn build_snippet(symbol: &Symbol, source: &str) -> String {
2923 let lines: Vec<&str> = source.lines().collect();
2924 let start = (symbol.range.start_line as usize).min(lines.len());
2925 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2927 if start < end {
2928 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2929 let mut snippet = snippet_lines.join("\n");
2930 if end - start > 5 {
2931 snippet.push_str("\n ...");
2932 }
2933 if snippet.len() > 300 {
2934 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2935 }
2936 snippet
2937 } else {
2938 String::new()
2939 }
2940}
2941
2942fn symbols_to_chunks(
2944 file: &Path,
2945 symbols: &[Symbol],
2946 source: &str,
2947 project_root: &Path,
2948) -> Vec<SemanticChunk> {
2949 let mut chunks = Vec::new();
2950 let top_exports_with_signatures = symbols
2951 .iter()
2952 .filter(|symbol| {
2953 symbol.exported
2954 && symbol.parent.is_none()
2955 && !matches!(symbol.kind, SymbolKind::Heading)
2956 })
2957 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2958 .collect::<Vec<_>>();
2959
2960 let has_only_headings = !symbols.is_empty()
2961 && symbols
2962 .iter()
2963 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2964 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2965 let top_exports = top_exports_with_signatures
2966 .iter()
2967 .map(|(name, _)| *name)
2968 .collect::<Vec<_>>();
2969 let top_export_signatures = top_exports_with_signatures
2970 .iter()
2971 .map(|(_, signature)| *signature)
2972 .collect::<Vec<_>>();
2973 chunks.push(build_file_summary_chunk(
2974 file,
2975 project_root,
2976 source,
2977 &top_exports,
2978 &top_export_signatures,
2979 ));
2980 }
2981
2982 for symbol in symbols {
2983 if matches!(symbol.kind, SymbolKind::Heading) {
2988 continue;
2989 }
2990
2991 let line_count = symbol
2993 .range
2994 .end_line
2995 .saturating_sub(symbol.range.start_line)
2996 + 1;
2997 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2998 continue;
2999 }
3000
3001 let embed_text = build_embed_text(symbol, source, file, project_root);
3002 let snippet = build_snippet(symbol, source);
3003
3004 chunks.push(SemanticChunk {
3005 file: file.to_path_buf(),
3006 name: symbol.name.clone(),
3007 kind: symbol.kind.clone(),
3008 start_line: symbol.range.start_line,
3009 end_line: symbol.range.end_line,
3010 exported: symbol.exported,
3011 embed_text,
3012 snippet,
3013 });
3014
3015 }
3018
3019 chunks
3020}
3021
/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when the slices differ in length or when the product of
/// their norms is zero (which includes empty input); otherwise returns
/// `dot(a, b) / (|a| * |b|)`.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulates the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3045
3046fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3048 match kind {
3049 SymbolKind::Function => 0,
3050 SymbolKind::Class => 1,
3051 SymbolKind::Method => 2,
3052 SymbolKind::Struct => 3,
3053 SymbolKind::Interface => 4,
3054 SymbolKind::Enum => 5,
3055 SymbolKind::TypeAlias => 6,
3056 SymbolKind::Variable => 7,
3057 SymbolKind::Heading => 8,
3058 SymbolKind::FileSummary => 9,
3059 }
3060}
3061
3062fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3063 match v {
3064 0 => SymbolKind::Function,
3065 1 => SymbolKind::Class,
3066 2 => SymbolKind::Method,
3067 3 => SymbolKind::Struct,
3068 4 => SymbolKind::Interface,
3069 5 => SymbolKind::Enum,
3070 6 => SymbolKind::TypeAlias,
3071 7 => SymbolKind::Variable,
3072 8 => SymbolKind::Heading,
3073 9 => SymbolKind::FileSummary,
3074 _ => SymbolKind::Heading,
3075 }
3076}
3077
/// Reads a little-endian `u32` at `*pos` and advances the cursor by four bytes.
///
/// # Errors
///
/// Returns an error, leaving `*pos` unchanged, when fewer than four bytes
/// remain.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let start = *pos;
    let end = start + 4;
    if end > data.len() {
        return Err("unexpected end of data reading u32".to_string());
    }

    let raw: [u8; 4] = data[start..end]
        .try_into()
        .expect("slice is exactly four bytes long");
    *pos = end;
    Ok(u32::from_le_bytes(raw))
}
3086
/// Reads a little-endian `u64` at `*pos` and advances the cursor by eight bytes.
///
/// # Errors
///
/// Returns an error, leaving `*pos` unchanged, when fewer than eight bytes
/// remain.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let start = *pos;
    let end = start + 8;
    if end > data.len() {
        return Err("unexpected end of data reading u64".to_string());
    }

    let mut raw = [0u8; 8];
    raw.copy_from_slice(&data[start..end]);
    *pos = end;
    Ok(u64::from_le_bytes(raw))
}
3095
3096fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
3097 let len = read_u32(data, pos)? as usize;
3098 if *pos + len > data.len() {
3099 return Err("unexpected end of data reading string".to_string());
3100 }
3101 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
3102 *pos += len;
3103 Ok(s)
3104}
3105
3106#[cfg(test)]
3107mod tests {
3108 use super::*;
3109 use crate::config::{SemanticBackend, SemanticBackendConfig};
3110 use crate::parser::FileParser;
3111 use std::io::{Read, Write};
3112 use std::net::TcpListener;
3113 use std::thread;
3114
/// `.inc`, `.php` and `.scss` files must be accepted by the semantic-index
/// extension filter.
#[test]
fn semantic_index_includes_php_inc_and_scss_extensions() {
    let candidates = ["partial.inc", "index.php", "styles.scss"];
    for name in candidates {
        let eligible = is_semantic_indexed_extension(Path::new(name));
        assert!(eligible, "{name} should be semantic-index eligible");
    }
}
3124
/// A marker-prefixed failure is classified as transient and loses the marker
/// when stripped; unmarked failures are not transient and pass through
/// `strip_transient_embedding_marker` unchanged.
#[test]
fn transient_marker_round_trips_and_classifies() {
    let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
    assert!(embedding_failure_is_transient(&marked));

    let stripped = strip_transient_embedding_marker(&marked);
    assert!(!stripped.contains(TRANSIENT_EMBEDDING_MARKER));
    assert!(stripped.starts_with("openai compatible request failed:"));

    let permanent_failures = [
        "openai compatible request failed (HTTP 401): Unauthorized",
        "embedding dimension mismatch: index has 384, model returned 768",
        "too many files (>20000) for semantic indexing (max 20000)",
    ];
    for message in permanent_failures {
        assert!(
            !embedding_failure_is_transient(message),
            "{message:?} must not be transient"
        );
        assert_eq!(strip_transient_embedding_marker(message), message);
    }
}
3150
/// 500 and 429 responses are retryable; 401 and 400 responses are not.
#[test]
fn send_error_transience_separates_connect_timeout_from_4xx() {
    let expectations = [
        (reqwest::StatusCode::INTERNAL_SERVER_ERROR, true),
        (reqwest::StatusCode::TOO_MANY_REQUESTS, true),
        (reqwest::StatusCode::UNAUTHORIZED, false),
        (reqwest::StatusCode::BAD_REQUEST, false),
    ];

    for (status, retryable) in expectations {
        assert_eq!(
            is_retryable_embedding_status(status),
            retryable,
            "unexpected retry classification for {status}"
        );
    }
}
3167
/// On a 400 response, bodies that report a model still loading or unloaded
/// count as transient, while other error bodies do not. The same loading body
/// on a 401 response is never transient.
#[test]
fn local_backend_model_loading_body_is_transient() {
    let bad_request = reqwest::StatusCode::BAD_REQUEST;

    let transient_bodies = [
        r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
        r#"{"error":"model is loading, please wait"}"#,
        r#"{"error":"Model not loaded"}"#,
        "Loading model into memory",
    ];
    for body in transient_bodies {
        assert!(
            embedding_response_body_is_transient(bad_request, body),
            "{body:?} should be body-transient"
        );
    }

    let permanent_bodies = [
        r#"{"error":"invalid api key"}"#,
        r#"{"error":"model 'foo' not found"}"#,
        "Bad Request: unknown field",
        "Bad Request: invalid loading model option",
        r#"{"error":"unauthorized while model is being loaded by another account"}"#,
    ];
    for body in permanent_bodies {
        assert!(
            !embedding_response_body_is_transient(bad_request, body),
            "{body:?} must not be body-transient"
        );
    }

    let loading_body = r#"{"error":"model is loading, please wait"}"#;
    assert!(
        !embedding_response_body_is_transient(reqwest::StatusCode::UNAUTHORIZED, loading_body),
        "permanent auth failures must not become transient because of body text"
    );
}
3208
/// Starts a one-shot HTTP server on an ephemeral loopback port for tests.
///
/// The spawned thread accepts exactly one connection, reads the request head
/// plus the number of body bytes announced by `Content-Length`, and calls
/// `handler(request_line, path, body)`. The handler's return value is sent
/// back as the body of a `200 OK` `application/json` response with
/// `Connection: close`.
///
/// The `Content-Length` header name is matched case-insensitively, because HTTP
/// field names are case-insensitive and clients may send them in lowercase.
///
/// Returns the base URL (`http://<addr>`) and the server thread's handle; join
/// the handle to surface panics raised by the server or the handler.
fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
where
    F: Fn(String, String, String) -> String + Send + 'static,
{
    const HEADER_TERMINATOR: &[u8] = b"\r\n\r\n";

    /// Returns the body length declared in a raw request head. The last
    /// `Content-Length` line wins; a missing or unparsable value counts as 0.
    fn declared_content_length(head: &str) -> usize {
        head.lines()
            .filter_map(|line| line.split_once(':'))
            .filter(|(name, _)| name.eq_ignore_ascii_case("content-length"))
            .map(|(_, value)| value.trim().parse::<usize>().unwrap_or(0))
            .last()
            .unwrap_or(0)
    }

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept request");
        let mut received = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut head_len: Option<usize> = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read request");
            if n == 0 {
                break;
            }
            received.extend_from_slice(&chunk[..n]);

            if head_len.is_none() {
                let terminator_at = received
                    .windows(HEADER_TERMINATOR.len())
                    .position(|window| window == HEADER_TERMINATOR);
                if let Some(at) = terminator_at {
                    let end = at + HEADER_TERMINATOR.len();
                    content_length =
                        declared_content_length(&String::from_utf8_lossy(&received[..end]));
                    head_len = Some(end);
                }
            }

            // Stop as soon as the head and the announced body have arrived.
            if head_len.is_some_and(|end| received.len() >= end + content_length) {
                break;
            }
        }

        let end = head_len.expect("header terminator");
        // Clamp the body range so a client that closes early yields a short body
        // instead of an out-of-bounds panic.
        let body_end = (end + content_length).min(received.len());
        let request = String::from_utf8_lossy(&received[..end]).to_string();
        let body = String::from_utf8_lossy(&received[end..body_end]).to_string();
        let request_line = request.lines().next().expect("request line").to_string();
        let path = request_line
            .split_whitespace()
            .nth(1)
            .expect("request path")
            .to_string();

        let response_body = handler(request_line, path, body);
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            response_body.len(),
            response_body
        );
        stream
            .write_all(response.as_bytes())
            .expect("write response");
    });

    (format!("http://{}", addr), handle)
}
3268
/// Starts a loopback server whose responses promise more body than they send.
///
/// For up to `attempts` connections, and for at most two seconds, the server
/// thread reads once from each accepted connection and answers with a
/// `200 OK` head announcing `Content-Length: 128` followed by a single `{`.
/// Dropping the stream then closes the connection with the body incomplete.
///
/// Returns the base URL (`http://<addr>`) and the server thread's handle.
fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
    const TRUNCATED_RESPONSE: &str = "HTTP/1.1 200 OK\nContent-Type: application/json\nContent-Length: 128\nConnection: close\n\n{";

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
    listener
        .set_nonblocking(true)
        .expect("nonblocking listener");
    let addr = listener.local_addr().expect("local addr");

    let handle = thread::spawn(move || {
        let started = std::time::Instant::now();
        let budget = Duration::from_secs(2);
        let mut served = 0usize;
        loop {
            if served >= attempts || started.elapsed() >= budget {
                break;
            }
            // The listener is non-blocking, so poll with a short sleep between tries.
            let mut stream = match listener.accept() {
                Ok((stream, _)) => stream,
                Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                    thread::sleep(Duration::from_millis(10));
                    continue;
                }
                Err(error) => panic!("accept request: {error}"),
            };
            served += 1;

            // Best effort on both sides: the client may already have gone away.
            let mut scratch = [0u8; 4096];
            let _ = stream.read(&mut scratch);
            let _ = stream.write_all(TRUNCATED_RESPONSE.as_bytes());
        }
    });

    (format!("http://{}", addr), handle)
}
3309
/// A response whose body ends before the announced length must fail with a
/// transient-marked "response read failed" error.
#[test]
fn response_body_read_failures_are_marked_transient() {
    let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
    let client = Client::builder()
        .timeout(Duration::from_millis(250))
        .build()
        .expect("client");

    let build_request = || client.post(&url).body("{}");
    let outcome = send_embedding_request(build_request, "test backend");
    let error = outcome.expect_err("truncated body should fail");

    handle.join().unwrap();
    assert!(
        embedding_failure_is_transient(&error),
        "body read failures should be transient-marked: {error}"
    );
    assert!(error.contains("response read failed"));
}
3328
/// Test embedder: returns the unit vector `[1.0, 0.0, 0.0]` once per input
/// text and never fails.
fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    Ok(vec![vec![1.0, 0.0, 0.0]; texts.len()])
}
3332
/// Writes a minimal Rust source file to `path` containing one public function
/// named `function_name` that returns `true`. Panics if the write fails.
fn write_rust_file(path: &Path, function_name: &str) {
    let contents = format!("pub fn {function_name}() -> bool {{\n true\n}}\n");
    fs::write(path, contents).unwrap();
}
3340
3341 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3342 let mut embed = test_vector_for_texts;
3343 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3344 }
3345
/// Returns the process's current working directory, which these tests use as
/// the project root. Panics if it cannot be determined.
fn test_project_root() -> PathBuf {
    let cwd = std::env::current_dir();
    cwd.unwrap()
}
3349
3350 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3351 index.file_mtimes.insert(file.to_path_buf(), mtime);
3352 index.file_sizes.insert(file.to_path_buf(), size);
3353 index
3354 .file_hashes
3355 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3356 }
3357
/// Files and entries located outside the project root must be dropped during
/// serialization, so the reloaded index contains neither.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let project = fs::canonicalize(dir.path()).expect("canonical project");
    let outside = project.join("..").join("outside.rs");

    let mut index = SemanticIndex::new(project.clone(), 3);
    set_file_metadata(&mut index, &outside, SystemTime::UNIX_EPOCH, 1);

    let chunk = SemanticChunk {
        file: outside,
        name: "outside".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text: "outside".to_string(),
        snippet: "outside".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0, 0.0, 0.0],
    });

    let bytes = index.to_bytes();
    let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
    assert_eq!(loaded.entries.len(), 0);
    assert!(loaded.file_mtimes.is_empty());
}
3390
/// Identical vectors have a cosine similarity of 1.
#[test]
fn test_cosine_similarity_identical() {
    let unit_x = [1.0f32, 0.0, 0.0];
    let similarity = cosine_similarity(&unit_x, &unit_x);
    assert!((similarity - 1.0).abs() < 0.001);
}
3397
/// Orthogonal vectors have a cosine similarity of 0.
#[test]
fn test_cosine_similarity_orthogonal() {
    let unit_x = [1.0f32, 0.0, 0.0];
    let unit_y = [0.0f32, 1.0, 0.0];
    let similarity = cosine_similarity(&unit_x, &unit_y);
    assert!(similarity.abs() < 0.001);
}
3404
/// Vectors pointing in opposite directions have a cosine similarity of -1.
#[test]
fn test_cosine_similarity_opposite() {
    let unit_x = [1.0f32, 0.0, 0.0];
    let negated = [-1.0f32, 0.0, 0.0];
    let similarity = cosine_similarity(&unit_x, &negated);
    assert!((similarity + 1.0).abs() < 0.001);
}
3411
/// An index with one entry and a fingerprint must survive `to_bytes` followed
/// by `from_bytes` with its entry, vector, dimension and labels intact.
#[test]
fn test_serialization_roundtrip() {
    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");
    let vector = vec![0.1, 0.2, 0.3, 0.4];

    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: file.clone(),
            name: "handle_request".to_string(),
            kind: SymbolKind::Function,
            start_line: 10,
            end_line: 25,
            exported: true,
            embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
            snippet: "fn handle_request() {\n // ...\n}".to_string(),
        },
        vector: vector.clone(),
    });
    index.dimension = 4;
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);
    index.set_fingerprint(SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    });

    let restored = SemanticIndex::from_bytes(&index.to_bytes(), &project_root).unwrap();

    assert_eq!(restored.entries.len(), 1);
    let entry = &restored.entries[0];
    assert_eq!(entry.chunk.name, "handle_request");
    assert_eq!(entry.vector, vector);
    assert_eq!(restored.dimension, 4);
    assert_eq!(restored.backend_label(), Some("fastembed"));
    assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
}
3453
/// Every `SymbolKind` encodes to its fixed byte code and decodes back to the
/// same variant, including `FileSummary`.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    let cases = [
        (SymbolKind::Function, 0),
        (SymbolKind::Class, 1),
        (SymbolKind::Method, 2),
        (SymbolKind::Struct, 3),
        (SymbolKind::Interface, 4),
        (SymbolKind::Enum, 5),
        (SymbolKind::TypeAlias, 6),
        (SymbolKind::Variable, 7),
        (SymbolKind::Heading, 8),
        (SymbolKind::FileSummary, 9),
    ];

    for (kind, code) in &cases {
        assert_eq!(symbol_kind_to_u8(kind), *code);
        assert_eq!(u8_to_symbol_kind(*code), *kind);
    }
}
3474
/// With three entries on orthogonal axes, a query closest to the first axis
/// must return exactly the requested two results, with `auth` ranked first
/// and scored strictly higher than the runner-up.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    let names = ["auth", "database", "handler"];
    index
        .entries
        .extend(names.iter().enumerate().map(|(axis, name)| {
            // Each entry gets the unit vector along its own axis.
            let mut vector = vec![0.0f32; 3];
            vector[axis] = 1.0;
            EmbeddingEntry {
                chunk: SemanticChunk {
                    file: PathBuf::from("/src/lib.rs"),
                    name: name.to_string(),
                    kind: SymbolKind::Function,
                    start_line: (axis * 10 + 1) as u32,
                    end_line: (axis * 10 + 5) as u32,
                    exported: true,
                    embed_text: format!("kind:function name:{}", name),
                    snippet: format!("fn {}() {{}}", name),
                },
                vector,
            }
        }));

    let query = [0.9f32, 0.1, 0.0];
    let results = index.search(&query, 2);

    assert_eq!(results.len(), 2);
    assert_eq!(results[0].name, "auth");
    assert!(results[0].score > results[1].score);
}
3507
/// Searching an index that holds no entries must yield no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);

    let hits = empty_index.search(&[0.1, 0.2, 0.3], 10);

    assert!(hits.is_empty());
}
3514
/// A symbol that starts and ends on the same line must still produce a snippet
/// containing that line's text (without the trailing newline).
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";

    // Range confined to line 0.
    let one_line = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: String::from("answer"),
        kind: SymbolKind::Variable,
        range: one_line,
        signature: Some(String::from("const answer = 42")),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    assert_eq!(build_snippet(&symbol, source), "export const answer = 42;");
}
3537
/// Runs both chunk-collection paths over this crate's own `src/semantic_index.rs`
/// and requires their chunk fingerprints to be equal.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let file = project_root.join("src/semantic_index.rs");
    let source = std::fs::read_to_string(&file).unwrap();

    // Reference path: `FileParser` symbols fed through `symbols_to_chunks`.
    let legacy_fingerprint = {
        let mut file_parser = FileParser::new();
        let symbols = file_parser.extract_symbols(&file).unwrap();
        chunk_fingerprint(&symbols_to_chunks(&file, &symbols, &source, &project_root))
    };

    // Path under test: `collect_file_chunks` with a fresh parser map.
    let optimized_fingerprint = {
        let mut parsers = HashMap::new();
        let chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
        chunk_fingerprint(&chunks)
    };

    assert_eq!(optimized_fingerprint, legacy_fingerprint);
}
3556
3557 fn chunk_fingerprint(
3558 chunks: &[SemanticChunk],
3559 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3560 chunks
3561 .iter()
3562 .map(|chunk| {
3563 (
3564 chunk.name.clone(),
3565 chunk.kind.clone(),
3566 chunk.start_line,
3567 chunk.end_line,
3568 chunk.exported,
3569 chunk.embed_text.clone(),
3570 chunk.snippet.clone(),
3571 )
3572 })
3573 .collect()
3574 }
3575
/// A serialized header announcing a dimension one past `MAX_DIMENSION` must make
/// `SemanticIndex::from_bytes` return an error.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let too_wide = (MAX_DIMENSION as u32) + 1;

    // Version byte 1 followed by three little-endian u32 fields; the first one
    // carries the oversized dimension, the remaining two are zero.
    let mut bytes = vec![1u8];
    for field in [too_wide, 0u32, 0u32] {
        bytes.extend_from_slice(&field.to_le_bytes());
    }

    assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
}
3586
/// A serialized header with a valid dimension but a count one past `MAX_ENTRIES`
/// must make `SemanticIndex::from_bytes` return an error.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let too_many = (MAX_ENTRIES as u32) + 1;

    // Version byte 1 followed by three little-endian u32 fields: the default
    // dimension, the oversized count, and a trailing zero.
    let mut bytes = vec![1u8];
    for field in [DEFAULT_DIMENSION as u32, too_many, 0u32] {
        bytes.extend_from_slice(&field.to_le_bytes());
    }

    assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
}
3597
/// `invalidate_file` must drop the file's entries together with its recorded
/// mtime and size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);

    // Seed the index with a single chunk plus metadata for `target`.
    let chunk = SemanticChunk {
        file: target.clone(),
        name: String::from("main"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: String::from("main"),
        snippet: String::from("fn main() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&target));
    assert!(!index.file_sizes.contains_key(&target));
}
3626
/// A tracked file whose cached metadata looks stale and which has been removed
/// from disk must be purged from the index and reported as deleted.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "vanished_symbol");

    let tracked = std::slice::from_ref(&file);
    let mut index = build_test_index(project_root, tracked);

    // Make the cached metadata disagree with the file, then delete the file.
    let stale_size = *index.file_sizes.get(&file).unwrap() + 1;
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, stale_size);
    fs::remove_file(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let outcome = index
        .refresh_stale_files(project_root, tracked, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(outcome.changed, 0);
    assert_eq!(outcome.added, 0);
    assert_eq!(outcome.deleted, 1);
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&file));
    assert!(!index.file_sizes.contains_key(&file));
    assert!(!index.file_hashes.contains_key(&file));
}
3660
/// When the tracked path still exists but can no longer be collected (here it
/// has been replaced by a directory), the refresh must leave the cached entries
/// and the cached metadata untouched.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "kept_symbol");

    let tracked = std::slice::from_ref(&file);
    let mut index = build_test_index(project_root, tracked);
    let original_entry_count = index.entries.len();
    let original_mtime = *index.file_mtimes.get(&file).unwrap();
    let original_size = *index.file_sizes.get(&file).unwrap();

    // Mark the cached metadata stale, then swap the file for a directory of the same name.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = original_size + 1;
    set_file_metadata(&mut index, &file, stale_mtime, stale_size);
    fs::remove_file(&file).unwrap();
    fs::create_dir(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let outcome = index
        .refresh_stale_files(project_root, tracked, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(outcome.changed, 0);
    assert_eq!(outcome.added, 0);
    assert_eq!(outcome.deleted, 0);

    // Entries survive.
    assert_eq!(index.entries.len(), original_entry_count);
    let kept = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept);

    // Metadata is still the stale values written above, not refreshed ones.
    assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
    assert_eq!(index.file_sizes.get(&file), Some(&stale_size));
}
3703
/// Refreshing with a path that was never indexed and does not exist on disk
/// must not record any metadata or entries for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let missing = project_root.join("src/missing.rs");
    // Only the parent directory is created; the file itself never exists.
    fs::create_dir_all(missing.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let candidates = std::slice::from_ref(&missing);
    let outcome = index
        .refresh_stale_files(project_root, candidates, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(outcome.added, 0);
    assert_eq!(outcome.changed, 0);
    assert_eq!(outcome.deleted, 0);
    assert!(!index.file_mtimes.contains_key(&missing));
    assert!(!index.file_sizes.contains_key(&missing));
    assert!(index.entries.is_empty());
}
3731
/// A file that is present in the refresh list but absent from the index must be
/// indexed and counted under `added`.
#[test]
fn refresh_reports_added_for_new_files() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let existing = project_root.join("src/lib.rs");
    let added = project_root.join("src/new.rs");
    fs::create_dir_all(existing.parent().unwrap()).unwrap();
    write_rust_file(&existing, "existing_symbol");
    write_rust_file(&added, "added_symbol");

    // Only `existing` is part of the initial index.
    let mut index = build_test_index(project_root, std::slice::from_ref(&existing));

    let current_files = [existing.clone(), added.clone()];
    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let outcome = index
        .refresh_stale_files(
            project_root,
            &current_files,
            &mut embed,
            8,
            &mut ignore_progress,
        )
        .unwrap();

    assert_eq!(outcome.added, 1);
    assert_eq!(outcome.changed, 0);
    assert_eq!(outcome.deleted, 0);
    assert_eq!(outcome.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&added));
    let has_added_chunk = index.entries.iter().any(|entry| entry.chunk.file == added);
    assert!(has_added_chunk);
}
3762
/// An indexed file that is removed from disk and omitted from the refresh list
/// must be dropped from the index and counted under `deleted`.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let deleted = project_root.join("src/deleted.rs");
    fs::create_dir_all(deleted.parent().unwrap()).unwrap();
    write_rust_file(&deleted, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
    fs::remove_file(&deleted).unwrap();

    // Refresh with an empty file list.
    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let outcome = index
        .refresh_stale_files(project_root, &[], &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(outcome.deleted, 1);
    assert_eq!(outcome.changed, 0);
    assert_eq!(outcome.added, 0);
    assert_eq!(outcome.total_processed, 1);
    assert!(!index.file_mtimes.contains_key(&deleted));
    assert!(index.entries.is_empty());
}
3787
/// A tracked file whose content was rewritten must be re-indexed: the summary
/// counts it under `changed`, and only the new symbol remains in the index.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "old_symbol");

    let tracked = std::slice::from_ref(&file);
    let mut index = build_test_index(project_root, tracked);

    // Reset the cached metadata and rewrite the file with a different symbol.
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&file, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let outcome = index
        .refresh_stale_files(project_root, tracked, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(outcome.changed, 1);
    assert_eq!(outcome.added, 0);
    assert_eq!(outcome.deleted, 0);
    assert_eq!(outcome.total_processed, 1);

    let has_new = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "new_symbol");
    assert!(has_new);
    let has_old = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "old_symbol");
    assert!(!has_old);
}
3825
/// When nothing changed since the index was built, the refresh must be a no-op:
/// the embedding callback is never invoked and the entry count stays the same.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let sandbox = tempfile::tempdir().unwrap();
    let project_root = sandbox.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "clean_symbol");

    let tracked = std::slice::from_ref(&file);
    let mut index = build_test_index(project_root, tracked);
    let entries_before = index.entries.len();

    // Wrap the embedder so the test can observe whether it was ever called.
    let mut embed_called = false;
    let mut embed = |texts: Vec<String>| {
        embed_called = true;
        test_vector_for_texts(texts)
    };
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let outcome = index
        .refresh_stale_files(project_root, tracked, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert!(outcome.is_noop());
    assert_eq!(outcome.total_processed, 1);
    assert!(!embed_called);
    assert_eq!(index.entries.len(), entries_before);
}
3857
/// A dlopen failure message naming the ONNX Runtime shared library must be
/// classified as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";

    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);

    assert!(unavailable);
}
3864
/// The formatted init error for a missing ONNX Runtime must lead with the
/// install hint and still include the original error text.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let dlopen_failure =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";

    let formatted = format_embedding_init_error(dlopen_failure);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
3874
/// Drives the OpenAI-compatible backend against a local mock server and checks
/// that it POSTs to `/v1/embeddings` and returns the two mocked embeddings.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    // The mock validates the request and replies with two embeddings.
    let (base_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut embedder = SemanticEmbeddingModel::from_config(&config).unwrap();

    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    server.join().unwrap();
}
3901
/// Captures the raw HTTP request sent by the OpenAI-compatible backend and checks
/// that it carries exactly one `Content-Type` header and a JSON body that names
/// the configured model.
///
/// The capture server reads until the header terminator has been seen and the
/// number of body bytes announced by `Content-Length` has arrived.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    // Port 0 lets the OS pick a free port.
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive, and clients may send
                        // them in lowercase. Matching only the canonical
                        // `Content-Length:` spelling would leave the length at 0 and
                        // stop reading before the body has arrived.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                // Headers plus the announced body length have been received.
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively so a duplicate in any casing is caught.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3994
/// Drives the Ollama backend against a local mock server and checks that it
/// POSTs to `/api/embed` and returns the two mocked embeddings.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    // The mock validates the request and replies with two embeddings.
    let (base_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: String::from("embeddinggemma"),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut embedder = SemanticEmbeddingModel::from_config(&config).unwrap();

    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
    server.join().unwrap();
}
4021
/// A persisted index loads when the caller supplies the fingerprint it was
/// written with, and is rejected when the supplied fingerprint differs.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";

    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");

    // Build a one-entry, three-dimensional index and persist it.
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);
    index.set_fingerprint(SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    });
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint: the cache is accepted.
    let matching = index.fingerprint().unwrap().as_string();
    let accepted = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(accepted.is_some());

    // Different backend, model and base URL: the cache is refused.
    let other_backend = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let mismatched = other_backend.as_string();
    let refused = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(refused.is_none());
}
4084
/// A cache file whose version byte is `SEMANTIC_INDEX_VERSION_V3` must be refused
/// by `read_from_disk` even with a matching fingerprint, and the file must be
/// gone afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&dir).unwrap();
    let cache_file = dir.join("semantic.bin");

    let source_path = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);
    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Serialize with the current writer, then overwrite the leading version byte with V3.
    let mut bytes = index.to_bytes();
    bytes[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, bytes).unwrap();

    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
4134
4135 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
4136 crate::symbols::Symbol {
4137 name: name.to_string(),
4138 kind,
4139 range: crate::symbols::Range {
4140 start_line: start,
4141 start_col: 0,
4142 end_line: end,
4143 end_col: 0,
4144 },
4145 signature: None,
4146 scope_chain: Vec::new(),
4147 exported: false,
4148 parent: None,
4149 }
4150 }
4151
/// With only `Heading` symbols as input, `symbols_to_chunks` must return no chunks.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let file = project_root.join("README.md");
    let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let headings = vec![
        make_symbol(SymbolKind::Heading, "Title", 0, 2),
        make_symbol(SymbolKind::Heading, "Section", 4, 6),
    ];

    let chunks = symbols_to_chunks(&file, &headings, source, &project_root);
    let produced = chunks.len();
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        produced
    );
}
4174
/// A symbol with a very long signature must still yield embed text of at most
/// `MAX_EMBED_TEXT_CHARS` characters.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let file = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    // 2000 repetitions of an 8-character token.
    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some("kubectl ".repeat(2000));

    let text = build_embed_text(&symbol, source, &file, &project_root);
    let char_count = text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
4199
/// With a mix of heading and code symbols, `symbols_to_chunks` must keep the
/// code symbols, add a file-summary chunk, and drop the heading.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let file = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n    true\n}\n";

    let symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);

    let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
    for expected in ["handle_request", "AuthService"] {
        assert!(names.contains(&expected));
    }
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
4234
/// `localhost`-style hostnames, with or without a port, must pass the SSRF check.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    let loopback_hosts = [
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ];

    for host in loopback_hosts.iter() {
        assert!(
            validate_base_url_no_ssrf(host).is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            validate_base_url_no_ssrf(host)
        );
    }
}
4253
/// Addresses in the `127.x.x.x` range, with or without a port, must pass the SSRF check.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            verdict
        );
    }
}
4272
/// Non-loopback addresses from the listed private, link-local and shared ranges
/// must be rejected by the SSRF check.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in private_urls.iter() {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            verdict
        );
    }
}
4294
/// Hostnames ending in `.local` must be rejected by the SSRF check.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_hosts.iter() {
        let verdict = validate_base_url_no_ssrf(host);
        assert!(
            verdict.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            verdict
        );
    }
}
4312
/// `normalize_base_url` must accept both a loopback IP and `localhost` with a port.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    for local_url in ["http://127.0.0.1:9999", "http://localhost:8080"] {
        assert!(normalize_base_url(local_url).is_ok());
    }
}
4320
/// The ONNX Runtime version-mismatch message must report the detected version and
/// library path, state the `v1.20+` requirement, and present the auto-fix option
/// (with its exact command) before the manual removal option.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    // Diagnostics: what was found, where, and what is required.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering: the auto-fix suggestion must appear before the manual one.
    let auto_fix_pos = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let remove_pos = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_pos < remove_pos,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    // The exact command has to be in the message.
    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
4361
/// When the requested library name carries no version but a resolved path does,
/// version detection must use the resolved path and report it as the source.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested = "libonnxruntime.so";
    let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The bare requested name yields no version on its own.
    assert_eq!(detect_ort_version_from_path(requested), None);

    let resolved = Some(actual.to_string());
    let (version, source) = detect_ort_version_from_resolved_or_requested(resolved, requested);

    assert_eq!(version, Some("1.19.0".to_string()));
    assert_eq!(source, actual);

    // The mismatch message built from those values mentions both of them.
    let detected = version.unwrap();
    let msg = format_ort_version_mismatch(&detected, &source);
    assert!(msg.contains("v1.19.0"));
    assert!(msg.contains(actual));
}
4379
/// The mismatch message for a `.dylib` path must include the version and the
/// path, and must show the path wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
4396}