1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
// NOTE(review): DEFAULT_DIMENSION, MAX_ENTRIES, F32_BYTES, the HEADER_BYTES_*
// values and the SEMANTIC_INDEX_VERSION_* values are not referenced in the part
// of the file reviewed here; presumably they describe the on-disk index format —
// confirm against the (de)serialization code.
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding vector length accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Prefix that `format_embedding_init_error` puts in front of errors recognised
/// as "ONNX Runtime could not be loaded".
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended to an OpenAI-compatible base URL (after `/v1`).
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is `0`
/// (applied to every backend, despite the name).
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is `0`.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query vectors kept by `embed_query_cached`.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when none is configured
/// or the configured one cannot be normalized.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before retry N, indexed by the zero-based attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held while `SemanticIndexLock::acquire` waits for the file lock, so
/// acquisitions within this process are serialized.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
63pub struct SemanticIndexLock {
64 _guard: fs_lock::LockGuard,
65}
66
67impl SemanticIndexLock {
68 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69 let dir = storage_dir.join("semantic").join(project_key);
70 fs::create_dir_all(&dir)?;
71 let path = dir.join("cache.lock");
72 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73 .lock()
74 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75 fs_lock::try_acquire(&path, Duration::from_secs(2))
76 .map(|guard| Self { _guard: guard })
77 .map_err(|error| match error {
78 fs_lock::AcquireError::Timeout => {
79 std::io::Error::other("timed out acquiring semantic cache lock")
80 }
81 fs_lock::AcquireError::Io(error) => error,
82 })
83 }
84}
85
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string`; `matches_expected` compares that JSON
/// against a stored string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name from the configuration.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder.
    /// Deserializes to an empty string when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the vectors produced by the model.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()`
    /// when the field is absent from the serialized form.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
96
/// Chunking version written into new fingerprints and assumed for serialized
/// fingerprints that carry no `chunking_version` field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103 let base_url = config
106 .base_url
107 .as_ref()
108 .and_then(|u| normalize_base_url(u).ok())
109 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110 Self {
111 backend: config.backend.as_str().to_string(),
112 model: config.model.clone(),
113 base_url,
114 dimension,
115 chunking_version: default_chunking_version(),
116 }
117 }
118
119 pub fn as_string(&self) -> String {
120 serde_json::to_string(self).unwrap_or_else(|_| String::new())
121 }
122
123 fn matches_expected(&self, expected: &str) -> bool {
124 let encoded = self.as_string();
125 !encoded.is_empty() && encoded == expected
126 }
127}
128
/// Backend-specific state needed to turn text into vectors.
enum SemanticEmbeddingEngine {
    /// In-process embedder (selected for `SemanticBackend::Fastembed`).
    Local(LocalEmbedder),
    /// Remote endpoint speaking the OpenAI embeddings protocol.
    OpenAiCompatible {
        /// Blocking HTTP client configured with the request timeout.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
        /// Bearer token read from the configured environment variable, if any.
        api_key: Option<String>,
    },
    /// Remote Ollama server.
    Ollama {
        /// Blocking HTTP client configured with the request timeout.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
    },
}
145
/// Embedding model facade: configuration, the backend engine, and a small
/// cache of query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend selected in the configuration.
    backend: SemanticBackend,
    /// Model name as configured.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (default applied when configured as 0).
    timeout_ms: u64,
    /// Effective batch size (default applied when configured as 0).
    max_batch_size: usize,
    /// Vector length, once known from a probe or a remote embedding response.
    dimension: Option<usize>,
    /// Backend-specific state used to compute embeddings.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front entry is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls answered from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to compute an embedding.
    query_embedding_cache_misses: u64,
}

/// Short alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163 vectors: &[Vec<f32>],
164 expected_count: usize,
165 context: &str,
166) -> Result<(), String> {
167 if expected_count > 0 && vectors.is_empty() {
168 return Err(format!(
169 "{context} returned no vectors for {expected_count} inputs"
170 ));
171 }
172
173 if vectors.len() != expected_count {
174 return Err(format!(
175 "{context} returned {} vectors for {} inputs",
176 vectors.len(),
177 expected_count
178 ));
179 }
180
181 let Some(first_vector) = vectors.first() else {
182 return Ok(());
183 };
184 let expected_dimension = first_vector.len();
185 validate_embedding_dimension(expected_dimension)
186 .map_err(|error| format!("{context} returned {error}"))?;
187 for (index, vector) in vectors.iter().enumerate() {
188 if vector.len() != expected_dimension {
189 return Err(format!(
190 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191 vector.len()
192 ));
193 }
194 }
195
196 Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200 if dimension == 0 || dimension > MAX_DIMENSION {
201 return Err(format!(
202 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203 ));
204 }
205
206 Ok(())
207}
208
209fn normalize_base_url(raw: &str) -> Result<String, String> {
213 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214 let scheme = parsed.scheme();
215 if scheme != "http" && scheme != "https" {
216 return Err(format!(
217 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218 scheme
219 ));
220 }
221 Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239 use std::net::{IpAddr, ToSocketAddrs};
240
241 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243 let host = parsed.host_str().unwrap_or("");
244
245 let is_loopback_host =
250 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251 if is_loopback_host {
252 return Ok(());
253 }
254
255 if host.ends_with(".local") {
258 return Err(format!(
259 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260 ));
261 }
262
263 let port = parsed.port_or_known_default().unwrap_or(443);
266 let addr_str = format!("{host}:{port}");
267 let addrs: Vec<IpAddr> = addr_str
268 .to_socket_addrs()
269 .map(|iter| iter.map(|sa| sa.ip()).collect())
270 .unwrap_or_default();
271 for ip in &addrs {
272 if is_private_non_loopback_ip(ip) {
273 return Err(format!(
274 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275 ));
276 }
277 }
278
279 Ok(())
280}
281
/// Returns `true` for addresses in private or reserved ranges, excluding loopback.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, link-local `169.254/16`,
/// `100.64/10`, and `0/8`.
/// IPv6: link-local `fe80::/10`, unique-local `fc00::/7`, and IPv4-mapped
/// addresses (`::ffff:a.b.c.d`) whose embedded IPv4 address is in one of the
/// IPv4 ranges above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            v4.is_private()
                || v4.is_link_local()
                || (first == 100 && (64..=127).contains(&second))
                || first == 0
        }
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            if (leading & 0xffc0) == 0xfe80 || (leading & 0xfe00) == 0xfc00 {
                return true;
            }
            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325 if base_url.ends_with("/v1") {
326 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327 } else {
328 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329 }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333 if base_url.ends_with("/api") {
334 format!("{base_url}/embed")
335 } else {
336 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337 }
338}
339
/// Trims surrounding whitespace from an optional value and treats a blank
/// result the same as an absent one.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|raw| raw.trim().to_string())
        .filter(|trimmed| !trimmed.is_empty())
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
355fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
361 if !matches!(
362 status,
363 reqwest::StatusCode::BAD_REQUEST
364 | reqwest::StatusCode::CONFLICT
365 | reqwest::StatusCode::REQUEST_TIMEOUT
366 | reqwest::StatusCode::LOCKED
367 | reqwest::StatusCode::TOO_EARLY
368 ) {
369 return false;
370 }
371
372 let lower = raw.to_ascii_lowercase();
373 let normalized = lower.trim();
374
375 normalized.contains("model was unloaded while the request was still in queue")
376 || normalized == "model is loading"
377 || normalized.starts_with("model is loading,")
378 || normalized.contains(r#""error":"model is loading"#)
379 || normalized.contains(r#""message":"model is loading"#)
380 || normalized == "model not loaded"
381 || normalized.contains(r#""error":"model not loaded""#)
382 || normalized.contains(r#""message":"model not loaded""#)
383 || normalized == "loading model into memory"
384 || normalized.contains(r#""error":"loading model into memory""#)
385 || normalized.contains(r#""message":"loading model into memory""#)
386 || normalized == "model is being loaded"
387 || normalized.contains(r#""error":"model is being loaded""#)
388 || normalized.contains(r#""message":"model is being loaded""#)
389 || normalized == "model is currently loading"
390 || normalized.contains(r#""error":"model is currently loading""#)
391 || normalized.contains(r#""message":"model is currently loading""#)
392}
393
394fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
395 error.is_connect()
396}
397
398fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
404 error.is_connect() || error.is_timeout()
405}
406
407fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
408 embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
409}
410
/// Tag inserted into embedding error messages whose cause is transient
/// (connection failure, timeout, retryable status, model still loading).
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// `true` when `error` contains [`TRANSIENT_EMBEDDING_MARKER`] anywhere in its text.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    error.find(TRANSIENT_EMBEDDING_MARKER).is_some()
}

/// Returns `error` with every occurrence of [`TRANSIENT_EMBEDDING_MARKER`] removed.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
429
430fn sleep_before_embedding_retry(attempt_index: usize) {
431 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
432 std::thread::sleep(Duration::from_millis(*delay_ms));
433 }
434}
435
436fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
437where
438 F: FnMut() -> reqwest::blocking::RequestBuilder,
439{
440 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
441 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
442
443 let response = match make_request().send() {
444 Ok(response) => response,
445 Err(error) => {
446 if !last_attempt && is_retryable_embedding_error(&error) {
447 sleep_before_embedding_retry(attempt_index);
448 continue;
449 }
450 let marker = if embedding_send_error_is_transient(&error) {
454 TRANSIENT_EMBEDDING_MARKER
455 } else {
456 ""
457 };
458 return Err(format!("{marker}{backend_label} request failed: {error}"));
459 }
460 };
461
462 let status = response.status();
463 let raw = match response.text() {
464 Ok(raw) => raw,
465 Err(error) => {
466 if !last_attempt && embedding_response_read_error_is_transient(&error) {
467 sleep_before_embedding_retry(attempt_index);
468 continue;
469 }
470 let marker = if embedding_response_read_error_is_transient(&error) {
471 TRANSIENT_EMBEDDING_MARKER
472 } else {
473 ""
474 };
475 return Err(format!(
476 "{marker}{backend_label} response read failed: {error}"
477 ));
478 }
479 };
480
481 if status.is_success() {
482 return Ok(raw);
483 }
484
485 let body_transient = embedding_response_body_is_transient(status, &raw);
489 if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
490 sleep_before_embedding_retry(attempt_index);
491 continue;
492 }
493
494 let marker = if is_retryable_embedding_status(status) || body_transient {
500 TRANSIENT_EMBEDDING_MARKER
501 } else {
502 ""
503 };
504 return Err(format!(
505 "{marker}{backend_label} request failed (HTTP {}): {}",
506 status, raw
507 ));
508 }
509
510 unreachable!("embedding request retries exhausted without returning")
511}
512
513impl SemanticEmbeddingModel {
514 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
515 let timeout_ms = if config.timeout_ms == 0 {
516 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
517 } else {
518 config.timeout_ms
519 };
520
521 let max_batch_size = if config.max_batch_size == 0 {
522 DEFAULT_MAX_BATCH_SIZE
523 } else {
524 config.max_batch_size
525 };
526
527 let api_key_env = normalize_api_key(config.api_key_env.clone());
528 let model = config.model.clone();
529
530 let client = Client::builder()
531 .timeout(Duration::from_millis(timeout_ms))
532 .redirect(reqwest::redirect::Policy::none())
533 .build()
534 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
535
536 let engine = match config.backend {
537 SemanticBackend::Fastembed => {
538 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
539 }
540 SemanticBackend::OpenAiCompatible => {
541 let raw = config.base_url.as_ref().ok_or_else(|| {
542 "base_url is required for openai_compatible backend".to_string()
543 })?;
544 let base_url = normalize_base_url(raw)?;
545
546 let api_key = match api_key_env {
547 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
548 format!("missing api_key_env '{var_name}' for openai_compatible backend")
549 })?),
550 None => None,
551 };
552
553 SemanticEmbeddingEngine::OpenAiCompatible {
554 client,
555 model,
556 base_url,
557 api_key,
558 }
559 }
560 SemanticBackend::Ollama => {
561 let raw = config
562 .base_url
563 .as_ref()
564 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
565 let base_url = normalize_base_url(raw)?;
566
567 SemanticEmbeddingEngine::Ollama {
568 client,
569 model,
570 base_url,
571 }
572 }
573 };
574
575 Ok(Self {
576 backend: config.backend,
577 model: config.model.clone(),
578 base_url: config.base_url.clone(),
579 timeout_ms,
580 max_batch_size,
581 dimension: None,
582 engine,
583 query_embedding_cache: HashMap::new(),
584 query_embedding_cache_order: VecDeque::new(),
585 query_embedding_cache_hits: 0,
586 query_embedding_cache_misses: 0,
587 })
588 }
589
590 pub fn backend(&self) -> SemanticBackend {
591 self.backend
592 }
593
594 pub fn model(&self) -> &str {
595 &self.model
596 }
597
598 pub fn base_url(&self) -> Option<&str> {
599 self.base_url.as_deref()
600 }
601
602 pub fn max_batch_size(&self) -> usize {
603 self.max_batch_size
604 }
605
606 pub fn timeout_ms(&self) -> u64 {
607 self.timeout_ms
608 }
609
610 pub fn fingerprint(
611 &mut self,
612 config: &SemanticBackendConfig,
613 ) -> Result<SemanticIndexFingerprint, String> {
614 let dimension = self.dimension()?;
615 Ok(SemanticIndexFingerprint::from_config(config, dimension))
616 }
617
618 pub fn dimension(&mut self) -> Result<usize, String> {
619 if let Some(dimension) = self.dimension {
620 return Ok(dimension);
621 }
622
623 let dimension = match &mut self.engine {
624 SemanticEmbeddingEngine::Local(model) => {
625 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
626 vectors
627 .first()
628 .map(|v| v.len())
629 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
630 }
631 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
632 let vectors =
633 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
634 vectors
635 .first()
636 .map(|v| v.len())
637 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
638 }
639 SemanticEmbeddingEngine::Ollama { .. } => {
640 let vectors =
641 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
642 vectors
643 .first()
644 .map(|v| v.len())
645 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
646 }
647 };
648
649 self.dimension = Some(dimension);
650 Ok(dimension)
651 }
652
653 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
654 self.embed_texts(texts)
655 }
656
657 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
658 if let Some(vector) = self.query_embedding_cache.get(query) {
659 self.query_embedding_cache_hits += 1;
660 return Ok(vector.clone());
661 }
662
663 self.query_embedding_cache_misses += 1;
664 let embeddings = self.embed_texts(vec![query.to_string()])?;
665 let vector = embeddings
666 .first()
667 .cloned()
668 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
669
670 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
671 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
672 self.query_embedding_cache.remove(&oldest);
673 }
674 }
675 self.query_embedding_cache
676 .insert(query.to_string(), vector.clone());
677 self.query_embedding_cache_order
678 .push_back(query.to_string());
679
680 Ok(vector)
681 }
682
683 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
684 (
685 self.query_embedding_cache_hits,
686 self.query_embedding_cache_misses,
687 self.query_embedding_cache.len(),
688 )
689 }
690
691 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
692 match &mut self.engine {
693 SemanticEmbeddingEngine::Local(model) => model
694 .embed(&texts)
695 .map_err(|error| format!("failed to embed batch: {error}")),
696 SemanticEmbeddingEngine::OpenAiCompatible {
697 client,
698 model,
699 base_url,
700 api_key,
701 } => {
702 let expected_text_count = texts.len();
703 let endpoint = build_openai_embeddings_endpoint(base_url);
704 let body = serde_json::json!({
705 "input": texts,
706 "model": model,
707 });
708
709 let raw = send_embedding_request(
710 || {
711 let mut request = client.post(&endpoint).json(&body);
721
722 if let Some(api_key) = api_key {
723 request = request.header("Authorization", format!("Bearer {api_key}"));
724 }
725
726 request
727 },
728 "openai compatible",
729 )?;
730
731 #[derive(Deserialize)]
732 struct OpenAiResponse {
733 data: Vec<OpenAiEmbeddingResult>,
734 }
735
736 #[derive(Deserialize)]
737 struct OpenAiEmbeddingResult {
738 embedding: Vec<f32>,
739 index: Option<u32>,
740 }
741
742 let parsed: OpenAiResponse = serde_json::from_str(&raw)
743 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
744 if parsed.data.len() != expected_text_count {
745 return Err(format!(
746 "openai compatible response returned {} embeddings for {} inputs",
747 parsed.data.len(),
748 expected_text_count
749 ));
750 }
751
752 let mut vectors = vec![Vec::new(); parsed.data.len()];
753 for (i, item) in parsed.data.into_iter().enumerate() {
754 let index = item.index.unwrap_or(i as u32) as usize;
755 if index >= vectors.len() {
756 return Err(
757 "openai compatible response contains invalid vector index".to_string()
758 );
759 }
760 vectors[index] = item.embedding;
761 }
762
763 for vector in &vectors {
764 if vector.is_empty() {
765 return Err(
766 "openai compatible response contained missing vectors".to_string()
767 );
768 }
769 }
770
771 self.dimension = vectors.first().map(Vec::len);
772 Ok(vectors)
773 }
774 SemanticEmbeddingEngine::Ollama {
775 client,
776 model,
777 base_url,
778 } => {
779 let expected_text_count = texts.len();
780 let endpoint = build_ollama_embeddings_endpoint(base_url);
781
782 #[derive(Serialize)]
783 struct OllamaPayload<'a> {
784 model: &'a str,
785 input: Vec<String>,
786 }
787
788 let payload = OllamaPayload {
789 model,
790 input: texts,
791 };
792
793 let raw = send_embedding_request(
794 || {
795 client.post(&endpoint).json(&payload)
800 },
801 "ollama",
802 )?;
803
804 #[derive(Deserialize)]
805 struct OllamaResponse {
806 embeddings: Vec<Vec<f32>>,
807 }
808
809 let parsed: OllamaResponse = serde_json::from_str(&raw)
810 .map_err(|error| format!("invalid ollama response: {error}"))?;
811 if parsed.embeddings.is_empty() {
812 return Err("ollama response returned no embeddings".to_string());
813 }
814 if parsed.embeddings.len() != expected_text_count {
815 return Err(format!(
816 "ollama response returned {} embeddings for {} inputs",
817 parsed.embeddings.len(),
818 expected_text_count
819 ));
820 }
821
822 let vectors = parsed.embeddings;
823 for vector in &vectors {
824 if vector.is_empty() {
825 return Err("ollama response contained empty embeddings".to_string());
826 }
827 }
828
829 self.dimension = vectors.first().map(Vec::len);
830 Ok(vectors)
831 }
832 }
833 }
834}
835
/// Checks up front that the ONNX Runtime shared library can be loaded and,
/// when its version can be determined, that it is 1.x with minor >= 20.
///
/// The library named by the `ORT_DYLIB_PATH` environment variable is used when
/// set; otherwise the platform default name is handed to the dynamic loader.
/// On platforms other than Linux, macOS and Windows no check is performed.
///
/// # Errors
/// Returns a user-facing message when the library cannot be loaded or when a
/// detected version is outside the supported range. A version that cannot be
/// detected is not treated as an error.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY (review): `c_name` is a valid NUL-terminated string that
        // outlives the `dlopen` call; `handle` is only used while non-null and
        // before `dlclose`; the `dlerror` pointer is null-checked before it is
        // read — confirm no other thread can clobber `dlerror` in between.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Resolve the version while the library is still loaded; the
            // results are owned strings, so they stay valid after `dlclose`.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                // Only major and minor take part in the check; a version that
                // does not parse as numbers is ignored.
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Minimal hand-written bindings for the Win32 calls used below.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 VS_FIXEDFILEINFO structure; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY (review): every pointer handed to the Win32 calls refers to a
        // live local buffer of the stated size; `vs_info` is dereferenced only
        // after `VerQueryValueW` succeeded and returned a non-null pointer,
        // and it points into `info`, which is still alive — confirm against
        // the Win32 documentation.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            // NUL-terminated UTF-16 copy of the library name.
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 means "could not be detected".
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // Sub-block "\" (NUL-terminated) requests the fixed
                        // file info.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            // High word is the major version, low word the minor.
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1026
/// Returns the file path of the shared object that provides the
/// `OrtGetApiBase` symbol reachable through `handle`, or `None` when the
/// symbol is missing or its origin cannot be determined.
///
/// # Safety
/// `handle` is passed straight to `dlsym`; presumably it must be a live handle
/// returned by `dlopen` that has not been closed — confirm at call sites.
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    // `dladdr` fills `info` when it returns non-zero; only then is
    // `assume_init` reached.
    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    // Copy the C string into an owned `String` so the result does not borrow
    // loader-owned memory.
    Some(
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1051
1052#[cfg(any(target_os = "linux", target_os = "macos"))]
1053fn detect_ort_version_from_resolved_or_requested(
1054 resolved_path: Option<String>,
1055 requested_lib_name: &str,
1056) -> (Option<String>, String) {
1057 if let Some(path) = resolved_path {
1058 if let Some(version) = detect_ort_version_from_path(&path) {
1059 return (Some(version), path);
1060 }
1061 return (detect_ort_version_from_path(requested_lib_name), path);
1062 }
1063
1064 (
1065 detect_ort_version_from_path(requested_lib_name),
1066 requested_lib_name.to_string(),
1067 )
1068}
1069
1070#[cfg(any(target_os = "linux", target_os = "macos"))]
1071fn detect_ort_version_from_loaded_library(
1072 handle: *mut std::ffi::c_void,
1073 requested_lib_name: &str,
1074) -> (Option<String>, String) {
1075 detect_ort_version_from_resolved_or_requested(
1076 unsafe { loaded_library_path_from_handle(handle) },
1077 requested_lib_name,
1078 )
1079}
1080
1081#[cfg(any(target_os = "linux", target_os = "macos"))]
1084fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1085 let path = std::path::Path::new(lib_path);
1086
1087 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1089 .into_iter()
1090 .flatten()
1091 {
1092 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1093 if let Some(version) = extract_version_from_filename(name) {
1094 return Some(version);
1095 }
1096 }
1097 }
1098
1099 if let Some(parent) = path.parent() {
1101 if let Ok(entries) = std::fs::read_dir(parent) {
1102 for entry in entries.flatten() {
1103 if let Some(name) = entry.file_name().to_str() {
1104 if name.starts_with("libonnxruntime") {
1105 if let Some(version) = extract_version_from_filename(name) {
1106 return Some(version);
1107 }
1108 }
1109 }
1110 }
1111 }
1112 }
1113
1114 None
1115}
1116
1117#[cfg(any(target_os = "linux", target_os = "macos"))]
1119fn extract_version_from_filename(name: &str) -> Option<String> {
1120 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1122 re.find(name).map(|m| m.as_str().to_string())
1123}
1124
/// Shell command suggested for removing an outdated ONNX Runtime library.
///
/// Libraries under `/usr/local/lib`, or referenced only by their bare default
/// name, get a platform-specific `sudo rm` suggestion on Linux and macOS;
/// everything else gets a plain `rm` of the given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return String::from(" sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        #[cfg(target_os = "macos")]
        return String::from(" sudo rm /usr/local/lib/libonnxruntime*");
    }

    format!(" rm '{}'", lib_path)
}
1137
1138pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1144 format!(
1145 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1146 Solutions:\n\
1147 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1148 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1149 configures the bridge to load it instead of the system library — no \
1150 changes to '{}'.\n\
1151 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1152 {}\n\
1153 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1154 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1155 version,
1156 lib_name,
1157 lib_name,
1158 suggest_removal_command(lib_name),
1159 )
1160}
1161
/// Heuristically decides whether an error message means "the ONNX Runtime
/// shared library could not be loaded".
///
/// Messages starting (after leading whitespace) with `ONNX Runtime not found.`
/// match directly. Otherwise the message must, case-insensitively, mention
/// both the runtime and a dynamic-loading failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_HINTS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_HINTS)
}
1187
1188pub fn format_embedding_init_error(error: impl Display) -> String {
1189 let message = error.to_string();
1190
1191 if is_onnx_runtime_unavailable(&message) {
1192 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1193 }
1194
1195 format!("failed to initialize semantic embedding model: {message}")
1196}
1197
/// A source-code chunk together with the text used to embed it.
// NOTE(review): field semantics below are inferred from names and types only;
// the producing code is outside the reviewed range — confirm.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk comes from.
    pub file: PathBuf,
    /// Name of the chunk (presumably the symbol name).
    pub name: String,
    /// Kind of symbol the chunk represents.
    pub kind: SymbolKind,
    /// First line of the chunk (line base not visible here).
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text handed to the embedding model.
    pub embed_text: String,
    /// Source excerpt kept alongside the chunk.
    pub snippet: String,
}
1217
/// A chunk paired with the embedding vector computed for its `embed_text`.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this vector describes.
    chunk: SemanticChunk,
    /// Embedding returned by the embedding function for `chunk.embed_text`.
    vector: Vec<f32>,
}
1224
/// In-memory semantic index: embedded chunks plus the per-file freshness data
/// used to decide which files must be re-embedded.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// Every embedded chunk currently in the index.
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded per indexed file. The key set of this map is
    /// what the refresh methods and `indexed_file_count` treat as "indexed".
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size in bytes recorded per indexed file. May lack entries that
    /// `file_mtimes` has; `backfill_missing_file_sizes` fills those in.
    file_sizes: HashMap<PathBuf, u64>,
    /// blake3 content hash recorded per indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Length of the stored vectors; `search` returns nothing for query
    /// vectors of a different length.
    dimension: usize,
    /// Optional fingerprint of the backend/model that produced the vectors;
    /// checked against the expected value when loading from disk.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root (debug-asserted to be absolute in `new`); serialization
    /// stores paths relative to it.
    project_root: PathBuf,
    /// New files whose indexing was postponed because the indexed-file cap was
    /// reached; `refresh_invalidated_files` retries them on later calls.
    deferred_files: HashSet<PathBuf>,
}
1240
/// Freshness data captured for a file at the time its chunks were collected.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time stored into `SemanticIndex::file_mtimes`.
    mtime: SystemTime,
    /// File size stored into `SemanticIndex::file_sizes`.
    size: u64,
    /// blake3 hash stored into `SemanticIndex::file_hashes`.
    content_hash: blake3::Hash,
}
1247
/// Per-category file counts produced by an incremental refresh of the index.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were successfully re-processed.
    pub changed: usize,
    /// Files that were not indexed before and were successfully added.
    pub added: usize,
    /// Indexed files that were dropped from the index.
    pub deleted: usize,
    /// Number of distinct files the refresh looked at.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when the refresh changed, added and deleted nothing.
    ///
    /// `total_processed` is deliberately ignored: looking at files without
    /// modifying the index still counts as a no-op.
    pub fn is_noop(&self) -> bool {
        let Self {
            changed,
            added,
            deleted,
            total_processed: _,
        } = *self;

        [changed, added, deleted].iter().all(|&count| count == 0)
    }
}
1264
/// Result of `SemanticIndex::refresh_invalidated_files`, carrying the pieces
/// that `SemanticIndex::apply_refresh_update` accepts as arguments.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Entries that were (re)built for the refreshed files.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh mtime/size/hash for every file that was successfully processed.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh considered (requested plus previously deferred).
    pub completed_paths: Vec<PathBuf>,
    /// Changed/added/deleted counts for this refresh.
    pub summary: RefreshSummary,
}
1275
/// A previously computed vector together with the exact text it was computed
/// from, kept so unchanged chunks can skip re-embedding.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Text the vector was computed from; compared for equality on lookup so a
    /// hash collision cannot hand out the wrong vector.
    embed_text: String,
    /// The cached embedding.
    vector: Vec<f32>,
}

/// Reuse candidates grouped by file path, then by the blake3 hash of their
/// `embed_text`.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1283
/// One hit returned by `SemanticIndex::search`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File of the matching chunk.
    pub file: PathBuf,
    /// Name of the matching chunk.
    pub name: String,
    /// Symbol kind of the matching chunk.
    pub kind: SymbolKind,
    /// First line of the matching chunk.
    pub start_line: u32,
    /// Last line of the matching chunk.
    pub end_line: u32,
    /// Whether the matching chunk is exported.
    pub exported: bool,
    /// Snippet copied from the matching chunk.
    pub snippet: String,
    /// Cosine similarity to the query, multiplied by 1.1 for exported chunks.
    pub score: f32,
    /// Origin tag of the result; `search` always sets it to `"semantic"`.
    pub source: &'static str,
}
1297
1298impl SemanticIndex {
1299 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1300 debug_assert!(project_root.is_absolute());
1301 Self {
1302 entries: Vec::new(),
1303 file_mtimes: HashMap::new(),
1304 file_sizes: HashMap::new(),
1305 file_hashes: HashMap::new(),
1306 dimension,
1307 fingerprint: None,
1308 project_root,
1309 deferred_files: HashSet::new(),
1310 }
1311 }
1312
    /// Returns the number of embedded chunks currently held in the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1317
    /// Returns the number of files with recorded freshness data, i.e. the size
    /// of the mtime map. Files that yielded no chunks are still counted.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1322
1323 pub fn status_label(&self) -> &'static str {
1325 if self.entries.is_empty() {
1326 "empty"
1327 } else {
1328 "ready"
1329 }
1330 }
1331
1332 fn collect_chunks(
1333 project_root: &Path,
1334 files: &[PathBuf],
1335 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1336 let collect_started = std::time::Instant::now();
1337 let per_file: Vec<(
1338 PathBuf,
1339 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1340 )> = files
1341 .par_iter()
1342 .map_init(HashMap::new, |parsers, file| {
1343 let result = collect_file_metadata(file).and_then(|metadata| {
1344 collect_file_chunks(project_root, file, parsers)
1345 .map(|chunks| (metadata, chunks))
1346 });
1347 (file.clone(), result)
1348 })
1349 .collect();
1350
1351 let mut chunks: Vec<SemanticChunk> = Vec::new();
1352 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1353
1354 for (file, result) in per_file {
1355 match result {
1356 Ok((metadata, file_chunks)) => {
1357 file_metadata.insert(file, metadata);
1358 chunks.extend(file_chunks);
1359 }
1360 Err(error) => {
1361 if error == "unsupported file extension" {
1367 continue;
1368 }
1369 slog_warn!(
1370 "failed to collect semantic chunks for {}: {}",
1371 file.display(),
1372 error
1373 );
1374 }
1375 }
1376 }
1377
1378 slog_info!(
1379 "semantic collect: {} chunks from {} files in {} ms",
1380 chunks.len(),
1381 file_metadata.len(),
1382 collect_started.elapsed().as_millis()
1383 );
1384
1385 (chunks, file_metadata)
1386 }
1387
1388 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1389 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1390 let mut reuse_map: ChunkReuseMap = HashMap::new();
1391
1392 for entry in &self.entries {
1393 if !requested.contains(entry.chunk.file.as_path()) {
1394 continue;
1395 }
1396
1397 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1402 reuse_map
1403 .entry(entry.chunk.file.clone())
1404 .or_default()
1405 .entry(hash)
1406 .or_default()
1407 .push(ReusableEmbedding {
1408 embed_text: entry.chunk.embed_text.clone(),
1409 vector: entry.vector.clone(),
1410 });
1411 }
1412
1413 reuse_map
1414 }
1415
1416 fn reusable_vector_for_chunk(
1417 reuse_map: &ChunkReuseMap,
1418 chunk: &SemanticChunk,
1419 ) -> Option<Vec<f32>> {
1420 let hash = blake3::hash(chunk.embed_text.as_bytes());
1421 reuse_map
1422 .get(&chunk.file)?
1423 .get(&hash)?
1424 .iter()
1425 .find(|candidate| candidate.embed_text == chunk.embed_text)
1426 .map(|candidate| candidate.vector.clone())
1427 }
1428
1429 fn entries_for_chunks_with_reuse<F, P>(
1430 chunks: Vec<SemanticChunk>,
1431 reuse_map: &ChunkReuseMap,
1432 embed_fn: &mut F,
1433 max_batch_size: usize,
1434 initial_observed_dimension: Option<usize>,
1435 refresh_label: &str,
1436 progress: &mut P,
1437 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1438 where
1439 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1440 P: FnMut(usize, usize),
1441 {
1442 let total_chunks = chunks.len();
1443 progress(0, total_chunks);
1444
1445 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1446 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1447
1448 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1449 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1450 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1451 } else {
1452 misses.push((chunk_index, chunk));
1453 }
1454 }
1455
1456 let mut completed = total_chunks.saturating_sub(misses.len());
1457 if completed > 0 {
1458 progress(completed, total_chunks);
1459 }
1460
1461 let batch_size = max_batch_size.max(1);
1462 let mut observed_dimension = initial_observed_dimension;
1463
1464 for batch_start in (0..misses.len()).step_by(batch_size) {
1465 let batch_end = (batch_start + batch_size).min(misses.len());
1466 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1467 .iter()
1468 .map(|(_, chunk)| chunk.embed_text.clone())
1469 .collect();
1470
1471 let vectors = embed_fn(batch_texts)?;
1472 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1473
1474 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1475 match observed_dimension {
1476 None => observed_dimension = Some(dim),
1477 Some(expected) if dim != expected => {
1478 return Err(format!(
1479 "embedding dimension changed during {refresh_label}: \
1480 cached index uses {expected}, new vectors use {dim}"
1481 ));
1482 }
1483 _ => {}
1484 }
1485 }
1486
1487 for (i, vector) in vectors.into_iter().enumerate() {
1488 let (chunk_index, chunk) = misses[batch_start + i].clone();
1489 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1490 }
1491
1492 completed += batch_end - batch_start;
1493 progress(completed, total_chunks);
1494 }
1495
1496 let entries = entries_by_chunk
1497 .into_iter()
1498 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1499 .collect();
1500
1501 Ok((entries, observed_dimension))
1502 }
1503
1504 fn build_from_chunks<F, P>(
1505 project_root: &Path,
1506 chunks: Vec<SemanticChunk>,
1507 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1508 embed_fn: &mut F,
1509 max_batch_size: usize,
1510 mut progress: Option<&mut P>,
1511 ) -> Result<Self, String>
1512 where
1513 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1514 P: FnMut(usize, usize),
1515 {
1516 debug_assert!(project_root.is_absolute());
1517 let total_chunks = chunks.len();
1518
1519 if chunks.is_empty() {
1520 return Ok(Self {
1521 entries: Vec::new(),
1522 file_mtimes: file_metadata
1523 .iter()
1524 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1525 .collect(),
1526 file_sizes: file_metadata
1527 .iter()
1528 .map(|(path, metadata)| (path.clone(), metadata.size))
1529 .collect(),
1530 file_hashes: file_metadata
1531 .into_iter()
1532 .map(|(path, metadata)| (path, metadata.content_hash))
1533 .collect(),
1534 dimension: DEFAULT_DIMENSION,
1535 fingerprint: None,
1536 project_root: project_root.to_path_buf(),
1537 deferred_files: HashSet::new(),
1538 });
1539 }
1540
1541 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1543 let mut expected_dimension: Option<usize> = None;
1544 let batch_size = max_batch_size.max(1);
1545 let embed_started = std::time::Instant::now();
1546 let batch_count = total_chunks.div_ceil(batch_size);
1547 for batch_start in (0..chunks.len()).step_by(batch_size) {
1548 let batch_end = (batch_start + batch_size).min(chunks.len());
1549 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1550 .iter()
1551 .map(|c| c.embed_text.clone())
1552 .collect();
1553
1554 let vectors = embed_fn(batch_texts)?;
1555 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1556
1557 if let Some(dim) = vectors.first().map(|v| v.len()) {
1559 match expected_dimension {
1560 None => expected_dimension = Some(dim),
1561 Some(expected) if dim != expected => {
1562 return Err(format!(
1563 "embedding dimension changed across batches: expected {expected}, got {dim}"
1564 ));
1565 }
1566 _ => {}
1567 }
1568 }
1569
1570 for (i, vector) in vectors.into_iter().enumerate() {
1571 let chunk_idx = batch_start + i;
1572 entries.push(EmbeddingEntry {
1573 chunk: chunks[chunk_idx].clone(),
1574 vector,
1575 });
1576 }
1577
1578 if let Some(callback) = progress.as_mut() {
1579 callback(entries.len(), total_chunks);
1580 }
1581 }
1582
1583 let embed_ms = embed_started.elapsed().as_millis();
1584 let rate = (total_chunks as u128 * 1000)
1585 .checked_div(embed_ms)
1586 .unwrap_or(0) as u64;
1587 slog_info!(
1588 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1589 total_chunks,
1590 batch_count,
1591 embed_ms,
1592 rate
1593 );
1594
1595 let dimension = entries
1596 .first()
1597 .map(|e| e.vector.len())
1598 .unwrap_or(DEFAULT_DIMENSION);
1599
1600 Ok(Self {
1601 entries,
1602 file_mtimes: file_metadata
1603 .iter()
1604 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1605 .collect(),
1606 file_sizes: file_metadata
1607 .iter()
1608 .map(|(path, metadata)| (path.clone(), metadata.size))
1609 .collect(),
1610 file_hashes: file_metadata
1611 .into_iter()
1612 .map(|(path, metadata)| (path, metadata.content_hash))
1613 .collect(),
1614 dimension,
1615 fingerprint: None,
1616 project_root: project_root.to_path_buf(),
1617 deferred_files: HashSet::new(),
1618 })
1619 }
1620
1621 pub fn build<F>(
1624 project_root: &Path,
1625 files: &[PathBuf],
1626 embed_fn: &mut F,
1627 max_batch_size: usize,
1628 ) -> Result<Self, String>
1629 where
1630 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631 {
1632 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1633 Self::build_from_chunks(
1634 project_root,
1635 chunks,
1636 file_mtimes,
1637 embed_fn,
1638 max_batch_size,
1639 Option::<&mut fn(usize, usize)>::None,
1640 )
1641 }
1642
1643 pub fn build_with_progress<F, P>(
1645 project_root: &Path,
1646 files: &[PathBuf],
1647 embed_fn: &mut F,
1648 max_batch_size: usize,
1649 progress: &mut P,
1650 ) -> Result<Self, String>
1651 where
1652 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1653 P: FnMut(usize, usize),
1654 {
1655 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1656 let total_chunks = chunks.len();
1657 progress(0, total_chunks);
1658 Self::build_from_chunks(
1659 project_root,
1660 chunks,
1661 file_mtimes,
1662 embed_fn,
1663 max_batch_size,
1664 Some(progress),
1665 )
1666 }
1667
    /// Incrementally brings the index in line with `current_files`.
    ///
    /// Indexed files missing from `current_files` are dropped, indexed files
    /// whose freshness check fails are re-collected and re-embedded, and files
    /// not yet indexed are added. Vectors of changed files are reused for chunks
    /// whose `embed_text` is unchanged.
    ///
    /// `progress` is called as `(completed, total)`; it receives `(0, 0)` when
    /// there is nothing to embed.
    ///
    /// Returns counts of changed/added/deleted files. `changed` and `added`
    /// only count files whose collection succeeded.
    ///
    /// # Errors
    /// Propagates errors from embedding. Note that deletions have already been
    /// applied to `self` by the time embedding runs.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Deferred files that are no longer part of the project are forgotten.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // Size of the union of current and indexed files: |A| + |B| - |A ∩ B|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every indexed file as deleted, changed or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Keys are cloned up front because the loop below mutates the maps.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A file lacking any of the three recorded values cannot be
            // verified and is treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached
                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
            {
                Some(FreshnessVerdict::HotFresh) => {}
                // The verdict carries new stat values; record them and keep the
                // existing entries (no re-embedding).
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files present now but not indexed yet.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            // Nothing to do at all.
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            // Only deletions happened.
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // The reuse map must be built before entries of changed files are replaced.
        let reuse_map = self.build_chunk_reuse_map(&changed);
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        // Changed files that could not be collected and no longer exist on disk
        // are treated as deleted.
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        if chunks.is_empty() {
            // Files were collected but produced no chunks: drop their old
            // entries and record the fresh metadata without embedding anything.
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // With no entries left there is no dimension to enforce; the first
        // batch decides it.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            existing_dimension,
            "incremental refresh",
            progress,
        )?;

        // Only files whose collection succeeded have their old entries replaced;
        // files that failed keep whatever was indexed before.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1879
    /// Re-indexes an explicit set of invalidated `paths`, plus any files that a
    /// previous call deferred.
    ///
    /// All requested paths are first removed from the index, then the ones that
    /// still exist are re-collected and re-embedded (reusing vectors for
    /// unchanged chunk text). New files beyond the `max_files` cap are not
    /// indexed; they are remembered in `deferred_files` for a later call.
    ///
    /// `progress` is called as `(completed, total)`; it receives `(0, 0)` when
    /// there is nothing to embed.
    ///
    /// Returns the new entries, the fresh metadata, the full list of considered
    /// paths and a summary. The same changes are also applied to `self`.
    ///
    /// # Errors
    /// Propagates errors from embedding. The requested paths have already been
    /// removed from `self` at that point.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Deferred files that disappeared from disk are dropped; the rest are
        // retried together with the explicitly requested paths.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Both must be captured before the removal below wipes the old state.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();
        let reuse_map = self.build_chunk_reuse_map(&requested_paths);

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        // Only files that were indexed before count as deleted.
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the indexed-file cap: files still indexed plus re-indexed
        // changed files are counted first; only the remainder is available to
        // new files.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // `existing_paths` derives from the sorted request list, so the
            // files admitted here are the first ones in path order.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            // Drop metadata and chunks of the files that did not fit.
            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        if chunks.is_empty() {
            // Nothing to embed: record the fresh metadata only.
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // The stored dimension is enforced unless the index is empty and none
        // of the requested files had been indexed before.
        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
        {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            initial_observed_dimension,
            "invalidated-file refresh",
            progress,
        )?;

        // One copy goes into this index, the other is returned to the caller.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
2090
2091 pub fn apply_refresh_update(
2092 &mut self,
2093 added_entries: Vec<EmbeddingEntry>,
2094 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2095 completed_paths: &[PathBuf],
2096 ) {
2097 self.remove_indexed_files(completed_paths);
2101
2102 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2103 self.entries.extend(added_entries);
2104 for (file, freshness) in updated_metadata {
2105 self.file_mtimes.insert(file.clone(), freshness.mtime);
2106 self.file_sizes.insert(file.clone(), freshness.size);
2107 self.file_hashes.insert(file, freshness.content_hash);
2108 }
2109 if let Some(dim) = observed_dimension {
2110 self.dimension = dim;
2111 }
2112 }
2113
2114 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2115 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2116 self.entries
2117 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2118 for path in files {
2119 self.file_mtimes.remove(path);
2120 self.file_sizes.remove(path);
2121 self.file_hashes.remove(path);
2122 }
2123 }
2124
2125 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2127 if self.entries.is_empty() || query_vector.len() != self.dimension {
2128 return Vec::new();
2129 }
2130
2131 let mut scored: Vec<(f32, usize)> = self
2132 .entries
2133 .iter()
2134 .enumerate()
2135 .map(|(i, entry)| {
2136 let mut score = cosine_similarity(query_vector, &entry.vector);
2137 if entry.chunk.exported {
2138 score *= 1.1;
2139 }
2140 (score, i)
2141 })
2142 .collect();
2143
2144 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
2146
2147 scored
2148 .into_iter()
2149 .take(top_k)
2150 .map(|(score, idx)| {
2154 let entry = &self.entries[idx];
2155 SemanticResult {
2156 file: entry.chunk.file.clone(),
2157 name: entry.chunk.name.clone(),
2158 kind: entry.chunk.kind.clone(),
2159 start_line: entry.chunk.start_line,
2160 end_line: entry.chunk.end_line,
2161 exported: entry.chunk.exported,
2162 snippet: entry.chunk.snippet.clone(),
2163 score,
2164 source: "semantic",
2165 }
2166 })
2167 .collect()
2168 }
2169
    /// Returns the number of entries in the index (same value as
    /// `entry_count`).
    pub fn len(&self) -> usize {
        self.entries.len()
    }
2174
2175 pub fn is_file_stale(&self, file: &Path) -> bool {
2177 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2178 return true;
2179 };
2180 let Some(stored_size) = self.file_sizes.get(file) else {
2181 return true;
2182 };
2183 let Some(stored_hash) = self.file_hashes.get(file) else {
2184 return true;
2185 };
2186 let cached = FileFreshness {
2187 mtime: *stored_mtime,
2188 size: *stored_size,
2189 content_hash: *stored_hash,
2190 };
2191 match cache_freshness::verify_file_strict(file, &cached) {
2192 FreshnessVerdict::HotFresh => false,
2193 FreshnessVerdict::ContentFresh { .. } => false,
2194 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2195 }
2196 }
2197
2198 fn backfill_missing_file_sizes(&mut self) {
2199 for path in self.file_mtimes.keys() {
2200 if self.file_sizes.contains_key(path) {
2201 continue;
2202 }
2203 if let Ok(metadata) = fs::metadata(path) {
2204 self.file_sizes.insert(path.clone(), metadata.len());
2205 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2206 self.file_hashes.insert(path.clone(), hash);
2207 }
2208 }
2209 }
2210 }
2211
    /// Removes `file` from the index; delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
2216
2217 pub fn invalidate_file(&mut self, file: &Path) {
2218 let canonical_file = canonicalize_existing_or_deleted_path(file);
2219 self.entries
2220 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2221 self.file_mtimes.remove(file);
2222 self.file_sizes.remove(file);
2223 self.file_hashes.remove(file);
2224 if canonical_file.as_path() != file {
2225 self.file_mtimes.remove(&canonical_file);
2226 self.file_sizes.remove(&canonical_file);
2227 self.file_hashes.remove(&canonical_file);
2228 }
2229 }
2230
    /// Returns the vector dimension currently recorded for the index.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2235
    /// Returns the fingerprint attached to the index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2239
2240 pub fn backend_label(&self) -> Option<&str> {
2241 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2242 }
2243
2244 pub fn model_label(&self) -> Option<&str> {
2245 self.fingerprint.as_ref().map(|f| f.model.as_str())
2246 }
2247
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2251
2252 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2254 if self.entries.is_empty() {
2257 slog_info!("skipping semantic index persistence (0 entries)");
2258 return;
2259 }
2260 let dir = storage_dir.join("semantic").join(project_key);
2261 if let Err(e) = fs::create_dir_all(&dir) {
2262 slog_warn!("failed to create semantic cache dir: {}", e);
2263 return;
2264 }
2265 let data_path = dir.join("semantic.bin");
2266 let tmp_path = dir.join(format!(
2267 "semantic.bin.tmp.{}.{}",
2268 std::process::id(),
2269 SystemTime::now()
2270 .duration_since(SystemTime::UNIX_EPOCH)
2271 .unwrap_or(Duration::ZERO)
2272 .as_nanos()
2273 ));
2274 let bytes = self.to_bytes();
2275 let write_result = (|| -> std::io::Result<()> {
2276 use std::io::Write;
2277 let mut file = fs::File::create(&tmp_path)?;
2278 file.write_all(&bytes)?;
2279 file.sync_all()?;
2280 Ok(())
2281 })();
2282 if let Err(e) = write_result {
2283 slog_warn!("failed to write semantic index: {}", e);
2284 let _ = fs::remove_file(&tmp_path);
2285 return;
2286 }
2287 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2288 slog_warn!("failed to rename semantic index: {}", e);
2289 let _ = fs::remove_file(&tmp_path);
2290 return;
2291 }
2292 slog_info!(
2293 "semantic index persisted: {} entries, {:.1} KB",
2294 self.entries.len(),
2295 bytes.len() as f64 / 1024.0
2296 );
2297 }
2298
2299 pub fn read_from_disk(
2301 storage_dir: &Path,
2302 project_key: &str,
2303 current_canonical_root: &Path,
2304 is_worktree_bridge: bool,
2305 expected_fingerprint: Option<&str>,
2306 ) -> Option<Self> {
2307 debug_assert!(current_canonical_root.is_absolute());
2308 let data_path = storage_dir
2309 .join("semantic")
2310 .join(project_key)
2311 .join("semantic.bin");
2312 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2313 if file_len < HEADER_BYTES_V1 {
2314 slog_warn!(
2315 "corrupt semantic index (too small: {} bytes), removing",
2316 file_len
2317 );
2318 if !is_worktree_bridge {
2319 let _ = fs::remove_file(&data_path);
2320 }
2321 return None;
2322 }
2323
2324 let bytes = fs::read(&data_path).ok()?;
2325 let version = bytes[0];
2326 if version != SEMANTIC_INDEX_VERSION_V6 {
2327 slog_info!(
2328 "cached semantic index version {} is older than {}, rebuilding",
2329 version,
2330 SEMANTIC_INDEX_VERSION_V6
2331 );
2332 if !is_worktree_bridge {
2333 let _ = fs::remove_file(&data_path);
2334 }
2335 return None;
2336 }
2337 match Self::from_bytes(&bytes, current_canonical_root) {
2338 Ok(index) => {
2339 if index.entries.is_empty() {
2340 slog_info!("cached semantic index is empty, will rebuild");
2341 if !is_worktree_bridge {
2342 let _ = fs::remove_file(&data_path);
2343 }
2344 return None;
2345 }
2346 if let Some(expected) = expected_fingerprint {
2347 let matches = index
2348 .fingerprint()
2349 .map(|fingerprint| fingerprint.matches_expected(expected))
2350 .unwrap_or(false);
2351 if !matches {
2352 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2353 if !is_worktree_bridge {
2354 let _ = fs::remove_file(&data_path);
2355 }
2356 return None;
2357 }
2358 }
2359 slog_info!(
2360 "loaded semantic index from disk: {} entries",
2361 index.entries.len()
2362 );
2363 Some(index)
2364 }
2365 Err(e) => {
2366 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2367 if !is_worktree_bridge {
2368 let _ = fs::remove_file(&data_path);
2369 }
2370 None
2371 }
2372 }
2373 }
2374
2375 pub fn to_bytes(&self) -> Vec<u8> {
2377 let mut buf = Vec::new();
2378 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2379 let encoded = fingerprint.as_string();
2380 if encoded.is_empty() {
2381 None
2382 } else {
2383 Some(encoded.into_bytes())
2384 }
2385 });
2386 let file_mtimes: Vec<_> = self
2387 .file_mtimes
2388 .iter()
2389 .filter_map(|(path, mtime)| {
2390 cache_relative_path(&self.project_root, path)
2391 .map(|relative| (relative, path, mtime))
2392 })
2393 .collect();
2394 let entries: Vec<_> = self
2395 .entries
2396 .iter()
2397 .filter_map(|entry| {
2398 cache_relative_path(&self.project_root, &entry.chunk.file)
2399 .map(|relative| (relative, entry))
2400 })
2401 .collect();
2402
2403 let version = SEMANTIC_INDEX_VERSION_V6;
2416 buf.push(version);
2417 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2418 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2419 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2420 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2421 buf.extend_from_slice(fp_bytes_ref);
2422
2423 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2426 for (relative, path, mtime) in &file_mtimes {
2427 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2428 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2429 buf.extend_from_slice(&path_bytes);
2430 let duration = mtime
2431 .duration_since(SystemTime::UNIX_EPOCH)
2432 .unwrap_or_default();
2433 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2434 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2435 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2436 buf.extend_from_slice(&size.to_le_bytes());
2437 let hash = self
2438 .file_hashes
2439 .get(*path)
2440 .copied()
2441 .unwrap_or_else(cache_freshness::zero_hash);
2442 buf.extend_from_slice(hash.as_bytes());
2443 }
2444
2445 for (relative, entry) in &entries {
2447 let c = &entry.chunk;
2448
2449 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2451 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2452 buf.extend_from_slice(&file_bytes);
2453
2454 let name_bytes = c.name.as_bytes();
2456 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2457 buf.extend_from_slice(name_bytes);
2458
2459 buf.push(symbol_kind_to_u8(&c.kind));
2461
2462 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2464 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2465 buf.push(c.exported as u8);
2466
2467 let snippet_bytes = c.snippet.as_bytes();
2469 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2470 buf.extend_from_slice(snippet_bytes);
2471
2472 let embed_bytes = c.embed_text.as_bytes();
2474 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2475 buf.extend_from_slice(embed_bytes);
2476
2477 for &val in &entry.vector {
2479 buf.extend_from_slice(&val.to_le_bytes());
2480 }
2481 }
2482
2483 buf
2484 }
2485
2486 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2488 debug_assert!(current_canonical_root.is_absolute());
2489 let mut pos = 0;
2490
2491 if data.len() < HEADER_BYTES_V1 {
2492 return Err("data too short".to_string());
2493 }
2494
2495 let version = data[pos];
2496 pos += 1;
2497 if version != SEMANTIC_INDEX_VERSION_V1
2498 && version != SEMANTIC_INDEX_VERSION_V2
2499 && version != SEMANTIC_INDEX_VERSION_V3
2500 && version != SEMANTIC_INDEX_VERSION_V4
2501 && version != SEMANTIC_INDEX_VERSION_V5
2502 && version != SEMANTIC_INDEX_VERSION_V6
2503 {
2504 return Err(format!("unsupported version: {}", version));
2505 }
2506 if (version == SEMANTIC_INDEX_VERSION_V2
2510 || version == SEMANTIC_INDEX_VERSION_V3
2511 || version == SEMANTIC_INDEX_VERSION_V4
2512 || version == SEMANTIC_INDEX_VERSION_V5
2513 || version == SEMANTIC_INDEX_VERSION_V6)
2514 && data.len() < HEADER_BYTES_V2
2515 {
2516 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2517 }
2518
2519 let dimension = read_u32(data, &mut pos)? as usize;
2520 let entry_count = read_u32(data, &mut pos)? as usize;
2521 validate_embedding_dimension(dimension)?;
2522 if entry_count > MAX_ENTRIES {
2523 return Err(format!("too many semantic index entries: {}", entry_count));
2524 }
2525
2526 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2532 || version == SEMANTIC_INDEX_VERSION_V3
2533 || version == SEMANTIC_INDEX_VERSION_V4
2534 || version == SEMANTIC_INDEX_VERSION_V5
2535 || version == SEMANTIC_INDEX_VERSION_V6;
2536 let fingerprint = if has_fingerprint_field {
2537 let fingerprint_len = read_u32(data, &mut pos)? as usize;
2538 if pos + fingerprint_len > data.len() {
2539 return Err("unexpected end of data reading fingerprint".to_string());
2540 }
2541 if fingerprint_len == 0 {
2542 None
2543 } else {
2544 let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2545 pos += fingerprint_len;
2546 Some(
2547 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2548 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2549 )
2550 }
2551 } else {
2552 None
2553 };
2554
2555 let mtime_count = read_u32(data, &mut pos)? as usize;
2557 if mtime_count > MAX_ENTRIES {
2558 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2559 }
2560
2561 let vector_bytes = entry_count
2562 .checked_mul(dimension)
2563 .and_then(|count| count.checked_mul(F32_BYTES))
2564 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2565 if vector_bytes > data.len().saturating_sub(pos) {
2566 return Err("semantic index vectors exceed available data".to_string());
2567 }
2568
2569 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2570 let mut file_sizes = HashMap::with_capacity(mtime_count);
2571 let mut file_hashes = HashMap::with_capacity(mtime_count);
2572 for _ in 0..mtime_count {
2573 let path = read_string(data, &mut pos)?;
2574 let secs = read_u64(data, &mut pos)?;
2575 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2581 || version == SEMANTIC_INDEX_VERSION_V4
2582 || version == SEMANTIC_INDEX_VERSION_V5
2583 || version == SEMANTIC_INDEX_VERSION_V6
2584 {
2585 read_u32(data, &mut pos)?
2586 } else {
2587 0
2588 };
2589 let size =
2590 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2591 read_u64(data, &mut pos)?
2592 } else {
2593 0
2594 };
2595 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2596 if pos + 32 > data.len() {
2597 return Err("unexpected end of data reading content hash".to_string());
2598 }
2599 let mut hash_bytes = [0u8; 32];
2600 hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2601 pos += 32;
2602 blake3::Hash::from_bytes(hash_bytes)
2603 } else {
2604 cache_freshness::zero_hash()
2605 };
2606 if nanos >= 1_000_000_000 {
2613 return Err(format!(
2614 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2615 nanos
2616 ));
2617 }
2618 let duration = std::time::Duration::new(secs, nanos);
2619 let mtime = SystemTime::UNIX_EPOCH
2620 .checked_add(duration)
2621 .ok_or_else(|| {
2622 format!(
2623 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2624 secs, nanos
2625 )
2626 })?;
2627 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2628 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2629 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2630 } else {
2631 PathBuf::from(path)
2632 };
2633 file_mtimes.insert(path.clone(), mtime);
2634 file_sizes.insert(path.clone(), size);
2635 file_hashes.insert(path, content_hash);
2636 }
2637
2638 let mut entries = Vec::with_capacity(entry_count);
2640 for _ in 0..entry_count {
2641 let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2642 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2643 cached_path_under_root(current_canonical_root, &raw_file)
2644 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2645 } else {
2646 raw_file
2647 };
2648 let name = read_string(data, &mut pos)?;
2649
2650 if pos >= data.len() {
2651 return Err("unexpected end of data".to_string());
2652 }
2653 let kind = u8_to_symbol_kind(data[pos]);
2654 pos += 1;
2655
2656 let start_line = read_u32(data, &mut pos)?;
2657 let end_line = read_u32(data, &mut pos)?;
2658
2659 if pos >= data.len() {
2660 return Err("unexpected end of data".to_string());
2661 }
2662 let exported = data[pos] != 0;
2663 pos += 1;
2664
2665 let snippet = read_string(data, &mut pos)?;
2666 let embed_text = read_string(data, &mut pos)?;
2667
2668 let vec_bytes = dimension
2670 .checked_mul(F32_BYTES)
2671 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2672 if pos + vec_bytes > data.len() {
2673 return Err("unexpected end of data reading vector".to_string());
2674 }
2675 let mut vector = Vec::with_capacity(dimension);
2676 for _ in 0..dimension {
2677 let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2678 vector.push(f32::from_le_bytes(bytes));
2679 pos += 4;
2680 }
2681
2682 entries.push(EmbeddingEntry {
2683 chunk: SemanticChunk {
2684 file,
2685 name,
2686 kind,
2687 start_line,
2688 end_line,
2689 exported,
2690 embed_text,
2691 snippet,
2692 },
2693 vector,
2694 });
2695 }
2696
2697 if entries.len() != entry_count {
2698 return Err(format!(
2699 "semantic cache entry count drift: header={} decoded={}",
2700 entry_count,
2701 entries.len()
2702 ));
2703 }
2704 for entry in &entries {
2705 if !file_mtimes.contains_key(&entry.chunk.file) {
2706 return Err(format!(
2707 "semantic cache metadata missing for entry file {}",
2708 entry.chunk.file.display()
2709 ));
2710 }
2711 }
2712
2713 Ok(Self {
2714 entries,
2715 file_mtimes,
2716 file_sizes,
2717 file_hashes,
2718 dimension,
2719 fingerprint,
2720 project_root: current_canonical_root.to_path_buf(),
2721 deferred_files: HashSet::new(),
2722 })
2723 }
2724}
2725
2726fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2728 let relative = file
2729 .strip_prefix(project_root)
2730 .unwrap_or(file)
2731 .to_string_lossy();
2732
2733 let kind_label = match &symbol.kind {
2734 SymbolKind::Function => "function",
2735 SymbolKind::Class => "class",
2736 SymbolKind::Method => "method",
2737 SymbolKind::Struct => "struct",
2738 SymbolKind::Interface => "interface",
2739 SymbolKind::Enum => "enum",
2740 SymbolKind::TypeAlias => "type",
2741 SymbolKind::Variable => "variable",
2742 SymbolKind::Heading => "heading",
2743 SymbolKind::FileSummary => "file-summary",
2744 };
2745
2746 let name = &symbol.name;
2748 let mut text = format!(
2749 "name:{name} file:{} kind:{} name:{name}",
2750 relative, kind_label
2751 );
2752
2753 if let Some(sig) = &symbol.signature {
2754 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2762 }
2763
2764 let lines: Vec<&str> = source.lines().collect();
2766 let start = (symbol.range.start_line as usize).min(lines.len());
2767 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2769 if start < end {
2770 let body: String = lines[start..end]
2771 .iter()
2772 .take(15) .copied()
2774 .collect::<Vec<&str>>()
2775 .join("\n");
2776 let snippet = if body.len() > 300 {
2777 format!("{}...", &body[..body.floor_char_boundary(300)])
2778 } else {
2779 body
2780 };
2781 text.push_str(&format!(" body:{}", snippet));
2782 }
2783
2784 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2789}
2790
/// Maximum number of characters kept in a chunk's embed text; both
/// `build_embed_text` and `build_file_summary_chunk` truncate to this limit.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2795
/// Return the first `max_chars` characters of `value` as an owned string.
///
/// Counting is done in `char`s, not bytes, so the cut never splits a multi-byte
/// character. Shorter inputs are returned unchanged.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the first character past the limit marks the cut.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2799
2800fn first_leading_doc_comment(source: &str) -> String {
2801 let lines: Vec<&str> = source.lines().collect();
2802 let Some((start, first)) = lines
2803 .iter()
2804 .enumerate()
2805 .find(|(_, line)| !line.trim().is_empty())
2806 else {
2807 return String::new();
2808 };
2809
2810 let trimmed = first.trim_start();
2811 if trimmed.starts_with("/**") {
2812 let mut comment = Vec::new();
2813 for line in lines.iter().skip(start) {
2814 comment.push(*line);
2815 if line.contains("*/") {
2816 break;
2817 }
2818 }
2819 return truncate_chars(&comment.join("\n"), 200);
2820 }
2821
2822 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2823 let comment = lines
2824 .iter()
2825 .skip(start)
2826 .take_while(|line| {
2827 let trimmed = line.trim_start();
2828 trimmed.starts_with("///") || trimmed.starts_with("//!")
2829 })
2830 .copied()
2831 .collect::<Vec<_>>()
2832 .join("\n");
2833 return truncate_chars(&comment, 200);
2834 }
2835
2836 String::new()
2837}
2838
2839pub fn build_file_summary_chunk(
2840 file: &Path,
2841 project_root: &Path,
2842 source: &str,
2843 top_exports: &[&str],
2844 top_export_signatures: &[Option<&str>],
2845) -> SemanticChunk {
2846 let relative = file.strip_prefix(project_root).unwrap_or(file);
2847 let rel_path = relative.to_string_lossy();
2848 let parent_dir = relative
2849 .parent()
2850 .map(|parent| parent.to_string_lossy().to_string())
2851 .unwrap_or_default();
2852 let name = file
2853 .file_stem()
2854 .map(|stem| stem.to_string_lossy().to_string())
2855 .unwrap_or_default();
2856 let doc = first_leading_doc_comment(source);
2857 let exports = top_exports
2858 .iter()
2859 .take(5)
2860 .copied()
2861 .collect::<Vec<_>>()
2862 .join(",");
2863 let snippet = if doc.is_empty() {
2864 top_export_signatures
2865 .first()
2866 .and_then(|signature| signature.as_deref())
2867 .map(|signature| truncate_chars(signature, 200))
2868 .unwrap_or_default()
2869 } else {
2870 doc.clone()
2871 };
2872
2873 SemanticChunk {
2874 file: file.to_path_buf(),
2875 name,
2876 kind: SymbolKind::FileSummary,
2877 start_line: 0,
2878 end_line: 0,
2879 exported: false,
2880 embed_text: truncate_chars(
2881 &format!(
2882 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2883 file.file_stem()
2884 .map(|stem| stem.to_string_lossy().to_string())
2885 .unwrap_or_default()
2886 ),
2887 MAX_EMBED_TEXT_CHARS,
2888 ),
2889 snippet,
2890 }
2891}
2892
2893fn parser_for(
2894 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2895 lang: crate::parser::LangId,
2896) -> Result<&mut Parser, String> {
2897 use std::collections::hash_map::Entry;
2898
2899 match parsers.entry(lang) {
2900 Entry::Occupied(entry) => Ok(entry.into_mut()),
2901 Entry::Vacant(entry) => {
2902 let grammar = grammar_for(lang);
2903 let mut parser = Parser::new();
2904 parser
2905 .set_language(&grammar)
2906 .map_err(|error| error.to_string())?;
2907 Ok(entry.insert(parser))
2908 }
2909 }
2910}
2911
/// Report whether `path` has a file extension that the semantic index covers.
///
/// The comparison is exact and case-sensitive; paths without an extension, or
/// with one that is not valid UTF-8, are not covered.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2944
2945fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2946 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2947 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2948 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2949 .map_err(|error| error.to_string())?
2950 .unwrap_or_else(cache_freshness::zero_hash);
2951 Ok(IndexedFileMetadata {
2952 mtime,
2953 size: metadata.len(),
2954 content_hash,
2955 })
2956}
2957
/// Canonicalize `path`, tolerating a final component that does not exist.
///
/// Tries, in order: canonicalizing `path` itself; canonicalizing its parent and
/// re-appending the file name; returning `path` unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path)
        .ok()
        .or_else(|| {
            let parent = path.parent()?;
            let file_name = path.file_name()?;
            fs::canonicalize(parent)
                .ok()
                .map(|canonical_parent| canonical_parent.join(file_name))
        })
        .unwrap_or_else(|| path.to_path_buf())
}
2974
2975fn collect_file_chunks(
2976 project_root: &Path,
2977 file: &Path,
2978 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2979) -> Result<Vec<SemanticChunk>, String> {
2980 if !is_semantic_indexed_extension(file) {
2981 return Err("unsupported file extension".to_string());
2982 }
2983 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2984 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2985 let tree = parser_for(parsers, lang)?
2986 .parse(&source, None)
2987 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2988 let symbols =
2989 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2990
2991 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2992}
2993
2994fn build_snippet(symbol: &Symbol, source: &str) -> String {
2996 let lines: Vec<&str> = source.lines().collect();
2997 let start = (symbol.range.start_line as usize).min(lines.len());
2998 let end = (symbol.range.end_line as usize + 1).min(lines.len());
3000 if start < end {
3001 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3002 let mut snippet = snippet_lines.join("\n");
3003 if end - start > 5 {
3004 snippet.push_str("\n ...");
3005 }
3006 if snippet.len() > 300 {
3007 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3008 }
3009 snippet
3010 } else {
3011 String::new()
3012 }
3013}
3014
3015fn symbols_to_chunks(
3017 file: &Path,
3018 symbols: &[Symbol],
3019 source: &str,
3020 project_root: &Path,
3021) -> Vec<SemanticChunk> {
3022 let mut chunks = Vec::new();
3023 let top_exports_with_signatures = symbols
3024 .iter()
3025 .filter(|symbol| {
3026 symbol.exported
3027 && symbol.parent.is_none()
3028 && !matches!(symbol.kind, SymbolKind::Heading)
3029 })
3030 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3031 .collect::<Vec<_>>();
3032
3033 let has_only_headings = !symbols.is_empty()
3034 && symbols
3035 .iter()
3036 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3037 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3038 let top_exports = top_exports_with_signatures
3039 .iter()
3040 .map(|(name, _)| *name)
3041 .collect::<Vec<_>>();
3042 let top_export_signatures = top_exports_with_signatures
3043 .iter()
3044 .map(|(_, signature)| *signature)
3045 .collect::<Vec<_>>();
3046 chunks.push(build_file_summary_chunk(
3047 file,
3048 project_root,
3049 source,
3050 &top_exports,
3051 &top_export_signatures,
3052 ));
3053 }
3054
3055 for symbol in symbols {
3056 if matches!(symbol.kind, SymbolKind::Heading) {
3061 continue;
3062 }
3063
3064 let line_count = symbol
3066 .range
3067 .end_line
3068 .saturating_sub(symbol.range.start_line)
3069 + 1;
3070 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3071 continue;
3072 }
3073
3074 let embed_text = build_embed_text(symbol, source, file, project_root);
3075 let snippet = build_snippet(symbol, source);
3076
3077 chunks.push(SemanticChunk {
3078 file: file.to_path_buf(),
3079 name: symbol.name.clone(),
3080 kind: symbol.kind.clone(),
3081 start_line: symbol.range.start_line,
3082 end_line: symbol.range.end_line,
3083 exported: symbol.exported,
3084 embed_text,
3085 snippet,
3086 });
3087
3088 }
3091
3092 chunks
3093}
3094
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the lengths differ or when either vector has zero norm.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}
3118
3119fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3121 match kind {
3122 SymbolKind::Function => 0,
3123 SymbolKind::Class => 1,
3124 SymbolKind::Method => 2,
3125 SymbolKind::Struct => 3,
3126 SymbolKind::Interface => 4,
3127 SymbolKind::Enum => 5,
3128 SymbolKind::TypeAlias => 6,
3129 SymbolKind::Variable => 7,
3130 SymbolKind::Heading => 8,
3131 SymbolKind::FileSummary => 9,
3132 }
3133}
3134
3135fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3136 match v {
3137 0 => SymbolKind::Function,
3138 1 => SymbolKind::Class,
3139 2 => SymbolKind::Method,
3140 3 => SymbolKind::Struct,
3141 4 => SymbolKind::Interface,
3142 5 => SymbolKind::Enum,
3143 6 => SymbolKind::TypeAlias,
3144 7 => SymbolKind::Variable,
3145 8 => SymbolKind::Heading,
3146 9 => SymbolKind::FileSummary,
3147 _ => SymbolKind::Heading,
3148 }
3149}
3150
/// Read a little-endian `u32` from `data` at `*pos` and advance `*pos` by 4.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes are
/// available at `*pos`. An offset so large that `*pos + 4` would overflow is
/// reported the same way instead of panicking or wrapping.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let start = *pos;
    let window = start
        .checked_add(4)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut raw = [0u8; 4];
    raw.copy_from_slice(window);
    *pos = start + 4;
    Ok(u32::from_le_bytes(raw))
}
3159
/// Read a little-endian `u64` from `data` at `*pos` and advance `*pos` by 8.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes are
/// available at `*pos`. An offset so large that `*pos + 8` would overflow is
/// reported the same way instead of panicking or wrapping.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let start = *pos;
    let window = start
        .checked_add(8)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut raw = [0u8; 8];
    raw.copy_from_slice(window);
    *pos = start + 8;
    Ok(u64::from_le_bytes(raw))
}
3168
3169fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
3170 let len = read_u32(data, pos)? as usize;
3171 if *pos + len > data.len() {
3172 return Err("unexpected end of data reading string".to_string());
3173 }
3174 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
3175 *pos += len;
3176 Ok(s)
3177}
3178
3179#[cfg(test)]
3180mod tests {
3181 use super::*;
3182 use crate::config::{SemanticBackend, SemanticBackendConfig};
3183 use crate::parser::FileParser;
3184 use std::io::{Read, Write};
3185 use std::net::TcpListener;
3186 use std::thread;
3187
3188 #[test]
3189 fn semantic_index_includes_php_inc_and_scss_extensions() {
3190 for file in ["partial.inc", "index.php", "styles.scss"] {
3191 assert!(
3192 is_semantic_indexed_extension(Path::new(file)),
3193 "{file} should be semantic-index eligible"
3194 );
3195 }
3196 }
3197
3198 #[test]
3199 fn transient_marker_round_trips_and_classifies() {
3200 let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3203 assert!(embedding_failure_is_transient(&marked));
3204 let clean = strip_transient_embedding_marker(&marked);
3205 assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3206 assert!(clean.starts_with("openai compatible request failed:"));
3207
3208 for permanent in [
3211 "openai compatible request failed (HTTP 401): Unauthorized",
3212 "embedding dimension mismatch: index has 384, model returned 768",
3213 "too many files (>20000) for semantic indexing (max 20000)",
3214 ] {
3215 assert!(
3216 !embedding_failure_is_transient(permanent),
3217 "{permanent:?} must not be transient"
3218 );
3219 assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3221 }
3222 }
3223
3224 #[test]
3225 fn send_error_transience_separates_connect_timeout_from_4xx() {
3226 assert!(is_retryable_embedding_status(
3228 reqwest::StatusCode::INTERNAL_SERVER_ERROR
3229 ));
3230 assert!(is_retryable_embedding_status(
3231 reqwest::StatusCode::TOO_MANY_REQUESTS
3232 ));
3233 assert!(!is_retryable_embedding_status(
3234 reqwest::StatusCode::UNAUTHORIZED
3235 ));
3236 assert!(!is_retryable_embedding_status(
3237 reqwest::StatusCode::BAD_REQUEST
3238 ));
3239 }
3240
3241 #[test]
3242 fn local_backend_model_loading_body_is_transient() {
3243 for body in [
3246 r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3247 r#"{"error":"model is loading, please wait"}"#,
3248 r#"{"error":"Model not loaded"}"#,
3249 "Loading model into memory",
3250 ] {
3251 assert!(
3252 embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3253 "{body:?} should be body-transient"
3254 );
3255 }
3256
3257 for body in [
3261 r#"{"error":"invalid api key"}"#,
3262 r#"{"error":"model 'foo' not found"}"#,
3263 "Bad Request: unknown field",
3264 "Bad Request: invalid loading model option",
3265 r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3266 ] {
3267 assert!(
3268 !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3269 "{body:?} must not be body-transient"
3270 );
3271 }
3272
3273 assert!(
3274 !embedding_response_body_is_transient(
3275 reqwest::StatusCode::UNAUTHORIZED,
3276 r#"{"error":"model is loading, please wait"}"#
3277 ),
3278 "permanent auth failures must not become transient because of body text"
3279 );
3280 }
3281
3282 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3283 where
3284 F: Fn(String, String, String) -> String + Send + 'static,
3285 {
3286 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3287 let addr = listener.local_addr().expect("local addr");
3288 let handle = thread::spawn(move || {
3289 let (mut stream, _) = listener.accept().expect("accept request");
3290 let mut buf = Vec::new();
3291 let mut chunk = [0u8; 4096];
3292 let mut header_end = None;
3293 let mut content_length = 0usize;
3294 loop {
3295 let n = stream.read(&mut chunk).expect("read request");
3296 if n == 0 {
3297 break;
3298 }
3299 buf.extend_from_slice(&chunk[..n]);
3300 if header_end.is_none() {
3301 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3302 header_end = Some(pos + 4);
3303 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3304 for line in headers.lines() {
3305 if let Some(value) = line.strip_prefix("Content-Length:") {
3306 content_length = value.trim().parse::<usize>().unwrap_or(0);
3307 }
3308 }
3309 }
3310 }
3311 if let Some(end) = header_end {
3312 if buf.len() >= end + content_length {
3313 break;
3314 }
3315 }
3316 }
3317
3318 let end = header_end.expect("header terminator");
3319 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3320 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3321 let mut lines = request.lines();
3322 let request_line = lines.next().expect("request line").to_string();
3323 let path = request_line
3324 .split_whitespace()
3325 .nth(1)
3326 .expect("request path")
3327 .to_string();
3328 let response_body = handler(request_line, path, body);
3329 let response = format!(
3330 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3331 response_body.len(),
3332 response_body
3333 );
3334 stream
3335 .write_all(response.as_bytes())
3336 .expect("write response");
3337 });
3338
3339 (format!("http://{}", addr), handle)
3340 }
3341
3342 fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3343 let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3344 listener
3345 .set_nonblocking(true)
3346 .expect("nonblocking listener");
3347 let addr = listener.local_addr().expect("local addr");
3348 let handle = thread::spawn(move || {
3349 let deadline = std::time::Instant::now() + Duration::from_secs(2);
3350 let mut accepted = 0usize;
3351 while accepted < attempts && std::time::Instant::now() < deadline {
3352 match listener.accept() {
3353 Ok((mut stream, _)) => {
3354 accepted += 1;
3355 let mut buf = [0u8; 4096];
3356 let _ = stream.read(&mut buf);
3364 let response = "HTTP/1.1 200 OK
3365Content-Type: application/json
3366Content-Length: 128
3367Connection: close
3368
3369{";
3370 let _ = stream.write_all(response.as_bytes());
3371 }
3372 Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3373 thread::sleep(Duration::from_millis(10));
3374 }
3375 Err(error) => panic!("accept request: {error}"),
3376 }
3377 }
3378 });
3379
3380 (format!("http://{}", addr), handle)
3381 }
3382
3383 #[test]
3384 fn response_body_read_failures_are_marked_transient() {
3385 let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3386 let client = Client::builder()
3387 .timeout(Duration::from_millis(250))
3388 .build()
3389 .expect("client");
3390
3391 let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3392 .expect_err("truncated body should fail");
3393
3394 handle.join().unwrap();
3395 assert!(
3396 embedding_failure_is_transient(&error),
3397 "body read failures should be transient-marked: {error}"
3398 );
3399 assert!(error.contains("response read failed"));
3400 }
3401
3402 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3403 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3404 }
3405
3406 fn write_rust_file(path: &Path, function_name: &str) {
3407 fs::write(
3408 path,
3409 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3410 )
3411 .unwrap();
3412 }
3413
3414 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3415 let mut embed = test_vector_for_texts;
3416 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3417 }
3418
3419 fn test_project_root() -> PathBuf {
3420 std::env::current_dir().unwrap()
3421 }
3422
3423 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3424 index.file_mtimes.insert(file.to_path_buf(), mtime);
3425 index.file_sizes.insert(file.to_path_buf(), size);
3426 index
3427 .file_hashes
3428 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3429 }
3430
3431 #[derive(Default)]
3432 struct RecordingEmbedder {
3433 calls: Vec<Vec<String>>,
3434 }
3435
3436 impl RecordingEmbedder {
3437 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3438 let vectors = texts
3439 .iter()
3440 .map(|text| deterministic_test_vector(text))
3441 .collect();
3442 self.calls.push(texts);
3443 Ok(vectors)
3444 }
3445
3446 fn total_embedded_texts(&self) -> usize {
3447 self.calls.iter().map(Vec::len).sum()
3448 }
3449
3450 fn embedded_texts(&self) -> Vec<&str> {
3451 self.calls
3452 .iter()
3453 .flat_map(|batch| batch.iter().map(String::as_str))
3454 .collect()
3455 }
3456 }
3457
3458 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3459 let hash = blake3::hash(text.as_bytes());
3460 let bytes = hash.as_bytes();
3461 vec![
3462 1.0,
3463 bytes[0] as f32 / 255.0,
3464 bytes[1] as f32 / 255.0,
3465 bytes[2] as f32 / 255.0,
3466 ]
3467 }
3468
3469 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3470 let mut embedder = RecordingEmbedder::default();
3471 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3472 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3473 }
3474
3475 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3476 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3477 }
3478
3479 fn write_source(path: &Path, source: &str) {
3480 if let Some(parent) = path.parent() {
3481 fs::create_dir_all(parent).unwrap();
3482 }
3483 fs::write(path, source).unwrap();
3484 }
3485
3486 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3487 index
3488 .entries
3489 .iter()
3490 .filter(|entry| entry.chunk.file == file)
3491 .collect()
3492 }
3493
3494 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3495 index
3496 .entries
3497 .iter()
3498 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3499 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3500 }
3501
3502 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3503 index
3504 .entries
3505 .iter()
3506 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3507 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3508 }
3509
/// Prepending a blank line to an indexed file and refreshing it as stale must
/// not invoke the embedder: the entry count and `alpha`'s vector are retained
/// while `alpha`'s `start_line` is asserted to be 1.
#[test]
fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    let baseline = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, baseline);

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_recorded_test_index(project_root, tracked);
    let entries_before = index.entries.len();
    let alpha_vector_before = entry_by_name(&index, &source_path, "alpha").vector.clone();

    // Same symbols, shifted down by one line.
    let shifted = format!("\n{baseline}");
    write_source(&source_path, &shifted);
    force_stale(&mut index, &source_path);

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = index.refresh_stale_files(
        project_root,
        tracked,
        &mut embed_fn,
        16,
        &mut on_progress,
    );
    let summary = refresh.unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(index.entries.len(), entries_before);

    let alpha_after = entry_by_name(&index, &source_path, "alpha");
    assert_eq!(alpha_after.chunk.start_line, 1);
    assert_eq!(alpha_after.vector, alpha_vector_before);
}
3545
/// After a pure line shift, `refresh_invalidated_files` on the worker copy must
/// embed nothing yet still return every entry of the file in `added_entries`,
/// so that applying the update to a separate serving copy leaves it with the
/// same number of entries and `alpha` at `start_line` 1.
#[test]
fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    let baseline = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, baseline);

    let tracked = std::slice::from_ref(&source_path);
    let mut worker_index = build_recorded_test_index(project_root, tracked);
    let mut serving_index = worker_index.clone();
    let entries_before = worker_index.entries.len();

    let shifted = format!("\n{baseline}");
    write_source(&source_path, &shifted);

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = worker_index.refresh_invalidated_files(
        project_root,
        tracked,
        &mut embed_fn,
        16,
        100,
        &mut on_progress,
    );
    let update = refresh.unwrap();

    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(update.added_entries.len(), entries_before);
    assert_eq!(worker_index.entries.len(), entries_before);

    // Replay the worker's delta onto the untouched serving copy.
    serving_index.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );

    assert_eq!(serving_index.entries.len(), entries_before);
    let served_for_file = entries_for_file(&serving_index, &source_path);
    assert_eq!(served_for_file.len(), entries_before);
    let served_alpha = entry_by_name(&serving_index, &source_path, "alpha");
    assert_eq!(served_alpha.chunk.start_line, 1);
}
3596
/// Editing only `alpha`'s body must send exactly one text to the embedder (the
/// one containing `name:alpha`), keep `beta`'s vector unchanged, and still
/// report every entry of the file in `added_entries`.
#[test]
fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    let before_edit = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, before_edit);

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_recorded_test_index(project_root, tracked);
    let entries_before = index.entries.len();
    let beta_vector_before = entry_by_name(&index, &source_path, "beta").vector.clone();

    let after_edit = "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
    write_source(&source_path, after_edit);

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = index.refresh_invalidated_files(
        project_root,
        tracked,
        &mut embed_fn,
        16,
        100,
        &mut on_progress,
    );
    let update = refresh.unwrap();

    assert_eq!(recorder.total_embedded_texts(), 1);
    assert!(recorder.embedded_texts()[0].contains("name:alpha"));
    assert_eq!(update.added_entries.len(), entries_before);

    let beta_after = entry_by_name(&index, &source_path, "beta");
    assert_eq!(beta_after.vector, beta_vector_before);
}
3635
/// Duplicating a function verbatim inside a file must yield two `duplicate`
/// entries that both carry the original vector, with no text sent to the
/// embedder.
#[test]
fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/dupe.js");
    let single_copy = "function duplicate() {\n return 1;\n}\n";
    write_source(&source_path, single_copy);

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_recorded_test_index(project_root, tracked);
    let vector_before = entry_by_name(&index, &source_path, "duplicate")
        .vector
        .clone();

    let doubled = format!("{single_copy}\n{single_copy}");
    write_source(&source_path, &doubled);

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    index
        .refresh_invalidated_files(
            project_root,
            tracked,
            &mut embed_fn,
            16,
            100,
            &mut on_progress,
        )
        .unwrap();

    let mut duplicates = Vec::new();
    for entry in index.entries.iter() {
        if entry.chunk.file == source_path && entry.chunk.name == "duplicate" {
            duplicates.push(entry);
        }
    }

    assert_eq!(duplicates.len(), 2);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(duplicates[0].vector, vector_before);
    assert_eq!(duplicates[1].vector, vector_before);
}
3673
/// Two-phase check of file-summary vector reuse.
///
/// Phase 1 edits only `alpha`'s body: one text (containing `name:alpha`) is
/// embedded and the file-summary vector is unchanged. Phase 2 edits only the
/// leading `//!` doc line: one text (containing `kind:file-summary`) is
/// embedded and the file-summary vector differs from the original.
#[test]
fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    let initial = "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n";
    write_source(&source_path, initial);

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_recorded_test_index(project_root, tracked);
    let summary_vector_before = file_summary_entry(&index, &source_path).vector.clone();
    let mut on_progress = |_done: usize, _total: usize| {};

    // Phase 1: body-only edit.
    let body_changed = "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n";
    write_source(&source_path, body_changed);
    let mut body_recorder = RecordingEmbedder::default();
    let mut body_embed_fn = |batch: Vec<String>| body_recorder.embed(batch);
    index
        .refresh_invalidated_files(
            project_root,
            tracked,
            &mut body_embed_fn,
            16,
            100,
            &mut on_progress,
        )
        .unwrap();
    assert_eq!(body_recorder.total_embedded_texts(), 1);
    assert!(body_recorder.embedded_texts()[0].contains("name:alpha"));
    assert_eq!(
        file_summary_entry(&index, &source_path).vector,
        summary_vector_before
    );

    // Phase 2: leading-doc-only edit.
    let doc_changed = "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n";
    write_source(&source_path, doc_changed);
    let mut doc_recorder = RecordingEmbedder::default();
    let mut doc_embed_fn = |batch: Vec<String>| doc_recorder.embed(batch);
    index
        .refresh_invalidated_files(
            project_root,
            tracked,
            &mut doc_embed_fn,
            16,
            100,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(doc_recorder.total_embedded_texts(), 1);
    assert!(doc_recorder.embedded_texts()[0].contains("kind:file-summary"));
    assert_ne!(
        file_summary_entry(&index, &source_path).vector,
        summary_vector_before
    );
}
3729
/// Refreshing a file that was removed from disk must count one deletion, embed
/// nothing, and leave both the worker index and (after applying the update)
/// the serving index without entries.
#[test]
fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    write_source(&source_path, "pub fn alpha() -> i32 {\n 1\n}\n");

    let tracked = std::slice::from_ref(&source_path);
    let mut worker_index = build_recorded_test_index(project_root, tracked);
    let mut serving_index = worker_index.clone();
    fs::remove_file(&source_path).unwrap();

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = worker_index.refresh_invalidated_files(
        project_root,
        tracked,
        &mut embed_fn,
        16,
        100,
        &mut on_progress,
    );
    let update = refresh.unwrap();

    assert_eq!(update.summary.deleted, 1);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert!(worker_index.entries.is_empty());

    serving_index.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );
    assert!(serving_index.entries.is_empty());
}
3766
/// Overwrites an indexed file with three bytes that are not valid UTF-8 and
/// refreshes it. The refresh must embed nothing, report no added entries, and
/// drop the file's entries and mtime from the worker index; applying the update
/// must do the same to the serving index.
#[test]
fn watcher_collect_failure_does_not_resurrect_stale_entries() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    write_source(&source_path, "pub fn alpha() -> i32 {\n 1\n}\n");

    let tracked = std::slice::from_ref(&source_path);
    let mut worker_index = build_recorded_test_index(project_root, tracked);
    let mut serving_index = worker_index.clone();

    let unreadable_bytes = [0xff, 0xfe, 0xfd];
    fs::write(&source_path, unreadable_bytes).unwrap();

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = worker_index.refresh_invalidated_files(
        project_root,
        tracked,
        &mut embed_fn,
        16,
        100,
        &mut on_progress,
    );
    let update = refresh.unwrap();

    assert_eq!(recorder.total_embedded_texts(), 0);
    assert!(update.added_entries.is_empty());
    assert!(worker_index.entries.is_empty());
    assert!(!worker_index.file_mtimes.contains_key(&source_path));

    serving_index.apply_refresh_update(
        update.added_entries,
        update.updated_metadata,
        &update.completed_paths,
    );
    assert!(serving_index.entries.is_empty());
    assert!(!serving_index.file_mtimes.contains_key(&source_path));
}
3805
/// With one file already indexed and `1` passed as the fifth argument of
/// `refresh_invalidated_files` (presumably the indexed-file cap; confirm against
/// the method), a second file must be recorded in `deferred_files` rather than
/// embedded or indexed.
#[test]
fn refresh_invalidated_cap_deferral_remains_file_count_based() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let indexed_path = project_root.join("src/a.rs");
    let deferred_path = project_root.join("src/b.rs");
    write_source(&indexed_path, "pub fn alpha() -> i32 {\n 1\n}\n");
    write_source(&deferred_path, "pub fn beta() -> i32 {\n 2\n}\n");

    let mut index =
        build_recorded_test_index(project_root, std::slice::from_ref(&indexed_path));

    let mut recorder = RecordingEmbedder::default();
    let mut embed_fn = |batch: Vec<String>| recorder.embed(batch);
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = index.refresh_invalidated_files(
        project_root,
        std::slice::from_ref(&deferred_path),
        &mut embed_fn,
        16,
        1,
        &mut on_progress,
    );
    let update = refresh.unwrap();

    assert_eq!(update.summary.total_processed, 1);
    assert_eq!(update.summary.added, 0);
    assert_eq!(recorder.total_embedded_texts(), 0);
    assert_eq!(index.indexed_file_count(), 1);
    assert!(index.deferred_files.contains(&deferred_path));
    assert!(entries_for_file(&index, &deferred_path).is_empty());
}
3837
/// An index holding metadata and one entry for `<project>/../outside.rs` must
/// come back from a `to_bytes` / `from_bytes` round trip with no entries and no
/// recorded mtimes.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let scratch = tempfile::tempdir().expect("create temp dir");
    let project = fs::canonicalize(scratch.path()).expect("canonical project");
    let outside = project.join("..").join("outside.rs");

    let mut index = SemanticIndex::new(project.clone(), 3);
    index
        .file_mtimes
        .insert(outside.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(outside.clone(), 1);
    index
        .file_hashes
        .insert(outside.clone(), cache_freshness::zero_hash());

    let outside_chunk = SemanticChunk {
        file: outside,
        name: "outside".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text: "outside".to_string(),
        snippet: "outside".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk: outside_chunk,
        vector: vec![1.0, 0.0, 0.0],
    });

    let serialized = index.to_bytes();
    let reloaded =
        SemanticIndex::from_bytes(&serialized, &project).expect("load serialized index");
    assert_eq!(reloaded.entries.len(), 0);
    assert!(reloaded.file_mtimes.is_empty());
}
3870
/// Two identical unit vectors must score within 0.001 of 1.0.
#[test]
fn test_cosine_similarity_identical() {
    let lhs = vec![1.0, 0.0, 0.0];
    let rhs = vec![1.0, 0.0, 0.0];
    let score = cosine_similarity(&lhs, &rhs);
    assert!((score - 1.0).abs() < 0.001);
}
3877
/// Two orthogonal unit vectors must score within 0.001 of 0.0.
#[test]
fn test_cosine_similarity_orthogonal() {
    let lhs = vec![1.0, 0.0, 0.0];
    let rhs = vec![0.0, 1.0, 0.0];
    let score = cosine_similarity(&lhs, &rhs);
    assert!(score.abs() < 0.001);
}
3884
/// Two opposite unit vectors must score within 0.001 of -1.0.
#[test]
fn test_cosine_similarity_opposite() {
    let lhs = vec![1.0, 0.0, 0.0];
    let rhs = vec![-1.0, 0.0, 0.0];
    let score = cosine_similarity(&lhs, &rhs);
    assert!((score + 1.0).abs() < 0.001);
}
3891
/// A single-entry index with a fingerprint must survive `to_bytes` /
/// `from_bytes`: the entry name, vector, dimension, backend label and model
/// label are all compared against what was stored.
#[test]
fn test_serialization_roundtrip() {
    let project_root = test_project_root();
    let source_path = project_root.join("src/main.rs");
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);

    let chunk = SemanticChunk {
        file: source_path.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {\n // ...\n}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3, 0.4],
    });
    index.dimension = 4;
    index
        .file_mtimes
        .insert(source_path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_path, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint);

    let serialized = index.to_bytes();
    let restored = SemanticIndex::from_bytes(&serialized, &project_root).unwrap();

    assert_eq!(restored.entries.len(), 1);
    let restored_entry = &restored.entries[0];
    assert_eq!(restored_entry.chunk.name, "handle_request");
    assert_eq!(restored_entry.vector, vec![0.1, 0.2, 0.3, 0.4]);
    assert_eq!(restored.dimension, 4);
    assert_eq!(restored.backend_label(), Some("fastembed"));
    assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
}
3933
/// Pins the byte assigned to each listed `SymbolKind` variant (0 through 9,
/// ending with `FileSummary`) and checks that both `symbol_kind_to_u8` and
/// `u8_to_symbol_kind` agree with that table.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    let expected_encoding = [
        (SymbolKind::Function, 0),
        (SymbolKind::Class, 1),
        (SymbolKind::Method, 2),
        (SymbolKind::Struct, 3),
        (SymbolKind::Interface, 4),
        (SymbolKind::Enum, 5),
        (SymbolKind::TypeAlias, 6),
        (SymbolKind::Variable, 7),
        (SymbolKind::Heading, 8),
        (SymbolKind::FileSummary, 9),
    ];

    for (variant, byte) in expected_encoding {
        let encoded = symbol_kind_to_u8(&variant);
        assert_eq!(encoded, byte);

        let decoded = u8_to_symbol_kind(byte);
        assert_eq!(decoded, variant);
    }
}
3954
/// Inserts three entries whose vectors are the three unit axes, searches with a
/// query that leans toward the first axis, and expects two results with `auth`
/// first and a strictly higher score than the runner-up.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    let symbol_names = ["auth", "database", "handler"];
    for (i, name) in symbol_names.iter().enumerate() {
        // One-hot vector: entry `i` points along axis `i`.
        let mut one_hot = vec![0.0f32; 3];
        one_hot[i] = 1.0;

        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: (i * 10 + 1) as u32,
            end_line: (i * 10 + 5) as u32,
            exported: true,
            embed_text: format!("kind:function name:{}", name),
            snippet: format!("fn {}() {{}}", name),
        };
        index.entries.push(EmbeddingEntry {
            chunk,
            vector: one_hot,
        });
    }

    let query = vec![0.9, 0.1, 0.0];
    let hits = index.search(&query, 2);

    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].name, "auth");
    assert!(hits[0].score > hits[1].score);
}
3987
/// Searching a freshly constructed index must return no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let hits = empty_index.search(&[0.1, 0.2, 0.3], 10);
    assert!(hits.is_empty());
}
3994
/// A symbol whose range starts and ends on line 0 must produce the full text of
/// that source line as its snippet.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";
    let one_line_range = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: "answer".to_string(),
        kind: SymbolKind::Variable,
        range: one_line_range,
        signature: Some("const answer = 42".to_string()),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    let snippet = build_snippet(&symbol, source);

    assert_eq!(snippet, "export const answer = 42;");
}
4017
/// Chunks `src/semantic_index.rs` of this crate two ways, via
/// `FileParser::extract_symbols` + `symbols_to_chunks` and via
/// `collect_file_chunks`, and requires both to yield the same chunk fingerprint.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let target = project_root.join("src/semantic_index.rs");
    let source = std::fs::read_to_string(&target).unwrap();

    // Reference path: parse symbols first, then convert them to chunks.
    let mut reference_parser = FileParser::new();
    let reference_symbols = reference_parser.extract_symbols(&target).unwrap();
    let reference_chunks =
        symbols_to_chunks(&target, &reference_symbols, &source, &project_root);

    // Path under test.
    let mut parser_cache = HashMap::new();
    let collected_chunks =
        collect_file_chunks(&project_root, &target, &mut parser_cache).unwrap();

    let actual = chunk_fingerprint(&collected_chunks);
    let expected = chunk_fingerprint(&reference_chunks);
    assert_eq!(actual, expected);
}
4036
4037 fn chunk_fingerprint(
4038 chunks: &[SemanticChunk],
4039 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4040 chunks
4041 .iter()
4042 .map(|chunk| {
4043 (
4044 chunk.name.clone(),
4045 chunk.kind.clone(),
4046 chunk.start_line,
4047 chunk.end_line,
4048 chunk.exported,
4049 chunk.embed_text.clone(),
4050 chunk.snippet.clone(),
4051 )
4052 })
4053 .collect()
4054 }
4055
/// A buffer made of the byte `1` followed by three little-endian `u32` values,
/// the first being `MAX_DIMENSION + 1` and the other two zero, must be rejected
/// by `SemanticIndex::from_bytes`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    let mut payload = vec![1u8];
    payload.extend_from_slice(&oversized_dimension.to_le_bytes());
    payload.extend_from_slice(&0u32.to_le_bytes());
    payload.extend_from_slice(&0u32.to_le_bytes());

    let parsed = SemanticIndex::from_bytes(&payload, &test_project_root());
    assert!(parsed.is_err());
}
4066
/// A buffer made of the byte `1` followed by three little-endian `u32` values
/// (`DEFAULT_DIMENSION`, `MAX_ENTRIES + 1`, and zero) must be rejected by
/// `SemanticIndex::from_bytes`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let dimension = DEFAULT_DIMENSION as u32;
    let oversized_entry_count = (MAX_ENTRIES as u32) + 1;

    let mut payload = vec![1u8];
    payload.extend_from_slice(&dimension.to_le_bytes());
    payload.extend_from_slice(&oversized_entry_count.to_le_bytes());
    payload.extend_from_slice(&0u32.to_le_bytes());

    let parsed = SemanticIndex::from_bytes(&payload, &test_project_root());
    assert!(parsed.is_err());
}
4077
/// `invalidate_file` must remove the file's entry as well as its recorded mtime
/// and size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);

    let chunk = SemanticChunk {
        file: target.clone(),
        name: "main".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: "main".to_string(),
        snippet: "fn main() {}".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    let mtime_still_recorded = index.file_mtimes.contains_key(&target);
    let size_still_recorded = index.file_sizes.contains_key(&target);
    assert!(!mtime_still_recorded);
    assert!(!size_still_recorded);
}
4106
/// A file whose cached metadata was altered and which was then deleted from
/// disk must be reported as one deletion by `refresh_stale_files`, with its
/// entries, mtime, size and hash all removed from the index.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "vanished_symbol");

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_test_index(project_root, tracked);
    let size_at_build = *index.file_sizes.get(&source_path).unwrap();
    set_file_metadata(
        &mut index,
        &source_path,
        SystemTime::UNIX_EPOCH,
        size_at_build + 1,
    );
    fs::remove_file(&source_path).unwrap();

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh =
        index.refresh_stale_files(project_root, tracked, &mut embed_fn, 8, &mut on_progress);
    let summary = refresh.unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 1);
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&source_path));
    assert!(!index.file_sizes.contains_key(&source_path));
    assert!(!index.file_hashes.contains_key(&source_path));
}
4140
/// Replaces an indexed file with a directory of the same name, so the path
/// still exists but is no longer a regular file, and refreshes it. The refresh
/// must report no added/changed/deleted files and must leave the cached entry
/// and the altered mtime/size values exactly as they were before the refresh.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "kept_symbol");

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_test_index(project_root, tracked);
    let entries_before = index.entries.len();
    let mtime_at_build = *index.file_mtimes.get(&source_path).unwrap();
    let size_at_build = *index.file_sizes.get(&source_path).unwrap();

    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = size_at_build + 1;
    set_file_metadata(&mut index, &source_path, stale_mtime, stale_size);

    // Swap the file for a directory at the same path.
    fs::remove_file(&source_path).unwrap();
    fs::create_dir(&source_path).unwrap();

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh =
        index.refresh_stale_files(project_root, tracked, &mut embed_fn, 8, &mut on_progress);
    let summary = refresh.unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(index.entries.len(), entries_before);

    let kept_symbol_present = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept_symbol_present);

    let recorded_mtime = index.file_mtimes.get(&source_path);
    assert_eq!(recorded_mtime, Some(&stale_mtime));
    assert_ne!(recorded_mtime, Some(&mtime_at_build));
    assert_eq!(index.file_sizes.get(&source_path), Some(&stale_size));
}
4183
/// Refreshing a path that was never written to disk and never indexed must
/// report nothing added, changed or deleted, and must not record an mtime, a
/// size or any entry for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let absent_path = project_root.join("src/missing.rs");
    // Only the parent directory is created; the file itself never exists.
    fs::create_dir_all(absent_path.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = index.refresh_stale_files(
        project_root,
        std::slice::from_ref(&absent_path),
        &mut embed_fn,
        8,
        &mut on_progress,
    );
    let summary = refresh.unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert!(!index.file_mtimes.contains_key(&absent_path));
    assert!(!index.file_sizes.contains_key(&absent_path));
    assert!(index.entries.is_empty());
}
4211
/// Refreshing with one already-indexed file plus one new file must report a
/// single addition out of two processed files and must index the new file.
#[test]
fn refresh_reports_added_for_new_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let existing = project_root.join("src/lib.rs");
    let added = project_root.join("src/new.rs");
    fs::create_dir_all(existing.parent().unwrap()).unwrap();
    write_rust_file(&existing, "existing_symbol");
    write_rust_file(&added, "added_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&existing));

    let current_files = [existing.clone(), added.clone()];
    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh = index.refresh_stale_files(
        project_root,
        &current_files,
        &mut embed_fn,
        8,
        &mut on_progress,
    );
    let summary = refresh.unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&added));

    let new_file_indexed = index.entries.iter().any(|entry| entry.chunk.file == added);
    assert!(new_file_indexed);
}
4242
/// Refreshing with an empty current-file list after the only indexed file was
/// removed from disk must report one deletion, one processed file, and leave
/// the index without entries or an mtime for that file.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let removed_path = project_root.join("src/deleted.rs");
    fs::create_dir_all(removed_path.parent().unwrap()).unwrap();
    write_rust_file(&removed_path, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&removed_path));
    fs::remove_file(&removed_path).unwrap();

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh =
        index.refresh_stale_files(project_root, &[], &mut embed_fn, 8, &mut on_progress);
    let summary = refresh.unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);
    assert!(!index.file_mtimes.contains_key(&removed_path));
    assert!(index.entries.is_empty());
}
4267
/// Rewriting an indexed file with a different symbol (after resetting its
/// cached metadata) must be reported as one change, with the new symbol indexed
/// and the old one gone.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "old_symbol");

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_test_index(project_root, tracked);
    set_file_metadata(&mut index, &source_path, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_path, "new_symbol");

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh =
        index.refresh_stale_files(project_root, tracked, &mut embed_fn, 8, &mut on_progress);
    let summary = refresh.unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    let has_symbol =
        |wanted: &str| index.entries.iter().any(|entry| entry.chunk.name == wanted);
    assert!(has_symbol("new_symbol"));
    assert!(!has_symbol("old_symbol"));
}
4305
/// Refreshing an unmodified, already-indexed file must be a no-op: the summary
/// reports `is_noop()`, one file is processed, the embedding callback is never
/// invoked, and the entry count is unchanged.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_path = project_root.join("src/lib.rs");
    fs::create_dir_all(source_path.parent().unwrap()).unwrap();
    write_rust_file(&source_path, "clean_symbol");

    let tracked = std::slice::from_ref(&source_path);
    let mut index = build_test_index(project_root, tracked);
    let entries_before = index.entries.len();

    // Flag flipped by the callback so the test can prove it never ran.
    let mut embedder_invoked = false;
    let mut embed_fn = |batch: Vec<String>| {
        embedder_invoked = true;
        test_vector_for_texts(batch)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh =
        index.refresh_stale_files(project_root, tracked, &mut embed_fn, 8, &mut on_progress);
    let summary = refresh.unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embedder_invoked);
    assert_eq!(index.entries.len(), entries_before);
}
4337
/// A dlopen failure message that names the ONNX Runtime shared library must be
/// classified as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let load_error = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";

    let unavailable = is_onnx_runtime_unavailable(load_error);
    assert!(unavailable);
}
4344
/// Formatting a missing-runtime error must lead with the install hint and still
/// include an "Original error:" section.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let load_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(load_error);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
4354
/// Points an OpenAI-compatible backend at a mock HTTP server and checks that a
/// two-text request is POSTed to `/v1/embeddings` and that the two vectors in
/// the canned response are returned as-is.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(inputs).unwrap();

    let expected = vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]];
    assert_eq!(vectors, expected);
    server.join().unwrap();
}
4381
/// Captures the raw bytes of one OpenAI-compatible embedding request on a local
/// TCP listener and asserts that the request carries exactly one `Content-Type`
/// header line and that its body contains the configured model name.
///
/// The capture thread reads until it has seen the end of the headers plus
/// `Content-Length` bytes of body, stores the bytes for the main thread, and
/// then answers with a canned single-embedding JSON response.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive, and clients may
                        // emit `content-length` in lowercase. A case-sensitive
                        // match would leave `content_length` at 0 and stop
                        // reading before the body has arrived.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            // Stop once the headers and the announced body length are buffered.
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best effort: the client may already have closed the connection.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Header names are compared in lowercase so the count does not depend on
    // the casing the HTTP client uses.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
4474
/// Points an Ollama backend at a mock HTTP server and checks that a two-text
/// request is POSTed to `/api/embed` and that the two vectors in the canned
/// `embeddings` response are returned as-is.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: "embeddinggemma".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(inputs).unwrap();

    let expected = vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]];
    assert_eq!(vectors, expected);
    server.join().unwrap();
}
4501
/// Checks that a persisted index is only loaded for a matching fingerprint.
///
/// An index stamped with an `openai_compatible` fingerprint is written to a
/// temporary storage directory. Reading it back with the identical
/// fingerprint string must yield `Some`, while reading it with an `ollama`
/// fingerprint must yield `None`.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";
    let project_root = test_project_root();
    let source_file = project_root.join("src/main.rs");

    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);

    let entry = EmbeddingEntry {
        chunk: SemanticChunk {
            file: source_file.clone(),
            name: String::from("handle_request"),
            kind: SymbolKind::Function,
            start_line: 10,
            end_line: 25,
            exported: true,
            embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
            snippet: String::from("fn handle_request() {}"),
        },
        vector: vec![0.1, 0.2, 0.3],
    };
    index.entries.push(entry);
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);

    let stored_fingerprint = SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(stored_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // The matching read runs first.
    // NOTE(review): a rejected read may invalidate the on-disk cache — confirm before reordering.
    let matching = index.fingerprint().unwrap().as_string();
    let accepted = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    )
    .is_some();
    assert!(accepted);

    let mismatched = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    }
    .as_string();
    let rejected = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    )
    .is_none();
    assert!(rejected);
}
4564
/// Checks that a cache file tagged as format version 3 is refused.
///
/// The index is serialized with `to_bytes`, its first byte is overwritten
/// with `SEMANTIC_INDEX_VERSION_V3`, and the result is written to
/// `semantic/<project_key>/semantic.bin`. Even with a matching fingerprint,
/// `read_from_disk` must return `None`, and the cache file must no longer
/// exist afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&dir).unwrap();
    let cache_file = dir.join("semantic.bin");

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let source_file = PathBuf::from("/src/main.rs");
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: source_file.clone(),
            name: String::from("handle_request"),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 0,
            exported: true,
            embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
            snippet: String::from("fn handle_request() {}"),
        },
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Overwrite the leading version byte so the payload is tagged as a V3 cache.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, serialized).unwrap();

    let rejected = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    )
    .is_none();
    assert!(rejected);
    assert!(!cache_file.exists());
}
4614
4615 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
4616 crate::symbols::Symbol {
4617 name: name.to_string(),
4618 kind,
4619 range: crate::symbols::Range {
4620 start_line: start,
4621 start_col: 0,
4622 end_line: end,
4623 end_col: 0,
4624 },
4625 signature: None,
4626 scope_chain: Vec::new(),
4627 exported: false,
4628 parent: None,
4629 }
4630 }
4631
/// A file whose only symbols are headings must produce no chunks at all.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let readme = project_root.join("README.md");
    let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    // (name, start line, end line) for each heading symbol.
    let headings: Vec<_> = [("Title", 0, 2), ("Section", 4, 6)]
        .iter()
        .map(|&(name, start, end)| make_symbol(SymbolKind::Heading, name, start, end))
        .collect();

    let chunks = symbols_to_chunks(&readme, &headings, source, &project_root);
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
4654
/// The embed text must stay within `MAX_EMBED_TEXT_CHARS` characters even
/// when the symbol carries a very large signature (here 2000 repetitions of
/// the 8-character string `"kubectl "`).
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let manifest = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some("kubectl ".repeat(2000));

    let text = build_embed_text(&symbol, source, &manifest, &project_root);
    // Count characters, not bytes: the limit is expressed in chars.
    let char_count = text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
4679
/// Mixed input: heading symbols are dropped while code symbols survive.
///
/// With one heading, one function, and one struct, the expected output is
/// three chunks: a file summary plus the function and the struct. The
/// heading's name must not appear among the chunk names.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let lib_file = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n true\n}\n";

    let heading = make_symbol(SymbolKind::Heading, "doc heading", 0, 1);
    let function = make_symbol(SymbolKind::Function, "handle_request", 0, 2);
    let structure = make_symbol(SymbolKind::Struct, "AuthService", 4, 6);
    let symbols = vec![heading, function, structure];

    let chunks = symbols_to_chunks(&lib_file, &symbols, source, &project_root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let names: Vec<&str> = chunks.iter().map(|chunk| chunk.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
4714
/// Loopback hostnames (`localhost`, `localhost.localdomain`, `*.localhost`),
/// with or without a port, must pass the SSRF check.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once so the failure message reports the very result that
        // was checked, matching the other SSRF tests in this module.
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
4733
/// Loopback IPv4 addresses in `127.x.x.x`, with or without a port, must pass
/// the SSRF check.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    loopback_urls.iter().for_each(|url| {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    });
}
4752
/// Private, link-local, and other non-loopback internal addresses must be
/// rejected by the SSRF check.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let internal_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in internal_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    }
}
4774
/// Hostnames under `.local` (mDNS), with or without a port, must be rejected
/// by the SSRF check.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    mdns_hosts.iter().for_each(|host| {
        let outcome = validate_base_url_no_ssrf(host);
        assert!(
            outcome.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            outcome
        );
    });
}
4792
/// `normalize_base_url` must accept both a loopback IP and the `localhost`
/// hostname.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let loopback_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(loopback_ip.is_ok());

    let localhost_name = normalize_base_url("http://localhost:8080");
    assert!(localhost_name.is_ok());
}
4800
/// Pins the shape of the ONNX Runtime version-mismatch message.
///
/// The message must mention the detected version, the library path, and the
/// `v1.20+` requirement; it must list the "Auto-fix" solution before the
/// "Remove the old library" one; and it must contain the exact
/// `npx @cortexkit/aft doctor --fix` command.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains(system_lib),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering check: the byte offset of the auto-fix section must precede the manual one.
    let auto_fix_at = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_rm_at = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_at < manual_rm_at,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
4841
/// When the requested library name carries no version but a resolved path
/// does, detection must report the version and source taken from the
/// resolved path, and the mismatch message must mention both.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested = "libonnxruntime.so";
    let resolved = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The bare requested name alone yields no version.
    assert_eq!(detect_ort_version_from_path(requested), None);

    let (detected, origin) =
        detect_ort_version_from_resolved_or_requested(Some(String::from(resolved)), requested);
    assert_eq!(detected, Some(String::from("1.19.0")));
    assert_eq!(origin, resolved);

    let message = format_ort_version_mismatch(&detected.unwrap(), &origin);
    assert!(message.contains("v1.19.0"));
    assert!(message.contains(resolved));
}
4859
/// The mismatch message must handle a macOS `.dylib` path: it reports the
/// version and the path, and the path appears wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains(dylib));

    let quoted = format!("'{}'", dylib);
    assert!(
        msg.contains(quoted.as_str()),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
4876}