1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
/// Default embedding vector length.
/// NOTE(review): not referenced in the visible part of this file — presumably
/// the dimension assumed before a backend has been probed; confirm.
const DEFAULT_DIMENSION: usize = 384;
/// NOTE(review): presumably an upper bound on index entries accepted when
/// reading a persisted index; the consumer is not visible here — confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// NOTE(review): presumably the on-disk header length of the V1 format; confirm.
const HEADER_BYTES_V1: usize = 9;
/// NOTE(review): presumably the on-disk header length of the V2 format; confirm.
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint prepended by `format_embedding_init_error` when the error
/// looks like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// Version tags of the persisted semantic index format.
// NOTE(review): what changed between versions is not visible in this part of
// the file — see the load/store code.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after the `/v1` segment) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// Request timeout used when the configured `timeout_ms` is 0. Despite the
/// name, `configured_embedding_timeout_ms` applies it to every backend.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Upper bound on the timeout used by `SemanticEmbeddingModel::from_config_for_query`.
const DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS: u64 = 8_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which `embed_query_cached` starts
/// evicting the oldest entry.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// (valid) base URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before retry N, indexed by the failed attempt's index.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held by `SemanticIndexLock::acquire` while it tries to take the file lock,
/// so acquisition attempts within this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
67
68pub struct SemanticIndexLock {
69 _guard: fs_lock::LockGuard,
70}
71
72impl SemanticIndexLock {
73 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
74 let dir = storage_dir.join("semantic").join(project_key);
75 fs::create_dir_all(&dir)?;
76 let path = dir.join("cache.lock");
77 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
78 .lock()
79 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
80 fs_lock::try_acquire(&path, Duration::from_secs(2))
81 .map(|guard| Self { _guard: guard })
82 .map_err(|error| match error {
83 fs_lock::AcquireError::Timeout => {
84 std::io::Error::other("timed out acquiring semantic cache lock")
85 }
86 fs_lock::AcquireError::Io(error) => error,
87 })
88 }
89}
90
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared as a string by
/// `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Model name copied from the backend config.
    pub model: String,
    /// Normalized base URL, or `FALLBACK_BACKEND` when none is configured or
    /// it fails normalization. Defaults to an empty string when absent from
    /// serialized data.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()`
    /// when absent from serialized data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
101
/// Chunking version stamped on new fingerprints and assumed for serialized
/// fingerprints that lack the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
105
106impl SemanticIndexFingerprint {
107 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
108 let base_url = config
111 .base_url
112 .as_ref()
113 .and_then(|u| normalize_base_url(u).ok())
114 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
115 Self {
116 backend: config.backend.as_str().to_string(),
117 model: config.model.clone(),
118 base_url,
119 dimension,
120 chunking_version: default_chunking_version(),
121 }
122 }
123
124 pub fn as_string(&self) -> String {
125 serde_json::to_string(self).unwrap_or_else(|_| String::new())
126 }
127
128 fn matches_expected(&self, expected: &str) -> bool {
129 let encoded = self.as_string();
130 !encoded.is_empty() && encoded == expected
131 }
132}
133
/// Concrete embedding backend held by a `SemanticEmbeddingModel`.
enum SemanticEmbeddingEngine {
    /// In-process embedder (constructed for `SemanticBackend::Fastembed`).
    Local(LocalEmbedder),
    /// Remote endpoint reached through `build_openai_embeddings_endpoint`.
    OpenAiCompatible {
        /// Blocking HTTP client used for requests.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote endpoint reached through `build_ollama_embeddings_endpoint`.
    Ollama {
        /// Blocking HTTP client used for requests.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
    },
}
150
/// Embedding model bound to one backend, with a small cache of query vectors.
pub struct SemanticEmbeddingModel {
    /// Backend selected in the config.
    backend: SemanticBackend,
    /// Model name as configured.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// HTTP client timeout in milliseconds chosen at construction.
    timeout_ms: u64,
    /// Configured batch size, or `DEFAULT_MAX_BATCH_SIZE` when configured as 0.
    max_batch_size: usize,
    /// Embedding length once known; `None` until probed or observed.
    dimension: Option<usize>,
    /// Backend implementation that actually produces vectors.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries, used for oldest-first eviction.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to embed.
    query_embedding_cache_misses: u64,
}

/// Shorter alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
166
167fn validate_embedding_batch(
168 vectors: &[Vec<f32>],
169 expected_count: usize,
170 context: &str,
171) -> Result<(), String> {
172 if expected_count > 0 && vectors.is_empty() {
173 return Err(format!(
174 "{context} returned no vectors for {expected_count} inputs"
175 ));
176 }
177
178 if vectors.len() != expected_count {
179 return Err(format!(
180 "{context} returned {} vectors for {} inputs",
181 vectors.len(),
182 expected_count
183 ));
184 }
185
186 let Some(first_vector) = vectors.first() else {
187 return Ok(());
188 };
189 let expected_dimension = first_vector.len();
190 validate_embedding_dimension(expected_dimension)
191 .map_err(|error| format!("{context} returned {error}"))?;
192 for (index, vector) in vectors.iter().enumerate() {
193 if vector.len() != expected_dimension {
194 return Err(format!(
195 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
196 vector.len()
197 ));
198 }
199 }
200
201 Ok(())
202}
203
204fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
205 if dimension == 0 || dimension > MAX_DIMENSION {
206 return Err(format!(
207 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
208 ));
209 }
210
211 Ok(())
212}
213
214fn normalize_base_url(raw: &str) -> Result<String, String> {
218 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
219 let scheme = parsed.scheme();
220 if scheme != "http" && scheme != "https" {
221 return Err(format!(
222 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
223 scheme
224 ));
225 }
226 Ok(parsed.to_string().trim_end_matches('/').to_string())
227}
228
229pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
244 use std::net::{IpAddr, ToSocketAddrs};
245
246 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
247
248 let host = parsed.host_str().unwrap_or("");
249
250 let is_loopback_host =
255 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
256 if is_loopback_host {
257 return Ok(());
258 }
259
260 if host.ends_with(".local") {
263 return Err(format!(
264 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
265 ));
266 }
267
268 let port = parsed.port_or_known_default().unwrap_or(443);
271 let addr_str = format!("{host}:{port}");
272 let addrs: Vec<IpAddr> = addr_str
273 .to_socket_addrs()
274 .map(|iter| iter.map(|sa| sa.ip()).collect())
275 .unwrap_or_default();
276 for ip in &addrs {
277 if is_private_non_loopback_ip(ip) {
278 return Err(format!(
279 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
280 ));
281 }
282 }
283
284 Ok(())
285}
286
287fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
298 if ip.to_canonical().is_loopback() {
301 return false;
302 }
303 crate::url_fetch::is_private_or_reserved_ip(*ip)
304}
305
306fn build_openai_embeddings_endpoint(base_url: &str) -> String {
307 if base_url.ends_with("/v1") {
308 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
309 } else {
310 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
311 }
312}
313
314fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
315 if base_url.ends_with("/api") {
316 format!("{base_url}/embed")
317 } else {
318 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
319 }
320}
321
/// Trims surrounding whitespace from `value`; a missing or whitespace-only
/// value becomes `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    match raw.trim() {
        "" => None,
        trimmed => Some(trimmed.to_string()),
    }
}
332
333fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
334 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
335}
336
337fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
343 if !matches!(
344 status,
345 reqwest::StatusCode::BAD_REQUEST
346 | reqwest::StatusCode::CONFLICT
347 | reqwest::StatusCode::REQUEST_TIMEOUT
348 | reqwest::StatusCode::LOCKED
349 | reqwest::StatusCode::TOO_EARLY
350 ) {
351 return false;
352 }
353
354 let lower = raw.to_ascii_lowercase();
355 let normalized = lower.trim();
356
357 normalized.contains("model was unloaded while the request was still in queue")
358 || normalized == "model is loading"
359 || normalized.starts_with("model is loading,")
360 || normalized.contains(r#""error":"model is loading"#)
361 || normalized.contains(r#""message":"model is loading"#)
362 || normalized == "model not loaded"
363 || normalized.contains(r#""error":"model not loaded""#)
364 || normalized.contains(r#""message":"model not loaded""#)
365 || normalized == "loading model into memory"
366 || normalized.contains(r#""error":"loading model into memory""#)
367 || normalized.contains(r#""message":"loading model into memory""#)
368 || normalized == "model is being loaded"
369 || normalized.contains(r#""error":"model is being loaded""#)
370 || normalized.contains(r#""message":"model is being loaded""#)
371 || normalized == "model is currently loading"
372 || normalized.contains(r#""error":"model is currently loading""#)
373 || normalized.contains(r#""message":"model is currently loading""#)
374}
375
376fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
377 error.is_connect()
378}
379
380fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
386 error.is_connect() || error.is_timeout()
387}
388
389fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
390 embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
391}
392
/// Prefix that `send_embedding_request` puts on error messages for failures
/// it classifies as transient. Detect it with `embedding_failure_is_transient`
/// and remove it with `strip_transient_embedding_marker`.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
400
401pub fn embedding_failure_is_transient(error: &str) -> bool {
404 error.contains(TRANSIENT_EMBEDDING_MARKER)
405}
406
407pub fn strip_transient_embedding_marker(error: &str) -> String {
409 error.replace(TRANSIENT_EMBEDDING_MARKER, "")
410}
411
412fn sleep_before_embedding_retry(attempt_index: usize) {
413 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
414 std::thread::sleep(Duration::from_millis(*delay_ms));
415 }
416}
417
418fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
419where
420 F: FnMut() -> reqwest::blocking::RequestBuilder,
421{
422 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
423 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
424
425 let response = match make_request().send() {
426 Ok(response) => response,
427 Err(error) => {
428 if !last_attempt && is_retryable_embedding_error(&error) {
429 sleep_before_embedding_retry(attempt_index);
430 continue;
431 }
432 let marker = if embedding_send_error_is_transient(&error) {
436 TRANSIENT_EMBEDDING_MARKER
437 } else {
438 ""
439 };
440 return Err(format!("{marker}{backend_label} request failed: {error}"));
441 }
442 };
443
444 let status = response.status();
445 let raw = match response.text() {
446 Ok(raw) => raw,
447 Err(error) => {
448 if !last_attempt && embedding_response_read_error_is_transient(&error) {
449 sleep_before_embedding_retry(attempt_index);
450 continue;
451 }
452 let marker = if embedding_response_read_error_is_transient(&error) {
453 TRANSIENT_EMBEDDING_MARKER
454 } else {
455 ""
456 };
457 return Err(format!(
458 "{marker}{backend_label} response read failed: {error}"
459 ));
460 }
461 };
462
463 if status.is_success() {
464 return Ok(raw);
465 }
466
467 let body_transient = embedding_response_body_is_transient(status, &raw);
471 if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
472 sleep_before_embedding_retry(attempt_index);
473 continue;
474 }
475
476 let marker = if is_retryable_embedding_status(status) || body_transient {
482 TRANSIENT_EMBEDDING_MARKER
483 } else {
484 ""
485 };
486 return Err(format!(
487 "{marker}{backend_label} request failed (HTTP {}): {}",
488 status, raw
489 ));
490 }
491
492 unreachable!("embedding request retries exhausted without returning")
493}
494
495fn configured_embedding_timeout_ms(config: &SemanticBackendConfig) -> u64 {
496 if config.timeout_ms == 0 {
497 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
498 } else {
499 config.timeout_ms
500 }
501}
502
503impl SemanticEmbeddingModel {
504 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
505 Self::from_config_with_timeout_ms(config, configured_embedding_timeout_ms(config))
506 }
507
508 pub fn from_config_for_query(config: &SemanticBackendConfig) -> Result<Self, String> {
509 let timeout_ms =
510 configured_embedding_timeout_ms(config).min(DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS);
511 Self::from_config_with_timeout_ms(config, timeout_ms)
512 }
513
514 fn from_config_with_timeout_ms(
515 config: &SemanticBackendConfig,
516 timeout_ms: u64,
517 ) -> Result<Self, String> {
518 let max_batch_size = if config.max_batch_size == 0 {
519 DEFAULT_MAX_BATCH_SIZE
520 } else {
521 config.max_batch_size
522 };
523
524 let api_key_env = normalize_api_key(config.api_key_env.clone());
525 let model = config.model.clone();
526
527 let client = Client::builder()
528 .timeout(Duration::from_millis(timeout_ms))
529 .redirect(reqwest::redirect::Policy::none())
530 .build()
531 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
532
533 let engine = match config.backend {
534 SemanticBackend::Fastembed => {
535 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
536 }
537 SemanticBackend::OpenAiCompatible => {
538 let raw = config.base_url.as_ref().ok_or_else(|| {
539 "base_url is required for openai_compatible backend".to_string()
540 })?;
541 let base_url = normalize_base_url(raw)?;
542
543 let api_key = match api_key_env {
544 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
545 format!("missing api_key_env '{var_name}' for openai_compatible backend")
546 })?),
547 None => None,
548 };
549
550 SemanticEmbeddingEngine::OpenAiCompatible {
551 client,
552 model,
553 base_url,
554 api_key,
555 }
556 }
557 SemanticBackend::Ollama => {
558 let raw = config
559 .base_url
560 .as_ref()
561 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
562 let base_url = normalize_base_url(raw)?;
563
564 SemanticEmbeddingEngine::Ollama {
565 client,
566 model,
567 base_url,
568 }
569 }
570 };
571
572 Ok(Self {
573 backend: config.backend,
574 model: config.model.clone(),
575 base_url: config.base_url.clone(),
576 timeout_ms,
577 max_batch_size,
578 dimension: None,
579 engine,
580 query_embedding_cache: HashMap::new(),
581 query_embedding_cache_order: VecDeque::new(),
582 query_embedding_cache_hits: 0,
583 query_embedding_cache_misses: 0,
584 })
585 }
586
587 pub fn backend(&self) -> SemanticBackend {
588 self.backend
589 }
590
591 pub fn model(&self) -> &str {
592 &self.model
593 }
594
595 pub fn base_url(&self) -> Option<&str> {
596 self.base_url.as_deref()
597 }
598
599 pub fn max_batch_size(&self) -> usize {
600 self.max_batch_size
601 }
602
603 pub fn timeout_ms(&self) -> u64 {
604 self.timeout_ms
605 }
606
607 pub fn fingerprint(
608 &mut self,
609 config: &SemanticBackendConfig,
610 ) -> Result<SemanticIndexFingerprint, String> {
611 let dimension = self.dimension()?;
612 Ok(SemanticIndexFingerprint::from_config(config, dimension))
613 }
614
615 pub fn dimension(&mut self) -> Result<usize, String> {
616 if let Some(dimension) = self.dimension {
617 return Ok(dimension);
618 }
619
620 let dimension = match &mut self.engine {
621 SemanticEmbeddingEngine::Local(model) => {
622 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
623 vectors
624 .first()
625 .map(|v| v.len())
626 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
627 }
628 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
629 let vectors =
630 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
631 vectors
632 .first()
633 .map(|v| v.len())
634 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
635 }
636 SemanticEmbeddingEngine::Ollama { .. } => {
637 let vectors =
638 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
639 vectors
640 .first()
641 .map(|v| v.len())
642 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
643 }
644 };
645
646 self.dimension = Some(dimension);
647 Ok(dimension)
648 }
649
650 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
651 self.embed_texts(texts)
652 }
653
654 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
655 if let Some(vector) = self.query_embedding_cache.get(query) {
656 self.query_embedding_cache_hits += 1;
657 return Ok(vector.clone());
658 }
659
660 self.query_embedding_cache_misses += 1;
661 let embeddings = self.embed_texts(vec![query.to_string()])?;
662 let vector = embeddings
663 .first()
664 .cloned()
665 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
666
667 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
668 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
669 self.query_embedding_cache.remove(&oldest);
670 }
671 }
672 self.query_embedding_cache
673 .insert(query.to_string(), vector.clone());
674 self.query_embedding_cache_order
675 .push_back(query.to_string());
676
677 Ok(vector)
678 }
679
680 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
681 (
682 self.query_embedding_cache_hits,
683 self.query_embedding_cache_misses,
684 self.query_embedding_cache.len(),
685 )
686 }
687
688 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
689 match &mut self.engine {
690 SemanticEmbeddingEngine::Local(model) => model
691 .embed(&texts)
692 .map_err(|error| format!("failed to embed batch: {error}")),
693 SemanticEmbeddingEngine::OpenAiCompatible {
694 client,
695 model,
696 base_url,
697 api_key,
698 } => {
699 let expected_text_count = texts.len();
700 let endpoint = build_openai_embeddings_endpoint(base_url);
701 let body = serde_json::json!({
702 "input": texts,
703 "model": model,
704 });
705
706 let raw = send_embedding_request(
707 || {
708 let mut request = client.post(&endpoint).json(&body);
718
719 if let Some(api_key) = api_key {
720 request = request.header("Authorization", format!("Bearer {api_key}"));
721 }
722
723 request
724 },
725 "openai compatible",
726 )?;
727
728 #[derive(Deserialize)]
729 struct OpenAiResponse {
730 data: Vec<OpenAiEmbeddingResult>,
731 }
732
733 #[derive(Deserialize)]
734 struct OpenAiEmbeddingResult {
735 embedding: Vec<f32>,
736 index: Option<u32>,
737 }
738
739 let parsed: OpenAiResponse = serde_json::from_str(&raw)
740 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
741 if parsed.data.len() != expected_text_count {
742 return Err(format!(
743 "openai compatible response returned {} embeddings for {} inputs",
744 parsed.data.len(),
745 expected_text_count
746 ));
747 }
748
749 let mut vectors = vec![Vec::new(); parsed.data.len()];
750 for (i, item) in parsed.data.into_iter().enumerate() {
751 let index = item.index.unwrap_or(i as u32) as usize;
752 if index >= vectors.len() {
753 return Err(
754 "openai compatible response contains invalid vector index".to_string()
755 );
756 }
757 vectors[index] = item.embedding;
758 }
759
760 for vector in &vectors {
761 if vector.is_empty() {
762 return Err(
763 "openai compatible response contained missing vectors".to_string()
764 );
765 }
766 }
767
768 self.dimension = vectors.first().map(Vec::len);
769 Ok(vectors)
770 }
771 SemanticEmbeddingEngine::Ollama {
772 client,
773 model,
774 base_url,
775 } => {
776 let expected_text_count = texts.len();
777 let endpoint = build_ollama_embeddings_endpoint(base_url);
778
779 #[derive(Serialize)]
780 struct OllamaPayload<'a> {
781 model: &'a str,
782 input: Vec<String>,
783 }
784
785 let payload = OllamaPayload {
786 model,
787 input: texts,
788 };
789
790 let raw = send_embedding_request(
791 || {
792 client.post(&endpoint).json(&payload)
797 },
798 "ollama",
799 )?;
800
801 #[derive(Deserialize)]
802 struct OllamaResponse {
803 embeddings: Vec<Vec<f32>>,
804 }
805
806 let parsed: OllamaResponse = serde_json::from_str(&raw)
807 .map_err(|error| format!("invalid ollama response: {error}"))?;
808 if parsed.embeddings.is_empty() {
809 return Err("ollama response returned no embeddings".to_string());
810 }
811 if parsed.embeddings.len() != expected_text_count {
812 return Err(format!(
813 "ollama response returned {} embeddings for {} inputs",
814 parsed.embeddings.len(),
815 expected_text_count
816 ));
817 }
818
819 let vectors = parsed.embeddings;
820 for vector in &vectors {
821 if vector.is_empty() {
822 return Err("ollama response contained empty embeddings".to_string());
823 }
824 }
825
826 self.dimension = vectors.first().map(Vec::len);
827 Ok(vectors)
828 }
829 }
830 }
831}
832
/// Checks that an ONNX Runtime shared library can be loaded and, when its
/// version can be determined, that the version is 1.x with minor >= 20.
///
/// The library path is taken from the `ORT_DYLIB_PATH` environment variable,
/// falling back to the platform's default library name. The library is
/// loaded only for the check and released again before returning.
///
/// On platforms other than Linux, macOS and Windows no check is performed
/// and `Ok(())` is returned.
///
/// # Errors
///
/// Returns a message when the library cannot be loaded, or when a version
/// was detected and does not satisfy the requirement. An undetectable or
/// unparseable version is not treated as an error.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY (review note): `c_name` is a NUL-terminated string that
        // outlives the `dlopen` call; `handle` is checked for null before use
        // and closed exactly once below; `dlerror`'s pointer is checked for
        // null before being read.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Detect the version while the library is still loaded, so the
            // path it was actually loaded from can be resolved.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                // Only the first two dot-separated components are examined;
                // if either fails to parse the check is skipped.
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Win32 functions used to load the DLL and find the file it came from.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        // Win32 functions used to read the DLL's version resource.
        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-info record; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY (review note): `wide` is NUL-terminated and outlives the
        // load call; `handle` is checked for null and freed exactly once;
        // `vs_info` is dereferenced only after `VerQueryValueW` reports
        // success and the pointer is non-null. The returned length `vs_len`
        // is not checked against the struct size — confirm this is
        // acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 means "version not detected".
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" (NUL-terminated UTF-16) is the sub-block passed
                        // to query the fixed file-info record.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            // High 16 bits hold the major version, low 16
                            // bits the minor.
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1023
/// Resolves the filesystem path of the library behind a `dlopen` handle by
/// looking up the `OrtGetApiBase` symbol and asking `dladdr` which object
/// file contains it.
///
/// Returns `None` when the symbol is not found, `dladdr` fails, or no file
/// name is reported.
///
/// # Safety
///
/// `handle` is passed straight to `dlsym`; the caller must supply a handle
/// that is valid for that call (presumably one returned by `dlopen` and not
/// yet closed — confirm against callers).
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    // A zero return from `dladdr` means the address could not be matched.
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    // `info` is read only after `dladdr` reported success.
    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    Some(
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1048
1049#[cfg(any(target_os = "linux", target_os = "macos"))]
1050fn detect_ort_version_from_resolved_or_requested(
1051 resolved_path: Option<String>,
1052 requested_lib_name: &str,
1053) -> (Option<String>, String) {
1054 if let Some(path) = resolved_path {
1055 if let Some(version) = detect_ort_version_from_path(&path) {
1056 return (Some(version), path);
1057 }
1058 return (detect_ort_version_from_path(requested_lib_name), path);
1059 }
1060
1061 (
1062 detect_ort_version_from_path(requested_lib_name),
1063 requested_lib_name.to_string(),
1064 )
1065}
1066
1067#[cfg(any(target_os = "linux", target_os = "macos"))]
1068fn detect_ort_version_from_loaded_library(
1069 handle: *mut std::ffi::c_void,
1070 requested_lib_name: &str,
1071) -> (Option<String>, String) {
1072 detect_ort_version_from_resolved_or_requested(
1073 unsafe { loaded_library_path_from_handle(handle) },
1074 requested_lib_name,
1075 )
1076}
1077
1078#[cfg(any(target_os = "linux", target_os = "macos"))]
1081fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1082 let path = std::path::Path::new(lib_path);
1083
1084 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1086 .into_iter()
1087 .flatten()
1088 {
1089 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1090 if let Some(version) = extract_version_from_filename(name) {
1091 return Some(version);
1092 }
1093 }
1094 }
1095
1096 if let Some(parent) = path.parent() {
1098 if let Ok(entries) = std::fs::read_dir(parent) {
1099 for entry in entries.flatten() {
1100 if let Some(name) = entry.file_name().to_str() {
1101 if name.starts_with("libonnxruntime") {
1102 if let Some(version) = extract_version_from_filename(name) {
1103 return Some(version);
1104 }
1105 }
1106 }
1107 }
1108 }
1109 }
1110
1111 None
1112}
1113
1114#[cfg(any(target_os = "linux", target_os = "macos"))]
1116fn extract_version_from_filename(name: &str) -> Option<String> {
1117 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1119 re.find(name).map(|m| m.as_str().to_string())
1120}
1121
/// Suggests a shell command for removing the ONNX Runtime library at
/// `lib_path`.
///
/// Paths under `/usr/local/lib` and the bare default library names get a
/// platform-specific `sudo rm` suggestion on Linux and macOS. Everything
/// else, and every path on other platforms, gets a plain `rm` of the quoted
/// path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_wide_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
1134
1135pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1141 format!(
1142 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1143 Solutions:\n\
1144 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1145 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1146 configures the bridge to load it instead of the system library — no \
1147 changes to '{}'.\n\
1148 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1149 {}\n\
1150 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1151 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1152 version,
1153 lib_name,
1154 lib_name,
1155 suggest_removal_command(lib_name),
1156 )
1157}
1158
/// Heuristically decides whether `message` reports a missing or unloadable
/// ONNX Runtime library.
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// `ONNX Runtime not found.`, or when, case-insensitively, it mentions the
/// runtime by one of its names and also contains a phrase typical of a
/// dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1184
1185pub fn format_embedding_init_error(error: impl Display) -> String {
1186 let message = error.to_string();
1187
1188 if is_onnx_runtime_unavailable(&message) {
1189 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1190 }
1191
1192 format!("failed to initialize semantic embedding model: {message}")
1193}
1194
/// One embeddable unit of source code.
/// NOTE(review): the field descriptions below are inferred from names and
/// types; the code that builds chunks is not visible here — confirm.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk comes from.
    pub file: PathBuf,
    /// Name of the symbol the chunk represents (presumably).
    pub name: String,
    /// Kind of symbol.
    pub kind: SymbolKind,
    /// First line of the chunk (numbering base not visible here).
    pub start_line: u32,
    /// Last line of the chunk (inclusive/exclusive not visible here).
    pub end_line: u32,
    /// Whether the symbol is exported (presumably from its module).
    pub exported: bool,
    /// Text presumably sent to the embedding model.
    pub embed_text: String,
    /// Text presumably shown to the user in results.
    pub snippet: String,
}
1214
/// A chunk paired with the embedding vector computed (or reused) for its `embed_text`.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this vector describes.
    chunk: SemanticChunk,
    /// Embedding of `chunk.embed_text`; its length is what the index records as its dimension.
    vector: Vec<f32>,
}
1221
/// In-memory semantic index: embedded chunks plus the per-file freshness data
/// needed to decide which files must be re-embedded.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// Every embedded chunk currently in the index.
    entries: Vec<EmbeddingEntry>,
    /// Recorded modification time per file; its key set is what counts as "indexed".
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Recorded size per file; may lack entries until `backfill_missing_file_sizes` runs.
    file_sizes: HashMap<PathBuf, u64>,
    /// Recorded blake3 content hash per file; used with mtime and size for strict freshness checks.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Vector length the index expects; `search` returns nothing for queries of another length.
    dimension: usize,
    /// Optional fingerprint; `read_from_disk` compares it against the caller's expected value.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; `write_to_writer` passes it to `cache_relative_path` for every stored path.
    project_root: PathBuf,
    /// New files postponed by `refresh_invalidated_files` when the indexed-file cap was reached.
    deferred_files: HashSet<PathBuf>,
}
1237
/// Freshness data captured for one file while collecting its chunks.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time stored into `SemanticIndex::file_mtimes`.
    mtime: SystemTime,
    /// Size stored into `SemanticIndex::file_sizes`.
    size: u64,
    /// Content hash stored into `SemanticIndex::file_hashes`.
    content_hash: blake3::Hash,
}
1244
/// Counters describing the outcome of a single index refresh.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-processed successfully.
    pub changed: usize,
    /// Files indexed for the first time.
    pub added: usize,
    /// Files dropped from the index.
    pub deleted: usize,
    /// Number of files the refresh looked at, whether or not anything changed.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when nothing was changed, added, or deleted.
    ///
    /// `total_processed` is not consulted: a refresh that inspected files but
    /// altered nothing still counts as a no-op.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
1261
/// Result of `SemanticIndex::refresh_invalidated_files`.
///
/// The first three fields line up with the parameters of
/// `SemanticIndex::apply_refresh_update`.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Copies of the entries that the refresh appended to the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness data recorded for each file that was collected successfully.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh handled (requested plus previously deferred), sorted and deduplicated.
    pub completed_paths: Vec<PathBuf>,
    /// Counters for the refresh.
    pub summary: RefreshSummary,
}
1272
/// A previously computed vector kept alongside the exact text it was computed for,
/// so a hash match can be confirmed by comparing the text itself.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// The `embed_text` of the entry the vector was taken from.
    embed_text: String,
    /// The vector stored for that text.
    vector: Vec<f32>,
}
1278
/// File path -> blake3 hash of a chunk's `embed_text` -> embeddings already stored for that hash.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1280
/// One hit returned by `SemanticIndex::search`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched chunk.
    pub file: PathBuf,
    /// Name of the matched chunk.
    pub name: String,
    /// Symbol kind of the matched chunk.
    pub kind: SymbolKind,
    /// First line of the matched chunk (copied from the chunk).
    pub start_line: u32,
    /// Last line of the matched chunk (copied from the chunk).
    pub end_line: u32,
    /// Whether the matched chunk is flagged as exported.
    pub exported: bool,
    /// Snippet copied from the chunk.
    pub snippet: String,
    /// Similarity score; `search` multiplies it by 1.1 for exported chunks.
    pub score: f32,
    /// Origin tag of the hit; `search` always sets it to `"semantic"`.
    pub source: &'static str,
}
1294
1295impl SemanticIndex {
1296 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1297 debug_assert!(project_root.is_absolute());
1298 Self {
1299 entries: Vec::new(),
1300 file_mtimes: HashMap::new(),
1301 file_sizes: HashMap::new(),
1302 file_hashes: HashMap::new(),
1303 dimension,
1304 fingerprint: None,
1305 project_root,
1306 deferred_files: HashSet::new(),
1307 }
1308 }
1309
/// Number of embedded chunks currently held by the index.
pub fn entry_count(&self) -> usize {
    self.entries.len()
}
1314
/// Number of files with a recorded modification time, i.e. files the index tracks.
///
/// This can exceed the number of files that have entries: files that yielded
/// no chunks still get their metadata recorded.
pub fn indexed_file_count(&self) -> usize {
    self.file_mtimes.len()
}
1319
1320 pub fn status_label(&self) -> &'static str {
1322 if self.entries.is_empty() {
1323 "empty"
1324 } else {
1325 "ready"
1326 }
1327 }
1328
1329 fn collect_chunks(
1330 project_root: &Path,
1331 files: &[PathBuf],
1332 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1333 let collect_started = std::time::Instant::now();
1334 let per_file: Vec<(
1335 PathBuf,
1336 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1337 )> = files
1338 .par_iter()
1339 .map_init(HashMap::new, |parsers, file| {
1340 let result = collect_semantic_file(project_root, file, parsers);
1341 (file.clone(), result)
1342 })
1343 .collect();
1344
1345 let mut chunks: Vec<SemanticChunk> = Vec::new();
1346 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1347
1348 for (file, result) in per_file {
1349 match result {
1350 Ok((metadata, file_chunks)) => {
1351 file_metadata.insert(file, metadata);
1352 chunks.extend(file_chunks);
1353 }
1354 Err(error) => {
1355 if error == "unsupported file extension" {
1361 continue;
1362 }
1363 slog_warn!(
1364 "failed to collect semantic chunks for {}: {}",
1365 file.display(),
1366 error
1367 );
1368 }
1369 }
1370 }
1371
1372 slog_info!(
1373 "semantic collect: {} chunks from {} files in {} ms",
1374 chunks.len(),
1375 file_metadata.len(),
1376 collect_started.elapsed().as_millis()
1377 );
1378
1379 (chunks, file_metadata)
1380 }
1381
1382 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1383 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1384 let mut reuse_map: ChunkReuseMap = HashMap::new();
1385
1386 for entry in &self.entries {
1387 if !requested.contains(entry.chunk.file.as_path()) {
1388 continue;
1389 }
1390
1391 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1396 reuse_map
1397 .entry(entry.chunk.file.clone())
1398 .or_default()
1399 .entry(hash)
1400 .or_default()
1401 .push(ReusableEmbedding {
1402 embed_text: entry.chunk.embed_text.clone(),
1403 vector: entry.vector.clone(),
1404 });
1405 }
1406
1407 reuse_map
1408 }
1409
1410 fn reusable_vector_for_chunk(
1411 reuse_map: &ChunkReuseMap,
1412 chunk: &SemanticChunk,
1413 ) -> Option<Vec<f32>> {
1414 let hash = blake3::hash(chunk.embed_text.as_bytes());
1415 reuse_map
1416 .get(&chunk.file)?
1417 .get(&hash)?
1418 .iter()
1419 .find(|candidate| candidate.embed_text == chunk.embed_text)
1420 .map(|candidate| candidate.vector.clone())
1421 }
1422
1423 fn entries_for_chunks_with_reuse<F, P>(
1424 chunks: Vec<SemanticChunk>,
1425 reuse_map: &ChunkReuseMap,
1426 embed_fn: &mut F,
1427 max_batch_size: usize,
1428 initial_observed_dimension: Option<usize>,
1429 refresh_label: &str,
1430 progress: &mut P,
1431 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1432 where
1433 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1434 P: FnMut(usize, usize),
1435 {
1436 let total_chunks = chunks.len();
1437 progress(0, total_chunks);
1438
1439 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1440 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1441
1442 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1443 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1444 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1445 } else {
1446 misses.push((chunk_index, chunk));
1447 }
1448 }
1449
1450 let mut completed = total_chunks.saturating_sub(misses.len());
1451 if completed > 0 {
1452 progress(completed, total_chunks);
1453 }
1454
1455 let batch_size = max_batch_size.max(1);
1456 let mut observed_dimension = initial_observed_dimension;
1457
1458 for batch_start in (0..misses.len()).step_by(batch_size) {
1459 let batch_end = (batch_start + batch_size).min(misses.len());
1460 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1461 .iter()
1462 .map(|(_, chunk)| chunk.embed_text.clone())
1463 .collect();
1464
1465 let vectors = embed_fn(batch_texts)?;
1466 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1467
1468 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1469 match observed_dimension {
1470 None => observed_dimension = Some(dim),
1471 Some(expected) if dim != expected => {
1472 return Err(format!(
1473 "embedding dimension changed during {refresh_label}: \
1474 cached index uses {expected}, new vectors use {dim}"
1475 ));
1476 }
1477 _ => {}
1478 }
1479 }
1480
1481 for (i, vector) in vectors.into_iter().enumerate() {
1482 let (chunk_index, chunk) = misses[batch_start + i].clone();
1483 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1484 }
1485
1486 completed += batch_end - batch_start;
1487 progress(completed, total_chunks);
1488 }
1489
1490 let entries = entries_by_chunk
1491 .into_iter()
1492 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1493 .collect();
1494
1495 Ok((entries, observed_dimension))
1496 }
1497
1498 fn build_from_chunks<F, P>(
1499 project_root: &Path,
1500 chunks: Vec<SemanticChunk>,
1501 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1502 embed_fn: &mut F,
1503 max_batch_size: usize,
1504 mut progress: Option<&mut P>,
1505 ) -> Result<Self, String>
1506 where
1507 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1508 P: FnMut(usize, usize),
1509 {
1510 debug_assert!(project_root.is_absolute());
1511 let total_chunks = chunks.len();
1512
1513 if chunks.is_empty() {
1514 return Ok(Self {
1515 entries: Vec::new(),
1516 file_mtimes: file_metadata
1517 .iter()
1518 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1519 .collect(),
1520 file_sizes: file_metadata
1521 .iter()
1522 .map(|(path, metadata)| (path.clone(), metadata.size))
1523 .collect(),
1524 file_hashes: file_metadata
1525 .into_iter()
1526 .map(|(path, metadata)| (path, metadata.content_hash))
1527 .collect(),
1528 dimension: DEFAULT_DIMENSION,
1529 fingerprint: None,
1530 project_root: project_root.to_path_buf(),
1531 deferred_files: HashSet::new(),
1532 });
1533 }
1534
1535 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1537 let mut expected_dimension: Option<usize> = None;
1538 let batch_size = max_batch_size.max(1);
1539 let embed_started = std::time::Instant::now();
1540 let batch_count = total_chunks.div_ceil(batch_size);
1541 for batch_start in (0..chunks.len()).step_by(batch_size) {
1542 let batch_end = (batch_start + batch_size).min(chunks.len());
1543 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1544 .iter()
1545 .map(|c| c.embed_text.clone())
1546 .collect();
1547
1548 let vectors = embed_fn(batch_texts)?;
1549 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1550
1551 if let Some(dim) = vectors.first().map(|v| v.len()) {
1553 match expected_dimension {
1554 None => expected_dimension = Some(dim),
1555 Some(expected) if dim != expected => {
1556 return Err(format!(
1557 "embedding dimension changed across batches: expected {expected}, got {dim}"
1558 ));
1559 }
1560 _ => {}
1561 }
1562 }
1563
1564 for (i, vector) in vectors.into_iter().enumerate() {
1565 let chunk_idx = batch_start + i;
1566 entries.push(EmbeddingEntry {
1567 chunk: chunks[chunk_idx].clone(),
1568 vector,
1569 });
1570 }
1571
1572 if let Some(callback) = progress.as_mut() {
1573 callback(entries.len(), total_chunks);
1574 }
1575 }
1576
1577 let embed_ms = embed_started.elapsed().as_millis();
1578 let rate = (total_chunks as u128 * 1000)
1579 .checked_div(embed_ms)
1580 .unwrap_or(0) as u64;
1581 slog_info!(
1582 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1583 total_chunks,
1584 batch_count,
1585 embed_ms,
1586 rate
1587 );
1588
1589 let dimension = entries
1590 .first()
1591 .map(|e| e.vector.len())
1592 .unwrap_or(DEFAULT_DIMENSION);
1593
1594 Ok(Self {
1595 entries,
1596 file_mtimes: file_metadata
1597 .iter()
1598 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1599 .collect(),
1600 file_sizes: file_metadata
1601 .iter()
1602 .map(|(path, metadata)| (path.clone(), metadata.size))
1603 .collect(),
1604 file_hashes: file_metadata
1605 .into_iter()
1606 .map(|(path, metadata)| (path, metadata.content_hash))
1607 .collect(),
1608 dimension,
1609 fingerprint: None,
1610 project_root: project_root.to_path_buf(),
1611 deferred_files: HashSet::new(),
1612 })
1613 }
1614
1615 pub fn build<F>(
1618 project_root: &Path,
1619 files: &[PathBuf],
1620 embed_fn: &mut F,
1621 max_batch_size: usize,
1622 ) -> Result<Self, String>
1623 where
1624 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1625 {
1626 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1627 Self::build_from_chunks(
1628 project_root,
1629 chunks,
1630 file_mtimes,
1631 embed_fn,
1632 max_batch_size,
1633 Option::<&mut fn(usize, usize)>::None,
1634 )
1635 }
1636
1637 pub fn build_with_progress<F, P>(
1639 project_root: &Path,
1640 files: &[PathBuf],
1641 embed_fn: &mut F,
1642 max_batch_size: usize,
1643 progress: &mut P,
1644 ) -> Result<Self, String>
1645 where
1646 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1647 P: FnMut(usize, usize),
1648 {
1649 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1650 let total_chunks = chunks.len();
1651 progress(0, total_chunks);
1652 Self::build_from_chunks(
1653 project_root,
1654 chunks,
1655 file_mtimes,
1656 embed_fn,
1657 max_batch_size,
1658 Some(progress),
1659 )
1660 }
1661
/// Brings the index in line with `current_files`: drops files that are gone,
/// re-embeds files whose content changed, and embeds files seen for the first time.
///
/// * `project_root` — passed through to chunk collection.
/// * `current_files` — the complete set of files that should be indexed now.
/// * `embed_fn` — embeds the texts of chunks that cannot reuse a stored vector.
/// * `max_batch_size` — upper bound on texts per `embed_fn` call.
/// * `progress` — called as `(completed, total)`; `(0, 0)` when nothing is embedded.
///
/// Returns counters for the refresh. `changed` and `added` count only files whose
/// chunk collection succeeded; files whose collection failed keep their old entries.
///
/// # Errors
///
/// Propagates errors from embedding. Deleted and vanished files have already been
/// removed at that point, but entries of changed files have not yet been replaced.
///
/// # Panics
///
/// Panics if the strict freshness verification leaves one of its inputs without
/// a verdict.
pub fn refresh_stale_files<F, P>(
    &mut self,
    project_root: &Path,
    current_files: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    progress: &mut P,
) -> Result<RefreshSummary, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
    // Forget deferred files that are no longer part of the project.
    self.deferred_files
        .retain(|path| current_set.contains(path.as_path()));
    // Size of the union of current and indexed files: |current| + |indexed| - |overlap|.
    let total_processed = current_set.len() + self.file_mtimes.len()
        - self
            .file_mtimes
            .keys()
            .filter(|path| current_set.contains(path.as_path()))
            .count();

    // Classification of one indexed file. `Verified` is filled in after the
    // batched strict check; the other two are decided immediately.
    enum IndexedFileCheck {
        Deleted(PathBuf),
        MissingMetadata(PathBuf),
        Verified(PathBuf, FreshnessVerdict),
    }

    let mut deleted: Vec<PathBuf> = Vec::new();
    let mut changed: Vec<PathBuf> = Vec::new();
    let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
    let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
    let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();

    for indexed_path in indexed_paths {
        // Slot this path will occupy in `checks`.
        let check_index = checks.len();
        if !current_set.contains(indexed_path.as_path()) {
            checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
            continue;
        }
        // A strict check needs mtime, size and hash; a file missing any of them
        // is treated as changed.
        let cached = match (
            self.file_mtimes.get(&indexed_path),
            self.file_sizes.get(&indexed_path),
            self.file_hashes.get(&indexed_path),
        ) {
            (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                mtime: *mtime,
                size: *size,
                content_hash: *hash,
            }),
            _ => None,
        };
        if let Some(freshness) = cached {
            strict_verify_inputs.push((check_index, indexed_path, freshness));
            // Placeholder, replaced once the verdict is available.
            checks.push(None);
        } else {
            checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
        }
    }

    for (check_index, path, verdict) in
        cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
    {
        checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
    }

    for check in checks {
        // NOTE(review): assumes the verifier returns a verdict for every input;
        // a missing one panics here — confirm.
        match check.expect("strict freshness check should be populated") {
            IndexedFileCheck::Deleted(path) => deleted.push(path),
            IndexedFileCheck::MissingMetadata(path) => changed.push(path),
            IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
            // Content is unchanged: refresh the recorded mtime and size, no re-embedding.
            IndexedFileCheck::Verified(
                path,
                FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                },
            ) => {
                self.file_mtimes.insert(path.clone(), new_mtime);
                self.file_sizes.insert(path, new_size);
            }
            // A `Deleted` verdict for a file still listed in `current_files` is
            // queued as changed; the `vanished` pass below removes it if it is gone.
            IndexedFileCheck::Verified(
                path,
                FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
            ) => {
                changed.push(path);
            }
        }
    }

    // Current files the index has no record of.
    let mut added: Vec<PathBuf> = Vec::new();
    for path in current_files {
        if !self.file_mtimes.contains_key(path) {
            added.push(path.clone());
        }
    }

    if deleted.is_empty() && changed.is_empty() && added.is_empty() {
        // Nothing to do; only `total_processed` is reported.
        progress(0, 0);
        return Ok(RefreshSummary {
            total_processed,
            ..RefreshSummary::default()
        });
    }

    if !deleted.is_empty() {
        self.remove_indexed_files(&deleted);
    }

    let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
    to_embed.extend(changed.iter().cloned());
    to_embed.extend(added.iter().cloned());

    if to_embed.is_empty() {
        // Only deletions happened.
        progress(0, 0);
        return Ok(RefreshSummary {
            changed: 0,
            added: 0,
            deleted: deleted.len(),
            total_processed,
        });
    }

    // Capture reusable vectors of changed files before their entries are replaced.
    let reuse_map = self.build_chunk_reuse_map(&changed);
    let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
    let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
    // Changed files that could not be collected and no longer exist are treated as deleted.
    let vanished = to_embed
        .iter()
        .filter(|path| {
            changed_set.contains(path.as_path())
                && !fresh_metadata.contains_key(*path)
                && !path.exists()
        })
        .cloned()
        .collect::<Vec<_>>();
    if !vanished.is_empty() {
        self.remove_indexed_files(&vanished);
        deleted.extend(vanished);
    }

    if chunks.is_empty() {
        // Files were collected but yielded no chunks: drop their old entries
        // and record the new metadata without embedding anything.
        progress(0, 0);
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }
        let changed_count = changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        let added_count = added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file.clone(), metadata.content_hash);
        }
        return Ok(RefreshSummary {
            changed: changed_count,
            added: added_count,
            deleted: deleted.len(),
            total_processed,
        });
    }

    // With no entries in the index, the first embedded batch defines the dimension.
    let existing_dimension = if self.entries.is_empty() {
        None
    } else {
        Some(self.dimension)
    };
    let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
        chunks,
        &reuse_map,
        embed_fn,
        max_batch_size,
        existing_dimension,
        "incremental refresh",
        progress,
    )?;

    // Only files whose collection succeeded have their old entries replaced.
    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    if !successful_files.is_empty() {
        self.entries
            .retain(|entry| !successful_files.contains(&entry.chunk.file));
    }

    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        self.file_mtimes.insert(file.clone(), metadata.mtime);
        self.file_sizes.insert(file.clone(), metadata.size);
        self.file_hashes.insert(file, metadata.content_hash);
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    Ok(RefreshSummary {
        changed: changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        added: added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        deleted: deleted.len(),
        total_processed,
    })
}
1904
/// Re-indexes a specific set of paths, plus any files deferred by an earlier call.
///
/// * `project_root` — passed through to chunk collection.
/// * `paths` — files to refresh; they may be new, changed, or deleted.
/// * `embed_fn` — embeds the texts of chunks that cannot reuse a stored vector.
/// * `max_batch_size` — upper bound on texts per `embed_fn` call.
/// * `max_files` — cap on indexed files; new files beyond it are deferred.
/// * `progress` — called as `(completed, total)`; `(0, 0)` when nothing is embedded.
///
/// Returns the entries and metadata that were applied to this index, the full
/// list of handled paths, and summary counters.
///
/// # Errors
///
/// Propagates errors from embedding. The requested paths have already been
/// removed from the index at that point.
pub fn refresh_invalidated_files<F, P>(
    &mut self,
    project_root: &Path,
    paths: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    max_files: usize,
    progress: &mut P,
) -> Result<InvalidatedFilesRefresh, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    // Deferred files that disappeared in the meantime need no retry.
    self.deferred_files.retain(|path| path.exists());
    // Work list: the caller's paths plus everything still deferred, sorted and deduplicated.
    let mut requested_paths = paths.to_vec();
    requested_paths.extend(self.deferred_files.iter().cloned());
    requested_paths.sort();
    requested_paths.dedup();
    let total_processed = requested_paths.len();

    if requested_paths.is_empty() {
        progress(0, 0);
        return Ok(InvalidatedFilesRefresh {
            summary: RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    // Both must be captured before the removal below wipes the old state.
    let previously_indexed: HashSet<PathBuf> = requested_paths
        .iter()
        .filter(|path| self.file_mtimes.contains_key(*path))
        .cloned()
        .collect();
    let reuse_map = self.build_chunk_reuse_map(&requested_paths);

    // Drop entries and metadata for every requested path up front.
    self.remove_indexed_files(&requested_paths);

    let existing_paths = requested_paths
        .iter()
        .filter(|path| path.exists())
        .cloned()
        .collect::<Vec<_>>();
    // Only paths that were indexed before and are now gone count as deleted.
    let deleted = requested_paths
        .iter()
        .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
        .count();

    if existing_paths.is_empty() {
        for path in &requested_paths {
            if !path.exists() {
                self.deferred_files.remove(path);
            }
        }
        progress(0, 0);
        return Ok(InvalidatedFilesRefresh {
            completed_paths: requested_paths,
            summary: RefreshSummary {
                deleted,
                total_processed,
                ..RefreshSummary::default()
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

    // Cap enforcement: re-indexed files keep their slot, so only brand-new
    // files compete for what is left under `max_files`.
    let retained_file_count = self.file_mtimes.len();
    let changed_successful_count = existing_paths
        .iter()
        .filter(|path| {
            previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
        })
        .count();
    let available_new_files =
        max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
    let new_successful_files = existing_paths
        .iter()
        .filter(|path| {
            !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
        })
        .cloned()
        .collect::<Vec<_>>();
    if new_successful_files.len() > available_new_files {
        // `existing_paths` is sorted, so the first new files in path order are admitted.
        let allowed_new_files = new_successful_files
            .iter()
            .take(available_new_files)
            .cloned()
            .collect::<HashSet<_>>();
        let deferred_new_files = new_successful_files
            .into_iter()
            .filter(|path| !allowed_new_files.contains(path))
            .collect::<HashSet<_>>();

        // Discard collected data for deferred files so they are neither embedded nor recorded.
        fresh_metadata.retain(|file, _| {
            previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
        });
        chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

        if !deferred_new_files.is_empty() {
            for path in &deferred_new_files {
                self.deferred_files.insert(path.clone());
            }
            slog_warn!(
                "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                deferred_new_files.len(),
                max_files
            );
        }
    }

    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    let changed = successful_files
        .iter()
        .filter(|path| previously_indexed.contains(path.as_path()))
        .count();
    let added = successful_files.len().saturating_sub(changed);
    let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

    if chunks.is_empty() {
        // Nothing to embed: record metadata only; `added_entries` stays empty.
        progress(0, 0);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }

        return Ok(InvalidatedFilesRefresh {
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    // The dimension is left open only when the index has no entries and none
    // of the requested paths were indexed before.
    let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
    {
        None
    } else {
        Some(self.dimension)
    };
    let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
        chunks,
        &reuse_map,
        embed_fn,
        max_batch_size,
        initial_observed_dimension,
        "invalidated-file refresh",
        progress,
    )?;

    // One copy is returned to the caller, the other goes into this index.
    let added_entries = new_entries.clone();
    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        let freshness = FileFreshness {
            mtime: metadata.mtime,
            size: metadata.size,
            content_hash: metadata.content_hash,
        };
        self.file_mtimes.insert(file.clone(), freshness.mtime);
        self.file_sizes.insert(file.clone(), freshness.size);
        self.file_hashes
            .insert(file.clone(), freshness.content_hash);
        updated_metadata.push((file, freshness));
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    Ok(InvalidatedFilesRefresh {
        added_entries,
        updated_metadata,
        completed_paths: requested_paths,
        summary: RefreshSummary {
            changed,
            added,
            deleted,
            total_processed,
        },
    })
}
2115
2116 pub fn apply_refresh_update(
2117 &mut self,
2118 added_entries: Vec<EmbeddingEntry>,
2119 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2120 completed_paths: &[PathBuf],
2121 ) {
2122 self.remove_indexed_files(completed_paths);
2126
2127 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2128 self.entries.extend(added_entries);
2129 for (file, freshness) in updated_metadata {
2130 self.file_mtimes.insert(file.clone(), freshness.mtime);
2131 self.file_sizes.insert(file.clone(), freshness.size);
2132 self.file_hashes.insert(file, freshness.content_hash);
2133 }
2134 if let Some(dim) = observed_dimension {
2135 self.dimension = dim;
2136 }
2137 }
2138
2139 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2140 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2141 self.entries
2142 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2143 for path in files {
2144 self.file_mtimes.remove(path);
2145 self.file_sizes.remove(path);
2146 self.file_hashes.remove(path);
2147 }
2148 }
2149
2150 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2152 if self.entries.is_empty() || query_vector.len() != self.dimension {
2153 return Vec::new();
2154 }
2155
2156 let mut scored: Vec<(f32, usize)> = self
2157 .entries
2158 .iter()
2159 .enumerate()
2160 .map(|(i, entry)| {
2161 let mut score = cosine_similarity(query_vector, &entry.vector);
2162 if entry.chunk.exported {
2163 score *= 1.1;
2164 }
2165 (score, i)
2166 })
2167 .collect();
2168
2169 let keep = top_k.min(scored.len());
2170 if keep == 0 {
2171 return Vec::new();
2172 }
2173
2174 if keep < scored.len() {
2175 scored.select_nth_unstable_by(keep, semantic_score_order);
2176 scored.truncate(keep);
2177 }
2178 scored.sort_by(semantic_score_order);
2179
2180 scored
2181 .into_iter()
2182 .map(|(score, idx)| {
2186 let entry = &self.entries[idx];
2187 SemanticResult {
2188 file: entry.chunk.file.clone(),
2189 name: entry.chunk.name.clone(),
2190 kind: entry.chunk.kind.clone(),
2191 start_line: entry.chunk.start_line,
2192 end_line: entry.chunk.end_line,
2193 exported: entry.chunk.exported,
2194 snippet: entry.chunk.snippet.clone(),
2195 score,
2196 source: "semantic",
2197 }
2198 })
2199 .collect()
2200 }
2201
/// Number of entries in the index (same value as `entry_count`).
pub fn len(&self) -> usize {
    self.entries.len()
}
2206
2207 pub fn is_file_stale(&self, file: &Path) -> bool {
2209 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2210 return true;
2211 };
2212 let Some(stored_size) = self.file_sizes.get(file) else {
2213 return true;
2214 };
2215 let Some(stored_hash) = self.file_hashes.get(file) else {
2216 return true;
2217 };
2218 let cached = FileFreshness {
2219 mtime: *stored_mtime,
2220 size: *stored_size,
2221 content_hash: *stored_hash,
2222 };
2223 match cache_freshness::verify_file_strict(file, &cached) {
2224 FreshnessVerdict::HotFresh => false,
2225 FreshnessVerdict::ContentFresh { .. } => false,
2226 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2227 }
2228 }
2229
2230 fn backfill_missing_file_sizes(&mut self) {
2231 for path in self.file_mtimes.keys() {
2232 if self.file_sizes.contains_key(path) {
2233 continue;
2234 }
2235 if let Ok(metadata) = fs::metadata(path) {
2236 self.file_sizes.insert(path.clone(), metadata.len());
2237 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2238 self.file_hashes.insert(path.clone(), hash);
2239 }
2240 }
2241 }
2242 }
2243
/// Removes `file` from the index; simply forwards to `invalidate_file`.
pub fn remove_file(&mut self, file: &Path) {
    self.invalidate_file(file);
}
2248
2249 pub fn invalidate_file(&mut self, file: &Path) {
2250 let canonical_file = canonicalize_existing_or_deleted_path(file);
2251 self.entries
2252 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2253 self.file_mtimes.remove(file);
2254 self.file_sizes.remove(file);
2255 self.file_hashes.remove(file);
2256 if canonical_file.as_path() != file {
2257 self.file_mtimes.remove(&canonical_file);
2258 self.file_sizes.remove(&canonical_file);
2259 self.file_hashes.remove(&canonical_file);
2260 }
2261 }
2262
/// Vector length the index expects; `search` ignores queries of any other length.
pub fn dimension(&self) -> usize {
    self.dimension
}
2267
/// Fingerprint attached to the index, or `None` if none has been set.
pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
    self.fingerprint.as_ref()
}
2271
2272 pub fn backend_label(&self) -> Option<&str> {
2273 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2274 }
2275
2276 pub fn model_label(&self) -> Option<&str> {
2277 self.fingerprint.as_ref().map(|f| f.model.as_str())
2278 }
2279
/// Attaches `fingerprint` to the index, replacing any previous one.
pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
    self.fingerprint = Some(fingerprint);
}
2283
2284 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2286 if self.entries.is_empty() {
2289 slog_info!("skipping semantic index persistence (0 entries)");
2290 return;
2291 }
2292 let dir = storage_dir.join("semantic").join(project_key);
2293 if let Err(e) = fs::create_dir_all(&dir) {
2294 slog_warn!("failed to create semantic cache dir: {}", e);
2295 return;
2296 }
2297 let data_path = dir.join("semantic.bin");
2298 let tmp_path = dir.join(format!(
2299 "semantic.bin.tmp.{}.{}",
2300 std::process::id(),
2301 SystemTime::now()
2302 .duration_since(SystemTime::UNIX_EPOCH)
2303 .unwrap_or(Duration::ZERO)
2304 .as_nanos()
2305 ));
2306 let write_result = (|| -> io::Result<usize> {
2307 let file = fs::File::create(&tmp_path)?;
2308 let mut writer = BufWriter::new(file);
2309 let bytes_written = self.write_to_writer(&mut writer)?;
2310 writer.flush()?;
2311 writer.get_ref().sync_all()?;
2312 Ok(bytes_written)
2313 })();
2314 let bytes_written = match write_result {
2315 Ok(bytes_written) => bytes_written,
2316 Err(e) => {
2317 slog_warn!("failed to write semantic index: {}", e);
2318 let _ = fs::remove_file(&tmp_path);
2319 return;
2320 }
2321 };
2322 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2323 slog_warn!("failed to rename semantic index: {}", e);
2324 let _ = fs::remove_file(&tmp_path);
2325 return;
2326 }
2327 slog_info!(
2328 "semantic index persisted: {} entries, {:.1} KB",
2329 self.entries.len(),
2330 bytes_written as f64 / 1024.0
2331 );
2332 }
2333
2334 pub fn read_from_disk(
2336 storage_dir: &Path,
2337 project_key: &str,
2338 current_canonical_root: &Path,
2339 is_worktree_bridge: bool,
2340 expected_fingerprint: Option<&str>,
2341 ) -> Option<Self> {
2342 debug_assert!(current_canonical_root.is_absolute());
2343 let data_path = storage_dir
2344 .join("semantic")
2345 .join(project_key)
2346 .join("semantic.bin");
2347 let file = fs::File::open(&data_path).ok()?;
2348 let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2349 if file_len < HEADER_BYTES_V1 {
2350 slog_warn!(
2351 "corrupt semantic index (too small: {} bytes), removing",
2352 file_len
2353 );
2354 if !is_worktree_bridge {
2355 let _ = fs::remove_file(&data_path);
2356 }
2357 return None;
2358 }
2359
2360 let mut reader = BufReader::new(file);
2361 let mut version_buf = [0u8; 1];
2362 reader.read_exact(&mut version_buf).ok()?;
2363 let version = version_buf[0];
2364 if version != SEMANTIC_INDEX_VERSION_V6 {
2365 slog_info!(
2366 "cached semantic index version {} is older than {}, rebuilding",
2367 version,
2368 SEMANTIC_INDEX_VERSION_V6
2369 );
2370 if !is_worktree_bridge {
2371 let _ = fs::remove_file(&data_path);
2372 }
2373 return None;
2374 }
2375 match Self::from_reader_after_version(
2376 reader,
2377 version,
2378 current_canonical_root,
2379 Some(file_len),
2380 1,
2381 ) {
2382 Ok(index) => {
2383 if index.entries.is_empty() {
2384 slog_info!("cached semantic index is empty, will rebuild");
2385 if !is_worktree_bridge {
2386 let _ = fs::remove_file(&data_path);
2387 }
2388 return None;
2389 }
2390 if let Some(expected) = expected_fingerprint {
2391 let matches = index
2392 .fingerprint()
2393 .map(|fingerprint| fingerprint.matches_expected(expected))
2394 .unwrap_or(false);
2395 if !matches {
2396 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2397 if !is_worktree_bridge {
2398 let _ = fs::remove_file(&data_path);
2399 }
2400 return None;
2401 }
2402 }
2403 slog_info!(
2404 "loaded semantic index from disk: {} entries",
2405 index.entries.len()
2406 );
2407 Some(index)
2408 }
2409 Err(e) => {
2410 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2411 if !is_worktree_bridge {
2412 let _ = fs::remove_file(&data_path);
2413 }
2414 None
2415 }
2416 }
2417 }
2418
2419 pub fn to_bytes(&self) -> Vec<u8> {
2421 let mut buf = Vec::new();
2422 self.write_to_writer(&mut buf)
2423 .expect("writing semantic index to Vec cannot fail");
2424 buf
2425 }
2426
2427 fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2428 let mut bytes_written = 0usize;
2429 let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2430 let encoded = fingerprint.as_string();
2431 if encoded.is_empty() {
2432 None
2433 } else {
2434 Some(encoded)
2435 }
2436 });
2437 let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2438 let file_mtime_count = self
2439 .file_mtimes
2440 .iter()
2441 .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2442 .count();
2443 let entry_count = self
2444 .entries
2445 .iter()
2446 .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2447 .count();
2448
2449 let version = SEMANTIC_INDEX_VERSION_V6;
2462 write_counted(writer, &[version], &mut bytes_written)?;
2463 write_counted(
2464 writer,
2465 &(self.dimension as u32).to_le_bytes(),
2466 &mut bytes_written,
2467 )?;
2468 write_counted(
2469 writer,
2470 &(entry_count as u32).to_le_bytes(),
2471 &mut bytes_written,
2472 )?;
2473 write_counted(
2474 writer,
2475 &(fp_bytes_ref.len() as u32).to_le_bytes(),
2476 &mut bytes_written,
2477 )?;
2478 write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2479
2480 write_counted(
2483 writer,
2484 &(file_mtime_count as u32).to_le_bytes(),
2485 &mut bytes_written,
2486 )?;
2487 for (path, mtime) in &self.file_mtimes {
2488 let Some(relative) = cache_relative_path(&self.project_root, path) else {
2489 continue;
2490 };
2491 let relative = relative.to_string_lossy();
2492 let path_bytes = relative.as_bytes();
2493 write_counted(
2494 writer,
2495 &(path_bytes.len() as u32).to_le_bytes(),
2496 &mut bytes_written,
2497 )?;
2498 write_counted(writer, path_bytes, &mut bytes_written)?;
2499 let duration = mtime
2500 .duration_since(SystemTime::UNIX_EPOCH)
2501 .unwrap_or_default();
2502 write_counted(
2503 writer,
2504 &duration.as_secs().to_le_bytes(),
2505 &mut bytes_written,
2506 )?;
2507 write_counted(
2508 writer,
2509 &duration.subsec_nanos().to_le_bytes(),
2510 &mut bytes_written,
2511 )?;
2512 let size = self.file_sizes.get(path).copied().unwrap_or_default();
2513 write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2514 let hash = self
2515 .file_hashes
2516 .get(path)
2517 .copied()
2518 .unwrap_or_else(cache_freshness::zero_hash);
2519 write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2520 }
2521
2522 for entry in &self.entries {
2524 let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2525 continue;
2526 };
2527 let c = &entry.chunk;
2528
2529 let relative = relative.to_string_lossy();
2531 let file_bytes = relative.as_bytes();
2532 write_counted(
2533 writer,
2534 &(file_bytes.len() as u32).to_le_bytes(),
2535 &mut bytes_written,
2536 )?;
2537 write_counted(writer, file_bytes, &mut bytes_written)?;
2538
2539 let name_bytes = c.name.as_bytes();
2541 write_counted(
2542 writer,
2543 &(name_bytes.len() as u32).to_le_bytes(),
2544 &mut bytes_written,
2545 )?;
2546 write_counted(writer, name_bytes, &mut bytes_written)?;
2547
2548 write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2550
2551 write_counted(
2553 writer,
2554 &(c.start_line as u32).to_le_bytes(),
2555 &mut bytes_written,
2556 )?;
2557 write_counted(
2558 writer,
2559 &(c.end_line as u32).to_le_bytes(),
2560 &mut bytes_written,
2561 )?;
2562 write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2563
2564 let snippet_bytes = c.snippet.as_bytes();
2566 write_counted(
2567 writer,
2568 &(snippet_bytes.len() as u32).to_le_bytes(),
2569 &mut bytes_written,
2570 )?;
2571 write_counted(writer, snippet_bytes, &mut bytes_written)?;
2572
2573 let embed_bytes = c.embed_text.as_bytes();
2575 write_counted(
2576 writer,
2577 &(embed_bytes.len() as u32).to_le_bytes(),
2578 &mut bytes_written,
2579 )?;
2580 write_counted(writer, embed_bytes, &mut bytes_written)?;
2581
2582 for &val in &entry.vector {
2584 write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2585 }
2586 }
2587
2588 Ok(bytes_written)
2589 }
2590
2591 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2593 debug_assert!(current_canonical_root.is_absolute());
2594 if data.len() < HEADER_BYTES_V1 {
2595 return Err("data too short".to_string());
2596 }
2597
2598 Self::from_reader_after_version(
2599 Cursor::new(&data[1..]),
2600 data[0],
2601 current_canonical_root,
2602 Some(data.len()),
2603 1,
2604 )
2605 }
2606
    /// Decodes the remainder of a serialized index after its version byte has
    /// already been consumed.
    ///
    /// * `reader` — positioned just past the version byte.
    /// * `version` — the version byte that was read; versions 1 through 6 are
    ///   accepted.
    /// * `current_canonical_root` — root that v6 (relative) paths are resolved
    ///   against; also stored as the index's `project_root`.
    /// * `total_len` — total size of the serialized data when known; used to
    ///   reject declared lengths that exceed the available bytes before
    ///   allocating for them.
    /// * `bytes_read` — number of bytes consumed before this call; seeds the
    ///   running byte counter.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message for an unsupported version, truncated
    /// data, out-of-range counts or dimension, an unparsable fingerprint, an
    /// invalid mtime, a v6 path that `cached_path_under_root` rejects, or an
    /// entry whose file has no metadata record.
    fn from_reader_after_version<R: Read>(
        reader: R,
        version: u8,
        current_canonical_root: &Path,
        total_len: Option<usize>,
        bytes_read: usize,
    ) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        let mut reader = CountingReader::with_bytes_read(reader, bytes_read);

        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        // v2 and later have a larger fixed header; reject obviously short data
        // up front when the total length is known.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32_stream(&mut reader)? as usize;
        let entry_count = read_u32_stream(&mut reader)? as usize;
        validate_embedding_dimension(dimension)?;
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        // The length-prefixed fingerprint field exists from v2 onward.
        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32_stream(&mut reader)? as usize;
            // Check the declared length against the remaining data before
            // allocating a buffer of that size.
            if total_len
                .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
            {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            if fingerprint_len == 0 {
                None
            } else {
                let mut raw = vec![0u8; fingerprint_len];
                read_exact_stream(
                    &mut reader,
                    &mut raw,
                    "unexpected end of data reading fingerprint",
                )?;
                // The fingerprint is stored as JSON text.
                let raw = String::from_utf8_lossy(&raw).to_string();
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        let mtime_count = read_u32_stream(&mut reader)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // Lower bound on the bytes the vectors alone need; bail out before
        // allocating if the remaining data cannot possibly hold them.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        // Per-file metadata records: path, mtime, and (depending on version)
        // size and content hash.
        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string_stream(&mut reader, total_len)?;
            let secs = read_u64_stream(&mut reader)?;
            // Sub-second mtime precision is stored from v3 onward.
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32_stream(&mut reader)?
            } else {
                0
            };
            // File size is stored from v5 onward.
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64_stream(&mut reader)?
                } else {
                    0
                };
            // A 32-byte content hash is stored in v6; older versions get the
            // zero hash.
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                let mut hash_bytes = [0u8; 32];
                read_exact_stream(
                    &mut reader,
                    &mut hash_bytes,
                    "unexpected end of data reading content hash",
                )?;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            // Reject out-of-range nanoseconds explicitly before building the
            // `Duration`.
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // v6 paths go through `cached_path_under_root`; older versions use
            // the stored path as-is.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        // Entry records: chunk metadata followed by `dimension` f32 components.
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &raw_file)
                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
            } else {
                raw_file
            };
            let name = read_string_stream(&mut reader, total_len)?;

            let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);

            let start_line = read_u32_stream(&mut reader)?;
            let end_line = read_u32_stream(&mut reader)?;

            let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;

            let snippet = read_string_stream(&mut reader, total_len)?;
            let embed_text = read_string_stream(&mut reader, total_len)?;

            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let mut bytes = [0u8; F32_BYTES];
                read_exact_stream(
                    &mut reader,
                    &mut bytes,
                    "unexpected end of data reading vector",
                )?;
                vector.push(f32::from_le_bytes(bytes));
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        // Sanity check that the decoded entry count matches the header.
        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must refer to a file that has a metadata record.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
            deferred_files: HashSet::new(),
        })
    }
2847}
2848
/// Writes all of `bytes` to `writer` and, on success, adds their length to
/// `bytes_written` (saturating at `usize::MAX`).
///
/// The counter is left unchanged when the write fails.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2858
/// `Read` adapter that keeps a running total of the bytes read through it.
struct CountingReader<R> {
    inner: R,
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the running total at `bytes_read`.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Running total: the starting value plus every byte read since.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Delegates to the wrapped reader and adds the number of bytes it
    /// returned to the running total (saturating); errors leave it unchanged.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let outcome = self.inner.read(buf);
        if let Ok(count) = &outcome {
            self.bytes_read = self.bytes_read.saturating_add(*count);
        }
        outcome
    }
}
2881
2882fn read_exact_stream<R: Read>(
2883 reader: &mut CountingReader<R>,
2884 buf: &mut [u8],
2885 eof_message: &'static str,
2886) -> Result<(), String> {
2887 reader.read_exact(buf).map_err(|error| {
2888 if error.kind() == io::ErrorKind::UnexpectedEof {
2889 eof_message.to_string()
2890 } else {
2891 format!("{eof_message}: {error}")
2892 }
2893 })
2894}
2895
2896fn read_u8_stream<R: Read>(
2897 reader: &mut CountingReader<R>,
2898 eof_message: &'static str,
2899) -> Result<u8, String> {
2900 let mut bytes = [0u8; 1];
2901 read_exact_stream(reader, &mut bytes, eof_message)?;
2902 Ok(bytes[0])
2903}
2904
2905fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2906 let mut bytes = [0u8; 4];
2907 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2908 Ok(u32::from_le_bytes(bytes))
2909}
2910
2911fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2912 let mut bytes = [0u8; 8];
2913 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2914 Ok(u64::from_le_bytes(bytes))
2915}
2916
2917fn read_string_stream<R: Read>(
2918 reader: &mut CountingReader<R>,
2919 total_len: Option<usize>,
2920) -> Result<String, String> {
2921 let len = read_u32_stream(reader)? as usize;
2922 if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2923 return Err("unexpected end of data reading string".to_string());
2924 }
2925 let mut bytes = vec![0u8; len];
2926 read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2927 Ok(String::from_utf8_lossy(&bytes).to_string())
2928}
2929
/// A source text split into lines once, together with the byte offset at
/// which each line begins.
struct SourceLineCache<'a> {
    lines: Vec<&'a str>,
    line_starts: Vec<usize>,
}

impl<'a> SourceLineCache<'a> {
    /// Splits `source` with `str::lines` and records where each line starts,
    /// stepping over the `\n` or `\r\n` terminator that follows it.
    fn new(source: &'a str) -> Self {
        let raw = source.as_bytes();
        let lines: Vec<&'a str> = source.lines().collect();

        let mut cursor = 0usize;
        let line_starts: Vec<usize> = lines
            .iter()
            .map(|line| {
                let begin = cursor;
                let after = begin + line.len();
                cursor = match (raw.get(after), raw.get(after + 1)) {
                    (Some(b'\r'), Some(b'\n')) => after + 2,
                    (Some(b'\n'), _) => after + 1,
                    _ => after,
                };
                begin
            })
            .collect();

        Self { lines, line_starts }
    }

    /// Number of lines in the cache.
    fn len(&self) -> usize {
        let count = self.line_starts.len();
        debug_assert_eq!(self.lines.len(), count);
        count
    }
}
2958
2959fn build_embed_text_with_lines(
2961 symbol: &Symbol,
2962 line_cache: &SourceLineCache<'_>,
2963 file: &Path,
2964 project_root: &Path,
2965) -> String {
2966 let relative = file
2967 .strip_prefix(project_root)
2968 .unwrap_or(file)
2969 .to_string_lossy();
2970
2971 let kind_label = match &symbol.kind {
2972 SymbolKind::Function => "function",
2973 SymbolKind::Class => "class",
2974 SymbolKind::Method => "method",
2975 SymbolKind::Struct => "struct",
2976 SymbolKind::Interface => "interface",
2977 SymbolKind::Enum => "enum",
2978 SymbolKind::TypeAlias => "type",
2979 SymbolKind::Variable => "variable",
2980 SymbolKind::Heading => "heading",
2981 SymbolKind::FileSummary => "file-summary",
2982 };
2983
2984 let name = &symbol.name;
2986 let mut text = format!(
2987 "name:{name} file:{} kind:{} name:{name}",
2988 relative, kind_label
2989 );
2990
2991 if let Some(sig) = &symbol.signature {
2992 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
3000 }
3001
3002 let start = (symbol.range.start_line as usize).min(line_cache.len());
3004 let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3006 if start < end {
3007 let body: String = line_cache.lines[start..end]
3008 .iter()
3009 .take(15) .copied()
3011 .collect::<Vec<&str>>()
3012 .join("\n");
3013 let snippet = if body.len() > 300 {
3014 format!("{}...", &body[..body.floor_char_boundary(300)])
3015 } else {
3016 body
3017 };
3018 text.push_str(&format!(" body:{}", snippet));
3019 }
3020
3021 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
3026}
3027
/// Test-only convenience: builds the embed text straight from raw `source`
/// by creating a temporary line cache.
#[cfg(test)]
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    build_embed_text_with_lines(symbol, &SourceLineCache::new(source), file, project_root)
}
3033
/// Maximum length, in `char`s, of the embed text built for a chunk; longer
/// text is cut with `truncate_chars`.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
3038
/// Returns the first `max_chars` characters of `value` as an owned string,
/// or all of `value` when it is shorter than that.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is where to cut.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
3042
3043fn first_leading_doc_comment(line_cache: &SourceLineCache<'_>) -> String {
3044 let Some((start, first)) = line_cache
3045 .lines
3046 .iter()
3047 .enumerate()
3048 .find(|(_, line)| !line.trim().is_empty())
3049 else {
3050 return String::new();
3051 };
3052
3053 let trimmed = first.trim_start();
3054 if trimmed.starts_with("/**") {
3055 let mut comment = Vec::new();
3056 for line in line_cache.lines.iter().skip(start) {
3057 comment.push(*line);
3058 if line.contains("*/") {
3059 break;
3060 }
3061 }
3062 return truncate_chars(&comment.join("\n"), 200);
3063 }
3064
3065 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3066 let comment = line_cache
3067 .lines
3068 .iter()
3069 .skip(start)
3070 .take_while(|line| {
3071 let trimmed = line.trim_start();
3072 trimmed.starts_with("///") || trimmed.starts_with("//!")
3073 })
3074 .copied()
3075 .collect::<Vec<_>>()
3076 .join("\n");
3077 return truncate_chars(&comment, 200);
3078 }
3079
3080 String::new()
3081}
3082
3083pub fn build_file_summary_chunk(
3084 file: &Path,
3085 project_root: &Path,
3086 source: &str,
3087 top_exports: &[&str],
3088 top_export_signatures: &[Option<&str>],
3089) -> SemanticChunk {
3090 let line_cache = SourceLineCache::new(source);
3091 build_file_summary_chunk_with_lines(
3092 file,
3093 project_root,
3094 &line_cache,
3095 top_exports,
3096 top_export_signatures,
3097 )
3098}
3099
3100fn build_file_summary_chunk_with_lines(
3101 file: &Path,
3102 project_root: &Path,
3103 line_cache: &SourceLineCache<'_>,
3104 top_exports: &[&str],
3105 top_export_signatures: &[Option<&str>],
3106) -> SemanticChunk {
3107 let relative = file.strip_prefix(project_root).unwrap_or(file);
3108 let rel_path = relative.to_string_lossy();
3109 let parent_dir = relative
3110 .parent()
3111 .map(|parent| parent.to_string_lossy().to_string())
3112 .unwrap_or_default();
3113 let name = file
3114 .file_stem()
3115 .map(|stem| stem.to_string_lossy().to_string())
3116 .unwrap_or_default();
3117 let doc = first_leading_doc_comment(line_cache);
3118 let exports = top_exports
3119 .iter()
3120 .take(5)
3121 .copied()
3122 .collect::<Vec<_>>()
3123 .join(",");
3124 let snippet = if doc.is_empty() {
3125 top_export_signatures
3126 .first()
3127 .and_then(|signature| signature.as_deref())
3128 .map(|signature| truncate_chars(signature, 200))
3129 .unwrap_or_default()
3130 } else {
3131 doc.clone()
3132 };
3133
3134 SemanticChunk {
3135 file: file.to_path_buf(),
3136 name,
3137 kind: SymbolKind::FileSummary,
3138 start_line: 0,
3139 end_line: 0,
3140 exported: false,
3141 embed_text: truncate_chars(
3142 &format!(
3143 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3144 file.file_stem()
3145 .map(|stem| stem.to_string_lossy().to_string())
3146 .unwrap_or_default()
3147 ),
3148 MAX_EMBED_TEXT_CHARS,
3149 ),
3150 snippet,
3151 }
3152}
3153
3154fn parser_for(
3155 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3156 lang: crate::parser::LangId,
3157) -> Result<&mut Parser, String> {
3158 use std::collections::hash_map::Entry;
3159
3160 match parsers.entry(lang) {
3161 Entry::Occupied(entry) => Ok(entry.into_mut()),
3162 Entry::Vacant(entry) => {
3163 let grammar = grammar_for(lang);
3164 let mut parser = Parser::new();
3165 parser
3166 .set_language(&grammar)
3167 .map_err(|error| error.to_string())?;
3168 Ok(entry.insert(parser))
3169 }
3170 }
3171}
3172
/// Reports whether `path` has a file extension that the semantic index
/// handles.
///
/// The comparison is case-sensitive (both `r` and `R` are listed explicitly).
/// Paths without an extension, or whose extension is not valid UTF-8, are
/// rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr", "java", "kt", "kts", "rb", "swift", "scala", "sc",
        "lua", "pl", "pm", "t", "r", "R",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3223
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// When `path` itself cannot be canonicalized, its parent directory is
/// canonicalized instead and the file name is re-attached. If that fails too,
/// or the path has no parent or no file name, `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
            Ok(canonical_parent) => canonical_parent.join(file_name),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
3240
/// Size threshold (4 MiB) above which a file is not read or parsed for
/// semantic chunks; such files produce an empty chunk list.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3251
3252fn collect_semantic_file(
3253 project_root: &Path,
3254 file: &Path,
3255 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3256) -> Result<(IndexedFileMetadata, Vec<SemanticChunk>), String> {
3257 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3258 if !metadata.is_file() {
3259 return Err("not a regular file".to_string());
3260 }
3261 let mtime = metadata.modified().map_err(|error| error.to_string())?;
3262 let size = metadata.len();
3263
3264 if !is_semantic_indexed_extension(file) {
3265 return Err("unsupported file extension".to_string());
3266 }
3267 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3268
3269 let mut indexed_metadata = IndexedFileMetadata {
3270 mtime,
3271 size,
3272 content_hash: cache_freshness::zero_hash(),
3273 };
3274
3275 if size > MAX_SEMANTIC_FILE_BYTES {
3278 return Ok((indexed_metadata, Vec::new()));
3279 }
3280
3281 let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
3282 indexed_metadata.content_hash = if size <= cache_freshness::CONTENT_HASH_SIZE_CAP {
3283 cache_freshness::hash_bytes(source.as_bytes())
3284 } else {
3285 cache_freshness::zero_hash()
3286 };
3287
3288 let chunks = collect_file_chunks_from_source(project_root, file, lang, parsers, &source)?;
3289 Ok((indexed_metadata, chunks))
3290}
3291
/// Test-only helper: reads `file` and returns its semantic chunks without
/// recording metadata.
///
/// Unsupported extensions are an error; files larger than
/// `MAX_SEMANTIC_FILE_BYTES` yield an empty list without being read.
#[cfg(test)]
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    let Some(lang) = is_semantic_indexed_extension(file)
        .then(|| detect_language(file))
        .flatten()
    else {
        return Err("unsupported file extension".to_string());
    };

    // A failed stat is not treated as oversized; the read below reports it.
    let oversized = matches!(fs::metadata(file), Ok(stat) if stat.len() > MAX_SEMANTIC_FILE_BYTES);
    if oversized {
        return Ok(Vec::new());
    }

    match fs::read_to_string(file) {
        Ok(source) => collect_file_chunks_from_source(project_root, file, lang, parsers, &source),
        Err(error) => Err(error.to_string()),
    }
}
3310
3311fn collect_file_chunks_from_source(
3312 project_root: &Path,
3313 file: &Path,
3314 lang: crate::parser::LangId,
3315 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3316 source: &str,
3317) -> Result<Vec<SemanticChunk>, String> {
3318 let tree = parser_for(parsers, lang)?
3319 .parse(source, None)
3320 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3321 let symbols =
3322 extract_symbols_from_tree(source, &tree, lang).map_err(|error| error.to_string())?;
3323
3324 Ok(symbols_to_chunks(file, &symbols, source, project_root))
3325}
3326
3327fn build_snippet_with_lines(symbol: &Symbol, line_cache: &SourceLineCache<'_>) -> String {
3329 let start = (symbol.range.start_line as usize).min(line_cache.len());
3330 let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3332 if start < end {
3333 let snippet_lines: Vec<&str> = line_cache.lines[start..end]
3334 .iter()
3335 .take(5)
3336 .copied()
3337 .collect();
3338 let mut snippet = snippet_lines.join("\n");
3339 if end - start > 5 {
3340 snippet.push_str("\n ...");
3341 }
3342 if snippet.len() > 300 {
3343 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3344 }
3345 snippet
3346 } else {
3347 String::new()
3348 }
3349}
3350
/// Test-only convenience: builds a symbol's snippet straight from raw
/// `source` by creating a temporary line cache.
#[cfg(test)]
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    build_snippet_with_lines(symbol, &SourceLineCache::new(source))
}
3356
3357fn symbols_to_chunks(
3359 file: &Path,
3360 symbols: &[Symbol],
3361 source: &str,
3362 project_root: &Path,
3363) -> Vec<SemanticChunk> {
3364 let line_cache = SourceLineCache::new(source);
3365 let mut chunks = Vec::new();
3366 let top_exports_with_signatures = symbols
3367 .iter()
3368 .filter(|symbol| {
3369 symbol.exported
3370 && symbol.parent.is_none()
3371 && !matches!(symbol.kind, SymbolKind::Heading)
3372 })
3373 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3374 .collect::<Vec<_>>();
3375
3376 let has_only_headings = !symbols.is_empty()
3377 && symbols
3378 .iter()
3379 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3380 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3381 let top_exports = top_exports_with_signatures
3382 .iter()
3383 .map(|(name, _)| *name)
3384 .collect::<Vec<_>>();
3385 let top_export_signatures = top_exports_with_signatures
3386 .iter()
3387 .map(|(_, signature)| *signature)
3388 .collect::<Vec<_>>();
3389 chunks.push(build_file_summary_chunk_with_lines(
3390 file,
3391 project_root,
3392 &line_cache,
3393 &top_exports,
3394 &top_export_signatures,
3395 ));
3396 }
3397
3398 for symbol in symbols {
3399 if matches!(symbol.kind, SymbolKind::Heading) {
3404 continue;
3405 }
3406
3407 let line_count = symbol
3409 .range
3410 .end_line
3411 .saturating_sub(symbol.range.start_line)
3412 + 1;
3413 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3414 continue;
3415 }
3416
3417 let embed_text = build_embed_text_with_lines(symbol, &line_cache, file, project_root);
3418 let snippet = build_snippet_with_lines(symbol, &line_cache);
3419
3420 chunks.push(SemanticChunk {
3421 file: file.to_path_buf(),
3422 name: symbol.name.clone(),
3423 kind: symbol.kind.clone(),
3424 start_line: symbol.range.start_line,
3425 end_line: symbol.range.end_line,
3426 exported: symbol.exported,
3427 embed_text,
3428 snippet,
3429 });
3430
3431 }
3434
3435 chunks
3436}
3437
/// Comparator for ranking `(score, index)` pairs: higher scores sort first,
/// and equal scores are ordered by ascending index.
///
/// NaN scores sort after every real score and are ordered among themselves by
/// index. This keeps the comparator a total order, which the standard sort
/// routines require; treating NaN as "equal" to every other score would not
/// be transitive. For non-NaN scores the ordering is plain `partial_cmp`
/// descending, so `0.0` and `-0.0` still compare as equal.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields `Some`.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3443
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the lengths differ or when the product of the two
/// norms is zero (for example, when either vector is all zeros or both are
/// empty).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in a single pass.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3467
3468fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3470 match kind {
3471 SymbolKind::Function => 0,
3472 SymbolKind::Class => 1,
3473 SymbolKind::Method => 2,
3474 SymbolKind::Struct => 3,
3475 SymbolKind::Interface => 4,
3476 SymbolKind::Enum => 5,
3477 SymbolKind::TypeAlias => 6,
3478 SymbolKind::Variable => 7,
3479 SymbolKind::Heading => 8,
3480 SymbolKind::FileSummary => 9,
3481 }
3482}
3483
3484fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3485 match v {
3486 0 => SymbolKind::Function,
3487 1 => SymbolKind::Class,
3488 2 => SymbolKind::Method,
3489 3 => SymbolKind::Struct,
3490 4 => SymbolKind::Interface,
3491 5 => SymbolKind::Enum,
3492 6 => SymbolKind::TypeAlias,
3493 7 => SymbolKind::Variable,
3494 8 => SymbolKind::Heading,
3495 9 => SymbolKind::FileSummary,
3496 _ => SymbolKind::Heading,
3497 }
3498}
3499
3500#[cfg(test)]
3501mod tests {
3502 use super::*;
3503 use crate::config::{SemanticBackend, SemanticBackendConfig};
3504 use crate::parser::FileParser;
3505 use std::io::{Read, Write};
3506 use std::net::TcpListener;
3507 use std::thread;
3508
    /// `.inc`, `.php` and `.scss` files must be accepted by
    /// `is_semantic_indexed_extension`.
    #[test]
    fn semantic_index_includes_php_inc_and_scss_extensions() {
        for file in ["partial.inc", "index.php", "styles.scss"] {
            assert!(
                is_semantic_indexed_extension(Path::new(file)),
                "{file} should be semantic-index eligible"
            );
        }
    }
3518
    /// A message prefixed with `TRANSIENT_EMBEDDING_MARKER` is classified as
    /// transient and loses the marker when stripped; unmarked messages are
    /// classified as not transient and pass through stripping unchanged.
    #[test]
    fn transient_marker_round_trips_and_classifies() {
        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
        assert!(embedding_failure_is_transient(&marked));
        let clean = strip_transient_embedding_marker(&marked);
        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
        assert!(clean.starts_with("openai compatible request failed:"));

        // None of these carry the marker, so none may be treated as transient.
        for permanent in [
            "openai compatible request failed (HTTP 401): Unauthorized",
            "embedding dimension mismatch: index has 384, model returned 768",
            "too many files (>20000) for semantic indexing (max 20000)",
        ] {
            assert!(
                !embedding_failure_is_transient(permanent),
                "{permanent:?} must not be transient"
            );
            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
        }
    }
3544
    /// `is_retryable_embedding_status` accepts 500 and 429 and rejects 401
    /// and 400.
    #[test]
    fn send_error_transience_separates_connect_timeout_from_4xx() {
        assert!(is_retryable_embedding_status(
            reqwest::StatusCode::INTERNAL_SERVER_ERROR
        ));
        assert!(is_retryable_embedding_status(
            reqwest::StatusCode::TOO_MANY_REQUESTS
        ));
        assert!(!is_retryable_embedding_status(
            reqwest::StatusCode::UNAUTHORIZED
        ));
        assert!(!is_retryable_embedding_status(
            reqwest::StatusCode::BAD_REQUEST
        ));
    }
3561
    /// Checks how `embedding_response_body_is_transient` classifies response
    /// bodies: "model loading / unloaded" bodies on a 400 are transient, other
    /// 400 bodies are not, and a 401 is never transient regardless of body.
    #[test]
    fn local_backend_model_loading_body_is_transient() {
        // Bodies that indicate the model is not ready yet.
        for body in [
            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
            r#"{"error":"model is loading, please wait"}"#,
            r#"{"error":"Model not loaded"}"#,
            "Loading model into memory",
        ] {
            assert!(
                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
                "{body:?} should be body-transient"
            );
        }

        // Bodies that must stay permanent, including ones that merely mention
        // loading or models in another context.
        for body in [
            r#"{"error":"invalid api key"}"#,
            r#"{"error":"model 'foo' not found"}"#,
            "Bad Request: unknown field",
            "Bad Request: invalid loading model option",
            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
        ] {
            assert!(
                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
                "{body:?} must not be body-transient"
            );
        }

        assert!(
            !embedding_response_body_is_transient(
                reqwest::StatusCode::UNAUTHORIZED,
                r#"{"error":"model is loading, please wait"}"#
            ),
            "permanent auth failures must not become transient because of body text"
        );
    }
3602
3603 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3604 where
3605 F: Fn(String, String, String) -> String + Send + 'static,
3606 {
3607 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3608 let addr = listener.local_addr().expect("local addr");
3609 let handle = thread::spawn(move || {
3610 let (mut stream, _) = listener.accept().expect("accept request");
3611 let mut buf = Vec::new();
3612 let mut chunk = [0u8; 4096];
3613 let mut header_end = None;
3614 let mut content_length = 0usize;
3615 loop {
3616 let n = stream.read(&mut chunk).expect("read request");
3617 if n == 0 {
3618 break;
3619 }
3620 buf.extend_from_slice(&chunk[..n]);
3621 if header_end.is_none() {
3622 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3623 header_end = Some(pos + 4);
3624 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3625 for line in headers.lines() {
3626 if let Some(value) = line.strip_prefix("Content-Length:") {
3627 content_length = value.trim().parse::<usize>().unwrap_or(0);
3628 }
3629 }
3630 }
3631 }
3632 if let Some(end) = header_end {
3633 if buf.len() >= end + content_length {
3634 break;
3635 }
3636 }
3637 }
3638
3639 let end = header_end.expect("header terminator");
3640 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3641 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3642 let mut lines = request.lines();
3643 let request_line = lines.next().expect("request line").to_string();
3644 let path = request_line
3645 .split_whitespace()
3646 .nth(1)
3647 .expect("request path")
3648 .to_string();
3649 let response_body = handler(request_line, path, body);
3650 let response = format!(
3651 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3652 response_body.len(),
3653 response_body
3654 );
3655 stream
3656 .write_all(response.as_bytes())
3657 .expect("write response");
3658 });
3659
3660 (format!("http://{}", addr), handle)
3661 }
3662
3663 fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3664 let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3665 listener
3666 .set_nonblocking(true)
3667 .expect("nonblocking listener");
3668 let addr = listener.local_addr().expect("local addr");
3669 let handle = thread::spawn(move || {
3670 let deadline = std::time::Instant::now() + Duration::from_secs(2);
3671 let mut accepted = 0usize;
3672 while accepted < attempts && std::time::Instant::now() < deadline {
3673 match listener.accept() {
3674 Ok((mut stream, _)) => {
3675 accepted += 1;
3676 let mut buf = [0u8; 4096];
3677 let _ = stream.read(&mut buf);
3685 let response = "HTTP/1.1 200 OK
3686Content-Type: application/json
3687Content-Length: 128
3688Connection: close
3689
3690{";
3691 let _ = stream.write_all(response.as_bytes());
3692 }
3693 Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3694 thread::sleep(Duration::from_millis(10));
3695 }
3696 Err(error) => panic!("accept request: {error}"),
3697 }
3698 }
3699 });
3700
3701 (format!("http://{}", addr), handle)
3702 }
3703
3704 #[test]
3705 fn response_body_read_failures_are_marked_transient() {
3706 let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3707 let client = Client::builder()
3708 .timeout(Duration::from_millis(250))
3709 .build()
3710 .expect("client");
3711
3712 let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3713 .expect_err("truncated body should fail");
3714
3715 handle.join().unwrap();
3716 assert!(
3717 embedding_failure_is_transient(&error),
3718 "body read failures should be transient-marked: {error}"
3719 );
3720 assert!(error.contains("response read failed"));
3721 }
3722
3723 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3724 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3725 }
3726
3727 fn write_rust_file(path: &Path, function_name: &str) {
3728 fs::write(
3729 path,
3730 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3731 )
3732 .unwrap();
3733 }
3734
3735 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3736 let mut embed = test_vector_for_texts;
3737 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3738 }
3739
3740 fn test_project_root() -> PathBuf {
3741 std::env::current_dir().unwrap()
3742 }
3743
3744 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3745 index.file_mtimes.insert(file.to_path_buf(), mtime);
3746 index.file_sizes.insert(file.to_path_buf(), size);
3747 index
3748 .file_hashes
3749 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3750 }
3751
3752 fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
3753 let mut buf = Vec::new();
3754 let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
3755 let encoded = fingerprint.as_string();
3756 if encoded.is_empty() {
3757 None
3758 } else {
3759 Some(encoded.into_bytes())
3760 }
3761 });
3762 let file_mtimes: Vec<_> = index
3763 .file_mtimes
3764 .iter()
3765 .filter_map(|(path, mtime)| {
3766 cache_relative_path(&index.project_root, path)
3767 .map(|relative| (relative, path, mtime))
3768 })
3769 .collect();
3770 let entries: Vec<_> = index
3771 .entries
3772 .iter()
3773 .filter_map(|entry| {
3774 cache_relative_path(&index.project_root, &entry.chunk.file)
3775 .map(|relative| (relative, entry))
3776 })
3777 .collect();
3778
3779 buf.push(SEMANTIC_INDEX_VERSION_V6);
3780 buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
3781 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
3782 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
3783 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
3784 buf.extend_from_slice(fp_bytes_ref);
3785
3786 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
3787 for (relative, path, mtime) in &file_mtimes {
3788 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
3789 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
3790 buf.extend_from_slice(&path_bytes);
3791 let duration = mtime
3792 .duration_since(SystemTime::UNIX_EPOCH)
3793 .unwrap_or_default();
3794 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
3795 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
3796 let size = index.file_sizes.get(*path).copied().unwrap_or_default();
3797 buf.extend_from_slice(&size.to_le_bytes());
3798 let hash = index
3799 .file_hashes
3800 .get(*path)
3801 .copied()
3802 .unwrap_or_else(cache_freshness::zero_hash);
3803 buf.extend_from_slice(hash.as_bytes());
3804 }
3805
3806 for (relative, entry) in &entries {
3807 let c = &entry.chunk;
3808 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
3809 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
3810 buf.extend_from_slice(&file_bytes);
3811
3812 let name_bytes = c.name.as_bytes();
3813 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
3814 buf.extend_from_slice(name_bytes);
3815
3816 buf.push(symbol_kind_to_u8(&c.kind));
3817 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
3818 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
3819 buf.push(c.exported as u8);
3820
3821 let snippet_bytes = c.snippet.as_bytes();
3822 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
3823 buf.extend_from_slice(snippet_bytes);
3824
3825 let embed_bytes = c.embed_text.as_bytes();
3826 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
3827 buf.extend_from_slice(embed_bytes);
3828
3829 for &val in &entry.vector {
3830 buf.extend_from_slice(&val.to_le_bytes());
3831 }
3832 }
3833
3834 buf
3835 }
3836
3837 #[derive(Default)]
3838 struct RecordingEmbedder {
3839 calls: Vec<Vec<String>>,
3840 }
3841
3842 impl RecordingEmbedder {
3843 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3844 let vectors = texts
3845 .iter()
3846 .map(|text| deterministic_test_vector(text))
3847 .collect();
3848 self.calls.push(texts);
3849 Ok(vectors)
3850 }
3851
3852 fn total_embedded_texts(&self) -> usize {
3853 self.calls.iter().map(Vec::len).sum()
3854 }
3855
3856 fn embedded_texts(&self) -> Vec<&str> {
3857 self.calls
3858 .iter()
3859 .flat_map(|batch| batch.iter().map(String::as_str))
3860 .collect()
3861 }
3862 }
3863
3864 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3865 let hash = blake3::hash(text.as_bytes());
3866 let bytes = hash.as_bytes();
3867 vec![
3868 1.0,
3869 bytes[0] as f32 / 255.0,
3870 bytes[1] as f32 / 255.0,
3871 bytes[2] as f32 / 255.0,
3872 ]
3873 }
3874
3875 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3876 let mut embedder = RecordingEmbedder::default();
3877 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3878 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3879 }
3880
3881 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3882 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3883 }
3884
3885 fn write_source(path: &Path, source: &str) {
3886 if let Some(parent) = path.parent() {
3887 fs::create_dir_all(parent).unwrap();
3888 }
3889 fs::write(path, source).unwrap();
3890 }
3891
3892 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3893 index
3894 .entries
3895 .iter()
3896 .filter(|entry| entry.chunk.file == file)
3897 .collect()
3898 }
3899
3900 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3901 index
3902 .entries
3903 .iter()
3904 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3905 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3906 }
3907
3908 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3909 index
3910 .entries
3911 .iter()
3912 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3913 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3914 }
3915
3916 #[test]
3917 fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3918 let temp = tempfile::tempdir().unwrap();
3919 let project_root = temp.path();
3920 let file = project_root.join("src/lib.rs");
3921 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3922 write_source(&file, original);
3923
3924 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3925 let original_entry_count = index.entries.len();
3926 let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3927
3928 write_source(&file, &format!("\n{original}"));
3929 force_stale(&mut index, &file);
3930
3931 let mut embedder = RecordingEmbedder::default();
3932 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3933 let mut progress = |_done: usize, _total: usize| {};
3934 let summary = index
3935 .refresh_stale_files(
3936 project_root,
3937 std::slice::from_ref(&file),
3938 &mut embed,
3939 16,
3940 &mut progress,
3941 )
3942 .unwrap();
3943
3944 assert_eq!(summary.changed, 1);
3945 assert_eq!(embedder.total_embedded_texts(), 0);
3946 assert_eq!(index.entries.len(), original_entry_count);
3947 let shifted_alpha = entry_by_name(&index, &file, "alpha");
3948 assert_eq!(shifted_alpha.chunk.start_line, 1);
3949 assert_eq!(shifted_alpha.vector, original_alpha_vector);
3950 }
3951
3952 #[test]
3953 fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3954 let temp = tempfile::tempdir().unwrap();
3955 let project_root = temp.path();
3956 let file = project_root.join("src/lib.rs");
3957 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3958 write_source(&file, original);
3959
3960 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3961 let mut serving_index = worker_index.clone();
3962 let original_entry_count = worker_index.entries.len();
3963
3964 write_source(&file, &format!("\n{original}"));
3965
3966 let mut embedder = RecordingEmbedder::default();
3967 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3968 let mut progress = |_done: usize, _total: usize| {};
3969 let update = worker_index
3970 .refresh_invalidated_files(
3971 project_root,
3972 std::slice::from_ref(&file),
3973 &mut embed,
3974 16,
3975 100,
3976 &mut progress,
3977 )
3978 .unwrap();
3979
3980 assert_eq!(embedder.total_embedded_texts(), 0);
3981 assert_eq!(update.added_entries.len(), original_entry_count);
3982 assert_eq!(worker_index.entries.len(), original_entry_count);
3983
3984 serving_index.apply_refresh_update(
3985 update.added_entries,
3986 update.updated_metadata,
3987 &update.completed_paths,
3988 );
3989
3990 assert_eq!(serving_index.entries.len(), original_entry_count);
3991 assert_eq!(
3992 entries_for_file(&serving_index, &file).len(),
3993 original_entry_count
3994 );
3995 assert_eq!(
3996 entry_by_name(&serving_index, &file, "alpha")
3997 .chunk
3998 .start_line,
3999 1
4000 );
4001 }
4002
4003 #[test]
4004 fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
4005 let temp = tempfile::tempdir().unwrap();
4006 let project_root = temp.path();
4007 let file = project_root.join("src/lib.rs");
4008 write_source(
4009 &file,
4010 "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
4011 );
4012
4013 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4014 let original_entry_count = index.entries.len();
4015 let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
4016
4017 write_source(
4018 &file,
4019 "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
4020 );
4021
4022 let mut embedder = RecordingEmbedder::default();
4023 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4024 let mut progress = |_done: usize, _total: usize| {};
4025 let update = index
4026 .refresh_invalidated_files(
4027 project_root,
4028 std::slice::from_ref(&file),
4029 &mut embed,
4030 16,
4031 100,
4032 &mut progress,
4033 )
4034 .unwrap();
4035
4036 assert_eq!(embedder.total_embedded_texts(), 1);
4037 assert!(embedder.embedded_texts()[0].contains("name:alpha"));
4038 assert_eq!(update.added_entries.len(), original_entry_count);
4039 assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
4040 }
4041
4042 #[test]
4043 fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
4044 let temp = tempfile::tempdir().unwrap();
4045 let project_root = temp.path();
4046 let file = project_root.join("src/dupe.js");
4047 let one_duplicate = "function duplicate() {\n return 1;\n}\n";
4048 write_source(&file, one_duplicate);
4049
4050 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4051 let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
4052
4053 write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
4054
4055 let mut embedder = RecordingEmbedder::default();
4056 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4057 let mut progress = |_done: usize, _total: usize| {};
4058 index
4059 .refresh_invalidated_files(
4060 project_root,
4061 std::slice::from_ref(&file),
4062 &mut embed,
4063 16,
4064 100,
4065 &mut progress,
4066 )
4067 .unwrap();
4068
4069 let duplicate_entries = index
4070 .entries
4071 .iter()
4072 .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
4073 .collect::<Vec<_>>();
4074 assert_eq!(duplicate_entries.len(), 2);
4075 assert_eq!(embedder.total_embedded_texts(), 0);
4076 assert_eq!(duplicate_entries[0].vector, original_vector);
4077 assert_eq!(duplicate_entries[1].vector, original_vector);
4078 }
4079
4080 #[test]
4081 fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
4082 let temp = tempfile::tempdir().unwrap();
4083 let project_root = temp.path();
4084 let file = project_root.join("src/lib.rs");
4085 write_source(
4086 &file,
4087 "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n",
4088 );
4089
4090 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4091 let summary_before = file_summary_entry(&index, &file).vector.clone();
4092
4093 write_source(
4094 &file,
4095 "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n",
4096 );
4097 let mut body_embedder = RecordingEmbedder::default();
4098 let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
4099 let mut progress = |_done: usize, _total: usize| {};
4100 index
4101 .refresh_invalidated_files(
4102 project_root,
4103 std::slice::from_ref(&file),
4104 &mut body_embed,
4105 16,
4106 100,
4107 &mut progress,
4108 )
4109 .unwrap();
4110 assert_eq!(body_embedder.total_embedded_texts(), 1);
4111 assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
4112 assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
4113
4114 write_source(
4115 &file,
4116 "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n",
4117 );
4118 let mut doc_embedder = RecordingEmbedder::default();
4119 let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
4120 index
4121 .refresh_invalidated_files(
4122 project_root,
4123 std::slice::from_ref(&file),
4124 &mut doc_embed,
4125 16,
4126 100,
4127 &mut progress,
4128 )
4129 .unwrap();
4130
4131 assert_eq!(doc_embedder.total_embedded_texts(), 1);
4132 assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
4133 assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
4134 }
4135
4136 #[test]
4137 fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4138 let temp = tempfile::tempdir().unwrap();
4139 let project_root = temp.path();
4140 let file = project_root.join("src/lib.rs");
4141 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4142
4143 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4144 let mut serving_index = worker_index.clone();
4145 fs::remove_file(&file).unwrap();
4146
4147 let mut embedder = RecordingEmbedder::default();
4148 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4149 let mut progress = |_done: usize, _total: usize| {};
4150 let update = worker_index
4151 .refresh_invalidated_files(
4152 project_root,
4153 std::slice::from_ref(&file),
4154 &mut embed,
4155 16,
4156 100,
4157 &mut progress,
4158 )
4159 .unwrap();
4160
4161 assert_eq!(update.summary.deleted, 1);
4162 assert_eq!(embedder.total_embedded_texts(), 0);
4163 assert!(worker_index.entries.is_empty());
4164
4165 serving_index.apply_refresh_update(
4166 update.added_entries,
4167 update.updated_metadata,
4168 &update.completed_paths,
4169 );
4170 assert!(serving_index.entries.is_empty());
4171 }
4172
4173 #[test]
4174 fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4175 let temp = tempfile::tempdir().unwrap();
4176 let project_root = temp.path();
4177 let file = project_root.join("src/lib.rs");
4178 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4179
4180 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4181 let mut serving_index = worker_index.clone();
4182 fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4183
4184 let mut embedder = RecordingEmbedder::default();
4185 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4186 let mut progress = |_done: usize, _total: usize| {};
4187 let update = worker_index
4188 .refresh_invalidated_files(
4189 project_root,
4190 std::slice::from_ref(&file),
4191 &mut embed,
4192 16,
4193 100,
4194 &mut progress,
4195 )
4196 .unwrap();
4197
4198 assert_eq!(embedder.total_embedded_texts(), 0);
4199 assert!(update.added_entries.is_empty());
4200 assert!(worker_index.entries.is_empty());
4201 assert!(!worker_index.file_mtimes.contains_key(&file));
4202
4203 serving_index.apply_refresh_update(
4204 update.added_entries,
4205 update.updated_metadata,
4206 &update.completed_paths,
4207 );
4208 assert!(serving_index.entries.is_empty());
4209 assert!(!serving_index.file_mtimes.contains_key(&file));
4210 }
4211
4212 #[test]
4213 fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4214 let temp = tempfile::tempdir().unwrap();
4215 let project_root = temp.path();
4216 let indexed = project_root.join("src/a.rs");
4217 let deferred = project_root.join("src/b.rs");
4218 write_source(&indexed, "pub fn alpha() -> i32 {\n 1\n}\n");
4219 write_source(&deferred, "pub fn beta() -> i32 {\n 2\n}\n");
4220
4221 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4222 let mut embedder = RecordingEmbedder::default();
4223 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4224 let mut progress = |_done: usize, _total: usize| {};
4225 let update = index
4226 .refresh_invalidated_files(
4227 project_root,
4228 std::slice::from_ref(&deferred),
4229 &mut embed,
4230 16,
4231 1,
4232 &mut progress,
4233 )
4234 .unwrap();
4235
4236 assert_eq!(update.summary.total_processed, 1);
4237 assert_eq!(update.summary.added, 0);
4238 assert_eq!(embedder.total_embedded_texts(), 0);
4239 assert_eq!(index.indexed_file_count(), 1);
4240 assert!(index.deferred_files.contains(&deferred));
4241 assert!(entries_for_file(&index, &deferred).is_empty());
4242 }
4243
4244 #[test]
4245 fn semantic_cache_serialization_skips_paths_outside_project_root() {
4246 let dir = tempfile::tempdir().expect("create temp dir");
4247 let project = fs::canonicalize(dir.path()).expect("canonical project");
4248 let outside = project.join("..").join("outside.rs");
4249 let mut index = SemanticIndex::new(project.clone(), 3);
4250 index
4251 .file_mtimes
4252 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4253 index.file_sizes.insert(outside.clone(), 1);
4254 index
4255 .file_hashes
4256 .insert(outside.clone(), cache_freshness::zero_hash());
4257 index.entries.push(EmbeddingEntry {
4258 chunk: SemanticChunk {
4259 file: outside,
4260 name: "outside".to_string(),
4261 kind: SymbolKind::Function,
4262 start_line: 0,
4263 end_line: 0,
4264 exported: false,
4265 embed_text: "outside".to_string(),
4266 snippet: "outside".to_string(),
4267 },
4268 vector: vec![1.0, 0.0, 0.0],
4269 });
4270
4271 let bytes = index.to_bytes();
4272 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4273 assert_eq!(loaded.entries.len(), 0);
4274 assert!(loaded.file_mtimes.is_empty());
4275 }
4276
4277 #[test]
4278 fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4279 let project_root = test_project_root();
4280 let file = project_root.join("src/lib.rs");
4281 let mut index = SemanticIndex::new(project_root, 2);
4282 let entries = [
4283 ("alpha", vec![1.0, 0.0], false),
4284 ("beta", vec![0.0, 1.0], false),
4285 ("gamma", vec![1.0, 0.0], false),
4286 ("delta", vec![0.5, 0.5], true),
4287 ("epsilon", vec![-1.0, 0.0], false),
4288 ];
4289 for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4290 index.entries.push(EmbeddingEntry {
4291 chunk: SemanticChunk {
4292 file: file.clone(),
4293 name: name.to_string(),
4294 kind: SymbolKind::Function,
4295 start_line: line as u32 + 1,
4296 end_line: line as u32 + 1,
4297 exported,
4298 embed_text: name.to_string(),
4299 snippet: format!("fn {name}() {{}}"),
4300 },
4301 vector,
4302 });
4303 }
4304
4305 let query = vec![1.0, 0.0];
4306 let top_k = 4;
4307 let mut reference: Vec<(f32, usize)> = index
4308 .entries
4309 .iter()
4310 .enumerate()
4311 .map(|(idx, entry)| {
4312 let mut score = cosine_similarity(&query, &entry.vector);
4313 if entry.chunk.exported {
4314 score *= 1.1;
4315 }
4316 (score, idx)
4317 })
4318 .collect();
4319 reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4320 let expected: Vec<(String, f32)> = reference
4321 .into_iter()
4322 .take(top_k)
4323 .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4324 .collect();
4325
4326 let actual: Vec<(String, f32)> = index
4327 .search(&query, top_k)
4328 .into_iter()
4329 .map(|result| (result.name, result.score))
4330 .collect();
4331
4332 assert_eq!(
4333 actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4334 expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4335 );
4336 for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4337 assert!((actual_score - expected_score).abs() < 1e-6);
4338 }
4339 assert_eq!(actual[0].0, "alpha");
4340 assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4341 assert!(index.search(&query, 0).is_empty());
4342 }
4343
4344 #[test]
4345 fn test_cosine_similarity_identical() {
4346 let a = vec![1.0, 0.0, 0.0];
4347 let b = vec![1.0, 0.0, 0.0];
4348 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4349 }
4350
4351 #[test]
4352 fn test_cosine_similarity_orthogonal() {
4353 let a = vec![1.0, 0.0, 0.0];
4354 let b = vec![0.0, 1.0, 0.0];
4355 assert!(cosine_similarity(&a, &b).abs() < 0.001);
4356 }
4357
4358 #[test]
4359 fn test_cosine_similarity_opposite() {
4360 let a = vec![1.0, 0.0, 0.0];
4361 let b = vec![-1.0, 0.0, 0.0];
4362 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4363 }
4364
4365 #[test]
4366 fn test_serialization_roundtrip() {
4367 let project_root = test_project_root();
4368 let file = project_root.join("src/main.rs");
4369 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4370 index.entries.push(EmbeddingEntry {
4371 chunk: SemanticChunk {
4372 file: file.clone(),
4373 name: "handle_request".to_string(),
4374 kind: SymbolKind::Function,
4375 start_line: 10,
4376 end_line: 25,
4377 exported: true,
4378 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4379 snippet: "fn handle_request() {\n // ...\n}".to_string(),
4380 },
4381 vector: vec![0.1, 0.2, 0.3, 0.4],
4382 });
4383 index.dimension = 4;
4384 index
4385 .file_mtimes
4386 .insert(file.clone(), SystemTime::UNIX_EPOCH);
4387 index.file_sizes.insert(file, 0);
4388 index.set_fingerprint(SemanticIndexFingerprint {
4389 backend: "fastembed".to_string(),
4390 model: "all-MiniLM-L6-v2".to_string(),
4391 base_url: FALLBACK_BACKEND.to_string(),
4392 dimension: 4,
4393 chunking_version: default_chunking_version(),
4394 });
4395
4396 let bytes = index.to_bytes();
4397 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4398
4399 assert_eq!(restored.entries.len(), 1);
4400 assert_eq!(restored.entries[0].chunk.name, "handle_request");
4401 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4402 assert_eq!(restored.dimension, 4);
4403 assert_eq!(restored.backend_label(), Some("fastembed"));
4404 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4405 }
4406
4407 #[test]
4408 fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
4409 let storage = tempfile::tempdir().expect("create storage dir");
4410 let project = storage.path().join("project");
4411 fs::create_dir_all(project.join("src")).expect("create project src");
4412 let file = project.join("src/lib.rs");
4413 fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4414 let project_root = fs::canonicalize(&project).expect("canonical project");
4415 let file = fs::canonicalize(&file).expect("canonical file");
4416
4417 let mut index = SemanticIndex::new(project_root.clone(), 3);
4418 let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4419 index.file_mtimes.insert(file.clone(), mtime);
4420 index.file_sizes.insert(file.clone(), 42);
4421 index
4422 .file_hashes
4423 .insert(file.clone(), cache_freshness::zero_hash());
4424 index.entries.push(EmbeddingEntry {
4425 chunk: SemanticChunk {
4426 file: file.clone(),
4427 name: "alpha".to_string(),
4428 kind: SymbolKind::Function,
4429 start_line: 0,
4430 end_line: 0,
4431 exported: true,
4432 embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4433 snippet: "pub fn alpha() {}".to_string(),
4434 },
4435 vector: vec![0.1, 0.2, 0.3],
4436 });
4437 index.entries.push(EmbeddingEntry {
4438 chunk: SemanticChunk {
4439 file: file.clone(),
4440 name: "beta".to_string(),
4441 kind: SymbolKind::Function,
4442 start_line: 1,
4443 end_line: 1,
4444 exported: true,
4445 embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4446 snippet: "pub fn beta() {}".to_string(),
4447 },
4448 vector: vec![0.4, 0.5, 0.6],
4449 });
4450 let fingerprint = SemanticIndexFingerprint {
4451 backend: "fastembed".to_string(),
4452 model: "all-MiniLM-L6-v2".to_string(),
4453 base_url: FALLBACK_BACKEND.to_string(),
4454 dimension: 3,
4455 chunking_version: default_chunking_version(),
4456 };
4457 index.set_fingerprint(fingerprint.clone());
4458
4459 let legacy_bytes = legacy_semantic_index_bytes(&index);
4460 assert_eq!(index.to_bytes(), legacy_bytes);
4461
4462 index.write_to_disk(storage.path(), "proj");
4463 let data_path = storage.path().join("semantic/proj/semantic.bin");
4464 assert_eq!(
4465 fs::read(&data_path).expect("read semantic.bin"),
4466 legacy_bytes
4467 );
4468
4469 let loaded = SemanticIndex::read_from_disk(
4470 storage.path(),
4471 "proj",
4472 &project_root,
4473 false,
4474 Some(&fingerprint.as_string()),
4475 )
4476 .expect("load semantic index");
4477 assert_eq!(loaded.entries.len(), index.entries.len());
4478 assert_eq!(loaded.dimension, index.dimension);
4479 assert_eq!(
4480 loaded.fingerprint().unwrap().as_string(),
4481 fingerprint.as_string()
4482 );
4483 assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4484 assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4485 assert_eq!(
4486 loaded.file_hashes.get(&file),
4487 Some(&cache_freshness::zero_hash())
4488 );
4489 for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4490 assert_eq!(actual.chunk.file, expected.chunk.file);
4491 assert_eq!(actual.chunk.name, expected.chunk.name);
4492 assert_eq!(actual.chunk.kind, expected.chunk.kind);
4493 assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4494 assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4495 assert_eq!(actual.chunk.exported, expected.chunk.exported);
4496 assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4497 assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4498 assert_eq!(actual.vector, expected.vector);
4499 }
4500 assert_eq!(loaded.to_bytes(), legacy_bytes);
4501 }
4502
/// Pins the byte encoding of every `SymbolKind` variant listed below
/// (including `FileSummary`) and checks that decoding each byte gives the
/// same variant back.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    let expected_encodings: [(SymbolKind, u8); 10] = [
        (SymbolKind::Function, 0),
        (SymbolKind::Class, 1),
        (SymbolKind::Method, 2),
        (SymbolKind::Struct, 3),
        (SymbolKind::Interface, 4),
        (SymbolKind::Enum, 5),
        (SymbolKind::TypeAlias, 6),
        (SymbolKind::Variable, 7),
        (SymbolKind::Heading, 8),
        (SymbolKind::FileSummary, 9),
    ];

    for (variant, byte) in expected_encodings {
        // Encode first, then decode the same byte back.
        let encoded = symbol_kind_to_u8(&variant);
        assert_eq!(encoded, byte);

        let decoded = u8_to_symbol_kind(byte);
        assert_eq!(decoded, variant);
    }
}
4523
/// Fills an index with three unit vectors (one per axis) and checks that a
/// top-2 search returns two hits with the best-aligned entry ranked first.
#[test]
fn test_search_top_k() {
    let symbol_names = ["auth", "database", "handler"];

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    for (axis, symbol) in symbol_names.iter().enumerate() {
        // Entry number `axis` points along that axis only.
        let mut unit = vec![0.0f32; 3];
        unit[axis] = 1.0;

        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: symbol.to_string(),
            kind: SymbolKind::Function,
            start_line: (axis * 10 + 1) as u32,
            end_line: (axis * 10 + 5) as u32,
            exported: true,
            embed_text: format!("kind:function name:{}", symbol),
            snippet: format!("fn {}() {{}}", symbol),
        };
        index.entries.push(EmbeddingEntry {
            chunk,
            vector: unit,
        });
    }

    // The query leans heavily towards the first axis ("auth").
    let hits = index.search(&[0.9, 0.1, 0.0], 2);

    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].name, "auth");
    assert!(hits[0].score > hits[1].score);
}
4556
/// Searching an index that has no entries must return no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let query = [0.1, 0.2, 0.3];

    assert!(empty_index.search(&query, 10).is_empty());
}
4563
/// A symbol whose range starts and ends on the same line must still produce
/// a snippet containing that line's text.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";

    // Start and end on line 0: the symbol occupies a single line.
    let one_line_range = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: String::from("answer"),
        kind: SymbolKind::Variable,
        range: one_line_range,
        signature: Some(String::from("const answer = 42")),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    assert_eq!(build_snippet(&symbol, source), "export const answer = 42;");
}
4586
/// Chunks this crate's own `src/semantic_index.rs` two ways — via
/// `FileParser` + `symbols_to_chunks` and via `collect_file_chunks` — and
/// requires both paths to yield the same chunk fingerprint.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let target = project_root.join("src/semantic_index.rs");

    // Reference path: read the file, extract symbols, convert to chunks.
    let text = std::fs::read_to_string(&target).unwrap();
    let reference_symbols = FileParser::new().extract_symbols(&target).unwrap();
    let reference_chunks = symbols_to_chunks(&target, &reference_symbols, &text, &project_root);

    // Path under test: a single call with a parser cache.
    let mut parser_cache = HashMap::new();
    let collected_chunks =
        collect_file_chunks(&project_root, &target, &mut parser_cache).unwrap();

    let actual = chunk_fingerprint(&collected_chunks);
    let expected = chunk_fingerprint(&reference_chunks);
    assert_eq!(actual, expected);
}
4605
/// Writes a small Java class to a temp directory and checks that
/// `collect_file_chunks` produces a `Class` chunk for `Greeter` and a
/// `Method` chunk for `greet`.
#[test]
fn collect_file_chunks_indexes_java_symbols() {
    let java_source = r#"package example;

public class Greeter {
 public String greet(String name) {
 return "Hello, " + name;
 }
}
"#;

    let workspace = tempfile::tempdir().unwrap();
    let java_file = workspace.path().join("Greeter.java");
    std::fs::write(&java_file, java_source).unwrap();

    let mut parser_cache = HashMap::new();
    let chunks = collect_file_chunks(workspace.path(), &java_file, &mut parser_cache).unwrap();

    // True when a chunk with this name and kind was emitted.
    let has_symbol = |wanted: &str, kind: SymbolKind| {
        chunks
            .iter()
            .any(|chunk| chunk.name == wanted && chunk.kind == kind)
    };

    assert!(
        !chunks.is_empty(),
        "Java file should produce semantic chunks"
    );
    assert!(
        has_symbol("Greeter", SymbolKind::Class),
        "Java class symbol should be chunked: {chunks:?}"
    );
    assert!(
        has_symbol("greet", SymbolKind::Method),
        "Java method symbol should be chunked: {chunks:?}"
    );
}
4643
4644 fn chunk_fingerprint(
4645 chunks: &[SemanticChunk],
4646 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4647 chunks
4648 .iter()
4649 .map(|chunk| {
4650 (
4651 chunk.name.clone(),
4652 chunk.kind.clone(),
4653 chunk.start_line,
4654 chunk.end_line,
4655 chunk.exported,
4656 chunk.embed_text.clone(),
4657 chunk.snippet.clone(),
4658 )
4659 })
4660 .collect()
4661 }
4662
/// A file larger than `MAX_SEMANTIC_FILE_BYTES` must yield no chunks, while
/// a small file in the same directory is still chunked afterwards with the
/// same parser cache.
#[test]
fn collect_file_chunks_skips_oversized_file() {
    const LINE: &str = "export const x = 1;\n";

    let workspace = tempfile::tempdir().unwrap();

    // Enough repetitions to push the file just past the size cap.
    let repetitions = ((MAX_SEMANTIC_FILE_BYTES as usize) / LINE.len()) + 16;
    let oversized_path = workspace.path().join("huge.ts");
    let oversized_body = LINE.repeat(repetitions);
    std::fs::write(&oversized_path, &oversized_body).unwrap();
    let on_disk_len = oversized_path.metadata().unwrap().len();
    assert!(on_disk_len > MAX_SEMANTIC_FILE_BYTES);

    let mut parser_cache = HashMap::new();
    let oversized_chunks =
        collect_file_chunks(workspace.path(), &oversized_path, &mut parser_cache).unwrap();
    assert!(
        oversized_chunks.is_empty(),
        "oversized file must yield no chunks"
    );

    let small_path = workspace.path().join("small.ts");
    std::fs::write(&small_path, "export function foo() { return 1; }\n").unwrap();
    let small_chunks =
        collect_file_chunks(workspace.path(), &small_path, &mut parser_cache).unwrap();
    assert!(!small_chunks.is_empty(), "small file should still chunk");
}
4685
/// `SemanticIndex::from_bytes` must return an error when the dimension field
/// is one past `MAX_DIMENSION`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    // Leading version byte, then three little-endian u32 fields.
    let mut payload = vec![1u8];
    for field in [oversized_dimension, 0u32, 0u32] {
        payload.extend_from_slice(&field.to_le_bytes());
    }

    let parsed = SemanticIndex::from_bytes(&payload, &test_project_root());
    assert!(parsed.is_err());
}
4696
/// `SemanticIndex::from_bytes` must return an error when the field after the
/// dimension is one past `MAX_ENTRIES`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let oversized_count = (MAX_ENTRIES as u32) + 1;

    // Leading version byte, then three little-endian u32 fields.
    let mut payload = vec![1u8];
    for field in [DEFAULT_DIMENSION as u32, oversized_count, 0u32] {
        payload.extend_from_slice(&field.to_le_bytes());
    }

    let parsed = SemanticIndex::from_bytes(&payload, &test_project_root());
    assert!(parsed.is_err());
}
4707
/// `invalidate_file` must drop the file's entries together with its recorded
/// mtime and size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");

    let chunk = SemanticChunk {
        file: target.clone(),
        name: String::from("main"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: String::from("main"),
        snippet: String::from("fn main() {}"),
    };

    // Seed one entry plus matching metadata for the target file.
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    assert!(index.file_mtimes.get(&target).is_none());
    assert!(index.file_sizes.get(&target).is_none());
}
4736
/// An indexed file whose cached metadata is stale and which has since been
/// removed from disk must be reported as deleted, and every trace of it must
/// leave the index.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_file = project_root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "vanished_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_file));

    // Make the cached metadata disagree with disk, then delete the file.
    let cached_size = index.file_sizes[&source_file];
    set_file_metadata(
        &mut index,
        &source_file,
        SystemTime::UNIX_EPOCH,
        cached_size + 1,
    );
    fs::remove_file(&source_file).unwrap();

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_file),
            &mut embedder,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(
        (summary.changed, summary.added, summary.deleted),
        (0, 0, 1)
    );
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&source_file));
    assert!(!index.file_sizes.contains_key(&source_file));
    assert!(!index.file_hashes.contains_key(&source_file));
}
4770
/// When the indexed path still exists but has been replaced by a directory,
/// refresh must leave the cached entries and the (stale) metadata untouched
/// and report no changes.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_file = project_root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "kept_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_file));
    let entries_before = index.entries.len();
    let mtime_before = index.file_mtimes[&source_file];
    let size_before = index.file_sizes[&source_file];

    // Mark the cache stale, then swap the file for a directory of the same name.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = size_before + 1;
    set_file_metadata(&mut index, &source_file, stale_mtime, stale_size);
    fs::remove_file(&source_file).unwrap();
    fs::create_dir(&source_file).unwrap();

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_file),
            &mut embedder,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(
        (summary.changed, summary.added, summary.deleted),
        (0, 0, 0)
    );

    assert_eq!(index.entries.len(), entries_before);
    let kept = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept);

    assert_eq!(index.file_mtimes.get(&source_file), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&source_file), Some(&mtime_before));
    assert_eq!(index.file_sizes.get(&source_file), Some(&stale_size));
}
4813
/// Refreshing a path that was never indexed and does not exist on disk must
/// record no metadata, add no entries, and report zero counts.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();

    // Only the parent directory is created; the file itself never exists.
    let absent_file = project_root.join("src/missing.rs");
    fs::create_dir_all(absent_file.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&absent_file),
            &mut embedder,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(
        (summary.added, summary.changed, summary.deleted),
        (0, 0, 0)
    );
    assert!(index.file_mtimes.get(&absent_file).is_none());
    assert!(index.file_sizes.get(&absent_file).is_none());
    assert!(index.entries.is_empty());
}
4841
/// A file present on disk but absent from the index must be counted as
/// `added` and end up with entries and an mtime record.
#[test]
fn refresh_reports_added_for_new_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let indexed_file = project_root.join("src/lib.rs");
    let new_file = project_root.join("src/new.rs");
    fs::create_dir_all(indexed_file.parent().unwrap()).unwrap();
    write_rust_file(&indexed_file, "existing_symbol");
    write_rust_file(&new_file, "added_symbol");

    // Only the first file is part of the initial index.
    let mut index = build_test_index(project_root, std::slice::from_ref(&indexed_file));

    let candidates = [indexed_file, new_file.clone()];
    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            &candidates,
            &mut embedder,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(
        (summary.added, summary.changed, summary.deleted),
        (1, 0, 0)
    );
    assert_eq!(summary.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&new_file));
    let new_file_indexed = index
        .entries
        .iter()
        .any(|entry| entry.chunk.file == new_file);
    assert!(new_file_indexed);
}
4872
/// An indexed file that is removed from disk and no longer appears in the
/// current file list must be counted as `deleted` and purged from the index.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let removed_file = project_root.join("src/deleted.rs");
    fs::create_dir_all(removed_file.parent().unwrap()).unwrap();
    write_rust_file(&removed_file, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&removed_file));
    fs::remove_file(&removed_file).unwrap();

    // Refresh with an empty current-file list.
    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(project_root, &[], &mut embedder, 8, &mut on_progress)
        .unwrap();

    assert_eq!(
        (summary.deleted, summary.changed, summary.added),
        (1, 0, 0)
    );
    assert_eq!(summary.total_processed, 1);
    assert!(index.file_mtimes.get(&removed_file).is_none());
    assert!(index.entries.is_empty());
}
4897
/// A file whose cached metadata no longer matches disk must be counted as
/// `changed`, and its old entries must be replaced by the new ones.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_file = project_root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_file));

    // Force stale cached metadata, then rewrite the file with a new symbol.
    set_file_metadata(&mut index, &source_file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_file, "new_symbol");

    let mut embedder = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_file),
            &mut embedder,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(
        (summary.changed, summary.added, summary.deleted),
        (1, 0, 0)
    );
    assert_eq!(summary.total_processed, 1);

    let indexed_names: Vec<&str> = index
        .entries
        .iter()
        .map(|entry| entry.chunk.name.as_str())
        .collect();
    assert!(indexed_names.contains(&"new_symbol"));
    assert!(!indexed_names.contains(&"old_symbol"));
}
4935
/// When nothing changed on disk, refresh must be a no-op: the embedding
/// callback is never invoked and the entry count stays the same.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let source_file = project_root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&source_file));
    let entries_before = index.entries.len();

    // Count embedding calls so we can prove none happened.
    let mut embed_calls = 0usize;
    let mut counting_embedder = |texts: Vec<String>| {
        embed_calls += 1;
        test_vector_for_texts(texts)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            std::slice::from_ref(&source_file),
            &mut counting_embedder,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(embed_calls == 0);
    assert_eq!(index.entries.len(), entries_before);
}
4967
/// A dlopen failure message for the ONNX Runtime shared library must be
/// classified as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";

    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);
    assert!(unavailable);
}
4974
/// The formatted init error for a missing ONNX Runtime must lead with the
/// install hint and still include the original error text.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let raw_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";

    let formatted = format_embedding_init_error(raw_error);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
4984
/// Checks the timeout chosen for build vs. interactive-query embedding
/// models: an unset (0) timeout falls back to the respective default, long
/// explicit timeouts are capped for queries, and short ones are kept.
#[test]
fn interactive_query_embedding_model_caps_remote_timeout() {
    // Timeout reported by a query-mode model built from `cfg`.
    let query_timeout = |cfg: &SemanticBackendConfig| {
        SemanticEmbeddingModel::from_config_for_query(cfg)
            .unwrap()
            .timeout_ms()
    };

    let mut config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(String::from("http://127.0.0.1:9")),
        api_key_env: None,
        timeout_ms: 0,
        max_batch_size: 64,
        max_files: 20_000,
    };

    // Case 1: timeout left at 0.
    let build_timeout = SemanticEmbeddingModel::from_config(&config)
        .unwrap()
        .timeout_ms();
    let default_query_timeout = query_timeout(&config);
    assert_eq!(
        build_timeout, DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS,
        "background build keeps the longer default embedding timeout"
    );
    assert_eq!(
        default_query_timeout, DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
        "interactive query embedding is capped below the dispatch transport timeout"
    );

    // Case 2: explicit timeout above the query cap.
    config.timeout_ms = 60_000;
    assert_eq!(
        query_timeout(&config),
        DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
        "explicitly long backend timeouts are capped for interactive queries"
    );

    // Case 3: explicit timeout below the query cap.
    config.timeout_ms = 3_000;
    assert_eq!(
        query_timeout(&config),
        3_000,
        "shorter explicit timeouts are respected for interactive queries"
    );
}
5026
/// Embeds two texts through the OpenAI-compatible backend against a mock
/// HTTP server and checks the request target and the decoded vectors.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (mock_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        String::from(
            r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#,
        )
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(mock_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let inputs = vec![String::from("hello"), String::from("world")];
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let vectors = model.embed(inputs).unwrap();

    let expected = vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]];
    assert_eq!(vectors, expected);
    server.join().unwrap();
}
5053
/// Regression test: a request sent by the OpenAI-compatible backend must
/// carry exactly one `Content-Type` header and a JSON body that names the
/// configured model.
///
/// A single-connection TCP server captures the raw request bytes so the
/// headers can be inspected exactly as they were sent.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        // Read until the full header block plus `Content-Length` body bytes
        // have arrived (or the peer closes the connection).
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive and clients
                        // may send `content-length` in lowercase. Matching
                        // without regard to case keeps the server reading
                        // until the whole body is captured.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best effort: the client may already have dropped the connection.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively; exactly one must be present.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    // The JSON body must have been sent intact.
    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
5146
/// Embeds two texts through the Ollama backend against a mock HTTP server
/// and checks the request target and the decoded vectors.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (mock_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        String::from(r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#)
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: String::from("embeddinggemma"),
        base_url: Some(mock_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let inputs = vec![String::from("hello"), String::from("world")];
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let vectors = model.embed(inputs).unwrap();

    let expected = vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]];
    assert_eq!(vectors, expected);
    server.join().unwrap();
}
5173
/// Writes an index with one fingerprint and checks that `read_from_disk`
/// loads it when the expected fingerprint matches and returns `None` when it
/// does not.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";
    let project_root = test_project_root();
    let source_file = project_root.join("src/main.rs");

    let stored_fingerprint = SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let chunk = SemanticChunk {
        file: source_file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };

    // Assemble a one-entry, three-dimensional index and persist it.
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);
    index.set_fingerprint(stored_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint as stored: the cache loads.
    let matching = index.fingerprint().unwrap().as_string();
    let loaded_with_match = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(loaded_with_match.is_some());

    // Different backend/model/base URL: the cache is refused.
    let mismatched = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    }
    .as_string();
    let loaded_with_mismatch = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(loaded_with_mismatch.is_none());
}
5236
/// A cache file whose leading version byte is V3 must be refused by
/// `read_from_disk`, and the file must no longer exist afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let source_file = PathBuf::from("/src/main.rs");
    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk: SemanticChunk {
            file: source_file.clone(),
            name: String::from("handle_request"),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 0,
            exported: true,
            embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
            snippet: String::from("fn handle_request() {}"),
        },
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);
    index.set_fingerprint(fingerprint.clone());

    // Serialize, then overwrite the first byte with the V3 version marker.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, serialized).unwrap();

    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
5286
5287 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5288 crate::symbols::Symbol {
5289 name: name.to_string(),
5290 kind,
5291 range: crate::symbols::Range {
5292 start_line: start,
5293 start_col: 0,
5294 end_line: end,
5295 end_col: 0,
5296 },
5297 signature: None,
5298 scope_chain: Vec::new(),
5299 exported: false,
5300 parent: None,
5301 }
5302 }
5303
/// A symbol list made up only of `Heading` symbols must produce no chunks.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let readme = project_root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let headings = vec![
        make_symbol(SymbolKind::Heading, "Title", 0, 2),
        make_symbol(SymbolKind::Heading, "Section", 4, 6),
    ];

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &project_root);
    let chunk_count = chunks.len();
    assert!(
        chunk_count == 0,
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunk_count
    );
}
5326
/// A symbol with a very long signature must still yield embed text of at
/// most `MAX_EMBED_TEXT_CHARS` characters.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let manifest = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    // 2000 repetitions of an 8-character token.
    let oversized_signature = "kubectl ".repeat(2000);
    let symbol = {
        let mut base = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
        base.signature = Some(oversized_signature);
        base
    };

    let embed_text = build_embed_text(&symbol, source, &manifest, &project_root);
    let char_count = embed_text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
5351
/// With one heading and two code symbols as input, the output must hold
/// three chunks: a file summary plus the two code symbols, and no chunk for
/// the heading.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let source_file = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n true\n}\n";

    let mixed_symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&source_file, &mixed_symbols, source, &project_root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let names: Vec<&str> = chunks.iter().map(|chunk| chunk.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| chunk.kind == SymbolKind::FileSummary);
    assert!(has_file_summary);
    assert!(names.iter().any(|name| *name == "handle_request"));
    assert!(names.iter().any(|name| *name == "AuthService"));
    assert!(
        names.iter().all(|name| *name != "doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
5386
/// `validate_base_url_no_ssrf` must accept `localhost`-style hostnames, with
/// and without a port.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    let loopback_hosts = [
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ];

    for host in loopback_hosts.iter() {
        let verdict = validate_base_url_no_ssrf(host);
        assert!(
            verdict.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            verdict
        );
    }
}
5405
/// `validate_base_url_no_ssrf` must accept addresses in `127.0.0.0/8`, with
/// and without a port.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            verdict
        );
    }
}
5424
/// `validate_base_url_no_ssrf` must reject the private, link-local, and
/// shared-address-space IPs listed below.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in private_urls.iter() {
        let verdict = validate_base_url_no_ssrf(url);
        assert!(
            verdict.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            verdict
        );
    }
}
5446
/// `validate_base_url_no_ssrf` must reject `.local` hostnames.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_hosts.iter() {
        let verdict = validate_base_url_no_ssrf(host);
        assert!(
            verdict.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            verdict
        );
    }
}
5464
/// `normalize_base_url` must accept both a loopback IP and `localhost`.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let by_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(by_ip.is_ok());

    let by_name = normalize_base_url("http://localhost:8080");
    assert!(by_name.is_ok());
}
5472
/// `is_private_non_loopback_ip` must flag the reserved/private addresses
/// below, while leaving loopback (IPv4, IPv6, IPv4-mapped) and a public
/// address unflagged.
#[test]
fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
    // Parses the literal and asks the guard whether it is blocked.
    fn blocked(literal: &str) -> bool {
        let address = literal.parse::<std::net::IpAddr>().unwrap();
        is_private_non_loopback_ip(&address)
    }

    for literal in ["10.0.0.1", "192.168.1.1", "169.254.0.1", "100.64.0.1"].iter() {
        assert!(blocked(literal));
    }
    assert!(
        blocked("198.18.0.1"),
        "RFC2544 benchmark range must be blocked"
    );
    assert!(blocked("224.0.0.1"), "multicast must be blocked");
    assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
    assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");

    assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
    assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
    assert!(
        !blocked("::ffff:127.0.0.1"),
        "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
    );

    assert!(!blocked("8.8.8.8"));
}
5503
/// The ORT version-mismatch message must report the detected version, the
/// library path, and the version requirement, and must list the auto-fix
/// option before the manual-removal option.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let library_path = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", library_path);

    // Diagnostic facts.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering of the two suggested remedies.
    let auto_fix_at = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_remove_at = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_at < manual_remove_at,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
5544
/// When the requested library name carries no version but a resolved path
/// does, version detection must use the resolved path and report it as the
/// source.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested_name = "libonnxruntime.so";
    let resolved_path = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The bare requested name yields no version.
    assert_eq!(detect_ort_version_from_path(requested_name), None);

    let (detected, source) = detect_ort_version_from_resolved_or_requested(
        Some(resolved_path.to_string()),
        requested_name,
    );
    assert_eq!(detected, Some("1.19.0".to_string()));
    assert_eq!(source, resolved_path);

    let detected_version = detected.unwrap();
    let msg = format_ort_version_mismatch(&detected_version, &source);
    assert!(msg.contains("v1.19.0"));
    assert!(msg.contains(resolved_path));
}
5562
/// The ORT version-mismatch message must include a macOS `.dylib` path and
/// show it in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib_path = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib_path);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
5579}