1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
// NOTE(review): DEFAULT_DIMENSION, MAX_ENTRIES, F32_BYTES, the HEADER_BYTES_* sizes and
// the SEMANTIC_INDEX_VERSION_* tags are not referenced in this part of the file; they
// presumably describe the on-disk index format — confirm against the (de)serialization code.
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint that `format_embedding_init_error` prepends when the ONNX Runtime
/// library appears to be unavailable.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// Request timeout used when the config leaves `timeout_ms` at 0. Despite the name it is
/// applied regardless of the configured backend.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Upper bound applied to the timeout of models built via `from_config_for_query`.
const DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS: u64 = 8_000;
/// Batch size used when the config leaves `max_batch_size` at 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which the oldest entry is evicted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint's `base_url` when the config has no usable URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes (first try included).
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Sleep, in milliseconds, before each retry; indexed by the zero-based attempt that failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held while `SemanticIndexLock::acquire` tries to take the file lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
67
68pub struct SemanticIndexLock {
69 _guard: fs_lock::LockGuard,
70}
71
72impl SemanticIndexLock {
73 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
74 let dir = storage_dir.join("semantic").join(project_key);
75 fs::create_dir_all(&dir)?;
76 let path = dir.join("cache.lock");
77 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
78 .lock()
79 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
80 fs_lock::try_acquire(&path, Duration::from_secs(2))
81 .map(|guard| Self { _guard: guard })
82 .map_err(|error| match error {
83 fs_lock::AcquireError::Timeout => {
84 std::io::Error::other("timed out acquiring semantic cache lock")
85 }
86 fs_lock::AcquireError::Io(error) => error,
87 })
88 }
89}
90
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared textually by `matches_expected`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name as configured.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when the config has no
    /// usable URL. Defaults to an empty string when absent from serialized data.
    #[serde(default)]
    pub base_url: String,
    /// Length of the embedding vectors.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()` when absent
    /// from serialized data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
101
/// Chunking version stamped into new fingerprints and assumed for serialized
/// fingerprints that omit the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
105
106impl SemanticIndexFingerprint {
107 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
108 let base_url = config
111 .base_url
112 .as_ref()
113 .and_then(|u| normalize_base_url(u).ok())
114 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
115 Self {
116 backend: config.backend.as_str().to_string(),
117 model: config.model.clone(),
118 base_url,
119 dimension,
120 chunking_version: default_chunking_version(),
121 }
122 }
123
124 pub fn as_string(&self) -> String {
125 serde_json::to_string(self).unwrap_or_else(|_| String::new())
126 }
127
128 fn matches_expected(&self, expected: &str) -> bool {
129 let encoded = self.as_string();
130 !encoded.is_empty() && encoded == expected
131 }
132}
133
/// Backend-specific state needed to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedder (constructed for the `Fastembed` backend).
    Local(LocalEmbedder),
    /// Remote server reached through the OpenAI-style `/v1/embeddings` endpoint.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote Ollama server reached through its `/api/embed` endpoint.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
    },
}
150
/// Embedding model facade over the configured backend, with a small in-memory cache
/// for query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    /// HTTP client timeout this model was built with.
    timeout_ms: u64,
    max_batch_size: usize,
    /// Embedding dimension once known; `None` until the first probe or remote embed call.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding vector.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Cache keys in insertion order; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}

/// Shorter alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
166
167fn validate_embedding_batch(
168 vectors: &[Vec<f32>],
169 expected_count: usize,
170 context: &str,
171) -> Result<(), String> {
172 if expected_count > 0 && vectors.is_empty() {
173 return Err(format!(
174 "{context} returned no vectors for {expected_count} inputs"
175 ));
176 }
177
178 if vectors.len() != expected_count {
179 return Err(format!(
180 "{context} returned {} vectors for {} inputs",
181 vectors.len(),
182 expected_count
183 ));
184 }
185
186 let Some(first_vector) = vectors.first() else {
187 return Ok(());
188 };
189 let expected_dimension = first_vector.len();
190 validate_embedding_dimension(expected_dimension)
191 .map_err(|error| format!("{context} returned {error}"))?;
192 for (index, vector) in vectors.iter().enumerate() {
193 if vector.len() != expected_dimension {
194 return Err(format!(
195 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
196 vector.len()
197 ));
198 }
199 }
200
201 Ok(())
202}
203
204fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
205 if dimension == 0 || dimension > MAX_DIMENSION {
206 return Err(format!(
207 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
208 ));
209 }
210
211 Ok(())
212}
213
214fn normalize_base_url(raw: &str) -> Result<String, String> {
218 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
219 let scheme = parsed.scheme();
220 if scheme != "http" && scheme != "https" {
221 return Err(format!(
222 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
223 scheme
224 ));
225 }
226 Ok(parsed.to_string().trim_end_matches('/').to_string())
227}
228
229pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
244 use std::net::{IpAddr, ToSocketAddrs};
245
246 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
247
248 let host = parsed.host_str().unwrap_or("");
249
250 let is_loopback_host =
255 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
256 if is_loopback_host {
257 return Ok(());
258 }
259
260 if host.ends_with(".local") {
263 return Err(format!(
264 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
265 ));
266 }
267
268 let port = parsed.port_or_known_default().unwrap_or(443);
271 let addr_str = format!("{host}:{port}");
272 let addrs: Vec<IpAddr> = addr_str
273 .to_socket_addrs()
274 .map(|iter| iter.map(|sa| sa.ip()).collect())
275 .unwrap_or_default();
276 for ip in &addrs {
277 if is_private_non_loopback_ip(ip) {
278 return Err(format!(
279 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
280 ));
281 }
282 }
283
284 Ok(())
285}
286
287fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
298 if ip.to_canonical().is_loopback() {
301 return false;
302 }
303 crate::url_fetch::is_private_or_reserved_ip(*ip)
304}
305
306fn build_openai_embeddings_endpoint(base_url: &str) -> String {
307 if base_url.ends_with("/v1") {
308 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
309 } else {
310 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
311 }
312}
313
314fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
315 if base_url.ends_with("/api") {
316 format!("{base_url}/embed")
317 } else {
318 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
319 }
320}
321
/// Trims surrounding whitespace from an optional token and maps a blank token to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|trimmed| !trimmed.is_empty())
}
332
333fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
334 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
335}
336
337fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
343 if !matches!(
344 status,
345 reqwest::StatusCode::BAD_REQUEST
346 | reqwest::StatusCode::CONFLICT
347 | reqwest::StatusCode::REQUEST_TIMEOUT
348 | reqwest::StatusCode::LOCKED
349 | reqwest::StatusCode::TOO_EARLY
350 ) {
351 return false;
352 }
353
354 let lower = raw.to_ascii_lowercase();
355 let normalized = lower.trim();
356
357 normalized.contains("model was unloaded while the request was still in queue")
358 || normalized == "model is loading"
359 || normalized.starts_with("model is loading,")
360 || normalized.contains(r#""error":"model is loading"#)
361 || normalized.contains(r#""message":"model is loading"#)
362 || normalized == "model not loaded"
363 || normalized.contains(r#""error":"model not loaded""#)
364 || normalized.contains(r#""message":"model not loaded""#)
365 || normalized == "loading model into memory"
366 || normalized.contains(r#""error":"loading model into memory""#)
367 || normalized.contains(r#""message":"loading model into memory""#)
368 || normalized == "model is being loaded"
369 || normalized.contains(r#""error":"model is being loaded""#)
370 || normalized.contains(r#""message":"model is being loaded""#)
371 || normalized == "model is currently loading"
372 || normalized.contains(r#""error":"model is currently loading""#)
373 || normalized.contains(r#""message":"model is currently loading""#)
374}
375
/// Send errors that `send_embedding_request` retries: connection failures only.
/// Timeouts are not retried here (they are still reported as transient by
/// `embedding_send_error_is_transient`).
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
379
/// Send errors that are reported to callers as transient: connection failures and timeouts.
fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
    error.is_connect() || error.is_timeout()
}
388
389fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
390 embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
391}
392
/// Tag prepended to embedding error messages that describe a transient failure.
/// The trailing space is part of the marker.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// True when `error` carries [`TRANSIENT_EMBEDDING_MARKER`] anywhere in its text.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    let marker_position = error.find(TRANSIENT_EMBEDDING_MARKER);
    marker_position.is_some()
}

/// Returns `error` with every occurrence of [`TRANSIENT_EMBEDDING_MARKER`] removed.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    // Splitting on the marker and gluing the pieces back together drops each occurrence.
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
411
412fn sleep_before_embedding_retry(attempt_index: usize) {
413 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
414 std::thread::sleep(Duration::from_millis(*delay_ms));
415 }
416}
417
/// Sends an embedding request and returns the raw response body, retrying on failures
/// that look temporary.
///
/// `make_request` is called once per attempt to build a fresh request. Up to
/// `EMBEDDING_REQUEST_MAX_ATTEMPTS` attempts are made, sleeping via
/// `sleep_before_embedding_retry` between them. `backend_label` prefixes error messages.
///
/// # Errors
/// Returns a message describing the send failure, body-read failure, or non-success
/// HTTP status. The message starts with `TRANSIENT_EMBEDDING_MARKER` when the failure
/// is classified as transient.
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                // Only connection failures are retried at the send stage.
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                // Timeouts are not retried above but are still tagged as transient.
                let marker = if embedding_send_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!("{marker}{backend_label} request failed: {error}"));
            }
        };

        // Capture the status first: reading the body consumes the response.
        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && embedding_response_read_error_is_transient(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                let marker = if embedding_response_read_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!(
                    "{marker}{backend_label} response read failed: {error}"
                ));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        // Some servers report a loading model with a 4xx status; the body decides.
        let body_transient = embedding_response_body_is_transient(status, &raw);
        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        let marker = if is_retryable_embedding_status(status) || body_transient {
            TRANSIENT_EMBEDDING_MARKER
        } else {
            ""
        };
        return Err(format!(
            "{marker}{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    // Every path through the final loop iteration returns, so falling out is impossible.
    unreachable!("embedding request retries exhausted without returning")
}
494
495fn configured_embedding_timeout_ms(config: &SemanticBackendConfig) -> u64 {
496 if config.timeout_ms == 0 {
497 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
498 } else {
499 config.timeout_ms
500 }
501}
502
503impl SemanticEmbeddingModel {
504 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
505 Self::from_config_with_timeout_ms(config, configured_embedding_timeout_ms(config))
506 }
507
508 pub fn from_config_for_query(config: &SemanticBackendConfig) -> Result<Self, String> {
509 let timeout_ms =
510 configured_embedding_timeout_ms(config).min(DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS);
511 Self::from_config_with_timeout_ms(config, timeout_ms)
512 }
513
514 fn from_config_with_timeout_ms(
515 config: &SemanticBackendConfig,
516 timeout_ms: u64,
517 ) -> Result<Self, String> {
518 let max_batch_size = if config.max_batch_size == 0 {
519 DEFAULT_MAX_BATCH_SIZE
520 } else {
521 config.max_batch_size
522 };
523
524 let api_key_env = normalize_api_key(config.api_key_env.clone());
525 let model = config.model.clone();
526
527 let client = Client::builder()
528 .timeout(Duration::from_millis(timeout_ms))
529 .redirect(reqwest::redirect::Policy::none())
530 .build()
531 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
532
533 let engine = match config.backend {
534 SemanticBackend::Fastembed => {
535 SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
536 }
537 SemanticBackend::OpenAiCompatible => {
538 let raw = config.base_url.as_ref().ok_or_else(|| {
539 "base_url is required for openai_compatible backend".to_string()
540 })?;
541 let base_url = normalize_base_url(raw)?;
542
543 let api_key = match api_key_env {
544 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
545 format!("missing api_key_env '{var_name}' for openai_compatible backend")
546 })?),
547 None => None,
548 };
549
550 SemanticEmbeddingEngine::OpenAiCompatible {
551 client,
552 model,
553 base_url,
554 api_key,
555 }
556 }
557 SemanticBackend::Ollama => {
558 let raw = config
559 .base_url
560 .as_ref()
561 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
562 let base_url = normalize_base_url(raw)?;
563
564 SemanticEmbeddingEngine::Ollama {
565 client,
566 model,
567 base_url,
568 }
569 }
570 };
571
572 Ok(Self {
573 backend: config.backend,
574 model: config.model.clone(),
575 base_url: config.base_url.clone(),
576 timeout_ms,
577 max_batch_size,
578 dimension: None,
579 engine,
580 query_embedding_cache: HashMap::new(),
581 query_embedding_cache_order: VecDeque::new(),
582 query_embedding_cache_hits: 0,
583 query_embedding_cache_misses: 0,
584 })
585 }
586
    /// Backend this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }

    /// Configured model name.
    pub fn model(&self) -> &str {
        &self.model
    }

    /// Base URL exactly as given in the config, if any (not normalized).
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }

    /// Effective maximum batch size (the default when the config value was 0).
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }

    /// HTTP request timeout, in milliseconds, this model was built with.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
606
607 pub fn fingerprint(
608 &mut self,
609 config: &SemanticBackendConfig,
610 ) -> Result<SemanticIndexFingerprint, String> {
611 let dimension = self.dimension()?;
612 Ok(SemanticIndexFingerprint::from_config(config, dimension))
613 }
614
615 pub fn dimension(&mut self) -> Result<usize, String> {
616 if let Some(dimension) = self.dimension {
617 return Ok(dimension);
618 }
619
620 let dimension = match &mut self.engine {
621 SemanticEmbeddingEngine::Local(model) => {
622 let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
623 vectors
624 .first()
625 .map(|v| v.len())
626 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
627 }
628 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
629 let vectors =
630 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
631 vectors
632 .first()
633 .map(|v| v.len())
634 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
635 }
636 SemanticEmbeddingEngine::Ollama { .. } => {
637 let vectors =
638 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
639 vectors
640 .first()
641 .map(|v| v.len())
642 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
643 }
644 };
645
646 self.dimension = Some(dimension);
647 Ok(dimension)
648 }
649
    /// Embeds `texts` with the configured backend, returning one vector per input.
    /// Public entry point that delegates to `embed_texts`.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
653
654 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
655 if let Some(vector) = self.query_embedding_cache.get(query) {
656 self.query_embedding_cache_hits += 1;
657 return Ok(vector.clone());
658 }
659
660 self.query_embedding_cache_misses += 1;
661 let embeddings = self.embed_texts(vec![query.to_string()])?;
662 let vector = embeddings
663 .first()
664 .cloned()
665 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
666
667 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
668 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
669 self.query_embedding_cache.remove(&oldest);
670 }
671 }
672 self.query_embedding_cache
673 .insert(query.to_string(), vector.clone());
674 self.query_embedding_cache_order
675 .push_back(query.to_string());
676
677 Ok(vector)
678 }
679
    /// Query-cache counters as `(hits, misses, entries currently cached)`.
    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
        (
            self.query_embedding_cache_hits,
            self.query_embedding_cache_misses,
            self.query_embedding_cache.len(),
        )
    }
687
688 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
689 match &mut self.engine {
690 SemanticEmbeddingEngine::Local(model) => model
691 .embed(&texts)
692 .map_err(|error| format!("failed to embed batch: {error}")),
693 SemanticEmbeddingEngine::OpenAiCompatible {
694 client,
695 model,
696 base_url,
697 api_key,
698 } => {
699 let expected_text_count = texts.len();
700 let endpoint = build_openai_embeddings_endpoint(base_url);
701 let body = serde_json::json!({
702 "input": texts,
703 "model": model,
704 });
705
706 let raw = send_embedding_request(
707 || {
708 let mut request = client.post(&endpoint).json(&body);
718
719 if let Some(api_key) = api_key {
720 request = request.header("Authorization", format!("Bearer {api_key}"));
721 }
722
723 request
724 },
725 "openai compatible",
726 )?;
727
728 #[derive(Deserialize)]
729 struct OpenAiResponse {
730 data: Vec<OpenAiEmbeddingResult>,
731 }
732
733 #[derive(Deserialize)]
734 struct OpenAiEmbeddingResult {
735 embedding: Vec<f32>,
736 index: Option<u32>,
737 }
738
739 let parsed: OpenAiResponse = serde_json::from_str(&raw)
740 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
741 if parsed.data.len() != expected_text_count {
742 return Err(format!(
743 "openai compatible response returned {} embeddings for {} inputs",
744 parsed.data.len(),
745 expected_text_count
746 ));
747 }
748
749 let mut vectors = vec![Vec::new(); parsed.data.len()];
750 for (i, item) in parsed.data.into_iter().enumerate() {
751 let index = item.index.unwrap_or(i as u32) as usize;
752 if index >= vectors.len() {
753 return Err(
754 "openai compatible response contains invalid vector index".to_string()
755 );
756 }
757 vectors[index] = item.embedding;
758 }
759
760 for vector in &vectors {
761 if vector.is_empty() {
762 return Err(
763 "openai compatible response contained missing vectors".to_string()
764 );
765 }
766 }
767
768 self.dimension = vectors.first().map(Vec::len);
769 Ok(vectors)
770 }
771 SemanticEmbeddingEngine::Ollama {
772 client,
773 model,
774 base_url,
775 } => {
776 let expected_text_count = texts.len();
777 let endpoint = build_ollama_embeddings_endpoint(base_url);
778
779 #[derive(Serialize)]
780 struct OllamaPayload<'a> {
781 model: &'a str,
782 input: Vec<String>,
783 }
784
785 let payload = OllamaPayload {
786 model,
787 input: texts,
788 };
789
790 let raw = send_embedding_request(
791 || {
792 client.post(&endpoint).json(&payload)
797 },
798 "ollama",
799 )?;
800
801 #[derive(Deserialize)]
802 struct OllamaResponse {
803 embeddings: Vec<Vec<f32>>,
804 }
805
806 let parsed: OllamaResponse = serde_json::from_str(&raw)
807 .map_err(|error| format!("invalid ollama response: {error}"))?;
808 if parsed.embeddings.is_empty() {
809 return Err("ollama response returned no embeddings".to_string());
810 }
811 if parsed.embeddings.len() != expected_text_count {
812 return Err(format!(
813 "ollama response returned {} embeddings for {} inputs",
814 parsed.embeddings.len(),
815 expected_text_count
816 ));
817 }
818
819 let vectors = parsed.embeddings;
820 for vector in &vectors {
821 if vector.is_empty() {
822 return Err("ollama response contained empty embeddings".to_string());
823 }
824 }
825
826 self.dimension = vectors.first().map(Vec::len);
827 Ok(vectors)
828 }
829 }
830 }
831}
832
/// Checks that an ONNX Runtime shared library can be loaded and, when its version can
/// be determined, that the version is 1.x with x >= 20.
///
/// The library is taken from `ORT_DYLIB_PATH` when set, otherwise the platform's
/// default library name is used. On Linux/macOS the version is derived from the loaded
/// library's file name; on Windows it is read from the file's version resource. When
/// no version can be determined the check passes. On platforms other than
/// Linux, macOS and Windows the function does nothing and returns `Ok(())`.
///
/// # Errors
/// Returns a user-facing message when the library cannot be loaded or its detected
/// version is outside the supported range.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a valid NUL-terminated string that outlives the `dlopen`
        // call; `handle` is only used after the null check and before `dlclose`; the
        // `dlerror` pointer is checked for null before being read as a C string.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Resolve the version while the library is still loaded; the handle is
            // needed to find out which file was actually opened.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // Only enforce the requirement when both major and minor parse as numbers.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Mirrors the Win32 `VS_FIXEDFILEINFO` layout; only `dw_file_version_ms` is read.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, dw_file_version_ls: u32, dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: `wide` and the `"\\"` sub-block are NUL-terminated UTF-16 buffers that
        // outlive the calls using them; `path_buf` and `info` are sized as passed to the
        // APIs; `handle` is null-checked and freed once; `vs_info` is dereferenced only
        // after `VerQueryValueW` succeeds with a non-null pointer.
        // NOTE(review): `vs_len` is not compared against the size of `VS_FIXEDFILEINFO`
        // before the read — confirm this is acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // A major version of 0 doubles as "version unknown" further down.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // The root sub-block ("\") yields the fixed file info structure.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High 16 bits hold the major version, low 16 bits the minor.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1023
/// Returns the file path of the shared object behind `handle`, found by looking up
/// the `OrtGetApiBase` symbol and asking `dladdr` which object contains it.
///
/// Returns `None` when the symbol is missing, `dladdr` fails, or no file name is
/// reported.
///
/// # Safety
/// `handle` must be a handle returned by `dlopen` that has not yet been closed.
#[cfg(any(target_os = "linux", target_os = "macos"))]
unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
    // SAFETY: the caller guarantees `handle` is live; `symbol_name` is NUL-terminated.
    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
    if symbol.is_null() {
        return None;
    }

    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
    // SAFETY: `symbol` is non-null and `info` points to writable storage for a `Dl_info`.
    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
        return None;
    }

    // SAFETY: `dladdr` returned non-zero, which is taken to mean `info` was filled in.
    let info = unsafe { info.assume_init() };
    if info.dli_fname.is_null() {
        return None;
    }

    Some(
        // SAFETY: `dli_fname` was checked for null; it is assumed to point at a
        // NUL-terminated string owned by the loader.
        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
            .to_string_lossy()
            .into_owned(),
    )
}
1048
1049#[cfg(any(target_os = "linux", target_os = "macos"))]
1050fn detect_ort_version_from_resolved_or_requested(
1051 resolved_path: Option<String>,
1052 requested_lib_name: &str,
1053) -> (Option<String>, String) {
1054 if let Some(path) = resolved_path {
1055 if let Some(version) = detect_ort_version_from_path(&path) {
1056 return (Some(version), path);
1057 }
1058 return (detect_ort_version_from_path(requested_lib_name), path);
1059 }
1060
1061 (
1062 detect_ort_version_from_path(requested_lib_name),
1063 requested_lib_name.to_string(),
1064 )
1065}
1066
1067#[cfg(any(target_os = "linux", target_os = "macos"))]
1068fn detect_ort_version_from_loaded_library(
1069 handle: *mut std::ffi::c_void,
1070 requested_lib_name: &str,
1071) -> (Option<String>, String) {
1072 detect_ort_version_from_resolved_or_requested(
1073 unsafe { loaded_library_path_from_handle(handle) },
1074 requested_lib_name,
1075 )
1076}
1077
1078#[cfg(any(target_os = "linux", target_os = "macos"))]
1081fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1082 let path = std::path::Path::new(lib_path);
1083
1084 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1086 .into_iter()
1087 .flatten()
1088 {
1089 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1090 if let Some(version) = extract_version_from_filename(name) {
1091 return Some(version);
1092 }
1093 }
1094 }
1095
1096 if let Some(parent) = path.parent() {
1098 if let Ok(entries) = std::fs::read_dir(parent) {
1099 for entry in entries.flatten() {
1100 if let Some(name) = entry.file_name().to_str() {
1101 if name.starts_with("libonnxruntime") {
1102 if let Some(version) = extract_version_from_filename(name) {
1103 return Some(version);
1104 }
1105 }
1106 }
1107 }
1108 }
1109 }
1110
1111 None
1112}
1113
1114#[cfg(any(target_os = "linux", target_os = "macos"))]
1116fn extract_version_from_filename(name: &str) -> Option<String> {
1117 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1119 re.find(name).map(|m| m.as_str().to_string())
1120}
1121
/// Suggests a shell command for removing the library at `lib_path`.
///
/// On Linux and macOS, a library under `/usr/local/lib` (or a bare default library
/// name) yields the system-wide removal command for that OS. Everything else yields a
/// plain `rm` of the given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_wide_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
1134
1135pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1141 format!(
1142 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1143 Solutions:\n\
1144 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1145 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1146 configures the bridge to load it instead of the system library — no \
1147 changes to '{}'.\n\
1148 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1149 {}\n\
1150 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1151 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1152 version,
1153 lib_name,
1154 lib_name,
1155 suggest_removal_command(lib_name),
1156 )
1157}
1158
/// Heuristically decides whether an error message means the ONNX Runtime shared
/// library could not be loaded.
///
/// A message that starts (after leading whitespace) with `ONNX Runtime not found.`
/// always qualifies. Otherwise the message must, case-insensitively, both mention the
/// runtime and contain a phrase that indicates a dynamic-loading failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1184
1185pub fn format_embedding_init_error(error: impl Display) -> String {
1186 let message = error.to_string();
1187
1188 if is_onnx_runtime_unavailable(&message) {
1189 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1190 }
1191
1192 format!("failed to initialize semantic embedding model: {message}")
1193}
1194
/// One unit of source code tracked by the semantic index.
// NOTE(review): field meanings below are inferred from names and types only; the code
// that populates them is outside this part of the file — confirm before relying on them.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to.
    pub file: PathBuf,
    /// Name of the chunk (presumably the symbol name).
    pub name: String,
    /// Symbol kind of the chunk.
    pub kind: SymbolKind,
    /// First line of the chunk (TODO confirm whether 0- or 1-based).
    pub start_line: u32,
    /// Last line of the chunk (TODO confirm whether inclusive).
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text associated with the chunk for embedding (presumably what is sent to the model).
    pub embed_text: String,
    /// Snippet of the chunk (presumably for display in results).
    pub snippet: String,
}
1214
/// A chunk paired with the embedding vector computed (or reused) for it.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this vector describes.
    chunk: SemanticChunk,
    /// Embedding of `chunk.embed_text`.
    vector: Vec<f32>,
}
1221
/// In-memory semantic search index: embedded chunks plus the per-file
/// freshness data used to decide which files need re-embedding.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// Every embedded chunk, across all indexed files.
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded per indexed file; its key set defines which files count as indexed.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// File size recorded per indexed file (part of the freshness check).
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded per indexed file (part of the freshness check).
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Length of the stored vectors; `search` returns nothing for queries of another length.
    dimension: usize,
    /// Optional fingerprint of the backend/model that produced the vectors.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; paths are made relative to it when the index is serialized.
    project_root: PathBuf,
    /// New files held back by `refresh_invalidated_files` because the indexed-file cap was reached.
    deferred_files: HashSet<PathBuf>,
}
1237
/// Freshness data captured for a file at the time its chunks were collected.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// Size of the file in bytes.
    size: u64,
    /// BLAKE3 hash recorded for the file (presumably of its contents — confirm in the collector).
    content_hash: blake3::Hash,
}
1244
/// Counters describing what an incremental refresh did.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-processed successfully.
    pub changed: usize,
    /// Files that were indexed for the first time.
    pub added: usize,
    /// Files whose entries were dropped from the index.
    pub deleted: usize,
    /// Number of files the refresh looked at, whether or not anything changed.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// `true` when the refresh changed, added and deleted nothing.
    ///
    /// `total_processed` is deliberately not part of the check.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
1261
/// Result of `SemanticIndex::refresh_invalidated_files`.
///
/// The first three fields match the arguments accepted by
/// `SemanticIndex::apply_refresh_update`.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Entries produced for the refreshed files (a copy of what was added to the index).
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh mtime/size/hash for every file that was collected successfully.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Every path the refresh handled, including deleted and deferred ones.
    pub completed_paths: Vec<PathBuf>,
    /// Counters for the refresh.
    pub summary: RefreshSummary,
}
1272
/// A previously computed vector together with the exact text it was computed from.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Text that was embedded; compared verbatim before the vector is reused.
    embed_text: String,
    /// The cached embedding for `embed_text`.
    vector: Vec<f32>,
}
1278
/// Cached embeddings grouped by file, then by the BLAKE3 hash of their embed text.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1280
/// One search hit; `SemanticIndex::search` fills it from the matched chunk.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched chunk.
    pub file: PathBuf,
    /// Name of the matched chunk.
    pub name: String,
    /// Symbol kind of the matched chunk.
    pub kind: SymbolKind,
    /// First line of the matched chunk.
    pub start_line: u32,
    /// Last line of the matched chunk.
    pub end_line: u32,
    /// Whether the matched chunk is marked as exported.
    pub exported: bool,
    /// Snippet text copied from the chunk.
    pub snippet: String,
    /// Ranking score; `SemanticIndex::search` uses cosine similarity, multiplied by 1.1 for exported chunks.
    pub score: f32,
    /// Label of the producer of this result; `SemanticIndex::search` sets `"semantic"`.
    pub source: &'static str,
}
1294
1295impl SemanticIndex {
    /// Create an empty index rooted at `project_root` that expects vectors of
    /// length `dimension`. No fingerprint is set.
    ///
    /// Debug builds assert that `project_root` is an absolute path.
    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
        debug_assert!(project_root.is_absolute());
        Self {
            entries: Vec::new(),
            file_mtimes: HashMap::new(),
            file_sizes: HashMap::new(),
            file_hashes: HashMap::new(),
            dimension,
            fingerprint: None,
            project_root,
            deferred_files: HashSet::new(),
        }
    }
1309
    /// Number of embedded chunks currently stored in the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1314
    /// Number of files that have a recorded modification time, i.e. count as indexed.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1319
1320 pub fn status_label(&self) -> &'static str {
1322 if self.entries.is_empty() {
1323 "empty"
1324 } else {
1325 "ready"
1326 }
1327 }
1328
1329 fn collect_chunks(
1330 project_root: &Path,
1331 files: &[PathBuf],
1332 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1333 let collect_started = std::time::Instant::now();
1334 let per_file: Vec<(
1335 PathBuf,
1336 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1337 )> = files
1338 .par_iter()
1339 .map_init(HashMap::new, |parsers, file| {
1340 let result = collect_semantic_file(project_root, file, parsers);
1341 (file.clone(), result)
1342 })
1343 .collect();
1344
1345 let mut chunks: Vec<SemanticChunk> = Vec::new();
1346 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1347
1348 for (file, result) in per_file {
1349 match result {
1350 Ok((metadata, file_chunks)) => {
1351 file_metadata.insert(file, metadata);
1352 chunks.extend(file_chunks);
1353 }
1354 Err(error) => {
1355 if error == "unsupported file extension" {
1361 continue;
1362 }
1363 slog_warn!(
1364 "failed to collect semantic chunks for {}: {}",
1365 file.display(),
1366 error
1367 );
1368 }
1369 }
1370 }
1371
1372 slog_info!(
1373 "semantic collect: {} chunks from {} files in {} ms",
1374 chunks.len(),
1375 file_metadata.len(),
1376 collect_started.elapsed().as_millis()
1377 );
1378
1379 (chunks, file_metadata)
1380 }
1381
1382 fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1383 let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1384 let mut reuse_map: ChunkReuseMap = HashMap::new();
1385
1386 for entry in &self.entries {
1387 if !requested.contains(entry.chunk.file.as_path()) {
1388 continue;
1389 }
1390
1391 let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1396 reuse_map
1397 .entry(entry.chunk.file.clone())
1398 .or_default()
1399 .entry(hash)
1400 .or_default()
1401 .push(ReusableEmbedding {
1402 embed_text: entry.chunk.embed_text.clone(),
1403 vector: entry.vector.clone(),
1404 });
1405 }
1406
1407 reuse_map
1408 }
1409
1410 fn reusable_vector_for_chunk(
1411 reuse_map: &ChunkReuseMap,
1412 chunk: &SemanticChunk,
1413 ) -> Option<Vec<f32>> {
1414 let hash = blake3::hash(chunk.embed_text.as_bytes());
1415 reuse_map
1416 .get(&chunk.file)?
1417 .get(&hash)?
1418 .iter()
1419 .find(|candidate| candidate.embed_text == chunk.embed_text)
1420 .map(|candidate| candidate.vector.clone())
1421 }
1422
1423 fn entries_for_chunks_with_reuse<F, P>(
1424 chunks: Vec<SemanticChunk>,
1425 reuse_map: &ChunkReuseMap,
1426 embed_fn: &mut F,
1427 max_batch_size: usize,
1428 initial_observed_dimension: Option<usize>,
1429 refresh_label: &str,
1430 progress: &mut P,
1431 ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1432 where
1433 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1434 P: FnMut(usize, usize),
1435 {
1436 let total_chunks = chunks.len();
1437 progress(0, total_chunks);
1438
1439 let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1440 let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1441
1442 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1443 if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1444 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1445 } else {
1446 misses.push((chunk_index, chunk));
1447 }
1448 }
1449
1450 let mut completed = total_chunks.saturating_sub(misses.len());
1451 if completed > 0 {
1452 progress(completed, total_chunks);
1453 }
1454
1455 let batch_size = max_batch_size.max(1);
1456 let mut observed_dimension = initial_observed_dimension;
1457
1458 for batch_start in (0..misses.len()).step_by(batch_size) {
1459 let batch_end = (batch_start + batch_size).min(misses.len());
1460 let batch_texts: Vec<String> = misses[batch_start..batch_end]
1461 .iter()
1462 .map(|(_, chunk)| chunk.embed_text.clone())
1463 .collect();
1464
1465 let vectors = embed_fn(batch_texts)?;
1466 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1467
1468 if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1469 match observed_dimension {
1470 None => observed_dimension = Some(dim),
1471 Some(expected) if dim != expected => {
1472 return Err(format!(
1473 "embedding dimension changed during {refresh_label}: \
1474 cached index uses {expected}, new vectors use {dim}"
1475 ));
1476 }
1477 _ => {}
1478 }
1479 }
1480
1481 for (i, vector) in vectors.into_iter().enumerate() {
1482 let (chunk_index, chunk) = misses[batch_start + i].clone();
1483 entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1484 }
1485
1486 completed += batch_end - batch_start;
1487 progress(completed, total_chunks);
1488 }
1489
1490 let entries = entries_by_chunk
1491 .into_iter()
1492 .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1493 .collect();
1494
1495 Ok((entries, observed_dimension))
1496 }
1497
1498 fn build_from_chunks<F, P>(
1499 project_root: &Path,
1500 chunks: Vec<SemanticChunk>,
1501 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1502 embed_fn: &mut F,
1503 max_batch_size: usize,
1504 mut progress: Option<&mut P>,
1505 ) -> Result<Self, String>
1506 where
1507 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1508 P: FnMut(usize, usize),
1509 {
1510 debug_assert!(project_root.is_absolute());
1511 let total_chunks = chunks.len();
1512
1513 if chunks.is_empty() {
1514 return Ok(Self {
1515 entries: Vec::new(),
1516 file_mtimes: file_metadata
1517 .iter()
1518 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1519 .collect(),
1520 file_sizes: file_metadata
1521 .iter()
1522 .map(|(path, metadata)| (path.clone(), metadata.size))
1523 .collect(),
1524 file_hashes: file_metadata
1525 .into_iter()
1526 .map(|(path, metadata)| (path, metadata.content_hash))
1527 .collect(),
1528 dimension: DEFAULT_DIMENSION,
1529 fingerprint: None,
1530 project_root: project_root.to_path_buf(),
1531 deferred_files: HashSet::new(),
1532 });
1533 }
1534
1535 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1537 let mut expected_dimension: Option<usize> = None;
1538 let batch_size = max_batch_size.max(1);
1539 let embed_started = std::time::Instant::now();
1540 let batch_count = total_chunks.div_ceil(batch_size);
1541 for batch_start in (0..chunks.len()).step_by(batch_size) {
1542 let batch_end = (batch_start + batch_size).min(chunks.len());
1543 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1544 .iter()
1545 .map(|c| c.embed_text.clone())
1546 .collect();
1547
1548 let vectors = embed_fn(batch_texts)?;
1549 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1550
1551 if let Some(dim) = vectors.first().map(|v| v.len()) {
1553 match expected_dimension {
1554 None => expected_dimension = Some(dim),
1555 Some(expected) if dim != expected => {
1556 return Err(format!(
1557 "embedding dimension changed across batches: expected {expected}, got {dim}"
1558 ));
1559 }
1560 _ => {}
1561 }
1562 }
1563
1564 for (i, vector) in vectors.into_iter().enumerate() {
1565 let chunk_idx = batch_start + i;
1566 entries.push(EmbeddingEntry {
1567 chunk: chunks[chunk_idx].clone(),
1568 vector,
1569 });
1570 }
1571
1572 if let Some(callback) = progress.as_mut() {
1573 callback(entries.len(), total_chunks);
1574 }
1575 }
1576
1577 let embed_ms = embed_started.elapsed().as_millis();
1578 let rate = (total_chunks as u128 * 1000)
1579 .checked_div(embed_ms)
1580 .unwrap_or(0) as u64;
1581 slog_info!(
1582 "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1583 total_chunks,
1584 batch_count,
1585 embed_ms,
1586 rate
1587 );
1588
1589 let dimension = entries
1590 .first()
1591 .map(|e| e.vector.len())
1592 .unwrap_or(DEFAULT_DIMENSION);
1593
1594 Ok(Self {
1595 entries,
1596 file_mtimes: file_metadata
1597 .iter()
1598 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1599 .collect(),
1600 file_sizes: file_metadata
1601 .iter()
1602 .map(|(path, metadata)| (path.clone(), metadata.size))
1603 .collect(),
1604 file_hashes: file_metadata
1605 .into_iter()
1606 .map(|(path, metadata)| (path, metadata.content_hash))
1607 .collect(),
1608 dimension,
1609 fingerprint: None,
1610 project_root: project_root.to_path_buf(),
1611 deferred_files: HashSet::new(),
1612 })
1613 }
1614
1615 pub fn build<F>(
1618 project_root: &Path,
1619 files: &[PathBuf],
1620 embed_fn: &mut F,
1621 max_batch_size: usize,
1622 ) -> Result<Self, String>
1623 where
1624 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1625 {
1626 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1627 Self::build_from_chunks(
1628 project_root,
1629 chunks,
1630 file_mtimes,
1631 embed_fn,
1632 max_batch_size,
1633 Option::<&mut fn(usize, usize)>::None,
1634 )
1635 }
1636
1637 pub fn build_with_progress<F, P>(
1639 project_root: &Path,
1640 files: &[PathBuf],
1641 embed_fn: &mut F,
1642 max_batch_size: usize,
1643 progress: &mut P,
1644 ) -> Result<Self, String>
1645 where
1646 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1647 P: FnMut(usize, usize),
1648 {
1649 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1650 let total_chunks = chunks.len();
1651 progress(0, total_chunks);
1652 Self::build_from_chunks(
1653 project_root,
1654 chunks,
1655 file_mtimes,
1656 embed_fn,
1657 max_batch_size,
1658 Some(progress),
1659 )
1660 }
1661
    /// Bring the index in line with `current_files` without a full rebuild.
    ///
    /// Indexed files missing from `current_files` are dropped, indexed files whose
    /// cached freshness data is missing or reported stale are re-collected and
    /// re-embedded (reusing vectors for unchanged chunk text), and files not yet
    /// indexed are collected and embedded.
    ///
    /// * `project_root` - root passed through to chunk collection.
    /// * `current_files` - the full set of files that should be indexed.
    /// * `embed_fn` - embeds one batch of texts.
    /// * `max_batch_size` - upper bound on texts per `embed_fn` call.
    /// * `progress` - called with `(completed, total)`; `(0, 0)` when nothing is embedded.
    ///
    /// Returns counts of changed, added and deleted files; `changed`/`added` only
    /// count files that were collected successfully.
    ///
    /// # Errors
    /// Propagates embedding errors. Deletions performed before embedding are not
    /// rolled back in that case.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Deferred files that are no longer part of the project are forgotten.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // Size of the union of current and indexed files.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classification of one indexed file.
        enum IndexedFileCheck {
            Deleted(PathBuf),
            MissingMetadata(PathBuf),
            Verified(PathBuf, FreshnessVerdict),
        }

        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        // `None` slots are placeholders filled in by the strict verification below.
        let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
        let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();

        for indexed_path in indexed_paths {
            let check_index = checks.len();
            if !current_set.contains(indexed_path.as_path()) {
                checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
                continue;
            }
            // A file can only be verified when mtime, size and hash are all cached.
            let cached = match (
                self.file_mtimes.get(&indexed_path),
                self.file_sizes.get(&indexed_path),
                self.file_hashes.get(&indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            if let Some(freshness) = cached {
                strict_verify_inputs.push((check_index, indexed_path, freshness));
                checks.push(None);
            } else {
                checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
            }
        }

        for (check_index, path, verdict) in
            cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
        {
            checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
        }

        for check in checks {
            match check.expect("strict freshness check should be populated") {
                IndexedFileCheck::Deleted(path) => deleted.push(path),
                IndexedFileCheck::MissingMetadata(path) => changed.push(path),
                IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
                // Content is unchanged: only refresh the cached mtime and size.
                IndexedFileCheck::Verified(
                    path,
                    FreshnessVerdict::ContentFresh {
                        new_mtime,
                        new_size,
                    },
                ) => {
                    self.file_mtimes.insert(path.clone(), new_mtime);
                    self.file_sizes.insert(path, new_size);
                }
                IndexedFileCheck::Verified(
                    path,
                    FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
                ) => {
                    changed.push(path);
                }
            }
        }

        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // Built before any entry is replaced so old vectors remain available for reuse.
        let reuse_map = self.build_chunk_reuse_map(&changed);
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        // Changed files that could not be collected and no longer exist are treated as deleted.
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        if chunks.is_empty() {
            // Nothing to embed, but successfully collected files still replace
            // their old entries and metadata.
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // With no entries left there is no dimension to enforce on new vectors.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            existing_dimension,
            "incremental refresh",
            progress,
        )?;

        // Old entries are only dropped once embedding has succeeded.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1904
    /// Re-index a specific set of invalidated `paths`, plus any files deferred by
    /// an earlier call.
    ///
    /// The requested files are first removed from the index, then the ones that
    /// still exist are collected and embedded again (reusing vectors for unchanged
    /// chunk text). Files that were not indexed before are only admitted while the
    /// number of indexed files stays within `max_files`; the rest are remembered
    /// as deferred.
    ///
    /// * `project_root` - root passed through to chunk collection.
    /// * `paths` - files reported as invalidated.
    /// * `embed_fn` - embeds one batch of texts.
    /// * `max_batch_size` - upper bound on texts per `embed_fn` call.
    /// * `max_files` - cap on the number of indexed files.
    /// * `progress` - called with `(completed, total)`; `(0, 0)` when nothing is embedded.
    ///
    /// Returns the new entries, the fresh metadata, the handled paths and a summary.
    ///
    /// # Errors
    /// Propagates embedding errors. The requested files have already been removed
    /// from the index at that point.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Deferred files that no longer exist are forgotten; the rest are retried now.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Both snapshots must be taken before the files are removed from the index.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();
        let reuse_map = self.build_chunk_reuse_map(&requested_paths);

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        // Only files that were indexed before count as deleted.
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Room left for brand-new files: the cap minus the files still indexed
        // and the previously indexed files that are coming back.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // `existing_paths` is sorted, so the files admitted under the cap are
            // the first ones in path order.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        if chunks.is_empty() {
            // Nothing to embed; still record the fresh metadata for collected files.
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // The stored dimension is only ignored when the index is empty and none
        // of the requested files had been indexed before.
        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
        {
            None
        } else {
            Some(self.dimension)
        };
        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
            chunks,
            &reuse_map,
            embed_fn,
            max_batch_size,
            initial_observed_dimension,
            "invalidated-file refresh",
            progress,
        )?;

        // The caller receives its own copy of the new entries.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
2115
2116 pub fn apply_refresh_update(
2117 &mut self,
2118 added_entries: Vec<EmbeddingEntry>,
2119 updated_metadata: Vec<(PathBuf, FileFreshness)>,
2120 completed_paths: &[PathBuf],
2121 ) {
2122 self.remove_indexed_files(completed_paths);
2126
2127 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2128 self.entries.extend(added_entries);
2129 for (file, freshness) in updated_metadata {
2130 self.file_mtimes.insert(file.clone(), freshness.mtime);
2131 self.file_sizes.insert(file.clone(), freshness.size);
2132 self.file_hashes.insert(file, freshness.content_hash);
2133 }
2134 if let Some(dim) = observed_dimension {
2135 self.dimension = dim;
2136 }
2137 }
2138
2139 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2140 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2141 self.entries
2142 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2143 for path in files {
2144 self.file_mtimes.remove(path);
2145 self.file_sizes.remove(path);
2146 self.file_hashes.remove(path);
2147 }
2148 }
2149
2150 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2152 if self.entries.is_empty() || query_vector.len() != self.dimension {
2153 return Vec::new();
2154 }
2155
2156 let mut scored: Vec<(f32, usize)> = self
2157 .entries
2158 .iter()
2159 .enumerate()
2160 .map(|(i, entry)| {
2161 let mut score = cosine_similarity(query_vector, &entry.vector);
2162 if entry.chunk.exported {
2163 score *= 1.1;
2164 }
2165 (score, i)
2166 })
2167 .collect();
2168
2169 let keep = top_k.min(scored.len());
2170 if keep == 0 {
2171 return Vec::new();
2172 }
2173
2174 if keep < scored.len() {
2175 scored.select_nth_unstable_by(keep, semantic_score_order);
2176 scored.truncate(keep);
2177 }
2178 scored.sort_by(semantic_score_order);
2179
2180 scored
2181 .into_iter()
2182 .map(|(score, idx)| {
2186 let entry = &self.entries[idx];
2187 SemanticResult {
2188 file: entry.chunk.file.clone(),
2189 name: entry.chunk.name.clone(),
2190 kind: entry.chunk.kind.clone(),
2191 start_line: entry.chunk.start_line,
2192 end_line: entry.chunk.end_line,
2193 exported: entry.chunk.exported,
2194 snippet: entry.chunk.snippet.clone(),
2195 score,
2196 source: "semantic",
2197 }
2198 })
2199 .collect()
2200 }
2201
    /// Number of entries in the index (same value as `entry_count`).
    pub fn len(&self) -> usize {
        self.entries.len()
    }
2206
2207 pub fn is_file_stale(&self, file: &Path) -> bool {
2209 let Some(stored_mtime) = self.file_mtimes.get(file) else {
2210 return true;
2211 };
2212 let Some(stored_size) = self.file_sizes.get(file) else {
2213 return true;
2214 };
2215 let Some(stored_hash) = self.file_hashes.get(file) else {
2216 return true;
2217 };
2218 let cached = FileFreshness {
2219 mtime: *stored_mtime,
2220 size: *stored_size,
2221 content_hash: *stored_hash,
2222 };
2223 match cache_freshness::verify_file_strict(file, &cached) {
2224 FreshnessVerdict::HotFresh => false,
2225 FreshnessVerdict::ContentFresh { .. } => false,
2226 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2227 }
2228 }
2229
2230 fn backfill_missing_file_sizes(&mut self) {
2231 for path in self.file_mtimes.keys() {
2232 if self.file_sizes.contains_key(path) {
2233 continue;
2234 }
2235 if let Ok(metadata) = fs::metadata(path) {
2236 self.file_sizes.insert(path.clone(), metadata.len());
2237 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2238 self.file_hashes.insert(path.clone(), hash);
2239 }
2240 }
2241 }
2242 }
2243
    /// Remove `file` from the index; delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
2248
2249 pub fn invalidate_file(&mut self, file: &Path) {
2250 let canonical_file = canonicalize_existing_or_deleted_path(file);
2251 self.entries
2252 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2253 self.file_mtimes.remove(file);
2254 self.file_sizes.remove(file);
2255 self.file_hashes.remove(file);
2256 if canonical_file.as_path() != file {
2257 self.file_mtimes.remove(&canonical_file);
2258 self.file_sizes.remove(&canonical_file);
2259 self.file_hashes.remove(&canonical_file);
2260 }
2261 }
2262
    /// Length of the vectors this index stores; `search` returns nothing for
    /// query vectors of a different length.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2267
    /// Fingerprint recorded for this index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2271
2272 pub fn backend_label(&self) -> Option<&str> {
2273 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2274 }
2275
2276 pub fn model_label(&self) -> Option<&str> {
2277 self.fingerprint.as_ref().map(|f| f.model.as_str())
2278 }
2279
    /// Record `fingerprint` for this index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2283
2284 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2286 if self.entries.is_empty() {
2289 slog_info!("skipping semantic index persistence (0 entries)");
2290 return;
2291 }
2292 let dir = storage_dir.join("semantic").join(project_key);
2293 if let Err(e) = fs::create_dir_all(&dir) {
2294 slog_warn!("failed to create semantic cache dir: {}", e);
2295 return;
2296 }
2297 let data_path = dir.join("semantic.bin");
2298 let tmp_path = dir.join(format!(
2299 "semantic.bin.tmp.{}.{}",
2300 std::process::id(),
2301 SystemTime::now()
2302 .duration_since(SystemTime::UNIX_EPOCH)
2303 .unwrap_or(Duration::ZERO)
2304 .as_nanos()
2305 ));
2306 let write_result = (|| -> io::Result<usize> {
2307 let file = fs::File::create(&tmp_path)?;
2308 let mut writer = BufWriter::new(file);
2309 let bytes_written = self.write_to_writer(&mut writer)?;
2310 writer.flush()?;
2311 writer.get_ref().sync_all()?;
2312 Ok(bytes_written)
2313 })();
2314 let bytes_written = match write_result {
2315 Ok(bytes_written) => bytes_written,
2316 Err(e) => {
2317 slog_warn!("failed to write semantic index: {}", e);
2318 let _ = fs::remove_file(&tmp_path);
2319 return;
2320 }
2321 };
2322 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2323 slog_warn!("failed to rename semantic index: {}", e);
2324 let _ = fs::remove_file(&tmp_path);
2325 return;
2326 }
2327 slog_info!(
2328 "semantic index persisted: {} entries, {:.1} KB",
2329 self.entries.len(),
2330 bytes_written as f64 / 1024.0
2331 );
2332 }
2333
2334 pub fn read_from_disk(
2336 storage_dir: &Path,
2337 project_key: &str,
2338 current_canonical_root: &Path,
2339 is_worktree_bridge: bool,
2340 expected_fingerprint: Option<&str>,
2341 ) -> Option<Self> {
2342 debug_assert!(current_canonical_root.is_absolute());
2343 let data_path = storage_dir
2344 .join("semantic")
2345 .join(project_key)
2346 .join("semantic.bin");
2347 let file = fs::File::open(&data_path).ok()?;
2348 let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2349 if file_len < HEADER_BYTES_V1 {
2350 slog_warn!(
2351 "corrupt semantic index (too small: {} bytes), removing",
2352 file_len
2353 );
2354 if !is_worktree_bridge {
2355 let _ = fs::remove_file(&data_path);
2356 }
2357 return None;
2358 }
2359
2360 let mut reader = BufReader::new(file);
2361 let mut version_buf = [0u8; 1];
2362 reader.read_exact(&mut version_buf).ok()?;
2363 let version = version_buf[0];
2364 if version != SEMANTIC_INDEX_VERSION_V6 {
2365 slog_info!(
2366 "cached semantic index version {} is older than {}, rebuilding",
2367 version,
2368 SEMANTIC_INDEX_VERSION_V6
2369 );
2370 if !is_worktree_bridge {
2371 let _ = fs::remove_file(&data_path);
2372 }
2373 return None;
2374 }
2375 match Self::from_reader_after_version(
2376 reader,
2377 version,
2378 current_canonical_root,
2379 Some(file_len),
2380 1,
2381 ) {
2382 Ok(index) => {
2383 if index.entries.is_empty() {
2384 slog_info!("cached semantic index is empty, will rebuild");
2385 if !is_worktree_bridge {
2386 let _ = fs::remove_file(&data_path);
2387 }
2388 return None;
2389 }
2390 if let Some(expected) = expected_fingerprint {
2391 let matches = index
2392 .fingerprint()
2393 .map(|fingerprint| fingerprint.matches_expected(expected))
2394 .unwrap_or(false);
2395 if !matches {
2396 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2397 if !is_worktree_bridge {
2398 let _ = fs::remove_file(&data_path);
2399 }
2400 return None;
2401 }
2402 }
2403 slog_info!(
2404 "loaded semantic index from disk: {} entries",
2405 index.entries.len()
2406 );
2407 Some(index)
2408 }
2409 Err(e) => {
2410 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2411 if !is_worktree_bridge {
2412 let _ = fs::remove_file(&data_path);
2413 }
2414 None
2415 }
2416 }
2417 }
2418
2419 pub fn to_bytes(&self) -> Vec<u8> {
2421 let mut buf = Vec::new();
2422 self.write_to_writer(&mut buf)
2423 .expect("writing semantic index to Vec cannot fail");
2424 buf
2425 }
2426
2427 fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2428 let mut bytes_written = 0usize;
2429 let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2430 let encoded = fingerprint.as_string();
2431 if encoded.is_empty() {
2432 None
2433 } else {
2434 Some(encoded)
2435 }
2436 });
2437 let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2438 let file_mtime_count = self
2439 .file_mtimes
2440 .iter()
2441 .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2442 .count();
2443 let entry_count = self
2444 .entries
2445 .iter()
2446 .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2447 .count();
2448
2449 let version = SEMANTIC_INDEX_VERSION_V6;
2462 write_counted(writer, &[version], &mut bytes_written)?;
2463 write_counted(
2464 writer,
2465 &(self.dimension as u32).to_le_bytes(),
2466 &mut bytes_written,
2467 )?;
2468 write_counted(
2469 writer,
2470 &(entry_count as u32).to_le_bytes(),
2471 &mut bytes_written,
2472 )?;
2473 write_counted(
2474 writer,
2475 &(fp_bytes_ref.len() as u32).to_le_bytes(),
2476 &mut bytes_written,
2477 )?;
2478 write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2479
2480 write_counted(
2483 writer,
2484 &(file_mtime_count as u32).to_le_bytes(),
2485 &mut bytes_written,
2486 )?;
2487 for (path, mtime) in &self.file_mtimes {
2488 let Some(relative) = cache_relative_path(&self.project_root, path) else {
2489 continue;
2490 };
2491 let relative = relative.to_string_lossy();
2492 let path_bytes = relative.as_bytes();
2493 write_counted(
2494 writer,
2495 &(path_bytes.len() as u32).to_le_bytes(),
2496 &mut bytes_written,
2497 )?;
2498 write_counted(writer, path_bytes, &mut bytes_written)?;
2499 let duration = mtime
2500 .duration_since(SystemTime::UNIX_EPOCH)
2501 .unwrap_or_default();
2502 write_counted(
2503 writer,
2504 &duration.as_secs().to_le_bytes(),
2505 &mut bytes_written,
2506 )?;
2507 write_counted(
2508 writer,
2509 &duration.subsec_nanos().to_le_bytes(),
2510 &mut bytes_written,
2511 )?;
2512 let size = self.file_sizes.get(path).copied().unwrap_or_default();
2513 write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2514 let hash = self
2515 .file_hashes
2516 .get(path)
2517 .copied()
2518 .unwrap_or_else(cache_freshness::zero_hash);
2519 write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2520 }
2521
2522 for entry in &self.entries {
2524 let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2525 continue;
2526 };
2527 let c = &entry.chunk;
2528
2529 let relative = relative.to_string_lossy();
2531 let file_bytes = relative.as_bytes();
2532 write_counted(
2533 writer,
2534 &(file_bytes.len() as u32).to_le_bytes(),
2535 &mut bytes_written,
2536 )?;
2537 write_counted(writer, file_bytes, &mut bytes_written)?;
2538
2539 let name_bytes = c.name.as_bytes();
2541 write_counted(
2542 writer,
2543 &(name_bytes.len() as u32).to_le_bytes(),
2544 &mut bytes_written,
2545 )?;
2546 write_counted(writer, name_bytes, &mut bytes_written)?;
2547
2548 write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2550
2551 write_counted(
2553 writer,
2554 &(c.start_line as u32).to_le_bytes(),
2555 &mut bytes_written,
2556 )?;
2557 write_counted(
2558 writer,
2559 &(c.end_line as u32).to_le_bytes(),
2560 &mut bytes_written,
2561 )?;
2562 write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2563
2564 let snippet_bytes = c.snippet.as_bytes();
2566 write_counted(
2567 writer,
2568 &(snippet_bytes.len() as u32).to_le_bytes(),
2569 &mut bytes_written,
2570 )?;
2571 write_counted(writer, snippet_bytes, &mut bytes_written)?;
2572
2573 let embed_bytes = c.embed_text.as_bytes();
2575 write_counted(
2576 writer,
2577 &(embed_bytes.len() as u32).to_le_bytes(),
2578 &mut bytes_written,
2579 )?;
2580 write_counted(writer, embed_bytes, &mut bytes_written)?;
2581
2582 for &val in &entry.vector {
2584 write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2585 }
2586 }
2587
2588 Ok(bytes_written)
2589 }
2590
2591 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2593 debug_assert!(current_canonical_root.is_absolute());
2594 if data.len() < HEADER_BYTES_V1 {
2595 return Err("data too short".to_string());
2596 }
2597
2598 Self::from_reader_after_version(
2599 Cursor::new(&data[1..]),
2600 data[0],
2601 current_canonical_root,
2602 Some(data.len()),
2603 1,
2604 )
2605 }
2606
2607 fn from_reader_after_version<R: Read>(
2608 reader: R,
2609 version: u8,
2610 current_canonical_root: &Path,
2611 total_len: Option<usize>,
2612 bytes_read: usize,
2613 ) -> Result<Self, String> {
2614 debug_assert!(current_canonical_root.is_absolute());
2615 let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2616
2617 if version != SEMANTIC_INDEX_VERSION_V1
2618 && version != SEMANTIC_INDEX_VERSION_V2
2619 && version != SEMANTIC_INDEX_VERSION_V3
2620 && version != SEMANTIC_INDEX_VERSION_V4
2621 && version != SEMANTIC_INDEX_VERSION_V5
2622 && version != SEMANTIC_INDEX_VERSION_V6
2623 {
2624 return Err(format!("unsupported version: {}", version));
2625 }
2626 if (version == SEMANTIC_INDEX_VERSION_V2
2630 || version == SEMANTIC_INDEX_VERSION_V3
2631 || version == SEMANTIC_INDEX_VERSION_V4
2632 || version == SEMANTIC_INDEX_VERSION_V5
2633 || version == SEMANTIC_INDEX_VERSION_V6)
2634 && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2635 {
2636 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2637 }
2638
2639 let dimension = read_u32_stream(&mut reader)? as usize;
2640 let entry_count = read_u32_stream(&mut reader)? as usize;
2641 validate_embedding_dimension(dimension)?;
2642 if entry_count > MAX_ENTRIES {
2643 return Err(format!("too many semantic index entries: {}", entry_count));
2644 }
2645
2646 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2652 || version == SEMANTIC_INDEX_VERSION_V3
2653 || version == SEMANTIC_INDEX_VERSION_V4
2654 || version == SEMANTIC_INDEX_VERSION_V5
2655 || version == SEMANTIC_INDEX_VERSION_V6;
2656 let fingerprint = if has_fingerprint_field {
2657 let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2658 if total_len
2659 .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2660 {
2661 return Err("unexpected end of data reading fingerprint".to_string());
2662 }
2663 if fingerprint_len == 0 {
2664 None
2665 } else {
2666 let mut raw = vec![0u8; fingerprint_len];
2667 read_exact_stream(
2668 &mut reader,
2669 &mut raw,
2670 "unexpected end of data reading fingerprint",
2671 )?;
2672 let raw = String::from_utf8_lossy(&raw).to_string();
2673 Some(
2674 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2675 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2676 )
2677 }
2678 } else {
2679 None
2680 };
2681
2682 let mtime_count = read_u32_stream(&mut reader)? as usize;
2684 if mtime_count > MAX_ENTRIES {
2685 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2686 }
2687
2688 let vector_bytes = entry_count
2689 .checked_mul(dimension)
2690 .and_then(|count| count.checked_mul(F32_BYTES))
2691 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2692 if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2693 return Err("semantic index vectors exceed available data".to_string());
2694 }
2695
2696 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2697 let mut file_sizes = HashMap::with_capacity(mtime_count);
2698 let mut file_hashes = HashMap::with_capacity(mtime_count);
2699 for _ in 0..mtime_count {
2700 let path = read_string_stream(&mut reader, total_len)?;
2701 let secs = read_u64_stream(&mut reader)?;
2702 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2708 || version == SEMANTIC_INDEX_VERSION_V4
2709 || version == SEMANTIC_INDEX_VERSION_V5
2710 || version == SEMANTIC_INDEX_VERSION_V6
2711 {
2712 read_u32_stream(&mut reader)?
2713 } else {
2714 0
2715 };
2716 let size =
2717 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2718 read_u64_stream(&mut reader)?
2719 } else {
2720 0
2721 };
2722 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2723 let mut hash_bytes = [0u8; 32];
2724 read_exact_stream(
2725 &mut reader,
2726 &mut hash_bytes,
2727 "unexpected end of data reading content hash",
2728 )?;
2729 blake3::Hash::from_bytes(hash_bytes)
2730 } else {
2731 cache_freshness::zero_hash()
2732 };
2733 if nanos >= 1_000_000_000 {
2740 return Err(format!(
2741 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2742 nanos
2743 ));
2744 }
2745 let duration = std::time::Duration::new(secs, nanos);
2746 let mtime = SystemTime::UNIX_EPOCH
2747 .checked_add(duration)
2748 .ok_or_else(|| {
2749 format!(
2750 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2751 secs, nanos
2752 )
2753 })?;
2754 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2755 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2756 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2757 } else {
2758 PathBuf::from(path)
2759 };
2760 file_mtimes.insert(path.clone(), mtime);
2761 file_sizes.insert(path.clone(), size);
2762 file_hashes.insert(path, content_hash);
2763 }
2764
2765 let mut entries = Vec::with_capacity(entry_count);
2767 for _ in 0..entry_count {
2768 let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2769 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2770 cached_path_under_root(current_canonical_root, &raw_file)
2771 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2772 } else {
2773 raw_file
2774 };
2775 let name = read_string_stream(&mut reader, total_len)?;
2776
2777 let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2778
2779 let start_line = read_u32_stream(&mut reader)?;
2780 let end_line = read_u32_stream(&mut reader)?;
2781
2782 let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2783
2784 let snippet = read_string_stream(&mut reader, total_len)?;
2785 let embed_text = read_string_stream(&mut reader, total_len)?;
2786
2787 let vec_bytes = dimension
2789 .checked_mul(F32_BYTES)
2790 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2791 if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2792 return Err("unexpected end of data reading vector".to_string());
2793 }
2794 let mut vector = Vec::with_capacity(dimension);
2795 for _ in 0..dimension {
2796 let mut bytes = [0u8; F32_BYTES];
2797 read_exact_stream(
2798 &mut reader,
2799 &mut bytes,
2800 "unexpected end of data reading vector",
2801 )?;
2802 vector.push(f32::from_le_bytes(bytes));
2803 }
2804
2805 entries.push(EmbeddingEntry {
2806 chunk: SemanticChunk {
2807 file,
2808 name,
2809 kind,
2810 start_line,
2811 end_line,
2812 exported,
2813 embed_text,
2814 snippet,
2815 },
2816 vector,
2817 });
2818 }
2819
2820 if entries.len() != entry_count {
2821 return Err(format!(
2822 "semantic cache entry count drift: header={} decoded={}",
2823 entry_count,
2824 entries.len()
2825 ));
2826 }
2827 for entry in &entries {
2828 if !file_mtimes.contains_key(&entry.chunk.file) {
2829 return Err(format!(
2830 "semantic cache metadata missing for entry file {}",
2831 entry.chunk.file.display()
2832 ));
2833 }
2834 }
2835
2836 Ok(Self {
2837 entries,
2838 file_mtimes,
2839 file_sizes,
2840 file_hashes,
2841 dimension,
2842 fingerprint,
2843 project_root: current_canonical_root.to_path_buf(),
2844 deferred_files: HashSet::new(),
2845 })
2846 }
2847}
2848
/// Writes all of `bytes` to `writer` and, on success, adds their length to
/// `bytes_written` (saturating at `usize::MAX`).
///
/// The counter is left untouched when the write fails.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2858
/// `Read` adapter that keeps a running total of the bytes pulled through it.
struct CountingReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Running byte total, including the initial value supplied at construction.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the tally at `bytes_read` rather than zero.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Returns the current byte tally.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Delegates to the wrapped reader and adds the number of bytes it
    /// returned to the tally (saturating). Errors leave the tally unchanged.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let Self { inner, bytes_read } = self;
        inner.read(buf).map(|count| {
            *bytes_read = bytes_read.saturating_add(count);
            count
        })
    }
}
2881
2882fn read_exact_stream<R: Read>(
2883 reader: &mut CountingReader<R>,
2884 buf: &mut [u8],
2885 eof_message: &'static str,
2886) -> Result<(), String> {
2887 reader.read_exact(buf).map_err(|error| {
2888 if error.kind() == io::ErrorKind::UnexpectedEof {
2889 eof_message.to_string()
2890 } else {
2891 format!("{eof_message}: {error}")
2892 }
2893 })
2894}
2895
2896fn read_u8_stream<R: Read>(
2897 reader: &mut CountingReader<R>,
2898 eof_message: &'static str,
2899) -> Result<u8, String> {
2900 let mut bytes = [0u8; 1];
2901 read_exact_stream(reader, &mut bytes, eof_message)?;
2902 Ok(bytes[0])
2903}
2904
2905fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2906 let mut bytes = [0u8; 4];
2907 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2908 Ok(u32::from_le_bytes(bytes))
2909}
2910
2911fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2912 let mut bytes = [0u8; 8];
2913 read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2914 Ok(u64::from_le_bytes(bytes))
2915}
2916
2917fn read_string_stream<R: Read>(
2918 reader: &mut CountingReader<R>,
2919 total_len: Option<usize>,
2920) -> Result<String, String> {
2921 let len = read_u32_stream(reader)? as usize;
2922 if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2923 return Err("unexpected end of data reading string".to_string());
2924 }
2925 let mut bytes = vec![0u8; len];
2926 read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2927 Ok(String::from_utf8_lossy(&bytes).to_string())
2928}
2929
/// Pre-split view of a source text: its lines plus the byte offset at which
/// each line begins.
struct SourceLineCache<'a> {
    /// Lines as produced by `str::lines` (terminators removed).
    lines: Vec<&'a str>,
    /// Byte offset of the start of each entry in `lines`.
    line_starts: Vec<usize>,
}

impl<'a> SourceLineCache<'a> {
    /// Splits `source` into lines and records where each one starts.
    fn new(source: &'a str) -> Self {
        let raw = source.as_bytes();
        let lines: Vec<&'a str> = source.lines().collect();

        // Walk the text once, stepping over each line and whichever
        // terminator ("\r\n" or "\n") follows it.
        let mut cursor = 0usize;
        let line_starts: Vec<usize> = lines
            .iter()
            .map(|line| {
                let begin = cursor;
                cursor += line.len();
                cursor += match raw.get(cursor..) {
                    Some([b'\r', b'\n', ..]) => 2,
                    Some([b'\n', ..]) => 1,
                    _ => 0,
                };
                begin
            })
            .collect();

        Self { lines, line_starts }
    }

    /// Number of lines in the cache.
    fn len(&self) -> usize {
        debug_assert_eq!(self.lines.len(), self.line_starts.len());
        self.line_starts.len()
    }
}
2958
2959fn build_embed_text_with_lines(
2961 symbol: &Symbol,
2962 line_cache: &SourceLineCache<'_>,
2963 file: &Path,
2964 project_root: &Path,
2965) -> String {
2966 let relative = file
2967 .strip_prefix(project_root)
2968 .unwrap_or(file)
2969 .to_string_lossy();
2970
2971 let kind_label = match &symbol.kind {
2972 SymbolKind::Function => "function",
2973 SymbolKind::Class => "class",
2974 SymbolKind::Method => "method",
2975 SymbolKind::Struct => "struct",
2976 SymbolKind::Interface => "interface",
2977 SymbolKind::Enum => "enum",
2978 SymbolKind::TypeAlias => "type",
2979 SymbolKind::Variable => "variable",
2980 SymbolKind::Heading => "heading",
2981 SymbolKind::FileSummary => "file-summary",
2982 };
2983
2984 let name = &symbol.name;
2986 let mut text = format!(
2987 "name:{name} file:{} kind:{} name:{name}",
2988 relative, kind_label
2989 );
2990
2991 if let Some(sig) = &symbol.signature {
2992 text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
3000 }
3001
3002 let start = (symbol.range.start_line as usize).min(line_cache.len());
3004 let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3006 if start < end {
3007 let body: String = line_cache.lines[start..end]
3008 .iter()
3009 .take(15) .copied()
3011 .collect::<Vec<&str>>()
3012 .join("\n");
3013 let snippet = if body.len() > 300 {
3014 format!("{}...", &body[..body.floor_char_boundary(300)])
3015 } else {
3016 body
3017 };
3018 text.push_str(&format!(" body:{}", snippet));
3019 }
3020
3021 truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
3026}
3027
/// Test-only convenience wrapper: splits `source` into lines and delegates to
/// `build_embed_text_with_lines`.
#[cfg(test)]
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    let lines = SourceLineCache::new(source);
    build_embed_text_with_lines(symbol, &lines, file, project_root)
}
3033
/// Maximum number of `char`s kept in a chunk's embed text; the embed-text
/// builders pass it to `truncate_chars` as the final cap.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
3038
/// Returns at most the first `max_chars` characters (Unicode scalar values)
/// of `value` as an owned string.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars`, if there is
    // one, is exactly where the kept prefix ends.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
3042
3043fn first_leading_doc_comment(line_cache: &SourceLineCache<'_>) -> String {
3044 let Some((start, first)) = line_cache
3045 .lines
3046 .iter()
3047 .enumerate()
3048 .find(|(_, line)| !line.trim().is_empty())
3049 else {
3050 return String::new();
3051 };
3052
3053 let trimmed = first.trim_start();
3054 if trimmed.starts_with("/**") {
3055 let mut comment = Vec::new();
3056 for line in line_cache.lines.iter().skip(start) {
3057 comment.push(*line);
3058 if line.contains("*/") {
3059 break;
3060 }
3061 }
3062 return truncate_chars(&comment.join("\n"), 200);
3063 }
3064
3065 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3066 let comment = line_cache
3067 .lines
3068 .iter()
3069 .skip(start)
3070 .take_while(|line| {
3071 let trimmed = line.trim_start();
3072 trimmed.starts_with("///") || trimmed.starts_with("//!")
3073 })
3074 .copied()
3075 .collect::<Vec<_>>()
3076 .join("\n");
3077 return truncate_chars(&comment, 200);
3078 }
3079
3080 String::new()
3081}
3082
3083pub fn build_file_summary_chunk(
3084 file: &Path,
3085 project_root: &Path,
3086 source: &str,
3087 top_exports: &[&str],
3088 top_export_signatures: &[Option<&str>],
3089) -> SemanticChunk {
3090 let line_cache = SourceLineCache::new(source);
3091 build_file_summary_chunk_with_lines(
3092 file,
3093 project_root,
3094 &line_cache,
3095 top_exports,
3096 top_export_signatures,
3097 )
3098}
3099
3100fn build_file_summary_chunk_with_lines(
3101 file: &Path,
3102 project_root: &Path,
3103 line_cache: &SourceLineCache<'_>,
3104 top_exports: &[&str],
3105 top_export_signatures: &[Option<&str>],
3106) -> SemanticChunk {
3107 let relative = file.strip_prefix(project_root).unwrap_or(file);
3108 let rel_path = relative.to_string_lossy();
3109 let parent_dir = relative
3110 .parent()
3111 .map(|parent| parent.to_string_lossy().to_string())
3112 .unwrap_or_default();
3113 let name = file
3114 .file_stem()
3115 .map(|stem| stem.to_string_lossy().to_string())
3116 .unwrap_or_default();
3117 let doc = first_leading_doc_comment(line_cache);
3118 let exports = top_exports
3119 .iter()
3120 .take(5)
3121 .copied()
3122 .collect::<Vec<_>>()
3123 .join(",");
3124 let snippet = if doc.is_empty() {
3125 top_export_signatures
3126 .first()
3127 .and_then(|signature| signature.as_deref())
3128 .map(|signature| truncate_chars(signature, 200))
3129 .unwrap_or_default()
3130 } else {
3131 doc.clone()
3132 };
3133
3134 SemanticChunk {
3135 file: file.to_path_buf(),
3136 name,
3137 kind: SymbolKind::FileSummary,
3138 start_line: 0,
3139 end_line: 0,
3140 exported: false,
3141 embed_text: truncate_chars(
3142 &format!(
3143 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3144 file.file_stem()
3145 .map(|stem| stem.to_string_lossy().to_string())
3146 .unwrap_or_default()
3147 ),
3148 MAX_EMBED_TEXT_CHARS,
3149 ),
3150 snippet,
3151 }
3152}
3153
3154fn parser_for(
3155 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3156 lang: crate::parser::LangId,
3157) -> Result<&mut Parser, String> {
3158 use std::collections::hash_map::Entry;
3159
3160 match parsers.entry(lang) {
3161 Entry::Occupied(entry) => Ok(entry.into_mut()),
3162 Entry::Vacant(entry) => {
3163 let grammar = grammar_for(lang);
3164 let mut parser = Parser::new();
3165 parser
3166 .set_language(&grammar)
3167 .map_err(|error| error.to_string())?;
3168 Ok(entry.insert(parser))
3169 }
3170 }
3171}
3172
/// Reports whether `path` has one of the file extensions accepted for
/// semantic indexing.
///
/// The comparison is exact (case-sensitive); paths without an extension, or
/// whose extension is not valid UTF-8, are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3210
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// Resolution order:
/// 1. the canonical form of `path` itself, when that succeeds;
/// 2. otherwise the canonical form of its parent joined with its file name;
/// 3. otherwise `path` unchanged (no parent, no file name, or the parent
///    cannot be canonicalized either).
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| {
        path.parent()
            .zip(path.file_name())
            .and_then(|(parent, file_name)| {
                fs::canonicalize(parent)
                    .ok()
                    .map(|canonical_parent| canonical_parent.join(file_name))
            })
            .unwrap_or_else(|| path.to_path_buf())
    })
}
3227
/// Size limit (4 MiB) for semantic chunking: `collect_semantic_file` records
/// metadata for larger files but returns no chunks for them and does not
/// read their contents.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3238
3239fn collect_semantic_file(
3240 project_root: &Path,
3241 file: &Path,
3242 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3243) -> Result<(IndexedFileMetadata, Vec<SemanticChunk>), String> {
3244 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3245 if !metadata.is_file() {
3246 return Err("not a regular file".to_string());
3247 }
3248 let mtime = metadata.modified().map_err(|error| error.to_string())?;
3249 let size = metadata.len();
3250
3251 if !is_semantic_indexed_extension(file) {
3252 return Err("unsupported file extension".to_string());
3253 }
3254 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3255
3256 let mut indexed_metadata = IndexedFileMetadata {
3257 mtime,
3258 size,
3259 content_hash: cache_freshness::zero_hash(),
3260 };
3261
3262 if size > MAX_SEMANTIC_FILE_BYTES {
3265 return Ok((indexed_metadata, Vec::new()));
3266 }
3267
3268 let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
3269 indexed_metadata.content_hash = if size <= cache_freshness::CONTENT_HASH_SIZE_CAP {
3270 cache_freshness::hash_bytes(source.as_bytes())
3271 } else {
3272 cache_freshness::zero_hash()
3273 };
3274
3275 let chunks = collect_file_chunks_from_source(project_root, file, lang, parsers, &source)?;
3276 Ok((indexed_metadata, chunks))
3277}
3278
/// Test-only helper that reads `file` from disk and returns its chunks.
///
/// Files over `MAX_SEMANTIC_FILE_BYTES` yield an empty list. A metadata
/// failure is not an error by itself; the subsequent read decides.
///
/// # Errors
///
/// Returns an error string for unsupported extensions or languages,
/// unreadable files, and chunk extraction failures.
#[cfg(test)]
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    if !is_semantic_indexed_extension(file) {
        return Err("unsupported file extension".to_string());
    }
    let Some(lang) = detect_language(file) else {
        return Err("unsupported file extension".to_string());
    };

    let oversized = matches!(
        fs::metadata(file),
        Ok(metadata) if metadata.len() > MAX_SEMANTIC_FILE_BYTES
    );
    if oversized {
        return Ok(Vec::new());
    }

    match fs::read_to_string(file) {
        Ok(source) => collect_file_chunks_from_source(project_root, file, lang, parsers, &source),
        Err(error) => Err(error.to_string()),
    }
}
3297
3298fn collect_file_chunks_from_source(
3299 project_root: &Path,
3300 file: &Path,
3301 lang: crate::parser::LangId,
3302 parsers: &mut HashMap<crate::parser::LangId, Parser>,
3303 source: &str,
3304) -> Result<Vec<SemanticChunk>, String> {
3305 let tree = parser_for(parsers, lang)?
3306 .parse(source, None)
3307 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3308 let symbols =
3309 extract_symbols_from_tree(source, &tree, lang).map_err(|error| error.to_string())?;
3310
3311 Ok(symbols_to_chunks(file, &symbols, source, project_root))
3312}
3313
3314fn build_snippet_with_lines(symbol: &Symbol, line_cache: &SourceLineCache<'_>) -> String {
3316 let start = (symbol.range.start_line as usize).min(line_cache.len());
3317 let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3319 if start < end {
3320 let snippet_lines: Vec<&str> = line_cache.lines[start..end]
3321 .iter()
3322 .take(5)
3323 .copied()
3324 .collect();
3325 let mut snippet = snippet_lines.join("\n");
3326 if end - start > 5 {
3327 snippet.push_str("\n ...");
3328 }
3329 if snippet.len() > 300 {
3330 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3331 }
3332 snippet
3333 } else {
3334 String::new()
3335 }
3336}
3337
/// Test-only convenience wrapper: splits `source` into lines and delegates to
/// `build_snippet_with_lines`.
#[cfg(test)]
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    let lines = SourceLineCache::new(source);
    build_snippet_with_lines(symbol, &lines)
}
3343
3344fn symbols_to_chunks(
3346 file: &Path,
3347 symbols: &[Symbol],
3348 source: &str,
3349 project_root: &Path,
3350) -> Vec<SemanticChunk> {
3351 let line_cache = SourceLineCache::new(source);
3352 let mut chunks = Vec::new();
3353 let top_exports_with_signatures = symbols
3354 .iter()
3355 .filter(|symbol| {
3356 symbol.exported
3357 && symbol.parent.is_none()
3358 && !matches!(symbol.kind, SymbolKind::Heading)
3359 })
3360 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3361 .collect::<Vec<_>>();
3362
3363 let has_only_headings = !symbols.is_empty()
3364 && symbols
3365 .iter()
3366 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3367 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3368 let top_exports = top_exports_with_signatures
3369 .iter()
3370 .map(|(name, _)| *name)
3371 .collect::<Vec<_>>();
3372 let top_export_signatures = top_exports_with_signatures
3373 .iter()
3374 .map(|(_, signature)| *signature)
3375 .collect::<Vec<_>>();
3376 chunks.push(build_file_summary_chunk_with_lines(
3377 file,
3378 project_root,
3379 &line_cache,
3380 &top_exports,
3381 &top_export_signatures,
3382 ));
3383 }
3384
3385 for symbol in symbols {
3386 if matches!(symbol.kind, SymbolKind::Heading) {
3391 continue;
3392 }
3393
3394 let line_count = symbol
3396 .range
3397 .end_line
3398 .saturating_sub(symbol.range.start_line)
3399 + 1;
3400 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3401 continue;
3402 }
3403
3404 let embed_text = build_embed_text_with_lines(symbol, &line_cache, file, project_root);
3405 let snippet = build_snippet_with_lines(symbol, &line_cache);
3406
3407 chunks.push(SemanticChunk {
3408 file: file.to_path_buf(),
3409 name: symbol.name.clone(),
3410 kind: symbol.kind.clone(),
3411 start_line: symbol.range.start_line,
3412 end_line: symbol.range.end_line,
3413 exported: symbol.exported,
3414 embed_text,
3415 snippet,
3416 });
3417
3418 }
3421
3422 chunks
3423}
3424
/// Orders `(score, index)` pairs for ranking: higher scores first, ties
/// broken by ascending index.
///
/// NaN scores are ordered after every real score (and tie with each other),
/// so the comparison is a total order. Treating an incomparable pair as
/// `Equal` instead would make NaN "equal" to both 0.1 and 0.9 while
/// 0.1 != 0.9, which violates the ordering contract `slice::sort_by`
/// requires and can produce an unspecified order or a panic.
///
/// For inputs without NaN the result is the plain descending score order
/// with the index tie-break.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        // A NaN score ranks below any real score.
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields an ordering.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3430
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in a single pass.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}
3454
/// Maps a `SymbolKind` to the one-byte tag stored for each entry by
/// `write_to_writer`.
///
/// The numbers are part of the persisted index encoding and mirror the
/// table in `u8_to_symbol_kind`; the two must be changed together.
fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
    match kind {
        SymbolKind::Function => 0,
        SymbolKind::Class => 1,
        SymbolKind::Method => 2,
        SymbolKind::Struct => 3,
        SymbolKind::Interface => 4,
        SymbolKind::Enum => 5,
        SymbolKind::TypeAlias => 6,
        SymbolKind::Variable => 7,
        SymbolKind::Heading => 8,
        SymbolKind::FileSummary => 9,
    }
}
3470
/// Maps a stored one-byte tag back to its `SymbolKind`; the inverse of
/// `symbol_kind_to_u8` for tags 0 through 9.
///
/// Any tag outside that range decodes as `SymbolKind::Heading` rather than
/// producing an error.
fn u8_to_symbol_kind(v: u8) -> SymbolKind {
    match v {
        0 => SymbolKind::Function,
        1 => SymbolKind::Class,
        2 => SymbolKind::Method,
        3 => SymbolKind::Struct,
        4 => SymbolKind::Interface,
        5 => SymbolKind::Enum,
        6 => SymbolKind::TypeAlias,
        7 => SymbolKind::Variable,
        8 => SymbolKind::Heading,
        9 => SymbolKind::FileSummary,
        // Unknown tags fall back to the same kind as tag 8.
        _ => SymbolKind::Heading,
    }
}
3486
3487#[cfg(test)]
3488mod tests {
3489 use super::*;
3490 use crate::config::{SemanticBackend, SemanticBackendConfig};
3491 use crate::parser::FileParser;
3492 use std::io::{Read, Write};
3493 use std::net::TcpListener;
3494 use std::thread;
3495
/// Checks that `.inc`, `.php` and `.scss` files are accepted by
/// `is_semantic_indexed_extension`.
#[test]
fn semantic_index_includes_php_inc_and_scss_extensions() {
    for file in ["partial.inc", "index.php", "styles.scss"] {
        assert!(
            is_semantic_indexed_extension(Path::new(file)),
            "{file} should be semantic-index eligible"
        );
    }
}
3505
/// Checks the transient-failure marker end to end: a message prefixed with
/// `TRANSIENT_EMBEDDING_MARKER` is classified as transient and stripping
/// removes the marker, while unmarked messages are classified as
/// non-transient and pass through stripping unchanged.
#[test]
fn transient_marker_round_trips_and_classifies() {
    let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
    assert!(embedding_failure_is_transient(&marked));
    let clean = strip_transient_embedding_marker(&marked);
    assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
    assert!(clean.starts_with("openai compatible request failed:"));

    // Messages without the marker: an auth failure, a dimension mismatch
    // and a file-count limit.
    for permanent in [
        "openai compatible request failed (HTTP 401): Unauthorized",
        "embedding dimension mismatch: index has 384, model returned 768",
        "too many files (>20000) for semantic indexing (max 20000)",
    ] {
        assert!(
            !embedding_failure_is_transient(permanent),
            "{permanent:?} must not be transient"
        );
        // Stripping is a no-op when no marker is present.
        assert_eq!(strip_transient_embedding_marker(permanent), permanent);
    }
}
3531
/// Checks which HTTP statuses `is_retryable_embedding_status` treats as
/// retryable: 500 and 429 are, 401 and 400 are not.
#[test]
fn send_error_transience_separates_connect_timeout_from_4xx() {
    assert!(is_retryable_embedding_status(
        reqwest::StatusCode::INTERNAL_SERVER_ERROR
    ));
    assert!(is_retryable_embedding_status(
        reqwest::StatusCode::TOO_MANY_REQUESTS
    ));
    assert!(!is_retryable_embedding_status(
        reqwest::StatusCode::UNAUTHORIZED
    ));
    assert!(!is_retryable_embedding_status(
        reqwest::StatusCode::BAD_REQUEST
    ));
}
3548
/// Checks `embedding_response_body_is_transient` against response bodies.
///
/// With a 400 status, bodies reporting that a model is loading, unloaded or
/// not loaded are transient, while other bodies are not, including ones that
/// merely mention loading alongside an unrelated failure. With a 401 status
/// even a "model is loading" body is not transient.
#[test]
fn local_backend_model_loading_body_is_transient() {
    // Bodies that must be classified as transient under HTTP 400.
    for body in [
        r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
        r#"{"error":"model is loading, please wait"}"#,
        r#"{"error":"Model not loaded"}"#,
        "Loading model into memory",
    ] {
        assert!(
            embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
            "{body:?} should be body-transient"
        );
    }

    // Bodies that must stay non-transient under HTTP 400; the last two
    // contain loading-related words without describing a loading model.
    for body in [
        r#"{"error":"invalid api key"}"#,
        r#"{"error":"model 'foo' not found"}"#,
        "Bad Request: unknown field",
        "Bad Request: invalid loading model option",
        r#"{"error":"unauthorized while model is being loaded by another account"}"#,
    ] {
        assert!(
            !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
            "{body:?} must not be body-transient"
        );
    }

    // The status takes precedence over the body text.
    assert!(
        !embedding_response_body_is_transient(
            reqwest::StatusCode::UNAUTHORIZED,
            r#"{"error":"model is loading, please wait"}"#
        ),
        "permanent auth failures must not become transient because of body text"
    );
}
3589
3590 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3591 where
3592 F: Fn(String, String, String) -> String + Send + 'static,
3593 {
3594 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3595 let addr = listener.local_addr().expect("local addr");
3596 let handle = thread::spawn(move || {
3597 let (mut stream, _) = listener.accept().expect("accept request");
3598 let mut buf = Vec::new();
3599 let mut chunk = [0u8; 4096];
3600 let mut header_end = None;
3601 let mut content_length = 0usize;
3602 loop {
3603 let n = stream.read(&mut chunk).expect("read request");
3604 if n == 0 {
3605 break;
3606 }
3607 buf.extend_from_slice(&chunk[..n]);
3608 if header_end.is_none() {
3609 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3610 header_end = Some(pos + 4);
3611 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3612 for line in headers.lines() {
3613 if let Some(value) = line.strip_prefix("Content-Length:") {
3614 content_length = value.trim().parse::<usize>().unwrap_or(0);
3615 }
3616 }
3617 }
3618 }
3619 if let Some(end) = header_end {
3620 if buf.len() >= end + content_length {
3621 break;
3622 }
3623 }
3624 }
3625
3626 let end = header_end.expect("header terminator");
3627 let request = String::from_utf8_lossy(&buf[..end]).to_string();
3628 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3629 let mut lines = request.lines();
3630 let request_line = lines.next().expect("request line").to_string();
3631 let path = request_line
3632 .split_whitespace()
3633 .nth(1)
3634 .expect("request path")
3635 .to_string();
3636 let response_body = handler(request_line, path, body);
3637 let response = format!(
3638 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3639 response_body.len(),
3640 response_body
3641 );
3642 stream
3643 .write_all(response.as_bytes())
3644 .expect("write response");
3645 });
3646
3647 (format!("http://{}", addr), handle)
3648 }
3649
    /// Starts a loopback test server whose responses always stop short of the
    /// advertised body length.
    ///
    /// Every accepted connection receives a `200 OK` header block declaring
    /// `Content-Length: 128` followed by a single `{` byte; the stream then goes
    /// out of scope at the end of the match arm. The worker thread stops once
    /// `attempts` connections have been accepted or two seconds have elapsed,
    /// whichever comes first.
    ///
    /// Returns the server's base URL (`http://<addr>`) and the worker's join
    /// handle.
    ///
    /// # Panics
    ///
    /// Panics if the listener cannot be bound or switched to non-blocking mode.
    /// The worker thread panics on any accept error other than `WouldBlock`.
    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
        // Non-blocking accept lets the loop below poll the deadline instead of
        // blocking on a connection that may never arrive.
        listener
            .set_nonblocking(true)
            .expect("nonblocking listener");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            let deadline = std::time::Instant::now() + Duration::from_secs(2);
            let mut accepted = 0usize;
            while accepted < attempts && std::time::Instant::now() < deadline {
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        accepted += 1;
                        // One best-effort read of the request; its outcome is
                        // deliberately ignored.
                        let mut buf = [0u8; 4096];
                        let _ = stream.read(&mut buf);
                        // Declares 128 body bytes but supplies only one.
                        let response = "HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 128
Connection: close

{";
                        let _ = stream.write_all(response.as_bytes());
                    }
                    // No pending connection yet: back off briefly and poll again.
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("accept request: {error}"),
                }
            }
        });

        (format!("http://{}", addr), handle)
    }
3690
3691 #[test]
3692 fn response_body_read_failures_are_marked_transient() {
3693 let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3694 let client = Client::builder()
3695 .timeout(Duration::from_millis(250))
3696 .build()
3697 .expect("client");
3698
3699 let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3700 .expect_err("truncated body should fail");
3701
3702 handle.join().unwrap();
3703 assert!(
3704 embedding_failure_is_transient(&error),
3705 "body read failures should be transient-marked: {error}"
3706 );
3707 assert!(error.contains("response read failed"));
3708 }
3709
3710 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3711 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3712 }
3713
3714 fn write_rust_file(path: &Path, function_name: &str) {
3715 fs::write(
3716 path,
3717 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3718 )
3719 .unwrap();
3720 }
3721
3722 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3723 let mut embed = test_vector_for_texts;
3724 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3725 }
3726
3727 fn test_project_root() -> PathBuf {
3728 std::env::current_dir().unwrap()
3729 }
3730
3731 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3732 index.file_mtimes.insert(file.to_path_buf(), mtime);
3733 index.file_sizes.insert(file.to_path_buf(), size);
3734 index
3735 .file_hashes
3736 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3737 }
3738
    /// Reference encoder that serializes `index` into an in-memory buffer, used
    /// by tests to compare against `SemanticIndex::to_bytes`.
    ///
    /// Layout written, in order (all integers little-endian):
    /// 1. version byte (`SEMANTIC_INDEX_VERSION_V6`), dimension as `u32`, entry
    ///    count as `u32`;
    /// 2. fingerprint length as `u32` followed by the fingerprint bytes (length
    ///    0 when there is no fingerprint or its string form is empty);
    /// 3. file-metadata count as `u32`, then per file: path length + path bytes,
    ///    mtime seconds, mtime sub-second nanoseconds, size, and hash bytes;
    /// 4. per entry: path, name, kind byte, start line, end line, exported
    ///    flag, snippet, embed text (strings are length-prefixed with a `u32`),
    ///    then each vector component as an `f32`.
    ///
    /// Files and entries for which `cache_relative_path` yields nothing are
    /// omitted from the output.
    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
        let mut buf = Vec::new();
        // An empty fingerprint string is treated the same as no fingerprint.
        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });
        // Keep both the relative path (written out) and the original key (used
        // to look up size and hash below).
        let file_mtimes: Vec<_> = index
            .file_mtimes
            .iter()
            .filter_map(|(path, mtime)| {
                cache_relative_path(&index.project_root, path)
                    .map(|relative| (relative, path, mtime))
            })
            .collect();
        let entries: Vec<_> = index
            .entries
            .iter()
            .filter_map(|entry| {
                cache_relative_path(&index.project_root, &entry.chunk.file)
                    .map(|relative| (relative, entry))
            })
            .collect();

        // Header: version, dimension, entry count, fingerprint.
        buf.push(SEMANTIC_INDEX_VERSION_V6);
        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
        // The count reflects the filtered entries, not `index.entries.len()`.
        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // File metadata table.
        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
        for (relative, path, mtime) in &file_mtimes {
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            // Times before the Unix epoch are written as a zero duration.
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            // Missing size defaults to 0; missing hash defaults to the zero hash.
            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = index
                .file_hashes
                .get(*path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // Entry records.
        for (relative, entry) in &entries {
            let c = &entry.chunk;
            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));
            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            // The vector carries no length prefix of its own.
            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }
3823
3824 #[derive(Default)]
3825 struct RecordingEmbedder {
3826 calls: Vec<Vec<String>>,
3827 }
3828
3829 impl RecordingEmbedder {
3830 fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3831 let vectors = texts
3832 .iter()
3833 .map(|text| deterministic_test_vector(text))
3834 .collect();
3835 self.calls.push(texts);
3836 Ok(vectors)
3837 }
3838
3839 fn total_embedded_texts(&self) -> usize {
3840 self.calls.iter().map(Vec::len).sum()
3841 }
3842
3843 fn embedded_texts(&self) -> Vec<&str> {
3844 self.calls
3845 .iter()
3846 .flat_map(|batch| batch.iter().map(String::as_str))
3847 .collect()
3848 }
3849 }
3850
3851 fn deterministic_test_vector(text: &str) -> Vec<f32> {
3852 let hash = blake3::hash(text.as_bytes());
3853 let bytes = hash.as_bytes();
3854 vec![
3855 1.0,
3856 bytes[0] as f32 / 255.0,
3857 bytes[1] as f32 / 255.0,
3858 bytes[2] as f32 / 255.0,
3859 ]
3860 }
3861
3862 fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3863 let mut embedder = RecordingEmbedder::default();
3864 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3865 SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3866 }
3867
3868 fn force_stale(index: &mut SemanticIndex, file: &Path) {
3869 set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3870 }
3871
3872 fn write_source(path: &Path, source: &str) {
3873 if let Some(parent) = path.parent() {
3874 fs::create_dir_all(parent).unwrap();
3875 }
3876 fs::write(path, source).unwrap();
3877 }
3878
3879 fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3880 index
3881 .entries
3882 .iter()
3883 .filter(|entry| entry.chunk.file == file)
3884 .collect()
3885 }
3886
3887 fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3888 index
3889 .entries
3890 .iter()
3891 .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3892 .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3893 }
3894
3895 fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3896 index
3897 .entries
3898 .iter()
3899 .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3900 .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3901 }
3902
3903 #[test]
3904 fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3905 let temp = tempfile::tempdir().unwrap();
3906 let project_root = temp.path();
3907 let file = project_root.join("src/lib.rs");
3908 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3909 write_source(&file, original);
3910
3911 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3912 let original_entry_count = index.entries.len();
3913 let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3914
3915 write_source(&file, &format!("\n{original}"));
3916 force_stale(&mut index, &file);
3917
3918 let mut embedder = RecordingEmbedder::default();
3919 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3920 let mut progress = |_done: usize, _total: usize| {};
3921 let summary = index
3922 .refresh_stale_files(
3923 project_root,
3924 std::slice::from_ref(&file),
3925 &mut embed,
3926 16,
3927 &mut progress,
3928 )
3929 .unwrap();
3930
3931 assert_eq!(summary.changed, 1);
3932 assert_eq!(embedder.total_embedded_texts(), 0);
3933 assert_eq!(index.entries.len(), original_entry_count);
3934 let shifted_alpha = entry_by_name(&index, &file, "alpha");
3935 assert_eq!(shifted_alpha.chunk.start_line, 1);
3936 assert_eq!(shifted_alpha.vector, original_alpha_vector);
3937 }
3938
3939 #[test]
3940 fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3941 let temp = tempfile::tempdir().unwrap();
3942 let project_root = temp.path();
3943 let file = project_root.join("src/lib.rs");
3944 let original = "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n";
3945 write_source(&file, original);
3946
3947 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3948 let mut serving_index = worker_index.clone();
3949 let original_entry_count = worker_index.entries.len();
3950
3951 write_source(&file, &format!("\n{original}"));
3952
3953 let mut embedder = RecordingEmbedder::default();
3954 let mut embed = |texts: Vec<String>| embedder.embed(texts);
3955 let mut progress = |_done: usize, _total: usize| {};
3956 let update = worker_index
3957 .refresh_invalidated_files(
3958 project_root,
3959 std::slice::from_ref(&file),
3960 &mut embed,
3961 16,
3962 100,
3963 &mut progress,
3964 )
3965 .unwrap();
3966
3967 assert_eq!(embedder.total_embedded_texts(), 0);
3968 assert_eq!(update.added_entries.len(), original_entry_count);
3969 assert_eq!(worker_index.entries.len(), original_entry_count);
3970
3971 serving_index.apply_refresh_update(
3972 update.added_entries,
3973 update.updated_metadata,
3974 &update.completed_paths,
3975 );
3976
3977 assert_eq!(serving_index.entries.len(), original_entry_count);
3978 assert_eq!(
3979 entries_for_file(&serving_index, &file).len(),
3980 original_entry_count
3981 );
3982 assert_eq!(
3983 entry_by_name(&serving_index, &file, "alpha")
3984 .chunk
3985 .start_line,
3986 1
3987 );
3988 }
3989
3990 #[test]
3991 fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
3992 let temp = tempfile::tempdir().unwrap();
3993 let project_root = temp.path();
3994 let file = project_root.join("src/lib.rs");
3995 write_source(
3996 &file,
3997 "pub fn alpha() -> i32 {\n 1\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
3998 );
3999
4000 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4001 let original_entry_count = index.entries.len();
4002 let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
4003
4004 write_source(
4005 &file,
4006 "pub fn alpha() -> i32 {\n 10\n}\n\npub fn beta() -> i32 {\n 2\n}\n",
4007 );
4008
4009 let mut embedder = RecordingEmbedder::default();
4010 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4011 let mut progress = |_done: usize, _total: usize| {};
4012 let update = index
4013 .refresh_invalidated_files(
4014 project_root,
4015 std::slice::from_ref(&file),
4016 &mut embed,
4017 16,
4018 100,
4019 &mut progress,
4020 )
4021 .unwrap();
4022
4023 assert_eq!(embedder.total_embedded_texts(), 1);
4024 assert!(embedder.embedded_texts()[0].contains("name:alpha"));
4025 assert_eq!(update.added_entries.len(), original_entry_count);
4026 assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
4027 }
4028
4029 #[test]
4030 fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
4031 let temp = tempfile::tempdir().unwrap();
4032 let project_root = temp.path();
4033 let file = project_root.join("src/dupe.js");
4034 let one_duplicate = "function duplicate() {\n return 1;\n}\n";
4035 write_source(&file, one_duplicate);
4036
4037 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4038 let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
4039
4040 write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
4041
4042 let mut embedder = RecordingEmbedder::default();
4043 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4044 let mut progress = |_done: usize, _total: usize| {};
4045 index
4046 .refresh_invalidated_files(
4047 project_root,
4048 std::slice::from_ref(&file),
4049 &mut embed,
4050 16,
4051 100,
4052 &mut progress,
4053 )
4054 .unwrap();
4055
4056 let duplicate_entries = index
4057 .entries
4058 .iter()
4059 .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
4060 .collect::<Vec<_>>();
4061 assert_eq!(duplicate_entries.len(), 2);
4062 assert_eq!(embedder.total_embedded_texts(), 0);
4063 assert_eq!(duplicate_entries[0].vector, original_vector);
4064 assert_eq!(duplicate_entries[1].vector, original_vector);
4065 }
4066
    /// Exercises file-summary vector reuse across two consecutive edits of the
    /// same file:
    ///
    /// 1. changing only `alpha`'s body embeds exactly one text (for `alpha`)
    ///    and leaves the file-summary vector untouched;
    /// 2. changing only the leading `//!` doc line embeds exactly one text (the
    ///    file summary) and produces a different file-summary vector.
    #[test]
    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        write_source(
            &file,
            "//! module docs v1\n\npub fn alpha() -> i32 {\n 1\n}\n",
        );

        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let summary_before = file_summary_entry(&index, &file).vector.clone();

        // Edit 1: same module docs, different function body.
        write_source(
            &file,
            "//! module docs v1\n\npub fn alpha() -> i32 {\n 2\n}\n",
        );
        let mut body_embedder = RecordingEmbedder::default();
        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
        // The same no-op progress callback is reused by both refreshes below.
        let mut progress = |_done: usize, _total: usize| {};
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut body_embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();
        assert_eq!(body_embedder.total_embedded_texts(), 1);
        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);

        // Edit 2: same function body, different module docs. A fresh embedder
        // is used so its call count covers only this refresh.
        write_source(
            &file,
            "//! module docs v2\n\npub fn alpha() -> i32 {\n 2\n}\n",
        );
        let mut doc_embedder = RecordingEmbedder::default();
        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut doc_embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();

        assert_eq!(doc_embedder.total_embedded_texts(), 1);
        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
    }
4122
4123 #[test]
4124 fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4125 let temp = tempfile::tempdir().unwrap();
4126 let project_root = temp.path();
4127 let file = project_root.join("src/lib.rs");
4128 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4129
4130 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4131 let mut serving_index = worker_index.clone();
4132 fs::remove_file(&file).unwrap();
4133
4134 let mut embedder = RecordingEmbedder::default();
4135 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4136 let mut progress = |_done: usize, _total: usize| {};
4137 let update = worker_index
4138 .refresh_invalidated_files(
4139 project_root,
4140 std::slice::from_ref(&file),
4141 &mut embed,
4142 16,
4143 100,
4144 &mut progress,
4145 )
4146 .unwrap();
4147
4148 assert_eq!(update.summary.deleted, 1);
4149 assert_eq!(embedder.total_embedded_texts(), 0);
4150 assert!(worker_index.entries.is_empty());
4151
4152 serving_index.apply_refresh_update(
4153 update.added_entries,
4154 update.updated_metadata,
4155 &update.completed_paths,
4156 );
4157 assert!(serving_index.entries.is_empty());
4158 }
4159
4160 #[test]
4161 fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4162 let temp = tempfile::tempdir().unwrap();
4163 let project_root = temp.path();
4164 let file = project_root.join("src/lib.rs");
4165 write_source(&file, "pub fn alpha() -> i32 {\n 1\n}\n");
4166
4167 let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4168 let mut serving_index = worker_index.clone();
4169 fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4170
4171 let mut embedder = RecordingEmbedder::default();
4172 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4173 let mut progress = |_done: usize, _total: usize| {};
4174 let update = worker_index
4175 .refresh_invalidated_files(
4176 project_root,
4177 std::slice::from_ref(&file),
4178 &mut embed,
4179 16,
4180 100,
4181 &mut progress,
4182 )
4183 .unwrap();
4184
4185 assert_eq!(embedder.total_embedded_texts(), 0);
4186 assert!(update.added_entries.is_empty());
4187 assert!(worker_index.entries.is_empty());
4188 assert!(!worker_index.file_mtimes.contains_key(&file));
4189
4190 serving_index.apply_refresh_update(
4191 update.added_entries,
4192 update.updated_metadata,
4193 &update.completed_paths,
4194 );
4195 assert!(serving_index.entries.is_empty());
4196 assert!(!serving_index.file_mtimes.contains_key(&file));
4197 }
4198
4199 #[test]
4200 fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4201 let temp = tempfile::tempdir().unwrap();
4202 let project_root = temp.path();
4203 let indexed = project_root.join("src/a.rs");
4204 let deferred = project_root.join("src/b.rs");
4205 write_source(&indexed, "pub fn alpha() -> i32 {\n 1\n}\n");
4206 write_source(&deferred, "pub fn beta() -> i32 {\n 2\n}\n");
4207
4208 let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4209 let mut embedder = RecordingEmbedder::default();
4210 let mut embed = |texts: Vec<String>| embedder.embed(texts);
4211 let mut progress = |_done: usize, _total: usize| {};
4212 let update = index
4213 .refresh_invalidated_files(
4214 project_root,
4215 std::slice::from_ref(&deferred),
4216 &mut embed,
4217 16,
4218 1,
4219 &mut progress,
4220 )
4221 .unwrap();
4222
4223 assert_eq!(update.summary.total_processed, 1);
4224 assert_eq!(update.summary.added, 0);
4225 assert_eq!(embedder.total_embedded_texts(), 0);
4226 assert_eq!(index.indexed_file_count(), 1);
4227 assert!(index.deferred_files.contains(&deferred));
4228 assert!(entries_for_file(&index, &deferred).is_empty());
4229 }
4230
4231 #[test]
4232 fn semantic_cache_serialization_skips_paths_outside_project_root() {
4233 let dir = tempfile::tempdir().expect("create temp dir");
4234 let project = fs::canonicalize(dir.path()).expect("canonical project");
4235 let outside = project.join("..").join("outside.rs");
4236 let mut index = SemanticIndex::new(project.clone(), 3);
4237 index
4238 .file_mtimes
4239 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4240 index.file_sizes.insert(outside.clone(), 1);
4241 index
4242 .file_hashes
4243 .insert(outside.clone(), cache_freshness::zero_hash());
4244 index.entries.push(EmbeddingEntry {
4245 chunk: SemanticChunk {
4246 file: outside,
4247 name: "outside".to_string(),
4248 kind: SymbolKind::Function,
4249 start_line: 0,
4250 end_line: 0,
4251 exported: false,
4252 embed_text: "outside".to_string(),
4253 snippet: "outside".to_string(),
4254 },
4255 vector: vec![1.0, 0.0, 0.0],
4256 });
4257
4258 let bytes = index.to_bytes();
4259 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4260 assert_eq!(loaded.entries.len(), 0);
4261 assert!(loaded.file_mtimes.is_empty());
4262 }
4263
4264 #[test]
4265 fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4266 let project_root = test_project_root();
4267 let file = project_root.join("src/lib.rs");
4268 let mut index = SemanticIndex::new(project_root, 2);
4269 let entries = [
4270 ("alpha", vec![1.0, 0.0], false),
4271 ("beta", vec![0.0, 1.0], false),
4272 ("gamma", vec![1.0, 0.0], false),
4273 ("delta", vec![0.5, 0.5], true),
4274 ("epsilon", vec![-1.0, 0.0], false),
4275 ];
4276 for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4277 index.entries.push(EmbeddingEntry {
4278 chunk: SemanticChunk {
4279 file: file.clone(),
4280 name: name.to_string(),
4281 kind: SymbolKind::Function,
4282 start_line: line as u32 + 1,
4283 end_line: line as u32 + 1,
4284 exported,
4285 embed_text: name.to_string(),
4286 snippet: format!("fn {name}() {{}}"),
4287 },
4288 vector,
4289 });
4290 }
4291
4292 let query = vec![1.0, 0.0];
4293 let top_k = 4;
4294 let mut reference: Vec<(f32, usize)> = index
4295 .entries
4296 .iter()
4297 .enumerate()
4298 .map(|(idx, entry)| {
4299 let mut score = cosine_similarity(&query, &entry.vector);
4300 if entry.chunk.exported {
4301 score *= 1.1;
4302 }
4303 (score, idx)
4304 })
4305 .collect();
4306 reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4307 let expected: Vec<(String, f32)> = reference
4308 .into_iter()
4309 .take(top_k)
4310 .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4311 .collect();
4312
4313 let actual: Vec<(String, f32)> = index
4314 .search(&query, top_k)
4315 .into_iter()
4316 .map(|result| (result.name, result.score))
4317 .collect();
4318
4319 assert_eq!(
4320 actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4321 expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4322 );
4323 for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4324 assert!((actual_score - expected_score).abs() < 1e-6);
4325 }
4326 assert_eq!(actual[0].0, "alpha");
4327 assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4328 assert!(index.search(&query, 0).is_empty());
4329 }
4330
4331 #[test]
4332 fn test_cosine_similarity_identical() {
4333 let a = vec![1.0, 0.0, 0.0];
4334 let b = vec![1.0, 0.0, 0.0];
4335 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4336 }
4337
4338 #[test]
4339 fn test_cosine_similarity_orthogonal() {
4340 let a = vec![1.0, 0.0, 0.0];
4341 let b = vec![0.0, 1.0, 0.0];
4342 assert!(cosine_similarity(&a, &b).abs() < 0.001);
4343 }
4344
4345 #[test]
4346 fn test_cosine_similarity_opposite() {
4347 let a = vec![1.0, 0.0, 0.0];
4348 let b = vec![-1.0, 0.0, 0.0];
4349 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4350 }
4351
4352 #[test]
4353 fn test_serialization_roundtrip() {
4354 let project_root = test_project_root();
4355 let file = project_root.join("src/main.rs");
4356 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4357 index.entries.push(EmbeddingEntry {
4358 chunk: SemanticChunk {
4359 file: file.clone(),
4360 name: "handle_request".to_string(),
4361 kind: SymbolKind::Function,
4362 start_line: 10,
4363 end_line: 25,
4364 exported: true,
4365 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4366 snippet: "fn handle_request() {\n // ...\n}".to_string(),
4367 },
4368 vector: vec![0.1, 0.2, 0.3, 0.4],
4369 });
4370 index.dimension = 4;
4371 index
4372 .file_mtimes
4373 .insert(file.clone(), SystemTime::UNIX_EPOCH);
4374 index.file_sizes.insert(file, 0);
4375 index.set_fingerprint(SemanticIndexFingerprint {
4376 backend: "fastembed".to_string(),
4377 model: "all-MiniLM-L6-v2".to_string(),
4378 base_url: FALLBACK_BACKEND.to_string(),
4379 dimension: 4,
4380 chunking_version: default_chunking_version(),
4381 });
4382
4383 let bytes = index.to_bytes();
4384 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4385
4386 assert_eq!(restored.entries.len(), 1);
4387 assert_eq!(restored.entries[0].chunk.name, "handle_request");
4388 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4389 assert_eq!(restored.dimension, 4);
4390 assert_eq!(restored.backend_label(), Some("fastembed"));
4391 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4392 }
4393
    /// Checks that the index's serialized form matches the reference encoder
    /// byte for byte, both in memory (`to_bytes`) and on disk
    /// (`write_to_disk` -> `semantic/proj/semantic.bin`), and that reading the
    /// file back yields an index with identical metadata, entries, and bytes.
    #[test]
    fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
        let storage = tempfile::tempdir().expect("create storage dir");
        let project = storage.path().join("project");
        fs::create_dir_all(project.join("src")).expect("create project src");
        let file = project.join("src/lib.rs");
        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
        // Canonical paths are used for both the root and the file so that the
        // keys inserted below and the keys produced on load are comparable.
        let project_root = fs::canonicalize(&project).expect("canonical project");
        let file = fs::canonicalize(&file).expect("canonical file");

        let mut index = SemanticIndex::new(project_root.clone(), 3);
        // A non-zero nanosecond part exercises the sub-second field.
        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
        index.file_mtimes.insert(file.clone(), mtime);
        index.file_sizes.insert(file.clone(), 42);
        index
            .file_hashes
            .insert(file.clone(), cache_freshness::zero_hash());
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: file.clone(),
                name: "alpha".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 0,
                exported: true,
                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
                snippet: "pub fn alpha() {}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3],
        });
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: file.clone(),
                name: "beta".to_string(),
                kind: SymbolKind::Function,
                start_line: 1,
                end_line: 1,
                exported: true,
                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
                snippet: "pub fn beta() {}".to_string(),
            },
            vector: vec![0.4, 0.5, 0.6],
        });
        let fingerprint = SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "all-MiniLM-L6-v2".to_string(),
            base_url: FALLBACK_BACKEND.to_string(),
            dimension: 3,
            chunking_version: default_chunking_version(),
        };
        index.set_fingerprint(fingerprint.clone());

        // In-memory serialization must equal the reference encoding.
        let legacy_bytes = legacy_semantic_index_bytes(&index);
        assert_eq!(index.to_bytes(), legacy_bytes);

        // The file written to disk must contain exactly the same bytes.
        index.write_to_disk(storage.path(), "proj");
        let data_path = storage.path().join("semantic/proj/semantic.bin");
        assert_eq!(
            fs::read(&data_path).expect("read semantic.bin"),
            legacy_bytes
        );

        // Load it back, passing the fingerprint string that was stored.
        let loaded = SemanticIndex::read_from_disk(
            storage.path(),
            "proj",
            &project_root,
            false,
            Some(&fingerprint.as_string()),
        )
        .expect("load semantic index");
        assert_eq!(loaded.entries.len(), index.entries.len());
        assert_eq!(loaded.dimension, index.dimension);
        assert_eq!(
            loaded.fingerprint().unwrap().as_string(),
            fingerprint.as_string()
        );
        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
        assert_eq!(
            loaded.file_hashes.get(&file),
            Some(&cache_freshness::zero_hash())
        );
        // Entries are compared field by field, in order.
        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
            assert_eq!(actual.chunk.file, expected.chunk.file);
            assert_eq!(actual.chunk.name, expected.chunk.name);
            assert_eq!(actual.chunk.kind, expected.chunk.kind);
            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
            assert_eq!(actual.chunk.exported, expected.chunk.exported);
            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
            assert_eq!(actual.vector, expected.vector);
        }
        // Re-serializing the loaded index must reproduce the same bytes.
        assert_eq!(loaded.to_bytes(), legacy_bytes);
    }
4489
4490 #[test]
4491 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4492 let cases = [
4493 (SymbolKind::Function, 0),
4494 (SymbolKind::Class, 1),
4495 (SymbolKind::Method, 2),
4496 (SymbolKind::Struct, 3),
4497 (SymbolKind::Interface, 4),
4498 (SymbolKind::Enum, 5),
4499 (SymbolKind::TypeAlias, 6),
4500 (SymbolKind::Variable, 7),
4501 (SymbolKind::Heading, 8),
4502 (SymbolKind::FileSummary, 9),
4503 ];
4504
4505 for (kind, encoded) in cases {
4506 assert_eq!(symbol_kind_to_u8(&kind), encoded);
4507 assert_eq!(u8_to_symbol_kind(encoded), kind);
4508 }
4509 }
4510
/// Fills a 3-dimensional index with three one-hot entries and checks that a
/// query dominated by the first axis returns exactly two hits, "auth" first,
/// with strictly decreasing scores.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    let symbol_names = ["auth", "database", "handler"];
    index
        .entries
        .extend(symbol_names.iter().enumerate().map(|(axis, &label)| {
            // Entry number `axis` gets a unit vector along its own dimension.
            let mut one_hot = vec![0.0f32; 3];
            one_hot[axis] = 1.0;

            let chunk = SemanticChunk {
                file: PathBuf::from("/src/lib.rs"),
                name: label.to_string(),
                kind: SymbolKind::Function,
                start_line: (axis * 10 + 1) as u32,
                end_line: (axis * 10 + 5) as u32,
                exported: true,
                embed_text: format!("kind:function name:{}", label),
                snippet: format!("fn {}() {{}}", label),
            };
            EmbeddingEntry {
                chunk,
                vector: one_hot,
            }
        }));

    // Mostly aligned with the first axis, slightly with the second.
    let probe = vec![0.9, 0.1, 0.0];
    let hits = index.search(&probe, 2);

    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].name, "auth");
    assert!(hits[0].score > hits[1].score);
}
4543
/// Searching an index that holds no entries must return no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    assert!(empty_index.search(&[0.1, 0.2, 0.3], 10).is_empty());
}
4550
/// A symbol whose range starts and ends on line 0 must produce that whole
/// source line (without the trailing newline) as its snippet.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";

    let one_liner = Symbol {
        name: String::from("answer"),
        kind: SymbolKind::Variable,
        range: crate::symbols::Range {
            start_line: 0,
            start_col: 0,
            end_line: 0,
            end_col: 24,
        },
        signature: Some(String::from("const answer = 42")),
        scope_chain: vec![],
        exported: true,
        parent: None,
    };

    assert_eq!(
        build_snippet(&one_liner, source),
        "export const answer = 42;"
    );
}
4573
/// Chunks this crate's own `src/semantic_index.rs` twice — once through
/// `FileParser` + `symbols_to_chunks`, once through `collect_file_chunks` —
/// and requires both routes to produce the same chunk fingerprint.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let file = project_root.join("src/semantic_index.rs");

    // Reference route: read the file, extract symbols, convert to chunks.
    let source = std::fs::read_to_string(&file).unwrap();
    let mut reference_parser = FileParser::new();
    let reference_symbols = reference_parser.extract_symbols(&file).unwrap();
    let expected = symbols_to_chunks(&file, &reference_symbols, &source, &project_root);

    // Route under test: the per-file collector with a parser map.
    let mut parser_cache = HashMap::new();
    let actual = collect_file_chunks(&project_root, &file, &mut parser_cache).unwrap();

    assert_eq!(chunk_fingerprint(&actual), chunk_fingerprint(&expected));
}
4592
4593 fn chunk_fingerprint(
4594 chunks: &[SemanticChunk],
4595 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4596 chunks
4597 .iter()
4598 .map(|chunk| {
4599 (
4600 chunk.name.clone(),
4601 chunk.kind.clone(),
4602 chunk.start_line,
4603 chunk.end_line,
4604 chunk.exported,
4605 chunk.embed_text.clone(),
4606 chunk.snippet.clone(),
4607 )
4608 })
4609 .collect()
4610 }
4611
/// A file larger than `MAX_SEMANTIC_FILE_BYTES` must yield no chunks, while a
/// small file in the same directory (using the same parser map) still does.
#[test]
fn collect_file_chunks_skips_oversized_file() {
    let dir = tempfile::tempdir().unwrap();
    let mut parsers = HashMap::new();

    // Repeat one line often enough to exceed the size cap by a few lines.
    let line = "export const x = 1;\n";
    let repetitions = ((MAX_SEMANTIC_FILE_BYTES as usize) / line.len()) + 16;
    let filler = line.repeat(repetitions);

    let big = dir.path().join("huge.ts");
    std::fs::write(&big, &filler).unwrap();
    let written_len = big.metadata().unwrap().len();
    assert!(written_len > MAX_SEMANTIC_FILE_BYTES);

    let big_chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
    assert!(big_chunks.is_empty(), "oversized file must yield no chunks");

    let small = dir.path().join("small.ts");
    std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
    let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
    assert!(!small_chunks.is_empty(), "small file should still chunk");
}
4634
/// A serialized header whose dimension field is `MAX_DIMENSION + 1` must be
/// rejected by `SemanticIndex::from_bytes`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    // Layout written here: one version byte (1), then three little-endian u32s.
    let dimension = ((MAX_DIMENSION as u32) + 1).to_le_bytes();
    let zero = 0u32.to_le_bytes();
    let bytes = [&[1u8][..], &dimension[..], &zero[..], &zero[..]].concat();

    let outcome = SemanticIndex::from_bytes(&bytes, &test_project_root());
    assert!(outcome.is_err());
}
4645
/// A serialized header whose entry-count field is `MAX_ENTRIES + 1` must be
/// rejected by `SemanticIndex::from_bytes`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    // Layout written here: one version byte (1), then three little-endian u32s.
    let dimension = (DEFAULT_DIMENSION as u32).to_le_bytes();
    let entry_count = ((MAX_ENTRIES as u32) + 1).to_le_bytes();
    let zero = 0u32.to_le_bytes();
    let bytes = [&[1u8][..], &dimension[..], &entry_count[..], &zero[..]].concat();

    let outcome = SemanticIndex::from_bytes(&bytes, &test_project_root());
    assert!(outcome.is_err());
}
4656
/// After `invalidate_file`, the index must hold no entry, no mtime and no
/// size record for the invalidated path.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);

    // Seed one entry plus its metadata, all keyed by `target`.
    let chunk = SemanticChunk {
        file: target.clone(),
        name: String::from("main"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: String::from("main"),
        snippet: String::from("fn main() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&target));
    assert!(!index.file_sizes.contains_key(&target));
}
4685
/// A file that is indexed, has stale cached metadata, and has since been
/// removed from disk must be reported as deleted by the refresh, leaving no
/// entries or metadata for it in the index.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "vanished_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));

    // Overwrite the cached metadata so it no longer matches, then delete the file.
    let stale_size = *index.file_sizes.get(&file).unwrap() + 1;
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, stale_size);
    fs::remove_file(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&file);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 1);

    // Nothing about the vanished file may remain.
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&file));
    assert!(!index.file_sizes.contains_key(&file));
    assert!(!index.file_hashes.contains_key(&file));
}
4719
/// When the indexed path still exists but has been replaced by a directory,
/// the refresh must report no changes and leave the cached entries and the
/// cached (stale) metadata untouched.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "kept_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));
    let entries_before = index.entries.len();
    let original_mtime = *index.file_mtimes.get(&file).unwrap();
    let original_size = *index.file_sizes.get(&file).unwrap();

    // Record stale metadata, then swap the file for a directory of the same name.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = original_size + 1;
    set_file_metadata(&mut index, &file, stale_mtime, stale_size);
    fs::remove_file(&file).unwrap();
    fs::create_dir(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&file);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);

    // Cached entries survive.
    assert_eq!(index.entries.len(), entries_before);
    let kept = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept);

    // Cached metadata is still the stale pair written above.
    assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
    assert_eq!(index.file_sizes.get(&file), Some(&stale_size));
}
4762
/// Refreshing with a path that was never indexed and does not exist on disk
/// must report zero added/changed/deleted and record no metadata for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();

    // Only the parent directory is created; the file itself never exists.
    let missing = project_root.join("src/missing.rs");
    fs::create_dir_all(missing.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&missing);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);

    assert!(!index.file_mtimes.contains_key(&missing));
    assert!(!index.file_sizes.contains_key(&missing));
    assert!(index.entries.is_empty());
}
4790
/// With one file already indexed and a second one present only on disk, the
/// refresh must count exactly one addition and index the new file.
#[test]
fn refresh_reports_added_for_new_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let existing = project_root.join("src/lib.rs");
    let added = project_root.join("src/new.rs");
    fs::create_dir_all(existing.parent().unwrap()).unwrap();
    write_rust_file(&existing, "existing_symbol");
    write_rust_file(&added, "added_symbol");

    // The initial index only knows about `existing`.
    let mut index = build_test_index(project_root, std::slice::from_ref(&existing));

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let current_files = [existing.clone(), added.clone()];
    let summary = index
        .refresh_stale_files(project_root, &current_files, &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);

    assert!(index.file_mtimes.contains_key(&added));
    let indexed_new_file = index.entries.iter().any(|entry| entry.chunk.file == added);
    assert!(indexed_new_file);
}
4821
/// When an indexed file is removed from disk and the refresh is given an
/// empty file list, it must count one deletion and drop the file's entries.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let deleted = project_root.join("src/deleted.rs");
    fs::create_dir_all(deleted.parent().unwrap()).unwrap();
    write_rust_file(&deleted, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
    fs::remove_file(&deleted).unwrap();

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let refresh = index.refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress);
    let summary = refresh.unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);

    assert!(!index.file_mtimes.contains_key(&deleted));
    assert!(index.entries.is_empty());
}
4846
/// After an indexed file is rewritten with a different symbol (and its cached
/// metadata made stale), the refresh must count one change and replace the
/// old symbol's entry with the new one.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "old_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));

    // Reset cached metadata to epoch / zero size, then rewrite the file.
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&file, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&file);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embed, 8, &mut progress)
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    // True when any indexed chunk carries the given symbol name.
    let has_symbol =
        |needle: &str| index.entries.iter().any(|entry| entry.chunk.name == needle);
    assert!(has_symbol("new_symbol"));
    assert!(!has_symbol("old_symbol"));
}
4884
/// Refreshing an index whose only file is unchanged must be a no-op: the
/// summary reports nothing to do, the embed callback is never invoked, and
/// the entry count stays the same.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let project_root = workspace.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "clean_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&file));
    let entries_before = index.entries.len();

    // Wrap the test embedder so any call to it is observable afterwards.
    let mut embedder_invoked = false;
    let mut embed = |texts: Vec<String>| {
        embedder_invoked = true;
        test_vector_for_texts(texts)
    };
    let mut progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&file);
    let summary = index
        .refresh_stale_files(project_root, current_files, &mut embed, 8, &mut progress)
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embedder_invoked);
    assert_eq!(index.entries.len(), entries_before);
}
4916
/// A dlopen failure message naming the ONNX Runtime shared library must be
/// classified as "runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);

    assert!(unavailable);
}
4923
/// The formatted init error for a missing ONNX Runtime must lead with the
/// install hint and still include an "Original error:" section.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let formatted = format_embedding_init_error(
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
    );

    let leads_with_hint = formatted.starts_with("ONNX Runtime not found. Install via:");
    assert!(leads_with_hint);
    let keeps_original = formatted.contains("Original error:");
    assert!(keeps_original);
}
4933
/// Compares the timeout of a model built with `from_config` against one built
/// with `from_config_for_query` for three configured `timeout_ms` values:
/// 0 (unset), 60_000 (longer than the query default) and 3_000 (shorter).
#[test]
fn interactive_query_embedding_model_caps_remote_timeout() {
    // Builds a query-mode model from `cfg` and returns its effective timeout.
    let query_timeout = |cfg: &SemanticBackendConfig| {
        SemanticEmbeddingModel::from_config_for_query(cfg)
            .unwrap()
            .timeout_ms()
    };

    let mut config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(String::from("http://127.0.0.1:9")),
        api_key_env: None,
        timeout_ms: 0,
        max_batch_size: 64,
        max_files: 20_000,
    };

    // Case 1: no explicit timeout configured.
    let build_timeout = SemanticEmbeddingModel::from_config(&config)
        .unwrap()
        .timeout_ms();
    let unset_query_timeout = query_timeout(&config);
    assert_eq!(
        build_timeout, DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS,
        "background build keeps the longer default embedding timeout"
    );
    assert_eq!(
        unset_query_timeout, DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
        "interactive query embedding is capped below the dispatch transport timeout"
    );

    // Case 2: explicit timeout above the query default.
    config.timeout_ms = 60_000;
    assert_eq!(
        query_timeout(&config),
        DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
        "explicitly long backend timeouts are capped for interactive queries"
    );

    // Case 3: explicit timeout below the query default.
    config.timeout_ms = 3_000;
    assert_eq!(
        query_timeout(&config),
        3_000,
        "shorter explicit timeouts are respected for interactive queries"
    );
}
4975
/// Points an OpenAI-compatible backend at a mock HTTP server and checks that
/// it POSTs to `/v1/embeddings` and returns the two vectors from the canned
/// response in order.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        // Canned response body with two embeddings.
        String::from(
            r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#,
        )
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: String::from("test-embedding"),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();

    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    server.join().unwrap();
}
5002
/// Captures the raw bytes of one OpenAI-compatible embedding request on a
/// throwaway TCP listener and checks that the request carries exactly one
/// `Content-Type` header and a body containing the configured model name.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    // Bind to port 0 so the OS picks a free port for this test.
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        // Read until the header terminator has been seen and the announced
        // number of body bytes has arrived (or the peer closes).
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive, and a client
                        // may send `content-length` in lowercase. Matching only
                        // the title-cased spelling would leave the length at 0
                        // and stop reading before the body has arrived.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best effort: the assertions below only need the captured request.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively so duplicates with different
    // capitalisation are still detected.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
5095
/// Points an Ollama backend at a mock HTTP server and checks that it POSTs
/// to `/api/embed` and returns the two vectors from the canned response in
/// order.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let (base_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        // Canned response body with two embeddings.
        String::from(r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#)
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: String::from("embeddinggemma"),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();

    let inputs = vec![String::from("hello"), String::from("world")];
    let vectors = model.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
    server.join().unwrap();
}
5122
/// Writes an index with one fingerprint to disk, then checks that reading it
/// back succeeds with the matching fingerprint string and returns `None`
/// with a different one.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";
    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");

    // One entry plus metadata, persisted under an OpenAI-compatible fingerprint.
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);
    index.set_fingerprint(SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    });
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint: the cache loads.
    let matching = index.fingerprint().unwrap().as_string();
    let with_matching = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(with_matching.is_some());

    // Different backend/model/base_url: the cache is refused.
    let other_fingerprint = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let mismatched = other_fingerprint.as_string();
    let with_mismatched = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(with_mismatched.is_none());
}
5185
/// Writes a cache file whose version byte is forced to V3 and checks that
/// `read_from_disk` returns `None` and that `semantic.bin` no longer exists
/// afterwards.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let source_file = PathBuf::from("/src/main.rs");
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let chunk = SemanticChunk {
        file: source_file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Serialize normally, then overwrite the leading version byte with V3.
    let mut bytes = index.to_bytes();
    bytes[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, bytes).unwrap();

    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
5235
5236 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5237 crate::symbols::Symbol {
5238 name: name.to_string(),
5239 kind,
5240 range: crate::symbols::Range {
5241 start_line: start,
5242 start_col: 0,
5243 end_line: end,
5244 end_col: 0,
5245 },
5246 signature: None,
5247 scope_chain: Vec::new(),
5248 exported: false,
5249 parent: None,
5250 }
5251 }
5252
/// A symbol list consisting only of `Heading` symbols must produce no chunks.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let file = project_root.join("README.md");
    let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let mut symbols = Vec::new();
    symbols.push(make_symbol(SymbolKind::Heading, "Title", 0, 2));
    symbols.push(make_symbol(SymbolKind::Heading, "Section", 4, 6));

    let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
    let nothing_emitted = chunks.is_empty();
    assert!(
        nothing_emitted,
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
5275
/// The embed text built for a symbol carrying a 16,000-character signature
/// must not exceed `MAX_EMBED_TEXT_CHARS` characters.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let project_root = PathBuf::from("/proj");
    let file = project_root.join("cronjob.yaml");
    let source = "apiVersion: batch/v1\nkind: CronJob\n";

    // "kubectl " is 8 characters; 2000 repetitions give an oversized signature.
    let oversized_signature = "kubectl ".repeat(2000);
    let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    symbol.signature = Some(oversized_signature);

    let text = build_embed_text(&symbol, source, &file, &project_root);
    let char_count = text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
5300
/// With one heading and two code symbols as input, the output must contain
/// three chunks: a file-summary chunk plus the function and the struct, with
/// the heading absent.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let file = project_root.join("src/lib.rs");
    let source = "pub fn handle_request() -> bool {\n true\n}\n";

    let mut symbols = Vec::new();
    symbols.push(make_symbol(SymbolKind::Heading, "doc heading", 0, 1));
    symbols.push(make_symbol(SymbolKind::Function, "handle_request", 0, 2));
    symbols.push(make_symbol(SymbolKind::Struct, "AuthService", 4, 6));

    let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let mut names: Vec<&str> = Vec::with_capacity(chunks.len());
    for chunk in chunks.iter() {
        names.push(chunk.name.as_str());
    }
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
5335
/// Every `localhost`-style hostname listed below must pass
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once and reuse the outcome in the failure message, so the
        // reported value is the one the assertion actually checked (same
        // shape as the sibling SSRF tests).
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
5354
/// Every `127.x.x.x` URL listed below, with or without a port, must pass
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let result = validate_base_url_no_ssrf(url);
        assert!(
            result.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
5373
/// Every non-loopback address listed below must be rejected by
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in private_urls.iter() {
        let result = validate_base_url_no_ssrf(url);
        assert!(
            result.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            result
        );
    }
}
5395
/// Every `.local` hostname listed below must be rejected by
/// `validate_base_url_no_ssrf`.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_hosts.iter() {
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            result
        );
    }
}
5413
/// `normalize_base_url` must accept both a loopback IP and `localhost`, each
/// with an explicit port.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let by_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(by_ip.is_ok());

    let by_name = normalize_base_url("http://localhost:8080");
    assert!(by_name.is_ok());
}
5421
/// Exercises `is_private_non_loopback_ip` directly: the listed reserved
/// addresses must be flagged, while loopback addresses (IPv4, IPv6 and
/// IPv4-mapped) and a public address must not be.
#[test]
fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
    use std::net::IpAddr;
    // Parses the literal and asks the guard whether it would be blocked.
    let blocked = |literal: &str| {
        let ip: IpAddr = literal.parse().unwrap();
        is_private_non_loopback_ip(&ip)
    };

    // Addresses the guard must flag.
    assert!(blocked("10.0.0.1"));
    assert!(blocked("192.168.1.1"));
    assert!(blocked("169.254.0.1"));
    assert!(blocked("100.64.0.1"));
    assert!(
        blocked("198.18.0.1"),
        "RFC2544 benchmark range must be blocked"
    );
    assert!(blocked("224.0.0.1"), "multicast must be blocked");
    assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
    assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");

    // Loopback in every spelling must not be flagged.
    assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
    assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
    assert!(
        !blocked("::ffff:127.0.0.1"),
        "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
    );

    // A public address must not be flagged.
    assert!(!blocked("8.8.8.8"));
}
5452
/// The ONNX Runtime version-mismatch message must name the detected version,
/// the library path and the `v1.20+` requirement, list the "Auto-fix" option
/// before the manual removal option, and contain the exact doctor command.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_path = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_path);

    // Facts the message must report.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering of the two remedies.
    let auto_fix_pos = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let remove_pos = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_pos < remove_pos,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    // The command itself must appear verbatim.
    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
5493
/// When the requested library name carries no version but a resolved path
/// does, detection must report the version and path of the resolved library,
/// and the mismatch message must mention both. Compiled on Linux and macOS
/// only.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested = "libonnxruntime.so";
    let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The bare requested name yields no version on its own.
    assert_eq!(detect_ort_version_from_path(requested), None);

    let resolved = Some(actual.to_string());
    let (version, source) = detect_ort_version_from_resolved_or_requested(resolved, requested);

    assert_eq!(version.as_deref(), Some("1.19.0"));
    assert_eq!(source, actual);

    let msg = format_ort_version_mismatch(&version.unwrap(), &source);
    let mentions_version = msg.contains("v1.19.0");
    assert!(mentions_version);
    let mentions_path = msg.contains(actual);
    assert!(mentions_path);
}
5511
/// For a Homebrew `.dylib` path, the mismatch message must contain the
/// detected version and the path, with the path also appearing wrapped in
/// single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");

    let mentions_version = msg.contains("v1.9.0");
    assert!(mentions_version);
    let mentions_path = msg.contains("/opt/homebrew/lib/libonnxruntime.dylib");
    assert!(mentions_path);

    let path_is_quoted = msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'");
    assert!(
        path_is_quoted,
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
5528}