1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Dimension recorded on an index that is built from zero chunks.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): not referenced in the visible part of the file — presumably an upper bound on
// persisted entries; confirm at the use site.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding dimension accepted by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): the two header sizes are not used in the visible part of the file — presumably
// byte lengths of on-disk index headers; confirm against the (de)serialization code.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint that `format_embedding_init_error` prepends when ONNX Runtime cannot be loaded.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

// NOTE(review): version tags below are not used in the visible part of the file — presumably
// on-disk format versions of the semantic index; confirm against the reader/writer.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after a `/v1` segment) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does not end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured `timeout_ms` is 0 (applies to every backend's
/// client despite the name).
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Entry count at which `embed_query_cached` evicts the oldest cached query before inserting.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when none is configured or it fails to
/// normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes before giving up.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before retry `i + 1`, indexed by the zero-based index of the attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it waits on the lock file.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
/// Handle for the per-project semantic cache lock file obtained via
/// [`SemanticIndexLock::acquire`].
///
/// NOTE(review): presumably the file lock is released when the wrapped `fs_lock::LockGuard`
/// is dropped — confirm against `fs_lock`.
pub struct SemanticIndexLock {
    // Never read; kept only so the guard lives as long as this value.
    _guard: fs_lock::LockGuard,
}
66
67impl SemanticIndexLock {
68 pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69 let dir = storage_dir.join("semantic").join(project_key);
70 fs::create_dir_all(&dir)?;
71 let path = dir.join("cache.lock");
72 let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73 .lock()
74 .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75 fs_lock::try_acquire(&path, Duration::from_secs(2))
76 .map(|guard| Self { _guard: guard })
77 .map_err(|error| match error {
78 fs_lock::AcquireError::Timeout => {
79 std::io::Error::other("timed out acquiring semantic cache lock")
80 }
81 fs_lock::AcquireError::Io(error) => error,
82 })
83 }
84}
85
/// Identity of the embedding setup an index was built with.
///
/// `as_string` serializes this struct with `serde_json`, so the field names and their
/// declaration order feed directly into the encoded fingerprint string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend name as produced by `SemanticBackend::as_str`.
    pub backend: String,
    /// Model name copied from the backend configuration.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder; deserializes to an empty
    /// string when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()` when the field is
    /// absent from the serialized form.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
96
/// Chunking version stamped on new fingerprints and assumed for serialized fingerprints
/// that lack the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103 let base_url = config
106 .base_url
107 .as_ref()
108 .and_then(|u| normalize_base_url(u).ok())
109 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110 Self {
111 backend: config.backend.as_str().to_string(),
112 model: config.model.clone(),
113 base_url,
114 dimension,
115 chunking_version: default_chunking_version(),
116 }
117 }
118
119 pub fn as_string(&self) -> String {
120 serde_json::to_string(self).unwrap_or_else(|_| String::new())
121 }
122
123 fn matches_expected(&self, expected: &str) -> bool {
124 let encoded = self.as_string();
125 !encoded.is_empty() && encoded == expected
126 }
127}
128
/// Backend-specific state used by `SemanticEmbeddingModel::embed_texts`.
enum SemanticEmbeddingEngine {
    /// In-process `fastembed` model.
    Fastembed(TextEmbedding),
    /// Remote endpoint called with an OpenAI-style JSON body (`input` + `model`).
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
        /// Sent as `Authorization: Bearer …` when present.
        api_key: Option<String>,
    },
    /// Remote Ollama endpoint.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
    },
}
143
/// Embedding model bound to one configured backend, plus a small cache of query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as it appeared in the configuration (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout after applying the default for a configured value of 0.
    timeout_ms: u64,
    /// Effective batch size after applying the default for a configured value of 0.
    max_batch_size: usize,
    /// Embedding dimension once it has been observed; `None` until then.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, bounded by `QUERY_EMBEDDING_CACHE_CAP`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
157
/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
159
160fn validate_embedding_batch(
161 vectors: &[Vec<f32>],
162 expected_count: usize,
163 context: &str,
164) -> Result<(), String> {
165 if expected_count > 0 && vectors.is_empty() {
166 return Err(format!(
167 "{context} returned no vectors for {expected_count} inputs"
168 ));
169 }
170
171 if vectors.len() != expected_count {
172 return Err(format!(
173 "{context} returned {} vectors for {} inputs",
174 vectors.len(),
175 expected_count
176 ));
177 }
178
179 let Some(first_vector) = vectors.first() else {
180 return Ok(());
181 };
182 let expected_dimension = first_vector.len();
183 validate_embedding_dimension(expected_dimension)
184 .map_err(|error| format!("{context} returned {error}"))?;
185 for (index, vector) in vectors.iter().enumerate() {
186 if vector.len() != expected_dimension {
187 return Err(format!(
188 "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
189 vector.len()
190 ));
191 }
192 }
193
194 Ok(())
195}
196
197fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
198 if dimension == 0 || dimension > MAX_DIMENSION {
199 return Err(format!(
200 "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
201 ));
202 }
203
204 Ok(())
205}
206
207fn normalize_base_url(raw: &str) -> Result<String, String> {
211 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
212 let scheme = parsed.scheme();
213 if scheme != "http" && scheme != "https" {
214 return Err(format!(
215 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
216 scheme
217 ));
218 }
219 Ok(parsed.to_string().trim_end_matches('/').to_string())
220}
221
222pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
237 use std::net::{IpAddr, ToSocketAddrs};
238
239 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
240
241 let host = parsed.host_str().unwrap_or("");
242
243 let is_loopback_host =
248 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
249 if is_loopback_host {
250 return Ok(());
251 }
252
253 if host.ends_with(".local") {
256 return Err(format!(
257 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
258 ));
259 }
260
261 let port = parsed.port_or_known_default().unwrap_or(443);
264 let addr_str = format!("{host}:{port}");
265 let addrs: Vec<IpAddr> = addr_str
266 .to_socket_addrs()
267 .map(|iter| iter.map(|sa| sa.ip()).collect())
268 .unwrap_or_default();
269 for ip in &addrs {
270 if is_private_non_loopback_ip(ip) {
271 return Err(format!(
272 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
273 ));
274 }
275 }
276
277 Ok(())
278}
279
/// Returns true for addresses in private or reserved ranges, excluding loopback.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `100.64/10` and `0/8`.
/// IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses (`::ffff:a.b.c.d`) whose
/// embedded IPv4 address matches one of the IPv4 ranges above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => match v4.octets() {
            [10, ..] | [0, ..] => true,
            [192, 168, ..] | [169, 254, ..] => true,
            [172, second, ..] => (16..=31).contains(&second),
            [100, second, ..] => (64..=127).contains(&second),
            _ => false,
        },
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            if (leading & 0xffc0) == 0xfe80 || (leading & 0xfe00) == 0xfc00 {
                return true;
            }
            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
321
322fn build_openai_embeddings_endpoint(base_url: &str) -> String {
323 if base_url.ends_with("/v1") {
324 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
325 } else {
326 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
327 }
328}
329
330fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
331 if base_url.ends_with("/api") {
332 format!("{base_url}/embed")
333 } else {
334 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
335 }
336}
337
/// Trims surrounding whitespace from an optional value and maps blank values to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|token| !token.is_empty())
}
348
349fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
350 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
351}
352
/// Transport errors worth retrying: only those `reqwest` classifies as connect-related.
///
/// NOTE(review): timeouts and body-read failures are therefore never retried — confirm
/// that this is intended.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
356
357fn sleep_before_embedding_retry(attempt_index: usize) {
358 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
359 std::thread::sleep(Duration::from_millis(*delay_ms));
360 }
361}
362
363fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
364where
365 F: FnMut() -> reqwest::blocking::RequestBuilder,
366{
367 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
368 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
369
370 let response = match make_request().send() {
371 Ok(response) => response,
372 Err(error) => {
373 if !last_attempt && is_retryable_embedding_error(&error) {
374 sleep_before_embedding_retry(attempt_index);
375 continue;
376 }
377 return Err(format!("{backend_label} request failed: {error}"));
378 }
379 };
380
381 let status = response.status();
382 let raw = match response.text() {
383 Ok(raw) => raw,
384 Err(error) => {
385 if !last_attempt && is_retryable_embedding_error(&error) {
386 sleep_before_embedding_retry(attempt_index);
387 continue;
388 }
389 return Err(format!("{backend_label} response read failed: {error}"));
390 }
391 };
392
393 if status.is_success() {
394 return Ok(raw);
395 }
396
397 if !last_attempt && is_retryable_embedding_status(status) {
398 sleep_before_embedding_retry(attempt_index);
399 continue;
400 }
401
402 return Err(format!(
403 "{backend_label} request failed (HTTP {}): {}",
404 status, raw
405 ));
406 }
407
408 unreachable!("embedding request retries exhausted without returning")
409}
410
411impl SemanticEmbeddingModel {
412 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
413 let timeout_ms = if config.timeout_ms == 0 {
414 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
415 } else {
416 config.timeout_ms
417 };
418
419 let max_batch_size = if config.max_batch_size == 0 {
420 DEFAULT_MAX_BATCH_SIZE
421 } else {
422 config.max_batch_size
423 };
424
425 let api_key_env = normalize_api_key(config.api_key_env.clone());
426 let model = config.model.clone();
427
428 let client = Client::builder()
429 .timeout(Duration::from_millis(timeout_ms))
430 .redirect(reqwest::redirect::Policy::none())
431 .build()
432 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
433
434 let engine = match config.backend {
435 SemanticBackend::Fastembed => {
436 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
437 }
438 SemanticBackend::OpenAiCompatible => {
439 let raw = config.base_url.as_ref().ok_or_else(|| {
440 "base_url is required for openai_compatible backend".to_string()
441 })?;
442 let base_url = normalize_base_url(raw)?;
443
444 let api_key = match api_key_env {
445 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
446 format!("missing api_key_env '{var_name}' for openai_compatible backend")
447 })?),
448 None => None,
449 };
450
451 SemanticEmbeddingEngine::OpenAiCompatible {
452 client,
453 model,
454 base_url,
455 api_key,
456 }
457 }
458 SemanticBackend::Ollama => {
459 let raw = config
460 .base_url
461 .as_ref()
462 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
463 let base_url = normalize_base_url(raw)?;
464
465 SemanticEmbeddingEngine::Ollama {
466 client,
467 model,
468 base_url,
469 }
470 }
471 };
472
473 Ok(Self {
474 backend: config.backend,
475 model: config.model.clone(),
476 base_url: config.base_url.clone(),
477 timeout_ms,
478 max_batch_size,
479 dimension: None,
480 engine,
481 query_embedding_cache: HashMap::new(),
482 query_embedding_cache_order: VecDeque::new(),
483 query_embedding_cache_hits: 0,
484 query_embedding_cache_misses: 0,
485 })
486 }
487
/// Returns the backend kind this model was configured with.
pub fn backend(&self) -> SemanticBackend {
    self.backend
}
491
492 pub fn model(&self) -> &str {
493 &self.model
494 }
495
496 pub fn base_url(&self) -> Option<&str> {
497 self.base_url.as_deref()
498 }
499
/// Returns the effective batch size: the configured value, or `DEFAULT_MAX_BATCH_SIZE`
/// when the configuration specified 0.
pub fn max_batch_size(&self) -> usize {
    self.max_batch_size
}
503
/// Returns the effective request timeout in milliseconds: the configured value, or
/// `DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS` when the configuration specified 0.
pub fn timeout_ms(&self) -> u64 {
    self.timeout_ms
}
507
508 pub fn fingerprint(
509 &mut self,
510 config: &SemanticBackendConfig,
511 ) -> Result<SemanticIndexFingerprint, String> {
512 let dimension = self.dimension()?;
513 Ok(SemanticIndexFingerprint::from_config(config, dimension))
514 }
515
516 pub fn dimension(&mut self) -> Result<usize, String> {
517 if let Some(dimension) = self.dimension {
518 return Ok(dimension);
519 }
520
521 let dimension = match &mut self.engine {
522 SemanticEmbeddingEngine::Fastembed(model) => {
523 let vectors = model
524 .embed(vec!["semantic index fingerprint probe".to_string()], None)
525 .map_err(|error| format_embedding_init_error(error.to_string()))?;
526 vectors
527 .first()
528 .map(|v| v.len())
529 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530 }
531 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
532 let vectors =
533 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
534 vectors
535 .first()
536 .map(|v| v.len())
537 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
538 }
539 SemanticEmbeddingEngine::Ollama { .. } => {
540 let vectors =
541 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
542 vectors
543 .first()
544 .map(|v| v.len())
545 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
546 }
547 };
548
549 self.dimension = Some(dimension);
550 Ok(dimension)
551 }
552
/// Embeds `texts` with the configured backend, one vector per input; public entry point
/// that forwards to `embed_texts`.
pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    self.embed_texts(texts)
}
556
557 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
558 if let Some(vector) = self.query_embedding_cache.get(query) {
559 self.query_embedding_cache_hits += 1;
560 return Ok(vector.clone());
561 }
562
563 self.query_embedding_cache_misses += 1;
564 let embeddings = self.embed_texts(vec![query.to_string()])?;
565 let vector = embeddings
566 .first()
567 .cloned()
568 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
569
570 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
571 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
572 self.query_embedding_cache.remove(&oldest);
573 }
574 }
575 self.query_embedding_cache
576 .insert(query.to_string(), vector.clone());
577 self.query_embedding_cache_order
578 .push_back(query.to_string());
579
580 Ok(vector)
581 }
582
583 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
584 (
585 self.query_embedding_cache_hits,
586 self.query_embedding_cache_misses,
587 self.query_embedding_cache.len(),
588 )
589 }
590
591 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
592 match &mut self.engine {
593 SemanticEmbeddingEngine::Fastembed(model) => model
594 .embed(texts, None::<usize>)
595 .map_err(|error| format_embedding_init_error(error.to_string()))
596 .map_err(|error| format!("failed to embed batch: {error}")),
597 SemanticEmbeddingEngine::OpenAiCompatible {
598 client,
599 model,
600 base_url,
601 api_key,
602 } => {
603 let expected_text_count = texts.len();
604 let endpoint = build_openai_embeddings_endpoint(base_url);
605 let body = serde_json::json!({
606 "input": texts,
607 "model": model,
608 });
609
610 let raw = send_embedding_request(
611 || {
612 let mut request = client.post(&endpoint).json(&body);
622
623 if let Some(api_key) = api_key {
624 request = request.header("Authorization", format!("Bearer {api_key}"));
625 }
626
627 request
628 },
629 "openai compatible",
630 )?;
631
632 #[derive(Deserialize)]
633 struct OpenAiResponse {
634 data: Vec<OpenAiEmbeddingResult>,
635 }
636
637 #[derive(Deserialize)]
638 struct OpenAiEmbeddingResult {
639 embedding: Vec<f32>,
640 index: Option<u32>,
641 }
642
643 let parsed: OpenAiResponse = serde_json::from_str(&raw)
644 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
645 if parsed.data.len() != expected_text_count {
646 return Err(format!(
647 "openai compatible response returned {} embeddings for {} inputs",
648 parsed.data.len(),
649 expected_text_count
650 ));
651 }
652
653 let mut vectors = vec![Vec::new(); parsed.data.len()];
654 for (i, item) in parsed.data.into_iter().enumerate() {
655 let index = item.index.unwrap_or(i as u32) as usize;
656 if index >= vectors.len() {
657 return Err(
658 "openai compatible response contains invalid vector index".to_string()
659 );
660 }
661 vectors[index] = item.embedding;
662 }
663
664 for vector in &vectors {
665 if vector.is_empty() {
666 return Err(
667 "openai compatible response contained missing vectors".to_string()
668 );
669 }
670 }
671
672 self.dimension = vectors.first().map(Vec::len);
673 Ok(vectors)
674 }
675 SemanticEmbeddingEngine::Ollama {
676 client,
677 model,
678 base_url,
679 } => {
680 let expected_text_count = texts.len();
681 let endpoint = build_ollama_embeddings_endpoint(base_url);
682
683 #[derive(Serialize)]
684 struct OllamaPayload<'a> {
685 model: &'a str,
686 input: Vec<String>,
687 }
688
689 let payload = OllamaPayload {
690 model,
691 input: texts,
692 };
693
694 let raw = send_embedding_request(
695 || {
696 client.post(&endpoint).json(&payload)
701 },
702 "ollama",
703 )?;
704
705 #[derive(Deserialize)]
706 struct OllamaResponse {
707 embeddings: Vec<Vec<f32>>,
708 }
709
710 let parsed: OllamaResponse = serde_json::from_str(&raw)
711 .map_err(|error| format!("invalid ollama response: {error}"))?;
712 if parsed.embeddings.is_empty() {
713 return Err("ollama response returned no embeddings".to_string());
714 }
715 if parsed.embeddings.len() != expected_text_count {
716 return Err(format!(
717 "ollama response returned {} embeddings for {} inputs",
718 parsed.embeddings.len(),
719 expected_text_count
720 ));
721 }
722
723 let vectors = parsed.embeddings;
724 for vector in &vectors {
725 if vector.is_empty() {
726 return Err("ollama response contained empty embeddings".to_string());
727 }
728 }
729
730 self.dimension = vectors.first().map(Vec::len);
731 Ok(vectors)
732 }
733 }
734 }
735}
736
/// Checks that an ONNX Runtime shared library can be loaded and, when its version can be
/// determined, that the version is 1.x with x >= 20.
///
/// The library path is taken from the `ORT_DYLIB_PATH` environment variable, falling back
/// to the platform's default library name. On Linux/macOS the library is opened with
/// `dlopen` and the version is inferred from file names; on Windows it is opened with
/// `LoadLibraryExW` and the version is read from the file's version resource. On any
/// other target no check is compiled in and the function returns `Ok(())`.
///
/// # Errors
/// Returns a message when the library cannot be loaded, or when a version was detected
/// and is not acceptable. An undetectable version is not an error.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY (review): `c_name` is a NUL-terminated `CString` that outlives the
        // `dlopen` call; the `dlerror` pointer is dereferenced only after a null check;
        // `handle` is non-null when passed to `dlclose`.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version is inferred from file names, not from the loaded library.
            let detected_version = detect_ort_version_from_path(lib_name);

            // The handle was only needed to prove the library loads.
            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // Only enforce the requirement when both major and minor parse as numbers.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-info record returned by `VerQueryValueW`
        // for the root block.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32,
            dw_file_version_ls: u32,
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY (review): `wide` is NUL-terminated and outlives the load call; every
        // pointer handed to the Win32 functions refers to a live local buffer.
        // NOTE(review): `vs_len` is not compared against the size of `VS_FIXEDFILEINFO`
        // before `fixed` is read — confirm this is acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // Both stay 0 when the version could not be read; 0 means "unknown" below.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Resolve the path of the module that was actually loaded.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" (NUL-terminated) selects the root block.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            // High word = major version, low word = minor version.
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            // A major version of 0 means the version is unknown and is not rejected.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
924
925#[cfg(any(test, target_os = "linux", target_os = "macos"))]
928fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
929 let path = std::path::Path::new(lib_path);
930
931 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
933 .into_iter()
934 .flatten()
935 {
936 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
937 if let Some(version) = extract_version_from_filename(name) {
938 return Some(version);
939 }
940 }
941 }
942
943 if let Some(parent) = path.parent() {
945 if let Ok(entries) = std::fs::read_dir(parent) {
946 for entry in entries.flatten() {
947 if let Some(name) = entry.file_name().to_str() {
948 if name.starts_with("libonnxruntime") {
949 if let Some(version) = extract_version_from_filename(name) {
950 return Some(version);
951 }
952 }
953 }
954 }
955 }
956 }
957
958 None
959}
960
961#[cfg(any(test, target_os = "linux", target_os = "macos"))]
963fn extract_version_from_filename(name: &str) -> Option<String> {
964 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
966 re.find(name).map(|m| m.as_str().to_string())
967}
968
/// Suggests a shell command for removing an outdated ONNX Runtime library.
///
/// For libraries under `/usr/local/lib`, or referenced only by the default library name,
/// Linux and macOS builds suggest a `sudo rm` of the system-wide files (plus `ldconfig`
/// on Linux). Every other case suggests removing `lib_path` itself.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!(" rm '{}'", lib_path)
}
981
982pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
988 format!(
989 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
990 Solutions:\n\
991 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
992 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
993 configures the bridge to load it instead of the system library — no \
994 changes to '{}'.\n\
995 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
996 {}\n\
997 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
998 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
999 version,
1000 lib_name,
1001 lib_name,
1002 suggest_removal_command(lib_name),
1003 )
1004}
1005
1006pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
1007 pre_validate_onnx_runtime()?;
1009
1010 let selected_model = match model {
1011 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
1012 _ => {
1013 return Err(format!(
1014 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
1015 model
1016 ))
1017 }
1018 };
1019
1020 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
1021}
1022
/// Heuristically decides whether an error message means ONNX Runtime could not be loaded.
///
/// True when the message (ignoring leading whitespace) starts with
/// `ONNX Runtime not found.`, or when — case-insensitively — it mentions the runtime
/// and also contains a phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
1048
1049fn format_embedding_init_error(error: impl Display) -> String {
1050 let message = error.to_string();
1051
1052 if is_onnx_runtime_unavailable(&message) {
1053 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1054 }
1055
1056 format!("failed to initialize semantic embedding model: {message}")
1057}
1058
/// One indexable unit of source code together with the text that gets embedded for it.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk was extracted from.
    pub file: PathBuf,
    /// Name of the chunk (presumably the symbol name — confirm against the extractor).
    pub name: String,
    /// Kind of symbol the chunk represents.
    pub kind: SymbolKind,
    /// First line of the chunk. NOTE(review): 0- vs 1-based is not visible here.
    pub start_line: u32,
    /// Last line of the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Text handed to the embedding backend for this chunk.
    pub embed_text: String,
    /// Source excerpt carried along for display in results.
    pub snippet: String,
}
1078
/// A chunk paired with the embedding vector computed for it.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
1085
/// In-memory semantic index: embedded chunks plus per-file freshness metadata.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// All embedded chunks.
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded for each indexed file.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size recorded for each indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// BLAKE3 content hash recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Length of the embedding vectors stored in this index.
    dimension: usize,
    /// Fingerprint of the embedding setup the index was built with, when known.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; constructors debug-assert that it is absolute.
    project_root: PathBuf,
    // NOTE(review): no use is visible in this part of the file — presumably files whose
    // indexing was postponed; confirm at the use site.
    deferred_files: HashSet<PathBuf>,
}
1101
/// Freshness metadata captured for one file at indexing time.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// File size (presumably in bytes — confirm against `collect_file_metadata`).
    size: u64,
    /// BLAKE3 hash of the file contents.
    content_hash: blake3::Hash,
}
1108
/// Counters describing the outcome of an index refresh.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Number of files counted as changed.
    pub changed: usize,
    /// Number of files counted as added.
    pub added: usize,
    /// Number of files counted as deleted.
    pub deleted: usize,
    /// Number of files processed; not considered by `is_noop`.
    pub total_processed: usize,
}
1118
1119impl RefreshSummary {
1120 pub fn is_noop(&self) -> bool {
1122 self.changed == 0 && self.added == 0 && self.deleted == 0
1123 }
1124}
1125
/// Result bundle of refreshing a set of invalidated files.
///
/// NOTE(review): the producer and consumer are outside the visible part of the file; the
/// field notes below are inferred from names and types only — confirm at the use site.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Presumably the newly embedded entries to add to the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Presumably the fresh metadata per refreshed file.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Presumably the paths whose refresh finished.
    pub completed_paths: Vec<PathBuf>,
    /// Counters for this refresh.
    pub summary: RefreshSummary,
}
1133
/// One search hit; its location and symbol fields have the same names and types as
/// those of `SemanticChunk`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    /// Relevance score. NOTE(review): the metric and range are not visible here.
    pub score: f32,
    /// Static label identifying where the result came from; the set of values is not
    /// visible in this part of the file.
    pub source: &'static str,
}
1147
1148impl SemanticIndex {
1149 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1150 debug_assert!(project_root.is_absolute());
1151 Self {
1152 entries: Vec::new(),
1153 file_mtimes: HashMap::new(),
1154 file_sizes: HashMap::new(),
1155 file_hashes: HashMap::new(),
1156 dimension,
1157 fingerprint: None,
1158 project_root,
1159 deferred_files: HashSet::new(),
1160 }
1161 }
1162
/// Number of embedded chunks held by the index.
pub fn entry_count(&self) -> usize {
    self.entries.len()
}
1167
/// Number of files with recorded metadata, counted via the modification-time map.
pub fn indexed_file_count(&self) -> usize {
    self.file_mtimes.len()
}
1172
1173 pub fn status_label(&self) -> &'static str {
1175 if self.entries.is_empty() {
1176 "empty"
1177 } else {
1178 "ready"
1179 }
1180 }
1181
1182 fn collect_chunks(
1183 project_root: &Path,
1184 files: &[PathBuf],
1185 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1186 let per_file: Vec<(
1187 PathBuf,
1188 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1189 )> = files
1190 .par_iter()
1191 .map_init(HashMap::new, |parsers, file| {
1192 let result = collect_file_metadata(file).and_then(|metadata| {
1193 collect_file_chunks(project_root, file, parsers)
1194 .map(|chunks| (metadata, chunks))
1195 });
1196 (file.clone(), result)
1197 })
1198 .collect();
1199
1200 let mut chunks: Vec<SemanticChunk> = Vec::new();
1201 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1202
1203 for (file, result) in per_file {
1204 match result {
1205 Ok((metadata, file_chunks)) => {
1206 file_metadata.insert(file, metadata);
1207 chunks.extend(file_chunks);
1208 }
1209 Err(error) => {
1210 if error == "unsupported file extension" {
1216 continue;
1217 }
1218 slog_warn!(
1219 "failed to collect semantic chunks for {}: {}",
1220 file.display(),
1221 error
1222 );
1223 }
1224 }
1225 }
1226
1227 (chunks, file_metadata)
1228 }
1229
1230 fn build_from_chunks<F, P>(
1231 project_root: &Path,
1232 chunks: Vec<SemanticChunk>,
1233 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1234 embed_fn: &mut F,
1235 max_batch_size: usize,
1236 mut progress: Option<&mut P>,
1237 ) -> Result<Self, String>
1238 where
1239 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1240 P: FnMut(usize, usize),
1241 {
1242 debug_assert!(project_root.is_absolute());
1243 let total_chunks = chunks.len();
1244
1245 if chunks.is_empty() {
1246 return Ok(Self {
1247 entries: Vec::new(),
1248 file_mtimes: file_metadata
1249 .iter()
1250 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1251 .collect(),
1252 file_sizes: file_metadata
1253 .iter()
1254 .map(|(path, metadata)| (path.clone(), metadata.size))
1255 .collect(),
1256 file_hashes: file_metadata
1257 .into_iter()
1258 .map(|(path, metadata)| (path, metadata.content_hash))
1259 .collect(),
1260 dimension: DEFAULT_DIMENSION,
1261 fingerprint: None,
1262 project_root: project_root.to_path_buf(),
1263 deferred_files: HashSet::new(),
1264 });
1265 }
1266
1267 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1269 let mut expected_dimension: Option<usize> = None;
1270 let batch_size = max_batch_size.max(1);
1271 for batch_start in (0..chunks.len()).step_by(batch_size) {
1272 let batch_end = (batch_start + batch_size).min(chunks.len());
1273 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1274 .iter()
1275 .map(|c| c.embed_text.clone())
1276 .collect();
1277
1278 let vectors = embed_fn(batch_texts)?;
1279 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1280
1281 if let Some(dim) = vectors.first().map(|v| v.len()) {
1283 match expected_dimension {
1284 None => expected_dimension = Some(dim),
1285 Some(expected) if dim != expected => {
1286 return Err(format!(
1287 "embedding dimension changed across batches: expected {expected}, got {dim}"
1288 ));
1289 }
1290 _ => {}
1291 }
1292 }
1293
1294 for (i, vector) in vectors.into_iter().enumerate() {
1295 let chunk_idx = batch_start + i;
1296 entries.push(EmbeddingEntry {
1297 chunk: chunks[chunk_idx].clone(),
1298 vector,
1299 });
1300 }
1301
1302 if let Some(callback) = progress.as_mut() {
1303 callback(entries.len(), total_chunks);
1304 }
1305 }
1306
1307 let dimension = entries
1308 .first()
1309 .map(|e| e.vector.len())
1310 .unwrap_or(DEFAULT_DIMENSION);
1311
1312 Ok(Self {
1313 entries,
1314 file_mtimes: file_metadata
1315 .iter()
1316 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1317 .collect(),
1318 file_sizes: file_metadata
1319 .iter()
1320 .map(|(path, metadata)| (path.clone(), metadata.size))
1321 .collect(),
1322 file_hashes: file_metadata
1323 .into_iter()
1324 .map(|(path, metadata)| (path, metadata.content_hash))
1325 .collect(),
1326 dimension,
1327 fingerprint: None,
1328 project_root: project_root.to_path_buf(),
1329 deferred_files: HashSet::new(),
1330 })
1331 }
1332
1333 pub fn build<F>(
1336 project_root: &Path,
1337 files: &[PathBuf],
1338 embed_fn: &mut F,
1339 max_batch_size: usize,
1340 ) -> Result<Self, String>
1341 where
1342 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1343 {
1344 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1345 Self::build_from_chunks(
1346 project_root,
1347 chunks,
1348 file_mtimes,
1349 embed_fn,
1350 max_batch_size,
1351 Option::<&mut fn(usize, usize)>::None,
1352 )
1353 }
1354
1355 pub fn build_with_progress<F, P>(
1357 project_root: &Path,
1358 files: &[PathBuf],
1359 embed_fn: &mut F,
1360 max_batch_size: usize,
1361 progress: &mut P,
1362 ) -> Result<Self, String>
1363 where
1364 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1365 P: FnMut(usize, usize),
1366 {
1367 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1368 let total_chunks = chunks.len();
1369 progress(0, total_chunks);
1370 Self::build_from_chunks(
1371 project_root,
1372 chunks,
1373 file_mtimes,
1374 embed_fn,
1375 max_batch_size,
1376 Some(progress),
1377 )
1378 }
1379
    /// Reconciles the index with `current_files` and re-embeds what changed.
    ///
    /// Tracked files missing from `current_files` are removed, tracked files
    /// that fail the strict freshness check are re-embedded, and files in
    /// `current_files` that are not tracked yet are embedded for the first
    /// time.
    ///
    /// `progress` is called with `(0, 0)` when there is nothing to embed;
    /// otherwise with `(0, total_chunks)` before embedding and with
    /// `(embedded_so_far, total_chunks)` after each batch. `max_batch_size`
    /// of `0` is treated as `1`.
    ///
    /// The returned summary counts only changed/added files whose chunks and
    /// metadata were collected successfully. `total_processed` is the size of
    /// the union of `current_files` and the previously tracked files.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when new vectors have a different dimension than the existing
    /// index. Removals of deleted/vanished files and mtime/size updates made
    /// before the failure are not rolled back.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Forget deferred files that are no longer part of the current file set.
        self.deferred_files
            .retain(|path| current_set.contains(path.as_path()));
        // |current ∪ tracked| = |current| + |tracked| - |current ∩ tracked|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every tracked file as deleted, changed, or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Snapshot the keys so the maps can be updated while iterating.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A file lacking any of mtime/size/hash cannot be verified and is
            // treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached
                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
            {
                Some(FreshnessVerdict::HotFresh) => {}
                // The verdict carries replacement mtime/size values; record
                // them without re-embedding the file.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files in the current set that the index has never tracked.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        // Previously tracked files that could not be collected and no longer
        // exist on disk are counted as deleted.
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        // Nothing to embed: successfully collected files produced no chunks,
        // so drop their old entries and just refresh their metadata.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            for file in &successful_files {
                self.deferred_files.remove(file);
            }
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // Only an index that already holds entries pins the dimension.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Old entries are replaced only after every batch embedded
        // successfully, so a failed embed leaves them in place.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1621
    /// Re-indexes an explicit list of invalidated `paths`.
    ///
    /// The work list is `paths` plus every deferred file that still exists on
    /// disk, sorted and de-duplicated. All of those paths are removed from the
    /// index up front; the ones that still exist are then re-collected and
    /// re-embedded.
    ///
    /// `max_files` caps the number of tracked files: files that were not
    /// tracked before are only admitted while
    /// `retained + successfully re-collected previously-tracked files` stays
    /// below the cap. The rest are recorded in `deferred_files` and a warning
    /// is logged.
    ///
    /// `progress` is called with `(0, 0)` when there is nothing to embed;
    /// otherwise with `(0, total_chunks)` before embedding and with
    /// `(embedded_so_far, total_chunks)` after each batch. `max_batch_size`
    /// of `0` is treated as `1`.
    ///
    /// The result carries the new entries, the metadata recorded for each
    /// successfully collected file, the full work list as `completed_paths`,
    /// and a summary of changed/added/deleted counts.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails on a dimension mismatch with the existing index. The up-front
    /// removal of the requested paths is not rolled back on error.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Deferred files that disappeared from disk are dropped; the
        // survivors are retried together with the requested paths.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Remember which paths were tracked before removal so they can be
        // classified as changed/deleted rather than added.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();

        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        if existing_paths.is_empty() {
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the file cap. Files still tracked after the removal above,
        // plus previously tracked files that were re-collected successfully,
        // consume the budget first; new files only get what is left.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // `existing_paths` is sorted, so the admitted files are the first
            // ones in path order.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        // Everything left in `fresh_metadata` will be indexed now, so it is no
        // longer deferred.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        // No chunks to embed: record the metadata only.
        if chunks.is_empty() {
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // The stored dimension is binding unless the index is empty and none
        // of the requested paths had been tracked before.
        let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|chunk| chunk.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during invalidated-file refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Keep a copy of the new entries for the caller in addition to
        // storing them in this index.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
1859
1860 pub fn apply_refresh_update(
1861 &mut self,
1862 added_entries: Vec<EmbeddingEntry>,
1863 updated_metadata: Vec<(PathBuf, FileFreshness)>,
1864 completed_paths: &[PathBuf],
1865 ) {
1866 self.remove_indexed_files(completed_paths);
1867
1868 let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1869 self.entries.extend(added_entries);
1870 for (file, freshness) in updated_metadata {
1871 self.file_mtimes.insert(file.clone(), freshness.mtime);
1872 self.file_sizes.insert(file.clone(), freshness.size);
1873 self.file_hashes.insert(file, freshness.content_hash);
1874 }
1875 if let Some(dim) = observed_dimension {
1876 self.dimension = dim;
1877 }
1878 }
1879
1880 fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1881 let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1882 self.entries
1883 .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1884 for path in files {
1885 self.file_mtimes.remove(path);
1886 self.file_sizes.remove(path);
1887 self.file_hashes.remove(path);
1888 }
1889 }
1890
1891 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1893 if self.entries.is_empty() || query_vector.len() != self.dimension {
1894 return Vec::new();
1895 }
1896
1897 let mut scored: Vec<(f32, usize)> = self
1898 .entries
1899 .iter()
1900 .enumerate()
1901 .map(|(i, entry)| {
1902 let mut score = cosine_similarity(query_vector, &entry.vector);
1903 if entry.chunk.exported {
1904 score *= 1.1;
1905 }
1906 (score, i)
1907 })
1908 .collect();
1909
1910 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1912
1913 scored
1914 .into_iter()
1915 .take(top_k)
1916 .map(|(score, idx)| {
1920 let entry = &self.entries[idx];
1921 SemanticResult {
1922 file: entry.chunk.file.clone(),
1923 name: entry.chunk.name.clone(),
1924 kind: entry.chunk.kind.clone(),
1925 start_line: entry.chunk.start_line,
1926 end_line: entry.chunk.end_line,
1927 exported: entry.chunk.exported,
1928 snippet: entry.chunk.snippet.clone(),
1929 score,
1930 source: "semantic",
1931 }
1932 })
1933 .collect()
1934 }
1935
    /// Returns the number of embedding entries in the index.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
1940
1941 pub fn is_file_stale(&self, file: &Path) -> bool {
1943 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1944 return true;
1945 };
1946 let Some(stored_size) = self.file_sizes.get(file) else {
1947 return true;
1948 };
1949 let Some(stored_hash) = self.file_hashes.get(file) else {
1950 return true;
1951 };
1952 let cached = FileFreshness {
1953 mtime: *stored_mtime,
1954 size: *stored_size,
1955 content_hash: *stored_hash,
1956 };
1957 match cache_freshness::verify_file_strict(file, &cached) {
1958 FreshnessVerdict::HotFresh => false,
1959 FreshnessVerdict::ContentFresh { .. } => false,
1960 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1961 }
1962 }
1963
1964 fn backfill_missing_file_sizes(&mut self) {
1965 for path in self.file_mtimes.keys() {
1966 if self.file_sizes.contains_key(path) {
1967 continue;
1968 }
1969 if let Ok(metadata) = fs::metadata(path) {
1970 self.file_sizes.insert(path.clone(), metadata.len());
1971 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1972 self.file_hashes.insert(path.clone(), hash);
1973 }
1974 }
1975 }
1976 }
1977
    /// Removes `file` from the index by delegating to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
1982
1983 pub fn invalidate_file(&mut self, file: &Path) {
1984 let canonical_file = canonicalize_existing_or_deleted_path(file);
1985 self.entries
1986 .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
1987 self.file_mtimes.remove(file);
1988 self.file_sizes.remove(file);
1989 self.file_hashes.remove(file);
1990 if canonical_file.as_path() != file {
1991 self.file_mtimes.remove(&canonical_file);
1992 self.file_sizes.remove(&canonical_file);
1993 self.file_hashes.remove(&canonical_file);
1994 }
1995 }
1996
    /// Returns the vector dimension recorded for this index.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2001
    /// Returns the fingerprint attached to this index, or `None` when none
    /// has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2005
2006 pub fn backend_label(&self) -> Option<&str> {
2007 self.fingerprint.as_ref().map(|f| f.backend.as_str())
2008 }
2009
2010 pub fn model_label(&self) -> Option<&str> {
2011 self.fingerprint.as_ref().map(|f| f.model.as_str())
2012 }
2013
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2017
2018 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2020 if self.entries.is_empty() {
2023 slog_info!("skipping semantic index persistence (0 entries)");
2024 return;
2025 }
2026 let dir = storage_dir.join("semantic").join(project_key);
2027 if let Err(e) = fs::create_dir_all(&dir) {
2028 slog_warn!("failed to create semantic cache dir: {}", e);
2029 return;
2030 }
2031 let data_path = dir.join("semantic.bin");
2032 let tmp_path = dir.join(format!(
2033 "semantic.bin.tmp.{}.{}",
2034 std::process::id(),
2035 SystemTime::now()
2036 .duration_since(SystemTime::UNIX_EPOCH)
2037 .unwrap_or(Duration::ZERO)
2038 .as_nanos()
2039 ));
2040 let bytes = self.to_bytes();
2041 let write_result = (|| -> std::io::Result<()> {
2042 use std::io::Write;
2043 let mut file = fs::File::create(&tmp_path)?;
2044 file.write_all(&bytes)?;
2045 file.sync_all()?;
2046 Ok(())
2047 })();
2048 if let Err(e) = write_result {
2049 slog_warn!("failed to write semantic index: {}", e);
2050 let _ = fs::remove_file(&tmp_path);
2051 return;
2052 }
2053 if let Err(e) = fs::rename(&tmp_path, &data_path) {
2054 slog_warn!("failed to rename semantic index: {}", e);
2055 let _ = fs::remove_file(&tmp_path);
2056 return;
2057 }
2058 slog_info!(
2059 "semantic index persisted: {} entries, {:.1} KB",
2060 self.entries.len(),
2061 bytes.len() as f64 / 1024.0
2062 );
2063 }
2064
2065 pub fn read_from_disk(
2067 storage_dir: &Path,
2068 project_key: &str,
2069 current_canonical_root: &Path,
2070 is_worktree_bridge: bool,
2071 expected_fingerprint: Option<&str>,
2072 ) -> Option<Self> {
2073 debug_assert!(current_canonical_root.is_absolute());
2074 let data_path = storage_dir
2075 .join("semantic")
2076 .join(project_key)
2077 .join("semantic.bin");
2078 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2079 if file_len < HEADER_BYTES_V1 {
2080 slog_warn!(
2081 "corrupt semantic index (too small: {} bytes), removing",
2082 file_len
2083 );
2084 if !is_worktree_bridge {
2085 let _ = fs::remove_file(&data_path);
2086 }
2087 return None;
2088 }
2089
2090 let bytes = fs::read(&data_path).ok()?;
2091 let version = bytes[0];
2092 if version != SEMANTIC_INDEX_VERSION_V6 {
2093 slog_info!(
2094 "cached semantic index version {} is older than {}, rebuilding",
2095 version,
2096 SEMANTIC_INDEX_VERSION_V6
2097 );
2098 if !is_worktree_bridge {
2099 let _ = fs::remove_file(&data_path);
2100 }
2101 return None;
2102 }
2103 match Self::from_bytes(&bytes, current_canonical_root) {
2104 Ok(index) => {
2105 if index.entries.is_empty() {
2106 slog_info!("cached semantic index is empty, will rebuild");
2107 if !is_worktree_bridge {
2108 let _ = fs::remove_file(&data_path);
2109 }
2110 return None;
2111 }
2112 if let Some(expected) = expected_fingerprint {
2113 let matches = index
2114 .fingerprint()
2115 .map(|fingerprint| fingerprint.matches_expected(expected))
2116 .unwrap_or(false);
2117 if !matches {
2118 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2119 if !is_worktree_bridge {
2120 let _ = fs::remove_file(&data_path);
2121 }
2122 return None;
2123 }
2124 }
2125 slog_info!(
2126 "loaded semantic index from disk: {} entries",
2127 index.entries.len()
2128 );
2129 Some(index)
2130 }
2131 Err(e) => {
2132 slog_warn!("corrupt semantic index, rebuilding: {}", e);
2133 if !is_worktree_bridge {
2134 let _ = fs::remove_file(&data_path);
2135 }
2136 None
2137 }
2138 }
2139 }
2140
2141 pub fn to_bytes(&self) -> Vec<u8> {
2143 let mut buf = Vec::new();
2144 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2145 let encoded = fingerprint.as_string();
2146 if encoded.is_empty() {
2147 None
2148 } else {
2149 Some(encoded.into_bytes())
2150 }
2151 });
2152 let file_mtimes: Vec<_> = self
2153 .file_mtimes
2154 .iter()
2155 .filter_map(|(path, mtime)| {
2156 cache_relative_path(&self.project_root, path)
2157 .map(|relative| (relative, path, mtime))
2158 })
2159 .collect();
2160 let entries: Vec<_> = self
2161 .entries
2162 .iter()
2163 .filter_map(|entry| {
2164 cache_relative_path(&self.project_root, &entry.chunk.file)
2165 .map(|relative| (relative, entry))
2166 })
2167 .collect();
2168
2169 let version = SEMANTIC_INDEX_VERSION_V6;
2182 buf.push(version);
2183 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2184 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2185 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2186 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2187 buf.extend_from_slice(fp_bytes_ref);
2188
2189 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2192 for (relative, path, mtime) in &file_mtimes {
2193 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2194 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2195 buf.extend_from_slice(&path_bytes);
2196 let duration = mtime
2197 .duration_since(SystemTime::UNIX_EPOCH)
2198 .unwrap_or_default();
2199 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2200 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2201 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2202 buf.extend_from_slice(&size.to_le_bytes());
2203 let hash = self
2204 .file_hashes
2205 .get(*path)
2206 .copied()
2207 .unwrap_or_else(cache_freshness::zero_hash);
2208 buf.extend_from_slice(hash.as_bytes());
2209 }
2210
2211 for (relative, entry) in &entries {
2213 let c = &entry.chunk;
2214
2215 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2217 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2218 buf.extend_from_slice(&file_bytes);
2219
2220 let name_bytes = c.name.as_bytes();
2222 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2223 buf.extend_from_slice(name_bytes);
2224
2225 buf.push(symbol_kind_to_u8(&c.kind));
2227
2228 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2230 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2231 buf.push(c.exported as u8);
2232
2233 let snippet_bytes = c.snippet.as_bytes();
2235 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2236 buf.extend_from_slice(snippet_bytes);
2237
2238 let embed_bytes = c.embed_text.as_bytes();
2240 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2241 buf.extend_from_slice(embed_bytes);
2242
2243 for &val in &entry.vector {
2245 buf.extend_from_slice(&val.to_le_bytes());
2246 }
2247 }
2248
2249 buf
2250 }
2251
2252 pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2254 debug_assert!(current_canonical_root.is_absolute());
2255 let mut pos = 0;
2256
2257 if data.len() < HEADER_BYTES_V1 {
2258 return Err("data too short".to_string());
2259 }
2260
2261 let version = data[pos];
2262 pos += 1;
2263 if version != SEMANTIC_INDEX_VERSION_V1
2264 && version != SEMANTIC_INDEX_VERSION_V2
2265 && version != SEMANTIC_INDEX_VERSION_V3
2266 && version != SEMANTIC_INDEX_VERSION_V4
2267 && version != SEMANTIC_INDEX_VERSION_V5
2268 && version != SEMANTIC_INDEX_VERSION_V6
2269 {
2270 return Err(format!("unsupported version: {}", version));
2271 }
2272 if (version == SEMANTIC_INDEX_VERSION_V2
2276 || version == SEMANTIC_INDEX_VERSION_V3
2277 || version == SEMANTIC_INDEX_VERSION_V4
2278 || version == SEMANTIC_INDEX_VERSION_V5
2279 || version == SEMANTIC_INDEX_VERSION_V6)
2280 && data.len() < HEADER_BYTES_V2
2281 {
2282 return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2283 }
2284
2285 let dimension = read_u32(data, &mut pos)? as usize;
2286 let entry_count = read_u32(data, &mut pos)? as usize;
2287 validate_embedding_dimension(dimension)?;
2288 if entry_count > MAX_ENTRIES {
2289 return Err(format!("too many semantic index entries: {}", entry_count));
2290 }
2291
2292 let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2298 || version == SEMANTIC_INDEX_VERSION_V3
2299 || version == SEMANTIC_INDEX_VERSION_V4
2300 || version == SEMANTIC_INDEX_VERSION_V5
2301 || version == SEMANTIC_INDEX_VERSION_V6;
2302 let fingerprint = if has_fingerprint_field {
2303 let fingerprint_len = read_u32(data, &mut pos)? as usize;
2304 if pos + fingerprint_len > data.len() {
2305 return Err("unexpected end of data reading fingerprint".to_string());
2306 }
2307 if fingerprint_len == 0 {
2308 None
2309 } else {
2310 let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2311 pos += fingerprint_len;
2312 Some(
2313 serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2314 .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2315 )
2316 }
2317 } else {
2318 None
2319 };
2320
2321 let mtime_count = read_u32(data, &mut pos)? as usize;
2323 if mtime_count > MAX_ENTRIES {
2324 return Err(format!("too many semantic file mtimes: {}", mtime_count));
2325 }
2326
2327 let vector_bytes = entry_count
2328 .checked_mul(dimension)
2329 .and_then(|count| count.checked_mul(F32_BYTES))
2330 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2331 if vector_bytes > data.len().saturating_sub(pos) {
2332 return Err("semantic index vectors exceed available data".to_string());
2333 }
2334
2335 let mut file_mtimes = HashMap::with_capacity(mtime_count);
2336 let mut file_sizes = HashMap::with_capacity(mtime_count);
2337 let mut file_hashes = HashMap::with_capacity(mtime_count);
2338 for _ in 0..mtime_count {
2339 let path = read_string(data, &mut pos)?;
2340 let secs = read_u64(data, &mut pos)?;
2341 let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2347 || version == SEMANTIC_INDEX_VERSION_V4
2348 || version == SEMANTIC_INDEX_VERSION_V5
2349 || version == SEMANTIC_INDEX_VERSION_V6
2350 {
2351 read_u32(data, &mut pos)?
2352 } else {
2353 0
2354 };
2355 let size =
2356 if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2357 read_u64(data, &mut pos)?
2358 } else {
2359 0
2360 };
2361 let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2362 if pos + 32 > data.len() {
2363 return Err("unexpected end of data reading content hash".to_string());
2364 }
2365 let mut hash_bytes = [0u8; 32];
2366 hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2367 pos += 32;
2368 blake3::Hash::from_bytes(hash_bytes)
2369 } else {
2370 cache_freshness::zero_hash()
2371 };
2372 if nanos >= 1_000_000_000 {
2379 return Err(format!(
2380 "invalid semantic mtime: nanos {} >= 1_000_000_000",
2381 nanos
2382 ));
2383 }
2384 let duration = std::time::Duration::new(secs, nanos);
2385 let mtime = SystemTime::UNIX_EPOCH
2386 .checked_add(duration)
2387 .ok_or_else(|| {
2388 format!(
2389 "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2390 secs, nanos
2391 )
2392 })?;
2393 let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2394 cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2395 .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2396 } else {
2397 PathBuf::from(path)
2398 };
2399 file_mtimes.insert(path.clone(), mtime);
2400 file_sizes.insert(path.clone(), size);
2401 file_hashes.insert(path, content_hash);
2402 }
2403
2404 let mut entries = Vec::with_capacity(entry_count);
2406 for _ in 0..entry_count {
2407 let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2408 let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2409 cached_path_under_root(current_canonical_root, &raw_file)
2410 .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2411 } else {
2412 raw_file
2413 };
2414 let name = read_string(data, &mut pos)?;
2415
2416 if pos >= data.len() {
2417 return Err("unexpected end of data".to_string());
2418 }
2419 let kind = u8_to_symbol_kind(data[pos]);
2420 pos += 1;
2421
2422 let start_line = read_u32(data, &mut pos)?;
2423 let end_line = read_u32(data, &mut pos)?;
2424
2425 if pos >= data.len() {
2426 return Err("unexpected end of data".to_string());
2427 }
2428 let exported = data[pos] != 0;
2429 pos += 1;
2430
2431 let snippet = read_string(data, &mut pos)?;
2432 let embed_text = read_string(data, &mut pos)?;
2433
2434 let vec_bytes = dimension
2436 .checked_mul(F32_BYTES)
2437 .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2438 if pos + vec_bytes > data.len() {
2439 return Err("unexpected end of data reading vector".to_string());
2440 }
2441 let mut vector = Vec::with_capacity(dimension);
2442 for _ in 0..dimension {
2443 let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2444 vector.push(f32::from_le_bytes(bytes));
2445 pos += 4;
2446 }
2447
2448 entries.push(EmbeddingEntry {
2449 chunk: SemanticChunk {
2450 file,
2451 name,
2452 kind,
2453 start_line,
2454 end_line,
2455 exported,
2456 embed_text,
2457 snippet,
2458 },
2459 vector,
2460 });
2461 }
2462
2463 if entries.len() != entry_count {
2464 return Err(format!(
2465 "semantic cache entry count drift: header={} decoded={}",
2466 entry_count,
2467 entries.len()
2468 ));
2469 }
2470 for entry in &entries {
2471 if !file_mtimes.contains_key(&entry.chunk.file) {
2472 return Err(format!(
2473 "semantic cache metadata missing for entry file {}",
2474 entry.chunk.file.display()
2475 ));
2476 }
2477 }
2478
2479 Ok(Self {
2480 entries,
2481 file_mtimes,
2482 file_sizes,
2483 file_hashes,
2484 dimension,
2485 fingerprint,
2486 project_root: current_canonical_root.to_path_buf(),
2487 deferred_files: HashSet::new(),
2488 })
2489 }
2490}
2491
2492fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2494 let relative = file
2495 .strip_prefix(project_root)
2496 .unwrap_or(file)
2497 .to_string_lossy();
2498
2499 let kind_label = match &symbol.kind {
2500 SymbolKind::Function => "function",
2501 SymbolKind::Class => "class",
2502 SymbolKind::Method => "method",
2503 SymbolKind::Struct => "struct",
2504 SymbolKind::Interface => "interface",
2505 SymbolKind::Enum => "enum",
2506 SymbolKind::TypeAlias => "type",
2507 SymbolKind::Variable => "variable",
2508 SymbolKind::Heading => "heading",
2509 SymbolKind::FileSummary => "file-summary",
2510 };
2511
2512 let name = &symbol.name;
2514 let mut text = format!(
2515 "name:{name} file:{} kind:{} name:{name}",
2516 relative, kind_label
2517 );
2518
2519 if let Some(sig) = &symbol.signature {
2520 text.push_str(&format!(" signature:{}", sig));
2521 }
2522
2523 let lines: Vec<&str> = source.lines().collect();
2525 let start = (symbol.range.start_line as usize).min(lines.len());
2526 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2528 if start < end {
2529 let body: String = lines[start..end]
2530 .iter()
2531 .take(15) .copied()
2533 .collect::<Vec<&str>>()
2534 .join("\n");
2535 let snippet = if body.len() > 300 {
2536 format!("{}...", &body[..body.floor_char_boundary(300)])
2537 } else {
2538 body
2539 };
2540 text.push_str(&format!(" body:{}", snippet));
2541 }
2542
2543 text
2544}
2545
/// Returns the first `max_chars` characters (Unicode scalar values) of `value`,
/// or all of `value` when it holds fewer characters than that.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // Byte offset of the first character that must be dropped, if there is one.
    let cut = value
        .char_indices()
        .nth(max_chars)
        .map_or(value.len(), |(byte_index, _)| byte_index);
    value[..cut].to_string()
}
2549
2550fn first_leading_doc_comment(source: &str) -> String {
2551 let lines: Vec<&str> = source.lines().collect();
2552 let Some((start, first)) = lines
2553 .iter()
2554 .enumerate()
2555 .find(|(_, line)| !line.trim().is_empty())
2556 else {
2557 return String::new();
2558 };
2559
2560 let trimmed = first.trim_start();
2561 if trimmed.starts_with("/**") {
2562 let mut comment = Vec::new();
2563 for line in lines.iter().skip(start) {
2564 comment.push(*line);
2565 if line.contains("*/") {
2566 break;
2567 }
2568 }
2569 return truncate_chars(&comment.join("\n"), 200);
2570 }
2571
2572 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2573 let comment = lines
2574 .iter()
2575 .skip(start)
2576 .take_while(|line| {
2577 let trimmed = line.trim_start();
2578 trimmed.starts_with("///") || trimmed.starts_with("//!")
2579 })
2580 .copied()
2581 .collect::<Vec<_>>()
2582 .join("\n");
2583 return truncate_chars(&comment, 200);
2584 }
2585
2586 String::new()
2587}
2588
2589pub fn build_file_summary_chunk(
2590 file: &Path,
2591 project_root: &Path,
2592 source: &str,
2593 top_exports: &[&str],
2594 top_export_signatures: &[Option<&str>],
2595) -> SemanticChunk {
2596 let relative = file.strip_prefix(project_root).unwrap_or(file);
2597 let rel_path = relative.to_string_lossy();
2598 let parent_dir = relative
2599 .parent()
2600 .map(|parent| parent.to_string_lossy().to_string())
2601 .unwrap_or_default();
2602 let name = file
2603 .file_stem()
2604 .map(|stem| stem.to_string_lossy().to_string())
2605 .unwrap_or_default();
2606 let doc = first_leading_doc_comment(source);
2607 let exports = top_exports
2608 .iter()
2609 .take(5)
2610 .copied()
2611 .collect::<Vec<_>>()
2612 .join(",");
2613 let snippet = if doc.is_empty() {
2614 top_export_signatures
2615 .first()
2616 .and_then(|signature| signature.as_deref())
2617 .map(|signature| truncate_chars(signature, 200))
2618 .unwrap_or_default()
2619 } else {
2620 doc.clone()
2621 };
2622
2623 SemanticChunk {
2624 file: file.to_path_buf(),
2625 name,
2626 kind: SymbolKind::FileSummary,
2627 start_line: 0,
2628 end_line: 0,
2629 exported: false,
2630 embed_text: format!(
2631 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2632 file.file_stem()
2633 .map(|stem| stem.to_string_lossy().to_string())
2634 .unwrap_or_default()
2635 ),
2636 snippet,
2637 }
2638}
2639
2640fn parser_for(
2641 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2642 lang: crate::parser::LangId,
2643) -> Result<&mut Parser, String> {
2644 use std::collections::hash_map::Entry;
2645
2646 match parsers.entry(lang) {
2647 Entry::Occupied(entry) => Ok(entry.into_mut()),
2648 Entry::Vacant(entry) => {
2649 let grammar = grammar_for(lang);
2650 let mut parser = Parser::new();
2651 parser
2652 .set_language(&grammar)
2653 .map_err(|error| error.to_string())?;
2654 Ok(entry.insert(parser))
2655 }
2656 }
2657}
2658
/// Reports whether `path` has one of the file extensions covered by the semantic index.
///
/// The comparison is exact (case-sensitive). Paths without an extension, or whose
/// extension is not valid UTF-8, are not indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2688
2689fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2690 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2691 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2692 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2693 .map_err(|error| error.to_string())?
2694 .unwrap_or_else(cache_freshness::zero_hash);
2695 Ok(IndexedFileMetadata {
2696 mtime,
2697 size: metadata.len(),
2698 content_hash,
2699 })
2700}
2701
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// Resolution is attempted in this order:
/// 1. the path itself;
/// 2. the canonical parent directory joined with the original file name;
/// 3. the path unchanged, when neither can be resolved or the path has no parent or
///    file name.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
            Ok(canonical_parent) => canonical_parent.join(file_name),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
2718
2719fn collect_file_chunks(
2720 project_root: &Path,
2721 file: &Path,
2722 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2723) -> Result<Vec<SemanticChunk>, String> {
2724 if !is_semantic_indexed_extension(file) {
2725 return Err("unsupported file extension".to_string());
2726 }
2727 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2728 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2729 let tree = parser_for(parsers, lang)?
2730 .parse(&source, None)
2731 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2732 let symbols =
2733 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2734
2735 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2736}
2737
2738fn build_snippet(symbol: &Symbol, source: &str) -> String {
2740 let lines: Vec<&str> = source.lines().collect();
2741 let start = (symbol.range.start_line as usize).min(lines.len());
2742 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2744 if start < end {
2745 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2746 let mut snippet = snippet_lines.join("\n");
2747 if end - start > 5 {
2748 snippet.push_str("\n ...");
2749 }
2750 if snippet.len() > 300 {
2751 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2752 }
2753 snippet
2754 } else {
2755 String::new()
2756 }
2757}
2758
2759fn symbols_to_chunks(
2761 file: &Path,
2762 symbols: &[Symbol],
2763 source: &str,
2764 project_root: &Path,
2765) -> Vec<SemanticChunk> {
2766 let mut chunks = Vec::new();
2767 let top_exports_with_signatures = symbols
2768 .iter()
2769 .filter(|symbol| {
2770 symbol.exported
2771 && symbol.parent.is_none()
2772 && !matches!(symbol.kind, SymbolKind::Heading)
2773 })
2774 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2775 .collect::<Vec<_>>();
2776
2777 let has_only_headings = !symbols.is_empty()
2778 && symbols
2779 .iter()
2780 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2781 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2782 let top_exports = top_exports_with_signatures
2783 .iter()
2784 .map(|(name, _)| *name)
2785 .collect::<Vec<_>>();
2786 let top_export_signatures = top_exports_with_signatures
2787 .iter()
2788 .map(|(_, signature)| *signature)
2789 .collect::<Vec<_>>();
2790 chunks.push(build_file_summary_chunk(
2791 file,
2792 project_root,
2793 source,
2794 &top_exports,
2795 &top_export_signatures,
2796 ));
2797 }
2798
2799 for symbol in symbols {
2800 if matches!(symbol.kind, SymbolKind::Heading) {
2805 continue;
2806 }
2807
2808 let line_count = symbol
2810 .range
2811 .end_line
2812 .saturating_sub(symbol.range.start_line)
2813 + 1;
2814 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2815 continue;
2816 }
2817
2818 let embed_text = build_embed_text(symbol, source, file, project_root);
2819 let snippet = build_snippet(symbol, source);
2820
2821 chunks.push(SemanticChunk {
2822 file: file.to_path_buf(),
2823 name: symbol.name.clone(),
2824 kind: symbol.kind.clone(),
2825 start_line: symbol.range.start_line,
2826 end_line: symbol.range.end_line,
2827 exported: symbol.exported,
2828 embed_text,
2829 snippet,
2830 });
2831
2832 }
2835
2836 chunks
2837}
2838
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when the product of their
/// norms is zero (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulates the dot product and both squared norms, in index order.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2862
2863fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2865 match kind {
2866 SymbolKind::Function => 0,
2867 SymbolKind::Class => 1,
2868 SymbolKind::Method => 2,
2869 SymbolKind::Struct => 3,
2870 SymbolKind::Interface => 4,
2871 SymbolKind::Enum => 5,
2872 SymbolKind::TypeAlias => 6,
2873 SymbolKind::Variable => 7,
2874 SymbolKind::Heading => 8,
2875 SymbolKind::FileSummary => 9,
2876 }
2877}
2878
2879fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2880 match v {
2881 0 => SymbolKind::Function,
2882 1 => SymbolKind::Class,
2883 2 => SymbolKind::Method,
2884 3 => SymbolKind::Struct,
2885 4 => SymbolKind::Interface,
2886 5 => SymbolKind::Enum,
2887 6 => SymbolKind::TypeAlias,
2888 7 => SymbolKind::Variable,
2889 8 => SymbolKind::Heading,
2890 9 => SymbolKind::FileSummary,
2891 _ => SymbolKind::Heading,
2892 }
2893}
2894
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by 4.
///
/// # Errors
/// Returns an error, leaving `*pos` untouched, when fewer than 4 bytes remain. The
/// bounds check uses checked arithmetic, so an out-of-range `*pos` is reported as an
/// error instead of overflowing.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let bytes: [u8; 4] = pos
        .checked_add(4)
        .and_then(|end| data.get(*pos..end))
        .and_then(|slice| slice.try_into().ok())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;
    *pos += 4;
    Ok(u32::from_le_bytes(bytes))
}
2903
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by 8.
///
/// # Errors
/// Returns an error, leaving `*pos` untouched, when fewer than 8 bytes remain. The
/// bounds check uses checked arithmetic, so an out-of-range `*pos` is reported as an
/// error instead of overflowing.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let bytes: [u8; 8] = pos
        .checked_add(8)
        .and_then(|end| data.get(*pos..end))
        .and_then(|slice| slice.try_into().ok())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;
    *pos += 8;
    Ok(u64::from_le_bytes(bytes))
}
2912
2913fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2914 let len = read_u32(data, pos)? as usize;
2915 if *pos + len > data.len() {
2916 return Err("unexpected end of data reading string".to_string());
2917 }
2918 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2919 *pos += len;
2920 Ok(s)
2921}
2922
2923#[cfg(test)]
2924mod tests {
2925 use super::*;
2926 use crate::config::{SemanticBackend, SemanticBackendConfig};
2927 use crate::parser::FileParser;
2928 use std::io::{Read, Write};
2929 use std::net::TcpListener;
2930 use std::thread;
2931
2932 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2933 where
2934 F: Fn(String, String, String) -> String + Send + 'static,
2935 {
2936 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2937 let addr = listener.local_addr().expect("local addr");
2938 let handle = thread::spawn(move || {
2939 let (mut stream, _) = listener.accept().expect("accept request");
2940 let mut buf = Vec::new();
2941 let mut chunk = [0u8; 4096];
2942 let mut header_end = None;
2943 let mut content_length = 0usize;
2944 loop {
2945 let n = stream.read(&mut chunk).expect("read request");
2946 if n == 0 {
2947 break;
2948 }
2949 buf.extend_from_slice(&chunk[..n]);
2950 if header_end.is_none() {
2951 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2952 header_end = Some(pos + 4);
2953 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2954 for line in headers.lines() {
2955 if let Some(value) = line.strip_prefix("Content-Length:") {
2956 content_length = value.trim().parse::<usize>().unwrap_or(0);
2957 }
2958 }
2959 }
2960 }
2961 if let Some(end) = header_end {
2962 if buf.len() >= end + content_length {
2963 break;
2964 }
2965 }
2966 }
2967
2968 let end = header_end.expect("header terminator");
2969 let request = String::from_utf8_lossy(&buf[..end]).to_string();
2970 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2971 let mut lines = request.lines();
2972 let request_line = lines.next().expect("request line").to_string();
2973 let path = request_line
2974 .split_whitespace()
2975 .nth(1)
2976 .expect("request path")
2977 .to_string();
2978 let response_body = handler(request_line, path, body);
2979 let response = format!(
2980 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2981 response_body.len(),
2982 response_body
2983 );
2984 stream
2985 .write_all(response.as_bytes())
2986 .expect("write response");
2987 });
2988
2989 (format!("http://{}", addr), handle)
2990 }
2991
2992 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2993 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2994 }
2995
2996 fn write_rust_file(path: &Path, function_name: &str) {
2997 fs::write(
2998 path,
2999 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
3000 )
3001 .unwrap();
3002 }
3003
3004 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3005 let mut embed = test_vector_for_texts;
3006 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3007 }
3008
3009 fn test_project_root() -> PathBuf {
3010 std::env::current_dir().unwrap()
3011 }
3012
3013 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3014 index.file_mtimes.insert(file.to_path_buf(), mtime);
3015 index.file_sizes.insert(file.to_path_buf(), size);
3016 index
3017 .file_hashes
3018 .insert(file.to_path_buf(), cache_freshness::zero_hash());
3019 }
3020
3021 #[test]
3022 fn semantic_cache_serialization_skips_paths_outside_project_root() {
3023 let dir = tempfile::tempdir().expect("create temp dir");
3024 let project = fs::canonicalize(dir.path()).expect("canonical project");
3025 let outside = project.join("..").join("outside.rs");
3026 let mut index = SemanticIndex::new(project.clone(), 3);
3027 index
3028 .file_mtimes
3029 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
3030 index.file_sizes.insert(outside.clone(), 1);
3031 index
3032 .file_hashes
3033 .insert(outside.clone(), cache_freshness::zero_hash());
3034 index.entries.push(EmbeddingEntry {
3035 chunk: SemanticChunk {
3036 file: outside,
3037 name: "outside".to_string(),
3038 kind: SymbolKind::Function,
3039 start_line: 0,
3040 end_line: 0,
3041 exported: false,
3042 embed_text: "outside".to_string(),
3043 snippet: "outside".to_string(),
3044 },
3045 vector: vec![1.0, 0.0, 0.0],
3046 });
3047
3048 let bytes = index.to_bytes();
3049 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
3050 assert_eq!(loaded.entries.len(), 0);
3051 assert!(loaded.file_mtimes.is_empty());
3052 }
3053
3054 #[test]
3055 fn test_cosine_similarity_identical() {
3056 let a = vec![1.0, 0.0, 0.0];
3057 let b = vec![1.0, 0.0, 0.0];
3058 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
3059 }
3060
3061 #[test]
3062 fn test_cosine_similarity_orthogonal() {
3063 let a = vec![1.0, 0.0, 0.0];
3064 let b = vec![0.0, 1.0, 0.0];
3065 assert!(cosine_similarity(&a, &b).abs() < 0.001);
3066 }
3067
3068 #[test]
3069 fn test_cosine_similarity_opposite() {
3070 let a = vec![1.0, 0.0, 0.0];
3071 let b = vec![-1.0, 0.0, 0.0];
3072 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
3073 }
3074
3075 #[test]
3076 fn test_serialization_roundtrip() {
3077 let project_root = test_project_root();
3078 let file = project_root.join("src/main.rs");
3079 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3080 index.entries.push(EmbeddingEntry {
3081 chunk: SemanticChunk {
3082 file: file.clone(),
3083 name: "handle_request".to_string(),
3084 kind: SymbolKind::Function,
3085 start_line: 10,
3086 end_line: 25,
3087 exported: true,
3088 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3089 snippet: "fn handle_request() {\n // ...\n}".to_string(),
3090 },
3091 vector: vec![0.1, 0.2, 0.3, 0.4],
3092 });
3093 index.dimension = 4;
3094 index
3095 .file_mtimes
3096 .insert(file.clone(), SystemTime::UNIX_EPOCH);
3097 index.file_sizes.insert(file, 0);
3098 index.set_fingerprint(SemanticIndexFingerprint {
3099 backend: "fastembed".to_string(),
3100 model: "all-MiniLM-L6-v2".to_string(),
3101 base_url: FALLBACK_BACKEND.to_string(),
3102 dimension: 4,
3103 chunking_version: default_chunking_version(),
3104 });
3105
3106 let bytes = index.to_bytes();
3107 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
3108
3109 assert_eq!(restored.entries.len(), 1);
3110 assert_eq!(restored.entries[0].chunk.name, "handle_request");
3111 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
3112 assert_eq!(restored.dimension, 4);
3113 assert_eq!(restored.backend_label(), Some("fastembed"));
3114 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
3115 }
3116
3117 #[test]
3118 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
3119 let cases = [
3120 (SymbolKind::Function, 0),
3121 (SymbolKind::Class, 1),
3122 (SymbolKind::Method, 2),
3123 (SymbolKind::Struct, 3),
3124 (SymbolKind::Interface, 4),
3125 (SymbolKind::Enum, 5),
3126 (SymbolKind::TypeAlias, 6),
3127 (SymbolKind::Variable, 7),
3128 (SymbolKind::Heading, 8),
3129 (SymbolKind::FileSummary, 9),
3130 ];
3131
3132 for (kind, encoded) in cases {
3133 assert_eq!(symbol_kind_to_u8(&kind), encoded);
3134 assert_eq!(u8_to_symbol_kind(encoded), kind);
3135 }
3136 }
3137
3138 #[test]
3139 fn test_search_top_k() {
3140 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3141 index.dimension = 3;
3142
3143 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
3145 let mut vec = vec![0.0f32; 3];
3146 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
3148 chunk: SemanticChunk {
3149 file: PathBuf::from("/src/lib.rs"),
3150 name: name.to_string(),
3151 kind: SymbolKind::Function,
3152 start_line: (i * 10 + 1) as u32,
3153 end_line: (i * 10 + 5) as u32,
3154 exported: true,
3155 embed_text: format!("kind:function name:{}", name),
3156 snippet: format!("fn {}() {{}}", name),
3157 },
3158 vector: vec,
3159 });
3160 }
3161
3162 let query = vec![0.9, 0.1, 0.0];
3164 let results = index.search(&query, 2);
3165
3166 assert_eq!(results.len(), 2);
3167 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
3169 }
3170
3171 #[test]
3172 fn test_empty_index_search() {
3173 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3174 let results = index.search(&[0.1, 0.2, 0.3], 10);
3175 assert!(results.is_empty());
3176 }
3177
3178 #[test]
3179 fn single_line_symbol_builds_non_empty_snippet() {
3180 let symbol = Symbol {
3181 name: "answer".to_string(),
3182 kind: SymbolKind::Variable,
3183 range: crate::symbols::Range {
3184 start_line: 0,
3185 start_col: 0,
3186 end_line: 0,
3187 end_col: 24,
3188 },
3189 signature: Some("const answer = 42".to_string()),
3190 scope_chain: Vec::new(),
3191 exported: true,
3192 parent: None,
3193 };
3194 let source = "export const answer = 42;\n";
3195
3196 let snippet = build_snippet(&symbol, source);
3197
3198 assert_eq!(snippet, "export const answer = 42;");
3199 }
3200
3201 #[test]
3202 fn optimized_file_chunk_collection_matches_file_parser_path() {
3203 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
3204 let file = project_root.join("src/semantic_index.rs");
3205 let source = std::fs::read_to_string(&file).unwrap();
3206
3207 let mut legacy_parser = FileParser::new();
3208 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
3209 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
3210
3211 let mut parsers = HashMap::new();
3212 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
3213
3214 assert_eq!(
3215 chunk_fingerprint(&optimized_chunks),
3216 chunk_fingerprint(&legacy_chunks)
3217 );
3218 }
3219
3220 fn chunk_fingerprint(
3221 chunks: &[SemanticChunk],
3222 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3223 chunks
3224 .iter()
3225 .map(|chunk| {
3226 (
3227 chunk.name.clone(),
3228 chunk.kind.clone(),
3229 chunk.start_line,
3230 chunk.end_line,
3231 chunk.exported,
3232 chunk.embed_text.clone(),
3233 chunk.snippet.clone(),
3234 )
3235 })
3236 .collect()
3237 }
3238
3239 #[test]
3240 fn rejects_oversized_dimension_during_deserialization() {
3241 let mut bytes = Vec::new();
3242 bytes.push(1u8);
3243 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
3244 bytes.extend_from_slice(&0u32.to_le_bytes());
3245 bytes.extend_from_slice(&0u32.to_le_bytes());
3246
3247 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3248 }
3249
3250 #[test]
3251 fn rejects_oversized_entry_count_during_deserialization() {
3252 let mut bytes = Vec::new();
3253 bytes.push(1u8);
3254 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
3255 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
3256 bytes.extend_from_slice(&0u32.to_le_bytes());
3257
3258 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3259 }
3260
3261 #[test]
3262 fn invalidate_file_removes_entries_and_mtime() {
3263 let target = PathBuf::from("/src/main.rs");
3264 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3265 index.entries.push(EmbeddingEntry {
3266 chunk: SemanticChunk {
3267 file: target.clone(),
3268 name: "main".to_string(),
3269 kind: SymbolKind::Function,
3270 start_line: 0,
3271 end_line: 1,
3272 exported: false,
3273 embed_text: "main".to_string(),
3274 snippet: "fn main() {}".to_string(),
3275 },
3276 vector: vec![1.0; DEFAULT_DIMENSION],
3277 });
3278 index
3279 .file_mtimes
3280 .insert(target.clone(), SystemTime::UNIX_EPOCH);
3281 index.file_sizes.insert(target.clone(), 0);
3282
3283 index.invalidate_file(&target);
3284
3285 assert!(index.entries.is_empty());
3286 assert!(!index.file_mtimes.contains_key(&target));
3287 assert!(!index.file_sizes.contains_key(&target));
3288 }
3289
3290 #[test]
3291 fn refresh_missing_changed_file_is_purged_after_collect() {
3292 let temp = tempfile::tempdir().unwrap();
3293 let project_root = temp.path();
3294 let file = project_root.join("src/lib.rs");
3295 fs::create_dir_all(file.parent().unwrap()).unwrap();
3296 write_rust_file(&file, "vanished_symbol");
3297
3298 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3299 let original_size = *index.file_sizes.get(&file).unwrap();
3300 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
3301 fs::remove_file(&file).unwrap();
3302
3303 let mut embed = test_vector_for_texts;
3304 let mut progress = |_done: usize, _total: usize| {};
3305 let summary = index
3306 .refresh_stale_files(
3307 project_root,
3308 std::slice::from_ref(&file),
3309 &mut embed,
3310 8,
3311 &mut progress,
3312 )
3313 .unwrap();
3314
3315 assert_eq!(summary.changed, 0);
3316 assert_eq!(summary.added, 0);
3317 assert_eq!(summary.deleted, 1);
3318 assert!(index.entries.is_empty());
3319 assert!(!index.file_mtimes.contains_key(&file));
3320 assert!(!index.file_sizes.contains_key(&file));
3321 assert!(!index.file_hashes.contains_key(&file));
3322 }
3323
3324 #[test]
3325 fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
3326 let temp = tempfile::tempdir().unwrap();
3327 let project_root = temp.path();
3328 let file = project_root.join("src/lib.rs");
3329 fs::create_dir_all(file.parent().unwrap()).unwrap();
3330 write_rust_file(&file, "kept_symbol");
3331
3332 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3333 let original_entry_count = index.entries.len();
3334 let original_mtime = *index.file_mtimes.get(&file).unwrap();
3335 let original_size = *index.file_sizes.get(&file).unwrap();
3336
3337 let stale_mtime = SystemTime::UNIX_EPOCH;
3338 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
3339 fs::remove_file(&file).unwrap();
3340 fs::create_dir(&file).unwrap();
3341
3342 let mut embed = test_vector_for_texts;
3343 let mut progress = |_done: usize, _total: usize| {};
3344 let summary = index
3345 .refresh_stale_files(
3346 project_root,
3347 std::slice::from_ref(&file),
3348 &mut embed,
3349 8,
3350 &mut progress,
3351 )
3352 .unwrap();
3353
3354 assert_eq!(summary.changed, 0);
3355 assert_eq!(summary.added, 0);
3356 assert_eq!(summary.deleted, 0);
3357 assert_eq!(index.entries.len(), original_entry_count);
3358 assert!(index
3359 .entries
3360 .iter()
3361 .any(|entry| entry.chunk.name == "kept_symbol"));
3362 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
3363 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
3364 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
3365 }
3366
3367 #[test]
3368 fn refresh_never_indexed_file_error_does_not_record_mtime() {
3369 let temp = tempfile::tempdir().unwrap();
3370 let project_root = temp.path();
3371 let missing = project_root.join("src/missing.rs");
3372 fs::create_dir_all(missing.parent().unwrap()).unwrap();
3373
3374 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3375 let mut embed = test_vector_for_texts;
3376 let mut progress = |_done: usize, _total: usize| {};
3377 let summary = index
3378 .refresh_stale_files(
3379 project_root,
3380 std::slice::from_ref(&missing),
3381 &mut embed,
3382 8,
3383 &mut progress,
3384 )
3385 .unwrap();
3386
3387 assert_eq!(summary.added, 0);
3388 assert_eq!(summary.changed, 0);
3389 assert_eq!(summary.deleted, 0);
3390 assert!(!index.file_mtimes.contains_key(&missing));
3391 assert!(!index.file_sizes.contains_key(&missing));
3392 assert!(index.entries.is_empty());
3393 }
3394
3395 #[test]
3396 fn refresh_reports_added_for_new_files() {
3397 let temp = tempfile::tempdir().unwrap();
3398 let project_root = temp.path();
3399 let existing = project_root.join("src/lib.rs");
3400 let added = project_root.join("src/new.rs");
3401 fs::create_dir_all(existing.parent().unwrap()).unwrap();
3402 write_rust_file(&existing, "existing_symbol");
3403 write_rust_file(&added, "added_symbol");
3404
3405 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
3406 let mut embed = test_vector_for_texts;
3407 let mut progress = |_done: usize, _total: usize| {};
3408 let summary = index
3409 .refresh_stale_files(
3410 project_root,
3411 &[existing.clone(), added.clone()],
3412 &mut embed,
3413 8,
3414 &mut progress,
3415 )
3416 .unwrap();
3417
3418 assert_eq!(summary.added, 1);
3419 assert_eq!(summary.changed, 0);
3420 assert_eq!(summary.deleted, 0);
3421 assert_eq!(summary.total_processed, 2);
3422 assert!(index.file_mtimes.contains_key(&added));
3423 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
3424 }
3425
#[test]
/// An indexed file that has been removed from disk and is no longer listed
/// in the current file set is counted as `deleted` and purged from the index.
fn refresh_reports_deleted_for_removed_files() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let doomed_file = root.join("src/deleted.rs");
    fs::create_dir_all(doomed_file.parent().unwrap()).unwrap();
    write_rust_file(&doomed_file, "deleted_symbol");

    // Index the file first, then remove it from disk before refreshing.
    let mut index = build_test_index(root, std::slice::from_ref(&doomed_file));
    fs::remove_file(&doomed_file).unwrap();

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let refresh_result =
        index.refresh_stale_files(root, &[], &mut embed_fn, 8, &mut on_progress);
    let summary = refresh_result.unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);

    let still_tracked = index.file_mtimes.contains_key(&doomed_file);
    assert!(!still_tracked);
    assert!(index.entries.is_empty());
}
3450
#[test]
/// A file whose recorded metadata no longer matches what is on disk is
/// counted as `changed`, and its old chunks are replaced by the new ones.
fn refresh_reports_changed_for_modified_files() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_file = root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "old_symbol");

    let mut index = build_test_index(root, std::slice::from_ref(&source_file));
    // Overwrite the recorded metadata (epoch mtime, zero size) and then
    // rewrite the file with a different symbol name.
    set_file_metadata(&mut index, &source_file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_file, "new_symbol");

    let mut embed_fn = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&source_file);
    let summary = index
        .refresh_stale_files(root, current_files, &mut embed_fn, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    let has_chunk_named =
        |wanted: &str| index.entries.iter().any(|entry| entry.chunk.name == wanted);
    assert!(has_chunk_named("new_symbol"));
    assert!(!has_chunk_named("old_symbol"));
}
3488
#[test]
/// Refreshing an index whose only file is untouched yields a no-op summary,
/// never invokes the embedding callback, and leaves the entry count as-is.
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    let source_file = root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "clean_symbol");

    let mut index = build_test_index(root, std::slice::from_ref(&source_file));
    let entries_before = index.entries.len();

    // Wrap the test embedder so any invocation is observable afterwards.
    let mut embed_called = false;
    let mut tracking_embed = |texts: Vec<String>| {
        embed_called = true;
        test_vector_for_texts(texts)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let current_files = std::slice::from_ref(&source_file);
    let summary = index
        .refresh_stale_files(
            root,
            current_files,
            &mut tracking_embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embed_called);
    assert_eq!(index.entries.len(), entries_before);
}
3520
#[test]
/// A dlopen-style failure message naming the ONNX Runtime shared library is
/// classified as "runtime unavailable".
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);

    assert!(unavailable);
}
3527
#[test]
/// A missing-runtime load error is formatted with the install hint first and
/// still includes the original error text.
fn formats_missing_onnx_runtime_with_install_hint() {
    let raw_error =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(raw_error);

    let leads_with_hint = formatted.starts_with("ONNX Runtime not found. Install via:");
    assert!(leads_with_hint);
    let keeps_original = formatted.contains("Original error:");
    assert!(keeps_original);
}
3537
#[test]
/// The OpenAI-compatible backend POSTs to `/v1/embeddings` on the mock server
/// and returns the embeddings from the response body.
fn openai_compatible_backend_embeds_with_mock_server() {
    let (server_url, server_thread) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#
            .to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = embedder.embed(inputs).unwrap();

    let expected = vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]];
    assert_eq!(vectors, expected);
    server_thread.join().unwrap();
}
3564
#[test]
/// Captures the raw HTTP request emitted by the OpenAI-compatible backend and
/// asserts that it carries exactly one `Content-Type` header line and that the
/// JSON body contains the configured model name.
///
/// A one-shot TCP server on an ephemeral loopback port records every byte it
/// receives. It reads until the header terminator has been seen and at least
/// `Content-Length` body bytes have arrived, then replies with a canned
/// single-embedding response.
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive and clients
                        // may send them lowercase. A case-sensitive match
                        // would leave `content_length` at 0 and let the loop
                        // stop before the body has been captured.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                // Stop once the full declared body has been received.
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        // Best-effort write: the client may already have closed the socket.
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Count header lines case-insensitively so a duplicate header that
    // differs only in casing is still detected.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3657
#[test]
/// The Ollama backend POSTs to `/api/embed` on the mock server and returns the
/// embeddings from the response body.
fn ollama_backend_embeds_with_mock_server() {
    let (server_url, server_thread) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#.to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: "embeddinggemma".to_string(),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = embedder.embed(inputs).unwrap();

    let expected = vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]];
    assert_eq!(vectors, expected);
    server_thread.join().unwrap();
}
3684
#[test]
/// An index written to disk loads when the expected fingerprint string equals
/// the stored one, and is rejected when a different fingerprint is expected.
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";

    let project_root = test_project_root();
    let source_file = project_root.join("src/main.rs");

    // Populate an index with one entry plus metadata for its file.
    let chunk = SemanticChunk {
        file: source_file.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };
    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);

    let stored_fingerprint = SemanticIndexFingerprint {
        backend: "openai_compatible".to_string(),
        model: "test-embedding".to_string(),
        base_url: "http://127.0.0.1:1234/v1".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(stored_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // Expecting the fingerprint that was stored: the cache loads.
    let matching = index.fingerprint().unwrap().as_string();
    let loaded_with_match = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(loaded_with_match.is_some());

    // Expecting a fingerprint for a different backend/model/url: rejected.
    let other_fingerprint = SemanticIndexFingerprint {
        backend: "ollama".to_string(),
        model: "embeddinggemma".to_string(),
        base_url: "http://127.0.0.1:11434".to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let mismatched = other_fingerprint.as_string();
    let loaded_with_mismatch = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(loaded_with_mismatch.is_none());
}
3747
#[test]
/// A cache file whose first byte is the V3 version marker is rejected by
/// `read_from_disk`, and the file is gone from disk afterwards.
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();

    let chunk = SemanticChunk {
        file: PathBuf::from("/src/main.rs"),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {}".to_string(),
    };
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "test".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Serialize the index, then overwrite the leading byte with the V3
    // version marker before writing the cache file.
    let mut serialized = index.to_bytes();
    serialized[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(cache_dir.join("semantic.bin"), serialized).unwrap();

    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&fingerprint.as_string()),
    );
    assert!(loaded.is_none());
    let cache_file_remains = cache_dir.join("semantic.bin").exists();
    assert!(!cache_file_remains);
}
3797
3798 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3799 crate::symbols::Symbol {
3800 name: name.to_string(),
3801 kind,
3802 range: crate::symbols::Range {
3803 start_line: start,
3804 start_col: 0,
3805 end_line: end,
3806 end_col: 0,
3807 },
3808 signature: None,
3809 scope_chain: Vec::new(),
3810 exported: false,
3811 parent: None,
3812 }
3813 }
3814
#[test]
/// When every symbol is a `Heading`, `symbols_to_chunks` produces no chunks.
fn symbols_to_chunks_skips_heading_symbols() {
    let project_root = PathBuf::from("/proj");
    let readme = project_root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let title = make_symbol(SymbolKind::Heading, "Title", 0, 2);
    let section = make_symbol(SymbolKind::Heading, "Section", 4, 6);
    let headings = vec![title, section];

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &project_root);
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
3837
#[test]
/// With a mix of heading and code symbols, the heading is dropped while the
/// function and struct symbols are kept alongside a file-summary chunk.
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let project_root = PathBuf::from("/proj");
    let lib_file = project_root.join("src/lib.rs");
    let code = "pub fn handle_request() -> bool {\n true\n}\n";

    let mixed_symbols = vec![
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&lib_file, &mixed_symbols, code, &project_root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let names: Vec<&str> = chunks.iter().map(|chunk| chunk.name.as_str()).collect();
    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
3872
#[test]
/// `validate_base_url_no_ssrf` accepts the listed `localhost`-style
/// hostnames, with and without ports.
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434",
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once and reuse the outcome for the failure message, as the
        // sibling SSRF tests do, so the reported value is exactly the one
        // that was asserted on.
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
3891
#[test]
/// `validate_base_url_no_ssrf` accepts the listed `127.x.x.x` addresses,
/// with and without ports.
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434",
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];
    for url in loopback_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        let accepted = outcome.is_ok();
        assert!(
            accepted,
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
3910
#[test]
/// `validate_base_url_no_ssrf` rejects each of the listed non-loopback
/// addresses.
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let blocked_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];
    for url in blocked_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        let rejected = outcome.is_err();
        assert!(
            rejected,
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    }
}
3932
#[test]
/// `validate_base_url_no_ssrf` rejects the listed `.local` hostnames.
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_urls = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];
    for host in mdns_urls.iter() {
        let outcome = validate_base_url_no_ssrf(host);
        let rejected = outcome.is_err();
        assert!(
            rejected,
            "Expected {host} to be rejected (mDNS), got: {:?}",
            outcome
        );
    }
}
3950
#[test]
/// `normalize_base_url` succeeds for a loopback IP URL and a `localhost` URL.
fn normalize_base_url_allows_localhost_for_tests() {
    let loopback_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(loopback_ip.is_ok());

    let localhost = normalize_base_url("http://localhost:8080");
    assert!(localhost.is_ok());
}
3958
#[test]
/// The version-mismatch message reports the detected version, the library
/// path and the requirement, and lists the auto-fix option before the manual
/// removal option.
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_library = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_library);

    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains(system_library),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering check: the auto-fix text must appear earlier in the message
    // than the manual-removal text.
    let locate = |needle: &str| msg.find(needle);
    let auto_fix_pos =
        locate("Auto-fix").expect("Auto-fix solution missing — users won't discover --fix");
    let remove_pos = locate("Remove the old library").expect("system-rm solution missing");
    assert!(
        auto_fix_pos < remove_pos,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
3999
#[test]
/// For a macOS `.dylib` path, the mismatch message contains the version, the
/// path, and the path wrapped in single quotes.
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib_path = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib_path);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains(dylib_path));
    assert!(
        msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
4016}