1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
4use crate::search_index::{cache_relative_path, cached_path_under_root};
5use crate::symbols::{Symbol, SymbolKind};
6use crate::{slog_info, slog_warn};
7
8use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
9use rayon::prelude::*;
10use reqwest::blocking::Client;
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet, VecDeque};
13use std::env;
14use std::fmt::Display;
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::time::Duration;
18use std::time::SystemTime;
19use tree_sitter::Parser;
20use url::Url;
21
// ---------------------------------------------------------------------------
// Index sizing. `DEFAULT_DIMENSION` is the dimension reported for an index that
// holds no vectors; the other limits are consumed outside this part of the file.
// ---------------------------------------------------------------------------
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
const MAX_DIMENSION: usize = 1024;
/// Size in bytes of one `f32` vector component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;

/// Hint prepended to errors that look like a missing ONNX Runtime library.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// Version tags for the semantic index format. What changed between versions is
// not visible in this part of the file.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;

// Remote embedding backends: endpoint paths, client defaults, retry policy.
/// Path appended (after `/v1`) for OpenAI-compatible embedding requests.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to the base URL for Ollama embedding requests.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the configured timeout is zero.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured batch size is zero.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept in the in-memory query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored in a fingerprint when no usable base URL is configured.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts made for one embedding HTTP request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before the retry that follows attempt `i` (indexed by attempt number).
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
55
56#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct SemanticIndexFingerprint {
58 pub backend: String,
59 pub model: String,
60 #[serde(default)]
61 pub base_url: String,
62 pub dimension: usize,
63 #[serde(default = "default_chunking_version")]
64 pub chunking_version: u32,
65}
66
/// Chunking version assumed for fingerprints that do not record one, and the
/// version stamped on newly created fingerprints.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
70
71impl SemanticIndexFingerprint {
72 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
73 let base_url = config
76 .base_url
77 .as_ref()
78 .and_then(|u| normalize_base_url(u).ok())
79 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
80 Self {
81 backend: config.backend.as_str().to_string(),
82 model: config.model.clone(),
83 base_url,
84 dimension,
85 chunking_version: default_chunking_version(),
86 }
87 }
88
89 pub fn as_string(&self) -> String {
90 serde_json::to_string(self).unwrap_or_else(|_| String::new())
91 }
92
93 fn matches_expected(&self, expected: &str) -> bool {
94 let encoded = self.as_string();
95 !encoded.is_empty() && encoded == expected
96 }
97}
98
99enum SemanticEmbeddingEngine {
100 Fastembed(TextEmbedding),
101 OpenAiCompatible {
102 client: Client,
103 model: String,
104 base_url: String,
105 api_key: Option<String>,
106 },
107 Ollama {
108 client: Client,
109 model: String,
110 base_url: String,
111 },
112}
113
114pub struct SemanticEmbeddingModel {
115 backend: SemanticBackend,
116 model: String,
117 base_url: Option<String>,
118 timeout_ms: u64,
119 max_batch_size: usize,
120 dimension: Option<usize>,
121 engine: SemanticEmbeddingEngine,
122 query_embedding_cache: HashMap<String, Vec<f32>>,
123 query_embedding_cache_order: VecDeque<String>,
124 query_embedding_cache_hits: u64,
125 query_embedding_cache_misses: u64,
126}
127
128pub type EmbeddingModel = SemanticEmbeddingModel;
129
/// Checks that an embedding backend answered a batch sensibly.
///
/// `vectors` must contain exactly `expected_count` vectors, and all of them
/// must have the same length as the first one. `context` names the backend in
/// error messages.
///
/// # Errors
/// Returns a description of the first violated expectation.
fn validate_embedding_batch(
    vectors: &[Vec<f32>],
    expected_count: usize,
    context: &str,
) -> Result<(), String> {
    let returned_nothing = vectors.is_empty();
    if returned_nothing && expected_count > 0 {
        return Err(format!(
            "{context} returned no vectors for {expected_count} inputs"
        ));
    }

    if vectors.len() != expected_count {
        return Err(format!(
            "{context} returned {} vectors for {} inputs",
            vectors.len(),
            expected_count
        ));
    }

    // An empty, expected-empty batch has nothing further to check.
    let Some((first_vector, remaining)) = vectors.split_first() else {
        return Ok(());
    };

    let expected_dimension = first_vector.len();
    match remaining
        .iter()
        .position(|vector| vector.len() != expected_dimension)
    {
        None => Ok(()),
        Some(offset) => {
            // `remaining` starts at vector 1 of the batch.
            let index = offset + 1;
            Err(format!(
                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
                remaining[offset].len()
            ))
        }
    }
}
164
165fn normalize_base_url(raw: &str) -> Result<String, String> {
169 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
170 let scheme = parsed.scheme();
171 if scheme != "http" && scheme != "https" {
172 return Err(format!(
173 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
174 scheme
175 ));
176 }
177 Ok(parsed.to_string().trim_end_matches('/').to_string())
178}
179
180pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
195 use std::net::{IpAddr, ToSocketAddrs};
196
197 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
198
199 let host = parsed.host_str().unwrap_or("");
200
201 let is_loopback_host =
206 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
207 if is_loopback_host {
208 return Ok(());
209 }
210
211 if host.ends_with(".local") {
214 return Err(format!(
215 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
216 ));
217 }
218
219 let port = parsed.port_or_known_default().unwrap_or(443);
222 let addr_str = format!("{host}:{port}");
223 let addrs: Vec<IpAddr> = addr_str
224 .to_socket_addrs()
225 .map(|iter| iter.map(|sa| sa.ip()).collect())
226 .unwrap_or_default();
227 for ip in &addrs {
228 if is_private_non_loopback_ip(ip) {
229 return Err(format!(
230 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
231 ));
232 }
233 }
234
235 Ok(())
236}
237
/// Reports whether `ip` lies in a private or reserved range other than loopback.
///
/// IPv4 ranges treated as private: `10/8`, `172.16/12`, `192.168/16`,
/// `169.254/16`, `100.64/10` and `0/8`. IPv6: `fe80::/10`, `fc00::/7`, and
/// IPv4-mapped addresses (`::ffff:a.b.c.d`) whose embedded IPv4 address is
/// private by the rules above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            let carrier_grade_nat = first == 100 && (64..=127).contains(&second);
            let this_network = first == 0;
            // `is_private` covers 10/8, 172.16/12 and 192.168/16;
            // `is_link_local` covers 169.254/16.
            v4.is_private() || v4.is_link_local() || carrier_grade_nat || this_network
        }
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            let link_local = leading & 0xffc0 == 0xfe80;
            let unique_local = leading & 0xfe00 == 0xfc00;
            link_local
                || unique_local
                || v6
                    .to_ipv4_mapped()
                    .map_or(false, |mapped| is_private_non_loopback_ip(&IpAddr::V4(mapped)))
        }
    }
}
279
280fn build_openai_embeddings_endpoint(base_url: &str) -> String {
281 if base_url.ends_with("/v1") {
282 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
283 } else {
284 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
285 }
286}
287
288fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
289 if base_url.ends_with("/api") {
290 format!("{base_url}/embed")
291 } else {
292 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
293 }
294}
295
/// Trims surrounding whitespace from an optional value and maps a blank
/// result to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|trimmed| !trimmed.is_empty())
}
306
307fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
308 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
309}
310
311fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
312 error.is_connect()
313}
314
315fn sleep_before_embedding_retry(attempt_index: usize) {
316 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
317 std::thread::sleep(Duration::from_millis(*delay_ms));
318 }
319}
320
321fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
322where
323 F: FnMut() -> reqwest::blocking::RequestBuilder,
324{
325 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
326 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
327
328 let response = match make_request().send() {
329 Ok(response) => response,
330 Err(error) => {
331 if !last_attempt && is_retryable_embedding_error(&error) {
332 sleep_before_embedding_retry(attempt_index);
333 continue;
334 }
335 return Err(format!("{backend_label} request failed: {error}"));
336 }
337 };
338
339 let status = response.status();
340 let raw = match response.text() {
341 Ok(raw) => raw,
342 Err(error) => {
343 if !last_attempt && is_retryable_embedding_error(&error) {
344 sleep_before_embedding_retry(attempt_index);
345 continue;
346 }
347 return Err(format!("{backend_label} response read failed: {error}"));
348 }
349 };
350
351 if status.is_success() {
352 return Ok(raw);
353 }
354
355 if !last_attempt && is_retryable_embedding_status(status) {
356 sleep_before_embedding_retry(attempt_index);
357 continue;
358 }
359
360 return Err(format!(
361 "{backend_label} request failed (HTTP {}): {}",
362 status, raw
363 ));
364 }
365
366 unreachable!("embedding request retries exhausted without returning")
367}
368
369impl SemanticEmbeddingModel {
370 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
371 let timeout_ms = if config.timeout_ms == 0 {
372 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
373 } else {
374 config.timeout_ms
375 };
376
377 let max_batch_size = if config.max_batch_size == 0 {
378 DEFAULT_MAX_BATCH_SIZE
379 } else {
380 config.max_batch_size
381 };
382
383 let api_key_env = normalize_api_key(config.api_key_env.clone());
384 let model = config.model.clone();
385
386 let client = Client::builder()
387 .timeout(Duration::from_millis(timeout_ms))
388 .redirect(reqwest::redirect::Policy::none())
389 .build()
390 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
391
392 let engine = match config.backend {
393 SemanticBackend::Fastembed => {
394 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
395 }
396 SemanticBackend::OpenAiCompatible => {
397 let raw = config.base_url.as_ref().ok_or_else(|| {
398 "base_url is required for openai_compatible backend".to_string()
399 })?;
400 let base_url = normalize_base_url(raw)?;
401
402 let api_key = match api_key_env {
403 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
404 format!("missing api_key_env '{var_name}' for openai_compatible backend")
405 })?),
406 None => None,
407 };
408
409 SemanticEmbeddingEngine::OpenAiCompatible {
410 client,
411 model,
412 base_url,
413 api_key,
414 }
415 }
416 SemanticBackend::Ollama => {
417 let raw = config
418 .base_url
419 .as_ref()
420 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
421 let base_url = normalize_base_url(raw)?;
422
423 SemanticEmbeddingEngine::Ollama {
424 client,
425 model,
426 base_url,
427 }
428 }
429 };
430
431 Ok(Self {
432 backend: config.backend,
433 model: config.model.clone(),
434 base_url: config.base_url.clone(),
435 timeout_ms,
436 max_batch_size,
437 dimension: None,
438 engine,
439 query_embedding_cache: HashMap::new(),
440 query_embedding_cache_order: VecDeque::new(),
441 query_embedding_cache_hits: 0,
442 query_embedding_cache_misses: 0,
443 })
444 }
445
446 pub fn backend(&self) -> SemanticBackend {
447 self.backend
448 }
449
450 pub fn model(&self) -> &str {
451 &self.model
452 }
453
454 pub fn base_url(&self) -> Option<&str> {
455 self.base_url.as_deref()
456 }
457
458 pub fn max_batch_size(&self) -> usize {
459 self.max_batch_size
460 }
461
462 pub fn timeout_ms(&self) -> u64 {
463 self.timeout_ms
464 }
465
466 pub fn fingerprint(
467 &mut self,
468 config: &SemanticBackendConfig,
469 ) -> Result<SemanticIndexFingerprint, String> {
470 let dimension = self.dimension()?;
471 Ok(SemanticIndexFingerprint::from_config(config, dimension))
472 }
473
474 pub fn dimension(&mut self) -> Result<usize, String> {
475 if let Some(dimension) = self.dimension {
476 return Ok(dimension);
477 }
478
479 let dimension = match &mut self.engine {
480 SemanticEmbeddingEngine::Fastembed(model) => {
481 let vectors = model
482 .embed(vec!["semantic index fingerprint probe".to_string()], None)
483 .map_err(|error| format_embedding_init_error(error.to_string()))?;
484 vectors
485 .first()
486 .map(|v| v.len())
487 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
488 }
489 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
490 let vectors =
491 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
492 vectors
493 .first()
494 .map(|v| v.len())
495 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
496 }
497 SemanticEmbeddingEngine::Ollama { .. } => {
498 let vectors =
499 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
500 vectors
501 .first()
502 .map(|v| v.len())
503 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
504 }
505 };
506
507 self.dimension = Some(dimension);
508 Ok(dimension)
509 }
510
511 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
512 self.embed_texts(texts)
513 }
514
515 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
516 if let Some(vector) = self.query_embedding_cache.get(query) {
517 self.query_embedding_cache_hits += 1;
518 return Ok(vector.clone());
519 }
520
521 self.query_embedding_cache_misses += 1;
522 let embeddings = self.embed_texts(vec![query.to_string()])?;
523 let vector = embeddings
524 .first()
525 .cloned()
526 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
527
528 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
529 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
530 self.query_embedding_cache.remove(&oldest);
531 }
532 }
533 self.query_embedding_cache
534 .insert(query.to_string(), vector.clone());
535 self.query_embedding_cache_order
536 .push_back(query.to_string());
537
538 Ok(vector)
539 }
540
541 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
542 (
543 self.query_embedding_cache_hits,
544 self.query_embedding_cache_misses,
545 self.query_embedding_cache.len(),
546 )
547 }
548
549 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
550 match &mut self.engine {
551 SemanticEmbeddingEngine::Fastembed(model) => model
552 .embed(texts, None::<usize>)
553 .map_err(|error| format_embedding_init_error(error.to_string()))
554 .map_err(|error| format!("failed to embed batch: {error}")),
555 SemanticEmbeddingEngine::OpenAiCompatible {
556 client,
557 model,
558 base_url,
559 api_key,
560 } => {
561 let expected_text_count = texts.len();
562 let endpoint = build_openai_embeddings_endpoint(base_url);
563 let body = serde_json::json!({
564 "input": texts,
565 "model": model,
566 });
567
568 let raw = send_embedding_request(
569 || {
570 let mut request = client.post(&endpoint).json(&body);
580
581 if let Some(api_key) = api_key {
582 request = request.header("Authorization", format!("Bearer {api_key}"));
583 }
584
585 request
586 },
587 "openai compatible",
588 )?;
589
590 #[derive(Deserialize)]
591 struct OpenAiResponse {
592 data: Vec<OpenAiEmbeddingResult>,
593 }
594
595 #[derive(Deserialize)]
596 struct OpenAiEmbeddingResult {
597 embedding: Vec<f32>,
598 index: Option<u32>,
599 }
600
601 let parsed: OpenAiResponse = serde_json::from_str(&raw)
602 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
603 if parsed.data.len() != expected_text_count {
604 return Err(format!(
605 "openai compatible response returned {} embeddings for {} inputs",
606 parsed.data.len(),
607 expected_text_count
608 ));
609 }
610
611 let mut vectors = vec![Vec::new(); parsed.data.len()];
612 for (i, item) in parsed.data.into_iter().enumerate() {
613 let index = item.index.unwrap_or(i as u32) as usize;
614 if index >= vectors.len() {
615 return Err(
616 "openai compatible response contains invalid vector index".to_string()
617 );
618 }
619 vectors[index] = item.embedding;
620 }
621
622 for vector in &vectors {
623 if vector.is_empty() {
624 return Err(
625 "openai compatible response contained missing vectors".to_string()
626 );
627 }
628 }
629
630 self.dimension = vectors.first().map(Vec::len);
631 Ok(vectors)
632 }
633 SemanticEmbeddingEngine::Ollama {
634 client,
635 model,
636 base_url,
637 } => {
638 let expected_text_count = texts.len();
639 let endpoint = build_ollama_embeddings_endpoint(base_url);
640
641 #[derive(Serialize)]
642 struct OllamaPayload<'a> {
643 model: &'a str,
644 input: Vec<String>,
645 }
646
647 let payload = OllamaPayload {
648 model,
649 input: texts,
650 };
651
652 let raw = send_embedding_request(
653 || {
654 client.post(&endpoint).json(&payload)
659 },
660 "ollama",
661 )?;
662
663 #[derive(Deserialize)]
664 struct OllamaResponse {
665 embeddings: Vec<Vec<f32>>,
666 }
667
668 let parsed: OllamaResponse = serde_json::from_str(&raw)
669 .map_err(|error| format!("invalid ollama response: {error}"))?;
670 if parsed.embeddings.is_empty() {
671 return Err("ollama response returned no embeddings".to_string());
672 }
673 if parsed.embeddings.len() != expected_text_count {
674 return Err(format!(
675 "ollama response returned {} embeddings for {} inputs",
676 parsed.embeddings.len(),
677 expected_text_count
678 ));
679 }
680
681 let vectors = parsed.embeddings;
682 for vector in &vectors {
683 if vector.is_empty() {
684 return Err("ollama response contained empty embeddings".to_string());
685 }
686 }
687
688 self.dimension = vectors.first().map(Vec::len);
689 Ok(vectors)
690 }
691 }
692 }
693}
694
695pub fn pre_validate_onnx_runtime() -> Result<(), String> {
699 let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
700
701 #[cfg(any(target_os = "linux", target_os = "macos"))]
702 {
703 #[cfg(target_os = "linux")]
704 let default_name = "libonnxruntime.so";
705 #[cfg(target_os = "macos")]
706 let default_name = "libonnxruntime.dylib";
707
708 let lib_name = dylib_path.as_deref().unwrap_or(default_name);
709
710 unsafe {
711 let c_name = std::ffi::CString::new(lib_name)
712 .map_err(|e| format!("invalid library path: {}", e))?;
713 let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
714 if handle.is_null() {
715 let err = libc::dlerror();
716 let msg = if err.is_null() {
717 "unknown dlopen error".to_string()
718 } else {
719 std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
720 };
721 return Err(format!(
722 "ONNX Runtime not found. dlopen('{}') failed: {}. \
723 Run `npx @cortexkit/aft doctor` to diagnose.",
724 lib_name, msg
725 ));
726 }
727
728 let detected_version = detect_ort_version_from_path(lib_name);
731
732 libc::dlclose(handle);
733
734 if let Some(ref version) = detected_version {
736 let parts: Vec<&str> = version.split('.').collect();
737 if let (Some(major), Some(minor)) = (
738 parts.first().and_then(|s| s.parse::<u32>().ok()),
739 parts.get(1).and_then(|s| s.parse::<u32>().ok()),
740 ) {
741 if major != 1 || minor < 20 {
742 return Err(format_ort_version_mismatch(version, lib_name));
743 }
744 }
745 }
746 }
747 }
748
749 #[cfg(target_os = "windows")]
750 {
751 let _ = dylib_path;
753 }
754
755 Ok(())
756}
757
758#[cfg(any(test, target_os = "linux", target_os = "macos"))]
761fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
762 let path = std::path::Path::new(lib_path);
763
764 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
766 .into_iter()
767 .flatten()
768 {
769 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
770 if let Some(version) = extract_version_from_filename(name) {
771 return Some(version);
772 }
773 }
774 }
775
776 if let Some(parent) = path.parent() {
778 if let Ok(entries) = std::fs::read_dir(parent) {
779 for entry in entries.flatten() {
780 if let Some(name) = entry.file_name().to_str() {
781 if name.starts_with("libonnxruntime") {
782 if let Some(version) = extract_version_from_filename(name) {
783 return Some(version);
784 }
785 }
786 }
787 }
788 }
789 }
790
791 None
792}
793
794#[cfg(any(test, target_os = "linux", target_os = "macos"))]
796fn extract_version_from_filename(name: &str) -> Option<String> {
797 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
799 re.find(name).map(|m| m.as_str().to_string())
800}
801
/// Produces the shell command suggested to the user for removing the library
/// at `lib_path`.
///
/// Paths under `/usr/local/lib` and the bare default library names get a
/// platform-specific command; anything else gets a plain `rm` of that path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return " Delete the ONNX Runtime DLL from your PATH".to_string();
    }

    format!(" rm '{}'", lib_path)
}
817
818#[cfg(any(test, target_os = "linux", target_os = "macos"))]
824pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
825 format!(
826 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
827 Solutions:\n\
828 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
829 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
830 configures the bridge to load it instead of the system library — no \
831 changes to '{}'.\n\
832 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
833 {}\n\
834 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
835 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
836 version,
837 lib_name,
838 lib_name,
839 suggest_removal_command(lib_name),
840 )
841}
842
843pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
844 pre_validate_onnx_runtime()?;
846
847 let selected_model = match model {
848 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
849 _ => {
850 return Err(format!(
851 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
852 model
853 ))
854 }
855 };
856
857 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
858}
859
860pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
861 if message.trim_start().starts_with("ONNX Runtime not found.") {
862 return true;
863 }
864
865 let message = message.to_ascii_lowercase();
866 let mentions_onnx_runtime = ["onnx runtime", "onnxruntime", "libonnxruntime"]
867 .iter()
868 .any(|pattern| message.contains(pattern));
869 let mentions_dynamic_load_failure = [
870 "shared library",
871 "dynamic library",
872 "failed to load",
873 "could not load",
874 "unable to load",
875 "dlopen",
876 "loadlibrary",
877 "no such file",
878 "not found",
879 ]
880 .iter()
881 .any(|pattern| message.contains(pattern));
882
883 mentions_onnx_runtime && mentions_dynamic_load_failure
884}
885
886fn format_embedding_init_error(error: impl Display) -> String {
887 let message = error.to_string();
888
889 if is_onnx_runtime_unavailable(&message) {
890 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
891 }
892
893 format!("failed to initialize semantic embedding model: {message}")
894}
895
896#[derive(Debug, Clone)]
898pub struct SemanticChunk {
899 pub file: PathBuf,
901 pub name: String,
903 pub kind: SymbolKind,
905 pub start_line: u32,
907 pub end_line: u32,
908 pub exported: bool,
910 pub embed_text: String,
912 pub snippet: String,
914}
915
916#[derive(Debug)]
918struct EmbeddingEntry {
919 chunk: SemanticChunk,
920 vector: Vec<f32>,
921}
922
923#[derive(Debug)]
925pub struct SemanticIndex {
926 entries: Vec<EmbeddingEntry>,
927 file_mtimes: HashMap<PathBuf, SystemTime>,
929 file_sizes: HashMap<PathBuf, u64>,
931 file_hashes: HashMap<PathBuf, blake3::Hash>,
932 dimension: usize,
934 fingerprint: Option<SemanticIndexFingerprint>,
935 project_root: PathBuf,
936}
937
938#[derive(Debug, Clone, Copy)]
939struct IndexedFileMetadata {
940 mtime: SystemTime,
941 size: u64,
942 content_hash: blake3::Hash,
943}
944
/// Counts reported by an incremental index refresh.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Already-indexed files that needed re-embedding.
    pub changed: usize,
    /// Files that were not indexed before.
    pub added: usize,
    /// Indexed files that are no longer present.
    pub deleted: usize,
    /// Number of distinct files considered by the refresh.
    pub total_processed: usize,
}
954
955impl RefreshSummary {
956 pub fn is_noop(&self) -> bool {
958 self.changed == 0 && self.added == 0 && self.deleted == 0
959 }
960}
961
962#[derive(Debug, Clone)]
964pub struct SemanticResult {
965 pub file: PathBuf,
966 pub name: String,
967 pub kind: SymbolKind,
968 pub start_line: u32,
969 pub end_line: u32,
970 pub exported: bool,
971 pub snippet: String,
972 pub score: f32,
973 pub source: &'static str,
974}
975
976impl SemanticIndex {
977 pub fn new(project_root: PathBuf, dimension: usize) -> Self {
978 debug_assert!(project_root.is_absolute());
979 Self {
980 entries: Vec::new(),
981 file_mtimes: HashMap::new(),
982 file_sizes: HashMap::new(),
983 file_hashes: HashMap::new(),
984 dimension,
985 fingerprint: None,
986 project_root,
987 }
988 }
989
990 pub fn entry_count(&self) -> usize {
992 self.entries.len()
993 }
994
995 pub fn status_label(&self) -> &'static str {
997 if self.entries.is_empty() {
998 "empty"
999 } else {
1000 "ready"
1001 }
1002 }
1003
1004 fn collect_chunks(
1005 project_root: &Path,
1006 files: &[PathBuf],
1007 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1008 let per_file: Vec<(
1009 PathBuf,
1010 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1011 )> = files
1012 .par_iter()
1013 .map_init(HashMap::new, |parsers, file| {
1014 let result = collect_file_metadata(file).and_then(|metadata| {
1015 collect_file_chunks(project_root, file, parsers)
1016 .map(|chunks| (metadata, chunks))
1017 });
1018 (file.clone(), result)
1019 })
1020 .collect();
1021
1022 let mut chunks: Vec<SemanticChunk> = Vec::new();
1023 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1024
1025 for (file, result) in per_file {
1026 match result {
1027 Ok((metadata, file_chunks)) => {
1028 file_metadata.insert(file, metadata);
1029 chunks.extend(file_chunks);
1030 }
1031 Err(error) => {
1032 if error == "unsupported file extension" {
1038 continue;
1039 }
1040 slog_warn!(
1041 "failed to collect semantic chunks for {}: {}",
1042 file.display(),
1043 error
1044 );
1045 }
1046 }
1047 }
1048
1049 (chunks, file_metadata)
1050 }
1051
1052 fn build_from_chunks<F, P>(
1053 project_root: &Path,
1054 chunks: Vec<SemanticChunk>,
1055 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1056 embed_fn: &mut F,
1057 max_batch_size: usize,
1058 mut progress: Option<&mut P>,
1059 ) -> Result<Self, String>
1060 where
1061 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1062 P: FnMut(usize, usize),
1063 {
1064 debug_assert!(project_root.is_absolute());
1065 let total_chunks = chunks.len();
1066
1067 if chunks.is_empty() {
1068 return Ok(Self {
1069 entries: Vec::new(),
1070 file_mtimes: file_metadata
1071 .iter()
1072 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1073 .collect(),
1074 file_sizes: file_metadata
1075 .iter()
1076 .map(|(path, metadata)| (path.clone(), metadata.size))
1077 .collect(),
1078 file_hashes: file_metadata
1079 .into_iter()
1080 .map(|(path, metadata)| (path, metadata.content_hash))
1081 .collect(),
1082 dimension: DEFAULT_DIMENSION,
1083 fingerprint: None,
1084 project_root: project_root.to_path_buf(),
1085 });
1086 }
1087
1088 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1090 let mut expected_dimension: Option<usize> = None;
1091 let batch_size = max_batch_size.max(1);
1092 for batch_start in (0..chunks.len()).step_by(batch_size) {
1093 let batch_end = (batch_start + batch_size).min(chunks.len());
1094 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1095 .iter()
1096 .map(|c| c.embed_text.clone())
1097 .collect();
1098
1099 let vectors = embed_fn(batch_texts)?;
1100 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1101
1102 if let Some(dim) = vectors.first().map(|v| v.len()) {
1104 match expected_dimension {
1105 None => expected_dimension = Some(dim),
1106 Some(expected) if dim != expected => {
1107 return Err(format!(
1108 "embedding dimension changed across batches: expected {expected}, got {dim}"
1109 ));
1110 }
1111 _ => {}
1112 }
1113 }
1114
1115 for (i, vector) in vectors.into_iter().enumerate() {
1116 let chunk_idx = batch_start + i;
1117 entries.push(EmbeddingEntry {
1118 chunk: chunks[chunk_idx].clone(),
1119 vector,
1120 });
1121 }
1122
1123 if let Some(callback) = progress.as_mut() {
1124 callback(entries.len(), total_chunks);
1125 }
1126 }
1127
1128 let dimension = entries
1129 .first()
1130 .map(|e| e.vector.len())
1131 .unwrap_or(DEFAULT_DIMENSION);
1132
1133 Ok(Self {
1134 entries,
1135 file_mtimes: file_metadata
1136 .iter()
1137 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1138 .collect(),
1139 file_sizes: file_metadata
1140 .iter()
1141 .map(|(path, metadata)| (path.clone(), metadata.size))
1142 .collect(),
1143 file_hashes: file_metadata
1144 .into_iter()
1145 .map(|(path, metadata)| (path, metadata.content_hash))
1146 .collect(),
1147 dimension,
1148 fingerprint: None,
1149 project_root: project_root.to_path_buf(),
1150 })
1151 }
1152
1153 pub fn build<F>(
1156 project_root: &Path,
1157 files: &[PathBuf],
1158 embed_fn: &mut F,
1159 max_batch_size: usize,
1160 ) -> Result<Self, String>
1161 where
1162 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1163 {
1164 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1165 Self::build_from_chunks(
1166 project_root,
1167 chunks,
1168 file_mtimes,
1169 embed_fn,
1170 max_batch_size,
1171 Option::<&mut fn(usize, usize)>::None,
1172 )
1173 }
1174
1175 pub fn build_with_progress<F, P>(
1177 project_root: &Path,
1178 files: &[PathBuf],
1179 embed_fn: &mut F,
1180 max_batch_size: usize,
1181 progress: &mut P,
1182 ) -> Result<Self, String>
1183 where
1184 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1185 P: FnMut(usize, usize),
1186 {
1187 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1188 let total_chunks = chunks.len();
1189 progress(0, total_chunks);
1190 Self::build_from_chunks(
1191 project_root,
1192 chunks,
1193 file_mtimes,
1194 embed_fn,
1195 max_batch_size,
1196 Some(progress),
1197 )
1198 }
1199
    /// Incrementally brings the index in line with `current_files`.
    ///
    /// Indexed files that are no longer listed are dropped, indexed files whose
    /// cached freshness data no longer verifies are re-embedded, and listed
    /// files without cached metadata are embedded for the first time.
    ///
    /// `progress` is called with `(done, total)` chunk counts: `(0, 0)` when
    /// there is nothing to embed, otherwise `(0, total)` up front and again
    /// after every embedded batch.
    ///
    /// # Errors
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when a new vector's dimension differs from the one already in the
    /// index. Deletions and `ContentFresh` metadata updates performed earlier
    /// in the call are not rolled back on error; newly embedded entries are
    /// only merged after every batch succeeded.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Size of the union of listed files and indexed files:
        // |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every indexed file as deleted, changed, or still fresh.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Keys are cloned so the metadata maps can be mutated inside the loop.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A complete freshness record needs mtime, size and hash; a file
            // missing any of them is treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
                Some(FreshnessVerdict::HotFresh) => {}
                // Content still verifies: only refresh the stored mtime/size,
                // no re-embedding.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Listed files with no cached mtime have never been indexed.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Fast path: nothing was deleted, changed, or added.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // Drop entries and all cached metadata for files that disappeared.
        if !deleted.is_empty() {
            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
            self.entries
                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
            for path in &deleted {
                self.file_mtimes.remove(path);
                self.file_sizes.remove(path);
                self.file_hashes.remove(path);
            }
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        // Only deletions happened: no embedding work is needed.
        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // NOTE(review): files absent from `fresh_metadata` keep their previous
        // entries and metadata; presumably `collect_chunks` omits files it
        // could not process — confirm against `collect_chunks`.
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

        // Files were collected but produced no chunks: clear their old entries
        // and record the fresh metadata without calling the embedder.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        // A batch size of zero would make `step_by` panic; clamp to one.
        let batch_size = max_batch_size.max(1);
        // With no surviving entries there is no dimension to stay consistent
        // with, so the first embedded batch decides it.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        // New entries are staged here so `self.entries` is untouched if an
        // embedding batch fails.
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // Only the first vector of each batch is compared against the
            // expected dimension here.
            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Replace the old entries of every re-collected file with the new ones.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        // Only files present in `fresh_metadata` count as changed/added.
        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1424
1425 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1427 if self.entries.is_empty() || query_vector.len() != self.dimension {
1428 return Vec::new();
1429 }
1430
1431 let mut scored: Vec<(f32, usize)> = self
1432 .entries
1433 .iter()
1434 .enumerate()
1435 .map(|(i, entry)| {
1436 let mut score = cosine_similarity(query_vector, &entry.vector);
1437 if entry.chunk.exported {
1438 score *= 1.1;
1439 }
1440 (score, i)
1441 })
1442 .collect();
1443
1444 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1446
1447 scored
1448 .into_iter()
1449 .take(top_k)
1450 .map(|(score, idx)| {
1454 let entry = &self.entries[idx];
1455 SemanticResult {
1456 file: entry.chunk.file.clone(),
1457 name: entry.chunk.name.clone(),
1458 kind: entry.chunk.kind.clone(),
1459 start_line: entry.chunk.start_line,
1460 end_line: entry.chunk.end_line,
1461 exported: entry.chunk.exported,
1462 snippet: entry.chunk.snippet.clone(),
1463 score,
1464 source: "semantic",
1465 }
1466 })
1467 .collect()
1468 }
1469
    /// Returns the number of embedded entries currently held by the index.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
1474
1475 pub fn is_file_stale(&self, file: &Path) -> bool {
1477 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1478 return true;
1479 };
1480 let Some(stored_size) = self.file_sizes.get(file) else {
1481 return true;
1482 };
1483 let Some(stored_hash) = self.file_hashes.get(file) else {
1484 return true;
1485 };
1486 let cached = FileFreshness {
1487 mtime: *stored_mtime,
1488 size: *stored_size,
1489 content_hash: *stored_hash,
1490 };
1491 match cache_freshness::verify_file(file, &cached) {
1492 FreshnessVerdict::HotFresh => false,
1493 FreshnessVerdict::ContentFresh { .. } => false,
1494 FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1495 }
1496 }
1497
1498 fn backfill_missing_file_sizes(&mut self) {
1499 for path in self.file_mtimes.keys() {
1500 if self.file_sizes.contains_key(path) {
1501 continue;
1502 }
1503 if let Ok(metadata) = fs::metadata(path) {
1504 self.file_sizes.insert(path.clone(), metadata.len());
1505 if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1506 self.file_hashes.insert(path.clone(), hash);
1507 }
1508 }
1509 }
1510 }
1511
    /// Removes `file` from the index; delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
1516
1517 pub fn invalidate_file(&mut self, file: &Path) {
1518 self.entries.retain(|e| e.chunk.file != file);
1519 self.file_mtimes.remove(file);
1520 self.file_sizes.remove(file);
1521 self.file_hashes.remove(file);
1522 }
1523
    /// Returns the embedding vector dimension recorded for this index.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
1528
    /// Returns the fingerprint attached to this index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
1532
1533 pub fn backend_label(&self) -> Option<&str> {
1534 self.fingerprint.as_ref().map(|f| f.backend.as_str())
1535 }
1536
1537 pub fn model_label(&self) -> Option<&str> {
1538 self.fingerprint.as_ref().map(|f| f.model.as_str())
1539 }
1540
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
1544
1545 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1547 if self.entries.is_empty() {
1550 slog_info!("skipping semantic index persistence (0 entries)");
1551 return;
1552 }
1553 let dir = storage_dir.join("semantic").join(project_key);
1554 if let Err(e) = fs::create_dir_all(&dir) {
1555 slog_warn!("failed to create semantic cache dir: {}", e);
1556 return;
1557 }
1558 let data_path = dir.join("semantic.bin");
1559 let tmp_path = dir.join(format!(
1560 "semantic.bin.tmp.{}.{}",
1561 std::process::id(),
1562 SystemTime::now()
1563 .duration_since(SystemTime::UNIX_EPOCH)
1564 .unwrap_or(Duration::ZERO)
1565 .as_nanos()
1566 ));
1567 let bytes = self.to_bytes();
1568 let write_result = (|| -> std::io::Result<()> {
1569 use std::io::Write;
1570 let mut file = fs::File::create(&tmp_path)?;
1571 file.write_all(&bytes)?;
1572 file.sync_all()?;
1573 Ok(())
1574 })();
1575 if let Err(e) = write_result {
1576 slog_warn!("failed to write semantic index: {}", e);
1577 let _ = fs::remove_file(&tmp_path);
1578 return;
1579 }
1580 if let Err(e) = fs::rename(&tmp_path, &data_path) {
1581 slog_warn!("failed to rename semantic index: {}", e);
1582 let _ = fs::remove_file(&tmp_path);
1583 return;
1584 }
1585 slog_info!(
1586 "semantic index persisted: {} entries, {:.1} KB",
1587 self.entries.len(),
1588 bytes.len() as f64 / 1024.0
1589 );
1590 }
1591
1592 pub fn read_from_disk(
1594 storage_dir: &Path,
1595 project_key: &str,
1596 current_canonical_root: &Path,
1597 is_worktree_bridge: bool,
1598 expected_fingerprint: Option<&str>,
1599 ) -> Option<Self> {
1600 debug_assert!(current_canonical_root.is_absolute());
1601 let data_path = storage_dir
1602 .join("semantic")
1603 .join(project_key)
1604 .join("semantic.bin");
1605 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1606 if file_len < HEADER_BYTES_V1 {
1607 slog_warn!(
1608 "corrupt semantic index (too small: {} bytes), removing",
1609 file_len
1610 );
1611 if !is_worktree_bridge {
1612 let _ = fs::remove_file(&data_path);
1613 }
1614 return None;
1615 }
1616
1617 let bytes = fs::read(&data_path).ok()?;
1618 let version = bytes[0];
1619 if version != SEMANTIC_INDEX_VERSION_V6 {
1620 slog_info!(
1621 "cached semantic index version {} is older than {}, rebuilding",
1622 version,
1623 SEMANTIC_INDEX_VERSION_V6
1624 );
1625 if !is_worktree_bridge {
1626 let _ = fs::remove_file(&data_path);
1627 }
1628 return None;
1629 }
1630 match Self::from_bytes(&bytes, current_canonical_root) {
1631 Ok(index) => {
1632 if index.entries.is_empty() {
1633 slog_info!("cached semantic index is empty, will rebuild");
1634 if !is_worktree_bridge {
1635 let _ = fs::remove_file(&data_path);
1636 }
1637 return None;
1638 }
1639 if let Some(expected) = expected_fingerprint {
1640 let matches = index
1641 .fingerprint()
1642 .map(|fingerprint| fingerprint.matches_expected(expected))
1643 .unwrap_or(false);
1644 if !matches {
1645 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1646 if !is_worktree_bridge {
1647 let _ = fs::remove_file(&data_path);
1648 }
1649 return None;
1650 }
1651 }
1652 slog_info!(
1653 "loaded semantic index from disk: {} entries",
1654 index.entries.len()
1655 );
1656 Some(index)
1657 }
1658 Err(e) => {
1659 slog_warn!("corrupt semantic index, rebuilding: {}", e);
1660 if !is_worktree_bridge {
1661 let _ = fs::remove_file(&data_path);
1662 }
1663 None
1664 }
1665 }
1666 }
1667
1668 pub fn to_bytes(&self) -> Vec<u8> {
1670 let mut buf = Vec::new();
1671 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1672 let encoded = fingerprint.as_string();
1673 if encoded.is_empty() {
1674 None
1675 } else {
1676 Some(encoded.into_bytes())
1677 }
1678 });
1679 let file_mtimes: Vec<_> = self
1680 .file_mtimes
1681 .iter()
1682 .filter_map(|(path, mtime)| {
1683 cache_relative_path(&self.project_root, path)
1684 .map(|relative| (relative, path, mtime))
1685 })
1686 .collect();
1687 let entries: Vec<_> = self
1688 .entries
1689 .iter()
1690 .filter_map(|entry| {
1691 cache_relative_path(&self.project_root, &entry.chunk.file)
1692 .map(|relative| (relative, entry))
1693 })
1694 .collect();
1695
1696 let version = SEMANTIC_INDEX_VERSION_V6;
1709 buf.push(version);
1710 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1711 buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1712 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1713 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1714 buf.extend_from_slice(fp_bytes_ref);
1715
1716 buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1719 for (relative, path, mtime) in &file_mtimes {
1720 let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1721 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1722 buf.extend_from_slice(&path_bytes);
1723 let duration = mtime
1724 .duration_since(SystemTime::UNIX_EPOCH)
1725 .unwrap_or_default();
1726 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1727 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1728 let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1729 buf.extend_from_slice(&size.to_le_bytes());
1730 let hash = self
1731 .file_hashes
1732 .get(*path)
1733 .copied()
1734 .unwrap_or_else(cache_freshness::zero_hash);
1735 buf.extend_from_slice(hash.as_bytes());
1736 }
1737
1738 for (relative, entry) in &entries {
1740 let c = &entry.chunk;
1741
1742 let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1744 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1745 buf.extend_from_slice(&file_bytes);
1746
1747 let name_bytes = c.name.as_bytes();
1749 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1750 buf.extend_from_slice(name_bytes);
1751
1752 buf.push(symbol_kind_to_u8(&c.kind));
1754
1755 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1757 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1758 buf.push(c.exported as u8);
1759
1760 let snippet_bytes = c.snippet.as_bytes();
1762 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1763 buf.extend_from_slice(snippet_bytes);
1764
1765 let embed_bytes = c.embed_text.as_bytes();
1767 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1768 buf.extend_from_slice(embed_bytes);
1769
1770 for &val in &entry.vector {
1772 buf.extend_from_slice(&val.to_le_bytes());
1773 }
1774 }
1775
1776 buf
1777 }
1778
    /// Decodes an index from its binary cache representation.
    ///
    /// Accepts format versions 1 through 6. Relative to version 1, the parser
    /// reads these additional fields:
    /// - v2 and later: a length-prefixed JSON fingerprint after the header;
    /// - v3 and later: a `u32` nanosecond component for each file mtime;
    /// - v5 and later: a `u64` file size per file;
    /// - v6: a 32-byte content hash per file, and paths that are resolved
    ///   against `current_canonical_root` via `cached_path_under_root`.
    ///
    /// Fields a version does not carry default to 0 (nanos, size) or
    /// `cache_freshness::zero_hash()` (content hash).
    ///
    /// `current_canonical_root` must be absolute (checked in debug builds) and
    /// becomes the `project_root` of the returned index.
    ///
    /// # Errors
    /// Returns a description of the problem when the data is truncated, the
    /// version is unknown, the dimension is 0 or above `MAX_DIMENSION`, a
    /// count exceeds `MAX_ENTRIES`, the fingerprint is not valid JSON, an
    /// mtime is invalid, a v6 path does not resolve under the root, or an
    /// entry refers to a file with no metadata record.
    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        // Read cursor into `data`; every read below advances it.
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        // Every version after v1 has the longer header that includes the
        // fingerprint length.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        // Bound the header values before they drive any allocation.
        if dimension == 0 || dimension > MAX_DIMENSION {
            return Err(format!("invalid embedding dimension: {}", dimension));
        }
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            // A zero length means "no fingerprint stored".
            if fingerprint_len == 0 {
                None
            } else {
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // The vectors alone need `entry_count * dimension * 4` bytes; reject
        // headers that promise more than the remaining data can hold before
        // allocating for them.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        // Per-file metadata records.
        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64(data, &mut pos)?
                } else {
                    0
                };
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                if pos + 32 > data.len() {
                    return Err("unexpected end of data reading content hash".to_string());
                }
                let mut hash_bytes = [0u8; 32];
                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
                pos += 32;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            // Reject an out-of-range nanosecond field explicitly instead of
            // handing it to `Duration::new`.
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // v6 paths go through `cached_path_under_root`; older versions use
            // the stored path as-is.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        // Entry records: chunk description followed by its vector.
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &raw_file)
                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
            } else {
                raw_file
            };
            let name = read_string(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must belong to a file that has a metadata record.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
        })
    }
2018}
2019
2020fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2022 let relative = file
2023 .strip_prefix(project_root)
2024 .unwrap_or(file)
2025 .to_string_lossy();
2026
2027 let kind_label = match &symbol.kind {
2028 SymbolKind::Function => "function",
2029 SymbolKind::Class => "class",
2030 SymbolKind::Method => "method",
2031 SymbolKind::Struct => "struct",
2032 SymbolKind::Interface => "interface",
2033 SymbolKind::Enum => "enum",
2034 SymbolKind::TypeAlias => "type",
2035 SymbolKind::Variable => "variable",
2036 SymbolKind::Heading => "heading",
2037 SymbolKind::FileSummary => "file-summary",
2038 };
2039
2040 let name = &symbol.name;
2042 let mut text = format!(
2043 "name:{name} file:{} kind:{} name:{name}",
2044 relative, kind_label
2045 );
2046
2047 if let Some(sig) = &symbol.signature {
2048 text.push_str(&format!(" signature:{}", sig));
2049 }
2050
2051 let lines: Vec<&str> = source.lines().collect();
2053 let start = (symbol.range.start_line as usize).min(lines.len());
2054 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2056 if start < end {
2057 let body: String = lines[start..end]
2058 .iter()
2059 .take(15) .copied()
2061 .collect::<Vec<&str>>()
2062 .join("\n");
2063 let snippet = if body.len() > 300 {
2064 format!("{}...", &body[..body.floor_char_boundary(300)])
2065 } else {
2066 body
2067 };
2068 text.push_str(&format!(" body:{}", snippet));
2069 }
2070
2071 text
2072}
2073
/// Returns the first `max_chars` characters of `value` (all of it when it is
/// shorter). Counting is per `char`, so multi-byte characters are never split.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // Byte offset of the first character that must be dropped, if any.
    let cut = value
        .char_indices()
        .nth(max_chars)
        .map_or(value.len(), |(offset, _)| offset);
    value[..cut].to_owned()
}
2077
2078fn first_leading_doc_comment(source: &str) -> String {
2079 let lines: Vec<&str> = source.lines().collect();
2080 let Some((start, first)) = lines
2081 .iter()
2082 .enumerate()
2083 .find(|(_, line)| !line.trim().is_empty())
2084 else {
2085 return String::new();
2086 };
2087
2088 let trimmed = first.trim_start();
2089 if trimmed.starts_with("/**") {
2090 let mut comment = Vec::new();
2091 for line in lines.iter().skip(start) {
2092 comment.push(*line);
2093 if line.contains("*/") {
2094 break;
2095 }
2096 }
2097 return truncate_chars(&comment.join("\n"), 200);
2098 }
2099
2100 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2101 let comment = lines
2102 .iter()
2103 .skip(start)
2104 .take_while(|line| {
2105 let trimmed = line.trim_start();
2106 trimmed.starts_with("///") || trimmed.starts_with("//!")
2107 })
2108 .copied()
2109 .collect::<Vec<_>>()
2110 .join("\n");
2111 return truncate_chars(&comment, 200);
2112 }
2113
2114 String::new()
2115}
2116
2117pub fn build_file_summary_chunk(
2118 file: &Path,
2119 project_root: &Path,
2120 source: &str,
2121 top_exports: &[&str],
2122 top_export_signatures: &[Option<&str>],
2123) -> SemanticChunk {
2124 let relative = file.strip_prefix(project_root).unwrap_or(file);
2125 let rel_path = relative.to_string_lossy();
2126 let parent_dir = relative
2127 .parent()
2128 .map(|parent| parent.to_string_lossy().to_string())
2129 .unwrap_or_default();
2130 let name = file
2131 .file_stem()
2132 .map(|stem| stem.to_string_lossy().to_string())
2133 .unwrap_or_default();
2134 let doc = first_leading_doc_comment(source);
2135 let exports = top_exports
2136 .iter()
2137 .take(5)
2138 .copied()
2139 .collect::<Vec<_>>()
2140 .join(",");
2141 let snippet = if doc.is_empty() {
2142 top_export_signatures
2143 .first()
2144 .and_then(|signature| signature.as_deref())
2145 .map(|signature| truncate_chars(signature, 200))
2146 .unwrap_or_default()
2147 } else {
2148 doc.clone()
2149 };
2150
2151 SemanticChunk {
2152 file: file.to_path_buf(),
2153 name,
2154 kind: SymbolKind::FileSummary,
2155 start_line: 0,
2156 end_line: 0,
2157 exported: false,
2158 embed_text: format!(
2159 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2160 file.file_stem()
2161 .map(|stem| stem.to_string_lossy().to_string())
2162 .unwrap_or_default()
2163 ),
2164 snippet,
2165 }
2166}
2167
2168fn parser_for(
2169 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2170 lang: crate::parser::LangId,
2171) -> Result<&mut Parser, String> {
2172 use std::collections::hash_map::Entry;
2173
2174 match parsers.entry(lang) {
2175 Entry::Occupied(entry) => Ok(entry.into_mut()),
2176 Entry::Vacant(entry) => {
2177 let grammar = grammar_for(lang);
2178 let mut parser = Parser::new();
2179 parser
2180 .set_language(&grammar)
2181 .map_err(|error| error.to_string())?;
2182 Ok(entry.insert(parser))
2183 }
2184 }
2185}
2186
/// Returns `true` when the extension of `path` is one of the extensions
/// handled by the semantic index.
///
/// The comparison is case-sensitive; paths without an extension, or with one
/// that is not valid UTF-8, are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2214
2215fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2216 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2217 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2218 let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2219 .map_err(|error| error.to_string())?
2220 .unwrap_or_else(cache_freshness::zero_hash);
2221 Ok(IndexedFileMetadata {
2222 mtime,
2223 size: metadata.len(),
2224 content_hash,
2225 })
2226}
2227
2228fn collect_file_chunks(
2229 project_root: &Path,
2230 file: &Path,
2231 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2232) -> Result<Vec<SemanticChunk>, String> {
2233 if !is_semantic_indexed_extension(file) {
2234 return Err("unsupported file extension".to_string());
2235 }
2236 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2237 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2238 let tree = parser_for(parsers, lang)?
2239 .parse(&source, None)
2240 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2241 let symbols =
2242 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2243
2244 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2245}
2246
2247fn build_snippet(symbol: &Symbol, source: &str) -> String {
2249 let lines: Vec<&str> = source.lines().collect();
2250 let start = (symbol.range.start_line as usize).min(lines.len());
2251 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2253 if start < end {
2254 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2255 let mut snippet = snippet_lines.join("\n");
2256 if end - start > 5 {
2257 snippet.push_str("\n ...");
2258 }
2259 if snippet.len() > 300 {
2260 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2261 }
2262 snippet
2263 } else {
2264 String::new()
2265 }
2266}
2267
2268fn symbols_to_chunks(
2270 file: &Path,
2271 symbols: &[Symbol],
2272 source: &str,
2273 project_root: &Path,
2274) -> Vec<SemanticChunk> {
2275 let mut chunks = Vec::new();
2276 let top_exports_with_signatures = symbols
2277 .iter()
2278 .filter(|symbol| {
2279 symbol.exported
2280 && symbol.parent.is_none()
2281 && !matches!(symbol.kind, SymbolKind::Heading)
2282 })
2283 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2284 .collect::<Vec<_>>();
2285
2286 let has_only_headings = !symbols.is_empty()
2287 && symbols
2288 .iter()
2289 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2290 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2291 let top_exports = top_exports_with_signatures
2292 .iter()
2293 .map(|(name, _)| *name)
2294 .collect::<Vec<_>>();
2295 let top_export_signatures = top_exports_with_signatures
2296 .iter()
2297 .map(|(_, signature)| *signature)
2298 .collect::<Vec<_>>();
2299 chunks.push(build_file_summary_chunk(
2300 file,
2301 project_root,
2302 source,
2303 &top_exports,
2304 &top_export_signatures,
2305 ));
2306 }
2307
2308 for symbol in symbols {
2309 if matches!(symbol.kind, SymbolKind::Heading) {
2314 continue;
2315 }
2316
2317 let line_count = symbol
2319 .range
2320 .end_line
2321 .saturating_sub(symbol.range.start_line)
2322 + 1;
2323 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2324 continue;
2325 }
2326
2327 let embed_text = build_embed_text(symbol, source, file, project_root);
2328 let snippet = build_snippet(symbol, source);
2329
2330 chunks.push(SemanticChunk {
2331 file: file.to_path_buf(),
2332 name: symbol.name.clone(),
2333 kind: symbol.kind.clone(),
2334 start_line: symbol.range.start_line,
2335 end_line: symbol.range.end_line,
2336 exported: symbol.exported,
2337 embed_text,
2338 snippet,
2339 });
2340
2341 }
2344
2345 chunks
2346}
2347
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero magnitude (which includes two empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2371
2372fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2374 match kind {
2375 SymbolKind::Function => 0,
2376 SymbolKind::Class => 1,
2377 SymbolKind::Method => 2,
2378 SymbolKind::Struct => 3,
2379 SymbolKind::Interface => 4,
2380 SymbolKind::Enum => 5,
2381 SymbolKind::TypeAlias => 6,
2382 SymbolKind::Variable => 7,
2383 SymbolKind::Heading => 8,
2384 SymbolKind::FileSummary => 9,
2385 }
2386}
2387
2388fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2389 match v {
2390 0 => SymbolKind::Function,
2391 1 => SymbolKind::Class,
2392 2 => SymbolKind::Method,
2393 3 => SymbolKind::Struct,
2394 4 => SymbolKind::Interface,
2395 5 => SymbolKind::Enum,
2396 6 => SymbolKind::TypeAlias,
2397 7 => SymbolKind::Variable,
2398 8 => SymbolKind::Heading,
2399 9 => SymbolKind::FileSummary,
2400 _ => SymbolKind::Heading,
2401 }
2402}
2403
/// Reads a little-endian `u32` at `*pos` and advances `*pos` by four bytes.
///
/// # Errors
/// Returns an error, leaving `*pos` unchanged, when fewer than four bytes are
/// available at `*pos` — including when `*pos` lies beyond the end of `data`
/// or is so large that `*pos + 4` would overflow.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    // Checked addition plus `get` keeps an out-of-range cursor from
    // overflowing or indexing out of bounds.
    let bytes: [u8; 4] = pos
        .checked_add(4)
        .and_then(|end| data.get(*pos..end))
        .and_then(|slice| slice.try_into().ok())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;
    *pos += 4;
    Ok(u32::from_le_bytes(bytes))
}
2412
/// Reads a little-endian `u64` from `data` starting at `*pos`.
///
/// On success `*pos` is advanced by eight bytes.
///
/// # Errors
/// Returns an error and leaves `*pos` unchanged when fewer than eight bytes
/// are available at `*pos`, including when `*pos` is so large that
/// `*pos + 8` would overflow.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let start = *pos;
    // `checked_add` + `get` reject out-of-range and overflowing offsets
    // without panicking or wrapping.
    let bytes = start
        .checked_add(8)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut raw = [0u8; 8];
    raw.copy_from_slice(bytes);
    *pos = start + 8;
    Ok(u64::from_le_bytes(raw))
}
2421
2422fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2423 let len = read_u32(data, pos)? as usize;
2424 if *pos + len > data.len() {
2425 return Err("unexpected end of data reading string".to_string());
2426 }
2427 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2428 *pos += len;
2429 Ok(s)
2430}
2431
2432#[cfg(test)]
2433mod tests {
2434 use super::*;
2435 use crate::config::{SemanticBackend, SemanticBackendConfig};
2436 use crate::parser::FileParser;
2437 use std::io::{Read, Write};
2438 use std::net::TcpListener;
2439 use std::thread;
2440
2441 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2442 where
2443 F: Fn(String, String, String) -> String + Send + 'static,
2444 {
2445 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2446 let addr = listener.local_addr().expect("local addr");
2447 let handle = thread::spawn(move || {
2448 let (mut stream, _) = listener.accept().expect("accept request");
2449 let mut buf = Vec::new();
2450 let mut chunk = [0u8; 4096];
2451 let mut header_end = None;
2452 let mut content_length = 0usize;
2453 loop {
2454 let n = stream.read(&mut chunk).expect("read request");
2455 if n == 0 {
2456 break;
2457 }
2458 buf.extend_from_slice(&chunk[..n]);
2459 if header_end.is_none() {
2460 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2461 header_end = Some(pos + 4);
2462 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2463 for line in headers.lines() {
2464 if let Some(value) = line.strip_prefix("Content-Length:") {
2465 content_length = value.trim().parse::<usize>().unwrap_or(0);
2466 }
2467 }
2468 }
2469 }
2470 if let Some(end) = header_end {
2471 if buf.len() >= end + content_length {
2472 break;
2473 }
2474 }
2475 }
2476
2477 let end = header_end.expect("header terminator");
2478 let request = String::from_utf8_lossy(&buf[..end]).to_string();
2479 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2480 let mut lines = request.lines();
2481 let request_line = lines.next().expect("request line").to_string();
2482 let path = request_line
2483 .split_whitespace()
2484 .nth(1)
2485 .expect("request path")
2486 .to_string();
2487 let response_body = handler(request_line, path, body);
2488 let response = format!(
2489 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2490 response_body.len(),
2491 response_body
2492 );
2493 stream
2494 .write_all(response.as_bytes())
2495 .expect("write response");
2496 });
2497
2498 (format!("http://{}", addr), handle)
2499 }
2500
2501 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2502 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2503 }
2504
2505 fn write_rust_file(path: &Path, function_name: &str) {
2506 fs::write(
2507 path,
2508 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
2509 )
2510 .unwrap();
2511 }
2512
2513 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2514 let mut embed = test_vector_for_texts;
2515 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2516 }
2517
2518 fn test_project_root() -> PathBuf {
2519 std::env::current_dir().unwrap()
2520 }
2521
2522 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2523 index.file_mtimes.insert(file.to_path_buf(), mtime);
2524 index.file_sizes.insert(file.to_path_buf(), size);
2525 index
2526 .file_hashes
2527 .insert(file.to_path_buf(), cache_freshness::zero_hash());
2528 }
2529
2530 #[test]
2531 fn semantic_cache_serialization_skips_paths_outside_project_root() {
2532 let dir = tempfile::tempdir().expect("create temp dir");
2533 let project = fs::canonicalize(dir.path()).expect("canonical project");
2534 let outside = project.join("..").join("outside.rs");
2535 let mut index = SemanticIndex::new(project.clone(), 3);
2536 index
2537 .file_mtimes
2538 .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2539 index.file_sizes.insert(outside.clone(), 1);
2540 index
2541 .file_hashes
2542 .insert(outside.clone(), cache_freshness::zero_hash());
2543 index.entries.push(EmbeddingEntry {
2544 chunk: SemanticChunk {
2545 file: outside,
2546 name: "outside".to_string(),
2547 kind: SymbolKind::Function,
2548 start_line: 0,
2549 end_line: 0,
2550 exported: false,
2551 embed_text: "outside".to_string(),
2552 snippet: "outside".to_string(),
2553 },
2554 vector: vec![1.0, 0.0, 0.0],
2555 });
2556
2557 let bytes = index.to_bytes();
2558 let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2559 assert_eq!(loaded.entries.len(), 0);
2560 assert!(loaded.file_mtimes.is_empty());
2561 }
2562
2563 #[test]
2564 fn test_cosine_similarity_identical() {
2565 let a = vec![1.0, 0.0, 0.0];
2566 let b = vec![1.0, 0.0, 0.0];
2567 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2568 }
2569
2570 #[test]
2571 fn test_cosine_similarity_orthogonal() {
2572 let a = vec![1.0, 0.0, 0.0];
2573 let b = vec![0.0, 1.0, 0.0];
2574 assert!(cosine_similarity(&a, &b).abs() < 0.001);
2575 }
2576
2577 #[test]
2578 fn test_cosine_similarity_opposite() {
2579 let a = vec![1.0, 0.0, 0.0];
2580 let b = vec![-1.0, 0.0, 0.0];
2581 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2582 }
2583
2584 #[test]
2585 fn test_serialization_roundtrip() {
2586 let project_root = test_project_root();
2587 let file = project_root.join("src/main.rs");
2588 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2589 index.entries.push(EmbeddingEntry {
2590 chunk: SemanticChunk {
2591 file: file.clone(),
2592 name: "handle_request".to_string(),
2593 kind: SymbolKind::Function,
2594 start_line: 10,
2595 end_line: 25,
2596 exported: true,
2597 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2598 snippet: "fn handle_request() {\n // ...\n}".to_string(),
2599 },
2600 vector: vec![0.1, 0.2, 0.3, 0.4],
2601 });
2602 index.dimension = 4;
2603 index
2604 .file_mtimes
2605 .insert(file.clone(), SystemTime::UNIX_EPOCH);
2606 index.file_sizes.insert(file, 0);
2607 index.set_fingerprint(SemanticIndexFingerprint {
2608 backend: "fastembed".to_string(),
2609 model: "all-MiniLM-L6-v2".to_string(),
2610 base_url: FALLBACK_BACKEND.to_string(),
2611 dimension: 4,
2612 chunking_version: default_chunking_version(),
2613 });
2614
2615 let bytes = index.to_bytes();
2616 let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2617
2618 assert_eq!(restored.entries.len(), 1);
2619 assert_eq!(restored.entries[0].chunk.name, "handle_request");
2620 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2621 assert_eq!(restored.dimension, 4);
2622 assert_eq!(restored.backend_label(), Some("fastembed"));
2623 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2624 }
2625
2626 #[test]
2627 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2628 let cases = [
2629 (SymbolKind::Function, 0),
2630 (SymbolKind::Class, 1),
2631 (SymbolKind::Method, 2),
2632 (SymbolKind::Struct, 3),
2633 (SymbolKind::Interface, 4),
2634 (SymbolKind::Enum, 5),
2635 (SymbolKind::TypeAlias, 6),
2636 (SymbolKind::Variable, 7),
2637 (SymbolKind::Heading, 8),
2638 (SymbolKind::FileSummary, 9),
2639 ];
2640
2641 for (kind, encoded) in cases {
2642 assert_eq!(symbol_kind_to_u8(&kind), encoded);
2643 assert_eq!(u8_to_symbol_kind(encoded), kind);
2644 }
2645 }
2646
2647 #[test]
2648 fn test_search_top_k() {
2649 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2650 index.dimension = 3;
2651
2652 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2654 let mut vec = vec![0.0f32; 3];
2655 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
2657 chunk: SemanticChunk {
2658 file: PathBuf::from("/src/lib.rs"),
2659 name: name.to_string(),
2660 kind: SymbolKind::Function,
2661 start_line: (i * 10 + 1) as u32,
2662 end_line: (i * 10 + 5) as u32,
2663 exported: true,
2664 embed_text: format!("kind:function name:{}", name),
2665 snippet: format!("fn {}() {{}}", name),
2666 },
2667 vector: vec,
2668 });
2669 }
2670
2671 let query = vec![0.9, 0.1, 0.0];
2673 let results = index.search(&query, 2);
2674
2675 assert_eq!(results.len(), 2);
2676 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
2678 }
2679
2680 #[test]
2681 fn test_empty_index_search() {
2682 let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2683 let results = index.search(&[0.1, 0.2, 0.3], 10);
2684 assert!(results.is_empty());
2685 }
2686
2687 #[test]
2688 fn single_line_symbol_builds_non_empty_snippet() {
2689 let symbol = Symbol {
2690 name: "answer".to_string(),
2691 kind: SymbolKind::Variable,
2692 range: crate::symbols::Range {
2693 start_line: 0,
2694 start_col: 0,
2695 end_line: 0,
2696 end_col: 24,
2697 },
2698 signature: Some("const answer = 42".to_string()),
2699 scope_chain: Vec::new(),
2700 exported: true,
2701 parent: None,
2702 };
2703 let source = "export const answer = 42;\n";
2704
2705 let snippet = build_snippet(&symbol, source);
2706
2707 assert_eq!(snippet, "export const answer = 42;");
2708 }
2709
2710 #[test]
2711 fn optimized_file_chunk_collection_matches_file_parser_path() {
2712 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2713 let file = project_root.join("src/semantic_index.rs");
2714 let source = std::fs::read_to_string(&file).unwrap();
2715
2716 let mut legacy_parser = FileParser::new();
2717 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2718 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2719
2720 let mut parsers = HashMap::new();
2721 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2722
2723 assert_eq!(
2724 chunk_fingerprint(&optimized_chunks),
2725 chunk_fingerprint(&legacy_chunks)
2726 );
2727 }
2728
2729 fn chunk_fingerprint(
2730 chunks: &[SemanticChunk],
2731 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2732 chunks
2733 .iter()
2734 .map(|chunk| {
2735 (
2736 chunk.name.clone(),
2737 chunk.kind.clone(),
2738 chunk.start_line,
2739 chunk.end_line,
2740 chunk.exported,
2741 chunk.embed_text.clone(),
2742 chunk.snippet.clone(),
2743 )
2744 })
2745 .collect()
2746 }
2747
2748 #[test]
2749 fn rejects_oversized_dimension_during_deserialization() {
2750 let mut bytes = Vec::new();
2751 bytes.push(1u8);
2752 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2753 bytes.extend_from_slice(&0u32.to_le_bytes());
2754 bytes.extend_from_slice(&0u32.to_le_bytes());
2755
2756 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2757 }
2758
2759 #[test]
2760 fn rejects_oversized_entry_count_during_deserialization() {
2761 let mut bytes = Vec::new();
2762 bytes.push(1u8);
2763 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2764 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2765 bytes.extend_from_slice(&0u32.to_le_bytes());
2766
2767 assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2768 }
2769
2770 #[test]
2771 fn invalidate_file_removes_entries_and_mtime() {
2772 let target = PathBuf::from("/src/main.rs");
2773 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2774 index.entries.push(EmbeddingEntry {
2775 chunk: SemanticChunk {
2776 file: target.clone(),
2777 name: "main".to_string(),
2778 kind: SymbolKind::Function,
2779 start_line: 0,
2780 end_line: 1,
2781 exported: false,
2782 embed_text: "main".to_string(),
2783 snippet: "fn main() {}".to_string(),
2784 },
2785 vector: vec![1.0; DEFAULT_DIMENSION],
2786 });
2787 index
2788 .file_mtimes
2789 .insert(target.clone(), SystemTime::UNIX_EPOCH);
2790 index.file_sizes.insert(target.clone(), 0);
2791
2792 index.invalidate_file(&target);
2793
2794 assert!(index.entries.is_empty());
2795 assert!(!index.file_mtimes.contains_key(&target));
2796 assert!(!index.file_sizes.contains_key(&target));
2797 }
2798
2799 #[test]
2800 fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2801 let temp = tempfile::tempdir().unwrap();
2802 let project_root = temp.path();
2803 let file = project_root.join("src/lib.rs");
2804 fs::create_dir_all(file.parent().unwrap()).unwrap();
2805 write_rust_file(&file, "kept_symbol");
2806
2807 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2808 let original_entry_count = index.entries.len();
2809 let original_mtime = *index.file_mtimes.get(&file).unwrap();
2810 let original_size = *index.file_sizes.get(&file).unwrap();
2811
2812 let stale_mtime = SystemTime::UNIX_EPOCH;
2813 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2814 fs::remove_file(&file).unwrap();
2815
2816 let mut embed = test_vector_for_texts;
2817 let mut progress = |_done: usize, _total: usize| {};
2818 let summary = index
2819 .refresh_stale_files(
2820 project_root,
2821 std::slice::from_ref(&file),
2822 &mut embed,
2823 8,
2824 &mut progress,
2825 )
2826 .unwrap();
2827
2828 assert_eq!(summary.changed, 0);
2829 assert_eq!(summary.added, 0);
2830 assert_eq!(summary.deleted, 0);
2831 assert_eq!(index.entries.len(), original_entry_count);
2832 assert!(index
2833 .entries
2834 .iter()
2835 .any(|entry| entry.chunk.name == "kept_symbol"));
2836 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2837 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2838 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2839 }
2840
2841 #[test]
2842 fn refresh_never_indexed_file_error_does_not_record_mtime() {
2843 let temp = tempfile::tempdir().unwrap();
2844 let project_root = temp.path();
2845 let missing = project_root.join("src/missing.rs");
2846 fs::create_dir_all(missing.parent().unwrap()).unwrap();
2847
2848 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2849 let mut embed = test_vector_for_texts;
2850 let mut progress = |_done: usize, _total: usize| {};
2851 let summary = index
2852 .refresh_stale_files(
2853 project_root,
2854 std::slice::from_ref(&missing),
2855 &mut embed,
2856 8,
2857 &mut progress,
2858 )
2859 .unwrap();
2860
2861 assert_eq!(summary.added, 0);
2862 assert_eq!(summary.changed, 0);
2863 assert_eq!(summary.deleted, 0);
2864 assert!(!index.file_mtimes.contains_key(&missing));
2865 assert!(!index.file_sizes.contains_key(&missing));
2866 assert!(index.entries.is_empty());
2867 }
2868
2869 #[test]
2870 fn refresh_reports_added_for_new_files() {
2871 let temp = tempfile::tempdir().unwrap();
2872 let project_root = temp.path();
2873 let existing = project_root.join("src/lib.rs");
2874 let added = project_root.join("src/new.rs");
2875 fs::create_dir_all(existing.parent().unwrap()).unwrap();
2876 write_rust_file(&existing, "existing_symbol");
2877 write_rust_file(&added, "added_symbol");
2878
2879 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2880 let mut embed = test_vector_for_texts;
2881 let mut progress = |_done: usize, _total: usize| {};
2882 let summary = index
2883 .refresh_stale_files(
2884 project_root,
2885 &[existing.clone(), added.clone()],
2886 &mut embed,
2887 8,
2888 &mut progress,
2889 )
2890 .unwrap();
2891
2892 assert_eq!(summary.added, 1);
2893 assert_eq!(summary.changed, 0);
2894 assert_eq!(summary.deleted, 0);
2895 assert_eq!(summary.total_processed, 2);
2896 assert!(index.file_mtimes.contains_key(&added));
2897 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2898 }
2899
2900 #[test]
2901 fn refresh_reports_deleted_for_removed_files() {
2902 let temp = tempfile::tempdir().unwrap();
2903 let project_root = temp.path();
2904 let deleted = project_root.join("src/deleted.rs");
2905 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2906 write_rust_file(&deleted, "deleted_symbol");
2907
2908 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2909 fs::remove_file(&deleted).unwrap();
2910
2911 let mut embed = test_vector_for_texts;
2912 let mut progress = |_done: usize, _total: usize| {};
2913 let summary = index
2914 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2915 .unwrap();
2916
2917 assert_eq!(summary.deleted, 1);
2918 assert_eq!(summary.changed, 0);
2919 assert_eq!(summary.added, 0);
2920 assert_eq!(summary.total_processed, 1);
2921 assert!(!index.file_mtimes.contains_key(&deleted));
2922 assert!(index.entries.is_empty());
2923 }
2924
2925 #[test]
2926 fn refresh_reports_changed_for_modified_files() {
2927 let temp = tempfile::tempdir().unwrap();
2928 let project_root = temp.path();
2929 let file = project_root.join("src/lib.rs");
2930 fs::create_dir_all(file.parent().unwrap()).unwrap();
2931 write_rust_file(&file, "old_symbol");
2932
2933 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2934 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2935 write_rust_file(&file, "new_symbol");
2936
2937 let mut embed = test_vector_for_texts;
2938 let mut progress = |_done: usize, _total: usize| {};
2939 let summary = index
2940 .refresh_stale_files(
2941 project_root,
2942 std::slice::from_ref(&file),
2943 &mut embed,
2944 8,
2945 &mut progress,
2946 )
2947 .unwrap();
2948
2949 assert_eq!(summary.changed, 1);
2950 assert_eq!(summary.added, 0);
2951 assert_eq!(summary.deleted, 0);
2952 assert_eq!(summary.total_processed, 1);
2953 assert!(index
2954 .entries
2955 .iter()
2956 .any(|entry| entry.chunk.name == "new_symbol"));
2957 assert!(!index
2958 .entries
2959 .iter()
2960 .any(|entry| entry.chunk.name == "old_symbol"));
2961 }
2962
2963 #[test]
2964 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
2965 let temp = tempfile::tempdir().unwrap();
2966 let project_root = temp.path();
2967 let file = project_root.join("src/lib.rs");
2968 fs::create_dir_all(file.parent().unwrap()).unwrap();
2969 write_rust_file(&file, "clean_symbol");
2970
2971 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2972 let original_entries = index.entries.len();
2973 let mut embed_called = false;
2974 let mut embed = |texts: Vec<String>| {
2975 embed_called = true;
2976 test_vector_for_texts(texts)
2977 };
2978 let mut progress = |_done: usize, _total: usize| {};
2979 let summary = index
2980 .refresh_stale_files(
2981 project_root,
2982 std::slice::from_ref(&file),
2983 &mut embed,
2984 8,
2985 &mut progress,
2986 )
2987 .unwrap();
2988
2989 assert!(summary.is_noop());
2990 assert_eq!(summary.total_processed, 1);
2991 assert!(!embed_called);
2992 assert_eq!(index.entries.len(), original_entries);
2993 }
2994
2995 #[test]
2996 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
2997 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
2998
2999 assert!(is_onnx_runtime_unavailable(message));
3000 }
3001
3002 #[test]
3003 fn formats_missing_onnx_runtime_with_install_hint() {
3004 let message = format_embedding_init_error(
3005 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3006 );
3007
3008 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3009 assert!(message.contains("Original error:"));
3010 }
3011
3012 #[test]
3013 fn openai_compatible_backend_embeds_with_mock_server() {
3014 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3015 assert!(request_line.starts_with("POST "));
3016 assert_eq!(path, "/v1/embeddings");
3017 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3018 });
3019
3020 let config = SemanticBackendConfig {
3021 backend: SemanticBackend::OpenAiCompatible,
3022 model: "test-embedding".to_string(),
3023 base_url: Some(base_url),
3024 api_key_env: None,
3025 timeout_ms: 5_000,
3026 max_batch_size: 64,
3027 };
3028
3029 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3030 let vectors = model
3031 .embed(vec!["hello".to_string(), "world".to_string()])
3032 .unwrap();
3033
3034 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3035 handle.join().unwrap();
3036 }
3037
3038 #[test]
3048 fn openai_compatible_request_has_single_content_type_header() {
3049 use std::sync::{Arc, Mutex};
3050 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3051 let captured_for_thread = Arc::clone(&captured);
3052
3053 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3054 let addr = listener.local_addr().expect("local addr");
3055 let handle = thread::spawn(move || {
3056 let (mut stream, _) = listener.accept().expect("accept");
3057 let mut buf = Vec::new();
3058 let mut chunk = [0u8; 4096];
3059 let mut header_end = None;
3060 let mut content_length = 0usize;
3061 loop {
3062 let n = stream.read(&mut chunk).expect("read");
3063 if n == 0 {
3064 break;
3065 }
3066 buf.extend_from_slice(&chunk[..n]);
3067 if header_end.is_none() {
3068 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3069 header_end = Some(pos + 4);
3070 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3071 if let Some(value) = line.strip_prefix("Content-Length:") {
3072 content_length = value.trim().parse::<usize>().unwrap_or(0);
3073 }
3074 }
3075 }
3076 }
3077 if let Some(end) = header_end {
3078 if buf.len() >= end + content_length {
3079 break;
3080 }
3081 }
3082 }
3083 *captured_for_thread.lock().unwrap() = buf;
3084 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3085 let response = format!(
3086 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3087 body.len(),
3088 body
3089 );
3090 let _ = stream.write_all(response.as_bytes());
3091 });
3092
3093 let config = SemanticBackendConfig {
3094 backend: SemanticBackend::OpenAiCompatible,
3095 model: "text-embedding-3-small".to_string(),
3096 base_url: Some(format!("http://{}", addr)),
3097 api_key_env: None,
3098 timeout_ms: 5_000,
3099 max_batch_size: 64,
3100 };
3101 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3102 let _ = model.embed(vec!["probe".to_string()]).unwrap();
3103 handle.join().unwrap();
3104
3105 let bytes = captured.lock().unwrap().clone();
3106 let request = String::from_utf8_lossy(&bytes);
3107
3108 let content_type_lines = request
3111 .lines()
3112 .filter(|line| {
3113 let lower = line.to_ascii_lowercase();
3114 lower.starts_with("content-type:")
3115 })
3116 .count();
3117 assert_eq!(
3118 content_type_lines, 1,
3119 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3120 );
3121
3122 assert!(
3125 request.contains(r#""model":"text-embedding-3-small""#),
3126 "request body should contain model field; full request:\n{request}",
3127 );
3128 }
3129
3130 #[test]
3131 fn ollama_backend_embeds_with_mock_server() {
3132 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3133 assert!(request_line.starts_with("POST "));
3134 assert_eq!(path, "/api/embed");
3135 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3136 });
3137
3138 let config = SemanticBackendConfig {
3139 backend: SemanticBackend::Ollama,
3140 model: "embeddinggemma".to_string(),
3141 base_url: Some(base_url),
3142 api_key_env: None,
3143 timeout_ms: 5_000,
3144 max_batch_size: 64,
3145 };
3146
3147 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3148 let vectors = model
3149 .embed(vec!["hello".to_string(), "world".to_string()])
3150 .unwrap();
3151
3152 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3153 handle.join().unwrap();
3154 }
3155
3156 #[test]
3157 fn read_from_disk_rejects_fingerprint_mismatch() {
3158 let storage = tempfile::tempdir().unwrap();
3159 let project_key = "proj";
3160
3161 let project_root = test_project_root();
3162 let file = project_root.join("src/main.rs");
3163 let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3164 index.entries.push(EmbeddingEntry {
3165 chunk: SemanticChunk {
3166 file: file.clone(),
3167 name: "handle_request".to_string(),
3168 kind: SymbolKind::Function,
3169 start_line: 10,
3170 end_line: 25,
3171 exported: true,
3172 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3173 snippet: "fn handle_request() {}".to_string(),
3174 },
3175 vector: vec![0.1, 0.2, 0.3],
3176 });
3177 index.dimension = 3;
3178 index
3179 .file_mtimes
3180 .insert(file.clone(), SystemTime::UNIX_EPOCH);
3181 index.file_sizes.insert(file, 0);
3182 index.set_fingerprint(SemanticIndexFingerprint {
3183 backend: "openai_compatible".to_string(),
3184 model: "test-embedding".to_string(),
3185 base_url: "http://127.0.0.1:1234/v1".to_string(),
3186 dimension: 3,
3187 chunking_version: default_chunking_version(),
3188 });
3189 index.write_to_disk(storage.path(), project_key);
3190
3191 let matching = index.fingerprint().unwrap().as_string();
3192 assert!(SemanticIndex::read_from_disk(
3193 storage.path(),
3194 project_key,
3195 &project_root,
3196 false,
3197 Some(&matching),
3198 )
3199 .is_some());
3200
3201 let mismatched = SemanticIndexFingerprint {
3202 backend: "ollama".to_string(),
3203 model: "embeddinggemma".to_string(),
3204 base_url: "http://127.0.0.1:11434".to_string(),
3205 dimension: 3,
3206 chunking_version: default_chunking_version(),
3207 }
3208 .as_string();
3209 assert!(SemanticIndex::read_from_disk(
3210 storage.path(),
3211 project_key,
3212 &project_root,
3213 false,
3214 Some(&mismatched),
3215 )
3216 .is_none());
3217 }
3218
3219 #[test]
3220 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3221 let storage = tempfile::tempdir().unwrap();
3222 let project_key = "proj-v3";
3223 let dir = storage.path().join("semantic").join(project_key);
3224 fs::create_dir_all(&dir).unwrap();
3225
3226 let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3227 index.entries.push(EmbeddingEntry {
3228 chunk: SemanticChunk {
3229 file: PathBuf::from("/src/main.rs"),
3230 name: "handle_request".to_string(),
3231 kind: SymbolKind::Function,
3232 start_line: 0,
3233 end_line: 0,
3234 exported: true,
3235 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3236 snippet: "fn handle_request() {}".to_string(),
3237 },
3238 vector: vec![0.1, 0.2, 0.3],
3239 });
3240 index.dimension = 3;
3241 index
3242 .file_mtimes
3243 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3244 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3245 let fingerprint = SemanticIndexFingerprint {
3246 backend: "fastembed".to_string(),
3247 model: "test".to_string(),
3248 base_url: FALLBACK_BACKEND.to_string(),
3249 dimension: 3,
3250 chunking_version: default_chunking_version(),
3251 };
3252 index.set_fingerprint(fingerprint.clone());
3253
3254 let mut bytes = index.to_bytes();
3255 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3256 fs::write(dir.join("semantic.bin"), bytes).unwrap();
3257
3258 assert!(SemanticIndex::read_from_disk(
3259 storage.path(),
3260 project_key,
3261 &test_project_root(),
3262 false,
3263 Some(&fingerprint.as_string())
3264 )
3265 .is_none());
3266 assert!(!dir.join("semantic.bin").exists());
3267 }
3268
3269 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3270 crate::symbols::Symbol {
3271 name: name.to_string(),
3272 kind,
3273 range: crate::symbols::Range {
3274 start_line: start,
3275 start_col: 0,
3276 end_line: end,
3277 end_col: 0,
3278 },
3279 signature: None,
3280 scope_chain: Vec::new(),
3281 exported: false,
3282 parent: None,
3283 }
3284 }
3285
3286 #[test]
3291 fn symbols_to_chunks_skips_heading_symbols() {
3292 let project_root = PathBuf::from("/proj");
3293 let file = project_root.join("README.md");
3294 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3295
3296 let symbols = vec![
3297 make_symbol(SymbolKind::Heading, "Title", 0, 2),
3298 make_symbol(SymbolKind::Heading, "Section", 4, 6),
3299 ];
3300
3301 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3302 assert!(
3303 chunks.is_empty(),
3304 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3305 chunks.len()
3306 );
3307 }
3308
3309 #[test]
3313 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3314 let project_root = PathBuf::from("/proj");
3315 let file = project_root.join("src/lib.rs");
3316 let source = "pub fn handle_request() -> bool {\n true\n}\n";
3317
3318 let symbols = vec![
3319 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3321 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3322 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3323 ];
3324
3325 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3326 assert_eq!(
3327 chunks.len(),
3328 3,
3329 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3330 chunks.len()
3331 );
3332 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3333 assert!(chunks
3334 .iter()
3335 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3336 assert!(names.contains(&"handle_request"));
3337 assert!(names.contains(&"AuthService"));
3338 assert!(
3339 !names.contains(&"doc heading"),
3340 "Heading symbol leaked into chunks: {names:?}"
3341 );
3342 }
3343
3344 #[test]
3345 fn validate_ssrf_allows_loopback_hostnames() {
3346 for host in &[
3349 "http://localhost",
3350 "http://localhost:8080",
3351 "http://localhost:11434", "http://localhost.localdomain",
3353 "http://foo.localhost",
3354 ] {
3355 assert!(
3356 validate_base_url_no_ssrf(host).is_ok(),
3357 "Expected {host} to be allowed (loopback), got: {:?}",
3358 validate_base_url_no_ssrf(host)
3359 );
3360 }
3361 }
3362
3363 #[test]
3364 fn validate_ssrf_allows_loopback_ips() {
3365 for url in &[
3368 "http://127.0.0.1",
3369 "http://127.0.0.1:11434", "http://127.0.0.1:8080",
3371 "http://127.1.2.3",
3372 ] {
3373 let result = validate_base_url_no_ssrf(url);
3374 assert!(
3375 result.is_ok(),
3376 "Expected {url} to be allowed (loopback), got: {:?}",
3377 result
3378 );
3379 }
3380 }
3381
3382 #[test]
3383 fn validate_ssrf_rejects_private_non_loopback_ips() {
3384 for url in &[
3389 "http://192.168.1.1",
3390 "http://10.0.0.1",
3391 "http://172.16.0.1",
3392 "http://169.254.169.254",
3393 "http://100.64.0.1",
3394 ] {
3395 let result = validate_base_url_no_ssrf(url);
3396 assert!(
3397 result.is_err(),
3398 "Expected {url} to be rejected (non-loopback private), got: {:?}",
3399 result
3400 );
3401 }
3402 }
3403
3404 #[test]
3405 fn validate_ssrf_rejects_mdns_local_hostnames() {
3406 for host in &[
3409 "http://printer.local",
3410 "http://nas.local:8080",
3411 "http://homelab.local",
3412 ] {
3413 let result = validate_base_url_no_ssrf(host);
3414 assert!(
3415 result.is_err(),
3416 "Expected {host} to be rejected (mDNS), got: {:?}",
3417 result
3418 );
3419 }
3420 }
3421
3422 #[test]
3423 fn normalize_base_url_allows_localhost_for_tests() {
3424 assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3427 assert!(normalize_base_url("http://localhost:8080").is_ok());
3428 }
3429
3430 #[test]
3437 fn ort_mismatch_message_recommends_auto_fix_first() {
3438 let msg =
3439 format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3440
3441 assert!(
3443 msg.contains("v1.9.0"),
3444 "should report detected version: {msg}"
3445 );
3446 assert!(
3447 msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3448 "should report system path: {msg}"
3449 );
3450 assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3451
3452 let auto_fix_pos = msg
3454 .find("Auto-fix")
3455 .expect("Auto-fix solution missing — users won't discover --fix");
3456 let remove_pos = msg
3457 .find("Remove the old library")
3458 .expect("system-rm solution missing");
3459 assert!(
3460 auto_fix_pos < remove_pos,
3461 "Auto-fix must come before manual rm — see PR comment thread"
3462 );
3463
3464 assert!(
3466 msg.contains("npx @cortexkit/aft doctor --fix"),
3467 "auto-fix command must be present and copy-pasteable: {msg}"
3468 );
3469 }
3470
3471 #[test]
3475 fn ort_mismatch_message_handles_macos_dylib_path() {
3476 let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3477 assert!(msg.contains("v1.9.0"));
3478 assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3479 assert!(
3483 msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3484 "system path should be quoted in the auto-fix sentence: {msg}"
3485 );
3486 }
3487}