use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
use crate::config::{SemanticBackend, SemanticBackendConfig};
use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
use crate::symbols::{Symbol, SymbolKind};
use crate::{slog_info, slog_warn};

use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
use rayon::prelude::*;
use reqwest::blocking::Client;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet, VecDeque};
use std::env;
use std::fmt::Display;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::Duration;
use std::time::SystemTime;
use tree_sitter::Parser;
use url::Url;

const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
const MAX_DIMENSION: usize = 1024;
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// On-disk cache versions. Bumped whenever the binary layout changes;
// per-version decoding lives in `SemanticIndex::from_bytes`.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
const FALLBACK_BACKEND: &str = "none";
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    pub backend: String,
    pub model: String,
    #[serde(default)]
    pub base_url: String,
    pub dimension: usize,
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}

fn default_chunking_version() -> u32 {
    2
}

impl SemanticIndexFingerprint {
    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
        let base_url = config
            .base_url
            .as_ref()
            .and_then(|u| normalize_base_url(u).ok())
            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
        Self {
            backend: config.backend.as_str().to_string(),
            model: config.model.clone(),
            base_url,
            dimension,
            chunking_version: default_chunking_version(),
        }
    }

    pub fn as_string(&self) -> String {
        serde_json::to_string(self).unwrap_or_else(|_| String::new())
    }

    fn matches_expected(&self, expected: &str) -> bool {
        let encoded = self.as_string();
        !encoded.is_empty() && encoded == expected
    }
}
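
// Illustrative serialized fingerprint (the field values here are hypothetical;
// the actual string depends on the configured backend):
// {"backend":"fastembed","model":"all-MiniLM-L6-v2","base_url":"none","dimension":384,"chunking_version":2}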

enum SemanticEmbeddingEngine {
    Fastembed(TextEmbedding),
    OpenAiCompatible {
        client: Client,
        model: String,
        base_url: String,
        api_key: Option<String>,
    },
    Ollama {
        client: Client,
        model: String,
        base_url: String,
    },
}

pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    base_url: Option<String>,
    timeout_ms: u64,
    max_batch_size: usize,
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    query_embedding_cache: HashMap<String, Vec<f32>>,
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}

pub type EmbeddingModel = SemanticEmbeddingModel;

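/// Sanity-checks a batch returned by an embedding backend: exactly one vector
/// per input, all vectors the same length. A minimal sketch of expected
/// behavior (inputs are hypothetical):
///
/// ```ignore
/// let vectors = vec![vec![0.1_f32; 384], vec![0.2_f32; 384]];
/// assert!(validate_embedding_batch(&vectors, 2, "test backend").is_ok());
/// assert!(validate_embedding_batch(&vectors, 3, "test backend").is_err());
/// ```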
fn validate_embedding_batch(
    vectors: &[Vec<f32>],
    expected_count: usize,
    context: &str,
) -> Result<(), String> {
    if expected_count > 0 && vectors.is_empty() {
        return Err(format!(
            "{context} returned no vectors for {expected_count} inputs"
        ));
    }

    if vectors.len() != expected_count {
        return Err(format!(
            "{context} returned {} vectors for {} inputs",
            vectors.len(),
            expected_count
        ));
    }

    let Some(first_vector) = vectors.first() else {
        return Ok(());
    };
    let expected_dimension = first_vector.len();
    for (index, vector) in vectors.iter().enumerate() {
        if vector.len() != expected_dimension {
            return Err(format!(
                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
                vector.len()
            ));
        }
    }

    Ok(())
}

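/// Parses a configured base URL, accepting only http/https, and trims any
/// trailing slash. Illustrative behavior:
///
/// ```ignore
/// assert_eq!(
///     normalize_base_url("http://localhost:11434/").unwrap(),
///     "http://localhost:11434"
/// );
/// assert!(normalize_base_url("ftp://example.com").is_err());
/// ```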
fn normalize_base_url(raw: &str) -> Result<String, String> {
    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
    let scheme = parsed.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(format!(
            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
            scheme
        ));
    }
    Ok(parsed.to_string().trim_end_matches('/').to_string())
}

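/// Guards configured embedding endpoints against SSRF-style targets.
///
/// Policy, as implemented below:
/// - loopback hostnames (localhost and friends) are always allowed;
/// - `.local` mDNS names are rejected outright;
/// - any other host is resolved and private/reserved addresses are rejected;
/// - if DNS resolution fails, validation passes and the request simply fails
///   later at connect time.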
pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
    use std::net::{IpAddr, ToSocketAddrs};

    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;

    let host = parsed.host_str().unwrap_or("");

    // Loopback hostnames are always allowed (local embedding servers).
    let is_loopback_host =
        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
    if is_loopback_host {
        return Ok(());
    }

    // Reject mDNS names outright: they resolve to link-local peers.
    if host.ends_with(".local") {
        return Err(format!(
            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
        ));
    }

    // Resolve the host and reject private/reserved addresses. Resolution
    // failure is not an error here; the request will fail at connect time.
    let port = parsed.port_or_known_default().unwrap_or(443);
    let addr_str = format!("{host}:{port}");
    let addrs: Vec<IpAddr> = addr_str
        .to_socket_addrs()
        .map(|iter| iter.map(|sa| sa.ip()).collect())
        .unwrap_or_default();
    for ip in &addrs {
        if is_private_non_loopback_ip(ip) {
            return Err(format!(
                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
            ));
        }
    }

    Ok(())
}

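/// Returns true for private/reserved IPv4 ranges (RFC 1918, link-local,
/// CGNAT, 0.0.0.0/8) and their IPv6 counterparts (link-local, unique-local,
/// and IPv4-mapped addresses embedding a private IPv4). Loopback addresses
/// intentionally return false so local endpoints stay usable.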
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::{IpAddr, Ipv4Addr};
    match ip {
        IpAddr::V4(v4) => {
            let o = v4.octets();
            o[0] == 10 // 10.0.0.0/8
                || (o[0] == 172 && (16..=31).contains(&o[1])) // 172.16.0.0/12
                || (o[0] == 192 && o[1] == 168) // 192.168.0.0/16
                || (o[0] == 169 && o[1] == 254) // 169.254.0.0/16 link-local
                || (o[0] == 100 && (64..=127).contains(&o[1])) // 100.64.0.0/10 CGNAT
                || o[0] == 0 // 0.0.0.0/8
        }
        IpAddr::V6(v6) => {
            let segments = v6.segments();
            // fe80::/10 link-local
            (segments[0] & 0xffc0) == 0xfe80
                // fc00::/7 unique-local
                || (segments[0] & 0xfe00) == 0xfc00
                // ::ffff:a.b.c.d IPv4-mapped: recurse on the embedded address
                || (segments[..6] == [0, 0, 0, 0, 0, 0xffff] && {
                    let (a, b) = (segments[6], segments[7]);
                    let ipv4 = Ipv4Addr::new(
                        (a >> 8) as u8,
                        (a & 0xff) as u8,
                        (b >> 8) as u8,
                        (b & 0xff) as u8,
                    );
                    is_private_non_loopback_ip(&IpAddr::V4(ipv4))
                })
        }
    }
}

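/// Joins the normalized base URL with the OpenAI-style embeddings path,
/// avoiding a doubled `/v1`. Illustrative:
/// `http://127.0.0.1:8080` -> `http://127.0.0.1:8080/v1/embeddings`,
/// `http://127.0.0.1:8080/v1` -> `http://127.0.0.1:8080/v1/embeddings`.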
fn build_openai_embeddings_endpoint(base_url: &str) -> String {
    if base_url.ends_with("/v1") {
        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
    } else {
        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
    }
}

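/// Same idea for Ollama: `http://localhost:11434` ->
/// `http://localhost:11434/api/embed`, without doubling a base URL that
/// already ends in `/api`.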
fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
    if base_url.ends_with("/api") {
        format!("{base_url}/embed")
    } else {
        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
    }
}

fn normalize_api_key(value: Option<String>) -> Option<String> {
    value.and_then(|token| {
        let token = token.trim();
        if token.is_empty() {
            None
        } else {
            Some(token.to_string())
        }
    })
}

fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
}

fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}

fn sleep_before_embedding_retry(attempt_index: usize) {
    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
        std::thread::sleep(Duration::from_millis(*delay_ms));
    }
}

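/// Sends an embedding request with up to `EMBEDDING_REQUEST_MAX_ATTEMPTS`
/// attempts. Connection errors, HTTP 5xx, and HTTP 429 are retried after the
/// delays in `EMBEDDING_REQUEST_BACKOFF_MS` (500 ms, then 1 s); anything else
/// fails immediately. The closure rebuilds the request for each attempt
/// because a `RequestBuilder` is consumed by `send()`.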
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} request failed: {error}"));
            }
        };

        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} response read failed: {error}"));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        if !last_attempt && is_retryable_embedding_status(status) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        return Err(format!(
            "{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    unreachable!("embedding request retries exhausted without returning")
}

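// A minimal usage sketch (the `config` value is hypothetical; its fields
// follow SemanticBackendConfig for the chosen backend):
//
//     let mut model = SemanticEmbeddingModel::from_config(&config)?;
//     let dimension = model.dimension()?; // probes the backend once, then caches
//     let vectors = model.embed(vec!["fn main() {}".to_string()])?;
//     assert_eq!(vectors[0].len(), dimension);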
impl SemanticEmbeddingModel {
    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
        let timeout_ms = if config.timeout_ms == 0 {
            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
        } else {
            config.timeout_ms
        };

        let max_batch_size = if config.max_batch_size == 0 {
            DEFAULT_MAX_BATCH_SIZE
        } else {
            config.max_batch_size
        };

        let api_key_env = normalize_api_key(config.api_key_env.clone());
        let model = config.model.clone();

        let client = Client::builder()
            .timeout(Duration::from_millis(timeout_ms))
            .redirect(reqwest::redirect::Policy::none())
            .build()
            .map_err(|error| format!("failed to configure embedding client: {error}"))?;

        let engine = match config.backend {
            SemanticBackend::Fastembed => {
                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
            }
            SemanticBackend::OpenAiCompatible => {
                let raw = config.base_url.as_ref().ok_or_else(|| {
                    "base_url is required for openai_compatible backend".to_string()
                })?;
                let base_url = normalize_base_url(raw)?;

                let api_key = match api_key_env {
                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
                    })?),
                    None => None,
                };

                SemanticEmbeddingEngine::OpenAiCompatible {
                    client,
                    model,
                    base_url,
                    api_key,
                }
            }
            SemanticBackend::Ollama => {
                let raw = config
                    .base_url
                    .as_ref()
                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
                let base_url = normalize_base_url(raw)?;

                SemanticEmbeddingEngine::Ollama {
                    client,
                    model,
                    base_url,
                }
            }
        };

        Ok(Self {
            backend: config.backend,
            model: config.model.clone(),
            base_url: config.base_url.clone(),
            timeout_ms,
            max_batch_size,
            dimension: None,
            engine,
            query_embedding_cache: HashMap::new(),
            query_embedding_cache_order: VecDeque::new(),
            query_embedding_cache_hits: 0,
            query_embedding_cache_misses: 0,
        })
    }

    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }

    pub fn model(&self) -> &str {
        &self.model
    }

    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }

    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }

    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }

    pub fn fingerprint(
        &mut self,
        config: &SemanticBackendConfig,
    ) -> Result<SemanticIndexFingerprint, String> {
        let dimension = self.dimension()?;
        Ok(SemanticIndexFingerprint::from_config(config, dimension))
    }

    pub fn dimension(&mut self) -> Result<usize, String> {
        if let Some(dimension) = self.dimension {
            return Ok(dimension);
        }

        let dimension = match &mut self.engine {
            SemanticEmbeddingEngine::Fastembed(model) => {
                let vectors = model
                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
                vectors
                    .first()
                    .map(|v| v.len())
                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
            }
            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
                let vectors =
                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
                vectors
                    .first()
                    .map(|v| v.len())
                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
            }
            SemanticEmbeddingEngine::Ollama { .. } => {
                let vectors =
                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
                vectors
                    .first()
                    .map(|v| v.len())
                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
            }
        };

        self.dimension = Some(dimension);
        Ok(dimension)
    }

    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }

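    /// Embeds a search query with a small FIFO-evicted memo cache (capacity
    /// `QUERY_EMBEDDING_CACHE_CAP`), so repeated queries skip the backend
    /// round-trip. Hit/miss counters are exposed via
    /// `query_embedding_cache_stats`.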
    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
        if let Some(vector) = self.query_embedding_cache.get(query) {
            self.query_embedding_cache_hits += 1;
            return Ok(vector.clone());
        }

        self.query_embedding_cache_misses += 1;
        let embeddings = self.embed_texts(vec![query.to_string()])?;
        let vector = embeddings
            .first()
            .cloned()
            .ok_or_else(|| "embedding model returned no query vector".to_string())?;

        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
                self.query_embedding_cache.remove(&oldest);
            }
        }
        self.query_embedding_cache
            .insert(query.to_string(), vector.clone());
        self.query_embedding_cache_order
            .push_back(query.to_string());

        Ok(vector)
    }

    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
        (
            self.query_embedding_cache_hits,
            self.query_embedding_cache_misses,
            self.query_embedding_cache.len(),
        )
    }

    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        match &mut self.engine {
            SemanticEmbeddingEngine::Fastembed(model) => {
                model.embed(texts, None::<usize>).map_err(|error| {
                    format!(
                        "failed to embed batch: {}",
                        format_embedding_init_error(error.to_string())
                    )
                })
            }
            SemanticEmbeddingEngine::OpenAiCompatible {
                client,
                model,
                base_url,
                api_key,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_openai_embeddings_endpoint(base_url);
                let body = serde_json::json!({
                    "input": texts,
                    "model": model,
                });

                let raw = send_embedding_request(
                    || {
                        let mut request = client.post(&endpoint).json(&body);

                        if let Some(api_key) = api_key {
                            request = request.header("Authorization", format!("Bearer {api_key}"));
                        }

                        request
                    },
                    "openai compatible",
                )?;

                #[derive(Deserialize)]
                struct OpenAiResponse {
                    data: Vec<OpenAiEmbeddingResult>,
                }

                #[derive(Deserialize)]
                struct OpenAiEmbeddingResult {
                    embedding: Vec<f32>,
                    index: Option<u32>,
                }

                let parsed: OpenAiResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
                if parsed.data.len() != expected_text_count {
                    return Err(format!(
                        "openai compatible response returned {} embeddings for {} inputs",
                        parsed.data.len(),
                        expected_text_count
                    ));
                }

                // Reassemble in input order; the API may tag out-of-order
                // results with an explicit index.
                let mut vectors = vec![Vec::new(); parsed.data.len()];
                for (i, item) in parsed.data.into_iter().enumerate() {
                    let index = item.index.unwrap_or(i as u32) as usize;
                    if index >= vectors.len() {
                        return Err(
                            "openai compatible response contains invalid vector index".to_string()
                        );
                    }
                    vectors[index] = item.embedding;
                }

                for vector in &vectors {
                    if vector.is_empty() {
                        return Err(
                            "openai compatible response contained missing vectors".to_string()
                        );
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
            SemanticEmbeddingEngine::Ollama {
                client,
                model,
                base_url,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_ollama_embeddings_endpoint(base_url);

                #[derive(Serialize)]
                struct OllamaPayload<'a> {
                    model: &'a str,
                    input: Vec<String>,
                }

                let payload = OllamaPayload {
                    model,
                    input: texts,
                };

                let raw =
                    send_embedding_request(|| client.post(&endpoint).json(&payload), "ollama")?;

                #[derive(Deserialize)]
                struct OllamaResponse {
                    embeddings: Vec<Vec<f32>>,
                }

                let parsed: OllamaResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid ollama response: {error}"))?;
                if parsed.embeddings.is_empty() {
                    return Err("ollama response returned no embeddings".to_string());
                }
                if parsed.embeddings.len() != expected_text_count {
                    return Err(format!(
                        "ollama response returned {} embeddings for {} inputs",
                        parsed.embeddings.len(),
                        expected_text_count
                    ));
                }

                let vectors = parsed.embeddings;
                for vector in &vectors {
                    if vector.is_empty() {
                        return Err("ollama response contained empty embeddings".to_string());
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
        }
    }
}

/// Confirms a loadable ONNX Runtime is present before fastembed tries to
/// initialize it, so users get an actionable error instead of a loader panic.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Best-effort version sniffing from the library path/filename.
            let detected_version = detect_ort_version_from_path(lib_name);

            libc::dlclose(handle);

            // Require ONNX Runtime 1.20+ when a version could be detected.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // No dlopen-style pre-check on Windows; the binding keeps the
        // variable used on this target.
        let _ = dylib_path;
    }

    Ok(())
}

#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
    let path = std::path::Path::new(lib_path);

    // Try the literal filename first, then the canonicalized target, since
    // version numbers often live in a symlink target's name.
    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
        .into_iter()
        .flatten()
    {
        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
            if let Some(version) = extract_version_from_filename(name) {
                return Some(version);
            }
        }
    }

    // Fall back to scanning the directory for versioned sibling libraries.
    if let Some(parent) = path.parent() {
        if let Ok(entries) = std::fs::read_dir(parent) {
            for entry in entries.flatten() {
                if let Some(name) = entry.file_name().to_str() {
                    if name.starts_with("libonnxruntime") {
                        if let Some(version) = extract_version_from_filename(name) {
                            return Some(version);
                        }
                    }
                }
            }
        }
    }

    None
}

#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn extract_version_from_filename(name: &str) -> Option<String> {
    // e.g. "libonnxruntime.1.24.0.dylib" -> "1.24.0"
    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
    re.find(name).map(|m| m.as_str().to_string())
}

#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    if lib_path.starts_with("/usr/local/lib")
        || lib_path == "libonnxruntime.so"
        || lib_path == "libonnxruntime.dylib"
    {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return " Delete the ONNX Runtime DLL from your PATH".to_string();
    }
    format!(" rm '{}'", lib_path)
}

/// Builds the user-facing message for an ONNX Runtime version mismatch,
/// covering auto-fix, manual removal, system-wide install, and diagnostics.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
    format!(
        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
         Solutions:\n\
         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
         configures the bridge to load it instead of the system library — no \
         changes to '{}'.\n\
         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
         {}\n\
         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
        version,
        lib_name,
        lib_name,
        suggest_removal_command(lib_name),
    )
}

pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
    pre_validate_onnx_runtime()?;

    let selected_model = match model {
        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
        _ => {
            return Err(format!(
                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
                model
            ))
        }
    };

    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
}

pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let message = message.to_ascii_lowercase();
    let mentions_onnx_runtime = ["onnx runtime", "onnxruntime", "libonnxruntime"]
        .iter()
        .any(|pattern| message.contains(pattern));
    let mentions_dynamic_load_failure = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ]
    .iter()
    .any(|pattern| message.contains(pattern));

    mentions_onnx_runtime && mentions_dynamic_load_failure
}

fn format_embedding_init_error(error: impl Display) -> String {
    let message = error.to_string();

    if is_onnx_runtime_unavailable(&message) {
        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
    }

    format!("failed to initialize semantic embedding model: {message}")
}

#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute path of the source file this chunk came from.
    pub file: PathBuf,
    /// Symbol name (or file stem for file-summary chunks).
    pub name: String,
    /// Symbol kind, e.g. function, class, or file summary.
    pub kind: SymbolKind,
    /// First line of the symbol (0-based).
    pub start_line: u32,
    /// Last line of the symbol (0-based, inclusive).
    pub end_line: u32,
    pub exported: bool,
    /// Text sent to the embedding backend.
    pub embed_text: String,
    /// Short preview returned with search results.
    pub snippet: String,
}

/// A chunk paired with its embedding vector.
#[derive(Debug)]
struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}

#[derive(Debug)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    /// Last-seen mtime per indexed file.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Last-seen size per indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash per indexed file (zero hash when unknown).
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    dimension: usize,
    /// Backend/model fingerprint the index was built with, if known.
    fingerprint: Option<SemanticIndexFingerprint>,
    project_root: PathBuf,
}

#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}

/// Outcome of an incremental refresh: how many files were re-embedded,
/// newly indexed, or dropped.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    pub fn is_noop(&self) -> bool {
        self.changed == 0 && self.added == 0 && self.deleted == 0
    }
}

#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    pub score: f32,
    pub source: &'static str,
}

impl SemanticIndex {
    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
        debug_assert!(project_root.is_absolute());
        Self {
            entries: Vec::new(),
            file_mtimes: HashMap::new(),
            file_sizes: HashMap::new(),
            file_hashes: HashMap::new(),
            dimension,
            fingerprint: None,
            project_root,
        }
    }

    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }

    pub fn status_label(&self) -> &'static str {
        if self.entries.is_empty() {
            "empty"
        } else {
            "ready"
        }
    }

    fn collect_chunks(
        project_root: &Path,
        files: &[PathBuf],
    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
        let per_file: Vec<(
            PathBuf,
            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
        )> = files
            .par_iter()
            .map_init(HashMap::new, |parsers, file| {
                let result = collect_file_metadata(file).and_then(|metadata| {
                    collect_file_chunks(project_root, file, parsers)
                        .map(|chunks| (metadata, chunks))
                });
                (file.clone(), result)
            })
            .collect();

        let mut chunks: Vec<SemanticChunk> = Vec::new();
        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();

        for (file, result) in per_file {
            match result {
                Ok((metadata, file_chunks)) => {
                    file_metadata.insert(file, metadata);
                    chunks.extend(file_chunks);
                }
                Err(error) => {
                    // Unsupported extensions are expected and skipped quietly;
                    // anything else is worth a warning.
                    if error == "unsupported file extension" {
                        continue;
                    }
                    slog_warn!(
                        "failed to collect semantic chunks for {}: {}",
                        file.display(),
                        error
                    );
                }
            }
        }

        (chunks, file_metadata)
    }

    fn build_from_chunks<F, P>(
        project_root: &Path,
        chunks: Vec<SemanticChunk>,
        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
        embed_fn: &mut F,
        max_batch_size: usize,
        mut progress: Option<&mut P>,
    ) -> Result<Self, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        debug_assert!(project_root.is_absolute());
        let total_chunks = chunks.len();

        if chunks.is_empty() {
            return Ok(Self {
                entries: Vec::new(),
                file_mtimes: file_metadata
                    .iter()
                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
                    .collect(),
                file_sizes: file_metadata
                    .iter()
                    .map(|(path, metadata)| (path.clone(), metadata.size))
                    .collect(),
                file_hashes: file_metadata
                    .into_iter()
                    .map(|(path, metadata)| (path, metadata.content_hash))
                    .collect(),
                dimension: DEFAULT_DIMENSION,
                fingerprint: None,
                project_root: project_root.to_path_buf(),
            });
        }

        // Embed in fixed-size batches, validating each batch as it arrives.
        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut expected_dimension: Option<usize> = None;
        let batch_size = max_batch_size.max(1);
        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // The backend must keep a stable dimension across batches.
            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match expected_dimension {
                    None => expected_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed across batches: expected {expected}, got {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            if let Some(callback) = progress.as_mut() {
                callback(entries.len(), total_chunks);
            }
        }

        let dimension = entries
            .first()
            .map(|e| e.vector.len())
            .unwrap_or(DEFAULT_DIMENSION);

        Ok(Self {
            entries,
            file_mtimes: file_metadata
                .iter()
                .map(|(path, metadata)| (path.clone(), metadata.mtime))
                .collect(),
            file_sizes: file_metadata
                .iter()
                .map(|(path, metadata)| (path.clone(), metadata.size))
                .collect(),
            file_hashes: file_metadata
                .into_iter()
                .map(|(path, metadata)| (path, metadata.content_hash))
                .collect(),
            dimension,
            fingerprint: None,
            project_root: project_root.to_path_buf(),
        })
    }

    /// Builds a full index for `files`, embedding in batches of
    /// `max_batch_size`. Convenience wrapper over `build_from_chunks`
    /// without progress reporting.
    pub fn build<F>(
        project_root: &Path,
        files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
    ) -> Result<Self, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    {
        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
        Self::build_from_chunks(
            project_root,
            chunks,
            file_mtimes,
            embed_fn,
            max_batch_size,
            Option::<&mut fn(usize, usize)>::None,
        )
    }

    /// Like `build`, but reports embedding progress via `progress(done, total)`.
    pub fn build_with_progress<F, P>(
        project_root: &Path,
        files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<Self, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
        let total_chunks = chunks.len();
        progress(0, total_chunks);
        Self::build_from_chunks(
            project_root,
            chunks,
            file_mtimes,
            embed_fn,
            max_batch_size,
            Some(progress),
        )
    }

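    /// Incrementally refreshes the index against `current_files`: verifies
    /// cached freshness (mtime, size, content hash) for each indexed file,
    /// drops entries for deleted files, and re-embeds changed and newly added
    /// files. Untouched files keep their existing vectors, so a no-op refresh
    /// costs no embedding calls. Returns per-category counts in a
    /// `RefreshSummary`.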
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Total files considered: the union of current and previously indexed.
        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Partition previously indexed files into deleted and changed.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
                Some(FreshnessVerdict::HotFresh) => {}
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    // Content unchanged; just record the newer metadata.
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files present on disk but not yet indexed.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Nothing to do: report and return early.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // Purge entries and metadata for deleted files.
        if !deleted.is_empty() {
            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
            self.entries
                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
            for path in &deleted {
                self.file_mtimes.remove(path);
                self.file_sizes.remove(path);
                self.file_hashes.remove(path);
            }
        }

        // Changed and added files both need re-embedding.
        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

        if chunks.is_empty() {
            // The files parsed but produced no chunks: drop their stale
            // entries and record fresh metadata so they are not retried.
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // Embed the refreshed chunks in batches, as in a full build.
        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Replace old entries for the refreshed files and merge metadata.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }

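    /// Ranks all entries by cosine similarity against `query_vector` and
    /// returns the top `top_k`. An empty result means either an empty index
    /// or a query vector whose dimension does not match the index. Sketch
    /// (the query text is hypothetical):
    ///
    /// ```ignore
    /// let query = model.embed_query_cached("parse config file")?;
    /// let hits = index.search(&query, 10);
    /// ```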
    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
        if self.entries.is_empty() || query_vector.len() != self.dimension {
            return Vec::new();
        }

        let mut scored: Vec<(f32, usize)> = self
            .entries
            .iter()
            .enumerate()
            .map(|(i, entry)| {
                // Cosine similarity, with a small boost for exported symbols.
                let mut score = cosine_similarity(query_vector, &entry.vector);
                if entry.chunk.exported {
                    score *= 1.1;
                }
                (score, i)
            })
            .collect();

        // Sort descending by score; NaN scores compare as equal.
        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));

        scored
            .into_iter()
            .take(top_k)
            .map(|(score, idx)| {
                let entry = &self.entries[idx];
                SemanticResult {
                    file: entry.chunk.file.clone(),
                    name: entry.chunk.name.clone(),
                    kind: entry.chunk.kind.clone(),
                    start_line: entry.chunk.start_line,
                    end_line: entry.chunk.end_line,
                    exported: entry.chunk.exported,
                    snippet: entry.chunk.snippet.clone(),
                    score,
                    source: "semantic",
                }
            })
            .collect()
    }

    pub fn len(&self) -> usize {
        self.entries.len()
    }

    pub fn is_file_stale(&self, file: &Path) -> bool {
        let Some(stored_mtime) = self.file_mtimes.get(file) else {
            return true;
        };
        let Some(stored_size) = self.file_sizes.get(file) else {
            return true;
        };
        let Some(stored_hash) = self.file_hashes.get(file) else {
            return true;
        };
        let cached = FileFreshness {
            mtime: *stored_mtime,
            size: *stored_size,
            content_hash: *stored_hash,
        };
        match cache_freshness::verify_file(file, &cached) {
            FreshnessVerdict::HotFresh => false,
            FreshnessVerdict::ContentFresh { .. } => false,
            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
        }
    }

    fn backfill_missing_file_sizes(&mut self) {
        for path in self.file_mtimes.keys() {
            if self.file_sizes.contains_key(path) {
                continue;
            }
            if let Ok(metadata) = fs::metadata(path) {
                self.file_sizes.insert(path.clone(), metadata.len());
                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
                    self.file_hashes.insert(path.clone(), hash);
                }
            }
        }
    }

    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }

    pub fn invalidate_file(&mut self, file: &Path) {
        self.entries.retain(|e| e.chunk.file != file);
        self.file_mtimes.remove(file);
        self.file_sizes.remove(file);
        self.file_hashes.remove(file);
    }

    pub fn dimension(&self) -> usize {
        self.dimension
    }

    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }

    pub fn backend_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.backend.as_str())
    }

    pub fn model_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.model.as_str())
    }

    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }

    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
        // An empty index is never persisted; loading it would force a rebuild anyway.
        if self.entries.is_empty() {
            slog_info!("skipping semantic index persistence (0 entries)");
            return;
        }
        let dir = storage_dir.join("semantic").join(project_key);
        if let Err(e) = fs::create_dir_all(&dir) {
            slog_warn!("failed to create semantic cache dir: {}", e);
            return;
        }
        let data_path = dir.join("semantic.bin");
        // Write to a process-unique temp file, sync, then rename atomically.
        let tmp_path = dir.join(format!(
            "semantic.bin.tmp.{}.{}",
            std::process::id(),
            SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or(Duration::ZERO)
                .as_nanos()
        ));
        let bytes = self.to_bytes();
        let write_result = (|| -> std::io::Result<()> {
            use std::io::Write;
            let mut file = fs::File::create(&tmp_path)?;
            file.write_all(&bytes)?;
            file.sync_all()?;
            Ok(())
        })();
        if let Err(e) = write_result {
            slog_warn!("failed to write semantic index: {}", e);
            let _ = fs::remove_file(&tmp_path);
            return;
        }
        if let Err(e) = fs::rename(&tmp_path, &data_path) {
            slog_warn!("failed to rename semantic index: {}", e);
            let _ = fs::remove_file(&tmp_path);
            return;
        }
        slog_info!(
            "semantic index persisted: {} entries, {:.1} KB",
            self.entries.len(),
            bytes.len() as f64 / 1024.0
        );
    }

    pub fn read_from_disk(
        storage_dir: &Path,
        project_key: &str,
        current_canonical_root: &Path,
        is_worktree_bridge: bool,
        expected_fingerprint: Option<&str>,
    ) -> Option<Self> {
        debug_assert!(current_canonical_root.is_absolute());
        let data_path = storage_dir
            .join("semantic")
            .join(project_key)
            .join("semantic.bin");
        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
        if file_len < HEADER_BYTES_V1 {
            slog_warn!(
                "corrupt semantic index (too small: {} bytes), removing",
                file_len
            );
            if !is_worktree_bridge {
                let _ = fs::remove_file(&data_path);
            }
            return None;
        }

        let bytes = fs::read(&data_path).ok()?;
        let version = bytes[0];
        if version != SEMANTIC_INDEX_VERSION_V6 {
            slog_info!(
                "cached semantic index version {} does not match current version {}, rebuilding",
                version,
                SEMANTIC_INDEX_VERSION_V6
            );
            if !is_worktree_bridge {
                let _ = fs::remove_file(&data_path);
            }
            return None;
        }
        match Self::from_bytes(&bytes, current_canonical_root) {
            Ok(index) => {
                if index.entries.is_empty() {
                    slog_info!("cached semantic index is empty, will rebuild");
                    if !is_worktree_bridge {
                        let _ = fs::remove_file(&data_path);
                    }
                    return None;
                }
                if let Some(expected) = expected_fingerprint {
                    let matches = index
                        .fingerprint()
                        .map(|fingerprint| fingerprint.matches_expected(expected))
                        .unwrap_or(false);
                    if !matches {
                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
                        if !is_worktree_bridge {
                            let _ = fs::remove_file(&data_path);
                        }
                        return None;
                    }
                }
                slog_info!(
                    "loaded semantic index from disk: {} entries",
                    index.entries.len()
                );
                Some(index)
            }
            Err(e) => {
                slog_warn!("corrupt semantic index, rebuilding: {}", e);
                if !is_worktree_bridge {
                    let _ = fs::remove_file(&data_path);
                }
                None
            }
        }
    }

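    // V6 on-disk layout (all integers little-endian), as written below and
    // decoded in `from_bytes`:
    //   u8  version
    //   u32 dimension
    //   u32 entry count
    //   u32 fingerprint length, then fingerprint JSON bytes
    //   u32 file count, then per file:
    //     u32 path length, path bytes (relative to project root),
    //     u64 mtime secs, u32 mtime nanos, u64 size, 32-byte blake3 hash
    //   per entry:
    //     path, name, kind (u8), start/end line (u32), exported (u8),
    //     snippet, embed text, then dimension * f32 vector values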
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut buf = Vec::new();
        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });

        let version = SEMANTIC_INDEX_VERSION_V6;
        buf.push(version);
        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
        buf.extend_from_slice(&(self.entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // File metadata table. Paths are stored relative to the project root
        // so the cache survives the checkout being moved.
        buf.extend_from_slice(&(self.file_mtimes.len() as u32).to_le_bytes());
        for (path, mtime) in &self.file_mtimes {
            let relative = path
                .strip_prefix(&self.project_root)
                .unwrap_or(path.as_path());
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            let size = self.file_sizes.get(path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = self
                .file_hashes
                .get(path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // Entry records, each followed by its embedding vector.
        for entry in &self.entries {
            let c = &entry.chunk;

            let relative = c
                .file
                .strip_prefix(&self.project_root)
                .unwrap_or(c.file.as_path());
            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));

            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }

    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        if dimension == 0 || dimension > MAX_DIMENSION {
            return Err(format!("invalid embedding dimension: {}", dimension));
        }
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        // V2+ carries a JSON fingerprint of the backend/model configuration.
        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            if fingerprint_len == 0 {
                None
            } else {
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // Reject before allocating if the declared vectors cannot possibly
        // fit in the remaining data.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            // V3+ stores sub-second mtime precision.
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            // V5+ stores file sizes.
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64(data, &mut pos)?
                } else {
                    0
                };
            // V6 stores a blake3 content hash per file.
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                if pos + 32 > data.len() {
                    return Err("unexpected end of data reading content hash".to_string());
                }
                let mut hash_bytes = [0u8; 32];
                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
                pos += 32;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // V6 paths are stored relative to the project root.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                current_canonical_root.join(PathBuf::from(path))
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                current_canonical_root.join(raw_file)
            } else {
                raw_file
            };
            let name = read_string(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must have a matching file metadata row.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
        })
    }
}

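// Illustrative embed text for a hypothetical symbol (shape only, produced by
// the format below):
// name:parse_config file:src/config.rs kind:function name:parse_config signature:fn parse_config(path: &Path) -> Config body:fn parse_config(...) { ...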
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    let relative = file
        .strip_prefix(project_root)
        .unwrap_or(file)
        .to_string_lossy();

    let kind_label = match &symbol.kind {
        SymbolKind::Function => "function",
        SymbolKind::Class => "class",
        SymbolKind::Method => "method",
        SymbolKind::Struct => "struct",
        SymbolKind::Interface => "interface",
        SymbolKind::Enum => "enum",
        SymbolKind::TypeAlias => "type",
        SymbolKind::Variable => "variable",
        SymbolKind::Heading => "heading",
        SymbolKind::FileSummary => "file-summary",
    };

    // The symbol name bookends the text, so it appears twice.
    let name = &symbol.name;
    let mut text = format!(
        "name:{name} file:{} kind:{} name:{name}",
        relative, kind_label
    );

    if let Some(sig) = &symbol.signature {
        text.push_str(&format!(" signature:{}", sig));
    }

    // Append up to 15 lines (capped at 300 chars) of the symbol body.
    let lines: Vec<&str> = source.lines().collect();
    let start = (symbol.range.start_line as usize).min(lines.len());
    let end = (symbol.range.end_line as usize + 1).min(lines.len());
    if start < end {
        let body: String = lines[start..end]
            .iter()
            .take(15)
            .copied()
            .collect::<Vec<&str>>()
            .join("\n");
        let snippet = if body.len() > 300 {
            format!("{}...", &body[..body.floor_char_boundary(300)])
        } else {
            body
        };
        text.push_str(&format!(" body:{}", snippet));
    }

    text
}

fn truncate_chars(value: &str, max_chars: usize) -> String {
    value.chars().take(max_chars).collect()
}

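/// Extracts the first leading doc comment from a source file, truncated to
/// 200 characters. Handles `/** ... */` blocks and runs of `///` or `//!`
/// lines. Illustrative: for a file starting with `//! Math helpers.`, this
/// returns that line; for undocumented files it returns an empty string.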
fn first_leading_doc_comment(source: &str) -> String {
    let lines: Vec<&str> = source.lines().collect();
    let Some((start, first)) = lines
        .iter()
        .enumerate()
        .find(|(_, line)| !line.trim().is_empty())
    else {
        return String::new();
    };

    let trimmed = first.trim_start();
    if trimmed.starts_with("/**") {
        let mut comment = Vec::new();
        for line in lines.iter().skip(start) {
            comment.push(*line);
            if line.contains("*/") {
                break;
            }
        }
        return truncate_chars(&comment.join("\n"), 200);
    }

    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
        let comment = lines
            .iter()
            .skip(start)
            .take_while(|line| {
                let trimmed = line.trim_start();
                trimmed.starts_with("///") || trimmed.starts_with("//!")
            })
            .copied()
            .collect::<Vec<_>>()
            .join("\n");
        return truncate_chars(&comment, 200);
    }

    String::new()
}
2104
pub fn build_file_summary_chunk(
    file: &Path,
    project_root: &Path,
    source: &str,
    top_exports: &[&str],
    top_export_signatures: &[Option<&str>],
) -> SemanticChunk {
    let relative = file.strip_prefix(project_root).unwrap_or(file);
    let rel_path = relative.to_string_lossy();
    let parent_dir = relative
        .parent()
        .map(|parent| parent.to_string_lossy().to_string())
        .unwrap_or_default();
    let name = file
        .file_stem()
        .map(|stem| stem.to_string_lossy().to_string())
        .unwrap_or_default();
    let doc = first_leading_doc_comment(source);
    let exports = top_exports
        .iter()
        .take(5)
        .copied()
        .collect::<Vec<_>>()
        .join(",");
    let snippet = if doc.is_empty() {
        top_export_signatures
            .first()
            .and_then(|signature| signature.as_deref())
            .map(|signature| truncate_chars(signature, 200))
            .unwrap_or_default()
    } else {
        doc.clone()
    };
    let embed_text = format!(
        "file:{rel_path} kind:file-summary name:{name} parent:{parent_dir} doc:{doc} exports:{exports}"
    );

    SemanticChunk {
        file: file.to_path_buf(),
        name,
        kind: SymbolKind::FileSummary,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text,
        snippet,
    }
}

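/// Returns a cached tree-sitter parser for `lang`, initializing one on first
/// use so repeated files in the same language reuse the parser.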
fn parser_for(
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
    lang: crate::parser::LangId,
) -> Result<&mut Parser, String> {
    use std::collections::hash_map::Entry;

    match parsers.entry(lang) {
        Entry::Occupied(entry) => Ok(entry.into_mut()),
        Entry::Vacant(entry) => {
            let grammar = grammar_for(lang);
            let mut parser = Parser::new();
            parser
                .set_language(&grammar)
                .map_err(|error| error.to_string())?;
            Ok(entry.insert(parser))
        }
    }
}

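/// Returns true when the file extension belongs to a language the semantic
/// indexer can parse with tree-sitter.
///
/// Illustrative sketch (marked `ignore`; not run as a doctest):
///
/// ```ignore
/// assert!(is_semantic_indexed_extension(Path::new("src/main.rs")));
/// assert!(!is_semantic_indexed_extension(Path::new("notes.md")));
/// ```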
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    matches!(
        path.extension().and_then(|extension| extension.to_str()),
        Some(
            "ts" | "tsx"
                | "js"
                | "jsx"
                | "py"
                | "rs"
                | "go"
                | "c"
                | "h"
                | "cc"
                | "cpp"
                | "cxx"
                | "hpp"
                | "hh"
                | "zig"
                | "cs"
                | "sh"
                | "bash"
                | "zsh"
                | "sol"
                | "vue"
        )
    )
}

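/// Collects mtime, size, and a content hash for the freshness checks. Files
/// too large to hash fall back to the zero hash.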
fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
    let mtime = metadata.modified().map_err(|error| error.to_string())?;
    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
        .map_err(|error| error.to_string())?
        .unwrap_or_else(cache_freshness::zero_hash);
    Ok(IndexedFileMetadata {
        mtime,
        size: metadata.len(),
        content_hash,
    })
}

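/// Parses `file` with a cached tree-sitter parser and converts the extracted
/// symbols into semantic chunks. Unsupported extensions, read failures, and
/// parse failures are reported as string errors.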
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    if !is_semantic_indexed_extension(file) {
        return Err("unsupported file extension".to_string());
    }
    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
    let tree = parser_for(parsers, lang)?
        .parse(&source, None)
        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
    let symbols =
        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;

    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
}

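/// Builds a display snippet for a symbol: up to five lines of its body, with
/// an ellipsis marker when the body is longer and a hard cap of 300 bytes on
/// a char boundary.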
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    let lines: Vec<&str> = source.lines().collect();
    let start = (symbol.range.start_line as usize).min(lines.len());
    let end = (symbol.range.end_line as usize + 1).min(lines.len());
    if start < end {
        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
        let mut snippet = snippet_lines.join("\n");
        if end - start > 5 {
            snippet.push_str("\n ...");
        }
        if snippet.len() > 300 {
            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
        }
        snippet
    } else {
        String::new()
    }
}

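/// Converts extracted symbols into embeddable chunks. A file-summary chunk is
/// prepended when the file has at most two top-level exports and is not a
/// headings-only document; heading symbols and single-line non-variable
/// symbols are skipped.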
fn symbols_to_chunks(
    file: &Path,
    symbols: &[Symbol],
    source: &str,
    project_root: &Path,
) -> Vec<SemanticChunk> {
    let mut chunks = Vec::new();
    let top_exports_with_signatures = symbols
        .iter()
        .filter(|symbol| {
            symbol.exported
                && symbol.parent.is_none()
                && !matches!(symbol.kind, SymbolKind::Heading)
        })
        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
        .collect::<Vec<_>>();

    let has_only_headings = !symbols.is_empty()
        && symbols
            .iter()
            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
        let top_exports = top_exports_with_signatures
            .iter()
            .map(|(name, _)| *name)
            .collect::<Vec<_>>();
        let top_export_signatures = top_exports_with_signatures
            .iter()
            .map(|(_, signature)| *signature)
            .collect::<Vec<_>>();
        chunks.push(build_file_summary_chunk(
            file,
            project_root,
            source,
            &top_exports,
            &top_export_signatures,
        ));
    }

    for symbol in symbols {
        if matches!(symbol.kind, SymbolKind::Heading) {
            continue;
        }

        let line_count = symbol
            .range
            .end_line
            .saturating_sub(symbol.range.start_line)
            + 1;
        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
            continue;
        }

        let embed_text = build_embed_text(symbol, source, file, project_root);
        let snippet = build_snippet(symbol, source);

        chunks.push(SemanticChunk {
            file: file.to_path_buf(),
            name: symbol.name.clone(),
            kind: symbol.kind.clone(),
            start_line: symbol.range.start_line,
            end_line: symbol.range.end_line,
            exported: symbol.exported,
            embed_text,
            snippet,
        });
    }

    chunks
}

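/// Cosine similarity of two vectors. Returns 0.0 when the lengths differ or
/// either vector has zero magnitude, so malformed entries never outrank real
/// matches.
///
/// Illustrative expectations (marked `ignore`; the function is private, so
/// this is a sketch rather than a compiled doctest):
///
/// ```ignore
/// assert!((cosine_similarity(&[1.0, 0.0], &[1.0, 0.0]) - 1.0).abs() < 1e-6);
/// assert_eq!(cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]), 0.0);
/// assert_eq!(cosine_similarity(&[1.0], &[1.0, 0.0]), 0.0); // length mismatch
/// ```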
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let mut dot = 0.0f32;
    let mut norm_a = 0.0f32;
    let mut norm_b = 0.0f32;

    for i in 0..a.len() {
        dot += a[i] * b[i];
        norm_a += a[i] * a[i];
        norm_b += b[i] * b[i];
    }

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}

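/// Encodes a `SymbolKind` as a stable one-byte discriminant for the binary
/// index format. Must stay in sync with `u8_to_symbol_kind`.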
fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
    match kind {
        SymbolKind::Function => 0,
        SymbolKind::Class => 1,
        SymbolKind::Method => 2,
        SymbolKind::Struct => 3,
        SymbolKind::Interface => 4,
        SymbolKind::Enum => 5,
        SymbolKind::TypeAlias => 6,
        SymbolKind::Variable => 7,
        SymbolKind::Heading => 8,
        SymbolKind::FileSummary => 9,
    }
}

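/// Decodes the one-byte discriminant written by `symbol_kind_to_u8`. Unknown
/// values fall back to `SymbolKind::Heading`.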
fn u8_to_symbol_kind(v: u8) -> SymbolKind {
    match v {
        0 => SymbolKind::Function,
        1 => SymbolKind::Class,
        2 => SymbolKind::Method,
        3 => SymbolKind::Struct,
        4 => SymbolKind::Interface,
        5 => SymbolKind::Enum,
        6 => SymbolKind::TypeAlias,
        7 => SymbolKind::Variable,
        8 => SymbolKind::Heading,
        9 => SymbolKind::FileSummary,
        _ => SymbolKind::Heading,
    }
}

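/// Reads a little-endian `u32` at `*pos`, advancing the cursor. Returns an
/// error rather than panicking when the buffer is truncated.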
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    if *pos + 4 > data.len() {
        return Err("unexpected end of data reading u32".to_string());
    }
    let val = u32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]);
    *pos += 4;
    Ok(val)
}

fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    if *pos + 8 > data.len() {
        return Err("unexpected end of data reading u64".to_string());
    }
    let bytes: [u8; 8] = data[*pos..*pos + 8].try_into().unwrap();
    *pos += 8;
    Ok(u64::from_le_bytes(bytes))
}

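/// Reads a length-prefixed (little-endian `u32`) string, replacing invalid
/// UTF-8 sequences lossily.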
fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
    let len = read_u32(data, pos)? as usize;
    if *pos + len > data.len() {
        return Err("unexpected end of data reading string".to_string());
    }
    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
    *pos += len;
    Ok(s)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{SemanticBackend, SemanticBackendConfig};
    use crate::parser::FileParser;
    use std::io::{Read, Write};
    use std::net::TcpListener;
    use std::thread;

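    /// Minimal single-request HTTP server used to exercise the HTTP embedding
    /// backends without network access: it reads one request (headers plus a
    /// `Content-Length` body), hands (request line, path, body) to `handler`,
    /// and writes the returned JSON back as a 200 response.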
    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
    where
        F: Fn(String, String, String) -> String + Send + 'static,
    {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            let (mut stream, _) = listener.accept().expect("accept request");
            let mut buf = Vec::new();
            let mut chunk = [0u8; 4096];
            let mut header_end = None;
            let mut content_length = 0usize;
            loop {
                let n = stream.read(&mut chunk).expect("read request");
                if n == 0 {
                    break;
                }
                buf.extend_from_slice(&chunk[..n]);
                if header_end.is_none() {
                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                        header_end = Some(pos + 4);
                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
                        for line in headers.lines() {
                            if let Some(value) = line.strip_prefix("Content-Length:") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
                if let Some(end) = header_end {
                    if buf.len() >= end + content_length {
                        break;
                    }
                }
            }

            let end = header_end.expect("header terminator");
            let request = String::from_utf8_lossy(&buf[..end]).to_string();
            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
            let mut lines = request.lines();
            let request_line = lines.next().expect("request line").to_string();
            let path = request_line
                .split_whitespace()
                .nth(1)
                .expect("request path")
                .to_string();
            let response_body = handler(request_line, path, body);
            let response = format!(
                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
                response_body.len(),
                response_body
            );
            stream
                .write_all(response.as_bytes())
                .expect("write response");
        });

        (format!("http://{}", addr), handle)
    }

    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
    }

    fn write_rust_file(path: &Path, function_name: &str) {
        fs::write(
            path,
            format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
        )
        .unwrap();
    }

    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
        let mut embed = test_vector_for_texts;
        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
    }

    fn test_project_root() -> PathBuf {
        std::env::current_dir().unwrap()
    }

    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
        index.file_mtimes.insert(file.to_path_buf(), mtime);
        index.file_sizes.insert(file.to_path_buf(), size);
        index
            .file_hashes
            .insert(file.to_path_buf(), cache_freshness::zero_hash());
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        assert!(cosine_similarity(&a, &b).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![-1.0, 0.0, 0.0];
        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
    }

    #[test]
    fn test_serialization_roundtrip() {
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: PathBuf::from("/src/main.rs"),
                name: "handle_request".to_string(),
                kind: SymbolKind::Function,
                start_line: 10,
                end_line: 25,
                exported: true,
                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
                snippet: "fn handle_request() {\n // ...\n}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3, 0.4],
        });
        index.dimension = 4;
        index
            .file_mtimes
            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
        index.set_fingerprint(SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "all-MiniLM-L6-v2".to_string(),
            base_url: FALLBACK_BACKEND.to_string(),
            dimension: 4,
            chunking_version: default_chunking_version(),
        });

        let bytes = index.to_bytes();
        let restored = SemanticIndex::from_bytes(&bytes, &test_project_root()).unwrap();

        assert_eq!(restored.entries.len(), 1);
        assert_eq!(restored.entries[0].chunk.name, "handle_request");
        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
        assert_eq!(restored.dimension, 4);
        assert_eq!(restored.backend_label(), Some("fastembed"));
        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
    }

    #[test]
    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
        let cases = [
            (SymbolKind::Function, 0),
            (SymbolKind::Class, 1),
            (SymbolKind::Method, 2),
            (SymbolKind::Struct, 3),
            (SymbolKind::Interface, 4),
            (SymbolKind::Enum, 5),
            (SymbolKind::TypeAlias, 6),
            (SymbolKind::Variable, 7),
            (SymbolKind::Heading, 8),
            (SymbolKind::FileSummary, 9),
        ];

        for (kind, encoded) in cases {
            assert_eq!(symbol_kind_to_u8(&kind), encoded);
            assert_eq!(u8_to_symbol_kind(encoded), kind);
        }
    }

    #[test]
    fn test_search_top_k() {
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.dimension = 3;

        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
            let mut vec = vec![0.0f32; 3];
            vec[i] = 1.0;
            index.entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file: PathBuf::from("/src/lib.rs"),
                    name: name.to_string(),
                    kind: SymbolKind::Function,
                    start_line: (i * 10 + 1) as u32,
                    end_line: (i * 10 + 5) as u32,
                    exported: true,
                    embed_text: format!("kind:function name:{}", name),
                    snippet: format!("fn {}() {{}}", name),
                },
                vector: vec,
            });
        }

        let query = vec![0.9, 0.1, 0.0];
        let results = index.search(&query, 2);

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].name, "auth");
        assert!(results[0].score > results[1].score);
    }

    #[test]
    fn test_empty_index_search() {
        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        let results = index.search(&[0.1, 0.2, 0.3], 10);
        assert!(results.is_empty());
    }

    #[test]
    fn single_line_symbol_builds_non_empty_snippet() {
        let symbol = Symbol {
            name: "answer".to_string(),
            kind: SymbolKind::Variable,
            range: crate::symbols::Range {
                start_line: 0,
                start_col: 0,
                end_line: 0,
                end_col: 24,
            },
            signature: Some("const answer = 42".to_string()),
            scope_chain: Vec::new(),
            exported: true,
            parent: None,
        };
        let source = "export const answer = 42;\n";

        let snippet = build_snippet(&symbol, source);

        assert_eq!(snippet, "export const answer = 42;");
    }

    #[test]
    fn optimized_file_chunk_collection_matches_file_parser_path() {
        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        let file = project_root.join("src/semantic_index.rs");
        let source = std::fs::read_to_string(&file).unwrap();

        let mut legacy_parser = FileParser::new();
        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);

        let mut parsers = HashMap::new();
        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();

        assert_eq!(
            chunk_fingerprint(&optimized_chunks),
            chunk_fingerprint(&legacy_chunks)
        );
    }

    fn chunk_fingerprint(
        chunks: &[SemanticChunk],
    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
        chunks
            .iter()
            .map(|chunk| {
                (
                    chunk.name.clone(),
                    chunk.kind.clone(),
                    chunk.start_line,
                    chunk.end_line,
                    chunk.exported,
                    chunk.embed_text.clone(),
                    chunk.snippet.clone(),
                )
            })
            .collect()
    }

    #[test]
    fn rejects_oversized_dimension_during_deserialization() {
        let mut bytes = Vec::new();
        bytes.push(1u8);
        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
        bytes.extend_from_slice(&0u32.to_le_bytes());
        bytes.extend_from_slice(&0u32.to_le_bytes());

        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
    }

    #[test]
    fn rejects_oversized_entry_count_during_deserialization() {
        let mut bytes = Vec::new();
        bytes.push(1u8);
        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
        bytes.extend_from_slice(&0u32.to_le_bytes());

        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
    }

    #[test]
    fn invalidate_file_removes_entries_and_mtime() {
        let target = PathBuf::from("/src/main.rs");
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: target.clone(),
                name: "main".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 1,
                exported: false,
                embed_text: "main".to_string(),
                snippet: "fn main() {}".to_string(),
            },
            vector: vec![1.0; DEFAULT_DIMENSION],
        });
        index
            .file_mtimes
            .insert(target.clone(), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(target.clone(), 0);

        index.invalidate_file(&target);

        assert!(index.entries.is_empty());
        assert!(!index.file_mtimes.contains_key(&target));
        assert!(!index.file_sizes.contains_key(&target));
    }

    #[test]
    fn refresh_transient_error_preserves_existing_entry_and_mtime() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        fs::create_dir_all(file.parent().unwrap()).unwrap();
        write_rust_file(&file, "kept_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
        let original_entry_count = index.entries.len();
        let original_mtime = *index.file_mtimes.get(&file).unwrap();
        let original_size = *index.file_sizes.get(&file).unwrap();

        let stale_mtime = SystemTime::UNIX_EPOCH;
        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
        fs::remove_file(&file).unwrap();

        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.changed, 0);
        assert_eq!(summary.added, 0);
        assert_eq!(summary.deleted, 0);
        assert_eq!(index.entries.len(), original_entry_count);
        assert!(index
            .entries
            .iter()
            .any(|entry| entry.chunk.name == "kept_symbol"));
        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
    }

    #[test]
    fn refresh_never_indexed_file_error_does_not_record_mtime() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let missing = project_root.join("src/missing.rs");
        fs::create_dir_all(missing.parent().unwrap()).unwrap();

        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&missing),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.added, 0);
        assert_eq!(summary.changed, 0);
        assert_eq!(summary.deleted, 0);
        assert!(!index.file_mtimes.contains_key(&missing));
        assert!(!index.file_sizes.contains_key(&missing));
        assert!(index.entries.is_empty());
    }

    #[test]
    fn refresh_reports_added_for_new_files() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let existing = project_root.join("src/lib.rs");
        let added = project_root.join("src/new.rs");
        fs::create_dir_all(existing.parent().unwrap()).unwrap();
        write_rust_file(&existing, "existing_symbol");
        write_rust_file(&added, "added_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                &[existing.clone(), added.clone()],
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.added, 1);
        assert_eq!(summary.changed, 0);
        assert_eq!(summary.deleted, 0);
        assert_eq!(summary.total_processed, 2);
        assert!(index.file_mtimes.contains_key(&added));
        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
    }

    #[test]
    fn refresh_reports_deleted_for_removed_files() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let deleted = project_root.join("src/deleted.rs");
        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
        write_rust_file(&deleted, "deleted_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
        fs::remove_file(&deleted).unwrap();

        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
            .unwrap();

        assert_eq!(summary.deleted, 1);
        assert_eq!(summary.changed, 0);
        assert_eq!(summary.added, 0);
        assert_eq!(summary.total_processed, 1);
        assert!(!index.file_mtimes.contains_key(&deleted));
        assert!(index.entries.is_empty());
    }

    #[test]
    fn refresh_reports_changed_for_modified_files() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        fs::create_dir_all(file.parent().unwrap()).unwrap();
        write_rust_file(&file, "old_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
        write_rust_file(&file, "new_symbol");

        let mut embed = test_vector_for_texts;
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.changed, 1);
        assert_eq!(summary.added, 0);
        assert_eq!(summary.deleted, 0);
        assert_eq!(summary.total_processed, 1);
        assert!(index
            .entries
            .iter()
            .any(|entry| entry.chunk.name == "new_symbol"));
        assert!(!index
            .entries
            .iter()
            .any(|entry| entry.chunk.name == "old_symbol"));
    }

    #[test]
    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        fs::create_dir_all(file.parent().unwrap()).unwrap();
        write_rust_file(&file, "clean_symbol");

        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
        let original_entries = index.entries.len();
        let mut embed_called = false;
        let mut embed = |texts: Vec<String>| {
            embed_called = true;
            test_vector_for_texts(texts)
        };
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                8,
                &mut progress,
            )
            .unwrap();

        assert!(summary.is_noop());
        assert_eq!(summary.total_processed, 1);
        assert!(!embed_called);
        assert_eq!(index.entries.len(), original_entries);
    }

    #[test]
    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";

        assert!(is_onnx_runtime_unavailable(message));
    }

    #[test]
    fn formats_missing_onnx_runtime_with_install_hint() {
        let message = format_embedding_init_error(
            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
        );

        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
        assert!(message.contains("Original error:"));
    }

    #[test]
    fn openai_compatible_backend_embeds_with_mock_server() {
        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
            assert!(request_line.starts_with("POST "));
            assert_eq!(path, "/v1/embeddings");
            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
        });

        let config = SemanticBackendConfig {
            backend: SemanticBackend::OpenAiCompatible,
            model: "test-embedding".to_string(),
            base_url: Some(base_url),
            api_key_env: None,
            timeout_ms: 5_000,
            max_batch_size: 64,
        };

        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
        let vectors = model
            .embed(vec!["hello".to_string(), "world".to_string()])
            .unwrap();

        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
        handle.join().unwrap();
    }

    #[test]
    fn openai_compatible_request_has_single_content_type_header() {
        use std::sync::{Arc, Mutex};
        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
        let captured_for_thread = Arc::clone(&captured);

        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            let (mut stream, _) = listener.accept().expect("accept");
            let mut buf = Vec::new();
            let mut chunk = [0u8; 4096];
            let mut header_end = None;
            let mut content_length = 0usize;
            loop {
                let n = stream.read(&mut chunk).expect("read");
                if n == 0 {
                    break;
                }
                buf.extend_from_slice(&chunk[..n]);
                if header_end.is_none() {
                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                        header_end = Some(pos + 4);
                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                            if let Some(value) = line.strip_prefix("Content-Length:") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
                if let Some(end) = header_end {
                    if buf.len() >= end + content_length {
                        break;
                    }
                }
            }
            *captured_for_thread.lock().unwrap() = buf;
            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
            let response = format!(
                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
                body.len(),
                body
            );
            let _ = stream.write_all(response.as_bytes());
        });

        let config = SemanticBackendConfig {
            backend: SemanticBackend::OpenAiCompatible,
            model: "text-embedding-3-small".to_string(),
            base_url: Some(format!("http://{}", addr)),
            api_key_env: None,
            timeout_ms: 5_000,
            max_batch_size: 64,
        };
        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
        let _ = model.embed(vec!["probe".to_string()]).unwrap();
        handle.join().unwrap();

        let bytes = captured.lock().unwrap().clone();
        let request = String::from_utf8_lossy(&bytes);

        let content_type_lines = request
            .lines()
            .filter(|line| {
                let lower = line.to_ascii_lowercase();
                lower.starts_with("content-type:")
            })
            .count();
        assert_eq!(
            content_type_lines, 1,
            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
        );

        assert!(
            request.contains(r#""model":"text-embedding-3-small""#),
            "request body should contain model field; full request:\n{request}",
        );
    }

    #[test]
    fn ollama_backend_embeds_with_mock_server() {
        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
            assert!(request_line.starts_with("POST "));
            assert_eq!(path, "/api/embed");
            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
        });

        let config = SemanticBackendConfig {
            backend: SemanticBackend::Ollama,
            model: "embeddinggemma".to_string(),
            base_url: Some(base_url),
            api_key_env: None,
            timeout_ms: 5_000,
            max_batch_size: 64,
        };

        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
        let vectors = model
            .embed(vec!["hello".to_string(), "world".to_string()])
            .unwrap();

        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
        handle.join().unwrap();
    }

    #[test]
    fn read_from_disk_rejects_fingerprint_mismatch() {
        let storage = tempfile::tempdir().unwrap();
        let project_key = "proj";

        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: PathBuf::from("/src/main.rs"),
                name: "handle_request".to_string(),
                kind: SymbolKind::Function,
                start_line: 10,
                end_line: 25,
                exported: true,
                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
                snippet: "fn handle_request() {}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3],
        });
        index.dimension = 3;
        index
            .file_mtimes
            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
        index.set_fingerprint(SemanticIndexFingerprint {
            backend: "openai_compatible".to_string(),
            model: "test-embedding".to_string(),
            base_url: "http://127.0.0.1:1234/v1".to_string(),
            dimension: 3,
            chunking_version: default_chunking_version(),
        });
        index.write_to_disk(storage.path(), project_key);

        let matching = index.fingerprint().unwrap().as_string();
        assert!(SemanticIndex::read_from_disk(
            storage.path(),
            project_key,
            &test_project_root(),
            false,
            Some(&matching),
        )
        .is_some());

        let mismatched = SemanticIndexFingerprint {
            backend: "ollama".to_string(),
            model: "embeddinggemma".to_string(),
            base_url: "http://127.0.0.1:11434".to_string(),
            dimension: 3,
            chunking_version: default_chunking_version(),
        }
        .as_string();
        assert!(SemanticIndex::read_from_disk(
            storage.path(),
            project_key,
            &test_project_root(),
            false,
            Some(&mismatched),
        )
        .is_none());
    }

    #[test]
    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
        let storage = tempfile::tempdir().unwrap();
        let project_key = "proj-v3";
        let dir = storage.path().join("semantic").join(project_key);
        fs::create_dir_all(&dir).unwrap();

        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: PathBuf::from("/src/main.rs"),
                name: "handle_request".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 0,
                exported: true,
                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
                snippet: "fn handle_request() {}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3],
        });
        index.dimension = 3;
        index
            .file_mtimes
            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
        let fingerprint = SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "test".to_string(),
            base_url: FALLBACK_BACKEND.to_string(),
            dimension: 3,
            chunking_version: default_chunking_version(),
        };
        index.set_fingerprint(fingerprint.clone());

        let mut bytes = index.to_bytes();
        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
        fs::write(dir.join("semantic.bin"), bytes).unwrap();

        assert!(SemanticIndex::read_from_disk(
            storage.path(),
            project_key,
            &test_project_root(),
            false,
            Some(&fingerprint.as_string())
        )
        .is_none());
        assert!(!dir.join("semantic.bin").exists());
    }

    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
        crate::symbols::Symbol {
            name: name.to_string(),
            kind,
            range: crate::symbols::Range {
                start_line: start,
                start_col: 0,
                end_line: end,
                end_col: 0,
            },
            signature: None,
            scope_chain: Vec::new(),
            exported: false,
            parent: None,
        }
    }

    #[test]
    fn symbols_to_chunks_skips_heading_symbols() {
        let project_root = PathBuf::from("/proj");
        let file = project_root.join("README.md");
        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

        let symbols = vec![
            make_symbol(SymbolKind::Heading, "Title", 0, 2),
            make_symbol(SymbolKind::Heading, "Section", 4, 6),
        ];

        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
        assert!(
            chunks.is_empty(),
            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
            chunks.len()
        );
    }

    #[test]
    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
        let project_root = PathBuf::from("/proj");
        let file = project_root.join("src/lib.rs");
        let source = "pub fn handle_request() -> bool {\n true\n}\n";

        let symbols = vec![
            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
        ];

        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
        assert_eq!(
            chunks.len(),
            3,
            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
            chunks.len()
        );
        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
        assert!(chunks
            .iter()
            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
        assert!(names.contains(&"handle_request"));
        assert!(names.contains(&"AuthService"));
        assert!(
            !names.contains(&"doc heading"),
            "Heading symbol leaked into chunks: {names:?}"
        );
    }

    #[test]
    fn validate_ssrf_allows_loopback_hostnames() {
        for host in &[
            "http://localhost",
            "http://localhost:8080",
            "http://localhost:11434",
            "http://localhost.localdomain",
            "http://foo.localhost",
        ] {
            assert!(
                validate_base_url_no_ssrf(host).is_ok(),
                "Expected {host} to be allowed (loopback), got: {:?}",
                validate_base_url_no_ssrf(host)
            );
        }
    }

    #[test]
    fn validate_ssrf_allows_loopback_ips() {
        for url in &[
            "http://127.0.0.1",
            "http://127.0.0.1:11434",
            "http://127.0.0.1:8080",
            "http://127.1.2.3",
        ] {
            let result = validate_base_url_no_ssrf(url);
            assert!(
                result.is_ok(),
                "Expected {url} to be allowed (loopback), got: {:?}",
                result
            );
        }
    }

    #[test]
    fn validate_ssrf_rejects_private_non_loopback_ips() {
        for url in &[
            "http://192.168.1.1",
            "http://10.0.0.1",
            "http://172.16.0.1",
            "http://169.254.169.254",
            "http://100.64.0.1",
        ] {
            let result = validate_base_url_no_ssrf(url);
            assert!(
                result.is_err(),
                "Expected {url} to be rejected (non-loopback private), got: {:?}",
                result
            );
        }
    }

    #[test]
    fn validate_ssrf_rejects_mdns_local_hostnames() {
        for host in &[
            "http://printer.local",
            "http://nas.local:8080",
            "http://homelab.local",
        ] {
            let result = validate_base_url_no_ssrf(host);
            assert!(
                result.is_err(),
                "Expected {host} to be rejected (mDNS), got: {:?}",
                result
            );
        }
    }

    #[test]
    fn normalize_base_url_allows_localhost_for_tests() {
        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
        assert!(normalize_base_url("http://localhost:8080").is_ok());
    }

    #[test]
    fn ort_mismatch_message_recommends_auto_fix_first() {
        let msg =
            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");

        assert!(
            msg.contains("v1.9.0"),
            "should report detected version: {msg}"
        );
        assert!(
            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
            "should report system path: {msg}"
        );
        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

        let auto_fix_pos = msg
            .find("Auto-fix")
            .expect("Auto-fix solution missing — users won't discover --fix");
        let remove_pos = msg
            .find("Remove the old library")
            .expect("system-rm solution missing");
        assert!(
            auto_fix_pos < remove_pos,
            "Auto-fix must come before manual rm — see PR comment thread"
        );

        assert!(
            msg.contains("npx @cortexkit/aft doctor --fix"),
            "auto-fix command must be present and copy-pasteable: {msg}"
        );
    }

    #[test]
    fn ort_mismatch_message_handles_macos_dylib_path() {
        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
        assert!(msg.contains("v1.9.0"));
        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
        assert!(
            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
            "system path should be quoted in the auto-fix sentence: {msg}"
        );
    }
}