1use crate::config::{SemanticBackend, SemanticBackendConfig};
2use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
3use crate::symbols::{Symbol, SymbolKind};
4use crate::{slog_info, slog_warn};
5
6use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
7use rayon::prelude::*;
8use reqwest::blocking::Client;
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, HashSet, VecDeque};
11use std::env;
12use std::fmt::Display;
13use std::fs;
14use std::path::{Path, PathBuf};
15use std::time::Duration;
16use std::time::SystemTime;
17use tree_sitter::Parser;
18use url::Url;
19
// Dimension recorded for an index that does not hold any vectors yet.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): the limits and header sizes below are not referenced in this part of
// the file; presumably they bound the persisted index format — confirm at the use sites.
const MAX_ENTRIES: usize = 1_000_000;
const MAX_DIMENSION: usize = 1024;
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
// Hint prepended to errors that are classified as "ONNX Runtime unavailable".
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// NOTE(review): version tags, presumably for the serialized index format — confirm
// against the (de)serialization code, which is outside this part of the file.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
// Path suffixes appended to the configured base URL for each HTTP backend.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// HTTP client timeout used when the configured `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
// Maximum number of query vectors kept in the per-model query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
// Placeholder stored as a fingerprint's `base_url` when no usable base URL is configured.
const FALLBACK_BACKEND: &str = "none";
// Total number of attempts made for one HTTP embedding request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
// Sleep (in milliseconds) before the retry that follows attempt 0 and attempt 1.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
51
/// Describes the embedding setup an index was built with.
///
/// The value is serialized to JSON by `as_string`, so the field order and the
/// serde defaults below are part of its encoded form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Model name as configured.
    pub model: String,
    /// Normalized base URL; deserializes to an empty string when absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the vectors produced by the backend.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()` when absent.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
62
/// Chunking version assigned to fingerprints that do not carry one and to
/// fingerprints created from the current configuration.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
66
67impl SemanticIndexFingerprint {
68 fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
69 let base_url = config
72 .base_url
73 .as_ref()
74 .and_then(|u| normalize_base_url(u).ok())
75 .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
76 Self {
77 backend: config.backend.as_str().to_string(),
78 model: config.model.clone(),
79 base_url,
80 dimension,
81 chunking_version: default_chunking_version(),
82 }
83 }
84
85 pub fn as_string(&self) -> String {
86 serde_json::to_string(self).unwrap_or_else(|_| String::new())
87 }
88
89 fn matches_expected(&self, expected: &str) -> bool {
90 let encoded = self.as_string();
91 !encoded.is_empty() && encoded == expected
92 }
93}
94
/// Backend-specific state needed to compute embeddings.
enum SemanticEmbeddingEngine {
    /// In-process model provided by the `fastembed` crate.
    Fastembed(TextEmbedding),
    /// HTTP backend called through an OpenAI-style `/v1/embeddings` endpoint.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// HTTP backend called through Ollama's `/api/embed` endpoint.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (no trailing slash).
        base_url: String,
    },
}
109
/// An embedding backend together with its effective settings and a small
/// cache of query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout; the default is substituted when the config says 0.
    timeout_ms: u64,
    /// Effective batch size; the default is substituted when the config says 0.
    max_batch_size: usize,
    /// Vector length, once it has been observed.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, bounded by `QUERY_EMBEDDING_CACHE_CAP`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of the cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
123
/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
125
/// Checks that a backend returned exactly one vector per input and that all
/// returned vectors have the same length.
///
/// `context` names the backend in the error text.
///
/// # Errors
/// Returns a message when the count differs from `expected_count` (with a
/// dedicated message for "nothing returned at all") or when any vector's
/// length differs from the first vector's length.
fn validate_embedding_batch(
    vectors: &[Vec<f32>],
    expected_count: usize,
    context: &str,
) -> Result<(), String> {
    let received = vectors.len();

    if received == 0 && expected_count > 0 {
        return Err(format!(
            "{context} returned no vectors for {expected_count} inputs"
        ));
    }
    if received != expected_count {
        return Err(format!(
            "{context} returned {} vectors for {} inputs",
            received, expected_count
        ));
    }

    // An empty, expected-empty batch has nothing left to compare.
    let expected_dimension = match vectors.first() {
        Some(first_vector) => first_vector.len(),
        None => return Ok(()),
    };

    match vectors
        .iter()
        .position(|vector| vector.len() != expected_dimension)
    {
        Some(index) => Err(format!(
            "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
            vectors[index].len()
        )),
        None => Ok(()),
    }
}
160
161fn normalize_base_url(raw: &str) -> Result<String, String> {
165 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
166 let scheme = parsed.scheme();
167 if scheme != "http" && scheme != "https" {
168 return Err(format!(
169 "unsupported URL scheme '{}' — only http:// and https:// are allowed",
170 scheme
171 ));
172 }
173 Ok(parsed.to_string().trim_end_matches('/').to_string())
174}
175
176pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
191 use std::net::{IpAddr, ToSocketAddrs};
192
193 let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
194
195 let host = parsed.host_str().unwrap_or("");
196
197 let is_loopback_host =
202 host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
203 if is_loopback_host {
204 return Ok(());
205 }
206
207 if host.ends_with(".local") {
210 return Err(format!(
211 "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
212 ));
213 }
214
215 let port = parsed.port_or_known_default().unwrap_or(443);
218 let addr_str = format!("{host}:{port}");
219 let addrs: Vec<IpAddr> = addr_str
220 .to_socket_addrs()
221 .map(|iter| iter.map(|sa| sa.ip()).collect())
222 .unwrap_or_default();
223 for ip in &addrs {
224 if is_private_non_loopback_ip(ip) {
225 return Err(format!(
226 "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
227 ));
228 }
229 }
230
231 Ok(())
232}
233
/// Reports whether `ip` lies in one of the private or reserved ranges this
/// module refuses to contact. Loopback addresses are not matched.
///
/// IPv4: `10/8`, `172.16/12`, `192.168/16`, `169.254/16`, `100.64/10`, `0/8`.
/// IPv6: `fe80::/10`, `fc00::/7`, and IPv4-mapped addresses whose embedded
/// IPv4 address matches the list above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => matches!(
            v4.octets(),
            [10, ..]
                | [172, 16..=31, ..]
                | [192, 168, ..]
                | [169, 254, ..]
                | [100, 64..=127, ..]
                | [0, ..]
        ),
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            if (leading & 0xffc0) == 0xfe80 || (leading & 0xfe00) == 0xfc00 {
                return true;
            }
            // `::ffff:a.b.c.d` — judge the embedded IPv4 address.
            match v6.to_ipv4_mapped() {
                Some(embedded) => is_private_non_loopback_ip(&IpAddr::V4(embedded)),
                None => false,
            }
        }
    }
}
275
276fn build_openai_embeddings_endpoint(base_url: &str) -> String {
277 if base_url.ends_with("/v1") {
278 format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
279 } else {
280 format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
281 }
282}
283
284fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
285 if base_url.ends_with("/api") {
286 format!("{base_url}/embed")
287 } else {
288 format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
289 }
290}
291
/// Trims surrounding whitespace from an optional value and maps a missing or
/// blank value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    match raw.trim() {
        "" => None,
        trimmed => Some(trimmed.to_string()),
    }
}
302
303fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
304 status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
305}
306
307fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
308 error.is_connect()
309}
310
311fn sleep_before_embedding_retry(attempt_index: usize) {
312 if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
313 std::thread::sleep(Duration::from_millis(*delay_ms));
314 }
315}
316
317fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
318where
319 F: FnMut() -> reqwest::blocking::RequestBuilder,
320{
321 for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
322 let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
323
324 let response = match make_request().send() {
325 Ok(response) => response,
326 Err(error) => {
327 if !last_attempt && is_retryable_embedding_error(&error) {
328 sleep_before_embedding_retry(attempt_index);
329 continue;
330 }
331 return Err(format!("{backend_label} request failed: {error}"));
332 }
333 };
334
335 let status = response.status();
336 let raw = match response.text() {
337 Ok(raw) => raw,
338 Err(error) => {
339 if !last_attempt && is_retryable_embedding_error(&error) {
340 sleep_before_embedding_retry(attempt_index);
341 continue;
342 }
343 return Err(format!("{backend_label} response read failed: {error}"));
344 }
345 };
346
347 if status.is_success() {
348 return Ok(raw);
349 }
350
351 if !last_attempt && is_retryable_embedding_status(status) {
352 sleep_before_embedding_retry(attempt_index);
353 continue;
354 }
355
356 return Err(format!(
357 "{backend_label} request failed (HTTP {}): {}",
358 status, raw
359 ));
360 }
361
362 unreachable!("embedding request retries exhausted without returning")
363}
364
365impl SemanticEmbeddingModel {
366 pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
367 let timeout_ms = if config.timeout_ms == 0 {
368 DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
369 } else {
370 config.timeout_ms
371 };
372
373 let max_batch_size = if config.max_batch_size == 0 {
374 DEFAULT_MAX_BATCH_SIZE
375 } else {
376 config.max_batch_size
377 };
378
379 let api_key_env = normalize_api_key(config.api_key_env.clone());
380 let model = config.model.clone();
381
382 let client = Client::builder()
383 .timeout(Duration::from_millis(timeout_ms))
384 .redirect(reqwest::redirect::Policy::none())
385 .build()
386 .map_err(|error| format!("failed to configure embedding client: {error}"))?;
387
388 let engine = match config.backend {
389 SemanticBackend::Fastembed => {
390 SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
391 }
392 SemanticBackend::OpenAiCompatible => {
393 let raw = config.base_url.as_ref().ok_or_else(|| {
394 "base_url is required for openai_compatible backend".to_string()
395 })?;
396 let base_url = normalize_base_url(raw)?;
397
398 let api_key = match api_key_env {
399 Some(var_name) => Some(env::var(&var_name).map_err(|_| {
400 format!("missing api_key_env '{var_name}' for openai_compatible backend")
401 })?),
402 None => None,
403 };
404
405 SemanticEmbeddingEngine::OpenAiCompatible {
406 client,
407 model,
408 base_url,
409 api_key,
410 }
411 }
412 SemanticBackend::Ollama => {
413 let raw = config
414 .base_url
415 .as_ref()
416 .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
417 let base_url = normalize_base_url(raw)?;
418
419 SemanticEmbeddingEngine::Ollama {
420 client,
421 model,
422 base_url,
423 }
424 }
425 };
426
427 Ok(Self {
428 backend: config.backend,
429 model: config.model.clone(),
430 base_url: config.base_url.clone(),
431 timeout_ms,
432 max_batch_size,
433 dimension: None,
434 engine,
435 query_embedding_cache: HashMap::new(),
436 query_embedding_cache_order: VecDeque::new(),
437 query_embedding_cache_hits: 0,
438 query_embedding_cache_misses: 0,
439 })
440 }
441
442 pub fn backend(&self) -> SemanticBackend {
443 self.backend
444 }
445
446 pub fn model(&self) -> &str {
447 &self.model
448 }
449
450 pub fn base_url(&self) -> Option<&str> {
451 self.base_url.as_deref()
452 }
453
454 pub fn max_batch_size(&self) -> usize {
455 self.max_batch_size
456 }
457
458 pub fn timeout_ms(&self) -> u64 {
459 self.timeout_ms
460 }
461
462 pub fn fingerprint(
463 &mut self,
464 config: &SemanticBackendConfig,
465 ) -> Result<SemanticIndexFingerprint, String> {
466 let dimension = self.dimension()?;
467 Ok(SemanticIndexFingerprint::from_config(config, dimension))
468 }
469
470 pub fn dimension(&mut self) -> Result<usize, String> {
471 if let Some(dimension) = self.dimension {
472 return Ok(dimension);
473 }
474
475 let dimension = match &mut self.engine {
476 SemanticEmbeddingEngine::Fastembed(model) => {
477 let vectors = model
478 .embed(vec!["semantic index fingerprint probe".to_string()], None)
479 .map_err(|error| format_embedding_init_error(error.to_string()))?;
480 vectors
481 .first()
482 .map(|v| v.len())
483 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
484 }
485 SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
486 let vectors =
487 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
488 vectors
489 .first()
490 .map(|v| v.len())
491 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
492 }
493 SemanticEmbeddingEngine::Ollama { .. } => {
494 let vectors =
495 self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
496 vectors
497 .first()
498 .map(|v| v.len())
499 .ok_or_else(|| "embedding backend returned no vectors".to_string())?
500 }
501 };
502
503 self.dimension = Some(dimension);
504 Ok(dimension)
505 }
506
507 pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
508 self.embed_texts(texts)
509 }
510
511 pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
512 if let Some(vector) = self.query_embedding_cache.get(query) {
513 self.query_embedding_cache_hits += 1;
514 return Ok(vector.clone());
515 }
516
517 self.query_embedding_cache_misses += 1;
518 let embeddings = self.embed_texts(vec![query.to_string()])?;
519 let vector = embeddings
520 .first()
521 .cloned()
522 .ok_or_else(|| "embedding model returned no query vector".to_string())?;
523
524 if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
525 if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
526 self.query_embedding_cache.remove(&oldest);
527 }
528 }
529 self.query_embedding_cache
530 .insert(query.to_string(), vector.clone());
531 self.query_embedding_cache_order
532 .push_back(query.to_string());
533
534 Ok(vector)
535 }
536
537 pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
538 (
539 self.query_embedding_cache_hits,
540 self.query_embedding_cache_misses,
541 self.query_embedding_cache.len(),
542 )
543 }
544
545 fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
546 match &mut self.engine {
547 SemanticEmbeddingEngine::Fastembed(model) => model
548 .embed(texts, None::<usize>)
549 .map_err(|error| format_embedding_init_error(error.to_string()))
550 .map_err(|error| format!("failed to embed batch: {error}")),
551 SemanticEmbeddingEngine::OpenAiCompatible {
552 client,
553 model,
554 base_url,
555 api_key,
556 } => {
557 let expected_text_count = texts.len();
558 let endpoint = build_openai_embeddings_endpoint(base_url);
559 let body = serde_json::json!({
560 "input": texts,
561 "model": model,
562 });
563
564 let raw = send_embedding_request(
565 || {
566 let mut request = client.post(&endpoint).json(&body);
576
577 if let Some(api_key) = api_key {
578 request = request.header("Authorization", format!("Bearer {api_key}"));
579 }
580
581 request
582 },
583 "openai compatible",
584 )?;
585
586 #[derive(Deserialize)]
587 struct OpenAiResponse {
588 data: Vec<OpenAiEmbeddingResult>,
589 }
590
591 #[derive(Deserialize)]
592 struct OpenAiEmbeddingResult {
593 embedding: Vec<f32>,
594 index: Option<u32>,
595 }
596
597 let parsed: OpenAiResponse = serde_json::from_str(&raw)
598 .map_err(|error| format!("invalid openai compatible response: {error}"))?;
599 if parsed.data.len() != expected_text_count {
600 return Err(format!(
601 "openai compatible response returned {} embeddings for {} inputs",
602 parsed.data.len(),
603 expected_text_count
604 ));
605 }
606
607 let mut vectors = vec![Vec::new(); parsed.data.len()];
608 for (i, item) in parsed.data.into_iter().enumerate() {
609 let index = item.index.unwrap_or(i as u32) as usize;
610 if index >= vectors.len() {
611 return Err(
612 "openai compatible response contains invalid vector index".to_string()
613 );
614 }
615 vectors[index] = item.embedding;
616 }
617
618 for vector in &vectors {
619 if vector.is_empty() {
620 return Err(
621 "openai compatible response contained missing vectors".to_string()
622 );
623 }
624 }
625
626 self.dimension = vectors.first().map(Vec::len);
627 Ok(vectors)
628 }
629 SemanticEmbeddingEngine::Ollama {
630 client,
631 model,
632 base_url,
633 } => {
634 let expected_text_count = texts.len();
635 let endpoint = build_ollama_embeddings_endpoint(base_url);
636
637 #[derive(Serialize)]
638 struct OllamaPayload<'a> {
639 model: &'a str,
640 input: Vec<String>,
641 }
642
643 let payload = OllamaPayload {
644 model,
645 input: texts,
646 };
647
648 let raw = send_embedding_request(
649 || {
650 client.post(&endpoint).json(&payload)
655 },
656 "ollama",
657 )?;
658
659 #[derive(Deserialize)]
660 struct OllamaResponse {
661 embeddings: Vec<Vec<f32>>,
662 }
663
664 let parsed: OllamaResponse = serde_json::from_str(&raw)
665 .map_err(|error| format!("invalid ollama response: {error}"))?;
666 if parsed.embeddings.is_empty() {
667 return Err("ollama response returned no embeddings".to_string());
668 }
669 if parsed.embeddings.len() != expected_text_count {
670 return Err(format!(
671 "ollama response returned {} embeddings for {} inputs",
672 parsed.embeddings.len(),
673 expected_text_count
674 ));
675 }
676
677 let vectors = parsed.embeddings;
678 for vector in &vectors {
679 if vector.is_empty() {
680 return Err("ollama response contained empty embeddings".to_string());
681 }
682 }
683
684 self.dimension = vectors.first().map(Vec::len);
685 Ok(vectors)
686 }
687 }
688 }
689}
690
/// Checks, before the embedding model is initialized, that an ONNX Runtime
/// shared library can be loaded and that its version (when one can be read
/// from the library's file name) is 1.20 or newer within major version 1.
///
/// The library path is taken from `ORT_DYLIB_PATH` when set, otherwise the
/// platform's default library name is used. The check runs on Linux and
/// macOS only; on other targets the function returns `Ok(())`.
///
/// # Errors
/// Returns a message when the library cannot be loaded or its detected
/// version is outside the accepted range.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated C string that lives across the
        // `dlopen` call; the `dlerror` pointer is checked for null before it is
        // read; `dlclose` is only reached with a non-null handle.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // The version is derived from file names, not from the loaded library.
            let detected_version = detect_ort_version_from_path(lib_name);

            libc::dlclose(handle);

            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                // A version whose major/minor parts do not parse is not rejected.
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // No validation is performed on Windows; this only marks the variable as used.
        let _ = dylib_path;
    }

    Ok(())
}
753
754#[cfg(any(test, target_os = "linux", target_os = "macos"))]
757fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
758 let path = std::path::Path::new(lib_path);
759
760 for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
762 .into_iter()
763 .flatten()
764 {
765 if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
766 if let Some(version) = extract_version_from_filename(name) {
767 return Some(version);
768 }
769 }
770 }
771
772 if let Some(parent) = path.parent() {
774 if let Ok(entries) = std::fs::read_dir(parent) {
775 for entry in entries.flatten() {
776 if let Some(name) = entry.file_name().to_str() {
777 if name.starts_with("libonnxruntime") {
778 if let Some(version) = extract_version_from_filename(name) {
779 return Some(version);
780 }
781 }
782 }
783 }
784 }
785 }
786
787 None
788}
789
790#[cfg(any(test, target_os = "linux", target_os = "macos"))]
792fn extract_version_from_filename(name: &str) -> Option<String> {
793 let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
795 re.find(name).map(|m| m.as_str().to_string())
796}
797
/// Returns the shell command suggested for removing the library at `lib_path`.
///
/// A path under `/usr/local/lib`, or a bare default library name, gets the
/// platform-specific system-wide removal command; any other path gets a plain
/// `rm` of that path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return " sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return " sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return " Delete the ONNX Runtime DLL from your PATH".to_string();
    }

    format!(" rm '{lib_path}'")
}
813
814#[cfg(any(test, target_os = "linux", target_os = "macos"))]
820pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
821 format!(
822 "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
823 Solutions:\n\
824 1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
825 This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
826 configures the bridge to load it instead of the system library — no \
827 changes to '{}'.\n\
828 2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
829 {}\n\
830 3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
831 4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
832 version,
833 lib_name,
834 lib_name,
835 suggest_removal_command(lib_name),
836 )
837}
838
839pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
840 pre_validate_onnx_runtime()?;
842
843 let selected_model = match model {
844 "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
845 _ => {
846 return Err(format!(
847 "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
848 model
849 ))
850 }
851 };
852
853 TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
854}
855
/// Classifies an error message as "ONNX Runtime could not be loaded".
///
/// A message that starts (after leading whitespace) with
/// `ONNX Runtime not found.` always qualifies. Otherwise the lower-cased
/// message must mention both the runtime and a dynamic-loading failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
881
882fn format_embedding_init_error(error: impl Display) -> String {
883 let message = error.to_string();
884
885 if is_onnx_runtime_unavailable(&message) {
886 return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
887 }
888
889 format!("failed to initialize semantic embedding model: {message}")
890}
891
/// One unit of source text that gets its own embedding vector.
///
/// NOTE(review): chunks are produced by `collect_file_chunks`, which is not
/// visible here; `name`, `kind`, the line range and `exported` presumably
/// describe the symbol the chunk was extracted from — confirm there.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// File the chunk belongs to; index entries are dropped by this path on refresh.
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    /// Text handed to the embedding function for this chunk.
    pub embed_text: String,
    pub snippet: String,
}
911
/// A chunk paired with the embedding vector computed from its `embed_text`.
#[derive(Debug)]
struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}
918
/// In-memory collection of embedded chunks plus the per-file metadata used to
/// detect stale files.
#[derive(Debug)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    /// Modification time recorded for each indexed file.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Size recorded for each indexed file.
    file_sizes: HashMap<PathBuf, u64>,
    /// Length of the stored vectors; `DEFAULT_DIMENSION` while the index is empty.
    dimension: usize,
    /// Not set by the build functions in this part of the file.
    fingerprint: Option<SemanticIndexFingerprint>,
}
931
/// Modification time and size captured for a file when it is indexed.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
}
937
/// Counts reported by an incremental refresh of the index.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Already indexed files that were found stale.
    pub changed: usize,
    /// Files that were not in the index before.
    pub added: usize,
    /// Indexed files that are no longer in the current file list.
    pub deleted: usize,
    /// Number of distinct files considered (current files plus indexed files).
    pub total_processed: usize,
}
947
948impl RefreshSummary {
949 pub fn is_noop(&self) -> bool {
951 self.changed == 0 && self.added == 0 && self.deleted == 0
952 }
953}
954
/// NOTE(review): not constructed in this part of the file; presumably one
/// search hit — the chunk's location and snippet plus a relevance `score`
/// and a label for where the hit came from. Confirm at the construction site.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    pub score: f32,
    pub source: &'static str,
}
968
969impl SemanticIndex {
970 pub fn new() -> Self {
971 Self {
972 entries: Vec::new(),
973 file_mtimes: HashMap::new(),
974 file_sizes: HashMap::new(),
975 dimension: DEFAULT_DIMENSION, fingerprint: None,
977 }
978 }
979
980 pub fn entry_count(&self) -> usize {
982 self.entries.len()
983 }
984
985 pub fn status_label(&self) -> &'static str {
987 if self.entries.is_empty() {
988 "empty"
989 } else {
990 "ready"
991 }
992 }
993
994 fn collect_chunks(
995 project_root: &Path,
996 files: &[PathBuf],
997 ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
998 let per_file: Vec<(
999 PathBuf,
1000 Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1001 )> = files
1002 .par_iter()
1003 .map_init(HashMap::new, |parsers, file| {
1004 let result = collect_file_metadata(file).and_then(|metadata| {
1005 collect_file_chunks(project_root, file, parsers)
1006 .map(|chunks| (metadata, chunks))
1007 });
1008 (file.clone(), result)
1009 })
1010 .collect();
1011
1012 let mut chunks: Vec<SemanticChunk> = Vec::new();
1013 let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1014
1015 for (file, result) in per_file {
1016 match result {
1017 Ok((metadata, file_chunks)) => {
1018 file_metadata.insert(file, metadata);
1019 chunks.extend(file_chunks);
1020 }
1021 Err(error) => {
1022 if error == "unsupported file extension" {
1028 continue;
1029 }
1030 slog_warn!(
1031 "failed to collect semantic chunks for {}: {}",
1032 file.display(),
1033 error
1034 );
1035 }
1036 }
1037 }
1038
1039 (chunks, file_metadata)
1040 }
1041
1042 fn build_from_chunks<F, P>(
1043 chunks: Vec<SemanticChunk>,
1044 file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1045 embed_fn: &mut F,
1046 max_batch_size: usize,
1047 mut progress: Option<&mut P>,
1048 ) -> Result<Self, String>
1049 where
1050 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1051 P: FnMut(usize, usize),
1052 {
1053 let total_chunks = chunks.len();
1054
1055 if chunks.is_empty() {
1056 return Ok(Self {
1057 entries: Vec::new(),
1058 file_mtimes: file_metadata
1059 .iter()
1060 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1061 .collect(),
1062 file_sizes: file_metadata
1063 .into_iter()
1064 .map(|(path, metadata)| (path, metadata.size))
1065 .collect(),
1066 dimension: DEFAULT_DIMENSION,
1067 fingerprint: None,
1068 });
1069 }
1070
1071 let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1073 let mut expected_dimension: Option<usize> = None;
1074 let batch_size = max_batch_size.max(1);
1075 for batch_start in (0..chunks.len()).step_by(batch_size) {
1076 let batch_end = (batch_start + batch_size).min(chunks.len());
1077 let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1078 .iter()
1079 .map(|c| c.embed_text.clone())
1080 .collect();
1081
1082 let vectors = embed_fn(batch_texts)?;
1083 validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1084
1085 if let Some(dim) = vectors.first().map(|v| v.len()) {
1087 match expected_dimension {
1088 None => expected_dimension = Some(dim),
1089 Some(expected) if dim != expected => {
1090 return Err(format!(
1091 "embedding dimension changed across batches: expected {expected}, got {dim}"
1092 ));
1093 }
1094 _ => {}
1095 }
1096 }
1097
1098 for (i, vector) in vectors.into_iter().enumerate() {
1099 let chunk_idx = batch_start + i;
1100 entries.push(EmbeddingEntry {
1101 chunk: chunks[chunk_idx].clone(),
1102 vector,
1103 });
1104 }
1105
1106 if let Some(callback) = progress.as_mut() {
1107 callback(entries.len(), total_chunks);
1108 }
1109 }
1110
1111 let dimension = entries
1112 .first()
1113 .map(|e| e.vector.len())
1114 .unwrap_or(DEFAULT_DIMENSION);
1115
1116 Ok(Self {
1117 entries,
1118 file_mtimes: file_metadata
1119 .iter()
1120 .map(|(path, metadata)| (path.clone(), metadata.mtime))
1121 .collect(),
1122 file_sizes: file_metadata
1123 .into_iter()
1124 .map(|(path, metadata)| (path, metadata.size))
1125 .collect(),
1126 dimension,
1127 fingerprint: None,
1128 })
1129 }
1130
1131 pub fn build<F>(
1134 project_root: &Path,
1135 files: &[PathBuf],
1136 embed_fn: &mut F,
1137 max_batch_size: usize,
1138 ) -> Result<Self, String>
1139 where
1140 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1141 {
1142 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1143 Self::build_from_chunks(
1144 chunks,
1145 file_mtimes,
1146 embed_fn,
1147 max_batch_size,
1148 Option::<&mut fn(usize, usize)>::None,
1149 )
1150 }
1151
1152 pub fn build_with_progress<F, P>(
1154 project_root: &Path,
1155 files: &[PathBuf],
1156 embed_fn: &mut F,
1157 max_batch_size: usize,
1158 progress: &mut P,
1159 ) -> Result<Self, String>
1160 where
1161 F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1162 P: FnMut(usize, usize),
1163 {
1164 let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1165 let total_chunks = chunks.len();
1166 progress(0, total_chunks);
1167 Self::build_from_chunks(
1168 chunks,
1169 file_mtimes,
1170 embed_fn,
1171 max_batch_size,
1172 Some(progress),
1173 )
1174 }
1175
    /// Incrementally brings the index in line with `current_files`.
    ///
    /// Indexed files that are absent from `current_files` are dropped, indexed
    /// files reported stale by `is_file_stale` are re-chunked and re-embedded,
    /// and files not yet tracked in `file_mtimes` are embedded for the first
    /// time.
    ///
    /// `embed_fn` receives batches of at most `max_batch_size` texts (a value
    /// of 0 is treated as 1). `progress` is called with
    /// `(embedded_chunks, total_chunks)`; `(0, 0)` is reported whenever there
    /// is nothing to embed.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when a batch's vector dimension differs from the dimension
    /// already in use. Deletions applied before such a failure are not rolled
    /// back; entries of changed files are only replaced after every batch
    /// succeeded.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Size of the union of on-disk files and indexed files:
        // |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Classify every indexed file as deleted, changed, or untouched.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        for indexed_path in self.file_mtimes.keys() {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            if self.is_file_stale(indexed_path) {
                changed.push(indexed_path.clone());
            }
        }

        // Files present on disk that the index has never seen.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Nothing to do: report an empty refresh.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // Deletions need no embedding, so apply them right away.
        if !deleted.is_empty() {
            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
            self.entries
                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
            for path in &deleted {
                self.file_mtimes.remove(path);
                self.file_sizes.remove(path);
            }
        }

        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        // Only deletions happened.
        if to_embed.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

        // Files were collected but produced no chunks: drop their old entries
        // and record the fresh metadata without calling the embedder.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            // Only files that have fresh metadata count as changed/added.
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file, metadata.size);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        // Guard against a zero batch size, which `step_by` would reject.
        let batch_size = max_batch_size.max(1);
        // The stored dimension is only binding while entries using it remain.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        // New entries are staged here and merged only after all batches succeed.
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // Compare the batch's first vector against the expected dimension.
            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                            cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Replace the old entries of every successfully collected file.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file, metadata.size);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1374
1375 pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1377 if self.entries.is_empty() || query_vector.len() != self.dimension {
1378 return Vec::new();
1379 }
1380
1381 let mut scored: Vec<(f32, usize)> = self
1382 .entries
1383 .iter()
1384 .enumerate()
1385 .map(|(i, entry)| {
1386 let mut score = cosine_similarity(query_vector, &entry.vector);
1387 if entry.chunk.exported {
1388 score *= 1.1;
1389 }
1390 (score, i)
1391 })
1392 .collect();
1393
1394 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1396
1397 scored
1398 .into_iter()
1399 .take(top_k)
1400 .map(|(score, idx)| {
1404 let entry = &self.entries[idx];
1405 SemanticResult {
1406 file: entry.chunk.file.clone(),
1407 name: entry.chunk.name.clone(),
1408 kind: entry.chunk.kind.clone(),
1409 start_line: entry.chunk.start_line,
1410 end_line: entry.chunk.end_line,
1411 exported: entry.chunk.exported,
1412 snippet: entry.chunk.snippet.clone(),
1413 score,
1414 source: "semantic",
1415 }
1416 })
1417 .collect()
1418 }
1419
    /// Number of embedded chunks currently held by the index.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
1424
1425 pub fn is_file_stale(&self, file: &Path) -> bool {
1427 let Some(stored_mtime) = self.file_mtimes.get(file) else {
1428 return true;
1429 };
1430 let Some(stored_size) = self.file_sizes.get(file) else {
1431 return true;
1432 };
1433 match collect_file_metadata(file) {
1434 Ok(current) => *stored_mtime != current.mtime || *stored_size != current.size,
1435 Err(_) => true,
1436 }
1437 }
1438
1439 fn backfill_missing_file_sizes(&mut self) {
1440 for path in self.file_mtimes.keys() {
1441 if self.file_sizes.contains_key(path) {
1442 continue;
1443 }
1444 if let Ok(metadata) = fs::metadata(path) {
1445 self.file_sizes.insert(path.clone(), metadata.len());
1446 }
1447 }
1448 }
1449
    /// Removes `file` from the index; delegates to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
1454
1455 pub fn invalidate_file(&mut self, file: &Path) {
1456 self.entries.retain(|e| e.chunk.file != file);
1457 self.file_mtimes.remove(file);
1458 self.file_sizes.remove(file);
1459 }
1460
    /// Embedding dimension the index currently expects from query vectors.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
1465
    /// The fingerprint attached to this index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
1469
1470 pub fn backend_label(&self) -> Option<&str> {
1471 self.fingerprint.as_ref().map(|f| f.backend.as_str())
1472 }
1473
1474 pub fn model_label(&self) -> Option<&str> {
1475 self.fingerprint.as_ref().map(|f| f.model.as_str())
1476 }
1477
    /// Attaches `fingerprint` to the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
1481
1482 pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1484 if self.entries.is_empty() {
1487 slog_info!("skipping semantic index persistence (0 entries)");
1488 return;
1489 }
1490 let dir = storage_dir.join("semantic").join(project_key);
1491 if let Err(e) = fs::create_dir_all(&dir) {
1492 slog_warn!("failed to create semantic cache dir: {}", e);
1493 return;
1494 }
1495 let data_path = dir.join("semantic.bin");
1496 let tmp_path = dir.join(format!(
1497 "semantic.bin.tmp.{}.{}",
1498 std::process::id(),
1499 SystemTime::now()
1500 .duration_since(SystemTime::UNIX_EPOCH)
1501 .unwrap_or(Duration::ZERO)
1502 .as_nanos()
1503 ));
1504 let bytes = self.to_bytes();
1505 let write_result = (|| -> std::io::Result<()> {
1506 use std::io::Write;
1507 let mut file = fs::File::create(&tmp_path)?;
1508 file.write_all(&bytes)?;
1509 file.sync_all()?;
1510 Ok(())
1511 })();
1512 if let Err(e) = write_result {
1513 slog_warn!("failed to write semantic index: {}", e);
1514 let _ = fs::remove_file(&tmp_path);
1515 return;
1516 }
1517 if let Err(e) = fs::rename(&tmp_path, &data_path) {
1518 slog_warn!("failed to rename semantic index: {}", e);
1519 let _ = fs::remove_file(&tmp_path);
1520 return;
1521 }
1522 slog_info!(
1523 "semantic index persisted: {} entries, {:.1} KB",
1524 self.entries.len(),
1525 bytes.len() as f64 / 1024.0
1526 );
1527 }
1528
1529 pub fn read_from_disk(
1531 storage_dir: &Path,
1532 project_key: &str,
1533 expected_fingerprint: Option<&str>,
1534 ) -> Option<Self> {
1535 let data_path = storage_dir
1536 .join("semantic")
1537 .join(project_key)
1538 .join("semantic.bin");
1539 let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1540 if file_len < HEADER_BYTES_V1 {
1541 slog_warn!(
1542 "corrupt semantic index (too small: {} bytes), removing",
1543 file_len
1544 );
1545 let _ = fs::remove_file(&data_path);
1546 return None;
1547 }
1548
1549 let bytes = fs::read(&data_path).ok()?;
1550 let version = bytes[0];
1551 if version != SEMANTIC_INDEX_VERSION_V5 {
1552 slog_info!(
1553 "cached semantic index version {} is older than {}, rebuilding",
1554 version,
1555 SEMANTIC_INDEX_VERSION_V5
1556 );
1557 let _ = fs::remove_file(&data_path);
1558 return None;
1559 }
1560 match Self::from_bytes(&bytes) {
1561 Ok(index) => {
1562 if index.entries.is_empty() {
1563 slog_info!("cached semantic index is empty, will rebuild");
1564 let _ = fs::remove_file(&data_path);
1565 return None;
1566 }
1567 if let Some(expected) = expected_fingerprint {
1568 let matches = index
1569 .fingerprint()
1570 .map(|fingerprint| fingerprint.matches_expected(expected))
1571 .unwrap_or(false);
1572 if !matches {
1573 slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1574 let _ = fs::remove_file(&data_path);
1575 return None;
1576 }
1577 }
1578 slog_info!(
1579 "loaded semantic index from disk: {} entries",
1580 index.entries.len()
1581 );
1582 Some(index)
1583 }
1584 Err(e) => {
1585 slog_warn!("corrupt semantic index, rebuilding: {}", e);
1586 let _ = fs::remove_file(&data_path);
1587 None
1588 }
1589 }
1590 }
1591
1592 pub fn to_bytes(&self) -> Vec<u8> {
1594 let mut buf = Vec::new();
1595 let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1596 let encoded = fingerprint.as_string();
1597 if encoded.is_empty() {
1598 None
1599 } else {
1600 Some(encoded.into_bytes())
1601 }
1602 });
1603
1604 let version = SEMANTIC_INDEX_VERSION_V5;
1616 buf.push(version);
1617 buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1618 buf.extend_from_slice(&(self.entries.len() as u32).to_le_bytes());
1619 let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1620 buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1621 buf.extend_from_slice(fp_bytes_ref);
1622
1623 buf.extend_from_slice(&(self.file_mtimes.len() as u32).to_le_bytes());
1626 for (path, mtime) in &self.file_mtimes {
1627 let path_bytes = path.to_string_lossy().as_bytes().to_vec();
1628 buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1629 buf.extend_from_slice(&path_bytes);
1630 let duration = mtime
1631 .duration_since(SystemTime::UNIX_EPOCH)
1632 .unwrap_or_default();
1633 buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1634 buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1635 let size = self.file_sizes.get(path).copied().unwrap_or_default();
1636 buf.extend_from_slice(&size.to_le_bytes());
1637 }
1638
1639 for entry in &self.entries {
1641 let c = &entry.chunk;
1642
1643 let file_bytes = c.file.to_string_lossy().as_bytes().to_vec();
1645 buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1646 buf.extend_from_slice(&file_bytes);
1647
1648 let name_bytes = c.name.as_bytes();
1650 buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1651 buf.extend_from_slice(name_bytes);
1652
1653 buf.push(symbol_kind_to_u8(&c.kind));
1655
1656 buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1658 buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1659 buf.push(c.exported as u8);
1660
1661 let snippet_bytes = c.snippet.as_bytes();
1663 buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1664 buf.extend_from_slice(snippet_bytes);
1665
1666 let embed_bytes = c.embed_text.as_bytes();
1668 buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1669 buf.extend_from_slice(embed_bytes);
1670
1671 for &val in &entry.vector {
1673 buf.extend_from_slice(&val.to_le_bytes());
1674 }
1675 }
1676
1677 buf
1678 }
1679
    /// Decodes an index from its binary form (format versions 1 through 5).
    ///
    /// # Errors
    ///
    /// Returns a descriptive message when the data is truncated, uses an
    /// unknown version, declares an invalid dimension or too many entries,
    /// contains an invalid fingerprint or mtime, or holds an entry whose file
    /// has no metadata record.
    pub fn from_bytes(data: &[u8]) -> Result<Self, String> {
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
        {
            return Err(format!("unsupported version: {}", version));
        }
        // Versions 2+ have a longer header because of the fingerprint length.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        // Reject implausible header values before allocating anything.
        if dimension == 0 || dimension > MAX_DIMENSION {
            return Err(format!("invalid embedding dimension: {}", dimension));
        }
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            // A zero-length fingerprint means "none recorded".
            if fingerprint_len == 0 {
                None
            } else {
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // The vectors alone must fit in the remaining data; this catches a
        // bogus entry count before the entry buffer is allocated.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            // Sub-second precision is only stored from version 3 onwards.
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            // File sizes are only stored in version 5; older data gets 0.
            let size = if version == SEMANTIC_INDEX_VERSION_V5 {
                read_u64(data, &mut pos)?
            } else {
                0
            };
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            let path = PathBuf::from(path);
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path, size);
        }

        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let file = PathBuf::from(read_string(data, &mut pos)?);
            let name = read_string(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            // Decode `dimension` little-endian f32 values.
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        // NOTE(review): the loop above either pushes exactly `entry_count`
        // entries or returns early, so this check looks purely defensive.
        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must belong to a file with a metadata record.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            dimension,
            fingerprint,
        })
    }
1887}
1888
1889fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
1891 let relative = file
1892 .strip_prefix(project_root)
1893 .unwrap_or(file)
1894 .to_string_lossy();
1895
1896 let kind_label = match &symbol.kind {
1897 SymbolKind::Function => "function",
1898 SymbolKind::Class => "class",
1899 SymbolKind::Method => "method",
1900 SymbolKind::Struct => "struct",
1901 SymbolKind::Interface => "interface",
1902 SymbolKind::Enum => "enum",
1903 SymbolKind::TypeAlias => "type",
1904 SymbolKind::Variable => "variable",
1905 SymbolKind::Heading => "heading",
1906 SymbolKind::FileSummary => "file-summary",
1907 };
1908
1909 let name = &symbol.name;
1911 let mut text = format!(
1912 "name:{name} file:{} kind:{} name:{name}",
1913 relative, kind_label
1914 );
1915
1916 if let Some(sig) = &symbol.signature {
1917 text.push_str(&format!(" signature:{}", sig));
1918 }
1919
1920 let lines: Vec<&str> = source.lines().collect();
1922 let start = (symbol.range.start_line as usize).min(lines.len());
1923 let end = (symbol.range.end_line as usize + 1).min(lines.len());
1925 if start < end {
1926 let body: String = lines[start..end]
1927 .iter()
1928 .take(15) .copied()
1930 .collect::<Vec<&str>>()
1931 .join("\n");
1932 let snippet = if body.len() > 300 {
1933 format!("{}...", &body[..body.floor_char_boundary(300)])
1934 } else {
1935 body
1936 };
1937 text.push_str(&format!(" body:{}", snippet));
1938 }
1939
1940 text
1941}
1942
/// Returns the first `max_chars` characters of `value` (not bytes), or all of
/// `value` when it is shorter.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
1946
1947fn first_leading_doc_comment(source: &str) -> String {
1948 let lines: Vec<&str> = source.lines().collect();
1949 let Some((start, first)) = lines
1950 .iter()
1951 .enumerate()
1952 .find(|(_, line)| !line.trim().is_empty())
1953 else {
1954 return String::new();
1955 };
1956
1957 let trimmed = first.trim_start();
1958 if trimmed.starts_with("/**") {
1959 let mut comment = Vec::new();
1960 for line in lines.iter().skip(start) {
1961 comment.push(*line);
1962 if line.contains("*/") {
1963 break;
1964 }
1965 }
1966 return truncate_chars(&comment.join("\n"), 200);
1967 }
1968
1969 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
1970 let comment = lines
1971 .iter()
1972 .skip(start)
1973 .take_while(|line| {
1974 let trimmed = line.trim_start();
1975 trimmed.starts_with("///") || trimmed.starts_with("//!")
1976 })
1977 .copied()
1978 .collect::<Vec<_>>()
1979 .join("\n");
1980 return truncate_chars(&comment, 200);
1981 }
1982
1983 String::new()
1984}
1985
1986pub fn build_file_summary_chunk(
1987 file: &Path,
1988 project_root: &Path,
1989 source: &str,
1990 top_exports: &[&str],
1991 top_export_signatures: &[Option<&str>],
1992) -> SemanticChunk {
1993 let relative = file.strip_prefix(project_root).unwrap_or(file);
1994 let rel_path = relative.to_string_lossy();
1995 let parent_dir = relative
1996 .parent()
1997 .map(|parent| parent.to_string_lossy().to_string())
1998 .unwrap_or_default();
1999 let name = file
2000 .file_stem()
2001 .map(|stem| stem.to_string_lossy().to_string())
2002 .unwrap_or_default();
2003 let doc = first_leading_doc_comment(source);
2004 let exports = top_exports
2005 .iter()
2006 .take(5)
2007 .copied()
2008 .collect::<Vec<_>>()
2009 .join(",");
2010 let snippet = if doc.is_empty() {
2011 top_export_signatures
2012 .first()
2013 .and_then(|signature| signature.as_deref())
2014 .map(|signature| truncate_chars(signature, 200))
2015 .unwrap_or_default()
2016 } else {
2017 doc.clone()
2018 };
2019
2020 SemanticChunk {
2021 file: file.to_path_buf(),
2022 name,
2023 kind: SymbolKind::FileSummary,
2024 start_line: 0,
2025 end_line: 0,
2026 exported: false,
2027 embed_text: format!(
2028 "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2029 file.file_stem()
2030 .map(|stem| stem.to_string_lossy().to_string())
2031 .unwrap_or_default()
2032 ),
2033 snippet,
2034 }
2035}
2036
2037fn parser_for(
2038 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2039 lang: crate::parser::LangId,
2040) -> Result<&mut Parser, String> {
2041 use std::collections::hash_map::Entry;
2042
2043 match parsers.entry(lang) {
2044 Entry::Occupied(entry) => Ok(entry.into_mut()),
2045 Entry::Vacant(entry) => {
2046 let grammar = grammar_for(lang);
2047 let mut parser = Parser::new();
2048 parser
2049 .set_language(&grammar)
2050 .map_err(|error| error.to_string())?;
2051 Ok(entry.insert(parser))
2052 }
2053 }
2054}
2055
/// Reports whether `path` has a file extension covered by semantic indexing.
///
/// The comparison is case-sensitive; paths without a UTF-8 extension are
/// never indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2083
2084fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2085 let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2086 let mtime = metadata.modified().map_err(|error| error.to_string())?;
2087 Ok(IndexedFileMetadata {
2088 mtime,
2089 size: metadata.len(),
2090 })
2091}
2092
2093fn collect_file_chunks(
2094 project_root: &Path,
2095 file: &Path,
2096 parsers: &mut HashMap<crate::parser::LangId, Parser>,
2097) -> Result<Vec<SemanticChunk>, String> {
2098 if !is_semantic_indexed_extension(file) {
2099 return Err("unsupported file extension".to_string());
2100 }
2101 let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2102 let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2103 let tree = parser_for(parsers, lang)?
2104 .parse(&source, None)
2105 .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2106 let symbols =
2107 extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2108
2109 Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2110}
2111
2112fn build_snippet(symbol: &Symbol, source: &str) -> String {
2114 let lines: Vec<&str> = source.lines().collect();
2115 let start = (symbol.range.start_line as usize).min(lines.len());
2116 let end = (symbol.range.end_line as usize + 1).min(lines.len());
2118 if start < end {
2119 let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2120 let mut snippet = snippet_lines.join("\n");
2121 if end - start > 5 {
2122 snippet.push_str("\n ...");
2123 }
2124 if snippet.len() > 300 {
2125 snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2126 }
2127 snippet
2128 } else {
2129 String::new()
2130 }
2131}
2132
2133fn symbols_to_chunks(
2135 file: &Path,
2136 symbols: &[Symbol],
2137 source: &str,
2138 project_root: &Path,
2139) -> Vec<SemanticChunk> {
2140 let mut chunks = Vec::new();
2141 let top_exports_with_signatures = symbols
2142 .iter()
2143 .filter(|symbol| {
2144 symbol.exported
2145 && symbol.parent.is_none()
2146 && !matches!(symbol.kind, SymbolKind::Heading)
2147 })
2148 .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2149 .collect::<Vec<_>>();
2150
2151 let has_only_headings = !symbols.is_empty()
2152 && symbols
2153 .iter()
2154 .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2155 if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2156 let top_exports = top_exports_with_signatures
2157 .iter()
2158 .map(|(name, _)| *name)
2159 .collect::<Vec<_>>();
2160 let top_export_signatures = top_exports_with_signatures
2161 .iter()
2162 .map(|(_, signature)| *signature)
2163 .collect::<Vec<_>>();
2164 chunks.push(build_file_summary_chunk(
2165 file,
2166 project_root,
2167 source,
2168 &top_exports,
2169 &top_export_signatures,
2170 ));
2171 }
2172
2173 for symbol in symbols {
2174 if matches!(symbol.kind, SymbolKind::Heading) {
2179 continue;
2180 }
2181
2182 let line_count = symbol
2184 .range
2185 .end_line
2186 .saturating_sub(symbol.range.start_line)
2187 + 1;
2188 if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2189 continue;
2190 }
2191
2192 let embed_text = build_embed_text(symbol, source, file, project_root);
2193 let snippet = build_snippet(symbol, source);
2194
2195 chunks.push(SemanticChunk {
2196 file: file.to_path_buf(),
2197 name: symbol.name.clone(),
2198 kind: symbol.kind.clone(),
2199 start_line: symbol.range.start_line,
2200 end_line: symbol.range.end_line,
2201 exported: symbol.exported,
2202 embed_text,
2203 snippet,
2204 });
2205
2206 }
2209
2210 chunks
2211}
2212
/// Cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when either vector has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulates the dot product and both squared norms.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let denom = sq_a.sqrt() * sq_b.sqrt();
    if denom == 0.0 {
        0.0
    } else {
        dot / denom
    }
}
2236
2237fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2239 match kind {
2240 SymbolKind::Function => 0,
2241 SymbolKind::Class => 1,
2242 SymbolKind::Method => 2,
2243 SymbolKind::Struct => 3,
2244 SymbolKind::Interface => 4,
2245 SymbolKind::Enum => 5,
2246 SymbolKind::TypeAlias => 6,
2247 SymbolKind::Variable => 7,
2248 SymbolKind::Heading => 8,
2249 SymbolKind::FileSummary => 9,
2250 }
2251}
2252
2253fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2254 match v {
2255 0 => SymbolKind::Function,
2256 1 => SymbolKind::Class,
2257 2 => SymbolKind::Method,
2258 3 => SymbolKind::Struct,
2259 4 => SymbolKind::Interface,
2260 5 => SymbolKind::Enum,
2261 6 => SymbolKind::TypeAlias,
2262 7 => SymbolKind::Variable,
2263 8 => SymbolKind::Heading,
2264 9 => SymbolKind::FileSummary,
2265 _ => SymbolKind::Heading,
2266 }
2267}
2268
/// Reads a little-endian `u32` at `*pos` and advances `*pos` past it.
///
/// # Errors
///
/// Fails, leaving `*pos` untouched, when fewer than four bytes remain. The
/// bounds computation is overflow-checked, so an out-of-range `*pos` yields
/// an error rather than a panic or a wrapped index.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let end = pos
        .checked_add(4)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut raw = [0u8; 4];
    raw.copy_from_slice(&data[*pos..end]);
    *pos = end;
    Ok(u32::from_le_bytes(raw))
}
2277
/// Reads a little-endian `u64` at `*pos` and advances `*pos` past it.
///
/// # Errors
///
/// Fails, leaving `*pos` untouched, when fewer than eight bytes remain. The
/// bounds computation is overflow-checked, so an out-of-range `*pos` yields
/// an error rather than a panic or a wrapped index.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let end = pos
        .checked_add(8)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut raw = [0u8; 8];
    raw.copy_from_slice(&data[*pos..end]);
    *pos = end;
    Ok(u64::from_le_bytes(raw))
}
2286
2287fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2288 let len = read_u32(data, pos)? as usize;
2289 if *pos + len > data.len() {
2290 return Err("unexpected end of data reading string".to_string());
2291 }
2292 let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2293 *pos += len;
2294 Ok(s)
2295}
2296
2297#[cfg(test)]
2298mod tests {
2299 use super::*;
2300 use crate::config::{SemanticBackend, SemanticBackendConfig};
2301 use crate::parser::FileParser;
2302 use std::io::{Read, Write};
2303 use std::net::TcpListener;
2304 use std::thread;
2305
2306 fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2307 where
2308 F: Fn(String, String, String) -> String + Send + 'static,
2309 {
2310 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2311 let addr = listener.local_addr().expect("local addr");
2312 let handle = thread::spawn(move || {
2313 let (mut stream, _) = listener.accept().expect("accept request");
2314 let mut buf = Vec::new();
2315 let mut chunk = [0u8; 4096];
2316 let mut header_end = None;
2317 let mut content_length = 0usize;
2318 loop {
2319 let n = stream.read(&mut chunk).expect("read request");
2320 if n == 0 {
2321 break;
2322 }
2323 buf.extend_from_slice(&chunk[..n]);
2324 if header_end.is_none() {
2325 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2326 header_end = Some(pos + 4);
2327 let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2328 for line in headers.lines() {
2329 if let Some(value) = line.strip_prefix("Content-Length:") {
2330 content_length = value.trim().parse::<usize>().unwrap_or(0);
2331 }
2332 }
2333 }
2334 }
2335 if let Some(end) = header_end {
2336 if buf.len() >= end + content_length {
2337 break;
2338 }
2339 }
2340 }
2341
2342 let end = header_end.expect("header terminator");
2343 let request = String::from_utf8_lossy(&buf[..end]).to_string();
2344 let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2345 let mut lines = request.lines();
2346 let request_line = lines.next().expect("request line").to_string();
2347 let path = request_line
2348 .split_whitespace()
2349 .nth(1)
2350 .expect("request path")
2351 .to_string();
2352 let response_body = handler(request_line, path, body);
2353 let response = format!(
2354 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2355 response_body.len(),
2356 response_body
2357 );
2358 stream
2359 .write_all(response.as_bytes())
2360 .expect("write response");
2361 });
2362
2363 (format!("http://{}", addr), handle)
2364 }
2365
2366 fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2367 Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2368 }
2369
2370 fn write_rust_file(path: &Path, function_name: &str) {
2371 fs::write(
2372 path,
2373 format!("pub fn {function_name}() -> bool {{\n true\n}}\n"),
2374 )
2375 .unwrap();
2376 }
2377
2378 fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2379 let mut embed = test_vector_for_texts;
2380 SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2381 }
2382
2383 fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2384 index.file_mtimes.insert(file.to_path_buf(), mtime);
2385 index.file_sizes.insert(file.to_path_buf(), size);
2386 }
2387
2388 #[test]
2389 fn test_cosine_similarity_identical() {
2390 let a = vec![1.0, 0.0, 0.0];
2391 let b = vec![1.0, 0.0, 0.0];
2392 assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2393 }
2394
2395 #[test]
2396 fn test_cosine_similarity_orthogonal() {
2397 let a = vec![1.0, 0.0, 0.0];
2398 let b = vec![0.0, 1.0, 0.0];
2399 assert!(cosine_similarity(&a, &b).abs() < 0.001);
2400 }
2401
2402 #[test]
2403 fn test_cosine_similarity_opposite() {
2404 let a = vec![1.0, 0.0, 0.0];
2405 let b = vec![-1.0, 0.0, 0.0];
2406 assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2407 }
2408
2409 #[test]
2410 fn test_serialization_roundtrip() {
2411 let mut index = SemanticIndex::new();
2412 index.entries.push(EmbeddingEntry {
2413 chunk: SemanticChunk {
2414 file: PathBuf::from("/src/main.rs"),
2415 name: "handle_request".to_string(),
2416 kind: SymbolKind::Function,
2417 start_line: 10,
2418 end_line: 25,
2419 exported: true,
2420 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2421 snippet: "fn handle_request() {\n // ...\n}".to_string(),
2422 },
2423 vector: vec![0.1, 0.2, 0.3, 0.4],
2424 });
2425 index.dimension = 4;
2426 index
2427 .file_mtimes
2428 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
2429 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
2430 index.set_fingerprint(SemanticIndexFingerprint {
2431 backend: "fastembed".to_string(),
2432 model: "all-MiniLM-L6-v2".to_string(),
2433 base_url: FALLBACK_BACKEND.to_string(),
2434 dimension: 4,
2435 chunking_version: default_chunking_version(),
2436 });
2437
2438 let bytes = index.to_bytes();
2439 let restored = SemanticIndex::from_bytes(&bytes).unwrap();
2440
2441 assert_eq!(restored.entries.len(), 1);
2442 assert_eq!(restored.entries[0].chunk.name, "handle_request");
2443 assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2444 assert_eq!(restored.dimension, 4);
2445 assert_eq!(restored.backend_label(), Some("fastembed"));
2446 assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2447 }
2448
2449 #[test]
2450 fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2451 let cases = [
2452 (SymbolKind::Function, 0),
2453 (SymbolKind::Class, 1),
2454 (SymbolKind::Method, 2),
2455 (SymbolKind::Struct, 3),
2456 (SymbolKind::Interface, 4),
2457 (SymbolKind::Enum, 5),
2458 (SymbolKind::TypeAlias, 6),
2459 (SymbolKind::Variable, 7),
2460 (SymbolKind::Heading, 8),
2461 (SymbolKind::FileSummary, 9),
2462 ];
2463
2464 for (kind, encoded) in cases {
2465 assert_eq!(symbol_kind_to_u8(&kind), encoded);
2466 assert_eq!(u8_to_symbol_kind(encoded), kind);
2467 }
2468 }
2469
2470 #[test]
2471 fn test_search_top_k() {
2472 let mut index = SemanticIndex::new();
2473 index.dimension = 3;
2474
2475 for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2477 let mut vec = vec![0.0f32; 3];
2478 vec[i] = 1.0; index.entries.push(EmbeddingEntry {
2480 chunk: SemanticChunk {
2481 file: PathBuf::from("/src/lib.rs"),
2482 name: name.to_string(),
2483 kind: SymbolKind::Function,
2484 start_line: (i * 10 + 1) as u32,
2485 end_line: (i * 10 + 5) as u32,
2486 exported: true,
2487 embed_text: format!("kind:function name:{}", name),
2488 snippet: format!("fn {}() {{}}", name),
2489 },
2490 vector: vec,
2491 });
2492 }
2493
2494 let query = vec![0.9, 0.1, 0.0];
2496 let results = index.search(&query, 2);
2497
2498 assert_eq!(results.len(), 2);
2499 assert_eq!(results[0].name, "auth"); assert!(results[0].score > results[1].score);
2501 }
2502
2503 #[test]
2504 fn test_empty_index_search() {
2505 let index = SemanticIndex::new();
2506 let results = index.search(&[0.1, 0.2, 0.3], 10);
2507 assert!(results.is_empty());
2508 }
2509
2510 #[test]
2511 fn single_line_symbol_builds_non_empty_snippet() {
2512 let symbol = Symbol {
2513 name: "answer".to_string(),
2514 kind: SymbolKind::Variable,
2515 range: crate::symbols::Range {
2516 start_line: 0,
2517 start_col: 0,
2518 end_line: 0,
2519 end_col: 24,
2520 },
2521 signature: Some("const answer = 42".to_string()),
2522 scope_chain: Vec::new(),
2523 exported: true,
2524 parent: None,
2525 };
2526 let source = "export const answer = 42;\n";
2527
2528 let snippet = build_snippet(&symbol, source);
2529
2530 assert_eq!(snippet, "export const answer = 42;");
2531 }
2532
2533 #[test]
2534 fn optimized_file_chunk_collection_matches_file_parser_path() {
2535 let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2536 let file = project_root.join("src/semantic_index.rs");
2537 let source = std::fs::read_to_string(&file).unwrap();
2538
2539 let mut legacy_parser = FileParser::new();
2540 let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2541 let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2542
2543 let mut parsers = HashMap::new();
2544 let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2545
2546 assert_eq!(
2547 chunk_fingerprint(&optimized_chunks),
2548 chunk_fingerprint(&legacy_chunks)
2549 );
2550 }
2551
2552 fn chunk_fingerprint(
2553 chunks: &[SemanticChunk],
2554 ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2555 chunks
2556 .iter()
2557 .map(|chunk| {
2558 (
2559 chunk.name.clone(),
2560 chunk.kind.clone(),
2561 chunk.start_line,
2562 chunk.end_line,
2563 chunk.exported,
2564 chunk.embed_text.clone(),
2565 chunk.snippet.clone(),
2566 )
2567 })
2568 .collect()
2569 }
2570
2571 #[test]
2572 fn rejects_oversized_dimension_during_deserialization() {
2573 let mut bytes = Vec::new();
2574 bytes.push(1u8);
2575 bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2576 bytes.extend_from_slice(&0u32.to_le_bytes());
2577 bytes.extend_from_slice(&0u32.to_le_bytes());
2578
2579 assert!(SemanticIndex::from_bytes(&bytes).is_err());
2580 }
2581
2582 #[test]
2583 fn rejects_oversized_entry_count_during_deserialization() {
2584 let mut bytes = Vec::new();
2585 bytes.push(1u8);
2586 bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2587 bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2588 bytes.extend_from_slice(&0u32.to_le_bytes());
2589
2590 assert!(SemanticIndex::from_bytes(&bytes).is_err());
2591 }
2592
2593 #[test]
2594 fn invalidate_file_removes_entries_and_mtime() {
2595 let target = PathBuf::from("/src/main.rs");
2596 let mut index = SemanticIndex::new();
2597 index.entries.push(EmbeddingEntry {
2598 chunk: SemanticChunk {
2599 file: target.clone(),
2600 name: "main".to_string(),
2601 kind: SymbolKind::Function,
2602 start_line: 0,
2603 end_line: 1,
2604 exported: false,
2605 embed_text: "main".to_string(),
2606 snippet: "fn main() {}".to_string(),
2607 },
2608 vector: vec![1.0; DEFAULT_DIMENSION],
2609 });
2610 index
2611 .file_mtimes
2612 .insert(target.clone(), SystemTime::UNIX_EPOCH);
2613 index.file_sizes.insert(target.clone(), 0);
2614
2615 index.invalidate_file(&target);
2616
2617 assert!(index.entries.is_empty());
2618 assert!(!index.file_mtimes.contains_key(&target));
2619 assert!(!index.file_sizes.contains_key(&target));
2620 }
2621
2622 #[test]
2623 fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2624 let temp = tempfile::tempdir().unwrap();
2625 let project_root = temp.path();
2626 let file = project_root.join("src/lib.rs");
2627 fs::create_dir_all(file.parent().unwrap()).unwrap();
2628 write_rust_file(&file, "kept_symbol");
2629
2630 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2631 let original_entry_count = index.entries.len();
2632 let original_mtime = *index.file_mtimes.get(&file).unwrap();
2633 let original_size = *index.file_sizes.get(&file).unwrap();
2634
2635 let stale_mtime = SystemTime::UNIX_EPOCH;
2636 set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2637 fs::remove_file(&file).unwrap();
2638
2639 let mut embed = test_vector_for_texts;
2640 let mut progress = |_done: usize, _total: usize| {};
2641 let summary = index
2642 .refresh_stale_files(
2643 project_root,
2644 std::slice::from_ref(&file),
2645 &mut embed,
2646 8,
2647 &mut progress,
2648 )
2649 .unwrap();
2650
2651 assert_eq!(summary.changed, 0);
2652 assert_eq!(summary.added, 0);
2653 assert_eq!(summary.deleted, 0);
2654 assert_eq!(index.entries.len(), original_entry_count);
2655 assert!(index
2656 .entries
2657 .iter()
2658 .any(|entry| entry.chunk.name == "kept_symbol"));
2659 assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2660 assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2661 assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2662 }
2663
2664 #[test]
2665 fn refresh_never_indexed_file_error_does_not_record_mtime() {
2666 let temp = tempfile::tempdir().unwrap();
2667 let project_root = temp.path();
2668 let missing = project_root.join("src/missing.rs");
2669 fs::create_dir_all(missing.parent().unwrap()).unwrap();
2670
2671 let mut index = SemanticIndex::new();
2672 let mut embed = test_vector_for_texts;
2673 let mut progress = |_done: usize, _total: usize| {};
2674 let summary = index
2675 .refresh_stale_files(
2676 project_root,
2677 std::slice::from_ref(&missing),
2678 &mut embed,
2679 8,
2680 &mut progress,
2681 )
2682 .unwrap();
2683
2684 assert_eq!(summary.added, 0);
2685 assert_eq!(summary.changed, 0);
2686 assert_eq!(summary.deleted, 0);
2687 assert!(!index.file_mtimes.contains_key(&missing));
2688 assert!(!index.file_sizes.contains_key(&missing));
2689 assert!(index.entries.is_empty());
2690 }
2691
2692 #[test]
2693 fn refresh_reports_added_for_new_files() {
2694 let temp = tempfile::tempdir().unwrap();
2695 let project_root = temp.path();
2696 let existing = project_root.join("src/lib.rs");
2697 let added = project_root.join("src/new.rs");
2698 fs::create_dir_all(existing.parent().unwrap()).unwrap();
2699 write_rust_file(&existing, "existing_symbol");
2700 write_rust_file(&added, "added_symbol");
2701
2702 let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2703 let mut embed = test_vector_for_texts;
2704 let mut progress = |_done: usize, _total: usize| {};
2705 let summary = index
2706 .refresh_stale_files(
2707 project_root,
2708 &[existing.clone(), added.clone()],
2709 &mut embed,
2710 8,
2711 &mut progress,
2712 )
2713 .unwrap();
2714
2715 assert_eq!(summary.added, 1);
2716 assert_eq!(summary.changed, 0);
2717 assert_eq!(summary.deleted, 0);
2718 assert_eq!(summary.total_processed, 2);
2719 assert!(index.file_mtimes.contains_key(&added));
2720 assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2721 }
2722
2723 #[test]
2724 fn refresh_reports_deleted_for_removed_files() {
2725 let temp = tempfile::tempdir().unwrap();
2726 let project_root = temp.path();
2727 let deleted = project_root.join("src/deleted.rs");
2728 fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2729 write_rust_file(&deleted, "deleted_symbol");
2730
2731 let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2732 fs::remove_file(&deleted).unwrap();
2733
2734 let mut embed = test_vector_for_texts;
2735 let mut progress = |_done: usize, _total: usize| {};
2736 let summary = index
2737 .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2738 .unwrap();
2739
2740 assert_eq!(summary.deleted, 1);
2741 assert_eq!(summary.changed, 0);
2742 assert_eq!(summary.added, 0);
2743 assert_eq!(summary.total_processed, 1);
2744 assert!(!index.file_mtimes.contains_key(&deleted));
2745 assert!(index.entries.is_empty());
2746 }
2747
2748 #[test]
2749 fn refresh_reports_changed_for_modified_files() {
2750 let temp = tempfile::tempdir().unwrap();
2751 let project_root = temp.path();
2752 let file = project_root.join("src/lib.rs");
2753 fs::create_dir_all(file.parent().unwrap()).unwrap();
2754 write_rust_file(&file, "old_symbol");
2755
2756 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2757 set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2758 write_rust_file(&file, "new_symbol");
2759
2760 let mut embed = test_vector_for_texts;
2761 let mut progress = |_done: usize, _total: usize| {};
2762 let summary = index
2763 .refresh_stale_files(
2764 project_root,
2765 std::slice::from_ref(&file),
2766 &mut embed,
2767 8,
2768 &mut progress,
2769 )
2770 .unwrap();
2771
2772 assert_eq!(summary.changed, 1);
2773 assert_eq!(summary.added, 0);
2774 assert_eq!(summary.deleted, 0);
2775 assert_eq!(summary.total_processed, 1);
2776 assert!(index
2777 .entries
2778 .iter()
2779 .any(|entry| entry.chunk.name == "new_symbol"));
2780 assert!(!index
2781 .entries
2782 .iter()
2783 .any(|entry| entry.chunk.name == "old_symbol"));
2784 }
2785
2786 #[test]
2787 fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
2788 let temp = tempfile::tempdir().unwrap();
2789 let project_root = temp.path();
2790 let file = project_root.join("src/lib.rs");
2791 fs::create_dir_all(file.parent().unwrap()).unwrap();
2792 write_rust_file(&file, "clean_symbol");
2793
2794 let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2795 let original_entries = index.entries.len();
2796 let mut embed_called = false;
2797 let mut embed = |texts: Vec<String>| {
2798 embed_called = true;
2799 test_vector_for_texts(texts)
2800 };
2801 let mut progress = |_done: usize, _total: usize| {};
2802 let summary = index
2803 .refresh_stale_files(
2804 project_root,
2805 std::slice::from_ref(&file),
2806 &mut embed,
2807 8,
2808 &mut progress,
2809 )
2810 .unwrap();
2811
2812 assert!(summary.is_noop());
2813 assert_eq!(summary.total_processed, 1);
2814 assert!(!embed_called);
2815 assert_eq!(index.entries.len(), original_entries);
2816 }
2817
2818 #[test]
2819 fn detects_missing_onnx_runtime_from_dynamic_load_error() {
2820 let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
2821
2822 assert!(is_onnx_runtime_unavailable(message));
2823 }
2824
2825 #[test]
2826 fn formats_missing_onnx_runtime_with_install_hint() {
2827 let message = format_embedding_init_error(
2828 "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
2829 );
2830
2831 assert!(message.starts_with("ONNX Runtime not found. Install via:"));
2832 assert!(message.contains("Original error:"));
2833 }
2834
2835 #[test]
2836 fn openai_compatible_backend_embeds_with_mock_server() {
2837 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
2838 assert!(request_line.starts_with("POST "));
2839 assert_eq!(path, "/v1/embeddings");
2840 "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
2841 });
2842
2843 let config = SemanticBackendConfig {
2844 backend: SemanticBackend::OpenAiCompatible,
2845 model: "test-embedding".to_string(),
2846 base_url: Some(base_url),
2847 api_key_env: None,
2848 timeout_ms: 5_000,
2849 max_batch_size: 64,
2850 };
2851
2852 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
2853 let vectors = model
2854 .embed(vec!["hello".to_string(), "world".to_string()])
2855 .unwrap();
2856
2857 assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
2858 handle.join().unwrap();
2859 }
2860
2861 #[test]
2871 fn openai_compatible_request_has_single_content_type_header() {
2872 use std::sync::{Arc, Mutex};
2873 let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
2874 let captured_for_thread = Arc::clone(&captured);
2875
2876 let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2877 let addr = listener.local_addr().expect("local addr");
2878 let handle = thread::spawn(move || {
2879 let (mut stream, _) = listener.accept().expect("accept");
2880 let mut buf = Vec::new();
2881 let mut chunk = [0u8; 4096];
2882 let mut header_end = None;
2883 let mut content_length = 0usize;
2884 loop {
2885 let n = stream.read(&mut chunk).expect("read");
2886 if n == 0 {
2887 break;
2888 }
2889 buf.extend_from_slice(&chunk[..n]);
2890 if header_end.is_none() {
2891 if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2892 header_end = Some(pos + 4);
2893 for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
2894 if let Some(value) = line.strip_prefix("Content-Length:") {
2895 content_length = value.trim().parse::<usize>().unwrap_or(0);
2896 }
2897 }
2898 }
2899 }
2900 if let Some(end) = header_end {
2901 if buf.len() >= end + content_length {
2902 break;
2903 }
2904 }
2905 }
2906 *captured_for_thread.lock().unwrap() = buf;
2907 let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
2908 let response = format!(
2909 "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2910 body.len(),
2911 body
2912 );
2913 let _ = stream.write_all(response.as_bytes());
2914 });
2915
2916 let config = SemanticBackendConfig {
2917 backend: SemanticBackend::OpenAiCompatible,
2918 model: "text-embedding-3-small".to_string(),
2919 base_url: Some(format!("http://{}", addr)),
2920 api_key_env: None,
2921 timeout_ms: 5_000,
2922 max_batch_size: 64,
2923 };
2924 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
2925 let _ = model.embed(vec!["probe".to_string()]).unwrap();
2926 handle.join().unwrap();
2927
2928 let bytes = captured.lock().unwrap().clone();
2929 let request = String::from_utf8_lossy(&bytes);
2930
2931 let content_type_lines = request
2934 .lines()
2935 .filter(|line| {
2936 let lower = line.to_ascii_lowercase();
2937 lower.starts_with("content-type:")
2938 })
2939 .count();
2940 assert_eq!(
2941 content_type_lines, 1,
2942 "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
2943 );
2944
2945 assert!(
2948 request.contains(r#""model":"text-embedding-3-small""#),
2949 "request body should contain model field; full request:\n{request}",
2950 );
2951 }
2952
2953 #[test]
2954 fn ollama_backend_embeds_with_mock_server() {
2955 let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
2956 assert!(request_line.starts_with("POST "));
2957 assert_eq!(path, "/api/embed");
2958 "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
2959 });
2960
2961 let config = SemanticBackendConfig {
2962 backend: SemanticBackend::Ollama,
2963 model: "embeddinggemma".to_string(),
2964 base_url: Some(base_url),
2965 api_key_env: None,
2966 timeout_ms: 5_000,
2967 max_batch_size: 64,
2968 };
2969
2970 let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
2971 let vectors = model
2972 .embed(vec!["hello".to_string(), "world".to_string()])
2973 .unwrap();
2974
2975 assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
2976 handle.join().unwrap();
2977 }
2978
2979 #[test]
2980 fn read_from_disk_rejects_fingerprint_mismatch() {
2981 let storage = tempfile::tempdir().unwrap();
2982 let project_key = "proj";
2983
2984 let mut index = SemanticIndex::new();
2985 index.entries.push(EmbeddingEntry {
2986 chunk: SemanticChunk {
2987 file: PathBuf::from("/src/main.rs"),
2988 name: "handle_request".to_string(),
2989 kind: SymbolKind::Function,
2990 start_line: 10,
2991 end_line: 25,
2992 exported: true,
2993 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2994 snippet: "fn handle_request() {}".to_string(),
2995 },
2996 vector: vec![0.1, 0.2, 0.3],
2997 });
2998 index.dimension = 3;
2999 index
3000 .file_mtimes
3001 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3002 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3003 index.set_fingerprint(SemanticIndexFingerprint {
3004 backend: "openai_compatible".to_string(),
3005 model: "test-embedding".to_string(),
3006 base_url: "http://127.0.0.1:1234/v1".to_string(),
3007 dimension: 3,
3008 chunking_version: default_chunking_version(),
3009 });
3010 index.write_to_disk(storage.path(), project_key);
3011
3012 let matching = index.fingerprint().unwrap().as_string();
3013 assert!(
3014 SemanticIndex::read_from_disk(storage.path(), project_key, Some(&matching)).is_some()
3015 );
3016
3017 let mismatched = SemanticIndexFingerprint {
3018 backend: "ollama".to_string(),
3019 model: "embeddinggemma".to_string(),
3020 base_url: "http://127.0.0.1:11434".to_string(),
3021 dimension: 3,
3022 chunking_version: default_chunking_version(),
3023 }
3024 .as_string();
3025 assert!(
3026 SemanticIndex::read_from_disk(storage.path(), project_key, Some(&mismatched)).is_none()
3027 );
3028 }
3029
3030 #[test]
3031 fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3032 let storage = tempfile::tempdir().unwrap();
3033 let project_key = "proj-v3";
3034 let dir = storage.path().join("semantic").join(project_key);
3035 fs::create_dir_all(&dir).unwrap();
3036
3037 let mut index = SemanticIndex::new();
3038 index.entries.push(EmbeddingEntry {
3039 chunk: SemanticChunk {
3040 file: PathBuf::from("/src/main.rs"),
3041 name: "handle_request".to_string(),
3042 kind: SymbolKind::Function,
3043 start_line: 0,
3044 end_line: 0,
3045 exported: true,
3046 embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3047 snippet: "fn handle_request() {}".to_string(),
3048 },
3049 vector: vec![0.1, 0.2, 0.3],
3050 });
3051 index.dimension = 3;
3052 index
3053 .file_mtimes
3054 .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3055 index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3056 let fingerprint = SemanticIndexFingerprint {
3057 backend: "fastembed".to_string(),
3058 model: "test".to_string(),
3059 base_url: FALLBACK_BACKEND.to_string(),
3060 dimension: 3,
3061 chunking_version: default_chunking_version(),
3062 };
3063 index.set_fingerprint(fingerprint.clone());
3064
3065 let mut bytes = index.to_bytes();
3066 bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3067 fs::write(dir.join("semantic.bin"), bytes).unwrap();
3068
3069 assert!(SemanticIndex::read_from_disk(
3070 storage.path(),
3071 project_key,
3072 Some(&fingerprint.as_string())
3073 )
3074 .is_none());
3075 assert!(!dir.join("semantic.bin").exists());
3076 }
3077
3078 fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3079 crate::symbols::Symbol {
3080 name: name.to_string(),
3081 kind,
3082 range: crate::symbols::Range {
3083 start_line: start,
3084 start_col: 0,
3085 end_line: end,
3086 end_col: 0,
3087 },
3088 signature: None,
3089 scope_chain: Vec::new(),
3090 exported: false,
3091 parent: None,
3092 }
3093 }
3094
3095 #[test]
3100 fn symbols_to_chunks_skips_heading_symbols() {
3101 let project_root = PathBuf::from("/proj");
3102 let file = project_root.join("README.md");
3103 let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3104
3105 let symbols = vec![
3106 make_symbol(SymbolKind::Heading, "Title", 0, 2),
3107 make_symbol(SymbolKind::Heading, "Section", 4, 6),
3108 ];
3109
3110 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3111 assert!(
3112 chunks.is_empty(),
3113 "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3114 chunks.len()
3115 );
3116 }
3117
3118 #[test]
3122 fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3123 let project_root = PathBuf::from("/proj");
3124 let file = project_root.join("src/lib.rs");
3125 let source = "pub fn handle_request() -> bool {\n true\n}\n";
3126
3127 let symbols = vec![
3128 make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3130 make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3131 make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3132 ];
3133
3134 let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3135 assert_eq!(
3136 chunks.len(),
3137 3,
3138 "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3139 chunks.len()
3140 );
3141 let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3142 assert!(chunks
3143 .iter()
3144 .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3145 assert!(names.contains(&"handle_request"));
3146 assert!(names.contains(&"AuthService"));
3147 assert!(
3148 !names.contains(&"doc heading"),
3149 "Heading symbol leaked into chunks: {names:?}"
3150 );
3151 }
3152
3153 #[test]
3154 fn validate_ssrf_allows_loopback_hostnames() {
3155 for host in &[
3158 "http://localhost",
3159 "http://localhost:8080",
3160 "http://localhost:11434", "http://localhost.localdomain",
3162 "http://foo.localhost",
3163 ] {
3164 assert!(
3165 validate_base_url_no_ssrf(host).is_ok(),
3166 "Expected {host} to be allowed (loopback), got: {:?}",
3167 validate_base_url_no_ssrf(host)
3168 );
3169 }
3170 }
3171
3172 #[test]
3173 fn validate_ssrf_allows_loopback_ips() {
3174 for url in &[
3177 "http://127.0.0.1",
3178 "http://127.0.0.1:11434", "http://127.0.0.1:8080",
3180 "http://127.1.2.3",
3181 ] {
3182 let result = validate_base_url_no_ssrf(url);
3183 assert!(
3184 result.is_ok(),
3185 "Expected {url} to be allowed (loopback), got: {:?}",
3186 result
3187 );
3188 }
3189 }
3190
3191 #[test]
3192 fn validate_ssrf_rejects_private_non_loopback_ips() {
3193 for url in &[
3198 "http://192.168.1.1",
3199 "http://10.0.0.1",
3200 "http://172.16.0.1",
3201 "http://169.254.169.254",
3202 "http://100.64.0.1",
3203 ] {
3204 let result = validate_base_url_no_ssrf(url);
3205 assert!(
3206 result.is_err(),
3207 "Expected {url} to be rejected (non-loopback private), got: {:?}",
3208 result
3209 );
3210 }
3211 }
3212
3213 #[test]
3214 fn validate_ssrf_rejects_mdns_local_hostnames() {
3215 for host in &[
3218 "http://printer.local",
3219 "http://nas.local:8080",
3220 "http://homelab.local",
3221 ] {
3222 let result = validate_base_url_no_ssrf(host);
3223 assert!(
3224 result.is_err(),
3225 "Expected {host} to be rejected (mDNS), got: {:?}",
3226 result
3227 );
3228 }
3229 }
3230
3231 #[test]
3232 fn normalize_base_url_allows_localhost_for_tests() {
3233 assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3236 assert!(normalize_base_url("http://localhost:8080").is_ok());
3237 }
3238
3239 #[test]
3246 fn ort_mismatch_message_recommends_auto_fix_first() {
3247 let msg =
3248 format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3249
3250 assert!(
3252 msg.contains("v1.9.0"),
3253 "should report detected version: {msg}"
3254 );
3255 assert!(
3256 msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3257 "should report system path: {msg}"
3258 );
3259 assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3260
3261 let auto_fix_pos = msg
3263 .find("Auto-fix")
3264 .expect("Auto-fix solution missing — users won't discover --fix");
3265 let remove_pos = msg
3266 .find("Remove the old library")
3267 .expect("system-rm solution missing");
3268 assert!(
3269 auto_fix_pos < remove_pos,
3270 "Auto-fix must come before manual rm — see PR comment thread"
3271 );
3272
3273 assert!(
3275 msg.contains("npx @cortexkit/aft doctor --fix"),
3276 "auto-fix command must be present and copy-pasteable: {msg}"
3277 );
3278 }
3279
3280 #[test]
3284 fn ort_mismatch_message_handles_macos_dylib_path() {
3285 let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3286 assert!(msg.contains("v1.9.0"));
3287 assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3288 assert!(
3292 msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3293 "system path should be quoted in the auto-fix sentence: {msg}"
3294 );
3295 }
3296}