Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
24const DEFAULT_DIMENSION: usize = 384;
25const MAX_ENTRIES: usize = 1_000_000;
26// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
27// and common local models (4096) while keeping a bounded supported shape.
28const MAX_DIMENSION: usize = 4096;
29const F32_BYTES: usize = std::mem::size_of::<f32>();
30const HEADER_BYTES_V1: usize = 9;
31const HEADER_BYTES_V2: usize = 13;
32const ONNX_RUNTIME_INSTALL_HINT: &str =
33    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";
34
35const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
36const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
37/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
38/// restart round-trips on filesystems with subsecond mtime precision (APFS,
39/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
40/// caused every restart to flag ~99% of files as stale and re-embed them.
41const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
42/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
43/// fixing symbol ranges that were incorrectly treated as 1-based.
44const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
45/// V5 adds file sizes to the file metadata table so incremental staleness
46/// detection can catch content changes even when mtime precision misses them.
47const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
48/// V6 stores paths relative to project_root and adds content hashes.
49const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
50const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
51const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
52// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
53const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
54const DEFAULT_MAX_BATCH_SIZE: usize = 64;
55const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
56const FALLBACK_BACKEND: &str = "none";
57const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
58const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
59static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
60
61pub struct SemanticIndexLock {
62    _guard: fs_lock::LockGuard,
63}
64
65impl SemanticIndexLock {
66    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
67        let dir = storage_dir.join("semantic").join(project_key);
68        fs::create_dir_all(&dir)?;
69        let path = dir.join("cache.lock");
70        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
71            .lock()
72            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
73        fs_lock::try_acquire(&path, Duration::from_secs(2))
74            .map(|guard| Self { _guard: guard })
75            .map_err(|error| match error {
76                fs_lock::AcquireError::Timeout => {
77                    std::io::Error::other("timed out acquiring semantic cache lock")
78                }
79                fs_lock::AcquireError::Io(error) => error,
80            })
81    }
82}
83
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct SemanticIndexFingerprint {
86    pub backend: String,
87    pub model: String,
88    #[serde(default)]
89    pub base_url: String,
90    pub dimension: usize,
91    #[serde(default = "default_chunking_version")]
92    pub chunking_version: u32,
93}
94
/// Chunking version stamped into new fingerprints and assumed for serialized
/// fingerprints that omit the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
98
99impl SemanticIndexFingerprint {
100    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
101        // Use normalized URL for fingerprinting so cosmetic differences
102        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
103        let base_url = config
104            .base_url
105            .as_ref()
106            .and_then(|u| normalize_base_url(u).ok())
107            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
108        Self {
109            backend: config.backend.as_str().to_string(),
110            model: config.model.clone(),
111            base_url,
112            dimension,
113            chunking_version: default_chunking_version(),
114        }
115    }
116
117    pub fn as_string(&self) -> String {
118        serde_json::to_string(self).unwrap_or_else(|_| String::new())
119    }
120
121    fn matches_expected(&self, expected: &str) -> bool {
122        let encoded = self.as_string();
123        !encoded.is_empty() && encoded == expected
124    }
125}
126
127enum SemanticEmbeddingEngine {
128    Fastembed(TextEmbedding),
129    OpenAiCompatible {
130        client: Client,
131        model: String,
132        base_url: String,
133        api_key: Option<String>,
134    },
135    Ollama {
136        client: Client,
137        model: String,
138        base_url: String,
139    },
140}
141
142pub struct SemanticEmbeddingModel {
143    backend: SemanticBackend,
144    model: String,
145    base_url: Option<String>,
146    timeout_ms: u64,
147    max_batch_size: usize,
148    dimension: Option<usize>,
149    engine: SemanticEmbeddingEngine,
150    query_embedding_cache: HashMap<String, Vec<f32>>,
151    query_embedding_cache_order: VecDeque<String>,
152    query_embedding_cache_hits: u64,
153    query_embedding_cache_misses: u64,
154}
155
156pub type EmbeddingModel = SemanticEmbeddingModel;
157
158fn validate_embedding_batch(
159    vectors: &[Vec<f32>],
160    expected_count: usize,
161    context: &str,
162) -> Result<(), String> {
163    if expected_count > 0 && vectors.is_empty() {
164        return Err(format!(
165            "{context} returned no vectors for {expected_count} inputs"
166        ));
167    }
168
169    if vectors.len() != expected_count {
170        return Err(format!(
171            "{context} returned {} vectors for {} inputs",
172            vectors.len(),
173            expected_count
174        ));
175    }
176
177    let Some(first_vector) = vectors.first() else {
178        return Ok(());
179    };
180    let expected_dimension = first_vector.len();
181    validate_embedding_dimension(expected_dimension)
182        .map_err(|error| format!("{context} returned {error}"))?;
183    for (index, vector) in vectors.iter().enumerate() {
184        if vector.len() != expected_dimension {
185            return Err(format!(
186                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
187                vector.len()
188            ));
189        }
190    }
191
192    Ok(())
193}
194
195fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
196    if dimension == 0 || dimension > MAX_DIMENSION {
197        return Err(format!(
198            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
199        ));
200    }
201
202    Ok(())
203}
204
205/// Normalize a base URL: validate scheme and strip trailing slash.
206/// Does NOT perform SSRF/private-IP validation — call
207/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
208fn normalize_base_url(raw: &str) -> Result<String, String> {
209    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
210    let scheme = parsed.scheme();
211    if scheme != "http" && scheme != "https" {
212        return Err(format!(
213            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
214            scheme
215        ));
216    }
217    Ok(parsed.to_string().trim_end_matches('/').to_string())
218}
219
220/// Validate that a base URL does not point to a private/loopback address.
221/// Call this on user-supplied config (at configure time) to prevent SSRF.
222/// Not called for programmatically constructed configs (e.g. tests).
223///
224/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
225/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
226/// addresses by definition cannot be exploited as SSRF targets — they only
227/// reach services on the same machine. Allowing loopback unblocks Ollama at its
228/// default config without opening up SSRF to LAN/intranet services, which
229/// remain rejected.
230///
231/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
232/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
233/// the SSRF guard meaningful for non-loopback private networks.
234pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
235    use std::net::{IpAddr, ToSocketAddrs};
236
237    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
238
239    let host = parsed.host_str().unwrap_or("");
240
241    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
242    // `localhost` and `*.localhost` resolve to loopback;
243    // `localhost.localdomain` is a historical alias used on some Linux
244    // distros. Self-hosted backends like Ollama use these by default.
245    let is_loopback_host =
246        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
247    if is_loopback_host {
248        return Ok(());
249    }
250
251    // mDNS hostnames are typically LAN devices, not loopback. Reject before
252    // DNS lookup so users get a clear error rather than a private-IP error.
253    if host.ends_with(".local") {
254        return Err(format!(
255            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
256        ));
257    }
258
259    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
260    // loopback (which is by definition same-machine and not an SSRF target).
261    let port = parsed.port_or_known_default().unwrap_or(443);
262    let addr_str = format!("{host}:{port}");
263    let addrs: Vec<IpAddr> = addr_str
264        .to_socket_addrs()
265        .map(|iter| iter.map(|sa| sa.ip()).collect())
266        .unwrap_or_default();
267    for ip in &addrs {
268        if is_private_non_loopback_ip(ip) {
269            return Err(format!(
270                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
271            ));
272        }
273    }
274
275    Ok(())
276}
277
/// Returns true for addresses in private/link-local/CGNAT/wildcard ranges,
/// EXCLUDING loopback (`127.0.0.0/8` and `::1`), which is deliberately treated
/// as safe.
///
/// Rejected IPv4 ranges: `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`,
/// `169.254.0.0/16` (link-local), `100.64.0.0/10` (CGNAT) and `0.0.0.0/8`
/// (wildcard).
///
/// Rejected IPv6 ranges: `::` (unspecified, the IPv6 counterpart of the IPv4
/// wildcard), `fe80::/10` (link-local) and `fc00::/7` (unique-local).
/// IPv4-mapped addresses (`::ffff:a.b.c.d`) are judged by their embedded IPv4
/// address.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, _, _] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            first == 10
                || (first == 172 && (16..=31).contains(&second))
                || (first == 192 && second == 168)
                || (first == 169 && second == 254)
                || (first == 100 && (64..=127).contains(&second))
                || first == 0
        }
        IpAddr::V6(v6) => {
            // ::ffff:0:0/96 — classify by the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }

            // Note: ::1 (loopback) is intentionally NOT in this set.
            let leading = v6.segments()[0];
            v6.is_unspecified()
                // fe80::/10 link-local
                || (leading & 0xffc0) == 0xfe80
                // fc00::/7 unique-local
                || (leading & 0xfe00) == 0xfc00
        }
    }
}
319
320fn build_openai_embeddings_endpoint(base_url: &str) -> String {
321    if base_url.ends_with("/v1") {
322        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
323    } else {
324        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
325    }
326}
327
328fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
329    if base_url.ends_with("/api") {
330        format!("{base_url}/embed")
331    } else {
332        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
333    }
334}
335
/// Trim surrounding whitespace from an optional token and treat a blank
/// result as absent.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value
        .as_deref()
        .map(str::trim)
        .filter(|token| !token.is_empty())?;
    Some(trimmed.to_owned())
}
346
347fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
348    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
349}
350
351fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
352    error.is_connect()
353}
354
355fn sleep_before_embedding_retry(attempt_index: usize) {
356    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
357        std::thread::sleep(Duration::from_millis(*delay_ms));
358    }
359}
360
361fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
362where
363    F: FnMut() -> reqwest::blocking::RequestBuilder,
364{
365    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
366        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
367
368        let response = match make_request().send() {
369            Ok(response) => response,
370            Err(error) => {
371                if !last_attempt && is_retryable_embedding_error(&error) {
372                    sleep_before_embedding_retry(attempt_index);
373                    continue;
374                }
375                return Err(format!("{backend_label} request failed: {error}"));
376            }
377        };
378
379        let status = response.status();
380        let raw = match response.text() {
381            Ok(raw) => raw,
382            Err(error) => {
383                if !last_attempt && is_retryable_embedding_error(&error) {
384                    sleep_before_embedding_retry(attempt_index);
385                    continue;
386                }
387                return Err(format!("{backend_label} response read failed: {error}"));
388            }
389        };
390
391        if status.is_success() {
392            return Ok(raw);
393        }
394
395        if !last_attempt && is_retryable_embedding_status(status) {
396            sleep_before_embedding_retry(attempt_index);
397            continue;
398        }
399
400        return Err(format!(
401            "{backend_label} request failed (HTTP {}): {}",
402            status, raw
403        ));
404    }
405
406    unreachable!("embedding request retries exhausted without returning")
407}
408
409impl SemanticEmbeddingModel {
410    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
411        let timeout_ms = if config.timeout_ms == 0 {
412            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
413        } else {
414            config.timeout_ms
415        };
416
417        let max_batch_size = if config.max_batch_size == 0 {
418            DEFAULT_MAX_BATCH_SIZE
419        } else {
420            config.max_batch_size
421        };
422
423        let api_key_env = normalize_api_key(config.api_key_env.clone());
424        let model = config.model.clone();
425
426        let client = Client::builder()
427            .timeout(Duration::from_millis(timeout_ms))
428            .redirect(reqwest::redirect::Policy::none())
429            .build()
430            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
431
432        let engine = match config.backend {
433            SemanticBackend::Fastembed => {
434                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
435            }
436            SemanticBackend::OpenAiCompatible => {
437                let raw = config.base_url.as_ref().ok_or_else(|| {
438                    "base_url is required for openai_compatible backend".to_string()
439                })?;
440                let base_url = normalize_base_url(raw)?;
441
442                let api_key = match api_key_env {
443                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
444                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
445                    })?),
446                    None => None,
447                };
448
449                SemanticEmbeddingEngine::OpenAiCompatible {
450                    client,
451                    model,
452                    base_url,
453                    api_key,
454                }
455            }
456            SemanticBackend::Ollama => {
457                let raw = config
458                    .base_url
459                    .as_ref()
460                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
461                let base_url = normalize_base_url(raw)?;
462
463                SemanticEmbeddingEngine::Ollama {
464                    client,
465                    model,
466                    base_url,
467                }
468            }
469        };
470
471        Ok(Self {
472            backend: config.backend,
473            model: config.model.clone(),
474            base_url: config.base_url.clone(),
475            timeout_ms,
476            max_batch_size,
477            dimension: None,
478            engine,
479            query_embedding_cache: HashMap::new(),
480            query_embedding_cache_order: VecDeque::new(),
481            query_embedding_cache_hits: 0,
482            query_embedding_cache_misses: 0,
483        })
484    }
485
486    pub fn backend(&self) -> SemanticBackend {
487        self.backend
488    }
489
490    pub fn model(&self) -> &str {
491        &self.model
492    }
493
494    pub fn base_url(&self) -> Option<&str> {
495        self.base_url.as_deref()
496    }
497
498    pub fn max_batch_size(&self) -> usize {
499        self.max_batch_size
500    }
501
502    pub fn timeout_ms(&self) -> u64 {
503        self.timeout_ms
504    }
505
506    pub fn fingerprint(
507        &mut self,
508        config: &SemanticBackendConfig,
509    ) -> Result<SemanticIndexFingerprint, String> {
510        let dimension = self.dimension()?;
511        Ok(SemanticIndexFingerprint::from_config(config, dimension))
512    }
513
514    pub fn dimension(&mut self) -> Result<usize, String> {
515        if let Some(dimension) = self.dimension {
516            return Ok(dimension);
517        }
518
519        let dimension = match &mut self.engine {
520            SemanticEmbeddingEngine::Fastembed(model) => {
521                let vectors = model
522                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
523                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
524                vectors
525                    .first()
526                    .map(|v| v.len())
527                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
528            }
529            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
530                let vectors =
531                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
532                vectors
533                    .first()
534                    .map(|v| v.len())
535                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
536            }
537            SemanticEmbeddingEngine::Ollama { .. } => {
538                let vectors =
539                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
540                vectors
541                    .first()
542                    .map(|v| v.len())
543                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
544            }
545        };
546
547        self.dimension = Some(dimension);
548        Ok(dimension)
549    }
550
551    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
552        self.embed_texts(texts)
553    }
554
555    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
556        if let Some(vector) = self.query_embedding_cache.get(query) {
557            self.query_embedding_cache_hits += 1;
558            return Ok(vector.clone());
559        }
560
561        self.query_embedding_cache_misses += 1;
562        let embeddings = self.embed_texts(vec![query.to_string()])?;
563        let vector = embeddings
564            .first()
565            .cloned()
566            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
567
568        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
569            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
570                self.query_embedding_cache.remove(&oldest);
571            }
572        }
573        self.query_embedding_cache
574            .insert(query.to_string(), vector.clone());
575        self.query_embedding_cache_order
576            .push_back(query.to_string());
577
578        Ok(vector)
579    }
580
581    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
582        (
583            self.query_embedding_cache_hits,
584            self.query_embedding_cache_misses,
585            self.query_embedding_cache.len(),
586        )
587    }
588
589    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
590        match &mut self.engine {
591            SemanticEmbeddingEngine::Fastembed(model) => model
592                .embed(texts, None::<usize>)
593                .map_err(|error| format_embedding_init_error(error.to_string()))
594                .map_err(|error| format!("failed to embed batch: {error}")),
595            SemanticEmbeddingEngine::OpenAiCompatible {
596                client,
597                model,
598                base_url,
599                api_key,
600            } => {
601                let expected_text_count = texts.len();
602                let endpoint = build_openai_embeddings_endpoint(base_url);
603                let body = serde_json::json!({
604                    "input": texts,
605                    "model": model,
606                });
607
608                let raw = send_embedding_request(
609                    || {
610                        // `.json(&body)` sets Content-Type: application/json
611                        // automatically. Do NOT add `.header("Content-Type",
612                        // "application/json")` afterwards — RequestBuilder::header()
613                        // calls HeaderMap::append, which produces TWO Content-Type
614                        // headers on the wire. OpenAI's /v1/embeddings endpoint
615                        // treats duplicate Content-Type as malformed and rejects
616                        // the body with 400 "you must provide a model parameter"
617                        // even when `model` is set. Verified end-to-end against
618                        // api.openai.com. See issue #36.
619                        let mut request = client.post(&endpoint).json(&body);
620
621                        if let Some(api_key) = api_key {
622                            request = request.header("Authorization", format!("Bearer {api_key}"));
623                        }
624
625                        request
626                    },
627                    "openai compatible",
628                )?;
629
630                #[derive(Deserialize)]
631                struct OpenAiResponse {
632                    data: Vec<OpenAiEmbeddingResult>,
633                }
634
635                #[derive(Deserialize)]
636                struct OpenAiEmbeddingResult {
637                    embedding: Vec<f32>,
638                    index: Option<u32>,
639                }
640
641                let parsed: OpenAiResponse = serde_json::from_str(&raw)
642                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
643                if parsed.data.len() != expected_text_count {
644                    return Err(format!(
645                        "openai compatible response returned {} embeddings for {} inputs",
646                        parsed.data.len(),
647                        expected_text_count
648                    ));
649                }
650
651                let mut vectors = vec![Vec::new(); parsed.data.len()];
652                for (i, item) in parsed.data.into_iter().enumerate() {
653                    let index = item.index.unwrap_or(i as u32) as usize;
654                    if index >= vectors.len() {
655                        return Err(
656                            "openai compatible response contains invalid vector index".to_string()
657                        );
658                    }
659                    vectors[index] = item.embedding;
660                }
661
662                for vector in &vectors {
663                    if vector.is_empty() {
664                        return Err(
665                            "openai compatible response contained missing vectors".to_string()
666                        );
667                    }
668                }
669
670                self.dimension = vectors.first().map(Vec::len);
671                Ok(vectors)
672            }
673            SemanticEmbeddingEngine::Ollama {
674                client,
675                model,
676                base_url,
677            } => {
678                let expected_text_count = texts.len();
679                let endpoint = build_ollama_embeddings_endpoint(base_url);
680
681                #[derive(Serialize)]
682                struct OllamaPayload<'a> {
683                    model: &'a str,
684                    input: Vec<String>,
685                }
686
687                let payload = OllamaPayload {
688                    model,
689                    input: texts,
690                };
691
692                let raw = send_embedding_request(
693                    || {
694                        // `.json(&payload)` sets Content-Type automatically.
695                        // Same duplicate-header trap as the OpenAI branch above
696                        // — most Ollama servers tolerate it, but the
697                        // single-Content-Type form is the correct one.
698                        client.post(&endpoint).json(&payload)
699                    },
700                    "ollama",
701                )?;
702
703                #[derive(Deserialize)]
704                struct OllamaResponse {
705                    embeddings: Vec<Vec<f32>>,
706                }
707
708                let parsed: OllamaResponse = serde_json::from_str(&raw)
709                    .map_err(|error| format!("invalid ollama response: {error}"))?;
710                if parsed.embeddings.is_empty() {
711                    return Err("ollama response returned no embeddings".to_string());
712                }
713                if parsed.embeddings.len() != expected_text_count {
714                    return Err(format!(
715                        "ollama response returned {} embeddings for {} inputs",
716                        parsed.embeddings.len(),
717                        expected_text_count
718                    ));
719                }
720
721                let vectors = parsed.embeddings;
722                for vector in &vectors {
723                    if vector.is_empty() {
724                        return Err("ollama response contained empty embeddings".to_string());
725                    }
726                }
727
728                self.dimension = vectors.first().map(Vec::len);
729                Ok(vectors)
730            }
731        }
732    }
733}
734
735/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
736/// This catches broken/incompatible .so files without risking a panic in the ort crate.
737/// Also checks the runtime version via OrtGetApiBase if available.
738pub fn pre_validate_onnx_runtime() -> Result<(), String> {
739    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
740
741    #[cfg(any(target_os = "linux", target_os = "macos"))]
742    {
743        #[cfg(target_os = "linux")]
744        let default_name = "libonnxruntime.so";
745        #[cfg(target_os = "macos")]
746        let default_name = "libonnxruntime.dylib";
747
748        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
749
750        unsafe {
751            let c_name = std::ffi::CString::new(lib_name)
752                .map_err(|e| format!("invalid library path: {}", e))?;
753            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
754            if handle.is_null() {
755                let err = libc::dlerror();
756                let msg = if err.is_null() {
757                    "unknown dlopen error".to_string()
758                } else {
759                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
760                };
761                return Err(format!(
762                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
763                     Run `npx @cortexkit/aft doctor` to diagnose.",
764                    lib_name, msg
765                ));
766            }
767
768            // Try to detect the runtime version from the file path or soname.
769            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
770            let detected_version = detect_ort_version_from_path(lib_name);
771
772            libc::dlclose(handle);
773
774            // Check version compatibility — we need 1.24.x
775            if let Some(ref version) = detected_version {
776                let parts: Vec<&str> = version.split('.').collect();
777                if let (Some(major), Some(minor)) = (
778                    parts.first().and_then(|s| s.parse::<u32>().ok()),
779                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
780                ) {
781                    if major != 1 || minor < 20 {
782                        return Err(format_ort_version_mismatch(version, lib_name));
783                    }
784                }
785            }
786        }
787    }
788
789    #[cfg(target_os = "windows")]
790    {
791        // On Windows, skip pre-validation — let ort handle LoadLibrary
792        let _ = dylib_path;
793    }
794
795    Ok(())
796}
797
798/// Try to extract the ORT version from the library filename or resolved symlink.
799/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
800#[cfg(any(test, target_os = "linux", target_os = "macos"))]
801fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
802    let path = std::path::Path::new(lib_path);
803
804    // Try the path as given, then follow symlinks
805    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
806        .into_iter()
807        .flatten()
808    {
809        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
810            if let Some(version) = extract_version_from_filename(name) {
811                return Some(version);
812            }
813        }
814    }
815
816    // Also check for versioned siblings in the same directory
817    if let Some(parent) = path.parent() {
818        if let Ok(entries) = std::fs::read_dir(parent) {
819            for entry in entries.flatten() {
820                if let Some(name) = entry.file_name().to_str() {
821                    if name.starts_with("libonnxruntime") {
822                        if let Some(version) = extract_version_from_filename(name) {
823                            return Some(version);
824                        }
825                    }
826                }
827            }
828        }
829    }
830
831    None
832}
833
834/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
835#[cfg(any(test, target_os = "linux", target_os = "macos"))]
836fn extract_version_from_filename(name: &str) -> Option<String> {
837    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
838    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
839    re.find(name).map(|m| m.as_str().to_string())
840}
841
/// Produce the (indented) shell hint shown to users for removing an
/// incompatible ONNX Runtime library.
///
/// Bare default library names and anything under `/usr/local/lib` are treated
/// as a system-wide install and get a platform-specific command; every other
/// path gets a plain `rm` of that exact path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            return String::from("   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        }
        if cfg!(target_os = "macos") {
            return String::from("   sudo rm /usr/local/lib/libonnxruntime*");
        }
        if cfg!(target_os = "windows") {
            return String::from("   Delete the ONNX Runtime DLL from your PATH");
        }
        // Any other target falls through to the generic hint below.
    }

    format!("   rm '{}'", lib_path)
}
857
858/// Build the user-facing error message for an incompatible ONNX Runtime
859/// install. Extracted as a pure helper so we can unit-test the wording
860/// stability — the auto-fix recommendation must always come first because
861/// it's the only safe option, and the system-rm step must remain present
862/// because some users prefer the system-wide cleanup path.
863#[cfg(any(test, target_os = "linux", target_os = "macos"))]
864pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
865    format!(
866        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
867         Solutions:\n\
868         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
869         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
870         configures the bridge to load it instead of the system library — no \
871         changes to '{}'.\n\
872         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
873         {}\n\
874         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
875         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
876        version,
877        lib_name,
878        lib_name,
879        suggest_removal_command(lib_name),
880    )
881}
882
883pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
884    // Pre-validate before ort can panic on a bad library
885    pre_validate_onnx_runtime()?;
886
887    let selected_model = match model {
888        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
889        _ => {
890            return Err(format!(
891                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
892                model
893            ))
894        }
895    };
896
897    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
898}
899
/// Heuristically decide whether `message` reports a missing or unloadable
/// ONNX Runtime shared library.
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// the canonical "ONNX Runtime not found." prefix, or when — compared
/// ASCII-case-insensitively — it mentions the runtime by name *and* contains
/// a dynamic-loading failure phrase.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
925
926fn format_embedding_init_error(error: impl Display) -> String {
927    let message = error.to_string();
928
929    if is_onnx_runtime_unavailable(&message) {
930        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
931    }
932
933    format!("failed to initialize semantic embedding model: {message}")
934}
935
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// `embed_text` is what gets sent to the embedding backend; the remaining
/// fields are metadata copied into search results.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the symbol's line range (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the symbol's line range (0-based internally, inclusive)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
955
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug)]
struct EmbeddingEntry {
    /// The chunk this embedding was computed from.
    chunk: SemanticChunk,
    /// Embedding returned by the backend for `chunk.embed_text`.
    vector: Vec<f32>,
}
962
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug)]
pub struct SemanticIndex {
    /// One entry per embedded chunk; scanned linearly by `search`.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded per indexed file; used together with mtime and
    /// size when checking freshness.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// Backend/model identity of the stored embeddings, if one has been set
    /// via `set_fingerprint`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root this index belongs to (asserted absolute in debug builds
    /// by the constructors).
    project_root: PathBuf,
}
977
/// Freshness metadata captured for a file while its chunks are collected.
/// The three fields are later split into the index's per-file mtime, size,
/// and hash maps.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// File size in bytes.
    size: u64,
    /// blake3 hash of the file content.
    content_hash: blake3::Hash,
}
984
/// Outcome of an incremental semantic-index refresh. All counters are file
/// counts; `total_processed` is the number of current/deleted files considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-embedded.
    pub changed: usize,
    /// Files indexed for the first time.
    pub added: usize,
    /// Files dropped from the index.
    pub deleted: usize,
    /// Number of current/deleted files considered by the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Returns `true` when the refresh changed, added, and deleted nothing.
    /// `total_processed` does not influence the result.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1001
/// Search result from a semantic query.
///
/// All fields except `score` and `source` are copied from the matching
/// `SemanticChunk`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the symbol, as stored on the chunk.
    pub start_line: u32,
    /// Last line of the symbol, as stored on the chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Relevance score; `SemanticIndex::search` uses cosine similarity,
    /// multiplied by 1.1 for exported symbols.
    pub score: f32,
    /// Label of the producer of this result; `SemanticIndex::search` sets
    /// it to "semantic".
    pub source: &'static str,
}
1015
1016impl SemanticIndex {
1017    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1018        debug_assert!(project_root.is_absolute());
1019        Self {
1020            entries: Vec::new(),
1021            file_mtimes: HashMap::new(),
1022            file_sizes: HashMap::new(),
1023            file_hashes: HashMap::new(),
1024            dimension,
1025            fingerprint: None,
1026            project_root,
1027        }
1028    }
1029
1030    /// Number of embedded symbol entries.
1031    pub fn entry_count(&self) -> usize {
1032        self.entries.len()
1033    }
1034
1035    /// Human-readable status label for the index.
1036    pub fn status_label(&self) -> &'static str {
1037        if self.entries.is_empty() {
1038            "empty"
1039        } else {
1040            "ready"
1041        }
1042    }
1043
1044    fn collect_chunks(
1045        project_root: &Path,
1046        files: &[PathBuf],
1047    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1048        let per_file: Vec<(
1049            PathBuf,
1050            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1051        )> = files
1052            .par_iter()
1053            .map_init(HashMap::new, |parsers, file| {
1054                let result = collect_file_metadata(file).and_then(|metadata| {
1055                    collect_file_chunks(project_root, file, parsers)
1056                        .map(|chunks| (metadata, chunks))
1057                });
1058                (file.clone(), result)
1059            })
1060            .collect();
1061
1062        let mut chunks: Vec<SemanticChunk> = Vec::new();
1063        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1064
1065        for (file, result) in per_file {
1066            match result {
1067                Ok((metadata, file_chunks)) => {
1068                    file_metadata.insert(file, metadata);
1069                    chunks.extend(file_chunks);
1070                }
1071                Err(error) => {
1072                    // "unsupported file extension" is expected for non-code files
1073                    // (json, xml, .gitignore, etc.) that get included in the
1074                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1075                    // we now skip silently to keep the log clean. Only real read/parse
1076                    // errors are worth surfacing.
1077                    if error == "unsupported file extension" {
1078                        continue;
1079                    }
1080                    slog_warn!(
1081                        "failed to collect semantic chunks for {}: {}",
1082                        file.display(),
1083                        error
1084                    );
1085                }
1086            }
1087        }
1088
1089        (chunks, file_metadata)
1090    }
1091
1092    fn build_from_chunks<F, P>(
1093        project_root: &Path,
1094        chunks: Vec<SemanticChunk>,
1095        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1096        embed_fn: &mut F,
1097        max_batch_size: usize,
1098        mut progress: Option<&mut P>,
1099    ) -> Result<Self, String>
1100    where
1101        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1102        P: FnMut(usize, usize),
1103    {
1104        debug_assert!(project_root.is_absolute());
1105        let total_chunks = chunks.len();
1106
1107        if chunks.is_empty() {
1108            return Ok(Self {
1109                entries: Vec::new(),
1110                file_mtimes: file_metadata
1111                    .iter()
1112                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1113                    .collect(),
1114                file_sizes: file_metadata
1115                    .iter()
1116                    .map(|(path, metadata)| (path.clone(), metadata.size))
1117                    .collect(),
1118                file_hashes: file_metadata
1119                    .into_iter()
1120                    .map(|(path, metadata)| (path, metadata.content_hash))
1121                    .collect(),
1122                dimension: DEFAULT_DIMENSION,
1123                fingerprint: None,
1124                project_root: project_root.to_path_buf(),
1125            });
1126        }
1127
1128        // Embed in batches
1129        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1130        let mut expected_dimension: Option<usize> = None;
1131        let batch_size = max_batch_size.max(1);
1132        for batch_start in (0..chunks.len()).step_by(batch_size) {
1133            let batch_end = (batch_start + batch_size).min(chunks.len());
1134            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1135                .iter()
1136                .map(|c| c.embed_text.clone())
1137                .collect();
1138
1139            let vectors = embed_fn(batch_texts)?;
1140            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1141
1142            // Track consistent dimension across all batches
1143            if let Some(dim) = vectors.first().map(|v| v.len()) {
1144                match expected_dimension {
1145                    None => expected_dimension = Some(dim),
1146                    Some(expected) if dim != expected => {
1147                        return Err(format!(
1148                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1149                        ));
1150                    }
1151                    _ => {}
1152                }
1153            }
1154
1155            for (i, vector) in vectors.into_iter().enumerate() {
1156                let chunk_idx = batch_start + i;
1157                entries.push(EmbeddingEntry {
1158                    chunk: chunks[chunk_idx].clone(),
1159                    vector,
1160                });
1161            }
1162
1163            if let Some(callback) = progress.as_mut() {
1164                callback(entries.len(), total_chunks);
1165            }
1166        }
1167
1168        let dimension = entries
1169            .first()
1170            .map(|e| e.vector.len())
1171            .unwrap_or(DEFAULT_DIMENSION);
1172
1173        Ok(Self {
1174            entries,
1175            file_mtimes: file_metadata
1176                .iter()
1177                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1178                .collect(),
1179            file_sizes: file_metadata
1180                .iter()
1181                .map(|(path, metadata)| (path.clone(), metadata.size))
1182                .collect(),
1183            file_hashes: file_metadata
1184                .into_iter()
1185                .map(|(path, metadata)| (path, metadata.content_hash))
1186                .collect(),
1187            dimension,
1188            fingerprint: None,
1189            project_root: project_root.to_path_buf(),
1190        })
1191    }
1192
1193    /// Build the semantic index from a set of files using the provided embedding function.
1194    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1195    pub fn build<F>(
1196        project_root: &Path,
1197        files: &[PathBuf],
1198        embed_fn: &mut F,
1199        max_batch_size: usize,
1200    ) -> Result<Self, String>
1201    where
1202        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1203    {
1204        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1205        Self::build_from_chunks(
1206            project_root,
1207            chunks,
1208            file_mtimes,
1209            embed_fn,
1210            max_batch_size,
1211            Option::<&mut fn(usize, usize)>::None,
1212        )
1213    }
1214
1215    /// Build the semantic index and report embedding progress using entry counts.
1216    pub fn build_with_progress<F, P>(
1217        project_root: &Path,
1218        files: &[PathBuf],
1219        embed_fn: &mut F,
1220        max_batch_size: usize,
1221        progress: &mut P,
1222    ) -> Result<Self, String>
1223    where
1224        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1225        P: FnMut(usize, usize),
1226    {
1227        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1228        let total_chunks = chunks.len();
1229        progress(0, total_chunks);
1230        Self::build_from_chunks(
1231            project_root,
1232            chunks,
1233            file_mtimes,
1234            embed_fn,
1235            max_batch_size,
1236            Some(progress),
1237        )
1238    }
1239
    /// Incrementally refresh entries for changed/new files only, preserving cached
    /// embeddings for unchanged files. Used when loading the index from disk and
    /// finding that a small fraction of files have moved on, deleted, or appeared.
    ///
    /// Returns `RefreshSummary` describing what changed. On success, `self` is
    /// mutated in place and remains a valid index.
    ///
    /// `current_files` is the full set of files the project considers indexable
    /// (typically `walk_project_files(...)`). Files in the cache that are no
    /// longer in this set are treated as deleted.
    ///
    /// `progress` is called with `(0, 0)` when nothing needs embedding, otherwise
    /// with `(0, total_chunks)` before the first batch and
    /// `(embedded_so_far, total_chunks)` after each batch.
    ///
    /// # Errors
    /// Returns `Err` if `embed_fn` fails, if a batch is rejected by
    /// `validate_embedding_batch`, or if the new vectors' dimension differs from
    /// the cached index's. On error, deletions and refreshed mtime/size values
    /// have already been applied, but entries of changed files are untouched.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // 1. Bucket files into deleted / changed / added.
        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Size of the union of walked and cached files: |walked| + |cached| - |both|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Files in cache that disappeared from disk OR are no longer in the
        // walked set. Both cases need their entries dropped.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Snapshot the keys so the maps can be updated inside the loop.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // A file missing any of mtime/size/hash cannot be verified and is
            // treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
                Some(FreshnessVerdict::HotFresh) => {}
                // Record the new mtime/size and keep the existing embeddings.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                // `Deleted` for a still-walked path is queued as changed; the
                // "vanished" check after re-collection drops it if it is gone.
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files in walk that were never indexed.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Fast path: nothing to do.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // 2. Drop entries for deleted files immediately. Changed files are only
        //    replaced after successful re-extraction + embedding so transient
        //    read/parse errors keep the stale-but-valid cache entry.
        if !deleted.is_empty() {
            self.remove_indexed_files(&deleted);
        }

        // 3. Embed the changed + added set, if any.
        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            // Only deletions happened.
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
        // Changed files that failed collection AND no longer exist on disk are
        // reclassified as deleted; failures for files that still exist keep
        // their old entries.
        let vanished = to_embed
            .iter()
            .filter(|path| {
                changed_set.contains(path.as_path())
                    && !fresh_metadata.contains_key(*path)
                    && !path.exists()
            })
            .cloned()
            .collect::<Vec<_>>();
        if !vanished.is_empty() {
            self.remove_indexed_files(&vanished);
            deleted.extend(vanished);
        }

        // No chunks to embed: files that were collected successfully now have
        // zero symbols, so drop their old entries and record fresh metadata.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // 4. Embed in batches and dimension-check against the existing index.
        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // An index without entries has no dimension to enforce; the first new
        // batch defines it.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        // Refuse to mix dimensions in one index. Caller should
                        // fall back to a full rebuild.
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // 5. Every batch succeeded: swap the old entries of successfully
        //    collected files for the new ones and record their metadata.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        // Only files whose collection succeeded count as changed/added.
        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1471
1472    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1473        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1474        self.entries
1475            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1476        for path in files {
1477            self.file_mtimes.remove(path);
1478            self.file_sizes.remove(path);
1479            self.file_hashes.remove(path);
1480        }
1481    }
1482
1483    /// Search the index with a query embedding, returning top-K results sorted by relevance
1484    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1485        if self.entries.is_empty() || query_vector.len() != self.dimension {
1486            return Vec::new();
1487        }
1488
1489        let mut scored: Vec<(f32, usize)> = self
1490            .entries
1491            .iter()
1492            .enumerate()
1493            .map(|(i, entry)| {
1494                let mut score = cosine_similarity(query_vector, &entry.vector);
1495                if entry.chunk.exported {
1496                    score *= 1.1;
1497                }
1498                (score, i)
1499            })
1500            .collect();
1501
1502        // Sort descending by score
1503        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1504
1505        scored
1506            .into_iter()
1507            .take(top_k)
1508            // Keep the sort → take → map ordering explicit: removing the old
1509            // `> 0.0` floor cannot evict positive hits because top_k has already
1510            // been selected, but it can surface zero-score noise in the tail.
1511            .map(|(score, idx)| {
1512                let entry = &self.entries[idx];
1513                SemanticResult {
1514                    file: entry.chunk.file.clone(),
1515                    name: entry.chunk.name.clone(),
1516                    kind: entry.chunk.kind.clone(),
1517                    start_line: entry.chunk.start_line,
1518                    end_line: entry.chunk.end_line,
1519                    exported: entry.chunk.exported,
1520                    snippet: entry.chunk.snippet.clone(),
1521                    score,
1522                    source: "semantic",
1523                }
1524            })
1525            .collect()
1526    }
1527
1528    /// Number of indexed entries
1529    pub fn len(&self) -> usize {
1530        self.entries.len()
1531    }
1532
1533    /// Check if a file needs re-indexing based on mtime/size
1534    pub fn is_file_stale(&self, file: &Path) -> bool {
1535        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1536            return true;
1537        };
1538        let Some(stored_size) = self.file_sizes.get(file) else {
1539            return true;
1540        };
1541        let Some(stored_hash) = self.file_hashes.get(file) else {
1542            return true;
1543        };
1544        let cached = FileFreshness {
1545            mtime: *stored_mtime,
1546            size: *stored_size,
1547            content_hash: *stored_hash,
1548        };
1549        match cache_freshness::verify_file(file, &cached) {
1550            FreshnessVerdict::HotFresh => false,
1551            FreshnessVerdict::ContentFresh { .. } => false,
1552            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1553        }
1554    }
1555
1556    fn backfill_missing_file_sizes(&mut self) {
1557        for path in self.file_mtimes.keys() {
1558            if self.file_sizes.contains_key(path) {
1559                continue;
1560            }
1561            if let Ok(metadata) = fs::metadata(path) {
1562                self.file_sizes.insert(path.clone(), metadata.len());
1563                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1564                    self.file_hashes.insert(path.clone(), hash);
1565                }
1566            }
1567        }
1568    }
1569
1570    /// Remove entries for a specific file
1571    pub fn remove_file(&mut self, file: &Path) {
1572        self.invalidate_file(file);
1573    }
1574
1575    pub fn invalidate_file(&mut self, file: &Path) {
1576        self.entries.retain(|e| e.chunk.file != file);
1577        self.file_mtimes.remove(file);
1578        self.file_sizes.remove(file);
1579        self.file_hashes.remove(file);
1580    }
1581
1582    /// Get the embedding dimension
1583    pub fn dimension(&self) -> usize {
1584        self.dimension
1585    }
1586
1587    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
1588        self.fingerprint.as_ref()
1589    }
1590
1591    pub fn backend_label(&self) -> Option<&str> {
1592        self.fingerprint.as_ref().map(|f| f.backend.as_str())
1593    }
1594
1595    pub fn model_label(&self) -> Option<&str> {
1596        self.fingerprint.as_ref().map(|f| f.model.as_str())
1597    }
1598
1599    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
1600        self.fingerprint = Some(fingerprint);
1601    }
1602
1603    /// Write the semantic index to disk using atomic temp+rename pattern
1604    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1605        // Don't persist empty indexes — they would be loaded on next startup
1606        // and prevent a fresh build that might find files.
1607        if self.entries.is_empty() {
1608            slog_info!("skipping semantic index persistence (0 entries)");
1609            return;
1610        }
1611        let dir = storage_dir.join("semantic").join(project_key);
1612        if let Err(e) = fs::create_dir_all(&dir) {
1613            slog_warn!("failed to create semantic cache dir: {}", e);
1614            return;
1615        }
1616        let data_path = dir.join("semantic.bin");
1617        let tmp_path = dir.join(format!(
1618            "semantic.bin.tmp.{}.{}",
1619            std::process::id(),
1620            SystemTime::now()
1621                .duration_since(SystemTime::UNIX_EPOCH)
1622                .unwrap_or(Duration::ZERO)
1623                .as_nanos()
1624        ));
1625        let bytes = self.to_bytes();
1626        let write_result = (|| -> std::io::Result<()> {
1627            use std::io::Write;
1628            let mut file = fs::File::create(&tmp_path)?;
1629            file.write_all(&bytes)?;
1630            file.sync_all()?;
1631            Ok(())
1632        })();
1633        if let Err(e) = write_result {
1634            slog_warn!("failed to write semantic index: {}", e);
1635            let _ = fs::remove_file(&tmp_path);
1636            return;
1637        }
1638        if let Err(e) = fs::rename(&tmp_path, &data_path) {
1639            slog_warn!("failed to rename semantic index: {}", e);
1640            let _ = fs::remove_file(&tmp_path);
1641            return;
1642        }
1643        slog_info!(
1644            "semantic index persisted: {} entries, {:.1} KB",
1645            self.entries.len(),
1646            bytes.len() as f64 / 1024.0
1647        );
1648    }
1649
1650    /// Read the semantic index from disk
1651    pub fn read_from_disk(
1652        storage_dir: &Path,
1653        project_key: &str,
1654        current_canonical_root: &Path,
1655        is_worktree_bridge: bool,
1656        expected_fingerprint: Option<&str>,
1657    ) -> Option<Self> {
1658        debug_assert!(current_canonical_root.is_absolute());
1659        let data_path = storage_dir
1660            .join("semantic")
1661            .join(project_key)
1662            .join("semantic.bin");
1663        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1664        if file_len < HEADER_BYTES_V1 {
1665            slog_warn!(
1666                "corrupt semantic index (too small: {} bytes), removing",
1667                file_len
1668            );
1669            if !is_worktree_bridge {
1670                let _ = fs::remove_file(&data_path);
1671            }
1672            return None;
1673        }
1674
1675        let bytes = fs::read(&data_path).ok()?;
1676        let version = bytes[0];
1677        if version != SEMANTIC_INDEX_VERSION_V6 {
1678            slog_info!(
1679                "cached semantic index version {} is older than {}, rebuilding",
1680                version,
1681                SEMANTIC_INDEX_VERSION_V6
1682            );
1683            if !is_worktree_bridge {
1684                let _ = fs::remove_file(&data_path);
1685            }
1686            return None;
1687        }
1688        match Self::from_bytes(&bytes, current_canonical_root) {
1689            Ok(index) => {
1690                if index.entries.is_empty() {
1691                    slog_info!("cached semantic index is empty, will rebuild");
1692                    if !is_worktree_bridge {
1693                        let _ = fs::remove_file(&data_path);
1694                    }
1695                    return None;
1696                }
1697                if let Some(expected) = expected_fingerprint {
1698                    let matches = index
1699                        .fingerprint()
1700                        .map(|fingerprint| fingerprint.matches_expected(expected))
1701                        .unwrap_or(false);
1702                    if !matches {
1703                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1704                        if !is_worktree_bridge {
1705                            let _ = fs::remove_file(&data_path);
1706                        }
1707                        return None;
1708                    }
1709                }
1710                slog_info!(
1711                    "loaded semantic index from disk: {} entries",
1712                    index.entries.len()
1713                );
1714                Some(index)
1715            }
1716            Err(e) => {
1717                slog_warn!("corrupt semantic index, rebuilding: {}", e);
1718                if !is_worktree_bridge {
1719                    let _ = fs::remove_file(&data_path);
1720                }
1721                None
1722            }
1723        }
1724    }
1725
1726    /// Serialize the index to bytes for disk persistence
1727    pub fn to_bytes(&self) -> Vec<u8> {
1728        let mut buf = Vec::new();
1729        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1730            let encoded = fingerprint.as_string();
1731            if encoded.is_empty() {
1732                None
1733            } else {
1734                Some(encoded.into_bytes())
1735            }
1736        });
1737        let file_mtimes: Vec<_> = self
1738            .file_mtimes
1739            .iter()
1740            .filter_map(|(path, mtime)| {
1741                cache_relative_path(&self.project_root, path)
1742                    .map(|relative| (relative, path, mtime))
1743            })
1744            .collect();
1745        let entries: Vec<_> = self
1746            .entries
1747            .iter()
1748            .filter_map(|entry| {
1749                cache_relative_path(&self.project_root, &entry.chunk.file)
1750                    .map(|relative| (relative, entry))
1751            })
1752            .collect();
1753
1754        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
1755        //
1756        // V6 is the single write format. Layout extends V5:
1757        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
1758        //     no bytes follow). Uniform format simplifies the reader.
1759        //   - paths are relative to project_root.
1760        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
1761        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
1762        //
1763        // V1/V2 remain readable for backward compatibility (see from_bytes).
1764        // V3/V4 load as compatible formats but are rejected on disk so snippets
1765        // and file sizes are rebuilt once.
1766        let version = SEMANTIC_INDEX_VERSION_V6;
1767        buf.push(version);
1768        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1769        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1770        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1771        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1772        buf.extend_from_slice(fp_bytes_ref);
1773
1774        // File mtime table: count(4) + entries
1775        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
1776        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1777        for (relative, path, mtime) in &file_mtimes {
1778            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1779            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1780            buf.extend_from_slice(&path_bytes);
1781            let duration = mtime
1782                .duration_since(SystemTime::UNIX_EPOCH)
1783                .unwrap_or_default();
1784            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1785            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1786            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1787            buf.extend_from_slice(&size.to_le_bytes());
1788            let hash = self
1789                .file_hashes
1790                .get(*path)
1791                .copied()
1792                .unwrap_or_else(cache_freshness::zero_hash);
1793            buf.extend_from_slice(hash.as_bytes());
1794        }
1795
1796        // Entries: each is metadata + vector
1797        for (relative, entry) in &entries {
1798            let c = &entry.chunk;
1799
1800            // File path
1801            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1802            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1803            buf.extend_from_slice(&file_bytes);
1804
1805            // Name
1806            let name_bytes = c.name.as_bytes();
1807            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1808            buf.extend_from_slice(name_bytes);
1809
1810            // Kind (1 byte)
1811            buf.push(symbol_kind_to_u8(&c.kind));
1812
1813            // Lines + exported
1814            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1815            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1816            buf.push(c.exported as u8);
1817
1818            // Snippet
1819            let snippet_bytes = c.snippet.as_bytes();
1820            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1821            buf.extend_from_slice(snippet_bytes);
1822
1823            // Embed text
1824            let embed_bytes = c.embed_text.as_bytes();
1825            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1826            buf.extend_from_slice(embed_bytes);
1827
1828            // Vector (f32 array)
1829            for &val in &entry.vector {
1830                buf.extend_from_slice(&val.to_le_bytes());
1831            }
1832        }
1833
1834        buf
1835    }
1836
1837    /// Deserialize the index from bytes
1838    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
1839        debug_assert!(current_canonical_root.is_absolute());
1840        let mut pos = 0;
1841
1842        if data.len() < HEADER_BYTES_V1 {
1843            return Err("data too short".to_string());
1844        }
1845
1846        let version = data[pos];
1847        pos += 1;
1848        if version != SEMANTIC_INDEX_VERSION_V1
1849            && version != SEMANTIC_INDEX_VERSION_V2
1850            && version != SEMANTIC_INDEX_VERSION_V3
1851            && version != SEMANTIC_INDEX_VERSION_V4
1852            && version != SEMANTIC_INDEX_VERSION_V5
1853            && version != SEMANTIC_INDEX_VERSION_V6
1854        {
1855            return Err(format!("unsupported version: {}", version));
1856        }
1857        // V2 and newer share the same header layout (V3/V4/V5 only differ from
1858        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
1859        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
1860        if (version == SEMANTIC_INDEX_VERSION_V2
1861            || version == SEMANTIC_INDEX_VERSION_V3
1862            || version == SEMANTIC_INDEX_VERSION_V4
1863            || version == SEMANTIC_INDEX_VERSION_V5
1864            || version == SEMANTIC_INDEX_VERSION_V6)
1865            && data.len() < HEADER_BYTES_V2
1866        {
1867            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
1868        }
1869
1870        let dimension = read_u32(data, &mut pos)? as usize;
1871        let entry_count = read_u32(data, &mut pos)? as usize;
1872        validate_embedding_dimension(dimension)?;
1873        if entry_count > MAX_ENTRIES {
1874            return Err(format!("too many semantic index entries: {}", entry_count));
1875        }
1876
1877        // Fingerprint handling:
1878        //   - V1: no fingerprint field at all.
1879        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
1880        //     only emitted V2 when fingerprint was Some).
1881        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
1882        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
1883            || version == SEMANTIC_INDEX_VERSION_V3
1884            || version == SEMANTIC_INDEX_VERSION_V4
1885            || version == SEMANTIC_INDEX_VERSION_V5
1886            || version == SEMANTIC_INDEX_VERSION_V6;
1887        let fingerprint = if has_fingerprint_field {
1888            let fingerprint_len = read_u32(data, &mut pos)? as usize;
1889            if pos + fingerprint_len > data.len() {
1890                return Err("unexpected end of data reading fingerprint".to_string());
1891            }
1892            if fingerprint_len == 0 {
1893                None
1894            } else {
1895                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
1896                pos += fingerprint_len;
1897                Some(
1898                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
1899                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
1900                )
1901            }
1902        } else {
1903            None
1904        };
1905
1906        // File mtimes
1907        let mtime_count = read_u32(data, &mut pos)? as usize;
1908        if mtime_count > MAX_ENTRIES {
1909            return Err(format!("too many semantic file mtimes: {}", mtime_count));
1910        }
1911
1912        let vector_bytes = entry_count
1913            .checked_mul(dimension)
1914            .and_then(|count| count.checked_mul(F32_BYTES))
1915            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1916        if vector_bytes > data.len().saturating_sub(pos) {
1917            return Err("semantic index vectors exceed available data".to_string());
1918        }
1919
1920        let mut file_mtimes = HashMap::with_capacity(mtime_count);
1921        let mut file_sizes = HashMap::with_capacity(mtime_count);
1922        let mut file_hashes = HashMap::with_capacity(mtime_count);
1923        for _ in 0..mtime_count {
1924            let path = read_string(data, &mut pos)?;
1925            let secs = read_u64(data, &mut pos)?;
1926            // V3+ persists subsec_nanos alongside secs so staleness checks
1927            // survive restart round-trips. V1/V2 load with 0 nanos, which
1928            // causes one rebuild on upgrade (they never matched live APFS
1929            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
1930            // the cache is persisted as V3 and stabilises.
1931            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
1932                || version == SEMANTIC_INDEX_VERSION_V4
1933                || version == SEMANTIC_INDEX_VERSION_V5
1934                || version == SEMANTIC_INDEX_VERSION_V6
1935            {
1936                read_u32(data, &mut pos)?
1937            } else {
1938                0
1939            };
1940            let size =
1941                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
1942                    read_u64(data, &mut pos)?
1943                } else {
1944                    0
1945                };
1946            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
1947                if pos + 32 > data.len() {
1948                    return Err("unexpected end of data reading content hash".to_string());
1949                }
1950                let mut hash_bytes = [0u8; 32];
1951                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
1952                pos += 32;
1953                blake3::Hash::from_bytes(hash_bytes)
1954            } else {
1955                cache_freshness::zero_hash()
1956            };
1957            // Hardening against corrupt / maliciously crafted cache files
1958            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
1959            // nanosecond carry overflows the second counter, and
1960            // `SystemTime + Duration` can panic on carry past the platform's
1961            // upper bound. Explicit validation keeps a corrupted semantic.bin
1962            // from taking down the whole aft process.
1963            if nanos >= 1_000_000_000 {
1964                return Err(format!(
1965                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
1966                    nanos
1967                ));
1968            }
1969            let duration = std::time::Duration::new(secs, nanos);
1970            let mtime = SystemTime::UNIX_EPOCH
1971                .checked_add(duration)
1972                .ok_or_else(|| {
1973                    format!(
1974                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
1975                        secs, nanos
1976                    )
1977                })?;
1978            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
1979                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
1980                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
1981            } else {
1982                PathBuf::from(path)
1983            };
1984            file_mtimes.insert(path.clone(), mtime);
1985            file_sizes.insert(path.clone(), size);
1986            file_hashes.insert(path, content_hash);
1987        }
1988
1989        // Entries
1990        let mut entries = Vec::with_capacity(entry_count);
1991        for _ in 0..entry_count {
1992            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
1993            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
1994                cached_path_under_root(current_canonical_root, &raw_file)
1995                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
1996            } else {
1997                raw_file
1998            };
1999            let name = read_string(data, &mut pos)?;
2000
2001            if pos >= data.len() {
2002                return Err("unexpected end of data".to_string());
2003            }
2004            let kind = u8_to_symbol_kind(data[pos]);
2005            pos += 1;
2006
2007            let start_line = read_u32(data, &mut pos)?;
2008            let end_line = read_u32(data, &mut pos)?;
2009
2010            if pos >= data.len() {
2011                return Err("unexpected end of data".to_string());
2012            }
2013            let exported = data[pos] != 0;
2014            pos += 1;
2015
2016            let snippet = read_string(data, &mut pos)?;
2017            let embed_text = read_string(data, &mut pos)?;
2018
2019            // Vector
2020            let vec_bytes = dimension
2021                .checked_mul(F32_BYTES)
2022                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2023            if pos + vec_bytes > data.len() {
2024                return Err("unexpected end of data reading vector".to_string());
2025            }
2026            let mut vector = Vec::with_capacity(dimension);
2027            for _ in 0..dimension {
2028                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2029                vector.push(f32::from_le_bytes(bytes));
2030                pos += 4;
2031            }
2032
2033            entries.push(EmbeddingEntry {
2034                chunk: SemanticChunk {
2035                    file,
2036                    name,
2037                    kind,
2038                    start_line,
2039                    end_line,
2040                    exported,
2041                    embed_text,
2042                    snippet,
2043                },
2044                vector,
2045            });
2046        }
2047
2048        if entries.len() != entry_count {
2049            return Err(format!(
2050                "semantic cache entry count drift: header={} decoded={}",
2051                entry_count,
2052                entries.len()
2053            ));
2054        }
2055        for entry in &entries {
2056            if !file_mtimes.contains_key(&entry.chunk.file) {
2057                return Err(format!(
2058                    "semantic cache metadata missing for entry file {}",
2059                    entry.chunk.file.display()
2060                ));
2061            }
2062        }
2063
2064        Ok(Self {
2065            entries,
2066            file_mtimes,
2067            file_sizes,
2068            file_hashes,
2069            dimension,
2070            fingerprint,
2071            project_root: current_canonical_root.to_path_buf(),
2072        })
2073    }
2074}
2075
2076/// Build enriched embedding text from a symbol with cAST-style context
2077fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2078    let relative = file
2079        .strip_prefix(project_root)
2080        .unwrap_or(file)
2081        .to_string_lossy();
2082
2083    let kind_label = match &symbol.kind {
2084        SymbolKind::Function => "function",
2085        SymbolKind::Class => "class",
2086        SymbolKind::Method => "method",
2087        SymbolKind::Struct => "struct",
2088        SymbolKind::Interface => "interface",
2089        SymbolKind::Enum => "enum",
2090        SymbolKind::TypeAlias => "type",
2091        SymbolKind::Variable => "variable",
2092        SymbolKind::Heading => "heading",
2093        SymbolKind::FileSummary => "file-summary",
2094    };
2095
2096    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2097    let name = &symbol.name;
2098    let mut text = format!(
2099        "name:{name} file:{} kind:{} name:{name}",
2100        relative, kind_label
2101    );
2102
2103    if let Some(sig) = &symbol.signature {
2104        text.push_str(&format!(" signature:{}", sig));
2105    }
2106
2107    // Add body snippet (first ~300 chars of symbol body)
2108    let lines: Vec<&str> = source.lines().collect();
2109    let start = (symbol.range.start_line as usize).min(lines.len());
2110    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2111    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2112    if start < end {
2113        let body: String = lines[start..end]
2114            .iter()
2115            .take(15) // max 15 lines
2116            .copied()
2117            .collect::<Vec<&str>>()
2118            .join("\n");
2119        let snippet = if body.len() > 300 {
2120            format!("{}...", &body[..body.floor_char_boundary(300)])
2121        } else {
2122            body
2123        };
2124        text.push_str(&format!(" body:{}", snippet));
2125    }
2126
2127    text
2128}
2129
/// Returns the first `max_chars` characters (Unicode scalar values) of
/// `value`, or all of `value` when it is not longer than that.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the prefix of `max_chars` characters ends.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2133
2134fn first_leading_doc_comment(source: &str) -> String {
2135    let lines: Vec<&str> = source.lines().collect();
2136    let Some((start, first)) = lines
2137        .iter()
2138        .enumerate()
2139        .find(|(_, line)| !line.trim().is_empty())
2140    else {
2141        return String::new();
2142    };
2143
2144    let trimmed = first.trim_start();
2145    if trimmed.starts_with("/**") {
2146        let mut comment = Vec::new();
2147        for line in lines.iter().skip(start) {
2148            comment.push(*line);
2149            if line.contains("*/") {
2150                break;
2151            }
2152        }
2153        return truncate_chars(&comment.join("\n"), 200);
2154    }
2155
2156    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2157        let comment = lines
2158            .iter()
2159            .skip(start)
2160            .take_while(|line| {
2161                let trimmed = line.trim_start();
2162                trimmed.starts_with("///") || trimmed.starts_with("//!")
2163            })
2164            .copied()
2165            .collect::<Vec<_>>()
2166            .join("\n");
2167        return truncate_chars(&comment, 200);
2168    }
2169
2170    String::new()
2171}
2172
2173pub fn build_file_summary_chunk(
2174    file: &Path,
2175    project_root: &Path,
2176    source: &str,
2177    top_exports: &[&str],
2178    top_export_signatures: &[Option<&str>],
2179) -> SemanticChunk {
2180    let relative = file.strip_prefix(project_root).unwrap_or(file);
2181    let rel_path = relative.to_string_lossy();
2182    let parent_dir = relative
2183        .parent()
2184        .map(|parent| parent.to_string_lossy().to_string())
2185        .unwrap_or_default();
2186    let name = file
2187        .file_stem()
2188        .map(|stem| stem.to_string_lossy().to_string())
2189        .unwrap_or_default();
2190    let doc = first_leading_doc_comment(source);
2191    let exports = top_exports
2192        .iter()
2193        .take(5)
2194        .copied()
2195        .collect::<Vec<_>>()
2196        .join(",");
2197    let snippet = if doc.is_empty() {
2198        top_export_signatures
2199            .first()
2200            .and_then(|signature| signature.as_deref())
2201            .map(|signature| truncate_chars(signature, 200))
2202            .unwrap_or_default()
2203    } else {
2204        doc.clone()
2205    };
2206
2207    SemanticChunk {
2208        file: file.to_path_buf(),
2209        name,
2210        kind: SymbolKind::FileSummary,
2211        start_line: 0,
2212        end_line: 0,
2213        exported: false,
2214        embed_text: format!(
2215            "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2216            file.file_stem()
2217                .map(|stem| stem.to_string_lossy().to_string())
2218                .unwrap_or_default()
2219        ),
2220        snippet,
2221    }
2222}
2223
2224fn parser_for(
2225    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2226    lang: crate::parser::LangId,
2227) -> Result<&mut Parser, String> {
2228    use std::collections::hash_map::Entry;
2229
2230    match parsers.entry(lang) {
2231        Entry::Occupied(entry) => Ok(entry.into_mut()),
2232        Entry::Vacant(entry) => {
2233            let grammar = grammar_for(lang);
2234            let mut parser = Parser::new();
2235            parser
2236                .set_language(&grammar)
2237                .map_err(|error| error.to_string())?;
2238            Ok(entry.insert(parser))
2239        }
2240    }
2241}
2242
/// Reports whether `path` has one of the file extensions covered by the
/// semantic index.
///
/// The comparison is case-sensitive; paths with no extension or a non-UTF-8
/// extension return `false`.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2270
2271fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2272    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2273    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2274    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2275        .map_err(|error| error.to_string())?
2276        .unwrap_or_else(cache_freshness::zero_hash);
2277    Ok(IndexedFileMetadata {
2278        mtime,
2279        size: metadata.len(),
2280        content_hash,
2281    })
2282}
2283
2284fn collect_file_chunks(
2285    project_root: &Path,
2286    file: &Path,
2287    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2288) -> Result<Vec<SemanticChunk>, String> {
2289    if !is_semantic_indexed_extension(file) {
2290        return Err("unsupported file extension".to_string());
2291    }
2292    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2293    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2294    let tree = parser_for(parsers, lang)?
2295        .parse(&source, None)
2296        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2297    let symbols =
2298        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2299
2300    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2301}
2302
2303/// Build a display snippet from a symbol's source
2304fn build_snippet(symbol: &Symbol, source: &str) -> String {
2305    let lines: Vec<&str> = source.lines().collect();
2306    let start = (symbol.range.start_line as usize).min(lines.len());
2307    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2308    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2309    if start < end {
2310        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2311        let mut snippet = snippet_lines.join("\n");
2312        if end - start > 5 {
2313            snippet.push_str("\n  ...");
2314        }
2315        if snippet.len() > 300 {
2316            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2317        }
2318        snippet
2319    } else {
2320        String::new()
2321    }
2322}
2323
2324/// Convert symbols to semantic chunks with enriched context
2325fn symbols_to_chunks(
2326    file: &Path,
2327    symbols: &[Symbol],
2328    source: &str,
2329    project_root: &Path,
2330) -> Vec<SemanticChunk> {
2331    let mut chunks = Vec::new();
2332    let top_exports_with_signatures = symbols
2333        .iter()
2334        .filter(|symbol| {
2335            symbol.exported
2336                && symbol.parent.is_none()
2337                && !matches!(symbol.kind, SymbolKind::Heading)
2338        })
2339        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2340        .collect::<Vec<_>>();
2341
2342    let has_only_headings = !symbols.is_empty()
2343        && symbols
2344            .iter()
2345            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2346    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2347        let top_exports = top_exports_with_signatures
2348            .iter()
2349            .map(|(name, _)| *name)
2350            .collect::<Vec<_>>();
2351        let top_export_signatures = top_exports_with_signatures
2352            .iter()
2353            .map(|(_, signature)| *signature)
2354            .collect::<Vec<_>>();
2355        chunks.push(build_file_summary_chunk(
2356            file,
2357            project_root,
2358            source,
2359            &top_exports,
2360            &top_export_signatures,
2361        ));
2362    }
2363
2364    for symbol in symbols {
2365        // Skip Markdown / HTML heading chunks: empirically they dominate result
2366        // lists even for code-shaped queries because heading prose embeds well.
2367        // Agents querying for code lose the actual matches under doc noise.
2368        // README/docs queries are still served by grep on the same files.
2369        if matches!(symbol.kind, SymbolKind::Heading) {
2370            continue;
2371        }
2372
2373        // Skip very small symbols (single-line variables, etc.)
2374        let line_count = symbol
2375            .range
2376            .end_line
2377            .saturating_sub(symbol.range.start_line)
2378            + 1;
2379        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2380            continue;
2381        }
2382
2383        let embed_text = build_embed_text(symbol, source, file, project_root);
2384        let snippet = build_snippet(symbol, source);
2385
2386        chunks.push(SemanticChunk {
2387            file: file.to_path_buf(),
2388            name: symbol.name.clone(),
2389            kind: symbol.kind.clone(),
2390            start_line: symbol.range.start_line,
2391            end_line: symbol.range.end_line,
2392            exported: symbol.exported,
2393            embed_text,
2394            snippet,
2395        });
2396
2397        // Note: Nested symbols are handled separately by the outline system
2398        // Each symbol is indexed individually
2399    }
2400
2401    chunks
2402}
2403
/// Cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when the product of
/// their magnitudes is zero (which includes two empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (mut dot, mut norm_a, mut norm_b) = (0.0f32, 0.0f32, 0.0f32);
    for (&x, &y) in a.iter().zip(b) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2427
2428// Serialization helpers
2429fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2430    match kind {
2431        SymbolKind::Function => 0,
2432        SymbolKind::Class => 1,
2433        SymbolKind::Method => 2,
2434        SymbolKind::Struct => 3,
2435        SymbolKind::Interface => 4,
2436        SymbolKind::Enum => 5,
2437        SymbolKind::TypeAlias => 6,
2438        SymbolKind::Variable => 7,
2439        SymbolKind::Heading => 8,
2440        SymbolKind::FileSummary => 9,
2441    }
2442}
2443
2444fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2445    match v {
2446        0 => SymbolKind::Function,
2447        1 => SymbolKind::Class,
2448        2 => SymbolKind::Method,
2449        3 => SymbolKind::Struct,
2450        4 => SymbolKind::Interface,
2451        5 => SymbolKind::Enum,
2452        6 => SymbolKind::TypeAlias,
2453        7 => SymbolKind::Variable,
2454        8 => SymbolKind::Heading,
2455        9 => SymbolKind::FileSummary,
2456        _ => SymbolKind::Heading,
2457    }
2458}
2459
/// Borrows the next `len` bytes starting at `*pos` and moves the cursor past
/// them.
///
/// `what` names the value being decoded and only appears in the error
/// message. The end offset is computed with `checked_add` and the slice is
/// taken with `get`, so a corrupt cursor or length can neither overflow the
/// bounds check nor index out of range. On failure `*pos` is left untouched.
///
/// # Errors
/// Returns `"unexpected end of data reading <what>"` when fewer than `len`
/// bytes remain (or when `*pos + len` would overflow `usize`).
fn take_cursor_bytes<'a>(
    data: &'a [u8],
    pos: &mut usize,
    len: usize,
    what: &str,
) -> Result<&'a [u8], String> {
    let start = *pos;
    let bytes = start
        .checked_add(len)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| format!("unexpected end of data reading {}", what))?;
    *pos = start + len;
    Ok(bytes)
}

/// Reads a little-endian `u32` at `*pos` and advances the cursor by 4.
///
/// # Errors
/// Returns an error, without moving the cursor, when fewer than 4 bytes remain.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let mut raw = [0u8; 4];
    raw.copy_from_slice(take_cursor_bytes(data, pos, 4, "u32")?);
    Ok(u32::from_le_bytes(raw))
}

/// Reads a little-endian `u64` at `*pos` and advances the cursor by 8.
///
/// # Errors
/// Returns an error, without moving the cursor, when fewer than 8 bytes remain.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(take_cursor_bytes(data, pos, 8, "u64")?);
    Ok(u64::from_le_bytes(raw))
}

/// Reads a string stored as a little-endian `u32` byte length followed by that
/// many bytes, advancing the cursor past both.
///
/// Invalid UTF-8 is replaced lossily instead of being rejected.
///
/// # Errors
/// Returns an error when the length prefix or the payload is truncated. If
/// only the payload is truncated, the cursor has already moved past the
/// 4-byte length prefix.
fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
    let len = read_u32(data, pos)? as usize;
    // The length comes straight from the file, so it may be arbitrarily large;
    // the helper rejects it without overflowing, even where `usize` is 32-bit.
    let bytes = take_cursor_bytes(data, pos, len, "string")?;
    Ok(String::from_utf8_lossy(bytes).into_owned())
}
2487
2488#[cfg(test)]
2489mod tests {
2490    use super::*;
2491    use crate::config::{SemanticBackend, SemanticBackendConfig};
2492    use crate::parser::FileParser;
2493    use std::io::{Read, Write};
2494    use std::net::TcpListener;
2495    use std::thread;
2496
2497    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2498    where
2499        F: Fn(String, String, String) -> String + Send + 'static,
2500    {
2501        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2502        let addr = listener.local_addr().expect("local addr");
2503        let handle = thread::spawn(move || {
2504            let (mut stream, _) = listener.accept().expect("accept request");
2505            let mut buf = Vec::new();
2506            let mut chunk = [0u8; 4096];
2507            let mut header_end = None;
2508            let mut content_length = 0usize;
2509            loop {
2510                let n = stream.read(&mut chunk).expect("read request");
2511                if n == 0 {
2512                    break;
2513                }
2514                buf.extend_from_slice(&chunk[..n]);
2515                if header_end.is_none() {
2516                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2517                        header_end = Some(pos + 4);
2518                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2519                        for line in headers.lines() {
2520                            if let Some(value) = line.strip_prefix("Content-Length:") {
2521                                content_length = value.trim().parse::<usize>().unwrap_or(0);
2522                            }
2523                        }
2524                    }
2525                }
2526                if let Some(end) = header_end {
2527                    if buf.len() >= end + content_length {
2528                        break;
2529                    }
2530                }
2531            }
2532
2533            let end = header_end.expect("header terminator");
2534            let request = String::from_utf8_lossy(&buf[..end]).to_string();
2535            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2536            let mut lines = request.lines();
2537            let request_line = lines.next().expect("request line").to_string();
2538            let path = request_line
2539                .split_whitespace()
2540                .nth(1)
2541                .expect("request path")
2542                .to_string();
2543            let response_body = handler(request_line, path, body);
2544            let response = format!(
2545                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2546                response_body.len(),
2547                response_body
2548            );
2549            stream
2550                .write_all(response.as_bytes())
2551                .expect("write response");
2552        });
2553
2554        (format!("http://{}", addr), handle)
2555    }
2556
2557    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2558        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2559    }
2560
2561    fn write_rust_file(path: &Path, function_name: &str) {
2562        fs::write(
2563            path,
2564            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
2565        )
2566        .unwrap();
2567    }
2568
2569    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2570        let mut embed = test_vector_for_texts;
2571        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2572    }
2573
2574    fn test_project_root() -> PathBuf {
2575        std::env::current_dir().unwrap()
2576    }
2577
2578    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2579        index.file_mtimes.insert(file.to_path_buf(), mtime);
2580        index.file_sizes.insert(file.to_path_buf(), size);
2581        index
2582            .file_hashes
2583            .insert(file.to_path_buf(), cache_freshness::zero_hash());
2584    }
2585
2586    #[test]
2587    fn semantic_cache_serialization_skips_paths_outside_project_root() {
2588        let dir = tempfile::tempdir().expect("create temp dir");
2589        let project = fs::canonicalize(dir.path()).expect("canonical project");
2590        let outside = project.join("..").join("outside.rs");
2591        let mut index = SemanticIndex::new(project.clone(), 3);
2592        index
2593            .file_mtimes
2594            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2595        index.file_sizes.insert(outside.clone(), 1);
2596        index
2597            .file_hashes
2598            .insert(outside.clone(), cache_freshness::zero_hash());
2599        index.entries.push(EmbeddingEntry {
2600            chunk: SemanticChunk {
2601                file: outside,
2602                name: "outside".to_string(),
2603                kind: SymbolKind::Function,
2604                start_line: 0,
2605                end_line: 0,
2606                exported: false,
2607                embed_text: "outside".to_string(),
2608                snippet: "outside".to_string(),
2609            },
2610            vector: vec![1.0, 0.0, 0.0],
2611        });
2612
2613        let bytes = index.to_bytes();
2614        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2615        assert_eq!(loaded.entries.len(), 0);
2616        assert!(loaded.file_mtimes.is_empty());
2617    }
2618
2619    #[test]
2620    fn test_cosine_similarity_identical() {
2621        let a = vec![1.0, 0.0, 0.0];
2622        let b = vec![1.0, 0.0, 0.0];
2623        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2624    }
2625
2626    #[test]
2627    fn test_cosine_similarity_orthogonal() {
2628        let a = vec![1.0, 0.0, 0.0];
2629        let b = vec![0.0, 1.0, 0.0];
2630        assert!(cosine_similarity(&a, &b).abs() < 0.001);
2631    }
2632
2633    #[test]
2634    fn test_cosine_similarity_opposite() {
2635        let a = vec![1.0, 0.0, 0.0];
2636        let b = vec![-1.0, 0.0, 0.0];
2637        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2638    }
2639
2640    #[test]
2641    fn test_serialization_roundtrip() {
2642        let project_root = test_project_root();
2643        let file = project_root.join("src/main.rs");
2644        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2645        index.entries.push(EmbeddingEntry {
2646            chunk: SemanticChunk {
2647                file: file.clone(),
2648                name: "handle_request".to_string(),
2649                kind: SymbolKind::Function,
2650                start_line: 10,
2651                end_line: 25,
2652                exported: true,
2653                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2654                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
2655            },
2656            vector: vec![0.1, 0.2, 0.3, 0.4],
2657        });
2658        index.dimension = 4;
2659        index
2660            .file_mtimes
2661            .insert(file.clone(), SystemTime::UNIX_EPOCH);
2662        index.file_sizes.insert(file, 0);
2663        index.set_fingerprint(SemanticIndexFingerprint {
2664            backend: "fastembed".to_string(),
2665            model: "all-MiniLM-L6-v2".to_string(),
2666            base_url: FALLBACK_BACKEND.to_string(),
2667            dimension: 4,
2668            chunking_version: default_chunking_version(),
2669        });
2670
2671        let bytes = index.to_bytes();
2672        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2673
2674        assert_eq!(restored.entries.len(), 1);
2675        assert_eq!(restored.entries[0].chunk.name, "handle_request");
2676        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2677        assert_eq!(restored.dimension, 4);
2678        assert_eq!(restored.backend_label(), Some("fastembed"));
2679        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2680    }
2681
2682    #[test]
2683    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2684        let cases = [
2685            (SymbolKind::Function, 0),
2686            (SymbolKind::Class, 1),
2687            (SymbolKind::Method, 2),
2688            (SymbolKind::Struct, 3),
2689            (SymbolKind::Interface, 4),
2690            (SymbolKind::Enum, 5),
2691            (SymbolKind::TypeAlias, 6),
2692            (SymbolKind::Variable, 7),
2693            (SymbolKind::Heading, 8),
2694            (SymbolKind::FileSummary, 9),
2695        ];
2696
2697        for (kind, encoded) in cases {
2698            assert_eq!(symbol_kind_to_u8(&kind), encoded);
2699            assert_eq!(u8_to_symbol_kind(encoded), kind);
2700        }
2701    }
2702
2703    #[test]
2704    fn test_search_top_k() {
2705        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2706        index.dimension = 3;
2707
2708        // Add entries with known vectors
2709        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2710            let mut vec = vec![0.0f32; 3];
2711            vec[i] = 1.0; // orthogonal vectors
2712            index.entries.push(EmbeddingEntry {
2713                chunk: SemanticChunk {
2714                    file: PathBuf::from("/src/lib.rs"),
2715                    name: name.to_string(),
2716                    kind: SymbolKind::Function,
2717                    start_line: (i * 10 + 1) as u32,
2718                    end_line: (i * 10 + 5) as u32,
2719                    exported: true,
2720                    embed_text: format!("kind:function name:{}", name),
2721                    snippet: format!("fn {}() {{}}", name),
2722                },
2723                vector: vec,
2724            });
2725        }
2726
2727        // Query aligned with "auth" (index 0)
2728        let query = vec![0.9, 0.1, 0.0];
2729        let results = index.search(&query, 2);
2730
2731        assert_eq!(results.len(), 2);
2732        assert_eq!(results[0].name, "auth"); // highest score
2733        assert!(results[0].score > results[1].score);
2734    }
2735
2736    #[test]
2737    fn test_empty_index_search() {
2738        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2739        let results = index.search(&[0.1, 0.2, 0.3], 10);
2740        assert!(results.is_empty());
2741    }
2742
2743    #[test]
2744    fn single_line_symbol_builds_non_empty_snippet() {
2745        let symbol = Symbol {
2746            name: "answer".to_string(),
2747            kind: SymbolKind::Variable,
2748            range: crate::symbols::Range {
2749                start_line: 0,
2750                start_col: 0,
2751                end_line: 0,
2752                end_col: 24,
2753            },
2754            signature: Some("const answer = 42".to_string()),
2755            scope_chain: Vec::new(),
2756            exported: true,
2757            parent: None,
2758        };
2759        let source = "export const answer = 42;\n";
2760
2761        let snippet = build_snippet(&symbol, source);
2762
2763        assert_eq!(snippet, "export const answer = 42;");
2764    }
2765
2766    #[test]
2767    fn optimized_file_chunk_collection_matches_file_parser_path() {
2768        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2769        let file = project_root.join("src/semantic_index.rs");
2770        let source = std::fs::read_to_string(&file).unwrap();
2771
2772        let mut legacy_parser = FileParser::new();
2773        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2774        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2775
2776        let mut parsers = HashMap::new();
2777        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2778
2779        assert_eq!(
2780            chunk_fingerprint(&optimized_chunks),
2781            chunk_fingerprint(&legacy_chunks)
2782        );
2783    }
2784
2785    fn chunk_fingerprint(
2786        chunks: &[SemanticChunk],
2787    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2788        chunks
2789            .iter()
2790            .map(|chunk| {
2791                (
2792                    chunk.name.clone(),
2793                    chunk.kind.clone(),
2794                    chunk.start_line,
2795                    chunk.end_line,
2796                    chunk.exported,
2797                    chunk.embed_text.clone(),
2798                    chunk.snippet.clone(),
2799                )
2800            })
2801            .collect()
2802    }
2803
2804    #[test]
2805    fn rejects_oversized_dimension_during_deserialization() {
2806        let mut bytes = Vec::new();
2807        bytes.push(1u8);
2808        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2809        bytes.extend_from_slice(&0u32.to_le_bytes());
2810        bytes.extend_from_slice(&0u32.to_le_bytes());
2811
2812        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2813    }
2814
2815    #[test]
2816    fn rejects_oversized_entry_count_during_deserialization() {
2817        let mut bytes = Vec::new();
2818        bytes.push(1u8);
2819        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2820        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2821        bytes.extend_from_slice(&0u32.to_le_bytes());
2822
2823        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2824    }
2825
2826    #[test]
2827    fn invalidate_file_removes_entries_and_mtime() {
2828        let target = PathBuf::from("/src/main.rs");
2829        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2830        index.entries.push(EmbeddingEntry {
2831            chunk: SemanticChunk {
2832                file: target.clone(),
2833                name: "main".to_string(),
2834                kind: SymbolKind::Function,
2835                start_line: 0,
2836                end_line: 1,
2837                exported: false,
2838                embed_text: "main".to_string(),
2839                snippet: "fn main() {}".to_string(),
2840            },
2841            vector: vec![1.0; DEFAULT_DIMENSION],
2842        });
2843        index
2844            .file_mtimes
2845            .insert(target.clone(), SystemTime::UNIX_EPOCH);
2846        index.file_sizes.insert(target.clone(), 0);
2847
2848        index.invalidate_file(&target);
2849
2850        assert!(index.entries.is_empty());
2851        assert!(!index.file_mtimes.contains_key(&target));
2852        assert!(!index.file_sizes.contains_key(&target));
2853    }
2854
2855    #[test]
2856    fn refresh_missing_changed_file_is_purged_after_collect() {
2857        let temp = tempfile::tempdir().unwrap();
2858        let project_root = temp.path();
2859        let file = project_root.join("src/lib.rs");
2860        fs::create_dir_all(file.parent().unwrap()).unwrap();
2861        write_rust_file(&file, "vanished_symbol");
2862
2863        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2864        let original_size = *index.file_sizes.get(&file).unwrap();
2865        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
2866        fs::remove_file(&file).unwrap();
2867
2868        let mut embed = test_vector_for_texts;
2869        let mut progress = |_done: usize, _total: usize| {};
2870        let summary = index
2871            .refresh_stale_files(
2872                project_root,
2873                std::slice::from_ref(&file),
2874                &mut embed,
2875                8,
2876                &mut progress,
2877            )
2878            .unwrap();
2879
2880        assert_eq!(summary.changed, 0);
2881        assert_eq!(summary.added, 0);
2882        assert_eq!(summary.deleted, 1);
2883        assert!(index.entries.is_empty());
2884        assert!(!index.file_mtimes.contains_key(&file));
2885        assert!(!index.file_sizes.contains_key(&file));
2886        assert!(!index.file_hashes.contains_key(&file));
2887    }
2888
2889    #[test]
2890    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
2891        let temp = tempfile::tempdir().unwrap();
2892        let project_root = temp.path();
2893        let file = project_root.join("src/lib.rs");
2894        fs::create_dir_all(file.parent().unwrap()).unwrap();
2895        write_rust_file(&file, "kept_symbol");
2896
2897        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2898        let original_entry_count = index.entries.len();
2899        let original_mtime = *index.file_mtimes.get(&file).unwrap();
2900        let original_size = *index.file_sizes.get(&file).unwrap();
2901
2902        let stale_mtime = SystemTime::UNIX_EPOCH;
2903        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2904        fs::remove_file(&file).unwrap();
2905        fs::create_dir(&file).unwrap();
2906
2907        let mut embed = test_vector_for_texts;
2908        let mut progress = |_done: usize, _total: usize| {};
2909        let summary = index
2910            .refresh_stale_files(
2911                project_root,
2912                std::slice::from_ref(&file),
2913                &mut embed,
2914                8,
2915                &mut progress,
2916            )
2917            .unwrap();
2918
2919        assert_eq!(summary.changed, 0);
2920        assert_eq!(summary.added, 0);
2921        assert_eq!(summary.deleted, 0);
2922        assert_eq!(index.entries.len(), original_entry_count);
2923        assert!(index
2924            .entries
2925            .iter()
2926            .any(|entry| entry.chunk.name == "kept_symbol"));
2927        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2928        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2929        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2930    }
2931
2932    #[test]
2933    fn refresh_never_indexed_file_error_does_not_record_mtime() {
2934        let temp = tempfile::tempdir().unwrap();
2935        let project_root = temp.path();
2936        let missing = project_root.join("src/missing.rs");
2937        fs::create_dir_all(missing.parent().unwrap()).unwrap();
2938
2939        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2940        let mut embed = test_vector_for_texts;
2941        let mut progress = |_done: usize, _total: usize| {};
2942        let summary = index
2943            .refresh_stale_files(
2944                project_root,
2945                std::slice::from_ref(&missing),
2946                &mut embed,
2947                8,
2948                &mut progress,
2949            )
2950            .unwrap();
2951
2952        assert_eq!(summary.added, 0);
2953        assert_eq!(summary.changed, 0);
2954        assert_eq!(summary.deleted, 0);
2955        assert!(!index.file_mtimes.contains_key(&missing));
2956        assert!(!index.file_sizes.contains_key(&missing));
2957        assert!(index.entries.is_empty());
2958    }
2959
2960    #[test]
2961    fn refresh_reports_added_for_new_files() {
2962        let temp = tempfile::tempdir().unwrap();
2963        let project_root = temp.path();
2964        let existing = project_root.join("src/lib.rs");
2965        let added = project_root.join("src/new.rs");
2966        fs::create_dir_all(existing.parent().unwrap()).unwrap();
2967        write_rust_file(&existing, "existing_symbol");
2968        write_rust_file(&added, "added_symbol");
2969
2970        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2971        let mut embed = test_vector_for_texts;
2972        let mut progress = |_done: usize, _total: usize| {};
2973        let summary = index
2974            .refresh_stale_files(
2975                project_root,
2976                &[existing.clone(), added.clone()],
2977                &mut embed,
2978                8,
2979                &mut progress,
2980            )
2981            .unwrap();
2982
2983        assert_eq!(summary.added, 1);
2984        assert_eq!(summary.changed, 0);
2985        assert_eq!(summary.deleted, 0);
2986        assert_eq!(summary.total_processed, 2);
2987        assert!(index.file_mtimes.contains_key(&added));
2988        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2989    }
2990
2991    #[test]
2992    fn refresh_reports_deleted_for_removed_files() {
2993        let temp = tempfile::tempdir().unwrap();
2994        let project_root = temp.path();
2995        let deleted = project_root.join("src/deleted.rs");
2996        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2997        write_rust_file(&deleted, "deleted_symbol");
2998
2999        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
3000        fs::remove_file(&deleted).unwrap();
3001
3002        let mut embed = test_vector_for_texts;
3003        let mut progress = |_done: usize, _total: usize| {};
3004        let summary = index
3005            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
3006            .unwrap();
3007
3008        assert_eq!(summary.deleted, 1);
3009        assert_eq!(summary.changed, 0);
3010        assert_eq!(summary.added, 0);
3011        assert_eq!(summary.total_processed, 1);
3012        assert!(!index.file_mtimes.contains_key(&deleted));
3013        assert!(index.entries.is_empty());
3014    }
3015
3016    #[test]
3017    fn refresh_reports_changed_for_modified_files() {
3018        let temp = tempfile::tempdir().unwrap();
3019        let project_root = temp.path();
3020        let file = project_root.join("src/lib.rs");
3021        fs::create_dir_all(file.parent().unwrap()).unwrap();
3022        write_rust_file(&file, "old_symbol");
3023
3024        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3025        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
3026        write_rust_file(&file, "new_symbol");
3027
3028        let mut embed = test_vector_for_texts;
3029        let mut progress = |_done: usize, _total: usize| {};
3030        let summary = index
3031            .refresh_stale_files(
3032                project_root,
3033                std::slice::from_ref(&file),
3034                &mut embed,
3035                8,
3036                &mut progress,
3037            )
3038            .unwrap();
3039
3040        assert_eq!(summary.changed, 1);
3041        assert_eq!(summary.added, 0);
3042        assert_eq!(summary.deleted, 0);
3043        assert_eq!(summary.total_processed, 1);
3044        assert!(index
3045            .entries
3046            .iter()
3047            .any(|entry| entry.chunk.name == "new_symbol"));
3048        assert!(!index
3049            .entries
3050            .iter()
3051            .any(|entry| entry.chunk.name == "old_symbol"));
3052    }
3053
3054    #[test]
3055    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
3056        let temp = tempfile::tempdir().unwrap();
3057        let project_root = temp.path();
3058        let file = project_root.join("src/lib.rs");
3059        fs::create_dir_all(file.parent().unwrap()).unwrap();
3060        write_rust_file(&file, "clean_symbol");
3061
3062        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3063        let original_entries = index.entries.len();
3064        let mut embed_called = false;
3065        let mut embed = |texts: Vec<String>| {
3066            embed_called = true;
3067            test_vector_for_texts(texts)
3068        };
3069        let mut progress = |_done: usize, _total: usize| {};
3070        let summary = index
3071            .refresh_stale_files(
3072                project_root,
3073                std::slice::from_ref(&file),
3074                &mut embed,
3075                8,
3076                &mut progress,
3077            )
3078            .unwrap();
3079
3080        assert!(summary.is_noop());
3081        assert_eq!(summary.total_processed, 1);
3082        assert!(!embed_called);
3083        assert_eq!(index.entries.len(), original_entries);
3084    }
3085
3086    #[test]
3087    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3088        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3089
3090        assert!(is_onnx_runtime_unavailable(message));
3091    }
3092
3093    #[test]
3094    fn formats_missing_onnx_runtime_with_install_hint() {
3095        let message = format_embedding_init_error(
3096            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3097        );
3098
3099        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3100        assert!(message.contains("Original error:"));
3101    }
3102
3103    #[test]
3104    fn openai_compatible_backend_embeds_with_mock_server() {
3105        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3106            assert!(request_line.starts_with("POST "));
3107            assert_eq!(path, "/v1/embeddings");
3108            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3109        });
3110
3111        let config = SemanticBackendConfig {
3112            backend: SemanticBackend::OpenAiCompatible,
3113            model: "test-embedding".to_string(),
3114            base_url: Some(base_url),
3115            api_key_env: None,
3116            timeout_ms: 5_000,
3117            max_batch_size: 64,
3118        };
3119
3120        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3121        let vectors = model
3122            .embed(vec!["hello".to_string(), "world".to_string()])
3123            .unwrap();
3124
3125        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3126        handle.join().unwrap();
3127    }
3128
3129    /// Regression for issue #36: AFT was sending TWO Content-Type headers
3130    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
3131    /// and again explicitly via `.header("Content-Type", "application/json")`.
3132    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
3133    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
3134    /// with `HTTP 400 "you must provide a model parameter"` even though the
3135    /// body actually contains `model`. The fix is to drop the explicit
3136    /// `.header("Content-Type", ...)` call. This test pins that we send
3137    /// exactly one Content-Type header.
3138    #[test]
3139    fn openai_compatible_request_has_single_content_type_header() {
3140        use std::sync::{Arc, Mutex};
3141        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3142        let captured_for_thread = Arc::clone(&captured);
3143
3144        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3145        let addr = listener.local_addr().expect("local addr");
3146        let handle = thread::spawn(move || {
3147            let (mut stream, _) = listener.accept().expect("accept");
3148            let mut buf = Vec::new();
3149            let mut chunk = [0u8; 4096];
3150            let mut header_end = None;
3151            let mut content_length = 0usize;
3152            loop {
3153                let n = stream.read(&mut chunk).expect("read");
3154                if n == 0 {
3155                    break;
3156                }
3157                buf.extend_from_slice(&chunk[..n]);
3158                if header_end.is_none() {
3159                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3160                        header_end = Some(pos + 4);
3161                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3162                            if let Some(value) = line.strip_prefix("Content-Length:") {
3163                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3164                            }
3165                        }
3166                    }
3167                }
3168                if let Some(end) = header_end {
3169                    if buf.len() >= end + content_length {
3170                        break;
3171                    }
3172                }
3173            }
3174            *captured_for_thread.lock().unwrap() = buf;
3175            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3176            let response = format!(
3177                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3178                body.len(),
3179                body
3180            );
3181            let _ = stream.write_all(response.as_bytes());
3182        });
3183
3184        let config = SemanticBackendConfig {
3185            backend: SemanticBackend::OpenAiCompatible,
3186            model: "text-embedding-3-small".to_string(),
3187            base_url: Some(format!("http://{}", addr)),
3188            api_key_env: None,
3189            timeout_ms: 5_000,
3190            max_batch_size: 64,
3191        };
3192        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3193        let _ = model.embed(vec!["probe".to_string()]).unwrap();
3194        handle.join().unwrap();
3195
3196        let bytes = captured.lock().unwrap().clone();
3197        let request = String::from_utf8_lossy(&bytes);
3198
3199        // Lowercase line counts because HTTP headers are case-insensitive
3200        // and reqwest may emit `content-type` in lowercase under HTTP/2.
3201        let content_type_lines = request
3202            .lines()
3203            .filter(|line| {
3204                let lower = line.to_ascii_lowercase();
3205                lower.starts_with("content-type:")
3206            })
3207            .count();
3208        assert_eq!(
3209            content_type_lines, 1,
3210            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3211        );
3212
3213        // The body must still include the model field — pin this so a future
3214        // change can't accidentally drop `model` while fixing duplicate headers.
3215        assert!(
3216            request.contains(r#""model":"text-embedding-3-small""#),
3217            "request body should contain model field; full request:\n{request}",
3218        );
3219    }
3220
3221    #[test]
3222    fn ollama_backend_embeds_with_mock_server() {
3223        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3224            assert!(request_line.starts_with("POST "));
3225            assert_eq!(path, "/api/embed");
3226            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3227        });
3228
3229        let config = SemanticBackendConfig {
3230            backend: SemanticBackend::Ollama,
3231            model: "embeddinggemma".to_string(),
3232            base_url: Some(base_url),
3233            api_key_env: None,
3234            timeout_ms: 5_000,
3235            max_batch_size: 64,
3236        };
3237
3238        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3239        let vectors = model
3240            .embed(vec!["hello".to_string(), "world".to_string()])
3241            .unwrap();
3242
3243        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3244        handle.join().unwrap();
3245    }
3246
3247    #[test]
3248    fn read_from_disk_rejects_fingerprint_mismatch() {
3249        let storage = tempfile::tempdir().unwrap();
3250        let project_key = "proj";
3251
3252        let project_root = test_project_root();
3253        let file = project_root.join("src/main.rs");
3254        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3255        index.entries.push(EmbeddingEntry {
3256            chunk: SemanticChunk {
3257                file: file.clone(),
3258                name: "handle_request".to_string(),
3259                kind: SymbolKind::Function,
3260                start_line: 10,
3261                end_line: 25,
3262                exported: true,
3263                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3264                snippet: "fn handle_request() {}".to_string(),
3265            },
3266            vector: vec![0.1, 0.2, 0.3],
3267        });
3268        index.dimension = 3;
3269        index
3270            .file_mtimes
3271            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3272        index.file_sizes.insert(file, 0);
3273        index.set_fingerprint(SemanticIndexFingerprint {
3274            backend: "openai_compatible".to_string(),
3275            model: "test-embedding".to_string(),
3276            base_url: "http://127.0.0.1:1234/v1".to_string(),
3277            dimension: 3,
3278            chunking_version: default_chunking_version(),
3279        });
3280        index.write_to_disk(storage.path(), project_key);
3281
3282        let matching = index.fingerprint().unwrap().as_string();
3283        assert!(SemanticIndex::read_from_disk(
3284            storage.path(),
3285            project_key,
3286            &project_root,
3287            false,
3288            Some(&matching),
3289        )
3290        .is_some());
3291
3292        let mismatched = SemanticIndexFingerprint {
3293            backend: "ollama".to_string(),
3294            model: "embeddinggemma".to_string(),
3295            base_url: "http://127.0.0.1:11434".to_string(),
3296            dimension: 3,
3297            chunking_version: default_chunking_version(),
3298        }
3299        .as_string();
3300        assert!(SemanticIndex::read_from_disk(
3301            storage.path(),
3302            project_key,
3303            &project_root,
3304            false,
3305            Some(&mismatched),
3306        )
3307        .is_none());
3308    }
3309
3310    #[test]
3311    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3312        let storage = tempfile::tempdir().unwrap();
3313        let project_key = "proj-v3";
3314        let dir = storage.path().join("semantic").join(project_key);
3315        fs::create_dir_all(&dir).unwrap();
3316
3317        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3318        index.entries.push(EmbeddingEntry {
3319            chunk: SemanticChunk {
3320                file: PathBuf::from("/src/main.rs"),
3321                name: "handle_request".to_string(),
3322                kind: SymbolKind::Function,
3323                start_line: 0,
3324                end_line: 0,
3325                exported: true,
3326                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3327                snippet: "fn handle_request() {}".to_string(),
3328            },
3329            vector: vec![0.1, 0.2, 0.3],
3330        });
3331        index.dimension = 3;
3332        index
3333            .file_mtimes
3334            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3335        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3336        let fingerprint = SemanticIndexFingerprint {
3337            backend: "fastembed".to_string(),
3338            model: "test".to_string(),
3339            base_url: FALLBACK_BACKEND.to_string(),
3340            dimension: 3,
3341            chunking_version: default_chunking_version(),
3342        };
3343        index.set_fingerprint(fingerprint.clone());
3344
3345        let mut bytes = index.to_bytes();
3346        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3347        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3348
3349        assert!(SemanticIndex::read_from_disk(
3350            storage.path(),
3351            project_key,
3352            &test_project_root(),
3353            false,
3354            Some(&fingerprint.as_string())
3355        )
3356        .is_none());
3357        assert!(!dir.join("semantic.bin").exists());
3358    }
3359
3360    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3361        crate::symbols::Symbol {
3362            name: name.to_string(),
3363            kind,
3364            range: crate::symbols::Range {
3365                start_line: start,
3366                start_col: 0,
3367                end_line: end,
3368                end_col: 0,
3369            },
3370            signature: None,
3371            scope_chain: Vec::new(),
3372            exported: false,
3373            parent: None,
3374        }
3375    }
3376
3377    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3378    /// they overwhelmingly dominated semantic results even on code-shaped
3379    /// queries because heading prose embeds far more strongly than code
3380    /// chunks. Skipping headings keeps aft_search a code-finder.
3381    #[test]
3382    fn symbols_to_chunks_skips_heading_symbols() {
3383        let project_root = PathBuf::from("/proj");
3384        let file = project_root.join("README.md");
3385        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3386
3387        let symbols = vec![
3388            make_symbol(SymbolKind::Heading, "Title", 0, 2),
3389            make_symbol(SymbolKind::Heading, "Section", 4, 6),
3390        ];
3391
3392        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3393        assert!(
3394            chunks.is_empty(),
3395            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3396            chunks.len()
3397        );
3398    }
3399
3400    /// Code symbols (functions, classes, methods, structs, etc.) must still
3401    /// be indexed alongside the heading skip — otherwise we'd starve the
3402    /// index entirely.
3403    #[test]
3404    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3405        let project_root = PathBuf::from("/proj");
3406        let file = project_root.join("src/lib.rs");
3407        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
3408
3409        let symbols = vec![
3410            // A heading mixed in (e.g. from a doc comment block elsewhere).
3411            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3412            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3413            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3414        ];
3415
3416        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3417        assert_eq!(
3418            chunks.len(),
3419            3,
3420            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3421            chunks.len()
3422        );
3423        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3424        assert!(chunks
3425            .iter()
3426            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3427        assert!(names.contains(&"handle_request"));
3428        assert!(names.contains(&"AuthService"));
3429        assert!(
3430            !names.contains(&"doc heading"),
3431            "Heading symbol leaked into chunks: {names:?}"
3432        );
3433    }
3434
3435    #[test]
3436    fn validate_ssrf_allows_loopback_hostnames() {
3437        // Loopback hostnames are explicitly allowed so self-hosted backends
3438        // (Ollama at http://localhost:11434) work at their default config.
3439        for host in &[
3440            "http://localhost",
3441            "http://localhost:8080",
3442            "http://localhost:11434", // Ollama default
3443            "http://localhost.localdomain",
3444            "http://foo.localhost",
3445        ] {
3446            assert!(
3447                validate_base_url_no_ssrf(host).is_ok(),
3448                "Expected {host} to be allowed (loopback), got: {:?}",
3449                validate_base_url_no_ssrf(host)
3450            );
3451        }
3452    }
3453
3454    #[test]
3455    fn validate_ssrf_allows_loopback_ips() {
3456        // 127.0.0.0/8 is loopback — by definition same-machine and not an
3457        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
3458        for url in &[
3459            "http://127.0.0.1",
3460            "http://127.0.0.1:11434", // Ollama default
3461            "http://127.0.0.1:8080",
3462            "http://127.1.2.3",
3463        ] {
3464            let result = validate_base_url_no_ssrf(url);
3465            assert!(
3466                result.is_ok(),
3467                "Expected {url} to be allowed (loopback), got: {:?}",
3468                result
3469            );
3470        }
3471    }
3472
3473    #[test]
3474    fn validate_ssrf_rejects_private_non_loopback_ips() {
3475        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
3476        // services on LAN IPs are real SSRF targets even though the user
3477        // configured them. Users who want this can opt in by binding the
3478        // service to a public-routable address.
3479        for url in &[
3480            "http://192.168.1.1",
3481            "http://10.0.0.1",
3482            "http://172.16.0.1",
3483            "http://169.254.169.254",
3484            "http://100.64.0.1",
3485        ] {
3486            let result = validate_base_url_no_ssrf(url);
3487            assert!(
3488                result.is_err(),
3489                "Expected {url} to be rejected (non-loopback private), got: {:?}",
3490                result
3491            );
3492        }
3493    }
3494
3495    #[test]
3496    fn validate_ssrf_rejects_mdns_local_hostnames() {
3497        // mDNS .local hostnames typically resolve to LAN devices, not
3498        // loopback. Rejecting them before DNS lookup gives a clearer error.
3499        for host in &[
3500            "http://printer.local",
3501            "http://nas.local:8080",
3502            "http://homelab.local",
3503        ] {
3504            let result = validate_base_url_no_ssrf(host);
3505            assert!(
3506                result.is_err(),
3507                "Expected {host} to be rejected (mDNS), got: {:?}",
3508                result
3509            );
3510        }
3511    }
3512
3513    #[test]
3514    fn normalize_base_url_allows_localhost_for_tests() {
3515        // normalize_base_url itself should NOT block localhost — only
3516        // validate_base_url_no_ssrf does. Tests construct backends directly.
3517        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3518        assert!(normalize_base_url("http://localhost:8080").is_ok());
3519    }
3520
3521    /// Pin the user-facing wording of the ONNX version-mismatch error.
3522    /// The auto-fix path MUST be listed first because it's the only safe
3523    /// option that doesn't require sudo or risk breaking other apps that
3524    /// link the system library. Regression of any of these strings would
3525    /// either mislead users (system rm before auto-fix) or break the
3526    /// `aft doctor --fix` discovery path.
3527    #[test]
3528    fn ort_mismatch_message_recommends_auto_fix_first() {
3529        let msg =
3530            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3531
3532        // The reported version and path must appear verbatim.
3533        assert!(
3534            msg.contains("v1.9.0"),
3535            "should report detected version: {msg}"
3536        );
3537        assert!(
3538            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3539            "should report system path: {msg}"
3540        );
3541        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3542
3543        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
3544        let auto_fix_pos = msg
3545            .find("Auto-fix")
3546            .expect("Auto-fix solution missing — users won't discover --fix");
3547        let remove_pos = msg
3548            .find("Remove the old library")
3549            .expect("system-rm solution missing");
3550        assert!(
3551            auto_fix_pos < remove_pos,
3552            "Auto-fix must come before manual rm — see PR comment thread"
3553        );
3554
3555        // The auto-fix command must be runnable as-is on a fresh system.
3556        assert!(
3557            msg.contains("npx @cortexkit/aft doctor --fix"),
3558            "auto-fix command must be present and copy-pasteable: {msg}"
3559        );
3560    }
3561
3562    /// macOS dylib paths must not produce a malformed message when the
3563    /// system path lacks a trailing slash. This is a regression guard
3564    /// for the "{}\n{}" format string contract.
3565    #[test]
3566    fn ort_mismatch_message_handles_macos_dylib_path() {
3567        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3568        assert!(msg.contains("v1.9.0"));
3569        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3570        // The dylib path must appear in the auto-fix paragraph (single
3571        // quotes around it) AND in the manual-rm paragraph; verify
3572        // both placements survived the format string.
3573        assert!(
3574            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3575            "system path should be quoted in the auto-fix sentence: {msg}"
3576        );
3577    }
3578}