aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
// NOTE(review): the next six constants are not referenced in the portion of
// the file reviewed here; the descriptions are inferred from names and should
// be confirmed against their use sites.
// Presumably the embedding width assumed when none is recorded — confirm.
const DEFAULT_DIMENSION: usize = 384;
// Presumably an upper bound on entries accepted when loading a persisted index — confirm.
const MAX_ENTRIES: usize = 1_000_000;
// Presumably an upper bound on the embedding width accepted from disk — confirm.
const MAX_DIMENSION: usize = 1024;
/// Size of one `f32` in bytes.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// Presumably the on-disk header sizes of the V1 and V2 index formats — confirm.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Install hint text for a missing ONNX Runtime.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

// Persisted index format versions, oldest first.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after a `/v1` segment) by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
// Despite the name, `SemanticEmbeddingModel::from_config` applies this default
// to the HTTP client for every backend when `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used by `SemanticEmbeddingModel::from_config` when the
/// configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Entry count at which `embed_query_cached` starts evicting the oldest
/// cached query before inserting a new one.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint's `base_url` when the config has no
/// base URL or it fails normalization.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay slept before a retry, indexed by the zero-based attempt that failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held for the duration of `SemanticIndexLock::acquire`, so only one thread
/// of this process attempts the file lock at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
58
/// Handle representing ownership of a project's semantic cache lock file.
///
/// Created by `SemanticIndexLock::acquire`; the value owns the underlying
/// `fs_lock::LockGuard` for as long as it is alive.
pub struct SemanticIndexLock {
    // Never read; kept only so the guard lives as long as this struct.
    // Presumably dropping it releases the file lock — verify against `fs_lock`.
    _guard: fs_lock::LockGuard,
}
62
63impl SemanticIndexLock {
64    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
65        let dir = storage_dir.join("semantic").join(project_key);
66        fs::create_dir_all(&dir)?;
67        let path = dir.join("cache.lock");
68        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
69            .lock()
70            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
71        fs_lock::try_acquire(&path, Duration::from_secs(2))
72            .map(|guard| Self { _guard: guard })
73            .map_err(|error| match error {
74                fs_lock::AcquireError::Timeout => {
75                    std::io::Error::other("timed out acquiring semantic cache lock")
76                }
77                fs_lock::AcquireError::Io(error) => error,
78            })
79    }
80}
81
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string`; `matches_expected` compares that JSON
/// against a previously stored string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend name as produced by `SemanticBackend::as_str`.
    pub backend: String,
    /// Embedding model name copied from the backend config.
    pub model: String,
    /// Normalized base URL, or `FALLBACK_BACKEND` when the config has none or
    /// it fails normalization. Deserializes to an empty string when absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the embedding vectors the backend produces.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()`
    /// when the serialized form lacks the field.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
92
/// Chunking scheme version written into newly built fingerprints, and the
/// serde default for serialized fingerprints that omit `chunking_version`.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
96
97impl SemanticIndexFingerprint {
98    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
99        // Use normalized URL for fingerprinting so cosmetic differences
100        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
101        let base_url = config
102            .base_url
103            .as_ref()
104            .and_then(|u| normalize_base_url(u).ok())
105            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
106        Self {
107            backend: config.backend.as_str().to_string(),
108            model: config.model.clone(),
109            base_url,
110            dimension,
111            chunking_version: default_chunking_version(),
112        }
113    }
114
115    pub fn as_string(&self) -> String {
116        serde_json::to_string(self).unwrap_or_else(|_| String::new())
117    }
118
119    fn matches_expected(&self, expected: &str) -> bool {
120        let encoded = self.as_string();
121        !encoded.is_empty() && encoded == expected
122    }
123}
124
/// Backend-specific state that `SemanticEmbeddingModel::embed_texts`
/// dispatches on to produce embeddings.
enum SemanticEmbeddingEngine {
    /// A `fastembed` text embedding model, called directly through its `embed` method.
    Fastembed(TextEmbedding),
    /// Remote server reached via the endpoint built by
    /// `build_openai_embeddings_endpoint`.
    OpenAiCompatible {
        client: Client,
        model: String,
        // Already normalized by `normalize_base_url` in `from_config`.
        base_url: String,
        // When present, sent as an `Authorization: Bearer …` header.
        api_key: Option<String>,
    },
    /// Remote server reached via the endpoint built by
    /// `build_ollama_embeddings_endpoint`; no credentials are sent.
    Ollama {
        client: Client,
        model: String,
        // Already normalized by `normalize_base_url` in `from_config`.
        base_url: String,
    },
}
139
/// An embedding backend plus its resolved settings and a small cache of
/// query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    // The base URL exactly as given in the config (not the normalized form
    // stored inside `engine`).
    base_url: Option<String>,
    // Effective values after `from_config` substituted defaults for 0.
    timeout_ms: u64,
    max_batch_size: usize,
    // Embedding width once known; `None` until a probe or a remote embed call
    // has recorded it.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    // Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    // Insertion order of cached queries; the front entry is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    // Counters reported by `query_embedding_cache_stats`.
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
153
/// Shorter public name for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
155
/// Check that a backend returned one vector per input and that every vector
/// has the same length.
///
/// `context` names the backend and is used as the prefix of every error
/// message. An empty batch for zero expected inputs is valid.
///
/// # Errors
/// Returns a message when no vectors came back for a non-empty request, when
/// the vector count differs from `expected_count`, or when any vector's
/// length differs from that of the first vector.
fn validate_embedding_batch(
    vectors: &[Vec<f32>],
    expected_count: usize,
    context: &str,
) -> Result<(), String> {
    let actual_count = vectors.len();

    if actual_count == 0 && expected_count > 0 {
        return Err(format!(
            "{context} returned no vectors for {expected_count} inputs"
        ));
    }
    if actual_count != expected_count {
        return Err(format!(
            "{context} returned {actual_count} vectors for {expected_count} inputs"
        ));
    }

    let Some((head, tail)) = vectors.split_first() else {
        return Ok(());
    };
    let expected_dimension = head.len();

    // Report the first vector whose width disagrees with vector 0.
    match tail
        .iter()
        .position(|candidate| candidate.len() != expected_dimension)
    {
        None => Ok(()),
        Some(offset) => {
            let index = offset + 1;
            Err(format!(
                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
                vectors[index].len()
            ))
        }
    }
}
190
191/// Normalize a base URL: validate scheme and strip trailing slash.
192/// Does NOT perform SSRF/private-IP validation — call
193/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
194fn normalize_base_url(raw: &str) -> Result<String, String> {
195    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
196    let scheme = parsed.scheme();
197    if scheme != "http" && scheme != "https" {
198        return Err(format!(
199            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
200            scheme
201        ));
202    }
203    Ok(parsed.to_string().trim_end_matches('/').to_string())
204}
205
206/// Validate that a base URL does not point to a private/loopback address.
207/// Call this on user-supplied config (at configure time) to prevent SSRF.
208/// Not called for programmatically constructed configs (e.g. tests).
209///
210/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
211/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
212/// addresses by definition cannot be exploited as SSRF targets — they only
213/// reach services on the same machine. Allowing loopback unblocks Ollama at its
214/// default config without opening up SSRF to LAN/intranet services, which
215/// remain rejected.
216///
217/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
218/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
219/// the SSRF guard meaningful for non-loopback private networks.
220pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
221    use std::net::{IpAddr, ToSocketAddrs};
222
223    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
224
225    let host = parsed.host_str().unwrap_or("");
226
227    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
228    // `localhost` and `*.localhost` resolve to loopback;
229    // `localhost.localdomain` is a historical alias used on some Linux
230    // distros. Self-hosted backends like Ollama use these by default.
231    let is_loopback_host =
232        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
233    if is_loopback_host {
234        return Ok(());
235    }
236
237    // mDNS hostnames are typically LAN devices, not loopback. Reject before
238    // DNS lookup so users get a clear error rather than a private-IP error.
239    if host.ends_with(".local") {
240        return Err(format!(
241            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
242        ));
243    }
244
245    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
246    // loopback (which is by definition same-machine and not an SSRF target).
247    let port = parsed.port_or_known_default().unwrap_or(443);
248    let addr_str = format!("{host}:{port}");
249    let addrs: Vec<IpAddr> = addr_str
250        .to_socket_addrs()
251        .map(|iter| iter.map(|sa| sa.ip()).collect())
252        .unwrap_or_default();
253    for ip in &addrs {
254        if is_private_non_loopback_ip(ip) {
255            return Err(format!(
256                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
257            ));
258        }
259    }
260
261    Ok(())
262}
263
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// Ranges reported as private:
/// - IPv4: `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`, `169.254.0.0/16`
///   (link-local), `100.64.0.0/10` (CGNAT), `0.0.0.0/8` (wildcard).
/// - IPv6: `::` (wildcard, the counterpart of `0.0.0.0`), `fe80::/10`
///   (link-local), `fc00::/7` (unique-local).
/// - IPv4-mapped IPv6 (`::ffff:a.b.c.d`) is classified by its embedded IPv4
///   address.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, _, _] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            first == 10
            // 172.16.0.0/12
            || (first == 172 && (16..=31).contains(&second))
            // 192.168.0.0/16
            || (first == 192 && second == 168)
            // 169.254.0.0/16 link-local
            || (first == 169 && second == 254)
            // 100.64.0.0/10 CGNAT
            || (first == 100 && (64..=127).contains(&second))
            // 0.0.0.0/8 wildcard
            || first == 0
        }
        IpAddr::V6(v6) => {
            // ::ffff:0:0/96 IPv4-mapped — judge the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }

            let leading = v6.segments()[0];
            // Note: ::1 (loopback) is intentionally NOT in this set.
            // :: wildcard, mirroring the IPv4 0.0.0.0 rule above
            v6.is_unspecified()
            // fe80::/10 link-local
            || (leading & 0xffc0) == 0xfe80
            // fc00::/7 unique-local
            || (leading & 0xfe00) == 0xfc00
        }
    }
}
305
306fn build_openai_embeddings_endpoint(base_url: &str) -> String {
307    if base_url.ends_with("/v1") {
308        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
309    } else {
310        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
311    }
312}
313
314fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
315    if base_url.ends_with("/api") {
316        format!("{base_url}/embed")
317    } else {
318        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
319    }
320}
321
/// Trim surrounding whitespace from an optional string and map a blank
/// result to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    match raw.trim() {
        "" => None,
        trimmed => Some(trimmed.to_string()),
    }
}
332
333fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
334    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
335}
336
337fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
338    error.is_connect()
339}
340
341fn sleep_before_embedding_retry(attempt_index: usize) {
342    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
343        std::thread::sleep(Duration::from_millis(*delay_ms));
344    }
345}
346
347fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
348where
349    F: FnMut() -> reqwest::blocking::RequestBuilder,
350{
351    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
352        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
353
354        let response = match make_request().send() {
355            Ok(response) => response,
356            Err(error) => {
357                if !last_attempt && is_retryable_embedding_error(&error) {
358                    sleep_before_embedding_retry(attempt_index);
359                    continue;
360                }
361                return Err(format!("{backend_label} request failed: {error}"));
362            }
363        };
364
365        let status = response.status();
366        let raw = match response.text() {
367            Ok(raw) => raw,
368            Err(error) => {
369                if !last_attempt && is_retryable_embedding_error(&error) {
370                    sleep_before_embedding_retry(attempt_index);
371                    continue;
372                }
373                return Err(format!("{backend_label} response read failed: {error}"));
374            }
375        };
376
377        if status.is_success() {
378            return Ok(raw);
379        }
380
381        if !last_attempt && is_retryable_embedding_status(status) {
382            sleep_before_embedding_retry(attempt_index);
383            continue;
384        }
385
386        return Err(format!(
387            "{backend_label} request failed (HTTP {}): {}",
388            status, raw
389        ));
390    }
391
392    unreachable!("embedding request retries exhausted without returning")
393}
394
395impl SemanticEmbeddingModel {
396    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
397        let timeout_ms = if config.timeout_ms == 0 {
398            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
399        } else {
400            config.timeout_ms
401        };
402
403        let max_batch_size = if config.max_batch_size == 0 {
404            DEFAULT_MAX_BATCH_SIZE
405        } else {
406            config.max_batch_size
407        };
408
409        let api_key_env = normalize_api_key(config.api_key_env.clone());
410        let model = config.model.clone();
411
412        let client = Client::builder()
413            .timeout(Duration::from_millis(timeout_ms))
414            .redirect(reqwest::redirect::Policy::none())
415            .build()
416            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
417
418        let engine = match config.backend {
419            SemanticBackend::Fastembed => {
420                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
421            }
422            SemanticBackend::OpenAiCompatible => {
423                let raw = config.base_url.as_ref().ok_or_else(|| {
424                    "base_url is required for openai_compatible backend".to_string()
425                })?;
426                let base_url = normalize_base_url(raw)?;
427
428                let api_key = match api_key_env {
429                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
430                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
431                    })?),
432                    None => None,
433                };
434
435                SemanticEmbeddingEngine::OpenAiCompatible {
436                    client,
437                    model,
438                    base_url,
439                    api_key,
440                }
441            }
442            SemanticBackend::Ollama => {
443                let raw = config
444                    .base_url
445                    .as_ref()
446                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
447                let base_url = normalize_base_url(raw)?;
448
449                SemanticEmbeddingEngine::Ollama {
450                    client,
451                    model,
452                    base_url,
453                }
454            }
455        };
456
457        Ok(Self {
458            backend: config.backend,
459            model: config.model.clone(),
460            base_url: config.base_url.clone(),
461            timeout_ms,
462            max_batch_size,
463            dimension: None,
464            engine,
465            query_embedding_cache: HashMap::new(),
466            query_embedding_cache_order: VecDeque::new(),
467            query_embedding_cache_hits: 0,
468            query_embedding_cache_misses: 0,
469        })
470    }
471
    /// The backend kind this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
475
    /// The embedding model name taken from the config.
    pub fn model(&self) -> &str {
        &self.model
    }
479
    /// The base URL exactly as supplied in the config (not normalized), if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
483
    /// The effective batch size (the built-in default when the config gave 0).
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
487
    /// The effective request timeout in milliseconds (the built-in default
    /// when the config gave 0).
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
491
492    pub fn fingerprint(
493        &mut self,
494        config: &SemanticBackendConfig,
495    ) -> Result<SemanticIndexFingerprint, String> {
496        let dimension = self.dimension()?;
497        Ok(SemanticIndexFingerprint::from_config(config, dimension))
498    }
499
500    pub fn dimension(&mut self) -> Result<usize, String> {
501        if let Some(dimension) = self.dimension {
502            return Ok(dimension);
503        }
504
505        let dimension = match &mut self.engine {
506            SemanticEmbeddingEngine::Fastembed(model) => {
507                let vectors = model
508                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
509                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
510                vectors
511                    .first()
512                    .map(|v| v.len())
513                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
514            }
515            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
516                let vectors =
517                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
518                vectors
519                    .first()
520                    .map(|v| v.len())
521                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
522            }
523            SemanticEmbeddingEngine::Ollama { .. } => {
524                let vectors =
525                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
526                vectors
527                    .first()
528                    .map(|v| v.len())
529                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530            }
531        };
532
533        self.dimension = Some(dimension);
534        Ok(dimension)
535    }
536
    /// Embed `texts` with the configured backend, returning one vector per
    /// input. Public entry point that forwards to `embed_texts`.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
540
541    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
542        if let Some(vector) = self.query_embedding_cache.get(query) {
543            self.query_embedding_cache_hits += 1;
544            return Ok(vector.clone());
545        }
546
547        self.query_embedding_cache_misses += 1;
548        let embeddings = self.embed_texts(vec![query.to_string()])?;
549        let vector = embeddings
550            .first()
551            .cloned()
552            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
553
554        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
555            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
556                self.query_embedding_cache.remove(&oldest);
557            }
558        }
559        self.query_embedding_cache
560            .insert(query.to_string(), vector.clone());
561        self.query_embedding_cache_order
562            .push_back(query.to_string());
563
564        Ok(vector)
565    }
566
567    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
568        (
569            self.query_embedding_cache_hits,
570            self.query_embedding_cache_misses,
571            self.query_embedding_cache.len(),
572        )
573    }
574
575    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
576        match &mut self.engine {
577            SemanticEmbeddingEngine::Fastembed(model) => model
578                .embed(texts, None::<usize>)
579                .map_err(|error| format_embedding_init_error(error.to_string()))
580                .map_err(|error| format!("failed to embed batch: {error}")),
581            SemanticEmbeddingEngine::OpenAiCompatible {
582                client,
583                model,
584                base_url,
585                api_key,
586            } => {
587                let expected_text_count = texts.len();
588                let endpoint = build_openai_embeddings_endpoint(base_url);
589                let body = serde_json::json!({
590                    "input": texts,
591                    "model": model,
592                });
593
594                let raw = send_embedding_request(
595                    || {
596                        // `.json(&body)` sets Content-Type: application/json
597                        // automatically. Do NOT add `.header("Content-Type",
598                        // "application/json")` afterwards — RequestBuilder::header()
599                        // calls HeaderMap::append, which produces TWO Content-Type
600                        // headers on the wire. OpenAI's /v1/embeddings endpoint
601                        // treats duplicate Content-Type as malformed and rejects
602                        // the body with 400 "you must provide a model parameter"
603                        // even when `model` is set. Verified end-to-end against
604                        // api.openai.com. See issue #36.
605                        let mut request = client.post(&endpoint).json(&body);
606
607                        if let Some(api_key) = api_key {
608                            request = request.header("Authorization", format!("Bearer {api_key}"));
609                        }
610
611                        request
612                    },
613                    "openai compatible",
614                )?;
615
616                #[derive(Deserialize)]
617                struct OpenAiResponse {
618                    data: Vec<OpenAiEmbeddingResult>,
619                }
620
621                #[derive(Deserialize)]
622                struct OpenAiEmbeddingResult {
623                    embedding: Vec<f32>,
624                    index: Option<u32>,
625                }
626
627                let parsed: OpenAiResponse = serde_json::from_str(&raw)
628                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
629                if parsed.data.len() != expected_text_count {
630                    return Err(format!(
631                        "openai compatible response returned {} embeddings for {} inputs",
632                        parsed.data.len(),
633                        expected_text_count
634                    ));
635                }
636
637                let mut vectors = vec![Vec::new(); parsed.data.len()];
638                for (i, item) in parsed.data.into_iter().enumerate() {
639                    let index = item.index.unwrap_or(i as u32) as usize;
640                    if index >= vectors.len() {
641                        return Err(
642                            "openai compatible response contains invalid vector index".to_string()
643                        );
644                    }
645                    vectors[index] = item.embedding;
646                }
647
648                for vector in &vectors {
649                    if vector.is_empty() {
650                        return Err(
651                            "openai compatible response contained missing vectors".to_string()
652                        );
653                    }
654                }
655
656                self.dimension = vectors.first().map(Vec::len);
657                Ok(vectors)
658            }
659            SemanticEmbeddingEngine::Ollama {
660                client,
661                model,
662                base_url,
663            } => {
664                let expected_text_count = texts.len();
665                let endpoint = build_ollama_embeddings_endpoint(base_url);
666
667                #[derive(Serialize)]
668                struct OllamaPayload<'a> {
669                    model: &'a str,
670                    input: Vec<String>,
671                }
672
673                let payload = OllamaPayload {
674                    model,
675                    input: texts,
676                };
677
678                let raw = send_embedding_request(
679                    || {
680                        // `.json(&payload)` sets Content-Type automatically.
681                        // Same duplicate-header trap as the OpenAI branch above
682                        // — most Ollama servers tolerate it, but the
683                        // single-Content-Type form is the correct one.
684                        client.post(&endpoint).json(&payload)
685                    },
686                    "ollama",
687                )?;
688
689                #[derive(Deserialize)]
690                struct OllamaResponse {
691                    embeddings: Vec<Vec<f32>>,
692                }
693
694                let parsed: OllamaResponse = serde_json::from_str(&raw)
695                    .map_err(|error| format!("invalid ollama response: {error}"))?;
696                if parsed.embeddings.is_empty() {
697                    return Err("ollama response returned no embeddings".to_string());
698                }
699                if parsed.embeddings.len() != expected_text_count {
700                    return Err(format!(
701                        "ollama response returned {} embeddings for {} inputs",
702                        parsed.embeddings.len(),
703                        expected_text_count
704                    ));
705                }
706
707                let vectors = parsed.embeddings;
708                for vector in &vectors {
709                    if vector.is_empty() {
710                        return Err("ollama response contained empty embeddings".to_string());
711                    }
712                }
713
714                self.dimension = vectors.first().map(Vec::len);
715                Ok(vectors)
716            }
717        }
718    }
719}
720
721/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
722/// This catches broken/incompatible .so files without risking a panic in the ort crate.
723/// Also checks the runtime version via OrtGetApiBase if available.
724pub fn pre_validate_onnx_runtime() -> Result<(), String> {
725    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
726
727    #[cfg(any(target_os = "linux", target_os = "macos"))]
728    {
729        #[cfg(target_os = "linux")]
730        let default_name = "libonnxruntime.so";
731        #[cfg(target_os = "macos")]
732        let default_name = "libonnxruntime.dylib";
733
734        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
735
736        unsafe {
737            let c_name = std::ffi::CString::new(lib_name)
738                .map_err(|e| format!("invalid library path: {}", e))?;
739            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
740            if handle.is_null() {
741                let err = libc::dlerror();
742                let msg = if err.is_null() {
743                    "unknown dlopen error".to_string()
744                } else {
745                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
746                };
747                return Err(format!(
748                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
749                     Run `npx @cortexkit/aft doctor` to diagnose.",
750                    lib_name, msg
751                ));
752            }
753
754            // Try to detect the runtime version from the file path or soname.
755            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
756            let detected_version = detect_ort_version_from_path(lib_name);
757
758            libc::dlclose(handle);
759
760            // Check version compatibility — we need 1.24.x
761            if let Some(ref version) = detected_version {
762                let parts: Vec<&str> = version.split('.').collect();
763                if let (Some(major), Some(minor)) = (
764                    parts.first().and_then(|s| s.parse::<u32>().ok()),
765                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
766                ) {
767                    if major != 1 || minor < 20 {
768                        return Err(format_ort_version_mismatch(version, lib_name));
769                    }
770                }
771            }
772        }
773    }
774
775    #[cfg(target_os = "windows")]
776    {
777        // On Windows, skip pre-validation — let ort handle LoadLibrary
778        let _ = dylib_path;
779    }
780
781    Ok(())
782}
783
784/// Try to extract the ORT version from the library filename or resolved symlink.
785/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
786#[cfg(any(test, target_os = "linux", target_os = "macos"))]
787fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
788    let path = std::path::Path::new(lib_path);
789
790    // Try the path as given, then follow symlinks
791    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
792        .into_iter()
793        .flatten()
794    {
795        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
796            if let Some(version) = extract_version_from_filename(name) {
797                return Some(version);
798            }
799        }
800    }
801
802    // Also check for versioned siblings in the same directory
803    if let Some(parent) = path.parent() {
804        if let Ok(entries) = std::fs::read_dir(parent) {
805            for entry in entries.flatten() {
806                if let Some(name) = entry.file_name().to_str() {
807                    if name.starts_with("libonnxruntime") {
808                        if let Some(version) = extract_version_from_filename(name) {
809                            return Some(version);
810                        }
811                    }
812                }
813            }
814        }
815    }
816
817    None
818}
819
820/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
821#[cfg(any(test, target_os = "linux", target_os = "macos"))]
822fn extract_version_from_filename(name: &str) -> Option<String> {
823    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
824    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
825    re.find(name).map(|m| m.as_str().to_string())
826}
827
/// Produce the (indented) hint shown to users for removing an incompatible
/// ONNX Runtime library.
///
/// A bare `libonnxruntime.so` / `libonnxruntime.dylib` name or anything under
/// `/usr/local/lib` is treated as a system-wide install and gets a
/// platform-specific command; every other path gets a plain `rm` of that file.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        // At most one of these is compiled in per target. On any other OS the
        // block is empty and control falls through to the generic `rm` below.
        #[cfg(target_os = "linux")]
        return String::from("   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        #[cfg(target_os = "macos")]
        return String::from("   sudo rm /usr/local/lib/libonnxruntime*");
        #[cfg(target_os = "windows")]
        return String::from("   Delete the ONNX Runtime DLL from your PATH");
    }

    format!("   rm '{lib_path}'")
}
843
844/// Build the user-facing error message for an incompatible ONNX Runtime
845/// install. Extracted as a pure helper so we can unit-test the wording
846/// stability — the auto-fix recommendation must always come first because
847/// it's the only safe option, and the system-rm step must remain present
848/// because some users prefer the system-wide cleanup path.
849#[cfg(any(test, target_os = "linux", target_os = "macos"))]
850pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
851    format!(
852        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
853         Solutions:\n\
854         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
855         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
856         configures the bridge to load it instead of the system library — no \
857         changes to '{}'.\n\
858         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
859         {}\n\
860         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
861         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
862        version,
863        lib_name,
864        lib_name,
865        suggest_removal_command(lib_name),
866    )
867}
868
869pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
870    // Pre-validate before ort can panic on a bad library
871    pre_validate_onnx_runtime()?;
872
873    let selected_model = match model {
874        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
875        _ => {
876            return Err(format!(
877                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
878                model
879            ))
880        }
881    };
882
883    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
884}
885
/// Heuristically decide whether `message` reports that the ONNX Runtime shared
/// library could not be located or loaded.
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// the canonical "ONNX Runtime not found." prefix, or when its ASCII-lowercased
/// form both names the runtime and contains a dynamic-loading failure phrase.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Messages produced by our own pre-validation carry this exact prefix.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
911
912fn format_embedding_init_error(error: impl Display) -> String {
913    let message = error.to_string();
914
915    if is_onnx_runtime_unavailable(&message) {
916        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
917    }
918
919    format!("failed to initialize semantic embedding model: {message}")
920}
921
922/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
923#[derive(Debug, Clone)]
924pub struct SemanticChunk {
925    /// Absolute file path
926    pub file: PathBuf,
927    /// Symbol name
928    pub name: String,
929    /// Symbol kind (function, class, struct, etc.)
930    pub kind: SymbolKind,
931    /// Line range (0-based internally, inclusive)
932    pub start_line: u32,
933    pub end_line: u32,
934    /// Whether the symbol is exported
935    pub exported: bool,
936    /// The enriched text that gets embedded (scope + signature + body snippet)
937    pub embed_text: String,
938    /// Short code snippet for display in results
939    pub snippet: String,
940}
941
942/// A stored embedding entry — chunk metadata + vector
943#[derive(Debug)]
944struct EmbeddingEntry {
945    chunk: SemanticChunk,
946    vector: Vec<f32>,
947}
948
949/// The semantic index — stores embeddings for all symbols in a project
950#[derive(Debug)]
951pub struct SemanticIndex {
952    entries: Vec<EmbeddingEntry>,
953    /// Track which files are indexed and their mtime for staleness detection
954    file_mtimes: HashMap<PathBuf, SystemTime>,
955    /// Track indexed file sizes alongside mtimes for staleness detection
956    file_sizes: HashMap<PathBuf, u64>,
957    file_hashes: HashMap<PathBuf, blake3::Hash>,
958    /// Embedding dimension (384 for MiniLM-L6-v2)
959    dimension: usize,
960    fingerprint: Option<SemanticIndexFingerprint>,
961    project_root: PathBuf,
962}
963
964#[derive(Debug, Clone, Copy)]
965struct IndexedFileMetadata {
966    mtime: SystemTime,
967    size: u64,
968    content_hash: blake3::Hash,
969}
970
/// Outcome of an incremental refresh of the semantic index. All counters are
/// file counts.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files whose entries were replaced.
    pub changed: usize,
    /// Files indexed for the first time.
    pub added: usize,
    /// Files dropped from the index.
    pub deleted: usize,
    /// Number of current/deleted files considered during the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh touched no files, i.e. nothing was changed, added
    /// or deleted. `total_processed` does not influence the result.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
987
988/// Search result from a semantic query
989#[derive(Debug, Clone)]
990pub struct SemanticResult {
991    pub file: PathBuf,
992    pub name: String,
993    pub kind: SymbolKind,
994    pub start_line: u32,
995    pub end_line: u32,
996    pub exported: bool,
997    pub snippet: String,
998    pub score: f32,
999    pub source: &'static str,
1000}
1001
1002impl SemanticIndex {
1003    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1004        debug_assert!(project_root.is_absolute());
1005        Self {
1006            entries: Vec::new(),
1007            file_mtimes: HashMap::new(),
1008            file_sizes: HashMap::new(),
1009            file_hashes: HashMap::new(),
1010            dimension,
1011            fingerprint: None,
1012            project_root,
1013        }
1014    }
1015
1016    /// Number of embedded symbol entries.
1017    pub fn entry_count(&self) -> usize {
1018        self.entries.len()
1019    }
1020
1021    /// Human-readable status label for the index.
1022    pub fn status_label(&self) -> &'static str {
1023        if self.entries.is_empty() {
1024            "empty"
1025        } else {
1026            "ready"
1027        }
1028    }
1029
1030    fn collect_chunks(
1031        project_root: &Path,
1032        files: &[PathBuf],
1033    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1034        let per_file: Vec<(
1035            PathBuf,
1036            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1037        )> = files
1038            .par_iter()
1039            .map_init(HashMap::new, |parsers, file| {
1040                let result = collect_file_metadata(file).and_then(|metadata| {
1041                    collect_file_chunks(project_root, file, parsers)
1042                        .map(|chunks| (metadata, chunks))
1043                });
1044                (file.clone(), result)
1045            })
1046            .collect();
1047
1048        let mut chunks: Vec<SemanticChunk> = Vec::new();
1049        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1050
1051        for (file, result) in per_file {
1052            match result {
1053                Ok((metadata, file_chunks)) => {
1054                    file_metadata.insert(file, metadata);
1055                    chunks.extend(file_chunks);
1056                }
1057                Err(error) => {
1058                    // "unsupported file extension" is expected for non-code files
1059                    // (json, xml, .gitignore, etc.) that get included in the
1060                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1061                    // we now skip silently to keep the log clean. Only real read/parse
1062                    // errors are worth surfacing.
1063                    if error == "unsupported file extension" {
1064                        continue;
1065                    }
1066                    slog_warn!(
1067                        "failed to collect semantic chunks for {}: {}",
1068                        file.display(),
1069                        error
1070                    );
1071                }
1072            }
1073        }
1074
1075        (chunks, file_metadata)
1076    }
1077
1078    fn build_from_chunks<F, P>(
1079        project_root: &Path,
1080        chunks: Vec<SemanticChunk>,
1081        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1082        embed_fn: &mut F,
1083        max_batch_size: usize,
1084        mut progress: Option<&mut P>,
1085    ) -> Result<Self, String>
1086    where
1087        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1088        P: FnMut(usize, usize),
1089    {
1090        debug_assert!(project_root.is_absolute());
1091        let total_chunks = chunks.len();
1092
1093        if chunks.is_empty() {
1094            return Ok(Self {
1095                entries: Vec::new(),
1096                file_mtimes: file_metadata
1097                    .iter()
1098                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1099                    .collect(),
1100                file_sizes: file_metadata
1101                    .iter()
1102                    .map(|(path, metadata)| (path.clone(), metadata.size))
1103                    .collect(),
1104                file_hashes: file_metadata
1105                    .into_iter()
1106                    .map(|(path, metadata)| (path, metadata.content_hash))
1107                    .collect(),
1108                dimension: DEFAULT_DIMENSION,
1109                fingerprint: None,
1110                project_root: project_root.to_path_buf(),
1111            });
1112        }
1113
1114        // Embed in batches
1115        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1116        let mut expected_dimension: Option<usize> = None;
1117        let batch_size = max_batch_size.max(1);
1118        for batch_start in (0..chunks.len()).step_by(batch_size) {
1119            let batch_end = (batch_start + batch_size).min(chunks.len());
1120            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1121                .iter()
1122                .map(|c| c.embed_text.clone())
1123                .collect();
1124
1125            let vectors = embed_fn(batch_texts)?;
1126            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1127
1128            // Track consistent dimension across all batches
1129            if let Some(dim) = vectors.first().map(|v| v.len()) {
1130                match expected_dimension {
1131                    None => expected_dimension = Some(dim),
1132                    Some(expected) if dim != expected => {
1133                        return Err(format!(
1134                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1135                        ));
1136                    }
1137                    _ => {}
1138                }
1139            }
1140
1141            for (i, vector) in vectors.into_iter().enumerate() {
1142                let chunk_idx = batch_start + i;
1143                entries.push(EmbeddingEntry {
1144                    chunk: chunks[chunk_idx].clone(),
1145                    vector,
1146                });
1147            }
1148
1149            if let Some(callback) = progress.as_mut() {
1150                callback(entries.len(), total_chunks);
1151            }
1152        }
1153
1154        let dimension = entries
1155            .first()
1156            .map(|e| e.vector.len())
1157            .unwrap_or(DEFAULT_DIMENSION);
1158
1159        Ok(Self {
1160            entries,
1161            file_mtimes: file_metadata
1162                .iter()
1163                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1164                .collect(),
1165            file_sizes: file_metadata
1166                .iter()
1167                .map(|(path, metadata)| (path.clone(), metadata.size))
1168                .collect(),
1169            file_hashes: file_metadata
1170                .into_iter()
1171                .map(|(path, metadata)| (path, metadata.content_hash))
1172                .collect(),
1173            dimension,
1174            fingerprint: None,
1175            project_root: project_root.to_path_buf(),
1176        })
1177    }
1178
1179    /// Build the semantic index from a set of files using the provided embedding function.
1180    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1181    pub fn build<F>(
1182        project_root: &Path,
1183        files: &[PathBuf],
1184        embed_fn: &mut F,
1185        max_batch_size: usize,
1186    ) -> Result<Self, String>
1187    where
1188        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1189    {
1190        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1191        Self::build_from_chunks(
1192            project_root,
1193            chunks,
1194            file_mtimes,
1195            embed_fn,
1196            max_batch_size,
1197            Option::<&mut fn(usize, usize)>::None,
1198        )
1199    }
1200
1201    /// Build the semantic index and report embedding progress using entry counts.
1202    pub fn build_with_progress<F, P>(
1203        project_root: &Path,
1204        files: &[PathBuf],
1205        embed_fn: &mut F,
1206        max_batch_size: usize,
1207        progress: &mut P,
1208    ) -> Result<Self, String>
1209    where
1210        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1211        P: FnMut(usize, usize),
1212    {
1213        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1214        let total_chunks = chunks.len();
1215        progress(0, total_chunks);
1216        Self::build_from_chunks(
1217            project_root,
1218            chunks,
1219            file_mtimes,
1220            embed_fn,
1221            max_batch_size,
1222            Some(progress),
1223        )
1224    }
1225
1226    /// Incrementally refresh entries for changed/new files only, preserving cached
1227    /// embeddings for unchanged files. Used when loading the index from disk and
1228    /// finding that a small fraction of files have moved on, deleted, or appeared.
1229    ///
1230    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1231    /// mutated in place and remains a valid index.
1232    ///
1233    /// `current_files` is the full set of files the project considers indexable
1234    /// (typically `walk_project_files(...)`). Files in the cache that are no
1235    /// longer in this set are treated as deleted.
1236    pub fn refresh_stale_files<F, P>(
1237        &mut self,
1238        project_root: &Path,
1239        current_files: &[PathBuf],
1240        embed_fn: &mut F,
1241        max_batch_size: usize,
1242        progress: &mut P,
1243    ) -> Result<RefreshSummary, String>
1244    where
1245        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1246        P: FnMut(usize, usize),
1247    {
1248        self.backfill_missing_file_sizes();
1249
1250        // 1. Bucket files into deleted / changed / added.
1251        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1252        let total_processed = current_set.len() + self.file_mtimes.len()
1253            - self
1254                .file_mtimes
1255                .keys()
1256                .filter(|path| current_set.contains(path.as_path()))
1257                .count();
1258
1259        // Files in cache that disappeared from disk OR are no longer in the
1260        // walked set. Both cases need their entries dropped.
1261        let mut deleted: Vec<PathBuf> = Vec::new();
1262        let mut changed: Vec<PathBuf> = Vec::new();
1263        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1264        for indexed_path in &indexed_paths {
1265            if !current_set.contains(indexed_path.as_path()) {
1266                deleted.push(indexed_path.clone());
1267                continue;
1268            }
1269            let cached = match (
1270                self.file_mtimes.get(indexed_path),
1271                self.file_sizes.get(indexed_path),
1272                self.file_hashes.get(indexed_path),
1273            ) {
1274                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1275                    mtime: *mtime,
1276                    size: *size,
1277                    content_hash: *hash,
1278                }),
1279                _ => None,
1280            };
1281            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
1282                Some(FreshnessVerdict::HotFresh) => {}
1283                Some(FreshnessVerdict::ContentFresh {
1284                    new_mtime,
1285                    new_size,
1286                }) => {
1287                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1288                    self.file_sizes.insert(indexed_path.clone(), new_size);
1289                }
1290                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1291                    changed.push(indexed_path.clone());
1292                }
1293            }
1294        }
1295
1296        // Files in walk that were never indexed.
1297        let mut added: Vec<PathBuf> = Vec::new();
1298        for path in current_files {
1299            if !self.file_mtimes.contains_key(path) {
1300                added.push(path.clone());
1301            }
1302        }
1303
1304        // Fast path: nothing to do.
1305        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1306            progress(0, 0);
1307            return Ok(RefreshSummary {
1308                total_processed,
1309                ..RefreshSummary::default()
1310            });
1311        }
1312
1313        // 2. Drop entries for deleted files immediately. Changed files are only
1314        //    replaced after successful re-extraction + embedding so transient
1315        //    read/parse errors keep the stale-but-valid cache entry.
1316        if !deleted.is_empty() {
1317            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
1318            self.entries
1319                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1320            for path in &deleted {
1321                self.file_mtimes.remove(path);
1322                self.file_sizes.remove(path);
1323                self.file_hashes.remove(path);
1324            }
1325        }
1326
1327        // 3. Embed the changed + added set, if any.
1328        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1329        to_embed.extend(changed.iter().cloned());
1330        to_embed.extend(added.iter().cloned());
1331
1332        if to_embed.is_empty() {
1333            // Only deletions happened.
1334            progress(0, 0);
1335            return Ok(RefreshSummary {
1336                changed: 0,
1337                added: 0,
1338                deleted: deleted.len(),
1339                total_processed,
1340            });
1341        }
1342
1343        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1344
1345        if chunks.is_empty() {
1346            progress(0, 0);
1347            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1348            if !successful_files.is_empty() {
1349                self.entries
1350                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1351            }
1352            let changed_count = changed
1353                .iter()
1354                .filter(|path| successful_files.contains(*path))
1355                .count();
1356            let added_count = added
1357                .iter()
1358                .filter(|path| successful_files.contains(*path))
1359                .count();
1360            for (file, metadata) in fresh_metadata {
1361                self.file_mtimes.insert(file.clone(), metadata.mtime);
1362                self.file_sizes.insert(file.clone(), metadata.size);
1363                self.file_hashes.insert(file.clone(), metadata.content_hash);
1364            }
1365            return Ok(RefreshSummary {
1366                changed: changed_count,
1367                added: added_count,
1368                deleted: deleted.len(),
1369                total_processed,
1370            });
1371        }
1372
1373        // 4. Embed in batches and dimension-check against the existing index.
1374        let total_chunks = chunks.len();
1375        progress(0, total_chunks);
1376        let batch_size = max_batch_size.max(1);
1377        let existing_dimension = if self.entries.is_empty() {
1378            None
1379        } else {
1380            Some(self.dimension)
1381        };
1382        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1383        let mut observed_dimension: Option<usize> = existing_dimension;
1384
1385        for batch_start in (0..chunks.len()).step_by(batch_size) {
1386            let batch_end = (batch_start + batch_size).min(chunks.len());
1387            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1388                .iter()
1389                .map(|c| c.embed_text.clone())
1390                .collect();
1391
1392            let vectors = embed_fn(batch_texts)?;
1393            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1394
1395            if let Some(dim) = vectors.first().map(|v| v.len()) {
1396                match observed_dimension {
1397                    None => observed_dimension = Some(dim),
1398                    Some(expected) if dim != expected => {
1399                        // Refuse to mix dimensions in one index. Caller should
1400                        // fall back to a full rebuild.
1401                        return Err(format!(
1402                            "embedding dimension changed during incremental refresh: \
1403                             cached index uses {expected}, new vectors use {dim}"
1404                        ));
1405                    }
1406                    _ => {}
1407                }
1408            }
1409
1410            for (i, vector) in vectors.into_iter().enumerate() {
1411                let chunk_idx = batch_start + i;
1412                new_entries.push(EmbeddingEntry {
1413                    chunk: chunks[chunk_idx].clone(),
1414                    vector,
1415                });
1416            }
1417
1418            progress(new_entries.len(), total_chunks);
1419        }
1420
1421        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1422        if !successful_files.is_empty() {
1423            self.entries
1424                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1425        }
1426
1427        self.entries.extend(new_entries);
1428        for (file, metadata) in fresh_metadata {
1429            self.file_mtimes.insert(file.clone(), metadata.mtime);
1430            self.file_sizes.insert(file.clone(), metadata.size);
1431            self.file_hashes.insert(file, metadata.content_hash);
1432        }
1433        if let Some(dim) = observed_dimension {
1434            self.dimension = dim;
1435        }
1436
1437        Ok(RefreshSummary {
1438            changed: changed
1439                .iter()
1440                .filter(|path| successful_files.contains(*path))
1441                .count(),
1442            added: added
1443                .iter()
1444                .filter(|path| successful_files.contains(*path))
1445                .count(),
1446            deleted: deleted.len(),
1447            total_processed,
1448        })
1449    }
1450
1451    /// Search the index with a query embedding, returning top-K results sorted by relevance
1452    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1453        if self.entries.is_empty() || query_vector.len() != self.dimension {
1454            return Vec::new();
1455        }
1456
1457        let mut scored: Vec<(f32, usize)> = self
1458            .entries
1459            .iter()
1460            .enumerate()
1461            .map(|(i, entry)| {
1462                let mut score = cosine_similarity(query_vector, &entry.vector);
1463                if entry.chunk.exported {
1464                    score *= 1.1;
1465                }
1466                (score, i)
1467            })
1468            .collect();
1469
1470        // Sort descending by score
1471        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1472
1473        scored
1474            .into_iter()
1475            .take(top_k)
1476            // Keep the sort → take → map ordering explicit: removing the old
1477            // `> 0.0` floor cannot evict positive hits because top_k has already
1478            // been selected, but it can surface zero-score noise in the tail.
1479            .map(|(score, idx)| {
1480                let entry = &self.entries[idx];
1481                SemanticResult {
1482                    file: entry.chunk.file.clone(),
1483                    name: entry.chunk.name.clone(),
1484                    kind: entry.chunk.kind.clone(),
1485                    start_line: entry.chunk.start_line,
1486                    end_line: entry.chunk.end_line,
1487                    exported: entry.chunk.exported,
1488                    snippet: entry.chunk.snippet.clone(),
1489                    score,
1490                    source: "semantic",
1491                }
1492            })
1493            .collect()
1494    }
1495
1496    /// Number of indexed entries
1497    pub fn len(&self) -> usize {
1498        self.entries.len()
1499    }
1500
1501    /// Check if a file needs re-indexing based on mtime/size
1502    pub fn is_file_stale(&self, file: &Path) -> bool {
1503        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1504            return true;
1505        };
1506        let Some(stored_size) = self.file_sizes.get(file) else {
1507            return true;
1508        };
1509        let Some(stored_hash) = self.file_hashes.get(file) else {
1510            return true;
1511        };
1512        let cached = FileFreshness {
1513            mtime: *stored_mtime,
1514            size: *stored_size,
1515            content_hash: *stored_hash,
1516        };
1517        match cache_freshness::verify_file(file, &cached) {
1518            FreshnessVerdict::HotFresh => false,
1519            FreshnessVerdict::ContentFresh { .. } => false,
1520            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1521        }
1522    }
1523
1524    fn backfill_missing_file_sizes(&mut self) {
1525        for path in self.file_mtimes.keys() {
1526            if self.file_sizes.contains_key(path) {
1527                continue;
1528            }
1529            if let Ok(metadata) = fs::metadata(path) {
1530                self.file_sizes.insert(path.clone(), metadata.len());
1531                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1532                    self.file_hashes.insert(path.clone(), hash);
1533                }
1534            }
1535        }
1536    }
1537
1538    /// Remove entries for a specific file
1539    pub fn remove_file(&mut self, file: &Path) {
1540        self.invalidate_file(file);
1541    }
1542
1543    pub fn invalidate_file(&mut self, file: &Path) {
1544        self.entries.retain(|e| e.chunk.file != file);
1545        self.file_mtimes.remove(file);
1546        self.file_sizes.remove(file);
1547        self.file_hashes.remove(file);
1548    }
1549
1550    /// Get the embedding dimension
1551    pub fn dimension(&self) -> usize {
1552        self.dimension
1553    }
1554
1555    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
1556        self.fingerprint.as_ref()
1557    }
1558
1559    pub fn backend_label(&self) -> Option<&str> {
1560        self.fingerprint.as_ref().map(|f| f.backend.as_str())
1561    }
1562
1563    pub fn model_label(&self) -> Option<&str> {
1564        self.fingerprint.as_ref().map(|f| f.model.as_str())
1565    }
1566
1567    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
1568        self.fingerprint = Some(fingerprint);
1569    }
1570
1571    /// Write the semantic index to disk using atomic temp+rename pattern
1572    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1573        // Don't persist empty indexes — they would be loaded on next startup
1574        // and prevent a fresh build that might find files.
1575        if self.entries.is_empty() {
1576            slog_info!("skipping semantic index persistence (0 entries)");
1577            return;
1578        }
1579        let dir = storage_dir.join("semantic").join(project_key);
1580        if let Err(e) = fs::create_dir_all(&dir) {
1581            slog_warn!("failed to create semantic cache dir: {}", e);
1582            return;
1583        }
1584        let data_path = dir.join("semantic.bin");
1585        let tmp_path = dir.join(format!(
1586            "semantic.bin.tmp.{}.{}",
1587            std::process::id(),
1588            SystemTime::now()
1589                .duration_since(SystemTime::UNIX_EPOCH)
1590                .unwrap_or(Duration::ZERO)
1591                .as_nanos()
1592        ));
1593        let bytes = self.to_bytes();
1594        let write_result = (|| -> std::io::Result<()> {
1595            use std::io::Write;
1596            let mut file = fs::File::create(&tmp_path)?;
1597            file.write_all(&bytes)?;
1598            file.sync_all()?;
1599            Ok(())
1600        })();
1601        if let Err(e) = write_result {
1602            slog_warn!("failed to write semantic index: {}", e);
1603            let _ = fs::remove_file(&tmp_path);
1604            return;
1605        }
1606        if let Err(e) = fs::rename(&tmp_path, &data_path) {
1607            slog_warn!("failed to rename semantic index: {}", e);
1608            let _ = fs::remove_file(&tmp_path);
1609            return;
1610        }
1611        slog_info!(
1612            "semantic index persisted: {} entries, {:.1} KB",
1613            self.entries.len(),
1614            bytes.len() as f64 / 1024.0
1615        );
1616    }
1617
1618    /// Read the semantic index from disk
1619    pub fn read_from_disk(
1620        storage_dir: &Path,
1621        project_key: &str,
1622        current_canonical_root: &Path,
1623        is_worktree_bridge: bool,
1624        expected_fingerprint: Option<&str>,
1625    ) -> Option<Self> {
1626        debug_assert!(current_canonical_root.is_absolute());
1627        let data_path = storage_dir
1628            .join("semantic")
1629            .join(project_key)
1630            .join("semantic.bin");
1631        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1632        if file_len < HEADER_BYTES_V1 {
1633            slog_warn!(
1634                "corrupt semantic index (too small: {} bytes), removing",
1635                file_len
1636            );
1637            if !is_worktree_bridge {
1638                let _ = fs::remove_file(&data_path);
1639            }
1640            return None;
1641        }
1642
1643        let bytes = fs::read(&data_path).ok()?;
1644        let version = bytes[0];
1645        if version != SEMANTIC_INDEX_VERSION_V6 {
1646            slog_info!(
1647                "cached semantic index version {} is older than {}, rebuilding",
1648                version,
1649                SEMANTIC_INDEX_VERSION_V6
1650            );
1651            if !is_worktree_bridge {
1652                let _ = fs::remove_file(&data_path);
1653            }
1654            return None;
1655        }
1656        match Self::from_bytes(&bytes, current_canonical_root) {
1657            Ok(index) => {
1658                if index.entries.is_empty() {
1659                    slog_info!("cached semantic index is empty, will rebuild");
1660                    if !is_worktree_bridge {
1661                        let _ = fs::remove_file(&data_path);
1662                    }
1663                    return None;
1664                }
1665                if let Some(expected) = expected_fingerprint {
1666                    let matches = index
1667                        .fingerprint()
1668                        .map(|fingerprint| fingerprint.matches_expected(expected))
1669                        .unwrap_or(false);
1670                    if !matches {
1671                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1672                        if !is_worktree_bridge {
1673                            let _ = fs::remove_file(&data_path);
1674                        }
1675                        return None;
1676                    }
1677                }
1678                slog_info!(
1679                    "loaded semantic index from disk: {} entries",
1680                    index.entries.len()
1681                );
1682                Some(index)
1683            }
1684            Err(e) => {
1685                slog_warn!("corrupt semantic index, rebuilding: {}", e);
1686                if !is_worktree_bridge {
1687                    let _ = fs::remove_file(&data_path);
1688                }
1689                None
1690            }
1691        }
1692    }
1693
1694    /// Serialize the index to bytes for disk persistence
1695    pub fn to_bytes(&self) -> Vec<u8> {
1696        let mut buf = Vec::new();
1697        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1698            let encoded = fingerprint.as_string();
1699            if encoded.is_empty() {
1700                None
1701            } else {
1702                Some(encoded.into_bytes())
1703            }
1704        });
1705        let file_mtimes: Vec<_> = self
1706            .file_mtimes
1707            .iter()
1708            .filter_map(|(path, mtime)| {
1709                cache_relative_path(&self.project_root, path)
1710                    .map(|relative| (relative, path, mtime))
1711            })
1712            .collect();
1713        let entries: Vec<_> = self
1714            .entries
1715            .iter()
1716            .filter_map(|entry| {
1717                cache_relative_path(&self.project_root, &entry.chunk.file)
1718                    .map(|relative| (relative, entry))
1719            })
1720            .collect();
1721
1722        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
1723        //
1724        // V6 is the single write format. Layout extends V5:
1725        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
1726        //     no bytes follow). Uniform format simplifies the reader.
1727        //   - paths are relative to project_root.
1728        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
1729        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
1730        //
1731        // V1/V2 remain readable for backward compatibility (see from_bytes).
1732        // V3/V4 load as compatible formats but are rejected on disk so snippets
1733        // and file sizes are rebuilt once.
1734        let version = SEMANTIC_INDEX_VERSION_V6;
1735        buf.push(version);
1736        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1737        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1738        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1739        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1740        buf.extend_from_slice(fp_bytes_ref);
1741
1742        // File mtime table: count(4) + entries
1743        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
1744        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1745        for (relative, path, mtime) in &file_mtimes {
1746            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1747            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1748            buf.extend_from_slice(&path_bytes);
1749            let duration = mtime
1750                .duration_since(SystemTime::UNIX_EPOCH)
1751                .unwrap_or_default();
1752            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1753            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1754            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1755            buf.extend_from_slice(&size.to_le_bytes());
1756            let hash = self
1757                .file_hashes
1758                .get(*path)
1759                .copied()
1760                .unwrap_or_else(cache_freshness::zero_hash);
1761            buf.extend_from_slice(hash.as_bytes());
1762        }
1763
1764        // Entries: each is metadata + vector
1765        for (relative, entry) in &entries {
1766            let c = &entry.chunk;
1767
1768            // File path
1769            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1770            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1771            buf.extend_from_slice(&file_bytes);
1772
1773            // Name
1774            let name_bytes = c.name.as_bytes();
1775            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1776            buf.extend_from_slice(name_bytes);
1777
1778            // Kind (1 byte)
1779            buf.push(symbol_kind_to_u8(&c.kind));
1780
1781            // Lines + exported
1782            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1783            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1784            buf.push(c.exported as u8);
1785
1786            // Snippet
1787            let snippet_bytes = c.snippet.as_bytes();
1788            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1789            buf.extend_from_slice(snippet_bytes);
1790
1791            // Embed text
1792            let embed_bytes = c.embed_text.as_bytes();
1793            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1794            buf.extend_from_slice(embed_bytes);
1795
1796            // Vector (f32 array)
1797            for &val in &entry.vector {
1798                buf.extend_from_slice(&val.to_le_bytes());
1799            }
1800        }
1801
1802        buf
1803    }
1804
    /// Deserialize an index from its on-disk byte representation.
    ///
    /// Versions V1 through V6 are accepted; the version byte selects which
    /// optional fields are read:
    ///
    /// * fingerprint field: V2 and newer;
    /// * `subsec_nanos` per file: V3 and newer;
    /// * file size per file: V5 and V6;
    /// * 32-byte content hash per file, and paths resolved through
    ///   `cached_path_under_root` against `current_canonical_root`: V6 only.
    ///   Older versions use the stored path verbatim.
    ///
    /// Fields missing from an older version default to 0 nanos, size 0 and the
    /// zero hash. The returned index uses `current_canonical_root` as its
    /// `project_root`, which must be absolute (checked with `debug_assert!`).
    ///
    /// # Errors
    ///
    /// Returns a descriptive `String` when the data is truncated, the version
    /// is unsupported, the dimension is 0 or above `MAX_DIMENSION`, a count
    /// exceeds `MAX_ENTRIES`, the fingerprint is not valid JSON for
    /// `SemanticIndexFingerprint`, an mtime is out of range, a V6 path is
    /// rejected by `cached_path_under_root`, or an entry refers to a file with
    /// no row in the file metadata table.
    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        // V2 and newer share the same header layout (V3/V4/V5/V6 only differ
        // from V2 in the per-file metadata layout): version(1) + dimension(4) +
        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        if dimension == 0 || dimension > MAX_DIMENSION {
            return Err(format!("invalid embedding dimension: {}", dimension));
        }
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        // Fingerprint handling:
        //   - V1: no fingerprint field at all.
        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
        //     only emitted V2 when fingerprint was Some).
        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            if fingerprint_len == 0 {
                None
            } else {
                // Invalid UTF-8 is replaced lossily; the JSON parse below then
                // decides whether the result is still a usable fingerprint.
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        // File metadata table (mtime, and depending on version size + hash)
        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // Sanity check before any allocation sized by the header counts: the
        // raw vectors alone (entry_count * dimension * 4 bytes) must fit in
        // the bytes that remain.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            // V3+ persists subsec_nanos alongside secs so staleness checks
            // survive restart round-trips. V1/V2 load with 0 nanos, which
            // causes one rebuild on upgrade (they never matched live APFS
            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
            // the cache is persisted in the current write format and stabilises.
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            // File size exists from V5 on; older versions load as 0.
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64(data, &mut pos)?
                } else {
                    0
                };
            // The 32-byte content hash exists in V6 only; older versions load
            // with the zero hash.
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                if pos + 32 > data.len() {
                    return Err("unexpected end of data reading content hash".to_string());
                }
                let mut hash_bytes = [0u8; 32];
                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
                pos += 32;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            // Hardening against corrupt / maliciously crafted cache files
            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
            // nanosecond carry overflows the second counter, and
            // `SystemTime + Duration` can panic on carry past the platform's
            // upper bound. Explicit validation keeps a corrupted semantic.bin
            // from taking down the whole aft process.
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // V6 paths go through `cached_path_under_root`; a `None` result is
            // reported as the path escaping the project root. Older versions
            // use the stored path as-is.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        // Entries
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
            // Same path handling as for the metadata table above.
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &raw_file)
                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
            } else {
                raw_file
            };
            let name = read_string(data, &mut pos)?;

            // Kind: one byte.
            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            // Exported flag: one byte, any non-zero value means true.
            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            // Vector: `dimension` little-endian f32 values. The bounds check
            // covers the whole vector, so the indexing in the loop is safe.
            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        // Defensive check: the loop above either pushes exactly `entry_count`
        // entries or returns early with an error.
        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must have a row in the file metadata table; otherwise
        // the cache is rejected as a whole.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
        })
    }
2044}
2045
2046/// Build enriched embedding text from a symbol with cAST-style context
2047fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2048    let relative = file
2049        .strip_prefix(project_root)
2050        .unwrap_or(file)
2051        .to_string_lossy();
2052
2053    let kind_label = match &symbol.kind {
2054        SymbolKind::Function => "function",
2055        SymbolKind::Class => "class",
2056        SymbolKind::Method => "method",
2057        SymbolKind::Struct => "struct",
2058        SymbolKind::Interface => "interface",
2059        SymbolKind::Enum => "enum",
2060        SymbolKind::TypeAlias => "type",
2061        SymbolKind::Variable => "variable",
2062        SymbolKind::Heading => "heading",
2063        SymbolKind::FileSummary => "file-summary",
2064    };
2065
2066    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2067    let name = &symbol.name;
2068    let mut text = format!(
2069        "name:{name} file:{} kind:{} name:{name}",
2070        relative, kind_label
2071    );
2072
2073    if let Some(sig) = &symbol.signature {
2074        text.push_str(&format!(" signature:{}", sig));
2075    }
2076
2077    // Add body snippet (first ~300 chars of symbol body)
2078    let lines: Vec<&str> = source.lines().collect();
2079    let start = (symbol.range.start_line as usize).min(lines.len());
2080    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2081    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2082    if start < end {
2083        let body: String = lines[start..end]
2084            .iter()
2085            .take(15) // max 15 lines
2086            .copied()
2087            .collect::<Vec<&str>>()
2088            .join("\n");
2089        let snippet = if body.len() > 300 {
2090            format!("{}...", &body[..body.floor_char_boundary(300)])
2091        } else {
2092            body
2093        };
2094        text.push_str(&format!(" body:{}", snippet));
2095    }
2096
2097    text
2098}
2099
/// Return the first `max_chars` characters of `value` (the whole string when
/// it has fewer). Counts `char`s, not bytes, so the cut always lands on a
/// character boundary.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    let cut = value
        .char_indices()
        .nth(max_chars)
        .map_or(value.len(), |(offset, _)| offset);
    value[..cut].to_owned()
}
2103
2104fn first_leading_doc_comment(source: &str) -> String {
2105    let lines: Vec<&str> = source.lines().collect();
2106    let Some((start, first)) = lines
2107        .iter()
2108        .enumerate()
2109        .find(|(_, line)| !line.trim().is_empty())
2110    else {
2111        return String::new();
2112    };
2113
2114    let trimmed = first.trim_start();
2115    if trimmed.starts_with("/**") {
2116        let mut comment = Vec::new();
2117        for line in lines.iter().skip(start) {
2118            comment.push(*line);
2119            if line.contains("*/") {
2120                break;
2121            }
2122        }
2123        return truncate_chars(&comment.join("\n"), 200);
2124    }
2125
2126    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2127        let comment = lines
2128            .iter()
2129            .skip(start)
2130            .take_while(|line| {
2131                let trimmed = line.trim_start();
2132                trimmed.starts_with("///") || trimmed.starts_with("//!")
2133            })
2134            .copied()
2135            .collect::<Vec<_>>()
2136            .join("\n");
2137        return truncate_chars(&comment, 200);
2138    }
2139
2140    String::new()
2141}
2142
2143pub fn build_file_summary_chunk(
2144    file: &Path,
2145    project_root: &Path,
2146    source: &str,
2147    top_exports: &[&str],
2148    top_export_signatures: &[Option<&str>],
2149) -> SemanticChunk {
2150    let relative = file.strip_prefix(project_root).unwrap_or(file);
2151    let rel_path = relative.to_string_lossy();
2152    let parent_dir = relative
2153        .parent()
2154        .map(|parent| parent.to_string_lossy().to_string())
2155        .unwrap_or_default();
2156    let name = file
2157        .file_stem()
2158        .map(|stem| stem.to_string_lossy().to_string())
2159        .unwrap_or_default();
2160    let doc = first_leading_doc_comment(source);
2161    let exports = top_exports
2162        .iter()
2163        .take(5)
2164        .copied()
2165        .collect::<Vec<_>>()
2166        .join(",");
2167    let snippet = if doc.is_empty() {
2168        top_export_signatures
2169            .first()
2170            .and_then(|signature| signature.as_deref())
2171            .map(|signature| truncate_chars(signature, 200))
2172            .unwrap_or_default()
2173    } else {
2174        doc.clone()
2175    };
2176
2177    SemanticChunk {
2178        file: file.to_path_buf(),
2179        name,
2180        kind: SymbolKind::FileSummary,
2181        start_line: 0,
2182        end_line: 0,
2183        exported: false,
2184        embed_text: format!(
2185            "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2186            file.file_stem()
2187                .map(|stem| stem.to_string_lossy().to_string())
2188                .unwrap_or_default()
2189        ),
2190        snippet,
2191    }
2192}
2193
2194fn parser_for(
2195    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2196    lang: crate::parser::LangId,
2197) -> Result<&mut Parser, String> {
2198    use std::collections::hash_map::Entry;
2199
2200    match parsers.entry(lang) {
2201        Entry::Occupied(entry) => Ok(entry.into_mut()),
2202        Entry::Vacant(entry) => {
2203            let grammar = grammar_for(lang);
2204            let mut parser = Parser::new();
2205            parser
2206                .set_language(&grammar)
2207                .map_err(|error| error.to_string())?;
2208            Ok(entry.insert(parser))
2209        }
2210    }
2211}
2212
/// Report whether `path` has one of the file extensions covered by the
/// semantic index.
///
/// The comparison is exact (case-sensitive). Paths without an extension, or
/// whose extension is not valid UTF-8, are not indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2240
2241fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2242    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2243    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2244    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2245        .map_err(|error| error.to_string())?
2246        .unwrap_or_else(cache_freshness::zero_hash);
2247    Ok(IndexedFileMetadata {
2248        mtime,
2249        size: metadata.len(),
2250        content_hash,
2251    })
2252}
2253
2254fn collect_file_chunks(
2255    project_root: &Path,
2256    file: &Path,
2257    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2258) -> Result<Vec<SemanticChunk>, String> {
2259    if !is_semantic_indexed_extension(file) {
2260        return Err("unsupported file extension".to_string());
2261    }
2262    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2263    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2264    let tree = parser_for(parsers, lang)?
2265        .parse(&source, None)
2266        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2267    let symbols =
2268        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2269
2270    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2271}
2272
2273/// Build a display snippet from a symbol's source
2274fn build_snippet(symbol: &Symbol, source: &str) -> String {
2275    let lines: Vec<&str> = source.lines().collect();
2276    let start = (symbol.range.start_line as usize).min(lines.len());
2277    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2278    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2279    if start < end {
2280        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2281        let mut snippet = snippet_lines.join("\n");
2282        if end - start > 5 {
2283            snippet.push_str("\n  ...");
2284        }
2285        if snippet.len() > 300 {
2286            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2287        }
2288        snippet
2289    } else {
2290        String::new()
2291    }
2292}
2293
2294/// Convert symbols to semantic chunks with enriched context
2295fn symbols_to_chunks(
2296    file: &Path,
2297    symbols: &[Symbol],
2298    source: &str,
2299    project_root: &Path,
2300) -> Vec<SemanticChunk> {
2301    let mut chunks = Vec::new();
2302    let top_exports_with_signatures = symbols
2303        .iter()
2304        .filter(|symbol| {
2305            symbol.exported
2306                && symbol.parent.is_none()
2307                && !matches!(symbol.kind, SymbolKind::Heading)
2308        })
2309        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2310        .collect::<Vec<_>>();
2311
2312    let has_only_headings = !symbols.is_empty()
2313        && symbols
2314            .iter()
2315            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2316    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2317        let top_exports = top_exports_with_signatures
2318            .iter()
2319            .map(|(name, _)| *name)
2320            .collect::<Vec<_>>();
2321        let top_export_signatures = top_exports_with_signatures
2322            .iter()
2323            .map(|(_, signature)| *signature)
2324            .collect::<Vec<_>>();
2325        chunks.push(build_file_summary_chunk(
2326            file,
2327            project_root,
2328            source,
2329            &top_exports,
2330            &top_export_signatures,
2331        ));
2332    }
2333
2334    for symbol in symbols {
2335        // Skip Markdown / HTML heading chunks: empirically they dominate result
2336        // lists even for code-shaped queries because heading prose embeds well.
2337        // Agents querying for code lose the actual matches under doc noise.
2338        // README/docs queries are still served by grep on the same files.
2339        if matches!(symbol.kind, SymbolKind::Heading) {
2340            continue;
2341        }
2342
2343        // Skip very small symbols (single-line variables, etc.)
2344        let line_count = symbol
2345            .range
2346            .end_line
2347            .saturating_sub(symbol.range.start_line)
2348            + 1;
2349        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2350            continue;
2351        }
2352
2353        let embed_text = build_embed_text(symbol, source, file, project_root);
2354        let snippet = build_snippet(symbol, source);
2355
2356        chunks.push(SemanticChunk {
2357            file: file.to_path_buf(),
2358            name: symbol.name.clone(),
2359            kind: symbol.kind.clone(),
2360            start_line: symbol.range.start_line,
2361            end_line: symbol.range.end_line,
2362            exported: symbol.exported,
2363            embed_text,
2364            snippet,
2365        });
2366
2367        // Note: Nested symbols are handled separately by the outline system
2368        // Each symbol is indexed individually
2369    }
2370
2371    chunks
2372}
2373
/// Cosine similarity of `a` and `b`.
///
/// Returns 0.0 when the slices differ in length or when either vector has
/// zero norm (which includes empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b.iter()).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2397
2398// Serialization helpers
2399fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2400    match kind {
2401        SymbolKind::Function => 0,
2402        SymbolKind::Class => 1,
2403        SymbolKind::Method => 2,
2404        SymbolKind::Struct => 3,
2405        SymbolKind::Interface => 4,
2406        SymbolKind::Enum => 5,
2407        SymbolKind::TypeAlias => 6,
2408        SymbolKind::Variable => 7,
2409        SymbolKind::Heading => 8,
2410        SymbolKind::FileSummary => 9,
2411    }
2412}
2413
2414fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2415    match v {
2416        0 => SymbolKind::Function,
2417        1 => SymbolKind::Class,
2418        2 => SymbolKind::Method,
2419        3 => SymbolKind::Struct,
2420        4 => SymbolKind::Interface,
2421        5 => SymbolKind::Enum,
2422        6 => SymbolKind::TypeAlias,
2423        7 => SymbolKind::Variable,
2424        8 => SymbolKind::Heading,
2425        9 => SymbolKind::FileSummary,
2426        _ => SymbolKind::Heading,
2427    }
2428}
2429
/// Reads a little-endian `u32` from `data` at `*pos` and advances the cursor
/// by four bytes.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes
/// remain. The end offset is computed with `checked_add`, so a cursor close
/// to `usize::MAX` is reported as truncated data instead of overflowing.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let end = (*pos)
        .checked_add(4)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;
    let bytes: [u8; 4] = data[*pos..end]
        .try_into()
        .expect("slice is exactly 4 bytes long");
    *pos = end;
    Ok(u32::from_le_bytes(bytes))
}
2438
/// Reads a little-endian `u64` from `data` at `*pos` and advances the cursor
/// by eight bytes.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes
/// remain. The end offset is computed with `checked_add`, so a cursor close
/// to `usize::MAX` is reported as truncated data instead of overflowing.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let end = (*pos)
        .checked_add(8)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;
    let bytes: [u8; 8] = data[*pos..end]
        .try_into()
        .expect("slice is exactly 8 bytes long");
    *pos = end;
    Ok(u64::from_le_bytes(bytes))
}
2447
2448fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2449    let len = read_u32(data, pos)? as usize;
2450    if *pos + len > data.len() {
2451        return Err("unexpected end of data reading string".to_string());
2452    }
2453    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2454    *pos += len;
2455    Ok(s)
2456}
2457
2458#[cfg(test)]
2459mod tests {
2460    use super::*;
2461    use crate::config::{SemanticBackend, SemanticBackendConfig};
2462    use crate::parser::FileParser;
2463    use std::io::{Read, Write};
2464    use std::net::TcpListener;
2465    use std::thread;
2466
2467    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2468    where
2469        F: Fn(String, String, String) -> String + Send + 'static,
2470    {
2471        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2472        let addr = listener.local_addr().expect("local addr");
2473        let handle = thread::spawn(move || {
2474            let (mut stream, _) = listener.accept().expect("accept request");
2475            let mut buf = Vec::new();
2476            let mut chunk = [0u8; 4096];
2477            let mut header_end = None;
2478            let mut content_length = 0usize;
2479            loop {
2480                let n = stream.read(&mut chunk).expect("read request");
2481                if n == 0 {
2482                    break;
2483                }
2484                buf.extend_from_slice(&chunk[..n]);
2485                if header_end.is_none() {
2486                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2487                        header_end = Some(pos + 4);
2488                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2489                        for line in headers.lines() {
2490                            if let Some(value) = line.strip_prefix("Content-Length:") {
2491                                content_length = value.trim().parse::<usize>().unwrap_or(0);
2492                            }
2493                        }
2494                    }
2495                }
2496                if let Some(end) = header_end {
2497                    if buf.len() >= end + content_length {
2498                        break;
2499                    }
2500                }
2501            }
2502
2503            let end = header_end.expect("header terminator");
2504            let request = String::from_utf8_lossy(&buf[..end]).to_string();
2505            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2506            let mut lines = request.lines();
2507            let request_line = lines.next().expect("request line").to_string();
2508            let path = request_line
2509                .split_whitespace()
2510                .nth(1)
2511                .expect("request path")
2512                .to_string();
2513            let response_body = handler(request_line, path, body);
2514            let response = format!(
2515                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2516                response_body.len(),
2517                response_body
2518            );
2519            stream
2520                .write_all(response.as_bytes())
2521                .expect("write response");
2522        });
2523
2524        (format!("http://{}", addr), handle)
2525    }
2526
2527    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2528        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2529    }
2530
2531    fn write_rust_file(path: &Path, function_name: &str) {
2532        fs::write(
2533            path,
2534            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
2535        )
2536        .unwrap();
2537    }
2538
2539    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2540        let mut embed = test_vector_for_texts;
2541        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2542    }
2543
2544    fn test_project_root() -> PathBuf {
2545        std::env::current_dir().unwrap()
2546    }
2547
2548    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2549        index.file_mtimes.insert(file.to_path_buf(), mtime);
2550        index.file_sizes.insert(file.to_path_buf(), size);
2551        index
2552            .file_hashes
2553            .insert(file.to_path_buf(), cache_freshness::zero_hash());
2554    }
2555
2556    #[test]
2557    fn semantic_cache_serialization_skips_paths_outside_project_root() {
2558        let dir = tempfile::tempdir().expect("create temp dir");
2559        let project = fs::canonicalize(dir.path()).expect("canonical project");
2560        let outside = project.join("..").join("outside.rs");
2561        let mut index = SemanticIndex::new(project.clone(), 3);
2562        index
2563            .file_mtimes
2564            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2565        index.file_sizes.insert(outside.clone(), 1);
2566        index
2567            .file_hashes
2568            .insert(outside.clone(), cache_freshness::zero_hash());
2569        index.entries.push(EmbeddingEntry {
2570            chunk: SemanticChunk {
2571                file: outside,
2572                name: "outside".to_string(),
2573                kind: SymbolKind::Function,
2574                start_line: 0,
2575                end_line: 0,
2576                exported: false,
2577                embed_text: "outside".to_string(),
2578                snippet: "outside".to_string(),
2579            },
2580            vector: vec![1.0, 0.0, 0.0],
2581        });
2582
2583        let bytes = index.to_bytes();
2584        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2585        assert_eq!(loaded.entries.len(), 0);
2586        assert!(loaded.file_mtimes.is_empty());
2587    }
2588
2589    #[test]
2590    fn test_cosine_similarity_identical() {
2591        let a = vec![1.0, 0.0, 0.0];
2592        let b = vec![1.0, 0.0, 0.0];
2593        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2594    }
2595
2596    #[test]
2597    fn test_cosine_similarity_orthogonal() {
2598        let a = vec![1.0, 0.0, 0.0];
2599        let b = vec![0.0, 1.0, 0.0];
2600        assert!(cosine_similarity(&a, &b).abs() < 0.001);
2601    }
2602
2603    #[test]
2604    fn test_cosine_similarity_opposite() {
2605        let a = vec![1.0, 0.0, 0.0];
2606        let b = vec![-1.0, 0.0, 0.0];
2607        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2608    }
2609
2610    #[test]
2611    fn test_serialization_roundtrip() {
2612        let project_root = test_project_root();
2613        let file = project_root.join("src/main.rs");
2614        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2615        index.entries.push(EmbeddingEntry {
2616            chunk: SemanticChunk {
2617                file: file.clone(),
2618                name: "handle_request".to_string(),
2619                kind: SymbolKind::Function,
2620                start_line: 10,
2621                end_line: 25,
2622                exported: true,
2623                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2624                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
2625            },
2626            vector: vec![0.1, 0.2, 0.3, 0.4],
2627        });
2628        index.dimension = 4;
2629        index
2630            .file_mtimes
2631            .insert(file.clone(), SystemTime::UNIX_EPOCH);
2632        index.file_sizes.insert(file, 0);
2633        index.set_fingerprint(SemanticIndexFingerprint {
2634            backend: "fastembed".to_string(),
2635            model: "all-MiniLM-L6-v2".to_string(),
2636            base_url: FALLBACK_BACKEND.to_string(),
2637            dimension: 4,
2638            chunking_version: default_chunking_version(),
2639        });
2640
2641        let bytes = index.to_bytes();
2642        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2643
2644        assert_eq!(restored.entries.len(), 1);
2645        assert_eq!(restored.entries[0].chunk.name, "handle_request");
2646        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2647        assert_eq!(restored.dimension, 4);
2648        assert_eq!(restored.backend_label(), Some("fastembed"));
2649        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2650    }
2651
2652    #[test]
2653    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2654        let cases = [
2655            (SymbolKind::Function, 0),
2656            (SymbolKind::Class, 1),
2657            (SymbolKind::Method, 2),
2658            (SymbolKind::Struct, 3),
2659            (SymbolKind::Interface, 4),
2660            (SymbolKind::Enum, 5),
2661            (SymbolKind::TypeAlias, 6),
2662            (SymbolKind::Variable, 7),
2663            (SymbolKind::Heading, 8),
2664            (SymbolKind::FileSummary, 9),
2665        ];
2666
2667        for (kind, encoded) in cases {
2668            assert_eq!(symbol_kind_to_u8(&kind), encoded);
2669            assert_eq!(u8_to_symbol_kind(encoded), kind);
2670        }
2671    }
2672
2673    #[test]
2674    fn test_search_top_k() {
2675        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2676        index.dimension = 3;
2677
2678        // Add entries with known vectors
2679        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2680            let mut vec = vec![0.0f32; 3];
2681            vec[i] = 1.0; // orthogonal vectors
2682            index.entries.push(EmbeddingEntry {
2683                chunk: SemanticChunk {
2684                    file: PathBuf::from("/src/lib.rs"),
2685                    name: name.to_string(),
2686                    kind: SymbolKind::Function,
2687                    start_line: (i * 10 + 1) as u32,
2688                    end_line: (i * 10 + 5) as u32,
2689                    exported: true,
2690                    embed_text: format!("kind:function name:{}", name),
2691                    snippet: format!("fn {}() {{}}", name),
2692                },
2693                vector: vec,
2694            });
2695        }
2696
2697        // Query aligned with "auth" (index 0)
2698        let query = vec![0.9, 0.1, 0.0];
2699        let results = index.search(&query, 2);
2700
2701        assert_eq!(results.len(), 2);
2702        assert_eq!(results[0].name, "auth"); // highest score
2703        assert!(results[0].score > results[1].score);
2704    }
2705
2706    #[test]
2707    fn test_empty_index_search() {
2708        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2709        let results = index.search(&[0.1, 0.2, 0.3], 10);
2710        assert!(results.is_empty());
2711    }
2712
2713    #[test]
2714    fn single_line_symbol_builds_non_empty_snippet() {
2715        let symbol = Symbol {
2716            name: "answer".to_string(),
2717            kind: SymbolKind::Variable,
2718            range: crate::symbols::Range {
2719                start_line: 0,
2720                start_col: 0,
2721                end_line: 0,
2722                end_col: 24,
2723            },
2724            signature: Some("const answer = 42".to_string()),
2725            scope_chain: Vec::new(),
2726            exported: true,
2727            parent: None,
2728        };
2729        let source = "export const answer = 42;\n";
2730
2731        let snippet = build_snippet(&symbol, source);
2732
2733        assert_eq!(snippet, "export const answer = 42;");
2734    }
2735
2736    #[test]
2737    fn optimized_file_chunk_collection_matches_file_parser_path() {
2738        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2739        let file = project_root.join("src/semantic_index.rs");
2740        let source = std::fs::read_to_string(&file).unwrap();
2741
2742        let mut legacy_parser = FileParser::new();
2743        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2744        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2745
2746        let mut parsers = HashMap::new();
2747        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2748
2749        assert_eq!(
2750            chunk_fingerprint(&optimized_chunks),
2751            chunk_fingerprint(&legacy_chunks)
2752        );
2753    }
2754
2755    fn chunk_fingerprint(
2756        chunks: &[SemanticChunk],
2757    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2758        chunks
2759            .iter()
2760            .map(|chunk| {
2761                (
2762                    chunk.name.clone(),
2763                    chunk.kind.clone(),
2764                    chunk.start_line,
2765                    chunk.end_line,
2766                    chunk.exported,
2767                    chunk.embed_text.clone(),
2768                    chunk.snippet.clone(),
2769                )
2770            })
2771            .collect()
2772    }
2773
2774    #[test]
2775    fn rejects_oversized_dimension_during_deserialization() {
2776        let mut bytes = Vec::new();
2777        bytes.push(1u8);
2778        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2779        bytes.extend_from_slice(&0u32.to_le_bytes());
2780        bytes.extend_from_slice(&0u32.to_le_bytes());
2781
2782        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2783    }
2784
2785    #[test]
2786    fn rejects_oversized_entry_count_during_deserialization() {
2787        let mut bytes = Vec::new();
2788        bytes.push(1u8);
2789        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2790        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2791        bytes.extend_from_slice(&0u32.to_le_bytes());
2792
2793        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2794    }
2795
2796    #[test]
2797    fn invalidate_file_removes_entries_and_mtime() {
2798        let target = PathBuf::from("/src/main.rs");
2799        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2800        index.entries.push(EmbeddingEntry {
2801            chunk: SemanticChunk {
2802                file: target.clone(),
2803                name: "main".to_string(),
2804                kind: SymbolKind::Function,
2805                start_line: 0,
2806                end_line: 1,
2807                exported: false,
2808                embed_text: "main".to_string(),
2809                snippet: "fn main() {}".to_string(),
2810            },
2811            vector: vec![1.0; DEFAULT_DIMENSION],
2812        });
2813        index
2814            .file_mtimes
2815            .insert(target.clone(), SystemTime::UNIX_EPOCH);
2816        index.file_sizes.insert(target.clone(), 0);
2817
2818        index.invalidate_file(&target);
2819
2820        assert!(index.entries.is_empty());
2821        assert!(!index.file_mtimes.contains_key(&target));
2822        assert!(!index.file_sizes.contains_key(&target));
2823    }
2824
2825    #[test]
2826    fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2827        let temp = tempfile::tempdir().unwrap();
2828        let project_root = temp.path();
2829        let file = project_root.join("src/lib.rs");
2830        fs::create_dir_all(file.parent().unwrap()).unwrap();
2831        write_rust_file(&file, "kept_symbol");
2832
2833        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2834        let original_entry_count = index.entries.len();
2835        let original_mtime = *index.file_mtimes.get(&file).unwrap();
2836        let original_size = *index.file_sizes.get(&file).unwrap();
2837
2838        let stale_mtime = SystemTime::UNIX_EPOCH;
2839        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2840        fs::remove_file(&file).unwrap();
2841
2842        let mut embed = test_vector_for_texts;
2843        let mut progress = |_done: usize, _total: usize| {};
2844        let summary = index
2845            .refresh_stale_files(
2846                project_root,
2847                std::slice::from_ref(&file),
2848                &mut embed,
2849                8,
2850                &mut progress,
2851            )
2852            .unwrap();
2853
2854        assert_eq!(summary.changed, 0);
2855        assert_eq!(summary.added, 0);
2856        assert_eq!(summary.deleted, 0);
2857        assert_eq!(index.entries.len(), original_entry_count);
2858        assert!(index
2859            .entries
2860            .iter()
2861            .any(|entry| entry.chunk.name == "kept_symbol"));
2862        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2863        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2864        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2865    }
2866
2867    #[test]
2868    fn refresh_never_indexed_file_error_does_not_record_mtime() {
2869        let temp = tempfile::tempdir().unwrap();
2870        let project_root = temp.path();
2871        let missing = project_root.join("src/missing.rs");
2872        fs::create_dir_all(missing.parent().unwrap()).unwrap();
2873
2874        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2875        let mut embed = test_vector_for_texts;
2876        let mut progress = |_done: usize, _total: usize| {};
2877        let summary = index
2878            .refresh_stale_files(
2879                project_root,
2880                std::slice::from_ref(&missing),
2881                &mut embed,
2882                8,
2883                &mut progress,
2884            )
2885            .unwrap();
2886
2887        assert_eq!(summary.added, 0);
2888        assert_eq!(summary.changed, 0);
2889        assert_eq!(summary.deleted, 0);
2890        assert!(!index.file_mtimes.contains_key(&missing));
2891        assert!(!index.file_sizes.contains_key(&missing));
2892        assert!(index.entries.is_empty());
2893    }
2894
2895    #[test]
2896    fn refresh_reports_added_for_new_files() {
2897        let temp = tempfile::tempdir().unwrap();
2898        let project_root = temp.path();
2899        let existing = project_root.join("src/lib.rs");
2900        let added = project_root.join("src/new.rs");
2901        fs::create_dir_all(existing.parent().unwrap()).unwrap();
2902        write_rust_file(&existing, "existing_symbol");
2903        write_rust_file(&added, "added_symbol");
2904
2905        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2906        let mut embed = test_vector_for_texts;
2907        let mut progress = |_done: usize, _total: usize| {};
2908        let summary = index
2909            .refresh_stale_files(
2910                project_root,
2911                &[existing.clone(), added.clone()],
2912                &mut embed,
2913                8,
2914                &mut progress,
2915            )
2916            .unwrap();
2917
2918        assert_eq!(summary.added, 1);
2919        assert_eq!(summary.changed, 0);
2920        assert_eq!(summary.deleted, 0);
2921        assert_eq!(summary.total_processed, 2);
2922        assert!(index.file_mtimes.contains_key(&added));
2923        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2924    }
2925
2926    #[test]
2927    fn refresh_reports_deleted_for_removed_files() {
2928        let temp = tempfile::tempdir().unwrap();
2929        let project_root = temp.path();
2930        let deleted = project_root.join("src/deleted.rs");
2931        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2932        write_rust_file(&deleted, "deleted_symbol");
2933
2934        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2935        fs::remove_file(&deleted).unwrap();
2936
2937        let mut embed = test_vector_for_texts;
2938        let mut progress = |_done: usize, _total: usize| {};
2939        let summary = index
2940            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2941            .unwrap();
2942
2943        assert_eq!(summary.deleted, 1);
2944        assert_eq!(summary.changed, 0);
2945        assert_eq!(summary.added, 0);
2946        assert_eq!(summary.total_processed, 1);
2947        assert!(!index.file_mtimes.contains_key(&deleted));
2948        assert!(index.entries.is_empty());
2949    }
2950
2951    #[test]
2952    fn refresh_reports_changed_for_modified_files() {
2953        let temp = tempfile::tempdir().unwrap();
2954        let project_root = temp.path();
2955        let file = project_root.join("src/lib.rs");
2956        fs::create_dir_all(file.parent().unwrap()).unwrap();
2957        write_rust_file(&file, "old_symbol");
2958
2959        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2960        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2961        write_rust_file(&file, "new_symbol");
2962
2963        let mut embed = test_vector_for_texts;
2964        let mut progress = |_done: usize, _total: usize| {};
2965        let summary = index
2966            .refresh_stale_files(
2967                project_root,
2968                std::slice::from_ref(&file),
2969                &mut embed,
2970                8,
2971                &mut progress,
2972            )
2973            .unwrap();
2974
2975        assert_eq!(summary.changed, 1);
2976        assert_eq!(summary.added, 0);
2977        assert_eq!(summary.deleted, 0);
2978        assert_eq!(summary.total_processed, 1);
2979        assert!(index
2980            .entries
2981            .iter()
2982            .any(|entry| entry.chunk.name == "new_symbol"));
2983        assert!(!index
2984            .entries
2985            .iter()
2986            .any(|entry| entry.chunk.name == "old_symbol"));
2987    }
2988
2989    #[test]
2990    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
2991        let temp = tempfile::tempdir().unwrap();
2992        let project_root = temp.path();
2993        let file = project_root.join("src/lib.rs");
2994        fs::create_dir_all(file.parent().unwrap()).unwrap();
2995        write_rust_file(&file, "clean_symbol");
2996
2997        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2998        let original_entries = index.entries.len();
2999        let mut embed_called = false;
3000        let mut embed = |texts: Vec<String>| {
3001            embed_called = true;
3002            test_vector_for_texts(texts)
3003        };
3004        let mut progress = |_done: usize, _total: usize| {};
3005        let summary = index
3006            .refresh_stale_files(
3007                project_root,
3008                std::slice::from_ref(&file),
3009                &mut embed,
3010                8,
3011                &mut progress,
3012            )
3013            .unwrap();
3014
3015        assert!(summary.is_noop());
3016        assert_eq!(summary.total_processed, 1);
3017        assert!(!embed_called);
3018        assert_eq!(index.entries.len(), original_entries);
3019    }
3020
3021    #[test]
3022    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3023        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3024
3025        assert!(is_onnx_runtime_unavailable(message));
3026    }
3027
3028    #[test]
3029    fn formats_missing_onnx_runtime_with_install_hint() {
3030        let message = format_embedding_init_error(
3031            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3032        );
3033
3034        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3035        assert!(message.contains("Original error:"));
3036    }
3037
3038    #[test]
3039    fn openai_compatible_backend_embeds_with_mock_server() {
3040        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3041            assert!(request_line.starts_with("POST "));
3042            assert_eq!(path, "/v1/embeddings");
3043            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3044        });
3045
3046        let config = SemanticBackendConfig {
3047            backend: SemanticBackend::OpenAiCompatible,
3048            model: "test-embedding".to_string(),
3049            base_url: Some(base_url),
3050            api_key_env: None,
3051            timeout_ms: 5_000,
3052            max_batch_size: 64,
3053        };
3054
3055        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3056        let vectors = model
3057            .embed(vec!["hello".to_string(), "world".to_string()])
3058            .unwrap();
3059
3060        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3061        handle.join().unwrap();
3062    }
3063
3064    /// Regression for issue #36: AFT was sending TWO Content-Type headers
3065    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
3066    /// and again explicitly via `.header("Content-Type", "application/json")`.
3067    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
3068    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
3069    /// with `HTTP 400 "you must provide a model parameter"` even though the
3070    /// body actually contains `model`. The fix is to drop the explicit
3071    /// `.header("Content-Type", ...)` call. This test pins that we send
3072    /// exactly one Content-Type header.
3073    #[test]
3074    fn openai_compatible_request_has_single_content_type_header() {
3075        use std::sync::{Arc, Mutex};
3076        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3077        let captured_for_thread = Arc::clone(&captured);
3078
3079        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3080        let addr = listener.local_addr().expect("local addr");
3081        let handle = thread::spawn(move || {
3082            let (mut stream, _) = listener.accept().expect("accept");
3083            let mut buf = Vec::new();
3084            let mut chunk = [0u8; 4096];
3085            let mut header_end = None;
3086            let mut content_length = 0usize;
3087            loop {
3088                let n = stream.read(&mut chunk).expect("read");
3089                if n == 0 {
3090                    break;
3091                }
3092                buf.extend_from_slice(&chunk[..n]);
3093                if header_end.is_none() {
3094                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3095                        header_end = Some(pos + 4);
3096                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3097                            if let Some(value) = line.strip_prefix("Content-Length:") {
3098                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3099                            }
3100                        }
3101                    }
3102                }
3103                if let Some(end) = header_end {
3104                    if buf.len() >= end + content_length {
3105                        break;
3106                    }
3107                }
3108            }
3109            *captured_for_thread.lock().unwrap() = buf;
3110            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3111            let response = format!(
3112                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3113                body.len(),
3114                body
3115            );
3116            let _ = stream.write_all(response.as_bytes());
3117        });
3118
3119        let config = SemanticBackendConfig {
3120            backend: SemanticBackend::OpenAiCompatible,
3121            model: "text-embedding-3-small".to_string(),
3122            base_url: Some(format!("http://{}", addr)),
3123            api_key_env: None,
3124            timeout_ms: 5_000,
3125            max_batch_size: 64,
3126        };
3127        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3128        let _ = model.embed(vec!["probe".to_string()]).unwrap();
3129        handle.join().unwrap();
3130
3131        let bytes = captured.lock().unwrap().clone();
3132        let request = String::from_utf8_lossy(&bytes);
3133
3134        // Lowercase line counts because HTTP headers are case-insensitive
3135        // and reqwest may emit `content-type` in lowercase under HTTP/2.
3136        let content_type_lines = request
3137            .lines()
3138            .filter(|line| {
3139                let lower = line.to_ascii_lowercase();
3140                lower.starts_with("content-type:")
3141            })
3142            .count();
3143        assert_eq!(
3144            content_type_lines, 1,
3145            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3146        );
3147
3148        // The body must still include the model field — pin this so a future
3149        // change can't accidentally drop `model` while fixing duplicate headers.
3150        assert!(
3151            request.contains(r#""model":"text-embedding-3-small""#),
3152            "request body should contain model field; full request:\n{request}",
3153        );
3154    }
3155
3156    #[test]
3157    fn ollama_backend_embeds_with_mock_server() {
3158        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3159            assert!(request_line.starts_with("POST "));
3160            assert_eq!(path, "/api/embed");
3161            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3162        });
3163
3164        let config = SemanticBackendConfig {
3165            backend: SemanticBackend::Ollama,
3166            model: "embeddinggemma".to_string(),
3167            base_url: Some(base_url),
3168            api_key_env: None,
3169            timeout_ms: 5_000,
3170            max_batch_size: 64,
3171        };
3172
3173        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3174        let vectors = model
3175            .embed(vec!["hello".to_string(), "world".to_string()])
3176            .unwrap();
3177
3178        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3179        handle.join().unwrap();
3180    }
3181
3182    #[test]
3183    fn read_from_disk_rejects_fingerprint_mismatch() {
3184        let storage = tempfile::tempdir().unwrap();
3185        let project_key = "proj";
3186
3187        let project_root = test_project_root();
3188        let file = project_root.join("src/main.rs");
3189        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3190        index.entries.push(EmbeddingEntry {
3191            chunk: SemanticChunk {
3192                file: file.clone(),
3193                name: "handle_request".to_string(),
3194                kind: SymbolKind::Function,
3195                start_line: 10,
3196                end_line: 25,
3197                exported: true,
3198                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3199                snippet: "fn handle_request() {}".to_string(),
3200            },
3201            vector: vec![0.1, 0.2, 0.3],
3202        });
3203        index.dimension = 3;
3204        index
3205            .file_mtimes
3206            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3207        index.file_sizes.insert(file, 0);
3208        index.set_fingerprint(SemanticIndexFingerprint {
3209            backend: "openai_compatible".to_string(),
3210            model: "test-embedding".to_string(),
3211            base_url: "http://127.0.0.1:1234/v1".to_string(),
3212            dimension: 3,
3213            chunking_version: default_chunking_version(),
3214        });
3215        index.write_to_disk(storage.path(), project_key);
3216
3217        let matching = index.fingerprint().unwrap().as_string();
3218        assert!(SemanticIndex::read_from_disk(
3219            storage.path(),
3220            project_key,
3221            &project_root,
3222            false,
3223            Some(&matching),
3224        )
3225        .is_some());
3226
3227        let mismatched = SemanticIndexFingerprint {
3228            backend: "ollama".to_string(),
3229            model: "embeddinggemma".to_string(),
3230            base_url: "http://127.0.0.1:11434".to_string(),
3231            dimension: 3,
3232            chunking_version: default_chunking_version(),
3233        }
3234        .as_string();
3235        assert!(SemanticIndex::read_from_disk(
3236            storage.path(),
3237            project_key,
3238            &project_root,
3239            false,
3240            Some(&mismatched),
3241        )
3242        .is_none());
3243    }
3244
3245    #[test]
3246    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3247        let storage = tempfile::tempdir().unwrap();
3248        let project_key = "proj-v3";
3249        let dir = storage.path().join("semantic").join(project_key);
3250        fs::create_dir_all(&dir).unwrap();
3251
3252        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3253        index.entries.push(EmbeddingEntry {
3254            chunk: SemanticChunk {
3255                file: PathBuf::from("/src/main.rs"),
3256                name: "handle_request".to_string(),
3257                kind: SymbolKind::Function,
3258                start_line: 0,
3259                end_line: 0,
3260                exported: true,
3261                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3262                snippet: "fn handle_request() {}".to_string(),
3263            },
3264            vector: vec![0.1, 0.2, 0.3],
3265        });
3266        index.dimension = 3;
3267        index
3268            .file_mtimes
3269            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3270        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3271        let fingerprint = SemanticIndexFingerprint {
3272            backend: "fastembed".to_string(),
3273            model: "test".to_string(),
3274            base_url: FALLBACK_BACKEND.to_string(),
3275            dimension: 3,
3276            chunking_version: default_chunking_version(),
3277        };
3278        index.set_fingerprint(fingerprint.clone());
3279
3280        let mut bytes = index.to_bytes();
3281        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3282        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3283
3284        assert!(SemanticIndex::read_from_disk(
3285            storage.path(),
3286            project_key,
3287            &test_project_root(),
3288            false,
3289            Some(&fingerprint.as_string())
3290        )
3291        .is_none());
3292        assert!(!dir.join("semantic.bin").exists());
3293    }
3294
3295    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3296        crate::symbols::Symbol {
3297            name: name.to_string(),
3298            kind,
3299            range: crate::symbols::Range {
3300                start_line: start,
3301                start_col: 0,
3302                end_line: end,
3303                end_col: 0,
3304            },
3305            signature: None,
3306            scope_chain: Vec::new(),
3307            exported: false,
3308            parent: None,
3309        }
3310    }
3311
3312    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3313    /// they overwhelmingly dominated semantic results even on code-shaped
3314    /// queries because heading prose embeds far more strongly than code
3315    /// chunks. Skipping headings keeps aft_search a code-finder.
3316    #[test]
3317    fn symbols_to_chunks_skips_heading_symbols() {
3318        let project_root = PathBuf::from("/proj");
3319        let file = project_root.join("README.md");
3320        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3321
3322        let symbols = vec![
3323            make_symbol(SymbolKind::Heading, "Title", 0, 2),
3324            make_symbol(SymbolKind::Heading, "Section", 4, 6),
3325        ];
3326
3327        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3328        assert!(
3329            chunks.is_empty(),
3330            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3331            chunks.len()
3332        );
3333    }
3334
3335    /// Code symbols (functions, classes, methods, structs, etc.) must still
3336    /// be indexed alongside the heading skip — otherwise we'd starve the
3337    /// index entirely.
3338    #[test]
3339    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3340        let project_root = PathBuf::from("/proj");
3341        let file = project_root.join("src/lib.rs");
3342        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
3343
3344        let symbols = vec![
3345            // A heading mixed in (e.g. from a doc comment block elsewhere).
3346            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3347            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3348            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3349        ];
3350
3351        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3352        assert_eq!(
3353            chunks.len(),
3354            3,
3355            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3356            chunks.len()
3357        );
3358        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3359        assert!(chunks
3360            .iter()
3361            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3362        assert!(names.contains(&"handle_request"));
3363        assert!(names.contains(&"AuthService"));
3364        assert!(
3365            !names.contains(&"doc heading"),
3366            "Heading symbol leaked into chunks: {names:?}"
3367        );
3368    }
3369
3370    #[test]
3371    fn validate_ssrf_allows_loopback_hostnames() {
3372        // Loopback hostnames are explicitly allowed so self-hosted backends
3373        // (Ollama at http://localhost:11434) work at their default config.
3374        for host in &[
3375            "http://localhost",
3376            "http://localhost:8080",
3377            "http://localhost:11434", // Ollama default
3378            "http://localhost.localdomain",
3379            "http://foo.localhost",
3380        ] {
3381            assert!(
3382                validate_base_url_no_ssrf(host).is_ok(),
3383                "Expected {host} to be allowed (loopback), got: {:?}",
3384                validate_base_url_no_ssrf(host)
3385            );
3386        }
3387    }
3388
3389    #[test]
3390    fn validate_ssrf_allows_loopback_ips() {
3391        // 127.0.0.0/8 is loopback — by definition same-machine and not an
3392        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
3393        for url in &[
3394            "http://127.0.0.1",
3395            "http://127.0.0.1:11434", // Ollama default
3396            "http://127.0.0.1:8080",
3397            "http://127.1.2.3",
3398        ] {
3399            let result = validate_base_url_no_ssrf(url);
3400            assert!(
3401                result.is_ok(),
3402                "Expected {url} to be allowed (loopback), got: {:?}",
3403                result
3404            );
3405        }
3406    }
3407
3408    #[test]
3409    fn validate_ssrf_rejects_private_non_loopback_ips() {
3410        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
3411        // services on LAN IPs are real SSRF targets even though the user
3412        // configured them. Users who want this can opt in by binding the
3413        // service to a public-routable address.
3414        for url in &[
3415            "http://192.168.1.1",
3416            "http://10.0.0.1",
3417            "http://172.16.0.1",
3418            "http://169.254.169.254",
3419            "http://100.64.0.1",
3420        ] {
3421            let result = validate_base_url_no_ssrf(url);
3422            assert!(
3423                result.is_err(),
3424                "Expected {url} to be rejected (non-loopback private), got: {:?}",
3425                result
3426            );
3427        }
3428    }
3429
3430    #[test]
3431    fn validate_ssrf_rejects_mdns_local_hostnames() {
3432        // mDNS .local hostnames typically resolve to LAN devices, not
3433        // loopback. Rejecting them before DNS lookup gives a clearer error.
3434        for host in &[
3435            "http://printer.local",
3436            "http://nas.local:8080",
3437            "http://homelab.local",
3438        ] {
3439            let result = validate_base_url_no_ssrf(host);
3440            assert!(
3441                result.is_err(),
3442                "Expected {host} to be rejected (mDNS), got: {:?}",
3443                result
3444            );
3445        }
3446    }
3447
3448    #[test]
3449    fn normalize_base_url_allows_localhost_for_tests() {
3450        // normalize_base_url itself should NOT block localhost — only
3451        // validate_base_url_no_ssrf does. Tests construct backends directly.
3452        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3453        assert!(normalize_base_url("http://localhost:8080").is_ok());
3454    }
3455
3456    /// Pin the user-facing wording of the ONNX version-mismatch error.
3457    /// The auto-fix path MUST be listed first because it's the only safe
3458    /// option that doesn't require sudo or risk breaking other apps that
3459    /// link the system library. Regression of any of these strings would
3460    /// either mislead users (system rm before auto-fix) or break the
3461    /// `aft doctor --fix` discovery path.
3462    #[test]
3463    fn ort_mismatch_message_recommends_auto_fix_first() {
3464        let msg =
3465            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3466
3467        // The reported version and path must appear verbatim.
3468        assert!(
3469            msg.contains("v1.9.0"),
3470            "should report detected version: {msg}"
3471        );
3472        assert!(
3473            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3474            "should report system path: {msg}"
3475        );
3476        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3477
3478        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
3479        let auto_fix_pos = msg
3480            .find("Auto-fix")
3481            .expect("Auto-fix solution missing — users won't discover --fix");
3482        let remove_pos = msg
3483            .find("Remove the old library")
3484            .expect("system-rm solution missing");
3485        assert!(
3486            auto_fix_pos < remove_pos,
3487            "Auto-fix must come before manual rm — see PR comment thread"
3488        );
3489
3490        // The auto-fix command must be runnable as-is on a fresh system.
3491        assert!(
3492            msg.contains("npx @cortexkit/aft doctor --fix"),
3493            "auto-fix command must be present and copy-pasteable: {msg}"
3494        );
3495    }
3496
3497    /// macOS dylib paths must not produce a malformed message when the
3498    /// system path lacks a trailing slash. This is a regression guard
3499    /// for the "{}\n{}" format string contract.
3500    #[test]
3501    fn ort_mismatch_message_handles_macos_dylib_path() {
3502        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3503        assert!(msg.contains("v1.9.0"));
3504        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3505        // The dylib path must appear in the auto-fix paragraph (single
3506        // quotes around it) AND in the manual-rm paragraph; verify
3507        // both placements survived the format string.
3508        assert!(
3509            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3510            "system path should be quoted in the auto-fix sentence: {msg}"
3511        );
3512    }
3513}
aft/semantic_index.rs

aft/
semantic_index.rs